From 8b757de303489715a0c8c63478fb30d601f60c5f Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Mon, 7 Oct 2019 09:14:33 -0400 Subject: [PATCH] Make pmon-restart honour startuptime config option Each monitored process's config file contains a startuptime label that specifies how many seconds it takes for that newly started process to stabilize and produce its pidfile. The pmon-restart feature needs to delay monitoring a newly restarted process for 'startuptime' seconds. Failing to do so can cause it to fail the restarted process too early if there is a pidfile creation delay. Test Plan: PASS: Verify collectd pmon-restart function with soak ; > 5000+ collectd pmon-restarts. PASS: Verify pmond regression test suite (test-pmon-action.sh) > restart command ; graceful restart all monitored processes. (5 loops) > kill command ; kill and recover all monitored processes. (5 loops) Regression: PASS: Verify pmon-stop command/function PASS: Verify pmon-start command/function also honors the startuptime. 
PASS: Verify pmon-stop auto start after auto-start timeout PASS: Verify System Install PASS: Verify Patching (soak) Change-Id: I9fd7bba8e49fe4c28281539ab4930bdac370ef11 Closes-Bug: #1844724 Signed-off-by: Eric MacDonald --- mtce/src/pmon/pmonFsm.cpp | 38 +++++++++++++++++++------------------- mtce/src/pmon/pmonHdlr.cpp | 27 +++++++++++++-------------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/mtce/src/pmon/pmonFsm.cpp b/mtce/src/pmon/pmonFsm.cpp index dab04ceb..7e9920db 100644 --- a/mtce/src/pmon/pmonFsm.cpp +++ b/mtce/src/pmon/pmonFsm.cpp @@ -562,7 +562,7 @@ int pmon_passive_handler ( process_config_type * ptr ) /* Spawn the process */ case PMON_STAGE__RESPAWN: { - ilog ("%s stability period (%d secs)\n", ptr->process, ptr->debounce ); + dlog ("%s stability period (%d secs)\n", ptr->process, ptr->debounce ); /* Restart the process */ respawn_process ( ptr ) ; @@ -570,20 +570,9 @@ int pmon_passive_handler ( process_config_type * ptr ) /* Start the monitor debounce timer. */ mtcTimer_reset ( ptr->pt_ptr ); - /* Don't wait for the debounce timer to take this process out of 'commanded restart' mode. 
- * Do it now, otherwise tight patch loop stress testing might fail */ - if ( ptr->restart == true ) - { - ilog ("%s Restarted\n", ptr->process ) - ptr->restart = false ; - ptr->registered = false ; - passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ; - } - else - { - mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime ); - passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ; - } + mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime ); + passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ; + break ; } @@ -619,10 +608,21 @@ int pmon_passive_handler ( process_config_type * ptr ) /* clear the monitor debounce counter */ ptr->debounce_cnt = 0 ; - /* Start debounce monitor phase */ - passiveStageChange ( ptr, PMON_STAGE__MONITOR ) ; - process_running ( ptr ); - ilog ("%s Monitor (%d)\n", ptr->process, ptr->pid ); + if ( ptr->restart == true ) + { + ptr->restart = false ; + ptr->registered = false ; + ptr->pid = get_process_pid ( ptr ); + ilog ("%s Restarted (%d)\n", ptr->process, ptr->pid ); + passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ; + } + else + { + /* Start debounce monitor phase */ + passiveStageChange ( ptr, PMON_STAGE__MONITOR ) ; + process_running ( ptr ); + ilog ("%s Monitor (%d)\n", ptr->process, ptr->pid ); + } } ptr->sigchld_rxed = false ; diff --git a/mtce/src/pmon/pmonHdlr.cpp b/mtce/src/pmon/pmonHdlr.cpp index f3bf72e2..59ee37c1 100644 --- a/mtce/src/pmon/pmonHdlr.cpp +++ b/mtce/src/pmon/pmonHdlr.cpp @@ -297,13 +297,16 @@ void manage_process_failure ( process_config_type * ptr ) * If not, then query the current system state and save it. */ if ( _pmon_ctrl_ptr->system_state != MTC_SYSTEM_STATE__STOPPING ) { + elog ("%s failed (%d) (p:%d a:%d)\n", ptr->process, + ptr->pid, + ptr->failed, + ptr->active_failed); /* update current state */ _pmon_ctrl_ptr->system_state = get_system_state(); } - - /* Ignore process failures while in stopping (i.e. 
shutdown) mode */ - if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING ) + else { + /* Ignore process failures while in stopping (i.e. shutdown) mode */ /* don't report process failures during system shutdown. */ wlog ("%s terminated by system shutdown (pid:%d) ; ignoring\n", ptr->process , ptr->pid ); @@ -317,7 +320,6 @@ void manage_process_failure ( process_config_type * ptr ) slog ("%s process is in the stopped state\n", ptr->process); } - elog ("%s failed (%d) (p:%d a:%d)\n", ptr->process, ptr->pid, ptr->failed, ptr->active_failed); passiveStageChange ( ptr, PMON_STAGE__MANAGE) ; if ( ptr->failed == false ) @@ -836,15 +838,12 @@ int process_config_load (process_config_type * pc_ptr, const char * config_file_ int get_process_pid ( process_config_type * ptr ) { int pid = 0 ; - FILE * pid_file_stream = fopen ( ptr->pidfile, "r" ); - if ( pid_file_stream != NULL ) + if ( ptr ) { - int num = fscanf ( pid_file_stream, "%d", &pid); - if ( num != 1 ) + if ( daemon_is_file_present ( ptr->pidfile ) == true ) { - wlog ("fscanf failed to read pid from %s\n", ptr->pidfile ); + pid = daemon_get_file_int ( ptr->pidfile ); } - fclose (pid_file_stream); } return (pid); } @@ -939,7 +938,7 @@ bool kill_running_process ( int pid ) daemon_remove_file ( ptr->pidfile ); } } - wlog ("%s kill succeeded (%d)\n", proc_name_ptr, pid ); + wlog ("%s Killed (%d)\n", proc_name_ptr, pid ); rc = true ; } else @@ -1100,7 +1099,7 @@ int unregister_process ( process_config_type * ptr ) } else { - ilog ("%s unregistered (%d)\n", ptr->process, ptr->pid ); + ilog ("%s Unregister (%d)\n", ptr->process, ptr->pid ); } } ptr->registered = false ; @@ -1202,7 +1201,7 @@ int respawn_process ( process_config_type * ptr ) unregister_process ( ptr ); if ( process_running ( ptr ) == true ) { - ilog ("%s restart of running process\n", ptr->process ); + dlog ("%s still running\n", ptr->process ); restart = true ; kill_running_process ( ptr->pid ); } @@ -1454,7 +1453,7 @@ void daemon_sigchld_hdlr ( 
void ) { if ( process_ptr->status_monitoring == false ) { - dlog ("%s spawn failed (rc:%d:%x) (%ld.%03ld secs)\n", + ilog ("%s spawn failed (rc:%d:%x) (%ld.%03ld secs)\n", process_ptr->process, process_ptr->status, process_ptr->status,