From e379fdfe189f067445daeac69e8997192c8f0aed Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Mon, 15 Jun 2020 11:09:47 -0400 Subject: [PATCH] Prevent pmond process recovery when system is not running The maintenance process monitor (pmon) should only recover failed processes when the system state is 'running' or 'degraded'. The current implementation allowed process recovery for other non-inservice states, including an unknown state if systemd returns no data on the state query. This update tighten's up the system state check by adding retries to the state query utility and restricting accepted states to 'running' and 'degraded'. This change then prevents pmon from inadvertently killing and recovering the mtcClient which indirectly kills off the mtcClient's fail-safe sysreq reboot child thread if pmon state query returns anything other than running or degraded during a shut down. Change-Id: I605ae8be06f8f8351a51afce98a4f8bae54a40fd Closes-Bug: 1883519 Signed-off-by: Eric MacDonald --- mtce-common/src/common/nodeUtil.cpp | 88 ++++++++++++++++++++--------- mtce-common/src/common/nodeUtil.h | 3 +- mtce/src/pmon/pmonHdlr.cpp | 52 +++++++++++++---- 3 files changed, 104 insertions(+), 39 deletions(-) diff --git a/mtce-common/src/common/nodeUtil.cpp b/mtce-common/src/common/nodeUtil.cpp index 460a3fe7..bc245703 100755 --- a/mtce-common/src/common/nodeUtil.cpp +++ b/mtce-common/src/common/nodeUtil.cpp @@ -1835,33 +1835,69 @@ int execute_pipe_cmd(const char *command, char *result, unsigned int result_size #define PIPE_COMMAND_RESPON_LEN (100) #endif -system_state_enum get_system_state ( void ) +const char * get_system_state_str ( system_state_enum system_state ) { - char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ; - execute_pipe_cmd ( "systemctl is-system-running", &pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN ); - if ( strnlen ( pipe_cmd_output, PIPE_COMMAND_RESPON_LEN ) > 0 ) + switch(system_state) { - ilog ("systemctl reports host as '%s'\n", pipe_cmd_output ); - string temp = pipe_cmd_output ; - if ( temp.find ("stopping") != string::npos ) - return MTC_SYSTEM_STATE__STOPPING; - if ( temp.find ("running") != string::npos ) - return MTC_SYSTEM_STATE__RUNNING; - if ( temp.find ("degraded") != string::npos ) - return MTC_SYSTEM_STATE__DEGRADED; - if ( temp.find ("starting") != string::npos ) - return MTC_SYSTEM_STATE__STARTING; - if ( temp.find ("initializing") != string::npos ) - return MTC_SYSTEM_STATE__INITIALIZING; - if ( temp.find ("offline") != string::npos ) - return MTC_SYSTEM_STATE__OFFLINE; - if ( temp.find ("maintenance") != string::npos ) - return MTC_SYSTEM_STATE__MAINTENANCE; - slog ("unexpected response: <%s>\n", temp.c_str()); + case MTC_SYSTEM_STATE__RUNNING: return("running"); + case MTC_SYSTEM_STATE__STOPPING: return("stopping"); + case MTC_SYSTEM_STATE__STARTING: return("starting"); + case MTC_SYSTEM_STATE__DEGRADED: return("degraded"); + case MTC_SYSTEM_STATE__INITIALIZING: return("initializing"); + case MTC_SYSTEM_STATE__OFFLINE: return("offline"); + case MTC_SYSTEM_STATE__MAINTENANCE: return("maintenance"); + default: return("unknown"); } - else - { - wlog ("systemctl is-system-running yielded no response\n"); - } - return MTC_SYSTEM_STATE__UNKNOWN ; +} + +system_state_enum get_system_state ( bool verbose ) +{ + + int retry = 0 ; + bool unexpected_response = false ; + string temp = "" ; + system_state_enum system_state = MTC_SYSTEM_STATE__UNKNOWN ; + for ( ; retry < 3 ; retry++ ) + { + char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ; + execute_pipe_cmd ( "systemctl is-system-running", + &pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN ); + if ( strnlen ( pipe_cmd_output, PIPE_COMMAND_RESPON_LEN ) > 0 ) + { + temp = pipe_cmd_output ; + if ( temp.find ("stopping") != string::npos ) + system_state = MTC_SYSTEM_STATE__STOPPING; + else if ( temp.find ("running") != string::npos ) + system_state = MTC_SYSTEM_STATE__RUNNING; + else if ( temp.find ("degraded") != string::npos ) + system_state = MTC_SYSTEM_STATE__DEGRADED; + else if ( temp.find ("starting") != string::npos ) + system_state = MTC_SYSTEM_STATE__STARTING; + else if ( temp.find ("initializing") != string::npos ) + system_state = MTC_SYSTEM_STATE__INITIALIZING; + else if ( temp.find ("offline") != string::npos ) + system_state = MTC_SYSTEM_STATE__OFFLINE; + else if ( temp.find ("maintenance") != string::npos ) + system_state = MTC_SYSTEM_STATE__MAINTENANCE; + else + unexpected_response = true ; + } + + if ( system_state != MTC_SYSTEM_STATE__UNKNOWN ) + break ; + } + + if ( verbose || unexpected_response ) + { + if ( unexpected_response ) + { + ilog ("systemctl provided unexpected response:'%s'", temp.c_str()); + } + else + { + ilog ("systemctl reports host in '%s' state (%d)", + get_system_state_str(system_state), retry); + } + } + return system_state ; } diff --git a/mtce-common/src/common/nodeUtil.h b/mtce-common/src/common/nodeUtil.h index 00a6828b..c56f3796 100755 --- a/mtce-common/src/common/nodeUtil.h +++ b/mtce-common/src/common/nodeUtil.h @@ -127,6 +127,7 @@ typedef enum MTC_SYSTEM_STATE__UNKNOWN } system_state_enum ; -system_state_enum get_system_state ( void ); +system_state_enum get_system_state ( bool verbose=true ); +const char * get_system_state_str ( system_state_enum system_state ); #endif diff --git a/mtce/src/pmon/pmonHdlr.cpp b/mtce/src/pmon/pmonHdlr.cpp index 7b5977d1..7ab0a8ee 100644 --- a/mtce/src/pmon/pmonHdlr.cpp +++ b/mtce/src/pmon/pmonHdlr.cpp @@ -1807,7 +1807,7 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) ilog ("Starting 'Degrade Audit' timer (%d secs)\n", degrade_period ); mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, degrade_period ); - ilog ("Starting 'Pulse' timer (%d secs)\n", pulse_period ); + ilog ("Starting 'Pulse' timer (%d msecs)\n", pulse_period ); mtcTimer_start_msec ( pmonTimer_pulse, pmon_timer_handler, pulse_period ); ilog ("Starting 'Host Watchdog' timer (%d secs)\n", hostwd_period ); @@ -1887,17 +1887,6 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) } } - /* Avoid pmond thrashing trying to recover processes during - * system shutdown. */ - if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING ) - { - wlog_throttled ( shutdown_log_throttle, 500, - "process monitoring disabled during system shutdown\n"); - usleep (500); - continue ; - } - if ( shutdown_log_throttle ) shutdown_log_throttle = 0 ; - if ( inotify_fault == false ) { if ( get_inotify_events ( ctrl_ptr->fd ) == true ) @@ -1992,9 +1981,48 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) _get_events ( ); } + /* Check system state before managing processes. + * + * Prevent process recoverty while not in the + * running or degraded state. */ + if (( _pmon_ctrl_ptr->system_state != MTC_SYSTEM_STATE__RUNNING ) && + ( _pmon_ctrl_ptr->system_state != MTC_SYSTEM_STATE__DEGRADED )) + { + system_state_enum system_state = get_system_state(false); + if ( system_state != _pmon_ctrl_ptr->system_state ) + { + _pmon_ctrl_ptr->system_state = system_state ; + if (( system_state != MTC_SYSTEM_STATE__RUNNING ) && + ( system_state != MTC_SYSTEM_STATE__DEGRADED )) + { + /* log every state change that is not running / degraded */ + wlog ("process monitoring disabled while in '%s' state", + get_system_state_str(system_state)); + } + else + { + /* log every state change that is not running / degraded */ + wlog ("process monitoring re-enabled while in '%s' state", + get_system_state_str(system_state)); + } + } + + /* throttle the disabled state during shutdown log */ + if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING ) + { + wlog_throttled ( shutdown_log_throttle, 60, + "process monitoring disabled during system shutdown\n"); + } + sleep (1); + continue ; + } + else if ( shutdown_log_throttle ) + shutdown_log_throttle = 0 ; + /* Monitor Processes */ for ( int i = 0 ; i < ctrl_ptr->processes ; i++ ) { + /* Allow a process to be ignored */ if ( process_config[i].ignore == true ) {