Prevent pmond process recovery when system is not running

The maintenance process monitor (pmon) should only
recover failed processes when the system state is
'running' or 'degraded'.

The current implementation allowed process recovery
for other non-inservice states, including an unknown
state if systemd returns no data on the state query.

This update tighten's up the system state check by
adding retries to the state query utility and
restricting accepted states to 'running' and 'degraded'.

This change then prevents pmon from inadvertently killing
and recovering the mtcClient which indirectly kills off
the mtcClient's fail-safe sysreq reboot child thread
if pmon state query returns anything other than running
or degraded during a shut down.

Change-Id: I605ae8be06f8f8351a51afce98a4f8bae54a40fd
Closes-Bug: 1883519
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-06-15 11:09:47 -04:00
parent 440731a014
commit e379fdfe18
3 changed files with 104 additions and 39 deletions

View File

@ -1835,33 +1835,69 @@ int execute_pipe_cmd(const char *command, char *result, unsigned int result_size
#define PIPE_COMMAND_RESPON_LEN (100)
#endif
system_state_enum get_system_state ( void )
const char * get_system_state_str ( system_state_enum system_state )
{
switch(system_state)
{
case MTC_SYSTEM_STATE__RUNNING: return("running");
case MTC_SYSTEM_STATE__STOPPING: return("stopping");
case MTC_SYSTEM_STATE__STARTING: return("starting");
case MTC_SYSTEM_STATE__DEGRADED: return("degraded");
case MTC_SYSTEM_STATE__INITIALIZING: return("initializing");
case MTC_SYSTEM_STATE__OFFLINE: return("offline");
case MTC_SYSTEM_STATE__MAINTENANCE: return("maintenance");
default: return("unknown");
}
}
system_state_enum get_system_state ( bool verbose )
{
int retry = 0 ;
bool unexpected_response = false ;
string temp = "" ;
system_state_enum system_state = MTC_SYSTEM_STATE__UNKNOWN ;
for ( ; retry < 3 ; retry++ )
{
char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ;
execute_pipe_cmd ( "systemctl is-system-running", &pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN );
execute_pipe_cmd ( "systemctl is-system-running",
&pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN );
if ( strnlen ( pipe_cmd_output, PIPE_COMMAND_RESPON_LEN ) > 0 )
{
ilog ("systemctl reports host as '%s'\n", pipe_cmd_output );
string temp = pipe_cmd_output ;
temp = pipe_cmd_output ;
if ( temp.find ("stopping") != string::npos )
return MTC_SYSTEM_STATE__STOPPING;
if ( temp.find ("running") != string::npos )
return MTC_SYSTEM_STATE__RUNNING;
if ( temp.find ("degraded") != string::npos )
return MTC_SYSTEM_STATE__DEGRADED;
if ( temp.find ("starting") != string::npos )
return MTC_SYSTEM_STATE__STARTING;
if ( temp.find ("initializing") != string::npos )
return MTC_SYSTEM_STATE__INITIALIZING;
if ( temp.find ("offline") != string::npos )
return MTC_SYSTEM_STATE__OFFLINE;
if ( temp.find ("maintenance") != string::npos )
return MTC_SYSTEM_STATE__MAINTENANCE;
slog ("unexpected response: <%s>\n", temp.c_str());
system_state = MTC_SYSTEM_STATE__STOPPING;
else if ( temp.find ("running") != string::npos )
system_state = MTC_SYSTEM_STATE__RUNNING;
else if ( temp.find ("degraded") != string::npos )
system_state = MTC_SYSTEM_STATE__DEGRADED;
else if ( temp.find ("starting") != string::npos )
system_state = MTC_SYSTEM_STATE__STARTING;
else if ( temp.find ("initializing") != string::npos )
system_state = MTC_SYSTEM_STATE__INITIALIZING;
else if ( temp.find ("offline") != string::npos )
system_state = MTC_SYSTEM_STATE__OFFLINE;
else if ( temp.find ("maintenance") != string::npos )
system_state = MTC_SYSTEM_STATE__MAINTENANCE;
else
unexpected_response = true ;
}
if ( system_state != MTC_SYSTEM_STATE__UNKNOWN )
break ;
}
if ( verbose || unexpected_response )
{
if ( unexpected_response )
{
ilog ("systemctl provided unexpected response:'%s'", temp.c_str());
}
else
{
wlog ("systemctl is-system-running yielded no response\n");
ilog ("systemctl reports host in '%s' state (%d)",
get_system_state_str(system_state), retry);
}
return MTC_SYSTEM_STATE__UNKNOWN ;
}
return system_state ;
}

View File

@ -127,6 +127,7 @@ typedef enum
MTC_SYSTEM_STATE__UNKNOWN
} system_state_enum ;
system_state_enum get_system_state ( void );
system_state_enum get_system_state ( bool verbose=true );
const char * get_system_state_str ( system_state_enum system_state );
#endif

View File

@ -1807,7 +1807,7 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
ilog ("Starting 'Degrade Audit' timer (%d secs)\n", degrade_period );
mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, degrade_period );
ilog ("Starting 'Pulse' timer (%d secs)\n", pulse_period );
ilog ("Starting 'Pulse' timer (%d msecs)\n", pulse_period );
mtcTimer_start_msec ( pmonTimer_pulse, pmon_timer_handler, pulse_period );
ilog ("Starting 'Host Watchdog' timer (%d secs)\n", hostwd_period );
@ -1887,17 +1887,6 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
}
}
/* Avoid pmond thrashing trying to recover processes during
* system shutdown. */
if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING )
{
wlog_throttled ( shutdown_log_throttle, 500,
"process monitoring disabled during system shutdown\n");
usleep (500);
continue ;
}
if ( shutdown_log_throttle ) shutdown_log_throttle = 0 ;
if ( inotify_fault == false )
{
if ( get_inotify_events ( ctrl_ptr->fd ) == true )
@ -1992,9 +1981,48 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
_get_events ( );
}
/* Check system state before managing processes.
*
* Prevent process recoverty while not in the
* running or degraded state. */
if (( _pmon_ctrl_ptr->system_state != MTC_SYSTEM_STATE__RUNNING ) &&
( _pmon_ctrl_ptr->system_state != MTC_SYSTEM_STATE__DEGRADED ))
{
system_state_enum system_state = get_system_state(false);
if ( system_state != _pmon_ctrl_ptr->system_state )
{
_pmon_ctrl_ptr->system_state = system_state ;
if (( system_state != MTC_SYSTEM_STATE__RUNNING ) &&
( system_state != MTC_SYSTEM_STATE__DEGRADED ))
{
/* log every state change that is not running / degraded */
wlog ("process monitoring disabled while in '%s' state",
get_system_state_str(system_state));
}
else
{
/* log every state change that is not running / degraded */
wlog ("process monitoring re-enabled while in '%s' state",
get_system_state_str(system_state));
}
}
/* throttle the disabled state during shutdown log */
if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING )
{
wlog_throttled ( shutdown_log_throttle, 60,
"process monitoring disabled during system shutdown\n");
}
sleep (1);
continue ;
}
else if ( shutdown_log_throttle )
shutdown_log_throttle = 0 ;
/* Monitor Processes */
for ( int i = 0 ; i < ctrl_ptr->processes ; i++ )
{
/* Allow a process to be ignored */
if ( process_config[i].ignore == true )
{