Prevent pmond process recovery when system is not running
The maintenance process monitor (pmon) should only recover failed processes when the system state is 'running' or 'degraded'. The current implementation allowed process recovery for other non-inservice states, including an unknown state if systemd returns no data on the state query. This update tighten's up the system state check by adding retries to the state query utility and restricting accepted states to 'running' and 'degraded'. This change then prevents pmon from inadvertently killing and recovering the mtcClient which indirectly kills off the mtcClient's fail-safe sysreq reboot child thread if pmon state query returns anything other than running or degraded during a shut down. Change-Id: I605ae8be06f8f8351a51afce98a4f8bae54a40fd Closes-Bug: 1883519 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
440731a014
commit
e379fdfe18
@ -1835,33 +1835,69 @@ int execute_pipe_cmd(const char *command, char *result, unsigned int result_size
|
||||
#define PIPE_COMMAND_RESPON_LEN (100)
|
||||
#endif
|
||||
|
||||
system_state_enum get_system_state ( void )
|
||||
const char * get_system_state_str ( system_state_enum system_state )
|
||||
{
|
||||
char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ;
|
||||
execute_pipe_cmd ( "systemctl is-system-running", &pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN );
|
||||
if ( strnlen ( pipe_cmd_output, PIPE_COMMAND_RESPON_LEN ) > 0 )
|
||||
switch(system_state)
|
||||
{
|
||||
ilog ("systemctl reports host as '%s'\n", pipe_cmd_output );
|
||||
string temp = pipe_cmd_output ;
|
||||
if ( temp.find ("stopping") != string::npos )
|
||||
return MTC_SYSTEM_STATE__STOPPING;
|
||||
if ( temp.find ("running") != string::npos )
|
||||
return MTC_SYSTEM_STATE__RUNNING;
|
||||
if ( temp.find ("degraded") != string::npos )
|
||||
return MTC_SYSTEM_STATE__DEGRADED;
|
||||
if ( temp.find ("starting") != string::npos )
|
||||
return MTC_SYSTEM_STATE__STARTING;
|
||||
if ( temp.find ("initializing") != string::npos )
|
||||
return MTC_SYSTEM_STATE__INITIALIZING;
|
||||
if ( temp.find ("offline") != string::npos )
|
||||
return MTC_SYSTEM_STATE__OFFLINE;
|
||||
if ( temp.find ("maintenance") != string::npos )
|
||||
return MTC_SYSTEM_STATE__MAINTENANCE;
|
||||
slog ("unexpected response: <%s>\n", temp.c_str());
|
||||
case MTC_SYSTEM_STATE__RUNNING: return("running");
|
||||
case MTC_SYSTEM_STATE__STOPPING: return("stopping");
|
||||
case MTC_SYSTEM_STATE__STARTING: return("starting");
|
||||
case MTC_SYSTEM_STATE__DEGRADED: return("degraded");
|
||||
case MTC_SYSTEM_STATE__INITIALIZING: return("initializing");
|
||||
case MTC_SYSTEM_STATE__OFFLINE: return("offline");
|
||||
case MTC_SYSTEM_STATE__MAINTENANCE: return("maintenance");
|
||||
default: return("unknown");
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("systemctl is-system-running yielded no response\n");
|
||||
}
|
||||
return MTC_SYSTEM_STATE__UNKNOWN ;
|
||||
}
|
||||
|
||||
system_state_enum get_system_state ( bool verbose )
|
||||
{
|
||||
|
||||
int retry = 0 ;
|
||||
bool unexpected_response = false ;
|
||||
string temp = "" ;
|
||||
system_state_enum system_state = MTC_SYSTEM_STATE__UNKNOWN ;
|
||||
for ( ; retry < 3 ; retry++ )
|
||||
{
|
||||
char pipe_cmd_output [PIPE_COMMAND_RESPON_LEN] ;
|
||||
execute_pipe_cmd ( "systemctl is-system-running",
|
||||
&pipe_cmd_output[0], PIPE_COMMAND_RESPON_LEN );
|
||||
if ( strnlen ( pipe_cmd_output, PIPE_COMMAND_RESPON_LEN ) > 0 )
|
||||
{
|
||||
temp = pipe_cmd_output ;
|
||||
if ( temp.find ("stopping") != string::npos )
|
||||
system_state = MTC_SYSTEM_STATE__STOPPING;
|
||||
else if ( temp.find ("running") != string::npos )
|
||||
system_state = MTC_SYSTEM_STATE__RUNNING;
|
||||
else if ( temp.find ("degraded") != string::npos )
|
||||
system_state = MTC_SYSTEM_STATE__DEGRADED;
|
||||
else if ( temp.find ("starting") != string::npos )
|
||||
system_state = MTC_SYSTEM_STATE__STARTING;
|
||||
else if ( temp.find ("initializing") != string::npos )
|
||||
system_state = MTC_SYSTEM_STATE__INITIALIZING;
|
||||
else if ( temp.find ("offline") != string::npos )
|
||||
system_state = MTC_SYSTEM_STATE__OFFLINE;
|
||||
else if ( temp.find ("maintenance") != string::npos )
|
||||
system_state = MTC_SYSTEM_STATE__MAINTENANCE;
|
||||
else
|
||||
unexpected_response = true ;
|
||||
}
|
||||
|
||||
if ( system_state != MTC_SYSTEM_STATE__UNKNOWN )
|
||||
break ;
|
||||
}
|
||||
|
||||
if ( verbose || unexpected_response )
|
||||
{
|
||||
if ( unexpected_response )
|
||||
{
|
||||
ilog ("systemctl provided unexpected response:'%s'", temp.c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("systemctl reports host in '%s' state (%d)",
|
||||
get_system_state_str(system_state), retry);
|
||||
}
|
||||
}
|
||||
return system_state ;
|
||||
}
|
||||
|
@ -127,6 +127,7 @@ typedef enum
|
||||
MTC_SYSTEM_STATE__UNKNOWN
|
||||
} system_state_enum ;
|
||||
|
||||
system_state_enum get_system_state ( void );
|
||||
system_state_enum get_system_state ( bool verbose=true );
|
||||
const char * get_system_state_str ( system_state_enum system_state );
|
||||
|
||||
#endif
|
||||
|
@ -1807,7 +1807,7 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
|
||||
ilog ("Starting 'Degrade Audit' timer (%d secs)\n", degrade_period );
|
||||
mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, degrade_period );
|
||||
|
||||
ilog ("Starting 'Pulse' timer (%d secs)\n", pulse_period );
|
||||
ilog ("Starting 'Pulse' timer (%d msecs)\n", pulse_period );
|
||||
mtcTimer_start_msec ( pmonTimer_pulse, pmon_timer_handler, pulse_period );
|
||||
|
||||
ilog ("Starting 'Host Watchdog' timer (%d secs)\n", hostwd_period );
|
||||
@ -1887,17 +1887,6 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
|
||||
}
|
||||
}
|
||||
|
||||
/* Avoid pmond thrashing trying to recover processes during
|
||||
* system shutdown. */
|
||||
if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING )
|
||||
{
|
||||
wlog_throttled ( shutdown_log_throttle, 500,
|
||||
"process monitoring disabled during system shutdown\n");
|
||||
usleep (500);
|
||||
continue ;
|
||||
}
|
||||
if ( shutdown_log_throttle ) shutdown_log_throttle = 0 ;
|
||||
|
||||
if ( inotify_fault == false )
|
||||
{
|
||||
if ( get_inotify_events ( ctrl_ptr->fd ) == true )
|
||||
@ -1992,9 +1981,48 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
|
||||
_get_events ( );
|
||||
}
|
||||
|
||||
/* Check system state before managing processes.
|
||||
*
|
||||
* Prevent process recoverty while not in the
|
||||
* running or degraded state. */
|
||||
if (( _pmon_ctrl_ptr->system_state != MTC_SYSTEM_STATE__RUNNING ) &&
|
||||
( _pmon_ctrl_ptr->system_state != MTC_SYSTEM_STATE__DEGRADED ))
|
||||
{
|
||||
system_state_enum system_state = get_system_state(false);
|
||||
if ( system_state != _pmon_ctrl_ptr->system_state )
|
||||
{
|
||||
_pmon_ctrl_ptr->system_state = system_state ;
|
||||
if (( system_state != MTC_SYSTEM_STATE__RUNNING ) &&
|
||||
( system_state != MTC_SYSTEM_STATE__DEGRADED ))
|
||||
{
|
||||
/* log every state change that is not running / degraded */
|
||||
wlog ("process monitoring disabled while in '%s' state",
|
||||
get_system_state_str(system_state));
|
||||
}
|
||||
else
|
||||
{
|
||||
/* log every state change that is not running / degraded */
|
||||
wlog ("process monitoring re-enabled while in '%s' state",
|
||||
get_system_state_str(system_state));
|
||||
}
|
||||
}
|
||||
|
||||
/* throttle the disabled state during shutdown log */
|
||||
if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING )
|
||||
{
|
||||
wlog_throttled ( shutdown_log_throttle, 60,
|
||||
"process monitoring disabled during system shutdown\n");
|
||||
}
|
||||
sleep (1);
|
||||
continue ;
|
||||
}
|
||||
else if ( shutdown_log_throttle )
|
||||
shutdown_log_throttle = 0 ;
|
||||
|
||||
/* Monitor Processes */
|
||||
for ( int i = 0 ; i < ctrl_ptr->processes ; i++ )
|
||||
{
|
||||
|
||||
/* Allow a process to be ignored */
|
||||
if ( process_config[i].ignore == true )
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user