metal/mtce/src/pmon/pmonFsm.cpp

1142 lines
41 KiB
C++

/*
* Copyright (c) 2013-2016 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
* @file
* Wind River CGCS Platform Process Monitor Service
* Passive and Active Monitoring FSMs.
*/
#include "pmon.h"
#include "alarmUtil.h" /* for ... alarmUtil_getSev_str */
#define WARNING_THROTTLE (10)
/* Human readable names for the passive monitoring FSM stages.
 * Indexed by passiveStage_enum ; order and count must match the enum
 * declared in pmon.h.
 * NOTE(review): the passive handler also references PMON_STAGE__TIMER_WAIT
 * which has no obviously matching entry below - confirm this table is in
 * sync with the enum. */
const char passiveStages_str [PMON_STAGE__STAGES][32] =
{
"Start",
"Manage",
"Respawn",
"Monitor-Wait",
"Monitor-Process",
"Restart-Wait",
"Ignore-Process",
"Handler-Finish",
"Subfunction-Polling",
"Monitor-Start-Wait",
} ;
/* Human readable names for the status monitoring FSM stages.
 * Indexed by statusStage_enum ; order and count must match the enum
 * declared in pmon.h. */
const char statusStages_str [STATUS_STAGE__STAGES][32] =
{
"Begin",
"Execute-Status",
"Execute-Status-Wait",
"Execute-Start",
"Execute-Start-Wait",
"Interval-Wait"
} ;
/* statusStageChange
 *
 * Move the status monitoring FSM for the supplied process to 'newStage'.
 *
 * Both the current and the requested stage must be in range ; a valid
 * transition is trace logged and PASS is returned.
 *
 * An out-of-range stage is reported as a software error and the FSM is
 * recovered to STATUS_STAGE__BEGIN with a FAIL return. */
int statusStageChange ( process_config_type * ptr , statusStage_enum newStage )
{
    /* Guard: reject any out-of-range current or requested stage. */
    if (( newStage >= STATUS_STAGE__STAGES ) ||
        ( ptr->status_stage >= STATUS_STAGE__STAGES ))
    {
        slog ("%s Invalid Stage (now:%d new:%d)\n", ptr->process, ptr->status_stage, newStage );
        ptr->status_stage = STATUS_STAGE__BEGIN ;
        return (FAIL);
    }

    clog ("%s %s -> %s (%d->%d)\n",
              ptr->process,
              statusStages_str[ptr->status_stage],
              statusStages_str[newStage],
              ptr->status_stage, newStage);

    ptr->status_stage = newStage ;
    return (PASS);
}
/* passiveStageChange
 *
 * Move the passive monitoring FSM for the supplied process to 'newStage'.
 *
 * Both the current and the requested stage must be in range ; a valid
 * transition is trace logged and PASS is returned.
 *
 * An out-of-range stage is reported as a software error and the FSM is
 * recovered to PMON_STAGE__FINISH with a FAIL return. */
int passiveStageChange ( process_config_type * ptr , passiveStage_enum newStage )
{
    /* Guard: reject any out-of-range current or requested stage. */
    if (( newStage >= PMON_STAGE__STAGES ) ||
        ( ptr->stage >= PMON_STAGE__STAGES ))
    {
        slog ("%s Invalid Stage (now:%d new:%d)\n",
                  ptr->process, ptr->stage, newStage );
        ptr->stage = PMON_STAGE__FINISH ;
        return (FAIL);
    }

    clog ("%s %s -> %s (%d->%d)\n",
              ptr->process,
              passiveStages_str[ptr->stage],
              passiveStages_str[newStage],
              ptr->stage, newStage);

    ptr->stage = newStage ;
    return (PASS);
}
/* Return the name of this process' current passive monitoring stage
 * or NULL if the current stage is out of range. */
const char * get_pmonStage_str ( process_config_type * ptr )
{
    if ( ptr->stage >= PMON_STAGE__STAGES )
        return (NULL);
    return ( passiveStages_str[ptr->stage] );
}
/* Human readable names for the active monitoring FSM stages.
 * Indexed by activeStage_enum ; order and count must match the enum
 * declared in pmon.h. */
const char activeStages_str [ACTIVE_STAGE__STAGES][32] =
{
"Idle", /* 0 */
"Start", /* 1 */
"Request", /* 2 */
"Wait", /* 3 */
"Response", /* 4 */
"Gap-Setup", /* 5 */
"Gap", /* 6 */
"Failed", /* 7 */
"Debounce-Setup", /* 8 */
"Debounce", /* 9 */
"Finish", /* 10 */
} ;
/* activeStageChange
 *
 * Move the active monitoring FSM for the supplied process to 'newStage'.
 *
 * Both the current and the requested stage must be in range ; a valid
 * transition is trace logged and PASS is returned.
 *
 * An out-of-range stage is reported as a software error and the FSM is
 * recovered to ACTIVE_STAGE__PULSE_REQUEST with a FAIL return. */
int activeStageChange ( process_config_type * ptr , activeStage_enum newStage )
{
    /* Guard: reject any out-of-range current or requested stage. */
    if (( newStage >= ACTIVE_STAGE__STAGES ) ||
        ( ptr->active_stage >= ACTIVE_STAGE__STAGES ))
    {
        slog ("%s Invalid Stage (now:%d new:%d)\n",
                  ptr->process, ptr->active_stage, newStage );
        ptr->active_stage = ACTIVE_STAGE__PULSE_REQUEST ;
        return (FAIL);
    }

    clog ("%s %s -> %s (%d->%d)\n",
              ptr->process,
              activeStages_str[ptr->active_stage],
              activeStages_str[newStage],
              ptr->active_stage, newStage);

    ptr->active_stage = newStage ;
    return (PASS);
}
/* Return the name of this process' current active monitoring stage
 * or NULL if the current stage is out of range. */
const char * get_amonStage_str ( process_config_type * ptr )
{
    if ( ptr->active_stage >= ACTIVE_STAGE__STAGES )
        return (NULL);
    return ( activeStages_str[ptr->active_stage] );
}
/* Reset this process' active monitoring error counters.
 * The back-to-back miss and message error high-water marks are
 * captured before their working counters are zeroed. */
void clear_amon_counts ( process_config_type * ptr )
{
    /* record peaks before clearing the working counts */
    if ( ptr->b2b_miss_peak < ptr->b2b_miss_count )
        ptr->b2b_miss_peak = ptr->b2b_miss_count ;
    if ( ptr->mesg_err_peak < ptr->mesg_err_cnt )
        ptr->mesg_err_peak = ptr->mesg_err_cnt ;

    ptr->mesg_err_cnt   = 0 ;
    ptr->recv_err_cnt   = 0 ;
    ptr->send_err_cnt   = 0 ;
    ptr->b2b_miss_count = 0 ;
}
/* Active Monitoring Handler
 * --------------------------
 * Drives the active monitoring FSM for one process : sends periodic
 * pulse requests over the process' active monitoring socket and
 * verifies that matching pulse responses are received. Back-to-back
 * misses beyond the configured threshold declare the process failed
 * and hand recovery off to manage_process_failure.
 *
 * @param ptr  pointer to the process' monitor configuration/state record
 * @return     PASS (rc is never changed to a failure after the
 *             START_MONITOR open attempt is handled in-line) */
int pmon_active_handler ( process_config_type * ptr )
{
int rc = PASS ;
/* Trace a valid stage ; force-correct an out-of-range one. */
if ( ptr->active_stage < ACTIVE_STAGE__STAGES )
{
dlog2 ("%s Active-%s Stage %d\n",
ptr->process,
activeStages_str[ptr->active_stage],
ptr->active_stage );
}
else
{
activeStageChange ( ptr, ACTIVE_STAGE__PULSE_REQUEST );
}
switch ( ptr->active_stage )
{
/* Nothing to do until the FSM is started. */
case ACTIVE_STAGE__IDLE:
{
break ;
}
/* Open the active monitoring socket ; a failure is treated as an
 * active monitoring failure of the process itself. */
case ACTIVE_STAGE__START_MONITOR:
{
rc = open_process_socket ( ptr );
if ( rc != PASS )
{
ptr->active_failed = true ;
elog ("%s 'open_process_socket' failed (%d)\n", ptr->process , rc );
manage_process_failure ( ptr );
}
activeStageChange ( ptr, ACTIVE_STAGE__GAP_SETUP );
break ;
}
/* Start the inter-pulse 'gap' timer (one monitor period). */
case ACTIVE_STAGE__GAP_SETUP:
{
mtcTimer_reset ( ptr->pt_ptr );
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->period );
activeStageChange ( ptr, ACTIVE_STAGE__GAP_WAIT );
break ;
}
/* Wait out the gap timer before issuing the next pulse request. */
case ACTIVE_STAGE__GAP_WAIT:
{
if ( ptr->pt_ptr->ring == true )
{
activeStageChange ( ptr, ACTIVE_STAGE__PULSE_REQUEST );
}
break ;
}
/* Active monitoring has declared the process failed ;
 * record the failure and hand off to failure management. */
case ACTIVE_STAGE__FAILED:
{
ptr->active_response = false ;
ptr->active_failed = true ;
ptr->afailed_count++ ;
ptr->b2b_miss_count = 0 ;
mtcTimer_reset ( ptr->pt_ptr );
manage_process_failure ( ptr );
/* Stage change is handled elsewhere */
break ;
}
case ACTIVE_STAGE__REQUEST_WAIT:
{
if ( ptr->pt_ptr->ring == true )
{
activeStageChange ( ptr, ACTIVE_STAGE__PULSE_REQUEST );
}
break ;
}
/* Send a pulse request ; on send failure count the miss and either
 * fail (threshold reached) or retry after another gap period. */
case ACTIVE_STAGE__PULSE_REQUEST:
{
ptr->waiting = true ;
if ( amon_send_request ( ptr ) != PASS )
{
ptr->waiting = false ;
ptr->send_err_cnt++ ;
wlog ("%s pulse request send failed (%d:%d)\n",
ptr->process,
ptr->b2b_miss_count,
ptr->send_err_cnt );
if ( ++ptr->b2b_miss_count >= ptr->threshold )
{
activeStageChange ( ptr, ACTIVE_STAGE__FAILED );
}
else
{
activeStageChange ( ptr, ACTIVE_STAGE__GAP_SETUP );
}
}
else
{
ptr->pulse_count++ ;
/* arm the response timeout timer */
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->timeout );
activeStageChange ( ptr, ACTIVE_STAGE__PULSE_RESPONSE );
}
break ;
}
/* Process the pulse response (rx_sequence is set by the receive
 * path) and manage sequence mismatches, misses and debounce. */
case ACTIVE_STAGE__PULSE_RESPONSE:
{
if ( ptr->rx_sequence != 0 )
{
/* handle the first response */
if ( ptr->active_response == false )
{
ptr->active_response = true ;
}
/* a response whose sequence does not match the request
 * counts as a miss and a message error */
if ( ptr->rx_sequence != ptr->tx_sequence )
{
ptr->b2b_miss_count++ ;
ptr->mesg_err_cnt++ ;
wlog ( "%s out-of-sequence response (%d:%d)\n",
ptr->process ,
ptr->tx_sequence,
ptr->rx_sequence);
if ( ptr->b2b_miss_count >= ptr->threshold )
{
activeStageChange ( ptr, ACTIVE_STAGE__FAILED );
}
}
else
{
/* good response ; capture the miss peak and clear the count */
if ( ptr->b2b_miss_count > ptr->b2b_miss_peak )
ptr->b2b_miss_peak = ptr->b2b_miss_count ;
ptr->b2b_miss_count = 0 ;
if ( ptr->active_debounce == true )
{
ilog ("%s is healthy (debouncing)\n", ptr->process );
}
else
{
mlog2 ("%s is healthy\n", ptr->process );
}
}
/* manage active monitoring debounce ; once enough healthy
 * periods accumulate, clear all failure state and the alarm */
if ( ptr->active_debounce == true )
{
if ( ++ptr->adebounce_cnt >= ((ptr->period+1)) )
{
ilog ("%s Debounced (%d)\n", ptr->process, ptr->pid );
ptr->active_debounce = false;
ptr->adebounce_cnt = 0 ;
ptr->restarts_cnt = 0 ;
ptr->quorum_failure = false;
ptr->quorum_unrecoverable = false;
clear_amon_counts ( ptr );
ptr->active_failed = false ;
manage_alarm ( ptr , PMON_CLEAR );
}
}
ptr->rx_sequence = 0 ;
/* Wait out the remaining part of the period */
ptr->waiting = false ;
}
/* response timeout timer fired */
if ( ptr->pt_ptr->ring == true )
{
/* Are we still waiting for a response ? */
if ( ptr->waiting == true )
{
/* handle case where response is delayed due to goenabled */
if ( ptr->full_init_reqd &&
!( is_goenabled ( get_ctrl_ptr()->nodetype, true ) )
)
{
/* we don't expect a response... do nothing to wait
another loop */
}
else
{
ptr->recv_err_cnt++ ;
/* don't log the first single pulse miss. */
if ( ptr->b2b_miss_count++ > 1 )
{
wlog ("%s missing pulse response (Miss:%d) (%d:%d)\n",
ptr->process,
ptr->b2b_miss_count,
ptr->tx_sequence,
ptr->rx_sequence);
}
if ( ptr->b2b_miss_count >= ptr->threshold )
{
/*****************************************************
* Only fail active heartbeating after MTC_MINS_3 of
* never having received a response.
*
* This condition is added to address an issue
* reported where the kernel takes a
* long time to timeout on external dns nameservers
* after a DOR when the system is isolated from the
* external network.
****************************************************/
if (( ptr->active_response == false ) &&
( ptr->period < MTC_MINS_3 ) &&
( ptr->b2b_miss_count < (MTC_MINS_3/ptr->period )))
{
; /* more forgiving startup handling */
}
else
{
activeStageChange ( ptr, ACTIVE_STAGE__FAILED );
break ;
}
}
}
}
/* timer rang (response received or missed) ; request again */
activeStageChange ( ptr, ACTIVE_STAGE__PULSE_REQUEST );
break ;
}
else if ( ptr->waiting == false )
{
; /* got the data ; just wait out the timer */
}
break ;
}
default:
{
activeStageChange ( ptr, ACTIVE_STAGE__GAP_SETUP );
break ;
}
}
return (rc);
}
/* Passive Monitoring Handler
 * --------------------------
 * Drives the passive monitoring FSM for one process : verifies the
 * process has an active pid, manages restart/alarm thresholds, respawns
 * failed processes, debounces recoveries and handles the delayed
 * (subfunction-gated) monitoring startup cases.
 *
 * @param ptr  pointer to the process' monitor configuration/state record
 * @return     RETRY while the FSM is mid-recovery ; PASS once the
 *             FINISH stage completes */
int pmon_passive_handler ( process_config_type * ptr )
{
int rc = RETRY ;
/* Trace a valid stage ; force-correct an out-of-range one. */
if ( ptr->stage < PMON_STAGE__STAGES )
{
flog ("%s %s Stage %d\n", ptr->process, passiveStages_str[ptr->stage], ptr->stage );
}
else
{
slog ("%s Invalid stage (%d) ; correcting\n", ptr->process, ptr->stage );
passiveStageChange ( ptr, PMON_STAGE__FINISH );
}
switch ( ptr->stage )
{
/* Idle/entry stage ; just trace current state. */
case PMON_STAGE__START:
{
dlog ( "%s failed:%d severity:%s restarts_cnt:%d debounce_cnt:%d\n",
ptr->process,
ptr->failed,
alarmUtil_getSev_str(ptr->alarm_severity).c_str(),
ptr->restarts_cnt,
ptr->debounce_cnt);
ptr->stage_cnt = 0 ;
break ;
}
/* Manage Restart Counts */
case PMON_STAGE__MANAGE:
{
if ( ptr->restart == true )
{
pmon_ctrl_type * ctrl_ptr = get_ctrl_ptr() ;
if ( ctrl_ptr->patching_in_progress == true )
{
/* if patching is in progress and we get a process restart command
* then that means the rpms have all been installed already so we
* can exit patching in progress state */
ctrl_ptr->patching_in_progress = false ;
}
; /* fall through and just change state at the bottom */
}
/* Handle Critical processes.
* Critical with 0 restarts
* Critical with # restarts
* Req'ts:
* 1. Avoid re-reporting the event
* 2. Send minor for first occurrence
* 3. Try restarts if it supports it
* 4. Stay in this stage once the max restarts has been reached.
*/
else if ( ptr->sev == SEVERITY_CRITICAL )
{
/* handle the No-restarts case */
/* Go straight to event assertion */
if ( ptr->restarts == 0 )
{
manage_alarm ( ptr, PMON_ASSERT );
/* Send critical notification */
pmon_send_event ( MTC_EVENT_PMON_CRIT, ptr );
wlog ("%s auto-restart disabled\n", ptr->process );
passiveStageChange ( ptr, PMON_STAGE__IGNORE ) ;
/* if process is in quorum, and we're not trying to restart
* it, we declare the quorum failed */
if ( ptr->quorum )
{
quorum_process_failure ( ptr );
}
break ;
}
else if ( ptr->restarts_cnt >= ptr->restarts )
{
manage_alarm ( ptr, PMON_ASSERT );
/* Send critical notification */
pmon_send_event ( MTC_EVENT_PMON_CRIT, ptr );
ptr->restarts_cnt = 0 ;
ilog ("%s allowing auto-restart of failed critical process\n", ptr->process);
/* if process is in quorum, and we haven't been able to
* restart it, we declare the quorum failed */
if ( ptr->quorum )
{
quorum_process_failure ( ptr );
}
/* Note: the above clear of restarts_cnt and commented break below
* forces pmond to try and continue to recover the failing
* critical process if for some reason the host does not
* go through a reboot */
/* avoid stage change below and wait for the reboot */
// break ;
}
else
{
/* Send a restart log to maintenance on the first restart only */
if ( ptr->restarts_cnt == 0 )
{
manage_alarm ( ptr, PMON_LOG );
}
/* Try and recover if the process is critical but
* supports some number of restart attempts first */
}
}
/* Send a log on the first restart */
/* */
/* Note: This clause needs to be before the next one */
/* to handle the restarts = 0 case */
else if (( ptr->restarts_cnt == 0 ) && ( ptr->restarts != 0 ))
{
ilog ("%s Sending Log Event to Maintenance\n", ptr->process );
/* Send a log on the first one or every time
* we start a fresh restart cycle */
manage_alarm ( ptr, PMON_LOG );
}
else if (( ptr->restarts_cnt == 0 ) && ( ptr->restarts == 0 ))
{
/* Auto recovery is disabled, generate a log and raise a minor alarm */
wlog ("%s Sending Log Event to Maintenance\n", ptr->process );
manage_alarm ( ptr, PMON_LOG );
manage_alarm ( ptr, PMON_ASSERT );
wlog ("%s Auto-Restart Disabled ... but monitoring for recovery\n", ptr->process );
/* if process is in quorum, and we're not trying to
* restart it, we declare the quorum failed */
if ( ptr->quorum )
{
ptr->quorum_failure = true;
quorum_process_failure ( ptr );
}
passiveStageChange ( ptr, PMON_STAGE__IGNORE ) ;
break ;
}
/* Manage notification based on restart
* threshold for non-critical processes */
else if ( ptr->restarts_cnt >= ptr->restarts )
{
/* Restart threshold reached ; sending event to maintenance */
manage_alarm ( ptr, PMON_ASSERT );
/* Start the counts again */
ptr->restarts_cnt = 0 ;
ptr->debounce_cnt = 0 ;
/* if process is in quorum, and we haven't been able to
* restart it, we declare the quorum failed */
if ( ptr->quorum )
{
quorum_process_failure ( ptr );
}
}
passiveStageChange ( ptr, PMON_STAGE__RESPAWN ) ;
break ;
}
/* Spawn the process */
case PMON_STAGE__RESPAWN:
{
dlog ("%s stability period (%d secs)\n", ptr->process, ptr->debounce );
/* Restart the process */
respawn_process ( ptr ) ;
/* Start the monitor debounce timer. */
mtcTimer_reset ( ptr->pt_ptr );
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->startuptime );
passiveStageChange ( ptr, PMON_STAGE__MONITOR_WAIT ) ;
break ;
}
/* Give the process time to startup
* before trying to monitor it */
case PMON_STAGE__MONITOR_WAIT:
{
/* Give the process time to start */
if ( ptr->pt_ptr->ring == true )
{
/* any of : no SIGCHLD seen, no child pid, or a non-PASS exit
 * status means the respawn failed ; retry after the interval */
if (( !ptr->sigchld_rxed ) || ( !ptr->child_pid ) || ( ptr->status ))
{
if ( ptr->child_pid == 0 )
{
elog ("%s spawn has null child pid\n", ptr->process );
}
else if ( ptr->sigchld_rxed == false )
{
elog ("%s spawn timeout (%d)\n", ptr->process, ptr->child_pid );
}
else if ( ptr->status != PASS )
{
elog ("%s spawn failed (rc:%d) (%d)\n", ptr->process, ptr->status, ptr->child_pid );
}
kill_running_child ( ptr ) ;
/* we had a startup timeout ; do restart */
mtcTimer_start( ptr->pt_ptr, pmon_timer_handler, ptr->interval );
passiveStageChange ( ptr, PMON_STAGE__RESTART_WAIT ) ;
}
else
{
/* clear the monitor debounce counter */
ptr->debounce_cnt = 0 ;
if ( ptr->restart == true )
{
/* commanded restart ; re-manage from the top */
ptr->restart = false ;
ptr->registered = false ;
ptr->pid = get_process_pid ( ptr );
ilog ("%s Restarted (%d)\n", ptr->process, ptr->pid );
passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ;
}
else
{
/* Start debounce monitor phase */
passiveStageChange ( ptr, PMON_STAGE__MONITOR ) ;
process_running ( ptr );
ilog ("%s Monitor (%d)\n", ptr->process, ptr->pid );
}
}
ptr->sigchld_rxed = false ;
}
break ;
}
/* Monitor the newly respawned process */
case PMON_STAGE__MONITOR:
{
/* The process needs to stay running for x seconds before
* clearing any assertion or declaring that this restart
* attempt was successful */
/* The process should be running.
* If not then cancel the timer and start over through
* the RESTART_WAIT stage which ensures that we manage
* back to back restarts properly */
if ( ! process_running ( ptr ) )
{
wlog ("%s Respawn Monitor Failed (%d of %d), retrying in (%d secs)\n",
ptr->process,
ptr->restarts_cnt,
ptr->restarts,
ptr->interval);
passiveStageChange ( ptr, PMON_STAGE__TIMER_WAIT ) ;
}
else if ( ptr->pt_ptr->ring == true )
{
if ( ++ptr->debounce_cnt >= ptr->debounce )
{
/* We made it through the monitor debounce
* period so lets finish up */
ilog ("%s Stable (%d)\n", ptr->process, ptr->pid );
passiveStageChange ( ptr, PMON_STAGE__FINISH ) ;
}
/* else continue to monitor the freshly respawned process */
else
{
/* Start the monitor timer again since
* the debounce period is not over */
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, 1 );
dlog ("%s Debounce Monitor (TID:%p)\n", ptr->process, ptr->pt_ptr->tid );
}
}
break ;
}
/* Let the running debounce timer expire, then queue the next
 * restart attempt through RESTART_WAIT. */
case PMON_STAGE__TIMER_WAIT:
{
if ( mtcTimer_expired ( ptr->pt_ptr ) )
{
/* if restart interval is zero then just ring the timer right away */
if ( ptr->interval == 0 )
{
ptr->pt_ptr->ring = true ;
}
else
{
/* Now we are in the restart wait phase */
mtcTimer_start( ptr->pt_ptr, pmon_timer_handler, ptr->interval );
}
kill_running_child ( ptr ) ;
passiveStageChange ( ptr, PMON_STAGE__RESTART_WAIT ) ;
}
else
{
dlog ("%s debounce timer wait\n", ptr->process);
}
break ;
}
/* Lets wait a bit before we try another restart */
case PMON_STAGE__RESTART_WAIT:
{
if ( ptr->pt_ptr->ring == true )
{
/* Force the immediate (re)start */
passiveStageChange ( ptr, PMON_STAGE__MANAGE) ;
}
break ;
}
/* A state that leaves a process failed but takes it out of
* that failed state if it auto recovers on its own or
* through external means */
case PMON_STAGE__IGNORE:
{
int pid ;
if ((pid = get_process_pid ( ptr )))
{
/* kill with signal 0 only probes for pid existence */
int result = kill (pid, 0 );
if ( result == 0 )
{
/* allow process recovery if it is started outside pmond */
if ( ptr->stopped == true )
ptr->stopped = false ;
passiveStageChange ( ptr, PMON_STAGE__FINISH );
}
}
break ;
}
/* Recovery complete ; clear failure state, re-register the pid
 * and (re)start active monitoring debounce where applicable. */
case PMON_STAGE__FINISH:
{
kill_running_child ( ptr ) ;
ilog ("%s Recovered (%d)\n", ptr->process, ptr->pid );
/* Set all counts to default state ;
* Even if they may have already been :) */
ptr->failed = false ;
ptr->debounce_cnt = 0 ;
passiveStageChange ( ptr, PMON_STAGE__START ) ;
/* Register the new process with the kernel */
register_process ( ptr );
if ( !ptr->active_monitoring )
{
ptr->restarts_cnt = 0 ;
/* It's possible that a restart succeeded even after the
* max restarts threshold was reached (and we thought things
* were dead, so we marked quorum processes as unrecoverable)
*/
if ( ptr->quorum )
{
ptr->quorum_failure = false;
ptr->quorum_unrecoverable = false;
}
manage_alarm ( ptr, PMON_CLEAR );
}
/* Recover Active monitoring ,
* event clear will occur in the active monitoring
* FSM after it passes the debounce cycle */
else
{
/* Open the process's active monitoring
* socket if it was or is closed */
if ( ptr->msg.tx_sock == 0 )
open_process_socket ( ptr );
/* Clear sequence numbers */
ptr->tx_sequence = 0 ;
ptr->rx_sequence = 0 ;
/* Clear active monitoring state controls */
// ptr->active_failed = false ;
ptr->waiting = false ;
/* Set the active monitor debounce flag and clear its counter */
ptr->active_debounce = true ;
ptr->adebounce_cnt = 0 ;
activeStageChange ( ptr, ACTIVE_STAGE__PULSE_REQUEST ) ;
}
rc = PASS ;
break ;
}
/******************************************************************************
*
* This polling stage was introduced for the introduction of the 2-Server
* configuration, aka combo blade/host.
*
* /etc/pmon.d/<process.conf> files that declare
*
* subfunction = worker
* or
* subfunction = storage
*
* .. are not immediately monitored by pmond on startup.
*
* Instead, pmond will wait for the specified subfunction config complete
* file to be present before starting to monitor that process.
*
* This stage is here to manage that delayed monitoring startup of
* subfunction dependent processes.
*
*******************************************************************************/
case PMON_STAGE__POLLING:
{
if ( ptr->pt_ptr->ring == false )
{
break ;
}
else
{
string config_filename = "" ;
pmon_ctrl_type * ctrl_ptr = get_ctrl_ptr() ;
if ( ptr->subfunction )
{
if ( !strcmp (ptr->subfunction, "worker" ) )
{
config_filename = CONFIG_COMPLETE_WORKER ;
}
else if ( !strcmp (ptr->subfunction, "storage" ) )
{
config_filename = CONFIG_COMPLETE_STORAGE ;
}
/********************************************************
* issue: processes that set the subfunction to
* 'last-config' get a dependency override in
* the AIO system. Such processes need to be
* monitored only after the last configuration
* step. Right now that is worker in aio.
*
********************************************************/
else if (( ctrl_ptr->system_type != SYSTEM_TYPE__NORMAL ) &&
( !strcmp (ptr->subfunction, "last-config" )))
{
config_filename = CONFIG_COMPLETE_WORKER ;
dlog ("%s dependency over-ride ; will wait for %s\n",
ptr->process,
config_filename.c_str());
}
}
if ( config_filename.empty() )
{
passiveStageChange ( ptr, PMON_STAGE__IGNORE );
elog ("%s is subfunction polling with no subfunction ; ignoring\n", ptr->process );
}
else
{
bool start_monitoring = true;
string waiting_for = "";
if ( daemon_is_file_present ( config_filename.data() ) != true )
{
start_monitoring = false;
waiting_for = config_filename;
}
mtcTimer_reset ( ptr->pt_ptr );
if ( start_monitoring == true )
{
ptr->passive_monitoring = true ;
/* check for startup failures from alarm query. */
if ( ptr->failed == true )
{
/* manage the process if its in the failed state */
passiveStageChange ( ptr, PMON_STAGE__MANAGE );
}
else
{
ilog ("monitor start of %s in %d seconds\n", ptr->process, daemon_get_cfg_ptr()->start_delay );
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, daemon_get_cfg_ptr()->start_delay );
passiveStageChange ( ptr, PMON_STAGE__START_WAIT );
}
}
else
{
/* config complete file not present yet ; poll again in 3 secs */
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, 3 );
wlog_throttled ( ptr->stage_cnt, 500, "%s monitoring is waiting on %s\n",
ptr->process, waiting_for.c_str());
}
}
}
break ;
}
/* Delayed monitoring start ; register the process once the
 * configured start delay expires. */
case PMON_STAGE__START_WAIT:
{
if ( ptr->pt_ptr->ring == true )
{
ilog ("%s process monitoring started\n", ptr->process );
register_process ( ptr );
if ( ptr->active_monitoring == true )
{
if ( open_process_socket ( ptr ) != PASS )
{
elog ("%s failed to open process socket\n",
ptr->process );
}
}
passiveStageChange ( ptr, PMON_STAGE__MANAGE );
}
break ;
}
default:
{
slog ("%s Invalid stage (%d)\n", ptr->process, ptr->stage );
/* Default to finish for invalid case.
* If there is an issue then it will be detected */
passiveStageChange ( ptr, PMON_STAGE__FINISH );
}
}
return (rc);
}
/* Status Monitoring Handler
 * --------------------------
 * Drives the status monitoring FSM for one process : periodically runs
 * the configured 'status' command and, on failure, runs the 'start'
 * command with restart-threshold based logging/alarming. Exit codes are
 * delivered asynchronously through the SIGCHLD handler which sets
 * ptr->status / ptr->sigchld_rxed and rings the timer.
 *
 * @param ptr  pointer to the process' monitor configuration/state record
 * @return     PASS always */
int pmon_status_handler ( process_config_type * ptr )
{
/* force-correct an out-of-range stage */
if ( ptr->status_stage >= STATUS_STAGE__STAGES )
{
wlog ("%s Invalid status_stage (%d) ; correcting\n", ptr->process, ptr->status_stage );
statusStageChange ( ptr, STATUS_STAGE__BEGIN);
}
switch ( ptr->status_stage )
{
// First state ; start the inter-check period timer
case STATUS_STAGE__BEGIN:
{
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->period );
dlog ("%s start period timer %p\n", ptr->process, ptr->pt_ptr->tid );
statusStageChange ( ptr, STATUS_STAGE__EXECUTE_STATUS );
break ;
}
// Execute the status command
case STATUS_STAGE__EXECUTE_STATUS:
{
if ( ptr->pt_ptr->ring == true ) //wake up from period
{
ptr->status = PASS;
/* arm the command timeout timer before launching */
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->timeout );
dlog ("%s start the status command timer %p\n", ptr->process, ptr->pt_ptr->tid );
// Execute the status call
int rc = execute_status_command(ptr);
if (rc != PASS)
{
elog ("%s execute_status_command returned a failure (%d)\n", ptr->process, rc);
ptr->status = rc;
}
statusStageChange ( ptr, STATUS_STAGE__EXECUTE_STATUS_WAIT );
}
break ;
}
// Wait for the status command to finish and process results
case STATUS_STAGE__EXECUTE_STATUS_WAIT:
{
// Give the command time to execute. The daemon_sigchld_hdlr will force
// a ring when the command execute successfully or returns a failure
if ( (ptr->pt_ptr->ring == true) || (ptr->status != PASS ) )
{
mtcTimer_reset( ptr->pt_ptr);
ptr->pt_ptr->ring = false;
/* no SIGCHLD, no child pid or a non-PASS status all mean the
 * status check failed */
if (( !ptr->sigchld_rxed ) || ( !ptr->child_pid ) || (ptr->status != PASS))
{
if ( ptr->child_pid == 0 )
{
elog ("%s status command has null child pid\n", ptr->process );
}
else if ( ptr->sigchld_rxed == false )
{
elog ("%s status command execution timed out (%d)\n", ptr->process, ptr->child_pid );
kill_running_process ( ptr->child_pid );
}
elog ("%s status returned a failure (rc:%d) ; process(es) start pending\n", ptr->process, ptr->status );
// Go to execute start state since we do not know the status of the process
ptr->status_failed = true;
ptr->was_failed = true ;
statusStageChange ( ptr, STATUS_STAGE__EXECUTE_START );
}
else
{
// Status reports everything is ok, reset variables
dlog ("%s status command was successful\n", ptr->process);
ptr->restarts_cnt = 0;
if ( ptr->failed == true )
{
manage_alarm ( ptr, PMON_CLEAR );
}
ptr->status_failed = false;
ptr->failed = false;
statusStageChange ( ptr, STATUS_STAGE__BEGIN );
}
/* reset child tracking for the next command launch */
ptr->child_pid = 0;
ptr->sigchld_rxed = false;
}
break;
}
// Interval wait time before doing a start again if the start
// had previously failed
case STATUS_STAGE__INTERVAL_WAIT:
{
if (ptr->pt_ptr->ring == true)
{
statusStageChange ( ptr, STATUS_STAGE__EXECUTE_START );
}
break ;
}
// Execute the start command
case STATUS_STAGE__EXECUTE_START:
{
ptr->status = PASS;
/* arm the command timeout timer before launching */
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->timeout );
dlog ("%s start the start command timer %p\n", ptr->process, ptr->pt_ptr->tid );
int rc = execute_start_command (ptr);
if (rc != PASS)
{
elog ("%s execute_start_command returned a failure (%d)\n", ptr->process, rc);
ptr->status = rc;
}
statusStageChange ( ptr, STATUS_STAGE__EXECUTE_START_WAIT );
break;
}
// Wait for the start command to finish and process results
case STATUS_STAGE__EXECUTE_START_WAIT:
{
// Give the command time to execute. The daemon_sigchld_hdlr will force
// a ring when the command execute successfully or returns a failure
if ( (ptr->pt_ptr->ring == true) || (ptr->status != PASS) )
{
mtcTimer_reset( ptr->pt_ptr);
ptr->pt_ptr->ring = false;
// If the status had failed then ptr->status_failed will be set to true. Status failure
// will also cause restarts count increment, alarm and degrade state
if (( !ptr->sigchld_rxed ) || ( !ptr->child_pid ) || ( ptr->status ) || (ptr->status_failed))
{
if ( ptr->child_pid == 0 )
{
elog ("%s start command has null child pid\n", ptr->process );
}
else if ( ptr->sigchld_rxed == false )
{
elog ("%s start command execution timed out (%d)\n", ptr->process, ptr->child_pid );
kill_running_process ( ptr->child_pid );
}
else if ( ptr->status != PASS )
{
elog ("%s start command returned a failure (rc:%d)\n", ptr->process, ptr->status);
}
/* Send a log on the first failure */
if (( ptr->restarts_cnt == 0 ) && ( ptr->restarts != 0 ) )
{
wlog ("%s Sending Log Event to Maintenance\n", ptr->process );
/* Send a log on the first one or every time we start a fresh restart cycle */
manage_alarm ( ptr, PMON_LOG );
}
/* Manage notification based on restart */
else if ( ptr->restarts_cnt >= ptr->restarts )
{
wlog ("%s Failure threshold (%d) reached ; alarming\n", ptr->process, ptr->restarts );
manage_alarm ( ptr, PMON_ASSERT );
ptr->failed = true; // this is used to degrade un-degrade the host
ptr->restarts_cnt = 0 ;
}
else
{
wlog ("%s has %d of %d failures ; retrying ...\n",
ptr->process,
ptr->restarts_cnt,
ptr->restarts );
}
ptr->restarts_cnt++;
ptr->failed_cnt++ ;
//only want to check for status false on first restart iteration so reset the flag
ptr->status_failed = false;
// Go to interval state only if start failed otherwise we want to check
// the process status again
if (( !ptr->sigchld_rxed ) || ( !ptr->child_pid ) || ( ptr->status ))
{
// In here because the start failed
// Wait the interval time and then execute a start command again
mtcTimer_start ( ptr->pt_ptr, pmon_timer_handler, ptr->interval );
dlog ("%s start interval timer %p\n", ptr->process, ptr->pt_ptr->tid );
statusStageChange ( ptr, STATUS_STAGE__INTERVAL_WAIT );
}
else
{
// In here because status failed but start was successful
wlog ("%s start command was successful ; here because status had failed\n", ptr->process);
statusStageChange ( ptr, STATUS_STAGE__BEGIN );
}
}
else
{
// Start was successful
wlog ("%s start command was successful\n", ptr->process);
statusStageChange ( ptr, STATUS_STAGE__BEGIN );
}
/* reset child tracking for the next command launch */
ptr->child_pid = 0;
ptr->sigchld_rxed = false;
}
break;
}
default:
{
elog ("%s invalid status_stage (%d)\n", ptr->process, ptr->status_stage );
/* Default to first state for invalid case. If there is an issue then it will be detected */
statusStageChange ( ptr, STATUS_STAGE__BEGIN );
}
}
return (PASS);
}