537935bb0c
All compute hosts seen to self reboot by hostw during patching due to stuck pmond process Current method to kill the running process leads to a race condition that results in a user space futex deadlock that hangs pmond and results in a watchdog self-reset due to quorum master 'pmond' failure. The deadlock was traced to the ordering of the kill process. Current steps to kill: - kill process - remove pidfile - unregister pid with kernel Deadlock is avoided by reversing the kill steps to what is more logical. - unregister pid with kernel - remove pidfile - kill process Also introduced audit that registers manually restarted processes with the kernel. Failure Rate Before Fix: 1 every 25 process restarts. Mostly fails before 5. Failure Rate After Fix: No failures after 15000 process restarts across 8 hosts including all host types between 2 different labs 2 different loads 18.07 and 18.08. Test Method: Pmon restart regression test restarts all processes on a host. Total soak restart of 25 monitored processes for 50 loops over 12 hosts = 15000 restarts. Also regressed process kill / recovery handling. (5000 process recoveries) Change-Id: Icac64df52df9d8074fcd886567dda6e53641572d Signed-off-by: David Sullivan <david.sullivan@windriver.com> Story: 2002993 Task: 23007
2143 lines
71 KiB
C++
2143 lines
71 KiB
C++
/*
|
|
* Copyright (c) 2013-2016 Wind River Systems, Inc.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* Wind River CGCS Platform Process Monitor Service Handler
|
|
*/
|
|
|
|
#include <libgen.h> /* dirname */
|
|
|
|
using namespace std;
|
|
|
|
#include "pmon.h"
|
|
#include "nodeEvent.h" /* for ... set_inotify_watch, set_inotify_close */
|
|
#include "nodeTimers.h" /* for ... mtcTimer_init */
|
|
#include "alarmUtil.h" /* for ... alarmUtil_getSev_str */
|
|
#include "pmonAlarm.h" /* for ... PMON_ALARM_ID__PMOND */
|
|
|
|
/* Preserve a local copy of a pointer to the control struct to
|
|
* avoid having to publish a get utility prototype into pmon.h */
|
|
static pmon_ctrl_type * _pmon_ctrl_ptr = NULL ;
|
|
|
|
/* Record the caller supplied control structure pointer in the
 * module scoped copy used by the handlers in this file. */
void pmon_set_ctrl_ptr ( pmon_ctrl_type * ctrl_ptr )
{
    _pmon_ctrl_ptr = ctrl_ptr ;
}
|
|
|
|
/* pmonTimer_audit - get_events periodic audit timer */
|
|
static struct mtc_timer pmonTimer_audit ;
|
|
static struct mtc_timer pmonTimer_degrade ;
|
|
static struct mtc_timer pmonTimer_pulse ;
|
|
static struct mtc_timer pmonTimer_hostwd ;
|
|
static struct mtc_timer ptimer[MAX_PROCESSES] ;
|
|
|
|
/** List of config files */
|
|
std::list<string> config_files ;
|
|
std::list<string>::iterator string_iter_ptr ;
|
|
|
|
/* If there is an alarm in the list that matches one in the process list
|
|
* then update that process with its severity and failed state.
|
|
* If there is a process in the saved list that is not in the process list
|
|
* then clear its alarm as it is no longer valid.
|
|
*/
|
|
void manage_process_alarms ( list<active_process_alarms_type> & _list,
|
|
process_config_type * const ptr,
|
|
int const processes );
|
|
|
|
static process_config_type process_config[MAX_PROCESSES] ;
|
|
|
|
/* lookup process control by index and return its pointer if found.
|
|
* Otherwise if not found return NULL */
|
|
process_config_type * get_process_config_ptr ( int index )
|
|
{
|
|
if ( index < _pmon_ctrl_ptr->processes )
|
|
return ( &process_config[index] );
|
|
return ( NULL );
|
|
}
|
|
|
|
/* lookup process control by name and return its pointer if found.
|
|
* Otherwise if not found return NULL */
|
|
process_config_type * get_process_config_ptr ( string process )
|
|
{
|
|
if ( _pmon_ctrl_ptr )
|
|
{
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
if ( process.compare(process_config[i].process) == 0 )
|
|
{
|
|
dlog ("%s process found\n", process.c_str());
|
|
return (&process_config[i]);
|
|
}
|
|
}
|
|
}
|
|
wlog ("%s process not found in control list\n", process.c_str());
|
|
return (NULL);
|
|
}
|
|
|
|
#define _MAX_LEN_ (MAX_FILE_SIZE*2)
|
|
|
|
/*******************************************************************
|
|
* Process Dump Support *
|
|
*******************************************************************
|
|
* *
|
|
* Utilities that add specific config lines to the dump list *
|
|
* *
|
|
******************************************************************/
|
|
|
|
/* Log nostname, ip, mac and pulse period */
|
|
void mem_log_ctrl ( pmon_ctrl_type * ptr )
|
|
{
|
|
#define MAX_LEN 500
|
|
char str[MAX_LEN] ;
|
|
snprintf (&str[0], MAX_LEN, "%s %s %s Pulse Rate:%d msecs\n",
|
|
&ptr->my_hostname[0],
|
|
ptr->my_address.c_str(),
|
|
ptr->my_macaddr.c_str(),
|
|
ptr->pulse_period );
|
|
mem_log(str);
|
|
}
|
|
|
|
/* Log process specific controls */
|
|
void mem_log_process ( process_config_type * ptr )
|
|
{
|
|
#define MAX_LEN 500
|
|
char str[MAX_LEN] ;
|
|
snprintf (&str[0], MAX_LEN, "%-25s [%5d] %8s Restarts:%u Interval:%u Debounce:%u Startuptime:%u\n",
|
|
ptr->process,
|
|
ptr->pid,
|
|
ptr->severity,
|
|
ptr->restarts,
|
|
ptr->debounce,
|
|
ptr->interval,
|
|
ptr->startuptime);
|
|
mem_log(str);
|
|
}
|
|
|
|
/* Log process specific state */
|
|
void mem_log_pstate ( process_config_type * ptr )
|
|
{
|
|
#define MAX_LEN 500
|
|
char str[MAX_LEN] ;
|
|
snprintf (&str[0], MAX_LEN, " Passive: %10s (%d) Failed:%s Restart:%s FCount:%2u subFunc:%s Severity:%s %s %s\n",
|
|
get_pmonStage_str(ptr),
|
|
ptr->stage,
|
|
ptr->failed ? "true " : "false",
|
|
ptr->restart ? "true " : "false",
|
|
ptr->failed_cnt,
|
|
ptr->subfunction ? ptr->subfunction : "None",
|
|
alarmUtil_getSev_str(ptr->alarm_severity).c_str(),
|
|
ptr->ignore ? "ignored" : "",
|
|
ptr->stopped ? "stopped" : "" );
|
|
mem_log(str);
|
|
}
|
|
|
|
/* Log process specific active monitor controls */
|
|
void mem_log_aconfig ( process_config_type * ptr )
|
|
{
|
|
#define MAX_LEN 500
|
|
char str[MAX_LEN] ;
|
|
snprintf (&str[0], MAX_LEN, " Active : %10s (%d) Pulses:%2u Seq:%2u Period:%2u Timeout:%2u Thld:%2u %s\n",
|
|
get_amonStage_str(ptr),
|
|
ptr->active_stage,
|
|
ptr->pulse_count,
|
|
ptr->tx_sequence,
|
|
ptr->period,
|
|
ptr->timeout,
|
|
ptr->threshold,
|
|
ptr->waiting ? "... waiting" : "");
|
|
mem_log(str);
|
|
}
|
|
|
|
/* Log process specific active monitor state */
|
|
void mem_log_astate ( process_config_type * ptr )
|
|
{
|
|
#define MAX_LEN 500
|
|
char str[MAX_LEN] ;
|
|
snprintf (&str[0], MAX_LEN, " Stats - Failed:%s Count:%2u b2bp:%2u b2bc:%2u rxer:%2u txer:%2u msge:%2u msgp:%2u\n",
|
|
ptr->active_failed ? "true " : "false",
|
|
ptr->afailed_count,
|
|
ptr->b2b_miss_peak,
|
|
ptr->b2b_miss_count,
|
|
ptr->recv_err_cnt,
|
|
ptr->send_err_cnt,
|
|
ptr->mesg_err_cnt,
|
|
ptr->mesg_err_peak);
|
|
mem_log(str);
|
|
}
|
|
|
|
/* Push daemon state to log file */
|
|
void daemon_dump_info ( void )
|
|
{
|
|
if ( _pmon_ctrl_ptr )
|
|
{
|
|
daemon_dump_membuf_banner();
|
|
mem_log_ctrl ( _pmon_ctrl_ptr );
|
|
daemon_dump_membuf();
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
process_config_type * ptr = get_process_config_ptr(i);
|
|
mem_log ('\n');
|
|
mem_log_process ( ptr );
|
|
mem_log_pstate ( ptr );
|
|
if ( ptr->active_monitoring )
|
|
{
|
|
mem_log_aconfig ( ptr );
|
|
mem_log_astate ( ptr );
|
|
}
|
|
}
|
|
daemon_dump_membuf();
|
|
}
|
|
}
|
|
|
|
/*******************************************************************
|
|
* Module Initialize and Finalizes Interfaces *
|
|
******************************************************************/
|
|
|
|
/* Initial init of timers. */
|
|
/* Not run on a sighup */
|
|
void pmon_timer_init ( void )
|
|
{
|
|
mtcTimer_init ( pmonTimer_audit, _pmon_ctrl_ptr->my_hostname, "audit" ) ;
|
|
mtcTimer_init ( pmonTimer_pulse, _pmon_ctrl_ptr->my_hostname, "pulse" ) ;
|
|
mtcTimer_init ( pmonTimer_hostwd , _pmon_ctrl_ptr->my_hostname, "hostwd" ) ;
|
|
mtcTimer_init ( pmonTimer_degrade, _pmon_ctrl_ptr->my_hostname, "degrade audit" );
|
|
|
|
for ( int i = 0 ; i < MAX_PROCESSES ; i++ )
|
|
{
|
|
/* Bind the process timer to the process struct */
|
|
process_config[i].pt_ptr = &ptimer[i] ;
|
|
|
|
/* Init the timer for this process */
|
|
mtcTimer_init ( process_config[i].pt_ptr, _pmon_ctrl_ptr->my_hostname, "process" ) ;
|
|
}
|
|
}
|
|
|
|
void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr );
|
|
|
|
/* Register realtime signal handler with the kernel */
|
|
int signal_hdlr_init ( int sig_num )
|
|
{
|
|
int rc ;
|
|
|
|
memset (&_pmon_ctrl_ptr->info, 0, sizeof(_pmon_ctrl_ptr->info));
|
|
memset (&_pmon_ctrl_ptr->prev, 0, sizeof(_pmon_ctrl_ptr->info));
|
|
|
|
_pmon_ctrl_ptr->info.sa_sigaction = _process_death_hdlr ;
|
|
_pmon_ctrl_ptr->info.sa_flags = (SA_NOCLDSTOP | SA_NOCLDWAIT | SA_SIGINFO) ;
|
|
|
|
rc = sigaction ( sig_num, &_pmon_ctrl_ptr->info , &_pmon_ctrl_ptr->prev );
|
|
if ( rc )
|
|
{
|
|
elog("Registering : Realtime Signal %d - (%d) (%s)\n",
|
|
sig_num, errno, strerror(errno));
|
|
rc = FAIL_SIGNAL_INIT ;
|
|
}
|
|
else
|
|
{
|
|
ilog("Registering : Realtime Signal %d\n", sig_num);
|
|
}
|
|
return (rc) ;
|
|
}
|
|
|
|
/*
|
|
* Init the handler
|
|
* - Must support re-init that might occur over a SIGHUP
|
|
**/
|
|
int pmon_hdlr_init ( pmon_ctrl_type * ctrl_ptr )
|
|
{
|
|
int rc ;
|
|
|
|
/* Save the control pointer */
|
|
_pmon_ctrl_ptr = ctrl_ptr ;
|
|
|
|
/* Force running of the audit at the very start */
|
|
_pmon_ctrl_ptr->run_audit = true ;
|
|
|
|
rc = signal_hdlr_init ( PMON_RT_SIGNAL );
|
|
|
|
/* Log the control setting going into the main loop */
|
|
mem_log_ctrl ( _pmon_ctrl_ptr ) ;
|
|
|
|
/* init the inotify file descriptors */
|
|
_pmon_ctrl_ptr->fd = 0 ;
|
|
_pmon_ctrl_ptr->wd = 0 ;
|
|
|
|
return (rc) ;
|
|
}
|
|
|
|
|
|
/* Module Cleanup */
|
|
void pmon_hdlr_fini ( pmon_ctrl_type * ctrl_ptr )
|
|
{
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
/* Close any active monitoring sockets */
|
|
close_process_socket ( &process_config[i] );
|
|
}
|
|
|
|
/* Turn off inotify */
|
|
set_inotify_close ( ctrl_ptr->fd, ctrl_ptr->wd );
|
|
}
|
|
|
|
/* Handle a detected failure of the specified process.
 *
 * While the system is in the stopping state the failure is only
 * logged and the process is flagged as ignored. Otherwise the
 * process is moved to the passive MANAGE stage, marked failed and,
 * if actively monitored, its active monitor is restarted at the
 * pulse request stage with the active failed flag set. */
void manage_process_failure ( process_config_type * ptr )
{
    pmon_ctrl_type * ctrl_ptr = _pmon_ctrl_ptr ;

    /* Try to prevent raising alarms for process failure
     * detections while the host is shutting down.
     *
     * Unless already known to be stopping, refresh the saved
     * system state from a fresh query. */
    if ( ctrl_ptr->system_state != MTC_SYSTEM_STATE__STOPPING )
    {
        ctrl_ptr->system_state = get_system_state();
    }

    /* Ignore process failures while in stopping (i.e. shutdown) mode */
    if ( ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING )
    {
        wlog ("%s terminated by system shutdown (pid:%d) ; ignoring\n",
               ptr->process , ptr->pid );
        ptr->ignore = true ;
        return ;
    }

    /* Not expected ; a stopped process should not get here */
    if ( ptr->stopped == true )
    {
        slog ("%s process is in the stopped state\n", ptr->process);
    }

    elog ("%s failed (%d) (p:%d a:%d)\n", ptr->process, ptr->pid, ptr->failed, ptr->active_failed);
    passiveStageChange ( ptr, PMON_STAGE__MANAGE) ;

    /* first detection of this failure */
    if ( ptr->failed == false )
    {
        ptr->failed  = true ;
        ptr->restart = false ;
    }

    /* TODO: Consider clearing active_failed flag regardless */
    if ( ptr->active_monitoring == true )
    {
        activeStageChange ( ptr, ACTIVE_STAGE__PULSE_REQUEST ) ;
        ptr->active_failed = true ;
    }
}
|
|
|
|
/*
|
|
* Manage process config strdup memory over a config/reconfig.
|
|
* On reconfig ; the PMOND_INIT_CHECK should be set and for each
|
|
* config pointed that is not null ; feee the memory.
|
|
*/
|
|
void init_process_config_memory ( void )
|
|
{
|
|
for ( int i = 0 ; i < MAX_PROCESSES ; i++ )
|
|
{
|
|
if ( process_config[i].init_check == PMOND_INIT_CHECK )
|
|
{
|
|
if ( process_config[i].process ) free ((void*)process_config[i].process);
|
|
if ( process_config[i].service ) free ((void*)process_config[i].service);
|
|
if ( process_config[i].script ) free ((void*)process_config[i].script);
|
|
if ( process_config[i].style ) free ((void*)process_config[i].style);
|
|
if ( process_config[i].pidfile ) free ((void*)process_config[i].pidfile);
|
|
if ( process_config[i].severity ) free ((void*)process_config[i].severity);
|
|
if ( process_config[i].mode ) free ((void*)process_config[i].mode);
|
|
if ( process_config[i].start_arg ) free ((void*)process_config[i].start_arg);
|
|
if ( process_config[i].status_arg) free ((void*)process_config[i].status_arg);
|
|
|
|
if ( process_config[i].status_failure_text_file) free ((void*)process_config[i].status_failure_text_file);
|
|
if ( process_config[i].subfunction ) free ((void*)process_config[i].subfunction);
|
|
if ( process_config[i].recovery_method ) free ((void*)process_config[i].recovery_method);
|
|
}
|
|
/* init the process config memory ; now that we have freed past strdup allocations*/
|
|
memset ( (char*)&process_config[i], 0, sizeof(process_config_type));
|
|
}
|
|
}
|
|
|
|
/* Read and load process monitor configuration from
|
|
* all the process config files from /etc/pmon.d */
|
|
void load_processes ( void )
|
|
{
|
|
list<active_process_alarms_type> saved_alarm_list ;
|
|
|
|
int rc = PASS ;
|
|
|
|
/* 1. Free timers,
|
|
* 2. shutdown sockets
|
|
* 3. track processes with raised alarms
|
|
*/
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
mtcTimer_reset ( process_config[i].pt_ptr );
|
|
close_process_socket ( &process_config[i] );
|
|
}
|
|
|
|
/* Query fm for existing pmon process alarms and
|
|
* for each that is found store their 'name' and
|
|
* 'severity' in the passed in saved list */
|
|
manage_queried_alarms ( saved_alarm_list );
|
|
|
|
/* init the process config memory */
|
|
init_process_config_memory ();
|
|
|
|
/* Default to event mode */
|
|
_pmon_ctrl_ptr->event_mode = true ;
|
|
|
|
/* Start with zero processes */
|
|
_pmon_ctrl_ptr->processes = 0 ;
|
|
|
|
/* Read in the list of config files and their contents */
|
|
load_filenames_in_dir ( CONFIG_DIR, config_files ) ;
|
|
|
|
ilog ("Loading Process Configurations\n");
|
|
ilog ("--------------------------------------------------------------\n");
|
|
|
|
/* Run Maintenance on Inventory */
|
|
for ( string_iter_ptr = config_files.begin () ;
|
|
string_iter_ptr != config_files.end () ;
|
|
++string_iter_ptr )
|
|
{
|
|
process_config_type * ptr = &process_config[_pmon_ctrl_ptr->processes] ;
|
|
rc = process_config_load ( ptr, string_iter_ptr->data() );
|
|
if ( rc )
|
|
{
|
|
memset ((char*)ptr, 0, sizeof(process_config_type));
|
|
}
|
|
else
|
|
{
|
|
/* stages for passive and active monitoring are initially set
|
|
* inside the process_config_load */
|
|
_pmon_ctrl_ptr->processes++ ;
|
|
ptr->init_check = PMOND_INIT_CHECK ;
|
|
}
|
|
}
|
|
|
|
pmon_send_event ( MTC_EVENT_PMON_CLEAR, &process_config[0] ) ;
|
|
|
|
ilog ("Registering Processes With Kernel\n");
|
|
ilog ("---------------------------------------------------------------\n");
|
|
|
|
/* Register all the processes with the kernel */
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
process_config[i].restart= false;
|
|
process_config[i].failed = false;
|
|
if ( process_config[i].status_monitoring )
|
|
{
|
|
process_config[i].status_stage = STATUS_STAGE__BEGIN ;
|
|
}
|
|
else if ( process_config[i].stage == PMON_STAGE__MANAGE )
|
|
{
|
|
register_process ( &process_config[i] );
|
|
if ( process_config[i].active_monitoring == true )
|
|
{
|
|
if ( open_process_socket ( &process_config[i] ) != PASS )
|
|
{
|
|
elog ("%s failed to open process socket\n",
|
|
process_config[i].process );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
_pmon_ctrl_ptr->reload_config = false ;
|
|
|
|
/* If there were process alarms that existed over the reload
|
|
* then ensure that those processes are updated with that information. */
|
|
if ( saved_alarm_list.size () )
|
|
{
|
|
ilog ("there are %ld active alarms over reload\n", saved_alarm_list.size());
|
|
manage_process_alarms ( saved_alarm_list, &process_config[0], _pmon_ctrl_ptr->processes );
|
|
}
|
|
}
|
|
|
|
|
|
/* Looks up the timer ID and asserts the corresponding ringer */
|
|
void pmon_timer_handler ( int sig, siginfo_t *si, void *uc)
|
|
{
|
|
timer_t * tid_ptr = (void**)si->si_value.sival_ptr ;
|
|
|
|
/* Avoid compiler errors/warnings for parms we must
|
|
* have but currently do nothing with */
|
|
UNUSED(sig);
|
|
UNUSED(uc);
|
|
|
|
if ( !(*tid_ptr) )
|
|
{
|
|
return ;
|
|
}
|
|
|
|
else if ( *tid_ptr == pmonTimer_pulse.tid )
|
|
{
|
|
pmonTimer_pulse.ring = true ;
|
|
}
|
|
else if ( *tid_ptr == pmonTimer_degrade.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( pmonTimer_degrade );
|
|
pmonTimer_degrade.ring = true ;
|
|
_pmon_ctrl_ptr->patching_in_progress = false ;
|
|
}
|
|
else if ( *tid_ptr == pmonTimer_audit.tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( pmonTimer_audit );
|
|
pmonTimer_audit.ring = true ;
|
|
}
|
|
/* is host watchdog pmon timer */
|
|
else if ( *tid_ptr == pmonTimer_hostwd.tid )
|
|
{
|
|
pmonTimer_hostwd.ring = true ;
|
|
|
|
/* we do not stop the timer; instead let it auto-restart */
|
|
}
|
|
else
|
|
{
|
|
bool found = false ;
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
if ( *tid_ptr == process_config[i].pt_ptr->tid )
|
|
{
|
|
mtcTimer_stop_int_safe ( process_config[i].pt_ptr );
|
|
process_config[i].pt_ptr->ring = true ;
|
|
found = true ;
|
|
break ;
|
|
}
|
|
}
|
|
if ( !found )
|
|
{
|
|
//wlog ("Unknown timer\n");
|
|
/* try and cleanup by stopping this unknown timer via its tid */
|
|
mtcTimer_stop_tid_int_safe (tid_ptr);
|
|
}
|
|
}
|
|
}
|
|
|
|
/****************************************************************************
|
|
*
|
|
* Name : service_file_exists
|
|
*
|
|
* Description: Look in some well known places for the specified service file.
|
|
*
|
|
* Returns : Return true if the specified service file is found.
|
|
*
|
|
* Updates : If the service file is found then update the supplied
|
|
* character string buffer with the full path/name of that
|
|
* service file.
|
|
*
|
|
****************************************************************************/
|
|
bool service_file_exists ( string service_filename,
|
|
char * path_n_name_ptr,
|
|
int max_len )
|
|
{
|
|
/* load the name of the service file */
|
|
snprintf ( path_n_name_ptr, max_len, "%s/%s",
|
|
SYSTEMD_SERVICE_FILE_DIR1,
|
|
service_filename.data());
|
|
if (( path_n_name_ptr ) && (strnlen ( path_n_name_ptr, max_len )))
|
|
{
|
|
if ( daemon_is_file_present ( path_n_name_ptr ) == true )
|
|
return true ;
|
|
}
|
|
snprintf ( path_n_name_ptr, max_len, "%s/%s",
|
|
SYSTEMD_SERVICE_FILE_DIR2,
|
|
service_filename.data());
|
|
if (( path_n_name_ptr ) && ( strnlen ( path_n_name_ptr, max_len )))
|
|
{
|
|
if ( daemon_is_file_present ( path_n_name_ptr ) == true )
|
|
return true ;
|
|
}
|
|
return false ;
|
|
}
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Name : process_config_load
|
|
*
|
|
* Purpose : Load the content of a config file
|
|
*
|
|
*****************************************************************************/
|
|
int process_config_load (process_config_type * pc_ptr, const char * config_file_ptr )
|
|
{
|
|
char recovery_method_buf [_MAX_LEN_] ;
|
|
memset (recovery_method_buf,0, sizeof(recovery_method_buf));
|
|
|
|
if ( _pmon_ctrl_ptr->processes >= MAX_PROCESSES )
|
|
{
|
|
wlog ("Cannot Monitor more than %d processes\n", MAX_PROCESSES );
|
|
return (FAIL);
|
|
}
|
|
|
|
/* Read the process config file */
|
|
pc_ptr->mask = 0 ;
|
|
pc_ptr->amask = 0 ;
|
|
pc_ptr->status_mask = 0 ;
|
|
pc_ptr->status_monitoring = false;
|
|
pc_ptr->passive_monitoring = false;
|
|
pc_ptr->audit_alarm_refresh_count = 0 ;
|
|
|
|
if (ini_parse( config_file_ptr, pmon_process_config, pc_ptr) < 0)
|
|
{
|
|
elog("Read Failure : %s\n", config_file_ptr );
|
|
return (FAIL);
|
|
}
|
|
|
|
/* Set some defaults just in case they were not specified */
|
|
if ( !pc_ptr->mode )
|
|
{
|
|
pc_ptr->mode = strdup("Passive") ;
|
|
}
|
|
if ( !pc_ptr->startuptime )
|
|
{
|
|
pc_ptr->startuptime = PMON_MIN_START_DELAY ;
|
|
}
|
|
|
|
/* Many process conf files came from a sysvinit origin and might not
|
|
* have a service file label. Account for that in the following
|
|
* load of recovery_method_buf.
|
|
* Accept a script name if the service name is missing. */
|
|
bool recovery_method_found = false ;
|
|
|
|
/* look for the service file */
|
|
if ( pc_ptr->service )
|
|
{
|
|
string service = pc_ptr->service ;
|
|
if ( service.find(".service") == string::npos )
|
|
service.append(".service");
|
|
if ( service_file_exists(service, &recovery_method_buf[0], _MAX_LEN_) == true )
|
|
recovery_method_found = true ;
|
|
}
|
|
else if ( pc_ptr->script )
|
|
{
|
|
string script = basename((char*)pc_ptr->script);
|
|
if ( script.find(".service") == string::npos )
|
|
script.append(".service");
|
|
if ( service_file_exists(script, &recovery_method_buf[0], _MAX_LEN_) == true )
|
|
recovery_method_found = true ;
|
|
else
|
|
{
|
|
/* resort to the script file only */
|
|
/* load the name of the process init script */
|
|
snprintf ( &recovery_method_buf[0], _MAX_LEN_, "%s", pc_ptr->script );
|
|
if ( daemon_is_file_present ( recovery_method_buf ) == true )
|
|
{
|
|
recovery_method_found = true ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s has script but not found (%s)\n",
|
|
pc_ptr->process, recovery_method_buf );
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* print a log if we have no recovery method */
|
|
wlog ("%s has no recovery method ; process not monitored\n", pc_ptr->process );
|
|
wlog ("... conf file has no 'service' or 'script' recovery entry\n");
|
|
return (FAIL_NOT_FOUND);
|
|
}
|
|
|
|
if ( recovery_method_found == false )
|
|
{
|
|
wlog ("%s has no recovery method found ; process not monitored\n", pc_ptr->process );
|
|
return (FAIL_NOT_FOUND);
|
|
}
|
|
|
|
update_config_option ( &pc_ptr->recovery_method , recovery_method_buf );
|
|
|
|
if ( !strcmp ( pc_ptr->mode, "status" ) )
|
|
{
|
|
pc_ptr->status_monitoring = true;
|
|
|
|
if (( pc_ptr->status_mask == CONF_STATUS_MON_MASK ) &&
|
|
( pc_ptr->process[0] != '\0' ) &&
|
|
( pc_ptr->severity[0] != '\0'))
|
|
{
|
|
dlog1 ("Config File : %s\n", string_iter_ptr->c_str());
|
|
|
|
if ( !strcmp ( pc_ptr->severity, "critical" ))
|
|
{
|
|
pc_ptr->sev = SEVERITY_CRITICAL ;
|
|
}
|
|
else if ( !strcmp ( pc_ptr->severity, "major" ))
|
|
{
|
|
pc_ptr->sev = SEVERITY_MAJOR ;
|
|
}
|
|
else if ( !strcmp ( pc_ptr->severity, "minor" ))
|
|
{
|
|
pc_ptr->sev = SEVERITY_MINOR ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s has invalid severity ; ignoring\n", pc_ptr->process );
|
|
pc_ptr->ignore = strdup ("ignored");
|
|
}
|
|
|
|
/* Bind the process timer to the process struct */
|
|
pc_ptr->pt_ptr = &ptimer[_pmon_ctrl_ptr->processes] ;
|
|
|
|
/* set the timer service owner to the process name */
|
|
pc_ptr->pt_ptr->service = pc_ptr->process ;
|
|
|
|
pc_ptr->restarts_cnt = 0 ;
|
|
pc_ptr->pid = 0 ;
|
|
pc_ptr->child_pid = 0 ;
|
|
pc_ptr->restart = false ;
|
|
pc_ptr->failed = false ;
|
|
pc_ptr->status_failed = false ;
|
|
pc_ptr->was_failed = false ;
|
|
pc_ptr->sigchld_rxed = false ;
|
|
|
|
ilog ("%7s Mon : %-27s %-8s\n", pc_ptr->mode,
|
|
pc_ptr->process,
|
|
pc_ptr->ignore ? "ignored" : pc_ptr->severity);
|
|
pc_ptr->status_stage = STATUS_STAGE__BEGIN ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("Status Parse Failure: %s\n", string_iter_ptr->c_str());
|
|
wlog ("Status Mask Expected: %x Detected: %x\n", CONF_STATUS_MON_MASK, pc_ptr->status_mask );
|
|
return (FAIL);
|
|
}
|
|
|
|
return (PASS);
|
|
}
|
|
|
|
|
|
if (( pc_ptr->mask == CONF_MASK ) &&
|
|
( pc_ptr->process[0] != '\0' ) &&
|
|
( pc_ptr->severity[0] != '\0'))
|
|
{
|
|
dlog1 ("Config File : %s\n", string_iter_ptr->c_str());
|
|
|
|
if ( !strcmp ( pc_ptr->severity, "critical" ))
|
|
{
|
|
pc_ptr->sev = SEVERITY_CRITICAL ;
|
|
}
|
|
else if ( !strcmp ( pc_ptr->severity, "major" ))
|
|
{
|
|
pc_ptr->sev = SEVERITY_MAJOR ;
|
|
}
|
|
else if ( !strcmp ( pc_ptr->severity, "minor" ))
|
|
{
|
|
pc_ptr->sev = SEVERITY_MINOR ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s has invalid severity ; ignoring\n", pc_ptr->process );
|
|
pc_ptr->ignore = strdup ("ignored");
|
|
}
|
|
|
|
/* Bind the process timer to the process struct */
|
|
pc_ptr->pt_ptr = &ptimer[_pmon_ctrl_ptr->processes] ;
|
|
|
|
/* Init the timer for this process */
|
|
mtcTimer_init ( pc_ptr->pt_ptr ) ;
|
|
pc_ptr->pt_ptr->hostname = pc_ptr->process ;
|
|
pc_ptr->pt_ptr->service = pc_ptr->process ;
|
|
|
|
pc_ptr->restarts_cnt = 0 ;
|
|
pc_ptr->debounce_cnt = 0 ;
|
|
pc_ptr->pid = 0 ;
|
|
pc_ptr->child_pid = 0 ;
|
|
pc_ptr->restart = false ;
|
|
pc_ptr->failed = false ;
|
|
pc_ptr->sigchld_rxed = false ;
|
|
pc_ptr->stopped = false ;
|
|
|
|
pc_ptr->alarm_severity = FM_ALARM_SEVERITY_CLEAR ;
|
|
|
|
if (( _pmon_ctrl_ptr->system_type != SYSTEM_TYPE__NORMAL ) &&
|
|
( pc_ptr->subfunction != NULL ))
|
|
{
|
|
/* subfunction process monitoring is deferred until
|
|
* that subfunction init is complete */
|
|
ilog ("%7s Def : %-30s %-8s - %s (%s)\n", pc_ptr->mode,
|
|
pc_ptr->process,
|
|
pc_ptr->ignore ? "ignored" : pc_ptr->severity, recovery_method_buf,
|
|
pc_ptr->subfunction);
|
|
/* defer subfunction processes to the FSM to get enabled */
|
|
pc_ptr->stage = PMON_STAGE__POLLING ;
|
|
pc_ptr->pt_ptr->ring = true ;
|
|
}
|
|
else
|
|
{
|
|
/* if not a subfunction then monitoring defaults
|
|
* to true immediately */
|
|
pc_ptr->passive_monitoring = true ;
|
|
|
|
ilog ("%7s Mon : %-30s %-8s - %s\n", pc_ptr->mode,
|
|
pc_ptr->process,
|
|
pc_ptr->ignore ? "ignored" : pc_ptr->severity, recovery_method_buf);
|
|
pc_ptr->stage = PMON_STAGE__MANAGE ;
|
|
}
|
|
// mem_log_process ( pc_ptr );
|
|
}
|
|
else
|
|
{
|
|
wlog ("Parse Failure: %s\n", string_iter_ptr->c_str());
|
|
wlog ("Mask Expected: %x Detected: %x\n", CONF_MASK, pc_ptr->mask );
|
|
return (FAIL);
|
|
}
|
|
|
|
if ( !strcmp ( pc_ptr->mode, "active" ) )
|
|
{
|
|
if ( pc_ptr->amask == CONF_AMON_MASK )
|
|
{
|
|
if (( pc_ptr->period == 0 ) ||
|
|
( pc_ptr->period > PMON_MAX_ACTIVE_PERIOD ))
|
|
{
|
|
elog ("%s monitor period out-of-range (%d secs), setting to max\n",
|
|
pc_ptr->process,
|
|
pc_ptr->period );
|
|
|
|
pc_ptr->period = PMON_MAX_ACTIVE_PERIOD ;
|
|
}
|
|
if ( pc_ptr->timeout > pc_ptr->period )
|
|
{
|
|
elog ("%s monitor 'timeout' longer than 'period' (%d:%d secs), rounding down\n",
|
|
pc_ptr->process,
|
|
pc_ptr->timeout,
|
|
pc_ptr->period );
|
|
|
|
pc_ptr->timeout = pc_ptr->period ;
|
|
}
|
|
|
|
/* Init the active component */
|
|
pc_ptr->active_stage = ACTIVE_STAGE__PULSE_REQUEST ;
|
|
pc_ptr->active_monitoring = true ;
|
|
pc_ptr->active_failed = false ;
|
|
pc_ptr->pulse_count = 0 ;
|
|
pc_ptr->b2b_miss_peak = 0 ;
|
|
pc_ptr->b2b_miss_count = 0 ;
|
|
}
|
|
else
|
|
{
|
|
wlog ("%s Parse Failure\n", string_iter_ptr->c_str());
|
|
wlog ("%s Active Mask Expected: %x Detected: %x\n",
|
|
pc_ptr->process,
|
|
CONF_AMON_MASK,
|
|
pc_ptr->amask );
|
|
return (FAIL);
|
|
}
|
|
}
|
|
return (PASS);
|
|
}
|
|
|
|
/* Read the pid of the specified process from its pidfile.
 *
 * Returns the positive pid read from the file or 0 when there is no
 * pidfile name, the file cannot be opened, no integer can be read
 * from it or the value read is negative.
 *
 * Negative values are rejected because callers hand the returned
 * pid to kill() where a negative pid addresses a process group
 * rather than a single process. */
int get_process_pid ( process_config_type * ptr )
{
    int pid = 0 ;

    /* fopen must not be called with a null path */
    if (( ptr == NULL ) || ( ptr->pidfile == NULL ))
    {
        return (0);
    }

    FILE * pid_file_stream = fopen ( ptr->pidfile, "r" );
    if ( pid_file_stream != NULL )
    {
        int num = fscanf ( pid_file_stream, "%d", &pid);
        if ( num != 1 )
        {
            wlog ("fscanf failed to read pid from %s\n", ptr->pidfile );
            pid = 0 ;
        }
        else if ( pid < 0 )
        {
            wlog ("invalid pid (%d) read from %s\n", pid, ptr->pidfile );
            pid = 0 ;
        }
        fclose (pid_file_stream);
    }
    return (pid);
}
|
|
|
|
/* search the process list for the child_pid in
|
|
* order to find the parent it is associated with */
|
|
process_config_type * find_parent_process ( int child_pid )
|
|
{
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
if ( process_config[i].child_pid == child_pid )
|
|
{
|
|
return (&process_config[i]);
|
|
}
|
|
}
|
|
/* look based on PID */
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
if ( process_config[i].pid == child_pid )
|
|
{
|
|
return (&process_config[i]);
|
|
}
|
|
}
|
|
|
|
return (NULL);
|
|
}
|
|
|
|
/* search the process list for the child_pid in
|
|
* order to find the parent it is associated with */
|
|
bool want_degrade_clear ( void )
|
|
{
|
|
int i ;
|
|
bool clear = true ;
|
|
for ( i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
/* Don't report current or previous status on
|
|
* processes that are not being monitored */
|
|
if (( !process_config[i].passive_monitoring ) &&
|
|
( !process_config[i].status_monitoring ))
|
|
{
|
|
continue ;
|
|
}
|
|
if (( process_config[i].failed == true ) || ( process_config[i].active_failed == true ))
|
|
{
|
|
if (( process_config[i].alarm_severity == FM_ALARM_SEVERITY_MAJOR ) ||
|
|
( process_config[i].alarm_severity == FM_ALARM_SEVERITY_CRITICAL ))
|
|
{
|
|
wlog ("%s is still failed '%s' ; degrade assert\n",
|
|
process_config[i].process,
|
|
alarmUtil_getSev_str(process_config[i].alarm_severity).c_str());
|
|
|
|
/* Resend the process event to maintenance every threshold count */
|
|
if ( ++process_config[i].audit_alarm_refresh_count > AUDIT_EVENT_SEND_REFESH_THRESHOLD )
|
|
{
|
|
process_config[i].audit_alarm_refresh_count = 0 ;
|
|
if ( process_config[i].alarm_severity == FM_ALARM_SEVERITY_MAJOR )
|
|
pmon_send_event ( MTC_EVENT_PMON_MAJOR, &process_config[i] ) ;
|
|
else
|
|
pmon_send_event ( MTC_EVENT_PMON_CRIT, &process_config[i] ) ;
|
|
}
|
|
clear = false ;
|
|
}
|
|
}
|
|
}
|
|
return (clear);
|
|
}
|
|
|
|
static char unknown_process[] = "unknown process" ;
|
|
bool kill_running_process ( int pid )
|
|
{
|
|
bool rc = false ;
|
|
if ( pid )
|
|
{
|
|
int result = kill ( pid, 0 );
|
|
if ( result == 0 )
|
|
{
|
|
char * proc_name_ptr = &unknown_process[0] ;
|
|
process_config_type * ptr = find_parent_process ( pid ) ;
|
|
if ( ptr )
|
|
{
|
|
daemon_remove_file ( ptr->pidfile );
|
|
proc_name_ptr = (char*)ptr->process ;
|
|
}
|
|
result = kill ( pid, SIGKILL );
|
|
if ( ptr && ( result == 0 ) )
|
|
{
|
|
if ( daemon_is_file_present ( ptr->pidfile ) )
|
|
{
|
|
if ( get_process_pid ( ptr ) == pid )
|
|
{
|
|
ilog ("%s removing stale pidfile (%d) %s\n", ptr->process, pid, ptr->pidfile );
|
|
daemon_remove_file ( ptr->pidfile );
|
|
}
|
|
}
|
|
wlog ("%s kill succeeded (%d)\n", proc_name_ptr, pid );
|
|
rc = true ;
|
|
}
|
|
else
|
|
{
|
|
ilog ("%s kill failed (%d)\n", proc_name_ptr, pid );
|
|
}
|
|
}
|
|
}
|
|
return (rc);
|
|
}
|
|
|
|
/* if the child (startup script) pid is still running then kill it */
|
|
void kill_running_child ( process_config_type * ptr )
|
|
{
|
|
if ( ptr->child_pid )
|
|
{
|
|
if ( kill_running_process ( ptr->child_pid ) == true )
|
|
{
|
|
wlog ("%s start script still running (%d) ; killed\n", ptr->process, ptr->child_pid );
|
|
}
|
|
ptr->child_pid = 0 ;
|
|
}
|
|
}
|
|
|
|
/* Check whether the specified process is running.
 *
 * Reads the pid from the process's pidfile and probes it with
 * signal 0.
 *
 * Returns true with ptr->pid set to the pid read from the pidfile
 * when that pid can be signalled ; otherwise returns false with
 * ptr->pid set to 0.
 *
 * The previously known pid is sampled before ptr->pid is updated so
 * that a pid change and a newly running process are logged. A pid
 * change is only logged ; the new pid is adopted and the process is
 * reported as running, the same result callers got before the
 * change and first-run logs were made reachable. */
bool process_running ( process_config_type * ptr )
{
    const int prev_pid = ptr->pid ;
    const int pid      = get_process_pid ( ptr );

    if ( pid == 0 )
    {
        ilog ("%s process not running\n", ptr->process );
        ptr->pid = 0 ;
        return (false);
    }

    int result = kill (pid, 0 );
    if ( result != 0 )
    {
        dlog ("%s process not running (kill 0 result:%d) (get_process_pid:%d)\n", ptr->process, result, pid );
        ptr->pid = 0 ;
        return (false);
    }

    /* adopt the pid that was just confirmed to be running */
    ptr->pid = pid ;

    if (( prev_pid != 0 ) && ( prev_pid != pid ))
    {
        wlog ("%s pid changed (was:%d now:%d)\n",
                  ptr->process ,
                  prev_pid,
                  pid);
    }
    else if ( prev_pid == 0 )
    {
        ilog ("%s Running (%d)\n", ptr->process, ptr->pid);
    }
    else
    {
        dlog1 ("%s Running (%d) (%d)\n", ptr->process, pid, ptr->pid );
    }
    return (true) ;
}
|
|
|
|
/* Temporary till we get kernel event */
|
|
void _get_events ( void )
|
|
{
|
|
int pid = 0 ;
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
bool running = false ;
|
|
|
|
if ( process_config[i].ignore == true )
|
|
{
|
|
process_config[i].failed = false ;
|
|
process_config[i].restart= false ;
|
|
continue ;
|
|
}
|
|
|
|
else if ( process_config[i].stage == PMON_STAGE__POLLING )
|
|
{
|
|
continue ;
|
|
}
|
|
|
|
else if ( process_config[i].status_monitoring )
|
|
{
|
|
continue ;
|
|
}
|
|
|
|
/* Skip already failed processes */
|
|
else if ( process_config[i].failed == false )
|
|
{
|
|
if ((pid = get_process_pid ( &process_config[i] )))
|
|
{
|
|
int result = kill (pid, 0 );
|
|
process_config[i].pid = pid ;
|
|
if ( result == 0 )
|
|
{
|
|
dlog3 ("%s (%d) is running\n", process_config[i].process, pid);
|
|
running = true ;
|
|
}
|
|
else
|
|
{
|
|
dlog ("%s (%d) not running (%d:%d) (%s)\n",
|
|
process_config[i].process, pid,
|
|
result, errno, strerror(errno)) ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
dlog ("%s Pid (unknown) - no pidfile\n", process_config[i].process )
|
|
}
|
|
|
|
/* If not running then fail the process
|
|
* to trigger auto-recovery */
|
|
if ( running == false )
|
|
{
|
|
wlog ("%s Not Running\n", process_config[i].process );
|
|
|
|
manage_process_failure ( &process_config[i] );
|
|
}
|
|
}
|
|
}
|
|
|
|
/* turn off the audit */
|
|
_pmon_ctrl_ptr->run_audit = false ;
|
|
}
|
|
|
|
/* This is the data structure for requesting process death
 * (and other state change) information. A sig of -1 means
 * query, a sig of 0 means deregistration and a positive sig
 * means that you want to set it. sig and events are value-result
 * and will be updated with the previous values on every
 * successful call. */
|
|
|
|
/**
 * Withdraw a process' task state notification request.
 *
 * When the process has a recorded pid, a PR_DO_NOTIFY_TASK_STATE request
 * with sig=0 is issued for it. A failure other than ESRCH is logged as a
 * warning ; ESRCH is silently tolerated. The process is marked as not
 * registered whether or not a request was made or succeeded.
 *
 * @param ptr - the process to unregister
 * @return always PASS
 */
int unregister_process ( process_config_type * ptr )
{
    dlog1 ("%s pid %d\n", ptr->process, ptr->pid );

    if ( ptr->pid != 0 )
    {
        struct task_state_notify_info request ;
        request.pid    = ptr->pid ;
        request.sig    = 0 ;
        request.events = PMON_EVENT_FLAGS ;

        const int prctl_rc = prctl ( PR_DO_NOTIFY_TASK_STATE, &request );
        if ( prctl_rc == 0 )
        {
            ilog ("%s unregistered (%d)\n", ptr->process, ptr->pid );
        }
        else if ( errno != ESRCH )
        {
            wlog ("%s unregister pid:%d (%d:%s)\n",
                      ptr->process,
                      ptr->pid,
                      errno,
                      strerror(errno) );
        }
    }

    ptr->registered = false ;
    return (PASS);
}
|
|
|
|
/**
 * Register a process for monitoring.
 *
 * In event mode (and when the process is not ignored) a
 * PR_DO_NOTIFY_TASK_STATE request is issued for the process pid using
 * PMON_RT_SIGNAL. If that request fails with EINVAL the daemon is switched
 * to polling mode ; any other failure marks the process failed.
 *
 * In polling mode the process is marked registered and its running state
 * is checked with process_running().
 *
 * A process that ends up marked failed (including one for which no pid
 * could be obtained) is passed to manage_process_failure().
 *
 * @param ptr - the process to register
 * @return PASS if the process is not marked failed on exit ; FAIL otherwise
 */
int register_process ( process_config_type * ptr )
{
    int pid = get_process_pid ( ptr );
    if ( pid )
    {
        ptr->pid = pid ;
        ptr->restart= false ;
        if (( _pmon_ctrl_ptr->event_mode ) && ( !ptr->ignore ))
        {
            struct task_state_notify_info info ;
            info.pid = pid ;
            info.sig = PMON_RT_SIGNAL ;
            info.events = PMON_EVENT_FLAGS;
            if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info ) )
            {
                /* Capture errno before logging ; the logging call may
                 * itself modify errno which would corrupt the EINVAL
                 * test that selects polling mode below. */
                const int prctl_errno = errno ;

                elog ("%s failed to register pid:%d (%d:%s)\n", ptr->process, pid, prctl_errno, strerror(prctl_errno));
                if ( prctl_errno == EINVAL )
                {
                    _pmon_ctrl_ptr->event_mode = false ;
                    wlog ( "%s Switching to Polling mode\n", ptr->process);
                }
                else
                {
                    ptr->failed = true ;
                }
            }
            else
            {
                ilog ("%s Registered (%d)\n", ptr->process , pid );
                ptr->failed = false ;
                ptr->registered = true ;
                passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ;
                if ( ptr->active_monitoring == false )
                {
                    manage_alarm ( ptr, PMON_CLEAR );
                }
            }
        }

        /* Don't 'else' because event mode might
         * change in the above clause */
        if ( _pmon_ctrl_ptr->event_mode == false )
        {
            wlog ("%s Registered (%d) in polling mode\n",
                      ptr->process , pid);

            /* prevent infinite reg retry in polling mode */
            ptr->registered = true ;

            if ( process_running ( ptr ) == false )
            {
                ptr->failed = true ;
            }
            else
            {
                ptr->failed = false ;
                manage_alarm ( ptr, PMON_CLEAR );
                passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ;
            }
        }
    }
    else
    {
        ilog ("%s is not running\n", ptr->process );
        ptr->failed = true ;
    }

    if ( ptr->failed )
    {
        manage_process_failure ( ptr );
        return (FAIL);
    }
    else
    {
        return (PASS);
    }
}
|
|
|
|
|
|
/* This respawns a process through the 'script' string from the process config file.
|
|
* The pmond log files are first closed so their fd's are not duped to the child.
|
|
* The syslog facility is used to log child messages to user.log
|
|
* The waitpid interface is used to manage acknowledging the exit of the child process */
|
|
|
|
#define PMOND_EXECV_ARGS (4)
|
|
|
|
int respawn_process ( process_config_type * ptr )
|
|
{
|
|
pid_t pid ;
|
|
|
|
int rc = PASS ;
|
|
bool restart = false ;
|
|
|
|
unregister_process ( ptr );
|
|
if ( process_running ( ptr ) == true )
|
|
{
|
|
ilog ("%s restart of running process\n", ptr->process );
|
|
restart = true ;
|
|
kill_running_process ( ptr->pid );
|
|
}
|
|
|
|
ptr->restarts_cnt++ ;
|
|
|
|
/* default restart result and ponitoring controls */
|
|
ptr->status = RETRY ; /* keep looking */
|
|
ptr->pidwait_cnt = 0 ; /* TODO: should be a timer .... start count */
|
|
ptr->sigchld_rxed = false ; /* sigchild handler did not run */
|
|
|
|
/* Fork the daemon to trigger the process specific restart */
|
|
ptr->child_pid = pid = fork () ;
|
|
if (pid == 0)
|
|
{
|
|
/* execv arg list */
|
|
char * argv[PMOND_EXECV_ARGS] ;
|
|
for ( int i = 0 ; i < PMOND_EXECV_ARGS ; i++ ) argv[i] = NULL ;
|
|
|
|
char recovery_cmd[_MAX_LEN_] ;
|
|
|
|
bool close_file_descriptors = true ;
|
|
if ( setup_child ( close_file_descriptors ) != PASS )
|
|
exit(EXIT_FAILURE);
|
|
|
|
signal (SIGCHLD, SIG_DFL);
|
|
|
|
openlog ((char*)ptr->process, LOG_PID, LOG_USER );
|
|
|
|
/* Default File Creation Mask */
|
|
umask(022);
|
|
|
|
memset (recovery_cmd,0,sizeof(recovery_cmd));
|
|
|
|
ilog ("Service:%s\n", ptr->service ? ptr->service : "unknown");
|
|
|
|
#define SYSTEMCTL_CMD "/usr/bin/systemctl"
|
|
#define RESTART_CMD "restart"
|
|
#define START_CMD "start"
|
|
if ( get_ctrl_ptr()->recovery_method == PMOND_RECOVERY_METHOD__SYSTEMD )
|
|
{
|
|
|
|
/* systemd recovery method - if the service is specified then it takes precidence */
|
|
if ( ptr->service )
|
|
sprintf ( &recovery_cmd[0], "%s", ptr->service );
|
|
else
|
|
sprintf ( &recovery_cmd[0], "%s", ptr->process );
|
|
|
|
argv[0] = (char*)&SYSTEMCTL_CMD ; /* path to executable */
|
|
argv[1] = (char*)&RESTART_CMD ; /* the recovery command */
|
|
argv[2] = &recovery_cmd[0] ; /* the process name */
|
|
}
|
|
else
|
|
{
|
|
/* init script method */
|
|
snprintf( &recovery_cmd[0], _MAX_LEN_, "%s", ptr->script ) ;
|
|
argv[0] = &recovery_cmd[0] ; /* path to script */
|
|
argv[1] = (restart ? (char*)&RESTART_CMD : (char*)&START_CMD) ; /* the process name */
|
|
}
|
|
|
|
rc = execv(argv[0], argv );
|
|
if ( 0 > rc )
|
|
{
|
|
syslog ( LOG_WARNING, "%s recovery failed with method '%s': (%s %s %s) (%d:%m)\n",
|
|
ptr->process,
|
|
ptr->recovery_method,
|
|
argv[0],
|
|
argv[1],
|
|
argv[2] ? "" : argv[2] ,
|
|
errno );
|
|
}
|
|
else
|
|
{
|
|
syslog ( LOG_INFO, "%s recovered witb method '%s': (%s %s %s)\n",
|
|
ptr->process,
|
|
ptr->recovery_method,
|
|
argv[0],
|
|
argv[1],
|
|
argv[2] ? "" : argv[2] );
|
|
}
|
|
|
|
closelog();
|
|
exit (rc);
|
|
}
|
|
if ( pid == -1 )
|
|
{
|
|
elog ("%s fork failed (%s)\n", ptr->process , strerror(errno));
|
|
|
|
/* TODO: Consider making this a critical fault
|
|
* after 100 retries.
|
|
* All possibilities based on man page are
|
|
* due to resource limitations and if that does
|
|
* not resolve in 100 retries then it probably will never.
|
|
**/
|
|
return (FAIL);
|
|
}
|
|
|
|
gettime ( ptr->time_start );
|
|
|
|
ilog ("%s Spawn (%d)\n", ptr->process, ptr->child_pid );
|
|
|
|
return (PASS);
|
|
}
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Name : execute_start_command
|
|
*
|
|
* Purpose : execute start script command
|
|
|
|
*****************************************************************************/
|
|
int execute_start_command(process_config_type * ptr)
|
|
{
|
|
pid_t child_pid;
|
|
|
|
wlog("%s process(es) start\n", ptr->process);
|
|
|
|
dlog ("Main Pid:%d \n", getpid() );
|
|
|
|
ptr->sigchld_rxed = false ; /* sigchild handler did not run */
|
|
|
|
ptr->child_pid = child_pid = fork ();
|
|
if (child_pid == 0)
|
|
{
|
|
dlog ("Child Pid:%d \n", getpid() );
|
|
|
|
char* argv[] = { basename((char*)ptr->script), (char*)ptr->start_arg, NULL};
|
|
char cmd[MAX_FILE_SIZE] ;
|
|
memset (cmd,0,sizeof(cmd));
|
|
|
|
snprintf ( &cmd[0], MAX_FILE_SIZE, "%s", ptr->script);
|
|
|
|
bool close_file_descriptors = true ;
|
|
if ( setup_child ( close_file_descriptors ) != PASS )
|
|
{
|
|
exit(255);
|
|
}
|
|
|
|
/* Set child to ignore child exit */
|
|
signal (SIGCHLD, SIG_DFL);
|
|
|
|
/* Setup the exec arguement */
|
|
int res = execv(cmd, argv);
|
|
elog ( "Failed to run %s return code:%d error:%s\n", cmd, res, strerror(errno) );
|
|
exit (255);
|
|
}
|
|
|
|
if ( child_pid == -1 )
|
|
{
|
|
elog ("Fork failed (%s)\n", strerror(errno));
|
|
return (FAIL);
|
|
}
|
|
|
|
gettime ( ptr->time_start );
|
|
|
|
return (PASS);
|
|
}
|
|
|
|
/*****************************************************************************
|
|
*
|
|
* Name : execute_status_command
|
|
*
|
|
* Purpose : execute status script command
|
|
|
|
*****************************************************************************/
|
|
int execute_status_command (process_config_type * ptr)
|
|
{
|
|
pid_t child_pid;
|
|
|
|
dlog("%s process(es) status query\n", ptr->process);
|
|
dlog ("Main Pid:%d \n", getpid() );
|
|
|
|
ptr->sigchld_rxed = false ; /* sigchild handler did not run */
|
|
|
|
ptr->child_pid = child_pid = fork ();
|
|
if (child_pid == 0)
|
|
{
|
|
dlog ("Child Pid:%d \n", getpid() );
|
|
|
|
char* argv[] = {basename((char*)ptr->script), (char*)ptr->status_arg, NULL};
|
|
char cmd[MAX_FILE_SIZE] ;
|
|
memset (cmd,0,sizeof(cmd));
|
|
|
|
snprintf ( &cmd[0], MAX_FILE_SIZE, "%s", ptr->script);
|
|
|
|
bool close_file_descriptors = true ;
|
|
if ( setup_child ( close_file_descriptors ) != PASS )
|
|
{
|
|
exit(255);
|
|
}
|
|
|
|
/* Set child to ignore child exit */
|
|
signal (SIGCHLD, SIG_DFL);
|
|
|
|
/* Setup the exec arguement */
|
|
int res = execv(cmd, argv);
|
|
elog ( "Failed to run %s return code:%d error:%s\n", cmd, res, strerror(errno) );
|
|
exit (255);
|
|
}
|
|
|
|
if ( child_pid == -1 )
|
|
{
|
|
elog ("Fork failed (%s)\n", strerror(errno));
|
|
return (FAIL);
|
|
}
|
|
|
|
gettime ( ptr->time_start );
|
|
|
|
return (PASS);
|
|
}
|
|
|
|
void daemon_sigchld_hdlr ( void )
|
|
{
|
|
pid_t tpid = 0 ;
|
|
bool found = 0 ;
|
|
int status = 0 ;
|
|
|
|
dlog("Received SIGCHLD ...\n");
|
|
|
|
while ( 0 < ( tpid = waitpid ( -1, &status, WNOHANG | WUNTRACED )))
|
|
{
|
|
process_config_type * process_ptr = find_parent_process ( tpid ) ;
|
|
if ( process_ptr )
|
|
{
|
|
process_ptr->sigchld_rxed = true ;
|
|
|
|
if (WIFEXITED(status))
|
|
{
|
|
if ( process_ptr->status_monitoring == false )
|
|
{
|
|
dlog ("%s spawn script exited properly (%d)\n", process_ptr->process, tpid );
|
|
}
|
|
else
|
|
{
|
|
/* with status mode we do not need to wait for a timeout since we got a response */
|
|
/* force a ring */
|
|
process_ptr->pt_ptr->ring = true;
|
|
}
|
|
|
|
gettime ( process_ptr->time_stop );
|
|
timedelta ( process_ptr->time_start,
|
|
process_ptr->time_stop,
|
|
process_ptr->time_delta );
|
|
|
|
/* only print log if there is an error */
|
|
process_ptr->status = WEXITSTATUS(status) ;
|
|
|
|
if ( process_ptr->status )
|
|
{
|
|
if ( process_ptr->status_monitoring == false )
|
|
{
|
|
dlog ("%s spawn failed (rc:%d:%x) (%ld.%03ld secs)\n",
|
|
process_ptr->process,
|
|
process_ptr->status,
|
|
process_ptr->status,
|
|
process_ptr->time_delta.secs,
|
|
process_ptr->time_delta.msecs/1000);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if ( process_ptr->status_monitoring == false )
|
|
{
|
|
/* only print this log if the spawn time took longer than 1 second */
|
|
if ( process_ptr->time_delta.secs )
|
|
{
|
|
ilog ("%s spawned in %ld.%03ld secs\n",
|
|
process_ptr->process,
|
|
process_ptr->time_delta.secs,
|
|
process_ptr->time_delta.msecs/1000);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else if (WIFSIGNALED(status))
|
|
{
|
|
process_ptr->status = FAIL ;
|
|
wlog ("%s test uncaught signal\n", process_ptr->process );
|
|
}
|
|
else if (WIFSTOPPED(status))
|
|
{
|
|
process_ptr->status = FAIL ;
|
|
wlog ("%s test stopped.\n", process_ptr->process );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
dlog ("parent process for PID:%d lookup failed ; reaped likely after timeout\n", tpid );
|
|
return ;
|
|
}
|
|
}
|
|
if ( ( tpid > 0 ) && ( found == false ) )
|
|
{
|
|
wlog ("PID:%d found no corresponding process\n", tpid );
|
|
}
|
|
}
|
|
|
|
/**
 * Clear, log or assert the alarm of a monitored process.
 *
 * The alarm entity is the process name, optionally followed by
 * " (<text>)" where <text> comes from get_status_failure_text() for
 * status monitored processes that have a failure text file.
 *
 * PMON_CLEAR : clears the alarm if one is recorded and clears 'failed'.
 * PMON_LOG   : logs the failure and sends MTC_EVENT_PMON_LOG.
 * otherwise  : clears a pending 'restart' request, marks the process
 *              failed and raises the alarm that matches ptr->sev, unless
 *              an alarm of that severity is already recorded.
 *
 * @param ptr    - the process whose alarm is managed
 * @param action - PMON_CLEAR, PMON_LOG or an assert request
 * @return PASS ; RETRY if ptr->sev holds an unsupported severity, in
 *         which case the severity is reset to clear and 'failed' cleared
 */
int manage_alarm ( process_config_type * ptr, int action )
{
    pmon_ctrl_type * ctrl_ptr = get_ctrl_ptr () ;

    /* build the alarm entity ; check for extra text */
    string processInfo = ptr->process;
    if((ptr->status_monitoring ) && (ptr->status_failure_text_file))
    {
        const string detail = get_status_failure_text(ptr);
        if ( detail.empty() == false )
        {
            processInfo.append(" (").append(detail).append(")");
        }
    }

    if ( action == PMON_CLEAR )
    {
        if ( ptr->alarm_severity != FM_ALARM_SEVERITY_CLEAR )
        {
            ilog ("%s from '%s' to 'clear'\n", ptr->process, alarmUtil_getSev_str(ptr->alarm_severity).c_str());
            pmonAlarm_clear ( ctrl_ptr->my_hostname, PMON_ALARM_ID__PMOND, processInfo );
            ptr->alarm_severity = FM_ALARM_SEVERITY_CLEAR ;
        }
        ptr->failed = false ;
        return (PASS);
    }

    if ( action == PMON_LOG )
    {
        /* CGTS 4010: Pmon logs and alarm ID should not be identical.
         *            Choice was made to not raise pmon logs for process
         *            failures (no pmonAlarm_minor_log call is made here).
         *            If that changes in the future then a different
         *            number from 200.006 should be used. */
        const char * recovery_hint =
            (ptr->restarts == 0) ? "Manual recovery is required." : "Auto recovery in progress." ;

        ilog ("%s process has failed ; %s\n", ptr->process, recovery_hint );

        /* This call only creates a log entry in mtcAgent.log */
        pmon_send_event ( MTC_EVENT_PMON_LOG, ptr ) ;
        return (PASS);
    }

    /* Assert path ; handle as error now rather than command */
    if ( ptr->restart == true )
    {
        ptr->restart = false ;
    }

    int rc = PASS ;
    switch ( ptr->sev )
    {
        case SEVERITY_CRITICAL:
        {
            wlog ("%s Critical Assert\n", ptr->process );
            ptr->failed = true ;
            if ( ptr->alarm_severity != FM_ALARM_SEVERITY_CRITICAL )
            {
                pmonAlarm_critical ( ctrl_ptr->my_hostname, PMON_ALARM_ID__PMOND, processInfo );
                ptr->alarm_severity = FM_ALARM_SEVERITY_CRITICAL ;
            }
            break ;
        }
        case SEVERITY_MAJOR:
        {
            wlog ("%s Major Assert\n", ptr->process );
            ptr->failed = true ;
            if ( ptr->alarm_severity != FM_ALARM_SEVERITY_MAJOR )
            {
                pmonAlarm_major ( ctrl_ptr->my_hostname, PMON_ALARM_ID__PMOND, processInfo );
                ptr->alarm_severity = FM_ALARM_SEVERITY_MAJOR ;
            }
            break ;
        }
        case SEVERITY_MINOR:
        {
            wlog ("%s Minor Assert\n", ptr->process );
            ptr->failed = true ;
            if ( ptr->alarm_severity != FM_ALARM_SEVERITY_MINOR )
            {
                pmonAlarm_minor ( ctrl_ptr->my_hostname, PMON_ALARM_ID__PMOND, processInfo, ptr->restarts );
                ptr->alarm_severity = FM_ALARM_SEVERITY_MINOR ;
            }
            break ;
        }
        default:
        {
            /* unsupported severity ; reset it and ask the caller to retry */
            slog ("%s has Invalid Severity", ptr->process);
            ptr->sev = SEVERITY_CLEAR ;
            ptr->failed = false ;
            rc = RETRY ;
            break ;
        }
    }
    return (rc);
}
|
|
|
|
|
|
/*********************************************************************************
|
|
*
|
|
* Name : _process_death_hdlr
|
|
*
|
|
* Purpose : Handle realtime signal events from "Notification of death
|
|
* of arbitrary process" (NODOAP) service in the kernel.
|
|
*
|
|
* Description: This handler is bound into the kernel with signal_hdlr_init
|
|
* Monitored processes are registered with the NODOAP feature
|
|
*
|
|
* 1. when service starts
|
|
* 2. after a process is re-spawned and deemed stable and recovered
|
|
*
|
|
* The kernel passes the pid of the dead process in through info_ptr->si_pid.
|
|
* This handler searches the process list for that pid. If found then it triggers
|
|
* that process to be recovered by the fsm. if that process for some crazy reason
|
|
* is already in the failed state then this handler deferrs to allowing the fsm
|
|
* to complete.
|
|
*
|
|
* If the pid is not found in the process control structure then the pidfiles
|
|
* are searched. if the process is not fould in that secondary search then the
|
|
* handler forces the get_events audit to run as a catch all.
|
|
*
|
|
* Note: The _get_events audit already runs periodically but at a much slower rate.
|
|
*
|
|
* Update: emacdona: commented out debug logs as we should not be logging
|
|
* in a signal handler
|
|
*
|
|
*/
|
|
void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr )
|
|
{
|
|
UNUSED(context_ptr);
|
|
UNUSED(sig_num) ;
|
|
|
|
if ( info_ptr )
|
|
{
|
|
process_config_type * ptr = &process_config[0] ;
|
|
bool found = false ;
|
|
dlog ("Sig:%d Pid:%d Code:%d Exit:%d\n",
|
|
info_ptr->si_signo,
|
|
info_ptr->si_pid,
|
|
info_ptr->si_code,
|
|
info_ptr->si_status );
|
|
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
ptr = &process_config[i] ;
|
|
|
|
if ( ptr->pid == info_ptr->si_pid )
|
|
{
|
|
found = true ;
|
|
|
|
if ( ptr->failed != true )
|
|
{
|
|
ptr->failed = true ;
|
|
manage_process_failure ( ptr );
|
|
}
|
|
break ;
|
|
}
|
|
}
|
|
if ( !found )
|
|
{
|
|
for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ )
|
|
{
|
|
int pid ;
|
|
ptr = &process_config[i] ;
|
|
|
|
if ((pid = get_process_pid ( ptr )))
|
|
{
|
|
if ( pid == info_ptr->si_pid )
|
|
{
|
|
found = true ;
|
|
if ( ptr->failed != true )
|
|
{
|
|
/* One notification from the kernel is all we need */
|
|
manage_process_failure ( ptr );
|
|
}
|
|
break ;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if ( !found )
|
|
{
|
|
/* Failed to find process for pid */
|
|
/* Forcing _get_events audit */
|
|
_pmon_ctrl_ptr->run_audit = true ;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* Handler called with NULL siginfo pointer */
|
|
/* Forcing _get_events audit */
|
|
_pmon_ctrl_ptr->run_audit = true ;
|
|
}
|
|
}
|
|
|
|
/************************************************************************
|
|
*
|
|
* Name : manage_process_alarms
|
|
*
|
|
* Description: This interface manages process alarms over a process
|
|
* configuration reload
|
|
*
|
|
* Steps:
|
|
*
|
|
* 1. Loop over each item in the list and mark the process as failed
|
|
* with the specified severity level.
|
|
*
|
|
* 2. If the process is not found then clear its alarm as it is no
|
|
* longer a valid process in the new profile and we don't want a
|
|
* lingering stuck alarm.
|
|
*
|
|
*************************************************************************/
|
|
|
|
void manage_process_alarms ( list<active_process_alarms_type> & _list,
|
|
process_config_type * const ptr,
|
|
int const processes )
|
|
{
|
|
/* get out if the list is empty ; should not have been called if
|
|
* empty but ... just in case */
|
|
if ( ! _list.empty() )
|
|
{
|
|
list<active_process_alarms_type>::iterator _iter_ptr ;
|
|
|
|
/* loop over the list ... */
|
|
for ( _iter_ptr=_list.begin(); _iter_ptr!=_list.end(); ++_iter_ptr )
|
|
{
|
|
/* for each item assum it is not found */
|
|
bool found = false ;
|
|
|
|
/* try and find this process in the new process profile */
|
|
for ( int i = 0 ; i < processes ; i++ )
|
|
{
|
|
if ( ! _iter_ptr->process.compare((ptr+i)->process) )
|
|
{
|
|
/* If the process is found then mark it as failed and update its severity.
|
|
* At this point we then assume that there is an alarm raised for this process. */
|
|
found = true ;
|
|
|
|
(ptr+i)->failed = false ;
|
|
wlog ("%s process was failed critical ; clearing existing alarm\n", _iter_ptr->process.c_str() );
|
|
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
|
|
}
|
|
}
|
|
|
|
/* if not found then just clear the alarm */
|
|
if ( found == false)
|
|
{
|
|
wlog ("%s process alarm clear ; not in current process profile\n", _iter_ptr->process.c_str() );
|
|
pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Process monitor main service loop ; never returns.
 *
 * Startup: sends an initial host watchdog message, loads the process
 * configuration, sets up the inotify watch on CONFIG_DIR and starts the
 * audit, degrade audit, pulse and host watchdog timers.
 *
 * Each loop iteration:
 *  - refreshes the timing values from the daemon config (hup reconfig),
 *  - runs the daemon signal handler,
 *  - waits on the command, event and active monitoring sockets with
 *    select (also used as the loop delay) and services the inboxes,
 *  - sends the pulse and host watchdog messages when their timers ring
 *    (only in event mode),
 *  - skips all monitoring while the system state is 'stopping',
 *  - schedules a config reload when an inotify event is seen,
 *  - runs the _get_events audit and the degrade audit / config reload,
 *  - runs the monitoring handler that applies to each process,
 *  - registers running processes that are not yet registered,
 *  - optionally logs a failed-process mask and flushes stdout/stderr.
 *
 * @param ctrl_ptr - the pmon control structure
 */
void pmon_service ( pmon_ctrl_type * ctrl_ptr )
{
    std::list<int> socks ;
    struct timeval waitd;
    fd_set readfds;
    int select_fail_count = 0 ;
    int flush_thld = 0 ;
    int rc = PASS ;
    int shutdown_log_throttle = 0;

    /* iNotify stuff */
    bool inotify_fault = false ;

    daemon_config_type * cfg_ptr = daemon_get_cfg_ptr ();
    pmon_socket_type * sock_ptr = pmon_getSock_ptr ();
    int select_timeout = (cfg_ptr->audit_period*100);
    int audit_period = (cfg_ptr->audit_period/10);
    int pulse_period = cfg_ptr->audit_period ;
    int hostwd_period = (cfg_ptr->hostwd_update_period);
    int degrade_period = (cfg_ptr->audit_period/50);

    /* fall back to 10 when the derived periods round down to zero */
    if ( audit_period == 0 ) audit_period = 10 ;
    if ( degrade_period == 0 ) degrade_period = 10 ;

    ilog ("Starting to monitor processes\n");
    pmon_send_hostwd ( );

    /* Load and register generic processes - not subfunction processes */
    load_processes ();

    /* Setup inotify to watch CONFIG_DIR */
    if ( set_inotify_watch ( CONFIG_DIR, ctrl_ptr->fd, ctrl_ptr->wd ) )
        inotify_fault = true ;

    /* sorted so that socks.back() is the highest fd for select */
    socks.clear();
    socks.push_front (sock_ptr->cmd_sock->getFD());
    socks.push_front (sock_ptr->event_sock->getFD());
    socks.push_front (sock_ptr->amon_sock);
    socks.sort();

    ilog ("Starting 'Audit' timer (%d secs)\n", audit_period );
    mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period );

    ilog ("Starting 'Degrade Audit' timer (%d secs)\n", degrade_period );
    mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, degrade_period );

    /* the pulse timer is started through the msec interface ; log it as such */
    ilog ("Starting 'Pulse' timer (%d msecs)\n", pulse_period );
    mtcTimer_start_msec ( pmonTimer_pulse, pmon_timer_handler, pulse_period );

    ilog ("Starting 'Host Watchdog' timer (%d secs)\n", hostwd_period );
    mtcTimer_start ( pmonTimer_hostwd, pmon_timer_handler, hostwd_period );

    for ( ; ; )
    {
        /* Accommodate for hup reconfig */
        select_timeout = (cfg_ptr->audit_period*100);
        audit_period = (cfg_ptr->audit_period/10);
        degrade_period = (cfg_ptr->audit_period/50);

        if ( audit_period < 1 ) audit_period = 10 ;
        if ( degrade_period < 1 ) degrade_period = 10 ;

        daemon_signal_hdlr ();

        /* Initialize the master fd_set */
        FD_ZERO(&readfds);
        if ( sock_ptr->cmd_sock->getFD() )
        {
            FD_SET(sock_ptr->cmd_sock->getFD(), &readfds);
        }
        if ( sock_ptr->event_sock->getFD() )
        {
            FD_SET(sock_ptr->event_sock->getFD(), &readfds);
        }
        if ( sock_ptr->amon_sock )
        {
            FD_SET(sock_ptr->amon_sock, &readfds);
        }

        waitd.tv_sec = 0;
        waitd.tv_usec = select_timeout ;

        /* This is used as a delay up to select_timeout */
        rc = select( socks.back()+1, &readfds, NULL, NULL, &waitd);
        /* If the select time out expired then */
        if (( rc < 0 ) || ( rc == 0 ))
        {
            /* Check to see if the select call failed. */
            /* ... but filter Interrupt signal */
            if (( rc < 0 ) && ( errno != EINTR ))
            {
                wlog_throttled ( select_fail_count, 20,
                                 "Socket Select Failed (rc:%d) %s \n",
                                 errno, strerror(errno));
            }
        }
        else
        {
            if ( FD_ISSET(sock_ptr->cmd_sock->getFD(), &readfds))
            {
                pmon_service_inbox ();
            }

            if (FD_ISSET(sock_ptr->amon_sock, &readfds))
            {
                amon_service_inbox ( _pmon_ctrl_ptr->processes );
            }
        }

        if (pmonTimer_pulse.ring == true )
        {
            pmonTimer_pulse.ring = false ;
            /* Send a I'm Alive message to the pulse interface */
            /* Robustness Update: Added an event_mode bool that will
             * be true if the kernel supports notification of death
             * of arbitrary process patch. If that feature is not present
             * then allow pmon to operate but in a degraded state. Eventually
             * we can turn this into a customer alarm/log.
             * Degrade is achieved by not sending the pulses to the watcher.
             */
            if ( ctrl_ptr->event_mode == true )
            {
                pmon_send_pulse ( );
            }
        }

        /* Avoid pmond thrashing trying to recover processes during
         * system shutdown. */
        if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING )
        {
            wlog_throttled ( shutdown_log_throttle, 500,
                             "process monitoring disabled during system shutdown\n");
            usleep (500);
            continue ;
        }
        if ( shutdown_log_throttle ) shutdown_log_throttle = 0 ;

        if ( inotify_fault == false )
        {
            if ( get_inotify_events ( ctrl_ptr->fd ) == true )
            {
                if ( _pmon_ctrl_ptr->reload_config == false )
                {
                    _pmon_ctrl_ptr->reload_config = true ;
                    ilog ("Setting config reload flag\n");

                    /* Hijack the degrade audit timer for the next period
                     * for config reload */
                    mtcTimer_reset (pmonTimer_degrade);
                    if ( daemon_is_file_present ( PATCHING_IN_PROG_FILE ) == true )
                    {
                        _pmon_ctrl_ptr->patching_in_progress = true ;
                        wlog ("Patching in progress ; delaying config reload by 30 secs...\n");
                        mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, (degrade_period + 30) );
                    }
                    else
                    {
                        mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, degrade_period );
                    }
                }
            }
        }

        if ( pmonTimer_hostwd.ring == true )
        {
            /* inservice recovery from hostw connection failures */
            if ( sock_ptr->hostwd_sock == 0 )
            {
                hostwd_port_init();
            }
            /* the ring is only acknowledged in event mode */
            if ( ctrl_ptr->event_mode == true )
            {
                pmon_send_hostwd ( );
                pmonTimer_hostwd.ring = false;
            }
        }

        /* Run Get Events by audit timer */
        if (pmonTimer_audit.ring == true )
        {
            _get_events ();
            mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period );
        }

        /* Run the degrade set/clear by audit */
        if (pmonTimer_degrade.ring == true )
        {
            /* run the degrade clear audit */
            if ( want_degrade_clear () == true )
            {
                dlog ("sending degrade clear\n");
                pmon_send_event ( MTC_EVENT_PMON_CLEAR, &process_config[0] ) ;
            }
            else
            {
                /* only logged ; no degrade assert event is sent from here */
                dlog ("sending degrade assert\n");
            }

            /* Check for config reload state request */
            if ( _pmon_ctrl_ptr->reload_config == true )
            {
                /* But defer it while there is a process in the
                 * manually requested restart state */
                bool restart_request_active = false ;
                for ( int i = 0 ; i < ctrl_ptr->processes ; i++ )
                {
                    if ( process_config[i].restart == true )
                    {
                        wlog ("deferring process config reload to next audit\n");
                        wlog ("... while manual restart of '%s' is in progress\n",
                                  process_config[i].process );
                        restart_request_active = true ;
                        break ;
                    }
                }
                if ( restart_request_active == false )
                {
                    load_processes ();
                }
            }
            mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, degrade_period );
        }

        /* Get_events run by forced audit or not in event mode */
        else if (( ctrl_ptr->run_audit == true ) ||
                 ( ctrl_ptr->event_mode == false ))
        {
            _get_events ( );
        }

        /* Monitor Processes */
        for ( int i = 0 ; i < ctrl_ptr->processes ; i++ )
        {
            /* Allow a process to be ignored */
            if ( process_config[i].ignore == true )
            {
                process_config[i].failed = false ;
                process_config[i].active_failed = false ;

                /* Handle process auto recovery from stopped state */
                if (( process_config[i].pt_ptr->ring == true ) && ( process_config[i].stopped == true ))
                {
                    elog ("%s process was stopped but never restarted ; auto recovery in progress\n", process_config[i].process );
                    process_config[i].stopped = false ;
                    process_config[i].ignore = false ;
                    passiveStageChange ( &process_config[i], PMON_STAGE__MANAGE );
                }
                continue ;
            }
            else if ( process_config[i].status_monitoring )
            {
                pmon_status_handler ( &process_config[i] );
            }
            else if (( process_config[i].stage == PMON_STAGE__POLLING ) ||
                     ( process_config[i].stage == PMON_STAGE__START_WAIT ) ||
                     ( process_config[i].restart == true ) ||
                     ( process_config[i].failed == true ))
            {
                /* Run the FSM for this failed process */
                pmon_passive_handler ( &process_config[i] ) ;
            }
            else if ( process_config[i].active_monitoring )
            {
                if ( process_config[i].failed == false )
                {
                    pmon_active_handler ( &process_config[i] );
                }
                else
                {
                    elog ("%s Failed Active Monitoring ... recovering.\n", process_config[i].process );
                    manage_process_failure ( &process_config[i]) ;
                }
            }

            /* Audit to ensure that running processes are
             * registered with the kernel */
            if (( process_config[i].registered == false ) &&
                ( _pmon_ctrl_ptr->event_mode ) &&
                ( process_config[i].restart == false ) &&
                ( process_config[i].failed == false ) &&
                ( process_config[i].ignore == false ))
            {
                int pid = get_process_pid ( &process_config[i] );
                if ( pid )
                {
                    if ( kill (pid, 0 ) == 0 )
                    {
                        process_config[i].pid = pid ;
                        register_process ( &process_config[i] );
                    }
                }
            }
        }

        /* Debugging */
        if (daemon_get_cfg_ptr()->debug_level & 1 )
        {
            /* two characters per process plus room for the terminating
             * NUL so the mask is still a valid string when the process
             * count equals MAX_PROCESSES */
            char proc_mask [(MAX_PROCESSES*2)+1] ;
            bool somefailed = false ;
            memset (&proc_mask[0], 0, sizeof(proc_mask));
            for ( int x = 0 , y = 0 ; x < ctrl_ptr->processes ; x++, y+=2 )
            {
                if ( process_config[x].failed )
                {
                    proc_mask[y] = '1' ;
                    somefailed = true ;
                }
                else
                    proc_mask[y] = '0' ;
                proc_mask[y+1] = ' ' ;
            }
            if ( somefailed )
            {
                alog ( "Process Mask: %s\n", &proc_mask[0] );
            }
        }

        /* Support the log flush config option */
        if ( cfg_ptr->flush )
        {
            if ( ++flush_thld > cfg_ptr->flush )
            {
                flush_thld = 0 ;
                fflush (stdout);
                fflush (stderr);
            }
        }

    }
}
|
|
|
|
/**
 * Read the extra status failure text for a process.
 *
 * Returns the first line (as read by fgets, up to
 * MAX_STATUS_ERROR_TEXT_LEN) of the process' status failure text file.
 *
 * @param ptr - the process whose failure text file is read
 * @return the text read ; an empty string if no file is configured, the
 *         file cannot be opened (a warning is logged) or nothing is read
 */
string get_status_failure_text ( process_config_type * ptr )
{
    string extra_text("");

    /* no file configured */
    if ( ptr->status_failure_text_file == NULL )
        return extra_text;
    if ( ptr->status_failure_text_file[0] == '\0' )
        return extra_text;

    FILE * text_stream = fopen ( ptr->status_failure_text_file, "r" );
    if ( text_stream == NULL )
    {
        wlog (" Failed to get extra alam text from file %s\n",
                ptr->status_failure_text_file );
        return extra_text;
    }

    char line[MAX_STATUS_ERROR_TEXT_LEN];
    char * read_rc = fgets ( line, MAX_STATUS_ERROR_TEXT_LEN, text_stream );
    if ( read_rc != NULL )
    {
        extra_text = line;
    }
    fclose(text_stream);

    return extra_text;
}
|
|
|
|
/****************************************************************************
|
|
*
|
|
* Name : quorum_process_failure
|
|
*
|
|
* Description: manage debounce and log report of quorum process failure
|
|
*
|
|
* Warnings : Only call this when there is a quorum process faiure
|
|
* that has exceeded the threshold count.
|
|
*
|
|
****************************************************************************/
|
|
void quorum_process_failure ( process_config_type * ptr )
|
|
{
|
|
wlog ("%s quorum process %s\n",
|
|
ptr->process,
|
|
ptr->quorum_failure ? "unrecoverable" : "failed" );
|
|
|
|
if ( ptr->quorum_failure == true )
|
|
{
|
|
ptr->quorum_unrecoverable = true;
|
|
}
|
|
else
|
|
{
|
|
ptr->quorum_failure = true;
|
|
}
|
|
}
|
|
|