/* * Copyright (c) 2013-2016 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * */ /** * @file * Wind River CGCS Platform Process Monitor Service Handler */ #include /* dirname */ using namespace std; #include "pmon.h" #include "nodeEvent.h" /* for ... set_inotify_watch, set_inotify_close */ #include "nodeTimers.h" /* for ... mtcTimer_init */ #include "alarmUtil.h" /* for ... alarmUtil_getSev_str */ #include "pmonAlarm.h" /* for ... PMON_ALARM_ID__PMOND */ /* Preserve a local copy of a pointer to the control struct to * avoid having to publish a get utility prototype into pmon.h */ static pmon_ctrl_type * _pmon_ctrl_ptr = NULL ; void pmon_set_ctrl_ptr ( pmon_ctrl_type * ctrl_ptr ) { /* Save the control pointer */ _pmon_ctrl_ptr = ctrl_ptr ; } /* pmonTimer_audit - get_events periodic audit timer */ static struct mtc_timer pmonTimer_audit ; static struct mtc_timer pmonTimer_degrade ; static struct mtc_timer pmonTimer_pulse ; static struct mtc_timer pmonTimer_hostwd ; static struct mtc_timer ptimer[MAX_PROCESSES] ; /** List of config files */ std::list config_files ; std::list::iterator string_iter_ptr ; /* If there is an alarm in the list that matches one in the process list * then update that process with its severity and failed state. * If there is a process in the saved list that is not in the process list * then clear its alarm as it is no longer valid. */ void manage_process_alarms ( list & _list, process_config_type * const ptr, int const processes ); static process_config_type process_config[MAX_PROCESSES] ; /* lookup process control by index and return its pointer if found. * Otherwise if not found return NULL */ process_config_type * get_process_config_ptr ( int index ) { if ( index < _pmon_ctrl_ptr->processes ) return ( &process_config[index] ); return ( NULL ); } /* lookup process control by name and return its pointer if found. * Otherwise if not found return NULL */ process_config_type * get_process_config_ptr ( string process ) { if ( _pmon_ctrl_ptr ) { for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { if ( process.compare(process_config[i].process) == 0 ) { dlog ("%s process found\n", process.c_str()); return (&process_config[i]); } } } wlog ("%s process not found in control list\n", process.c_str()); return (NULL); } #define _MAX_LEN_ (MAX_FILE_SIZE*2) /******************************************************************* * Process Dump Support * ******************************************************************* * * * Utilities that add specific config lines to the dump list * * * ******************************************************************/ /* Log nostname, ip, mac and pulse period */ void mem_log_ctrl ( pmon_ctrl_type * ptr ) { #define MAX_LEN 500 char str[MAX_LEN] ; snprintf (&str[0], MAX_LEN, "%s %s %s Pulse Rate:%d msecs\n", &ptr->my_hostname[0], ptr->my_address.c_str(), ptr->my_macaddr.c_str(), ptr->pulse_period ); mem_log(str); } /* Log process specific controls */ void mem_log_process ( process_config_type * ptr ) { #define MAX_LEN 500 char str[MAX_LEN] ; snprintf (&str[0], MAX_LEN, "%-25s [%5d] %8s Restarts:%u Interval:%u Debounce:%u Startuptime:%u\n", ptr->process, ptr->pid, ptr->severity, ptr->restarts, ptr->debounce, ptr->interval, ptr->startuptime); mem_log(str); } /* Log process specific state */ void mem_log_pstate ( process_config_type * ptr ) { #define MAX_LEN 500 char str[MAX_LEN] ; snprintf (&str[0], MAX_LEN, " Passive: %10s (%d) Failed:%s Restart:%s FCount:%2u subFunc:%s Severity:%s %s %s\n", get_pmonStage_str(ptr), ptr->stage, ptr->failed ? "true " : "false", ptr->restart ? "true " : "false", ptr->failed_cnt, ptr->subfunction ? ptr->subfunction : "None", alarmUtil_getSev_str(ptr->alarm_severity).c_str(), ptr->ignore ? "ignored" : "", ptr->stopped ? "stopped" : "" ); mem_log(str); } /* Log process specific active monitor controls */ void mem_log_aconfig ( process_config_type * ptr ) { #define MAX_LEN 500 char str[MAX_LEN] ; snprintf (&str[0], MAX_LEN, " Active : %10s (%d) Pulses:%2u Seq:%2u Period:%2u Timeout:%2u Thld:%2u %s\n", get_amonStage_str(ptr), ptr->active_stage, ptr->pulse_count, ptr->tx_sequence, ptr->period, ptr->timeout, ptr->threshold, ptr->waiting ? "... waiting" : ""); mem_log(str); } /* Log process specific active monitor state */ void mem_log_astate ( process_config_type * ptr ) { #define MAX_LEN 500 char str[MAX_LEN] ; snprintf (&str[0], MAX_LEN, " Stats - Failed:%s Count:%2u b2bp:%2u b2bc:%2u rxer:%2u txer:%2u msge:%2u msgp:%2u\n", ptr->active_failed ? "true " : "false", ptr->afailed_count, ptr->b2b_miss_peak, ptr->b2b_miss_count, ptr->recv_err_cnt, ptr->send_err_cnt, ptr->mesg_err_cnt, ptr->mesg_err_peak); mem_log(str); } /* Push daemon state to log file */ void daemon_dump_info ( void ) { if ( _pmon_ctrl_ptr ) { daemon_dump_membuf_banner(); mem_log_ctrl ( _pmon_ctrl_ptr ); daemon_dump_membuf(); for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { process_config_type * ptr = get_process_config_ptr(i); mem_log ('\n'); mem_log_process ( ptr ); mem_log_pstate ( ptr ); if ( ptr->active_monitoring ) { mem_log_aconfig ( ptr ); mem_log_astate ( ptr ); } } daemon_dump_membuf(); } } /******************************************************************* * Module Initialize and Finalizes Interfaces * ******************************************************************/ /* Initial init of timers. */ /* Not run on a sighup */ void pmon_timer_init ( void ) { mtcTimer_init ( pmonTimer_audit, _pmon_ctrl_ptr->my_hostname, "audit" ) ; mtcTimer_init ( pmonTimer_pulse, _pmon_ctrl_ptr->my_hostname, "pulse" ) ; mtcTimer_init ( pmonTimer_hostwd , _pmon_ctrl_ptr->my_hostname, "hostwd" ) ; mtcTimer_init ( pmonTimer_degrade, _pmon_ctrl_ptr->my_hostname, "degrade audit" ); for ( int i = 0 ; i < MAX_PROCESSES ; i++ ) { /* Bind the process timer to the process struct */ process_config[i].pt_ptr = &ptimer[i] ; /* Init the timer for this process */ mtcTimer_init ( process_config[i].pt_ptr, _pmon_ctrl_ptr->my_hostname, "process" ) ; } } void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr ); /* Register realtime signal handler with the kernel */ int signal_hdlr_init ( int sig_num ) { int rc ; memset (&_pmon_ctrl_ptr->info, 0, sizeof(_pmon_ctrl_ptr->info)); memset (&_pmon_ctrl_ptr->prev, 0, sizeof(_pmon_ctrl_ptr->info)); _pmon_ctrl_ptr->info.sa_sigaction = _process_death_hdlr ; _pmon_ctrl_ptr->info.sa_flags = (SA_NOCLDSTOP | SA_NOCLDWAIT | SA_SIGINFO) ; rc = sigaction ( sig_num, &_pmon_ctrl_ptr->info , &_pmon_ctrl_ptr->prev ); if ( rc ) { elog("Registering : Realtime Signal %d - (%d) (%s)\n", sig_num, errno, strerror(errno)); rc = FAIL_SIGNAL_INIT ; } else { ilog("Registering : Realtime Signal %d\n", sig_num); } return (rc) ; } /* * Init the handler * - Must support re-init that might occur over a SIGHUP **/ int pmon_hdlr_init ( pmon_ctrl_type * ctrl_ptr ) { int rc ; /* Save the control pointer */ _pmon_ctrl_ptr = ctrl_ptr ; /* Force running of the audit at the very start */ _pmon_ctrl_ptr->run_audit = true ; rc = signal_hdlr_init ( PMON_RT_SIGNAL ); /* Log the control setting going into the main loop */ mem_log_ctrl ( _pmon_ctrl_ptr ) ; /* init the inotify file descriptors */ _pmon_ctrl_ptr->fd = 0 ; _pmon_ctrl_ptr->wd = 0 ; return (rc) ; } /* Module Cleanup */ void pmon_hdlr_fini ( pmon_ctrl_type * ctrl_ptr ) { for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { /* Close any active monitoring sockets */ close_process_socket ( &process_config[i] ); } /* Turn off inotify */ set_inotify_close ( ctrl_ptr->fd, ctrl_ptr->wd ); } void manage_process_failure ( process_config_type * ptr ) { /******************************************************************* * The next 2 'if' clauses try to prevent raising alarms for * process failure detections while the host is shutting down. *******************************************************************/ /* When handling a process failure check to see if we are already in * the stopping state. * If not, then query the current system state and save it. */ if ( _pmon_ctrl_ptr->system_state != MTC_SYSTEM_STATE__STOPPING ) { /* update current state */ _pmon_ctrl_ptr->system_state = get_system_state(); } /* Ignore process failures while in stopping (i.e. shutdown) mode */ if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING ) { /* don't report process failures during system shutdown. */ wlog ("%s terminated by system shutdown (pid:%d) ; ignoring\n", ptr->process , ptr->pid ); ptr->ignore = true ; return ; } /* Should not need this clause */ if ( ptr->stopped == true ) { slog ("%s process is in the stopped state\n", ptr->process); } elog ("%s failed (%d) (p:%d a:%d)\n", ptr->process, ptr->pid, ptr->failed, ptr->active_failed); passiveStageChange ( ptr, PMON_STAGE__MANAGE) ; if ( ptr->failed == false ) { ptr->failed = true ; ptr->restart = false ; // pmon_send_event ( MTC_EVENT_PMON_LOG, ptr ) ; } /* TODO: Consider clearing active_failed flag regardless */ if ( ptr->active_monitoring == true ) { activeStageChange ( ptr, ACTIVE_STAGE__PULSE_REQUEST ) ; ptr->active_failed = true ; } } /* * Manage process config strdup memory over a config/reconfig. * On reconfig ; the PMOND_INIT_CHECK should be set and for each * config pointed that is not null ; feee the memory. */ void init_process_config_memory ( void ) { for ( int i = 0 ; i < MAX_PROCESSES ; i++ ) { if ( process_config[i].init_check == PMOND_INIT_CHECK ) { if ( process_config[i].process ) free ((void*)process_config[i].process); if ( process_config[i].service ) free ((void*)process_config[i].service); if ( process_config[i].script ) free ((void*)process_config[i].script); if ( process_config[i].style ) free ((void*)process_config[i].style); if ( process_config[i].pidfile ) free ((void*)process_config[i].pidfile); if ( process_config[i].severity ) free ((void*)process_config[i].severity); if ( process_config[i].mode ) free ((void*)process_config[i].mode); if ( process_config[i].start_arg ) free ((void*)process_config[i].start_arg); if ( process_config[i].status_arg) free ((void*)process_config[i].status_arg); if ( process_config[i].status_failure_text_file) free ((void*)process_config[i].status_failure_text_file); if ( process_config[i].subfunction ) free ((void*)process_config[i].subfunction); if ( process_config[i].recovery_method ) free ((void*)process_config[i].recovery_method); } /* init the process config memory ; now that we have freed past strdup allocations*/ memset ( (char*)&process_config[i], 0, sizeof(process_config_type)); } } /* Read and load process monitor configuration from * all the process config files from /etc/pmon.d */ void load_processes ( void ) { list saved_alarm_list ; int rc = PASS ; /* 1. Free timers, * 2. shutdown sockets * 3. track processes with raised alarms */ for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { mtcTimer_reset ( process_config[i].pt_ptr ); close_process_socket ( &process_config[i] ); } /* Query fm for existing pmon process alarms and * for each that is found store their 'name' and * 'severity' in the passed in saved list */ manage_queried_alarms ( saved_alarm_list ); /* init the process config memory */ init_process_config_memory (); /* Default to event mode */ _pmon_ctrl_ptr->event_mode = true ; /* Start with zero processes */ _pmon_ctrl_ptr->processes = 0 ; /* Read in the list of config files and their contents */ load_filenames_in_dir ( CONFIG_DIR, config_files ) ; ilog ("Loading Process Configurations\n"); ilog ("--------------------------------------------------------------\n"); /* Run Maintenance on Inventory */ for ( string_iter_ptr = config_files.begin () ; string_iter_ptr != config_files.end () ; ++string_iter_ptr ) { process_config_type * ptr = &process_config[_pmon_ctrl_ptr->processes] ; rc = process_config_load ( ptr, string_iter_ptr->data() ); if ( rc ) { memset ((char*)ptr, 0, sizeof(process_config_type)); } else { /* stages for passive and active monitoring are initially set * inside the process_config_load */ _pmon_ctrl_ptr->processes++ ; ptr->init_check = PMOND_INIT_CHECK ; } } pmon_send_event ( MTC_EVENT_PMON_CLEAR, &process_config[0] ) ; ilog ("Registering Processes With Kernel\n"); ilog ("---------------------------------------------------------------\n"); /* Register all the processes with the kernel */ for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { process_config[i].restart= false; process_config[i].failed = false; if ( process_config[i].status_monitoring ) { process_config[i].status_stage = STATUS_STAGE__BEGIN ; } else if ( process_config[i].stage == PMON_STAGE__MANAGE ) { register_process ( &process_config[i] ); if ( process_config[i].active_monitoring == true ) { if ( open_process_socket ( &process_config[i] ) != PASS ) { elog ("%s failed to open process socket\n", process_config[i].process ); } } } } _pmon_ctrl_ptr->reload_config = false ; /* If there were process alarms that existed over the reload * then ensure that those processes are updated with that information. */ if ( saved_alarm_list.size () ) { ilog ("there are %ld active alarms over reload\n", saved_alarm_list.size()); manage_process_alarms ( saved_alarm_list, &process_config[0], _pmon_ctrl_ptr->processes ); } } /* Looks up the timer ID and asserts the corresponding ringer */ void pmon_timer_handler ( int sig, siginfo_t *si, void *uc) { timer_t * tid_ptr = (void**)si->si_value.sival_ptr ; /* Avoid compiler errors/warnings for parms we must * have but currently do nothing with */ UNUSED(sig); UNUSED(uc); if ( !(*tid_ptr) ) { return ; } else if ( *tid_ptr == pmonTimer_pulse.tid ) { pmonTimer_pulse.ring = true ; } else if ( *tid_ptr == pmonTimer_degrade.tid ) { mtcTimer_stop_int_safe ( pmonTimer_degrade ); pmonTimer_degrade.ring = true ; _pmon_ctrl_ptr->patching_in_progress = false ; } else if ( *tid_ptr == pmonTimer_audit.tid ) { mtcTimer_stop_int_safe ( pmonTimer_audit ); pmonTimer_audit.ring = true ; } /* is host watchdog pmon timer */ else if ( *tid_ptr == pmonTimer_hostwd.tid ) { pmonTimer_hostwd.ring = true ; /* we do not stop the timer; instead let it auto-restart */ } else { bool found = false ; for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { if ( *tid_ptr == process_config[i].pt_ptr->tid ) { mtcTimer_stop_int_safe ( process_config[i].pt_ptr ); process_config[i].pt_ptr->ring = true ; found = true ; break ; } } if ( !found ) { //wlog ("Unknown timer\n"); /* try and cleanup by stopping this unknown timer via its tid */ mtcTimer_stop_tid_int_safe (tid_ptr); } } } /**************************************************************************** * * Name : service_file_exists * * Description: Look in some well known places for the specified service file. * * Returns : Return true if the specified service file is found. * * Updates : If the service file is found then update the supplied * character string buffer with the full path/name of that * service file. * ****************************************************************************/ bool service_file_exists ( string service_filename, char * path_n_name_ptr, int max_len ) { /* load the name of the service file */ snprintf ( path_n_name_ptr, max_len, "%s/%s", SYSTEMD_SERVICE_FILE_DIR1, service_filename.data()); if (( path_n_name_ptr ) && (strnlen ( path_n_name_ptr, max_len ))) { if ( daemon_is_file_present ( path_n_name_ptr ) == true ) return true ; } snprintf ( path_n_name_ptr, max_len, "%s/%s", SYSTEMD_SERVICE_FILE_DIR2, service_filename.data()); if (( path_n_name_ptr ) && ( strnlen ( path_n_name_ptr, max_len ))) { if ( daemon_is_file_present ( path_n_name_ptr ) == true ) return true ; } return false ; } /***************************************************************************** * * Name : process_config_load * * Purpose : Load the content of a config file * *****************************************************************************/ int process_config_load (process_config_type * pc_ptr, const char * config_file_ptr ) { char recovery_method_buf [_MAX_LEN_] ; memset (recovery_method_buf,0, sizeof(recovery_method_buf)); if ( _pmon_ctrl_ptr->processes >= MAX_PROCESSES ) { wlog ("Cannot Monitor more than %d processes\n", MAX_PROCESSES ); return (FAIL); } /* Read the process config file */ pc_ptr->mask = 0 ; pc_ptr->amask = 0 ; pc_ptr->status_mask = 0 ; pc_ptr->status_monitoring = false; pc_ptr->passive_monitoring = false; pc_ptr->audit_alarm_refresh_count = 0 ; if (ini_parse( config_file_ptr, pmon_process_config, pc_ptr) < 0) { elog("Read Failure : %s\n", config_file_ptr ); return (FAIL); } /* Set some defaults just in case they were not specified */ if ( !pc_ptr->mode ) { pc_ptr->mode = strdup("Passive") ; } if ( !pc_ptr->startuptime ) { pc_ptr->startuptime = PMON_MIN_START_DELAY ; } /* Many process conf files came from a sysvinit origin and might not * have a service file label. Account for that in the following * load of recovery_method_buf. * Accept a script name if the service name is missing. */ bool recovery_method_found = false ; /* look for the service file */ if ( pc_ptr->service ) { string service = pc_ptr->service ; if ( service.find(".service") == string::npos ) service.append(".service"); if ( service_file_exists(service, &recovery_method_buf[0], _MAX_LEN_) == true ) recovery_method_found = true ; } else if ( pc_ptr->script ) { string script = basename((char*)pc_ptr->script); if ( script.find(".service") == string::npos ) script.append(".service"); if ( service_file_exists(script, &recovery_method_buf[0], _MAX_LEN_) == true ) recovery_method_found = true ; else { /* resort to the script file only */ /* load the name of the process init script */ snprintf ( &recovery_method_buf[0], _MAX_LEN_, "%s", pc_ptr->script ); if ( daemon_is_file_present ( recovery_method_buf ) == true ) { recovery_method_found = true ; } else { wlog ("%s has script but not found (%s)\n", pc_ptr->process, recovery_method_buf ); } } } else { /* print a log if we have no recovery method */ wlog ("%s has no recovery method ; process not monitored\n", pc_ptr->process ); wlog ("... conf file has no 'service' or 'script' recovery entry\n"); return (FAIL_NOT_FOUND); } if ( recovery_method_found == false ) { wlog ("%s has no recovery method found ; process not monitored\n", pc_ptr->process ); return (FAIL_NOT_FOUND); } update_config_option ( &pc_ptr->recovery_method , recovery_method_buf ); if ( !strcmp ( pc_ptr->mode, "status" ) ) { pc_ptr->status_monitoring = true; if (( pc_ptr->status_mask == CONF_STATUS_MON_MASK ) && ( pc_ptr->process[0] != '\0' ) && ( pc_ptr->severity[0] != '\0')) { dlog1 ("Config File : %s\n", string_iter_ptr->c_str()); if ( !strcmp ( pc_ptr->severity, "critical" )) { pc_ptr->sev = SEVERITY_CRITICAL ; } else if ( !strcmp ( pc_ptr->severity, "major" )) { pc_ptr->sev = SEVERITY_MAJOR ; } else if ( !strcmp ( pc_ptr->severity, "minor" )) { pc_ptr->sev = SEVERITY_MINOR ; } else { wlog ("%s has invalid severity ; ignoring\n", pc_ptr->process ); pc_ptr->ignore = strdup ("ignored"); } /* Bind the process timer to the process struct */ pc_ptr->pt_ptr = &ptimer[_pmon_ctrl_ptr->processes] ; /* set the timer service owner to the process name */ pc_ptr->pt_ptr->service = pc_ptr->process ; pc_ptr->restarts_cnt = 0 ; pc_ptr->pid = 0 ; pc_ptr->child_pid = 0 ; pc_ptr->restart = false ; pc_ptr->failed = false ; pc_ptr->status_failed = false ; pc_ptr->was_failed = false ; pc_ptr->sigchld_rxed = false ; ilog ("%7s Mon : %-27s %-8s\n", pc_ptr->mode, pc_ptr->process, pc_ptr->ignore ? "ignored" : pc_ptr->severity); pc_ptr->status_stage = STATUS_STAGE__BEGIN ; } else { wlog ("Status Parse Failure: %s\n", string_iter_ptr->c_str()); wlog ("Status Mask Expected: %x Detected: %x\n", CONF_STATUS_MON_MASK, pc_ptr->status_mask ); return (FAIL); } return (PASS); } if (( pc_ptr->mask == CONF_MASK ) && ( pc_ptr->process[0] != '\0' ) && ( pc_ptr->severity[0] != '\0')) { dlog1 ("Config File : %s\n", string_iter_ptr->c_str()); if ( !strcmp ( pc_ptr->severity, "critical" )) { pc_ptr->sev = SEVERITY_CRITICAL ; } else if ( !strcmp ( pc_ptr->severity, "major" )) { pc_ptr->sev = SEVERITY_MAJOR ; } else if ( !strcmp ( pc_ptr->severity, "minor" )) { pc_ptr->sev = SEVERITY_MINOR ; } else { wlog ("%s has invalid severity ; ignoring\n", pc_ptr->process ); pc_ptr->ignore = strdup ("ignored"); } /* Bind the process timer to the process struct */ pc_ptr->pt_ptr = &ptimer[_pmon_ctrl_ptr->processes] ; /* Init the timer for this process */ mtcTimer_init ( pc_ptr->pt_ptr ) ; pc_ptr->pt_ptr->hostname = pc_ptr->process ; pc_ptr->pt_ptr->service = pc_ptr->process ; pc_ptr->restarts_cnt = 0 ; pc_ptr->debounce_cnt = 0 ; pc_ptr->pid = 0 ; pc_ptr->child_pid = 0 ; pc_ptr->restart = false ; pc_ptr->failed = false ; pc_ptr->sigchld_rxed = false ; pc_ptr->stopped = false ; pc_ptr->alarm_severity = FM_ALARM_SEVERITY_CLEAR ; if (( _pmon_ctrl_ptr->system_type != SYSTEM_TYPE__NORMAL ) && ( pc_ptr->subfunction != NULL )) { /* subfunction process monitoring is deferred until * that subfunction init is complete */ ilog ("%7s Def : %-30s %-8s - %s (%s)\n", pc_ptr->mode, pc_ptr->process, pc_ptr->ignore ? "ignored" : pc_ptr->severity, recovery_method_buf, pc_ptr->subfunction); /* defer subfunction processes to the FSM to get enabled */ pc_ptr->stage = PMON_STAGE__POLLING ; pc_ptr->pt_ptr->ring = true ; } else { /* if not a subfunction then monitoring defaults * to true immediately */ pc_ptr->passive_monitoring = true ; ilog ("%7s Mon : %-30s %-8s - %s\n", pc_ptr->mode, pc_ptr->process, pc_ptr->ignore ? "ignored" : pc_ptr->severity, recovery_method_buf); pc_ptr->stage = PMON_STAGE__MANAGE ; } // mem_log_process ( pc_ptr ); } else { wlog ("Parse Failure: %s\n", string_iter_ptr->c_str()); wlog ("Mask Expected: %x Detected: %x\n", CONF_MASK, pc_ptr->mask ); return (FAIL); } if ( !strcmp ( pc_ptr->mode, "active" ) ) { if ( pc_ptr->amask == CONF_AMON_MASK ) { if (( pc_ptr->period == 0 ) || ( pc_ptr->period > PMON_MAX_ACTIVE_PERIOD )) { elog ("%s monitor period out-of-range (%d secs), setting to max\n", pc_ptr->process, pc_ptr->period ); pc_ptr->period = PMON_MAX_ACTIVE_PERIOD ; } if ( pc_ptr->timeout > pc_ptr->period ) { elog ("%s monitor 'timeout' longer than 'period' (%d:%d secs), rounding down\n", pc_ptr->process, pc_ptr->timeout, pc_ptr->period ); pc_ptr->timeout = pc_ptr->period ; } /* Init the active component */ pc_ptr->active_stage = ACTIVE_STAGE__PULSE_REQUEST ; pc_ptr->active_monitoring = true ; pc_ptr->active_failed = false ; pc_ptr->pulse_count = 0 ; pc_ptr->b2b_miss_peak = 0 ; pc_ptr->b2b_miss_count = 0 ; } else { wlog ("%s Parse Failure\n", string_iter_ptr->c_str()); wlog ("%s Active Mask Expected: %x Detected: %x\n", pc_ptr->process, CONF_AMON_MASK, pc_ptr->amask ); return (FAIL); } } return (PASS); } int get_process_pid ( process_config_type * ptr ) { int pid = 0 ; FILE * pid_file_stream = fopen ( ptr->pidfile, "r" ); if ( pid_file_stream != NULL ) { int num = fscanf ( pid_file_stream, "%d", &pid); if ( num != 1 ) { wlog ("fscanf failed to read pid from %s\n", ptr->pidfile ); } fclose (pid_file_stream); } return (pid); } /* search the process list for the child_pid in * order to find the parent it is associated with */ process_config_type * find_parent_process ( int child_pid ) { for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { if ( process_config[i].child_pid == child_pid ) { return (&process_config[i]); } } /* look based on PID */ for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { if ( process_config[i].pid == child_pid ) { return (&process_config[i]); } } return (NULL); } /* search the process list for the child_pid in * order to find the parent it is associated with */ bool want_degrade_clear ( void ) { int i ; bool clear = true ; for ( i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { /* Don't report current or previous status on * processes that are not being monitored */ if (( !process_config[i].passive_monitoring ) && ( !process_config[i].status_monitoring )) { continue ; } if (( process_config[i].failed == true ) || ( process_config[i].active_failed == true )) { if (( process_config[i].alarm_severity == FM_ALARM_SEVERITY_MAJOR ) || ( process_config[i].alarm_severity == FM_ALARM_SEVERITY_CRITICAL )) { wlog ("%s is still failed '%s' ; degrade assert\n", process_config[i].process, alarmUtil_getSev_str(process_config[i].alarm_severity).c_str()); /* Resend the process event to maintenance every threshold count */ if ( ++process_config[i].audit_alarm_refresh_count > AUDIT_EVENT_SEND_REFESH_THRESHOLD ) { process_config[i].audit_alarm_refresh_count = 0 ; if ( process_config[i].alarm_severity == FM_ALARM_SEVERITY_MAJOR ) pmon_send_event ( MTC_EVENT_PMON_MAJOR, &process_config[i] ) ; else pmon_send_event ( MTC_EVENT_PMON_CRIT, &process_config[i] ) ; } clear = false ; } } } return (clear); } static char unknown_process[] = "unknown process" ; bool kill_running_process ( int pid ) { bool rc = false ; if ( pid ) { int result = kill ( pid, 0 ); if ( result == 0 ) { char * proc_name_ptr = &unknown_process[0] ; process_config_type * ptr = find_parent_process ( pid ) ; if ( ptr ) { daemon_remove_file ( ptr->pidfile ); proc_name_ptr = (char*)ptr->process ; } result = kill ( pid, SIGKILL ); if ( ptr && ( result == 0 ) ) { if ( daemon_is_file_present ( ptr->pidfile ) ) { if ( get_process_pid ( ptr ) == pid ) { ilog ("%s removing stale pidfile (%d) %s\n", ptr->process, pid, ptr->pidfile ); daemon_remove_file ( ptr->pidfile ); } } wlog ("%s kill succeeded (%d)\n", proc_name_ptr, pid ); rc = true ; } else { ilog ("%s kill failed (%d)\n", proc_name_ptr, pid ); } } } return (rc); } /* if the child (startup script) pid is still running then kill it */ void kill_running_child ( process_config_type * ptr ) { if ( ptr->child_pid ) { if ( kill_running_process ( ptr->child_pid ) == true ) { wlog ("%s start script still running (%d) ; killed\n", ptr->process, ptr->child_pid ); } ptr->child_pid = 0 ; } } bool process_running ( process_config_type * ptr ) { int pid = get_process_pid ( ptr ); if ( pid ) { int result = kill (pid, 0 ); ptr->pid = pid ; if ( result == 0 ) { if (( ptr->pid != 0 ) && ( ptr->pid != pid )) { wlog ("%s pid changed (was:%d now:%d)\n", ptr->process , ptr->pid, pid); ptr->pid = 0 ; return (false); } else if (( ptr->pid == 0 ) && ( pid )) { ilog ("%s Running (%d)\n", ptr->process, ptr->pid); } else { dlog1 ("%s Running (%d) (%d)\n", ptr->process, pid, ptr->pid ); } return (true) ; } else { dlog ("%s process not running (kill 0 result:%d) (get_process_pid:%d)\n", ptr->process, result, pid ); } } else { ilog ("%s process not running\n", ptr->process ); } ptr->pid = 0 ; return (false); } /* Temporary till we get kernel event */ void _get_events ( void ) { int pid = 0 ; for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { bool running = false ; if ( process_config[i].ignore == true ) { process_config[i].failed = false ; process_config[i].restart= false ; continue ; } else if ( process_config[i].stage == PMON_STAGE__POLLING ) { continue ; } else if ( process_config[i].status_monitoring ) { continue ; } /* Skip already failed processes */ else if ( process_config[i].failed == false ) { if ((pid = get_process_pid ( &process_config[i] ))) { int result = kill (pid, 0 ); process_config[i].pid = pid ; if ( result == 0 ) { dlog3 ("%s (%d) is running\n", process_config[i].process, pid); running = true ; } else { dlog ("%s (%d) not running (%d:%d) (%s)\n", process_config[i].process, pid, result, errno, strerror(errno)) ; } } else { dlog ("%s Pid (unknown) - no pidfile\n", process_config[i].process ) } /* If not running then fail the process * to trigger auto-recovery */ if ( running == false ) { wlog ("%s Not Running\n", process_config[i].process ); manage_process_failure ( &process_config[i] ); } } } /* turn off the audit */ _pmon_ctrl_ptr->run_audit = false ; } /* This is the data structure for requestion process death * (and other state change) information. Sig of -1 means * query, sig of 0 means deregistration, positive sig means * that you want to set it. sig and events are value-result * and will be updated with the previous values on every * successful call. */ int unregister_process ( process_config_type * ptr ) { dlog1 ("%s pid %d\n", ptr->process, ptr->pid ); if ( ptr->pid ) { struct task_state_notify_info info ; info.pid = ptr->pid ; info.sig = 0 ; info.events = PMON_EVENT_FLAGS ; if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info )) { if ( errno != ESRCH ) { wlog ("%s unregister pid:%d (%d:%s)\n", ptr->process, ptr->pid, errno, strerror(errno) ); } } else { ilog ("%s unregistered (%d)\n", ptr->process, ptr->pid ); } } ptr->registered = false ; return (PASS); } int register_process ( process_config_type * ptr ) { int pid = get_process_pid ( ptr ); if ( pid ) { ptr->pid = pid ; ptr->restart= false ; if (( _pmon_ctrl_ptr->event_mode ) && ( !ptr->ignore )) { struct task_state_notify_info info ; info.pid = pid ; info.sig = PMON_RT_SIGNAL ; info.events = PMON_EVENT_FLAGS; if ( prctl (PR_DO_NOTIFY_TASK_STATE, &info ) ) { elog ("%s failed to register pid:%d (%d:%s)\n", ptr->process, pid, errno, strerror(errno)); if ( errno == EINVAL ) { _pmon_ctrl_ptr->event_mode = false ; wlog ( "%s Switching to Polling mode\n", ptr->process); } else { ptr->failed = true ; } } else { ilog ("%s Registered (%d)\n", ptr->process , pid ); ptr->failed = false ; ptr->registered = true ; passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ; if ( ptr->active_monitoring == false ) { manage_alarm ( ptr, PMON_CLEAR ); } } } /* Don't 'else' because event mode might * change in the above clause */ if ( _pmon_ctrl_ptr->event_mode == false ) { wlog ("%s Registered (%d) in polling mode\n", ptr->process , pid); /* prevent infinite reg retry in polling mode */ ptr->registered = true ; if ( process_running ( ptr ) == false ) { ptr->failed = true ; } else { ptr->failed = false ; manage_alarm ( ptr, PMON_CLEAR ); passiveStageChange ( ptr, PMON_STAGE__MANAGE ) ; } } } else { ilog ("%s is not running\n", ptr->process ); ptr->failed = true ; } if ( ptr->failed ) { manage_process_failure ( ptr ); return (FAIL); } else { return (PASS); } } /* This respawns a process through the 'script' string from the process config file. * The pmond log files are first closed so their fd's are not duped to the child. * The syslog facility is used to log child messages to user.log * The waitpid interface is used to manage acknowledging the exit of the child process */ #define PMOND_EXECV_ARGS (4) int respawn_process ( process_config_type * ptr ) { pid_t pid ; int rc = PASS ; bool restart = false ; unregister_process ( ptr ); if ( process_running ( ptr ) == true ) { ilog ("%s restart of running process\n", ptr->process ); restart = true ; kill_running_process ( ptr->pid ); } ptr->restarts_cnt++ ; /* default restart result and ponitoring controls */ ptr->status = RETRY ; /* keep looking */ ptr->pidwait_cnt = 0 ; /* TODO: should be a timer .... start count */ ptr->sigchld_rxed = false ; /* sigchild handler did not run */ /* Fork the daemon to trigger the process specific restart */ ptr->child_pid = pid = fork () ; if (pid == 0) { /* execv arg list */ char * argv[PMOND_EXECV_ARGS] ; for ( int i = 0 ; i < PMOND_EXECV_ARGS ; i++ ) argv[i] = NULL ; char recovery_cmd[_MAX_LEN_] ; bool close_file_descriptors = true ; if ( setup_child ( close_file_descriptors ) != PASS ) exit(EXIT_FAILURE); signal (SIGCHLD, SIG_DFL); openlog ((char*)ptr->process, LOG_PID, LOG_USER ); /* Default File Creation Mask */ umask(022); memset (recovery_cmd,0,sizeof(recovery_cmd)); ilog ("Service:%s\n", ptr->service ? ptr->service : "unknown"); #define SYSTEMCTL_CMD "/usr/bin/systemctl" #define RESTART_CMD "restart" #define START_CMD "start" if ( get_ctrl_ptr()->recovery_method == PMOND_RECOVERY_METHOD__SYSTEMD ) { /* systemd recovery method - if the service is specified then it takes precidence */ if ( ptr->service ) sprintf ( &recovery_cmd[0], "%s", ptr->service ); else sprintf ( &recovery_cmd[0], "%s", ptr->process ); argv[0] = (char*)&SYSTEMCTL_CMD ; /* path to executable */ argv[1] = (char*)&RESTART_CMD ; /* the recovery command */ argv[2] = &recovery_cmd[0] ; /* the process name */ } else { /* init script method */ snprintf( &recovery_cmd[0], _MAX_LEN_, "%s", ptr->script ) ; argv[0] = &recovery_cmd[0] ; /* path to script */ argv[1] = (restart ? (char*)&RESTART_CMD : (char*)&START_CMD) ; /* the process name */ } rc = execv(argv[0], argv ); if ( 0 > rc ) { syslog ( LOG_WARNING, "%s recovery failed with method '%s': (%s %s %s) (%d:%m)\n", ptr->process, ptr->recovery_method, argv[0], argv[1], argv[2] ? "" : argv[2] , errno ); } else { syslog ( LOG_INFO, "%s recovered witb method '%s': (%s %s %s)\n", ptr->process, ptr->recovery_method, argv[0], argv[1], argv[2] ? "" : argv[2] ); } closelog(); exit (rc); } if ( pid == -1 ) { elog ("%s fork failed (%s)\n", ptr->process , strerror(errno)); /* TODO: Consider making this a critical fault * after 100 retries. * All possibilities based on man page are * due to resource limitations and if that does * not resolve in 100 retries then it probably will never. **/ return (FAIL); } gettime ( ptr->time_start ); ilog ("%s Spawn (%d)\n", ptr->process, ptr->child_pid ); return (PASS); } /***************************************************************************** * * Name : execute_start_command * * Purpose : execute start script command *****************************************************************************/ int execute_start_command(process_config_type * ptr) { pid_t child_pid; wlog("%s process(es) start\n", ptr->process); dlog ("Main Pid:%d \n", getpid() ); ptr->sigchld_rxed = false ; /* sigchild handler did not run */ ptr->child_pid = child_pid = fork (); if (child_pid == 0) { dlog ("Child Pid:%d \n", getpid() ); char* argv[] = { basename((char*)ptr->script), (char*)ptr->start_arg, NULL}; char cmd[MAX_FILE_SIZE] ; memset (cmd,0,sizeof(cmd)); snprintf ( &cmd[0], MAX_FILE_SIZE, "%s", ptr->script); bool close_file_descriptors = true ; if ( setup_child ( close_file_descriptors ) != PASS ) { exit(255); } /* Set child to ignore child exit */ signal (SIGCHLD, SIG_DFL); /* Setup the exec arguement */ int res = execv(cmd, argv); elog ( "Failed to run %s return code:%d error:%s\n", cmd, res, strerror(errno) ); exit (255); } if ( child_pid == -1 ) { elog ("Fork failed (%s)\n", strerror(errno)); return (FAIL); } gettime ( ptr->time_start ); return (PASS); } /***************************************************************************** * * Name : execute_status_command * * Purpose : execute status script command *****************************************************************************/ int execute_status_command (process_config_type * ptr) { pid_t child_pid; dlog("%s process(es) status query\n", ptr->process); dlog ("Main Pid:%d \n", getpid() ); ptr->sigchld_rxed = false ; /* sigchild handler did not run */ ptr->child_pid = child_pid = fork (); if (child_pid == 0) { dlog ("Child Pid:%d \n", getpid() ); char* argv[] = {basename((char*)ptr->script), (char*)ptr->status_arg, NULL}; char cmd[MAX_FILE_SIZE] ; memset (cmd,0,sizeof(cmd)); snprintf ( &cmd[0], MAX_FILE_SIZE, "%s", ptr->script); bool close_file_descriptors = true ; if ( setup_child ( close_file_descriptors ) != PASS ) { exit(255); } /* Set child to ignore child exit */ signal (SIGCHLD, SIG_DFL); /* Setup the exec arguement */ int res = execv(cmd, argv); elog ( "Failed to run %s return code:%d error:%s\n", cmd, res, strerror(errno) ); exit (255); } if ( child_pid == -1 ) { elog ("Fork failed (%s)\n", strerror(errno)); return (FAIL); } gettime ( ptr->time_start ); return (PASS); } void daemon_sigchld_hdlr ( void ) { pid_t tpid = 0 ; bool found = 0 ; int status = 0 ; dlog("Received SIGCHLD ...\n"); while ( 0 < ( tpid = waitpid ( -1, &status, WNOHANG | WUNTRACED ))) { process_config_type * process_ptr = find_parent_process ( tpid ) ; if ( process_ptr ) { process_ptr->sigchld_rxed = true ; if (WIFEXITED(status)) { if ( process_ptr->status_monitoring == false ) { dlog ("%s spawn script exited properly (%d)\n", process_ptr->process, tpid ); } else { /* with status mode we do not need to wait for a timeout since we got a response */ /* force a ring */ process_ptr->pt_ptr->ring = true; } gettime ( process_ptr->time_stop ); timedelta ( process_ptr->time_start, process_ptr->time_stop, process_ptr->time_delta ); /* only print log if there is an error */ process_ptr->status = WEXITSTATUS(status) ; if ( process_ptr->status ) { if ( process_ptr->status_monitoring == false ) { dlog ("%s spawn failed (rc:%d:%x) (%ld.%03ld secs)\n", process_ptr->process, process_ptr->status, process_ptr->status, process_ptr->time_delta.secs, process_ptr->time_delta.msecs/1000); } } else { if ( process_ptr->status_monitoring == false ) { /* only print this log if the spawn time took longer than 1 second */ if ( process_ptr->time_delta.secs ) { ilog ("%s spawned in %ld.%03ld secs\n", process_ptr->process, process_ptr->time_delta.secs, process_ptr->time_delta.msecs/1000); } } } } else if (WIFSIGNALED(status)) { process_ptr->status = FAIL ; wlog ("%s test uncaught signal\n", process_ptr->process ); } else if (WIFSTOPPED(status)) { process_ptr->status = FAIL ; wlog ("%s test stopped.\n", process_ptr->process ); } } else { dlog ("parent process for PID:%d lookup failed ; reaped likely after timeout\n", tpid ); return ; } } if ( ( tpid > 0 ) && ( found == false ) ) { wlog ("PID:%d found no corresponding process\n", tpid ); } } int manage_alarm ( process_config_type * ptr, int action ) { int rc = PASS ; pmon_ctrl_type * ctrl_ptr = get_ctrl_ptr () ; string processInfo = ptr->process; // check for extra text if((ptr->status_monitoring ) && (ptr->status_failure_text_file)) { string extra_text = get_status_failure_text(ptr); if(!extra_text.empty()) { processInfo.append(" ("); processInfo.append(extra_text); processInfo.append(")"); } } if ( action == PMON_CLEAR ) { if ( ptr->alarm_severity != FM_ALARM_SEVERITY_CLEAR ) { ilog ("%s from '%s' to 'clear'\n", ptr->process, alarmUtil_getSev_str(ptr->alarm_severity).c_str()); pmonAlarm_clear ( ctrl_ptr->my_hostname, PMON_ALARM_ID__PMOND, processInfo ); ptr->alarm_severity = FM_ALARM_SEVERITY_CLEAR ; } ptr->failed = false ; } else if ( action == PMON_LOG ) { /* CGTS 4010: Pmon logs and alarm ID should not be identical. * Choice was made to not raise pmon logs for process * failures. If we do in the future then we should * use a different number from 200.006 * pmonAlarm_minor_log ( ctrl_ptr->my_hostname, PMON_ALARM_ID__PMOND, processInfo, ptr->restarts ); */ ilog ("%s process has failed ; %s\n", ptr->process, (ptr->restarts == 0) ? "Manual recovery is required." : "Auto recovery in progress."); /* Unlike the above call to pmonAlarm_minor_log, this call only creates a log entry in mtcAgent.log */ pmon_send_event ( MTC_EVENT_PMON_LOG, ptr ) ; } else { if ( ptr->restart == true ) { /* handle as error now rather than command */ ptr->restart = false ; } switch ( ptr->sev ) { case SEVERITY_CRITICAL: { wlog ("%s Critical Assert\n", ptr->process ); ptr->failed = true ; if ( ptr->alarm_severity != FM_ALARM_SEVERITY_CRITICAL ) { pmonAlarm_critical ( ctrl_ptr->my_hostname, PMON_ALARM_ID__PMOND, processInfo ); ptr->alarm_severity = FM_ALARM_SEVERITY_CRITICAL ; } break ; } case SEVERITY_MAJOR: { wlog ("%s Major Assert\n", ptr->process ); ptr->failed = true ; if ( ptr->alarm_severity != FM_ALARM_SEVERITY_MAJOR ) { pmonAlarm_major ( ctrl_ptr->my_hostname, PMON_ALARM_ID__PMOND, processInfo ); ptr->alarm_severity = FM_ALARM_SEVERITY_MAJOR ; } break ; } case SEVERITY_MINOR: { wlog ("%s Minor Assert\n", ptr->process ); ptr->failed = true ; if ( ptr->alarm_severity != FM_ALARM_SEVERITY_MINOR ) { pmonAlarm_minor ( ctrl_ptr->my_hostname, PMON_ALARM_ID__PMOND, processInfo, ptr->restarts ); ptr->alarm_severity = FM_ALARM_SEVERITY_MINOR ; } break ; } default: { slog ("%s has Invalid Severity", ptr->process); ptr->sev = SEVERITY_CLEAR ; ptr->failed = false ; rc = RETRY ; break ; } } } return (rc); } /********************************************************************************* * * Name : _process_death_hdlr * * Purpose : Handle realtime signal events from "Notification of death * of arbitrary process" (NODOAP) service in the kernel. * * Description: This handler is bound into the kernel with signal_hdlr_init * Monitored processes are registered with the NODOAP feature * * 1. when service starts * 2. after a process is re-spawned and deemed stable and recovered * * The kernel passes the pid of the dead process in through info_ptr->si_pid. * This handler searches the process list for that pid. If found then it triggers * that process to be recovered by the fsm. if that process for some crazy reason * is already in the failed state then this handler deferrs to allowing the fsm * to complete. * * If the pid is not found in the process control structure then the pidfiles * are searched. if the process is not fould in that secondary search then the * handler forces the get_events audit to run as a catch all. * * Note: The _get_events audit already runs periodically but at a much slower rate. * * Update: emacdona: commented out debug logs as we should not be logging * in a signal handler * */ void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr ) { UNUSED(context_ptr); UNUSED(sig_num) ; if ( info_ptr ) { process_config_type * ptr = &process_config[0] ; bool found = false ; dlog ("Sig:%d Pid:%d Code:%d Exit:%d\n", info_ptr->si_signo, info_ptr->si_pid, info_ptr->si_code, info_ptr->si_status ); for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { ptr = &process_config[i] ; if ( ptr->pid == info_ptr->si_pid ) { found = true ; if ( ptr->failed != true ) { ptr->failed = true ; manage_process_failure ( ptr ); } break ; } } if ( !found ) { for ( int i = 0 ; i < _pmon_ctrl_ptr->processes ; i++ ) { int pid ; ptr = &process_config[i] ; if ((pid = get_process_pid ( ptr ))) { if ( pid == info_ptr->si_pid ) { found = true ; if ( ptr->failed != true ) { /* One notification from the kernel is all we need */ manage_process_failure ( ptr ); } break ; } } } } if ( !found ) { /* Failed to find process for pid */ /* Forcing _get_events audit */ _pmon_ctrl_ptr->run_audit = true ; } } else { /* Handler called with NULL siginfo pointer */ /* Forcing _get_events audit */ _pmon_ctrl_ptr->run_audit = true ; } } /************************************************************************ * * Name : manage_process_alarms * * Description: This interface manages process alarms over a process * configuration reload * * Steps: * * 1. Loop over each item in the list and mark the process as failed * with the specified severity level. * * 2. If the process is not found then clear its alarm as it is no * longer a valid process in the new profile and we don't want a * lingering stuck alarm. * *************************************************************************/ void manage_process_alarms ( list & _list, process_config_type * const ptr, int const processes ) { /* get out if the list is empty ; should not have been called if * empty but ... just in case */ if ( ! _list.empty() ) { list::iterator _iter_ptr ; /* loop over the list ... */ for ( _iter_ptr=_list.begin(); _iter_ptr!=_list.end(); ++_iter_ptr ) { /* for each item assum it is not found */ bool found = false ; /* try and find this process in the new process profile */ for ( int i = 0 ; i < processes ; i++ ) { if ( ! _iter_ptr->process.compare((ptr+i)->process) ) { /* If the process is found then mark it as failed and update its severity. * At this point we then assume that there is an alarm raised for this process. */ found = true ; (ptr+i)->failed = false ; wlog ("%s process was failed critical ; clearing existing alarm\n", _iter_ptr->process.c_str() ); pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process ); } } /* if not found then just clear the alarm */ if ( found == false) { wlog ("%s process alarm clear ; not in current process profile\n", _iter_ptr->process.c_str() ); pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process ); } } } } void pmon_service ( pmon_ctrl_type * ctrl_ptr ) { std::list socks ; struct timeval waitd; fd_set readfds; int select_fail_count = 0 ; int flush_thld = 0 ; int rc = PASS ; int shutdown_log_throttle = 0; /* iNotify stuff */ bool inotify_fault = false ; daemon_config_type * cfg_ptr = daemon_get_cfg_ptr (); pmon_socket_type * sock_ptr = pmon_getSock_ptr (); int select_timeout = (cfg_ptr->audit_period*100); int audit_period = (cfg_ptr->audit_period/10); int pulse_period = cfg_ptr->audit_period ; int hostwd_period = (cfg_ptr->hostwd_update_period); int degrade_period = (cfg_ptr->audit_period/50); if ( audit_period == 0 ) audit_period = 10 ; if ( degrade_period == 0 ) degrade_period = 10 ; ilog ("Starting to monitor processes\n"); pmon_send_hostwd ( ); /* Load and register generic processes - not subfunction processes */ load_processes (); /* Setup inotify to watch CONFIG_DIR */ if ( set_inotify_watch ( CONFIG_DIR, ctrl_ptr->fd, ctrl_ptr->wd ) ) inotify_fault = true ; socks.clear(); socks.push_front (sock_ptr->cmd_sock->getFD()); socks.push_front (sock_ptr->event_sock->getFD()); socks.push_front (sock_ptr->amon_sock); socks.sort(); ilog ("Starting 'Audit' timer (%d secs)\n", audit_period ); mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period ); ilog ("Starting 'Degrade Audit' timer (%d secs)\n", degrade_period ); mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, degrade_period ); ilog ("Starting 'Pulse' timer (%d secs)\n", pulse_period ); mtcTimer_start_msec ( pmonTimer_pulse, pmon_timer_handler, pulse_period ); ilog ("Starting 'Host Watchdog' timer (%d secs)\n", hostwd_period ); mtcTimer_start ( pmonTimer_hostwd, pmon_timer_handler, hostwd_period ); for ( ; ; ) { /* Accomodate for hup reconfig */ select_timeout = (cfg_ptr->audit_period*100); audit_period = (cfg_ptr->audit_period/10); degrade_period = (cfg_ptr->audit_period/50); if ( audit_period < 1 ) audit_period = 10 ; if ( degrade_period < 1 ) degrade_period = 10 ; daemon_signal_hdlr (); /* Initialize the master fd_set */ FD_ZERO(&readfds); if ( sock_ptr->cmd_sock->getFD() ) { FD_SET(sock_ptr->cmd_sock->getFD(), &readfds); } if ( sock_ptr->event_sock->getFD() ) { FD_SET(sock_ptr->event_sock->getFD(), &readfds); } if ( sock_ptr->amon_sock ) { FD_SET(sock_ptr->amon_sock, &readfds); } waitd.tv_sec = 0; waitd.tv_usec = select_timeout ; /* This is used as a delay up to select_timeout */ rc = select( socks.back()+1, &readfds, NULL, NULL, &waitd); /* If the select time out expired then */ if (( rc < 0 ) || ( rc == 0 )) { /* Check to see if the select call failed. */ /* ... but filter Interrupt signal */ if (( rc < 0 ) && ( errno != EINTR )) { wlog_throttled ( select_fail_count, 20, "Socket Select Failed (rc:%d) %s \n", errno, strerror(errno)); } } else { if ( FD_ISSET(sock_ptr->cmd_sock->getFD(), &readfds)) { pmon_service_inbox (); } if (FD_ISSET(sock_ptr->amon_sock, &readfds)) { amon_service_inbox ( _pmon_ctrl_ptr->processes ); } } if (pmonTimer_pulse.ring == true ) { pmonTimer_pulse.ring = false ; /* Send a I'm Alive message to the pulse interface */ /* Robustness Update: Added an event_mode bool that will * be true if the kernel supports notification of death * of arbitrary process patch. If that feature is not present * then allow pmon to operate but in a degraded state. Eventually * we can turn this into a customer alarm/log. * Degrade is acheived by not sending the pulses to the watcher. */ if ( ctrl_ptr->event_mode == true ) { pmon_send_pulse ( ); } } /* Avoid pmond thrashing trying to recover processes during * system shutdown. */ if ( _pmon_ctrl_ptr->system_state == MTC_SYSTEM_STATE__STOPPING ) { wlog_throttled ( shutdown_log_throttle, 500, "process monitoring disabled during system shutdown\n"); usleep (500); continue ; } if ( shutdown_log_throttle ) shutdown_log_throttle = 0 ; if ( inotify_fault == false ) { if ( get_inotify_events ( ctrl_ptr->fd ) == true ) { if ( _pmon_ctrl_ptr->reload_config == false ) { _pmon_ctrl_ptr->reload_config = true ; ilog ("Setting config reload flag\n"); /* Hijack the audit timer for the next period for config reload */ mtcTimer_reset (pmonTimer_degrade); if ( daemon_is_file_present ( PATCHING_IN_PROG_FILE ) == true ) { _pmon_ctrl_ptr->patching_in_progress = true ; wlog ("Patching in progress ; delaying config reload by 30 secs...\n"); mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, (degrade_period + 30) ); } else { mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, degrade_period ); } } } } if ( pmonTimer_hostwd.ring == true ) { /* inservice recovery from hostw connection failures */ if ( sock_ptr->hostwd_sock == 0 ) { hostwd_port_init(); } if ( ctrl_ptr->event_mode == true ) { pmon_send_hostwd ( ); pmonTimer_hostwd.ring = false; } } /* Run Get Events by audit timer */ if (pmonTimer_audit.ring == true ) { _get_events (); mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period ); } /* Run the degrade set/clear by audit */ if (pmonTimer_degrade.ring == true ) { /* run the degrade clear audit */ if ( want_degrade_clear () == true ) { dlog ("sending degrade clear\n"); pmon_send_event ( MTC_EVENT_PMON_CLEAR, &process_config[0] ) ; } else { dlog ("sending degrade assert\n"); // pmon_send_event ( MTC_EVENT_PMON_MAJOR, &process_config[0] ) ; } /* Check for config reload state request */ if ( _pmon_ctrl_ptr->reload_config == true ) { /* But defer it while there is a process in the * manually requested restart state */ bool restart_request_active = false ; for ( int i = 0 ; i < ctrl_ptr->processes ; i++ ) { if ( process_config[i].restart == true ) { /* Added as fix */ wlog ("deferring process config reload to next audit\n"); wlog ("... while manual restart of '%s' is in progress\n", process_config[i].process ); restart_request_active = true ; break ; } } if ( restart_request_active == false ) { load_processes (); } } mtcTimer_start ( pmonTimer_degrade, pmon_timer_handler, degrade_period ); } /* Get_events run by forced audit or not in event mode */ else if (( ctrl_ptr->run_audit == true ) || ( ctrl_ptr->event_mode == false )) { _get_events ( ); } /* Monitor Processes */ for ( int i = 0 ; i < ctrl_ptr->processes ; i++ ) { /* Allow a process to be ignored */ if ( process_config[i].ignore == true ) { process_config[i].failed = false ; process_config[i].active_failed = false ; /* Handle process auto recovery from stopped state */ if (( process_config[i].pt_ptr->ring == true ) && ( process_config[i].stopped == true )) { elog ("%s process was stopped but never restarted ; auto recovery in progress\n", process_config[i].process ); process_config[i].stopped = false ; process_config[i].ignore = false ; passiveStageChange ( &process_config[i], PMON_STAGE__MANAGE ); } continue ; } else if ( process_config[i].status_monitoring ) { pmon_status_handler ( &process_config[i] ); } else if (( process_config[i].stage == PMON_STAGE__POLLING ) || ( process_config[i].stage == PMON_STAGE__START_WAIT ) || ( process_config[i].restart == true ) || ( process_config[i].failed == true )) { /* Run the FSM for this failed process */ pmon_passive_handler ( &process_config[i] ) ; } else if ( process_config[i].active_monitoring ) { // if ( process_config[i].active_failed == false ) if ( process_config[i].failed == false ) { pmon_active_handler ( &process_config[i] ); } else { elog ("%s Failed Active Monitoring ... recovering.\n", process_config[i].process ); manage_process_failure ( &process_config[i]) ; } } /* Audit to ensure that running processes are * registered with the kernel */ if (( process_config[i].stage != PMON_STAGE__POLLING ) && ( process_config[i].stage != PMON_STAGE__START_WAIT ) && ( process_config[i].registered == false ) && ( _pmon_ctrl_ptr->event_mode ) && ( process_config[i].restart == false ) && ( process_config[i].failed == false ) && ( process_config[i].ignore == false )) { int pid = get_process_pid ( &process_config[i] ); if ( pid ) { if ( kill (pid, 0 ) == 0 ) { process_config[i].pid = pid ; register_process ( &process_config[i] ); } } } } /* Debugging */ if (daemon_get_cfg_ptr()->debug_level & 1 ) { char proc_mask [MAX_PROCESSES*2] ; bool somefailed = false ; memset (&proc_mask[0], 0, sizeof(proc_mask)); for ( int x = 0 , y = 0 ; x < ctrl_ptr->processes ; x++, y+=2 ) { if ( process_config[x].failed ) { proc_mask[y] = '1' ; somefailed = true ; } else proc_mask[y] = '0' ; proc_mask[y+1] = ' ' ; } if ( somefailed ) { alog ( "Process Mask: %s\n", &proc_mask[0] ); } } /* Support the log flush config option */ if ( cfg_ptr->flush ) { if ( ++flush_thld > cfg_ptr->flush ) { flush_thld = 0 ; fflush (stdout); fflush (stderr); } } } } string get_status_failure_text ( process_config_type * ptr ) { string extra_text(""); if(( ptr->status_failure_text_file != NULL ) && ( ptr->status_failure_text_file[0] != '\0')) { FILE * status_text_file_stream = fopen ( ptr->status_failure_text_file, "r" ); if ( status_text_file_stream == NULL ) { wlog (" Failed to get extra alam text from file %s\n", ptr->status_failure_text_file ); } else { char buffer[MAX_STATUS_ERROR_TEXT_LEN]; if ( fgets(buffer, MAX_STATUS_ERROR_TEXT_LEN, status_text_file_stream) != NULL) { extra_text = buffer; } fclose(status_text_file_stream); } } return extra_text; } /**************************************************************************** * * Name : quorum_process_failure * * Description: manage debounce and log report of quorum process failure * * Warnings : Only call this when there is a quorum process faiure * that has exceeded the threshold count. * ****************************************************************************/ void quorum_process_failure ( process_config_type * ptr ) { wlog ("%s quorum process %s\n", ptr->process, ptr->quorum_failure ? "unrecoverable" : "failed" ); if ( ptr->quorum_failure == true ) { ptr->quorum_unrecoverable = true; } else { ptr->quorum_failure = true; } }