diff --git a/mtce/src/pmon/pmon.h b/mtce/src/pmon/pmon.h
index 11ed9714..158553da 100755
--- a/mtce/src/pmon/pmon.h
+++ b/mtce/src/pmon/pmon.h
@@ -231,6 +231,7 @@ typedef struct
     recovery_method_type recovery_method ; /**< How processes are recovered */
     bool reload_config ;
     bool patching_in_progress ;
+    bool last_alarm_query_pass; /**< true once an FM alarm query has passed ; the alarm audit then returns early */
 } pmon_ctrl_type ;
 
 void pmon_set_ctrl_ptr ( pmon_ctrl_type * ctrl_ptr );
diff --git a/mtce/src/pmon/pmonAlarm.cpp b/mtce/src/pmon/pmonAlarm.cpp
index 2a491642..86e0a319 100644
--- a/mtce/src/pmon/pmonAlarm.cpp
+++ b/mtce/src/pmon/pmonAlarm.cpp
@@ -38,14 +38,14 @@ void pmonAlarm_init ( void )
     alarmUtil_type * ptr ;
 
     /** Process Failure Alarm ****************************************************/
-    
+
     ptr = &alarm_list[PMON_ALARM_ID__PMOND];
     memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
     snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", PMOND_ALARM_ID);
 
     ptr->name = "process failure" ;
     ptr->instc_prefix = "process=" ;
-    
+
     ptr->critl_reason = "";
     ptr->minor_reason = "";
     ptr->major_reason = "";
@@ -56,12 +56,12 @@ void pmonAlarm_init ( void )
     ptr->alarm.inhibit_alarms = FM_FALSE;
     ptr->alarm.service_affecting = FM_TRUE ;
     ptr->alarm.suppression = FM_TRUE ;
-    
+
     ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
     ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
 
-    snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, 
-              "If problem consistently occurs after Host is locked and unlocked then " 
+    snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
+              "If problem consistently occurs after Host is locked and unlocked then "
               "contact next level of support for root cause analysis and recovery.");
 }
 
@@ -97,38 +97,46 @@ EFmAlarmSeverityT pmonAlarm_state ( string hostname, pmon_alarm_id_enum id )
 
 /******************************************************************************
  *
- * Name : manage_queried_alarms
+ * Name : query_alarms
  *
  * Description: query FM for all the existing process monitor alarms and build
  * up the callers 'saved_alarm_list' with those process names and
  * corresponding severity.
  *
- * Assumptions: If the hostname is passed in as not empty then assume the clear
- * is requested.
- *
  * Updates : callers saved_alarm_list
  *
+ * Returns : PASS if FM returns no error
+ * FAIL_REQUEST ... alarmUtil_query_identity failed
+ * FAIL_OPERATION ... fm_get_fault failed
+ * FAIL_NULL_POINTER ... failed to get memory
+ *
  ******************************************************************************/
-void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_list, string hostname )
+int query_alarms ( list<active_process_alarms_type> & saved_alarm_list, string hostname )
 {
+    static const char HOSTNAME_LABEL [] = "host=" ;
+    static const char PROCNAME_LABEL [] = ".process=" ;
+
+    int rc = FAIL ;
     saved_alarm_list.clear();
 
-    /**
-     * Query all the pmon alarms and if there is an alarm for a
-     * process that is functioing properly then clear the alarm.
-     **/
 
     SFmAlarmDataT * alarm_list_ptr = (SFmAlarmDataT*) malloc ((sizeof(SFmAlarmDataT)*PMON_MAX_ALARMS));
     if ( alarm_list_ptr )
     {
-        if ( alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS ) == PASS )
+        /* Query all the pmon alarms */
+        rc = alarmUtil_query_identity ( pmonAlarm_getId_str(PMON_ALARM_ID__PMOND), alarm_list_ptr, PMON_MAX_ALARMS );
+        if ( rc == RETRY )
+        {
+            dlog ("no %s alarms found", pmonAlarm_getId_str(PMON_ALARM_ID__PMOND).c_str());
+            rc = PASS ;
+        }
+        else if ( rc == PASS )
         {
             for ( int i = 0 ; i < PMON_MAX_ALARMS ; ++i )
             {
                 /* loop over each active alarm and maintain its activity state */
                 if ( strnlen ((alarm_list_ptr+i)->entity_instance_id , MAX_FILENAME_LEN ) )
                 {
-                    int rc ;
                     AlarmFilter alarm_filter ;
                     SFmAlarmDataT alarm_query ;
                     memset(&alarm_query, 0, sizeof(alarm_query));
@@ -139,34 +147,49 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
 
                     if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK )
                     {
-                        string entity = alarm_filter.entity_instance_id ;
-                        size_t pos = entity.find("process=");
-                        if ( pos != std::string::npos )
-                        {
-                            string pn = entity.substr(pos+strlen("process="));
-                            ilog ("%s alarm is %s (process:%s)\n", alarm_filter.entity_instance_id,
-                                  alarmUtil_getSev_str(alarm_query.severity).c_str(), pn.c_str());
+                        rc = PASS ;
 
-                            /* filter out 'process=pmond' as that alarm is handled by hbsAgent */
-                            if ( pn.compare("pmond") )
+                        string entity = alarm_filter.entity_instance_id ;
+                        size_t pos_hn = entity.find(HOSTNAME_LABEL);
+                        size_t pos_pn = entity.find(PROCNAME_LABEL);
+
+                        if (( pos_hn != std::string::npos ) &&
+                            ( pos_pn != std::string::npos ) && ( pos_pn > pos_hn ))
+                        {
+                            string hn = entity.substr(pos_hn+strlen(HOSTNAME_LABEL), pos_pn-(pos_hn+strlen(HOSTNAME_LABEL)));
+                            string pn = entity.substr(pos_pn+strlen(PROCNAME_LABEL));
+
+                            /* verify hostname */
+                            if ( ( hn.length() == 0 ) || ( hn != hostname ) )
                             {
-                                if ( !hostname.empty() )
-                                {
-                                    pmonAlarm_clear ( hostname, PMON_ALARM_ID__PMOND, pn );
-                                }
-                                else
-                                {
-                                    active_process_alarms_type this_alarm ;
-                                    this_alarm.process = pn ;
-                                    this_alarm.severity = alarm_query.severity ;
-                                    saved_alarm_list.push_front ( this_alarm );
-                                }
+                                /* ignore alarms not for this host */
+                                dlog ("%s %s %s alarm not for this host",
+                                      entity.c_str(),
+                                      hn.c_str(),
+                                      pn.c_str());
+                                continue ;
+                            }
+                            dlog ("%s alarm is %s (process:%s)\n",
+                                  alarm_filter.entity_instance_id,
+                                  alarmUtil_getSev_str(alarm_query.severity).c_str(),
+                                  pn.c_str());
+
+                            /* filter out 'process=pmond'
+                             * ... that alarm is handled by hbsAgent */
+                            if ( pn != MTC_SERVICE_PMOND_NAME )
+                            {
+                                active_process_alarms_type this_alarm ;
+                                this_alarm.process = pn ;
+                                this_alarm.severity = alarm_query.severity ;
+                                saved_alarm_list.push_front ( this_alarm );
                             }
                         }
                     }
                     else
                     {
-                        ilog ("fm_get_fault failed (rc:%d)\n", rc );
+                        wlog ("fm_get_fault failed (rc:%d)\n", rc );
+                        rc = FAIL_OPERATION ;
+                        break ;
                     }
                 }
                 else
@@ -174,10 +197,21 @@ void manage_queried_alarms ( list<active_process_alarms_type> & saved_alarm_lis
                     dlog2 ("last entry %d\n", i);
                     break ;
                 }
-            }
+            } /* for loop */
+        }
+        else
+        {
+            wlog("failed to query alarms from fm ; rc:%d", rc);
+            rc = FAIL_REQUEST ;
         }
         free(alarm_list_ptr);
     }
+    else
+    {
+        elog ("unable to allocate memory for alarm list");
+        rc = FAIL_NULL_POINTER ;
+    }
+    return (rc);
 }
 
 /************************* A L A R M I N G **************************/
diff --git a/mtce/src/pmon/pmonAlarm.h b/mtce/src/pmon/pmonAlarm.h
index 79414e1c..392fea82 100644
--- a/mtce/src/pmon/pmonAlarm.h
+++ b/mtce/src/pmon/pmonAlarm.h
@@ -37,8 +37,10 @@ typedef struct
     EFmAlarmSeverityT severity ;
 } active_process_alarms_type ;
 
-/* Clear any pending alarms if the specified hostname is valid */
-void manage_queried_alarms ( list<active_process_alarms_type> & alarm_list, string hostname="" );
+/* Query FM for a list of Process Monitor (200.006) alarms */
+int query_alarms ( list<active_process_alarms_type> & alarm_list, string hostname="" );
+
+void alarmed_process_audit ( void );
 
 void pmonAlarm_init ( void );
 
diff --git a/mtce/src/pmon/pmonHdlr.cpp b/mtce/src/pmon/pmonHdlr.cpp
index 7ab0a8ee..2abe1255 100644
--- a/mtce/src/pmon/pmonHdlr.cpp
+++ b/mtce/src/pmon/pmonHdlr.cpp
@@ -41,15 +41,6 @@ static struct mtc_timer ptimer[MAX_PROCESSES] ;
 std::list<string> config_files ;
 std::list<string>::iterator string_iter_ptr ;
 
-/* If there is an alarm in the list that matches one in the process list
- * then update that process with its severity and failed state.
- * If there is a process in the saved list that is not in the process list
- * then clear its alarm as it is no longer valid.
- */
-void manage_process_alarms ( list<active_process_alarms_type> & _list,
-                             process_config_type * const ptr,
-                             int const processes );
-
 static process_config_type process_config[MAX_PROCESSES] ;
 
 /* lookup process control by index and return its pointer if found.
@@ -216,6 +207,7 @@ void pmon_timer_init ( void )
         /* Init the timer for this process */
         mtcTimer_init ( process_config[i].pt_ptr, _pmon_ctrl_ptr->my_hostname, "process" ) ;
     }
+    _pmon_ctrl_ptr->last_alarm_query_pass = false ;
 }
 
 void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr );
@@ -371,7 +363,7 @@ void init_process_config_memory ( void )
  * all the process config files from /etc/pmon.d */
 void load_processes ( void )
 {
-    list<active_process_alarms_type> saved_alarm_list ;
+    list<active_process_alarms_type> queried_alarm_list ;
 
     int rc = PASS ;
 
@@ -385,10 +377,6 @@ void load_processes ( void )
         close_process_socket ( &process_config[i] );
     }
 
-    /* Query fm for existing pmon process alarms and
-     * for each that is found store their 'name' and
-     * 'severity' in the passed in saved list */
-    manage_queried_alarms ( saved_alarm_list );
 
     /* init the process config memory */
     init_process_config_memory ();
@@ -454,13 +442,8 @@ void load_processes ( void )
     }
     _pmon_ctrl_ptr->reload_config = false ;
 
-    /* If there were process alarms that existed over the reload
-     * then ensure that those processes are updated with that information. */
-    if ( saved_alarm_list.size () )
-    {
-        ilog ("there are %ld active alarms over reload\n", saved_alarm_list.size());
-        manage_process_alarms ( saved_alarm_list, &process_config[0], _pmon_ctrl_ptr->processes );
-    }
+    /* use the audit to clear pre-existing alarms at process startup */
+    alarmed_process_audit ();
 }
 
 
@@ -1702,65 +1685,124 @@ void _process_death_hdlr ( int sig_num, siginfo_t * info_ptr, void * context_ptr
     }
 }
 
-/************************************************************************
+/***************************************************************************
  *
- * Name : manage_process_alarms
+ * Name : alarmed_process_audit
  *
- * Description: This interface manages process alarms over a process
- * configuration reload
+ * Purpose : Verify the process state matches the queried alarm state
  *
- * Steps:
+ * Description: To correct process alarm state mismatches.
  *
- * 1. Loop over each item in the list and mark the process as failed
- * with the specified severity level.
- *
- * 2. If the process is not found then clear its alarm as it is no
- * longer a valid process in the new profile and we don't want a
- * lingering stuck alarm.
- *
- *************************************************************************/
+ ***************************************************************************/
 
-void manage_process_alarms ( list<active_process_alarms_type> & _list,
-                             process_config_type * const ptr,
-                             int const processes )
+void alarmed_process_audit ( void )
 {
-    /* get out if the list is empty ; should not have been called if
-     * empty but ... just in case */
-    if ( ! _list.empty() )
+    /* Don't audit FM in service after the last query was successful.
+     * There is a blocking issue that needs to be dealt with */
+    if ( _pmon_ctrl_ptr->last_alarm_query_pass == true )
+        return ;
+
+    /*
+     * Query fm for existing pmon process alarms and
+     * for each that is found store their 'name' and
+     * 'severity' in the passed in queried_alarm_list.
+     */
+    list<active_process_alarms_type> queried_alarm_list ;
+    int rc = query_alarms ( queried_alarm_list, get_ctrl_ptr()->my_hostname );
+    _pmon_ctrl_ptr->last_alarm_query_pass = (rc == PASS);
+
+    /* just return if query failed */
+    if ( _pmon_ctrl_ptr->last_alarm_query_pass == false )
+        return ;
+
+    if ( queried_alarm_list.size () )
     {
         list<active_process_alarms_type>::iterator _iter_ptr ;
 
+        alog ("audit found %zu active alarms", queried_alarm_list.size());
+
         /* loop over the list ... */
-        for ( _iter_ptr=_list.begin(); _iter_ptr!=_list.end(); ++_iter_ptr )
+        for ( _iter_ptr=queried_alarm_list.begin();
+              _iter_ptr!=queried_alarm_list.end();
+              ++_iter_ptr )
         {
-            /* for each item assum it is not found */
             bool found = false ;
+            alog ("%s audit", _iter_ptr->process.c_str());
 
-            /* try and find this process in the new process profile */
-            for ( int i = 0 ; i < processes ; i++ )
+            /* find this process in the monitored process list */
+            for ( int i = 0 ; (i < _pmon_ctrl_ptr->processes) && !found ; i++ )
             {
-                if ( ! _iter_ptr->process.compare((ptr+i)->process) )
-                {
-                    /* If the process is found then mark it as failed and update its severity.
-                     * At this point we then assume that there is an alarm raised for this process. */
-                    found = true ;
+                process_config_type * ptr = &process_config[i];
 
-                    (ptr+i)->failed = false ;
-                    wlog ("%s process was failed critical ; clearing existing alarm\n", _iter_ptr->process.c_str() );
-                    pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
+                if ( ! _iter_ptr->process.compare(ptr->process) )
+                {
+                    found = true ;
+                    if ( ptr->failed == false )
+                    {
+                        ilog ("%s stale alarm ; clearing",
+                              _iter_ptr->process.c_str() );
+
+                        pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
+                                          PMON_ALARM_ID__PMOND,
+                                          _iter_ptr->process );
+                    }
+                    else if ( _iter_ptr->severity != ptr->alarm_severity )
+                    {
+                        wlog ("%s alarm severity mismatch ; %s -> %s ; correcting",
+                              ptr->process,
+                              alarmUtil_getSev_str(_iter_ptr->severity).c_str(),
+                              alarmUtil_getSev_str(ptr->alarm_severity).c_str());
+                        if ( ptr->alarm_severity == FM_ALARM_SEVERITY_MINOR )
+                        {
+                            pmonAlarm_minor(get_ctrl_ptr()->my_hostname,
+                                            PMON_ALARM_ID__PMOND,
+                                            ptr->process, 0);
+                        }
+                        else if (ptr->alarm_severity == FM_ALARM_SEVERITY_MAJOR )
+                        {
+                            pmonAlarm_major(get_ctrl_ptr()->my_hostname,
+                                            PMON_ALARM_ID__PMOND,
+                                            ptr->process);
+                        }
+                        else if (ptr->alarm_severity == FM_ALARM_SEVERITY_CRITICAL )
+                        {
+                            pmonAlarm_critical(get_ctrl_ptr()->my_hostname,
+                                               PMON_ALARM_ID__PMOND,
+                                               ptr->process);
+                        }
+                        else
+                        {
+                            wlog ("%s unexpected severity '%s' ; clearing alarm",
+                                  ptr->process,
+                                  ptr->severity);
+
+                            pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
+                                              PMON_ALARM_ID__PMOND,
+                                              ptr->process );
+                        }
+                    }
+                    else
+                    {
+                        alog ("%s is alarmed '%s' ; audit",
+                              ptr->process,
+                              ptr->severity);
+                    }
                 }
             }
 
-            /* if not found then just clear the alarm */
             if ( found == false)
             {
-                wlog ("%s process alarm clear ; not in current process profile\n", _iter_ptr->process.c_str() );
-                pmonAlarm_clear ( get_ctrl_ptr()->my_hostname, PMON_ALARM_ID__PMOND, _iter_ptr->process );
+                wlog ("%s is not a monitored process ; clearing alarm",
+                      _iter_ptr->process.c_str());
+                pmonAlarm_clear ( get_ctrl_ptr()->my_hostname,
+                                  PMON_ALARM_ID__PMOND,
+                                  _iter_ptr->process );
             }
         }
     }
 }
 
+
 void pmon_service ( pmon_ctrl_type * ctrl_ptr )
 {
     std::list<int> socks ;
@@ -1931,6 +1973,8 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr )
         {
             _get_events ();
             mtcTimer_start ( pmonTimer_audit, pmon_timer_handler, audit_period );
+
+            alarmed_process_audit ();
         }
 
         /* Run the degrade set/clear by audit */