From 031818e55bc255b59e486ebf6faadf4b784c93fe Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Fri, 26 Mar 2021 13:05:51 -0400 Subject: [PATCH] Add in-service test to clear stale config failure alarm A configuration failure alarm can get stuck asserted if that node experiences an uncontrolled reboot that recovers without a configuration failure. This update adds an in-service test that audits host health while there is a configuration failure alarm raised and clears that alarm if the failure condition goes away. This could be a result of an in-service manifest that runs and corrects the configuration or if the node reboots and comes back up in a healthy (properly configured) state. Fixed bug that was clearing config alarm severity state when a heartbeat clear event is received. This update also goes a step further and introduces an alarms state audit that detects and corrects maintenance alarm state mismatches. Test Plan: PASS: Verify the add handler loads config alarm state PASS: Verify in-service test clears stale config alarm PASS: Verify in-service test acts on new config failure ... degrade - active controller ... fail - other hosts PASS: Verify audit fixes mtce alarm state mismatches PASS: Verify audit handles fm not running case PASS: Verify audit handling behavior with valid alarm cases PASS: Verify locked alarm management over process restart PASS: Verify audit only logs active alarms list changes PASS: Verify audit runs for both locked/unlocked nodes PASS: Verify update as a patch Regression: PASS: Verify enable sequence config failure handling PASS: ... active controller - recoverable degrade PASS: ... other nodes - threshold fail PASS: ... 
auto recovery disable - config failure PASS: Verify mtcAgent process logging PASS: Verify heartbeat handling and alarming PASS: Verify Standard system install PASS: Verify AIO system install Change-Id: If9957229810435e9faeb08374f2b5fbcb5b0f826 Closes-Bug: 1918195 Signed-off-by: Eric MacDonald --- mtce/src/common/nodeClass.cpp | 75 ++++++++---- mtce/src/common/nodeClass.h | 14 ++- mtce/src/maintenance/mtcAlarm.cpp | 162 ++++++++++++++++++++++++++ mtce/src/maintenance/mtcAlarm.h | 3 + mtce/src/maintenance/mtcNodeCtrl.cpp | 9 -- mtce/src/maintenance/mtcNodeHdlrs.cpp | 89 +++++++------- 6 files changed, 265 insertions(+), 87 deletions(-) diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index f705f095..0e4cc1ca 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -660,7 +660,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) { ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ; } - ptr->alarms_loaded = false ; + ptr->active_alarms = "" ; /* no active alarms */ ptr->cfgEvent.base = NULL ; ptr->sysinvEvent.base= NULL ; @@ -778,6 +778,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) return ptr ; } + struct nodeLinkClass::node* nodeLinkClass::getNode ( string hostname ) { /* check for empty list condition */ @@ -5088,6 +5089,15 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface } } +/**************************************************************************** + * + * Name : manage_heartbeat_clear + * + * Description: Manage clearing heartbeat failure status + * + * Assumptions: Called by Both hbsAgent and mtcAgent + * + ***************************************************************************/ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface ) { nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname ); @@ -5103,13 +5113,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface ) 
node_ptr->heartbeat_failed[i] = false ; if ( i == MGMNT_IFACE ) { - node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ; - node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; + if ( heartbeat ) + node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ; + if ( maintenance ) + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; } if ( i == CLSTR_IFACE ) { - node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ; - node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ; + if ( heartbeat ) + node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ; + if ( maintenance ) + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ; } } } @@ -5118,13 +5132,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface ) node_ptr->heartbeat_failed[iface] = false ; if ( iface == MGMNT_IFACE ) { - node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ; - node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; + if ( heartbeat ) + node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ; + if ( maintenance ) + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ; } else if ( iface == CLSTR_IFACE ) { - node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ; - node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ; + if ( heartbeat ) + node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ; + if ( maintenance ) + node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ; } } } @@ -9068,21 +9086,21 @@ void nodeLinkClass::mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n", - node_ptr->hostname.c_str(), + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n", + node_ptr->hostname.c_str(), node_ptr->mtcAlive_online ? 'Y' : 'N', node_ptr->mtcAlive_offline ? 
'Y' : 'N', node_ptr->mtcAlive_count, node_ptr->mtcAlive_gate ? "closed" : "open", - node_ptr->mtcAlive_misses); + node_ptr->mtcAlive_misses); mem_log (str); } void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n", - node_ptr->hostname.c_str(), + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n", + node_ptr->hostname.c_str(), node_ptr->alarms[MTC_ALARM_ID__LOCK ] ? " Locked" : " .", node_ptr->alarms[MTC_ALARM_ID__CONFIG ] ? " Config" : " .", node_ptr->alarms[MTC_ALARM_ID__ENABLE ] ? " Enable" : " .", @@ -9092,6 +9110,18 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr ) mem_log (str); } +void nodeLinkClass::mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr ) +{ + if ( ! node_ptr->active_alarms.empty() ) + { + char str[MAX_MEM_LOG_DATA] ; + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tActive Alarms:%s\n", + node_ptr->hostname.c_str(), + node_ptr->active_alarms.c_str()); + mem_log (str); + } +} + void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; @@ -9142,8 +9172,8 @@ void nodeLinkClass::mem_log_network ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s cluster_host_ip: %s Uptime: %u\n", - node_ptr->hostname.c_str(), - node_ptr->mac.c_str(), + node_ptr->hostname.c_str(), + node_ptr->mac.c_str(), node_ptr->ip.c_str(), node_ptr->clstr_ip.c_str(), node_ptr->uptime ); @@ -9155,11 +9185,11 @@ void nodeLinkClass::mem_log_heartbeat ( struct nodeLinkClass::node * node_ptr ) char str[MAX_MEM_LOG_DATA] ; for ( int iface = 0 ; iface < MAX_IFACES ; iface++ ) { - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n", - node_ptr->hostname.c_str(), + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s Monitor:%s\n", + 
node_ptr->hostname.c_str(), get_iface_name_str (iface), - node_ptr->hbs_minor[iface] ? "true " : "false", - node_ptr->hbs_degrade[iface] ? "true " : "false", + node_ptr->hbs_minor[iface] ? "true " : "false", + node_ptr->hbs_degrade[iface] ? "true " : "false", node_ptr->hbs_failure[iface] ? "true " : "false", node_ptr->monitor[iface] ? "YES" : "no" ); mem_log (str); @@ -9188,8 +9218,8 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr ) void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr ) { char str[MAX_MEM_LOG_DATA] ; - snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n", - node_ptr->hostname.c_str(), + snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n", + node_ptr->hostname.c_str(), get_oosTestStages_str(node_ptr->oosTestStage).c_str(), node_ptr->oos_test_count, get_insvTestStages_str(node_ptr->insvTestStage).c_str(), @@ -9261,6 +9291,7 @@ void nodeLinkClass::memDumpNodeState ( string hostname ) // mem_log_reset_info ( node_ptr ); mem_log_power_info ( node_ptr ); mem_log_alarm1 ( node_ptr ); + mem_log_alarm2 ( node_ptr ); mem_log_mtcalive ( node_ptr ); mem_log_stage ( node_ptr ); mem_log_bm ( node_ptr ); diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 9b667316..225847a7 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -652,12 +652,12 @@ private: /** @} private_monitoring_services_variables */ - /* List of alarms and current severity */ - #define MAX_ALARMS (10) + /* List of alarms current severity */ EFmAlarmSeverityT alarms[MAX_ALARMS]; - /* tracks whether the alarms for this host have been loaded already or not */ - bool alarms_loaded ; + /* string containing active alarms and their severity + * ... for logging purposes only */ + string active_alarms ; /** true if this host has recovered before the mnfa timeout period. 
* This bool flags the graceful recovery handler that this node @@ -665,8 +665,6 @@ private: * and uptime accordingly */ bool mnfa_graceful_recovery ; - int stress_iteration ; - /* BMC Protocol Learning Controls and State */ /* specifies what BMC protocol is selected for this host @@ -843,6 +841,9 @@ private: /* server specific power state query handler */ bool (*is_poweron_handler) (string hostname, string query_response ); + /* Audit that monitors and auto corrects alarm state mismatches */ + void mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr ); + /* Calculate the overall reset progression timeout */ int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries ); @@ -1304,6 +1305,7 @@ private: void mem_log_state1 ( struct nodeLinkClass::node * node_ptr ); void mem_log_state2 ( struct nodeLinkClass::node * node_ptr ); void mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr ); + void mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr ); void mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr ); void mem_log_stage ( struct nodeLinkClass::node * node_ptr ); void mem_log_test_info ( struct nodeLinkClass::node * node_ptr ); diff --git a/mtce/src/maintenance/mtcAlarm.cpp b/mtce/src/maintenance/mtcAlarm.cpp index 8262da9f..28d1b6bc 100644 --- a/mtce/src/maintenance/mtcAlarm.cpp +++ b/mtce/src/maintenance/mtcAlarm.cpp @@ -26,6 +26,7 @@ using namespace std; #include "daemon_common.h" /* */ #include "nodeBase.h" /* */ +#include "nodeClass.h" /* */ #include "nodeTimers.h" /* */ #include "nodeUtil.h" /* */ #include "mtcAlarm.h" /* for ... 
this module header */ @@ -379,8 +380,169 @@ void mtcAlarm_clear_all ( string hostname ) } } +/**************************************************************************** + * + * Name : mtcAlarm_audit + * + * Purpose : Monitor and Auto-Correct maintenance alarms + * + * Description: Query locked state alarm (raw) + * if successful + * - Query alarms + * - compare to running state + * - correct mismatches ; internal state takes precedence + * - log all alarm state changes + * + ****************************************************************************/ + +void nodeLinkClass::mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr ) +{ + /* + * Read locked state alarm directly to detect fm access failures. + * If successful further reads are done using a wrapper utility. + */ + SFmAlarmDataT alarm_query ; + AlarmFilter alarm_filter ; + EFmErrorT rc ; + + memset(&alarm_query, 0, sizeof(alarm_query)); + memset(&alarm_filter, 0, sizeof(alarm_filter)); + snprintf ( &alarm_filter.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", + LOCK_ALARM_ID); + snprintf ( &alarm_filter.entity_instance_id[0], FM_MAX_BUFFER_LENGTH, "%s%s", + ENTITY_PREFIX, node_ptr->hostname.data()); + rc = fm_get_fault ( &alarm_filter, &alarm_query ); + if (( rc != FM_ERR_OK ) && ( rc != FM_ERR_ENTITY_NOT_FOUND )) + { + wlog("%s alarm query failure ; code:%d", + node_ptr->hostname.c_str(), + rc ); + return ; + } + + /* With FM comms proven working let's check the other mtc alarms */ + string active_alarms = ""; + for ( int i = 0 ; i < MAX_ALARMS ; i++ ) + { + mtc_alarm_id_enum id = (mtc_alarm_id_enum)i ; + if ( id == MTC_ALARM_ID__LOCK ) + { + /* Unexpected severity case */ + if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) + { + if ( alarm_query.severity != FM_ALARM_SEVERITY_WARNING ) + { + node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ; + + wlog("%s %s alarm mismatch ; %s -> %s", + node_ptr->hostname.c_str(), + _getIdentity(id).c_str(), + alarmUtil_getSev_str(alarm_query.severity).c_str(), + 
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str()); + + mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK ); + + } + if (!active_alarms.empty()) + active_alarms.append(", "); + active_alarms.append(_getIdentity(id) + ":"); + active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id])); + } + /* Unexpected assertion case */ + else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && + ( alarm_query.severity != FM_ALARM_SEVERITY_CLEAR )) + { + node_ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ; + + wlog("%s %s alarm mismatch ; %s -> %s", + node_ptr->hostname.c_str(), + _getIdentity(id).c_str(), + alarmUtil_getSev_str(alarm_query.severity).c_str(), + alarmUtil_getSev_str(node_ptr->alarms[id]).c_str()); + + mtcAlarm_clear ( node_ptr->hostname, id ); + } + } + else if (( id == MTC_ALARM_ID__CONFIG ) || + ( id == MTC_ALARM_ID__ENABLE ) || + ( id == MTC_ALARM_ID__BM ) || + ( id == MTC_ALARM_ID__CH_CONT) || + ( id == MTC_ALARM_ID__CH_COMP)) + { + EFmAlarmSeverityT severity = mtcAlarm_state ( node_ptr->hostname, id); + if ( severity != node_ptr->alarms[id] ) + { + ilog ("%s %s alarm mismatch ; %s -> %s", + node_ptr->hostname.c_str(), + _getIdentity(id).c_str(), + alarmUtil_getSev_str(severity).c_str(), + alarmUtil_getSev_str(node_ptr->alarms[id]).c_str()); + + if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR ) + { + mtcAlarm_clear ( node_ptr->hostname, id ); + } + else + { + mtcAlarm_raise ( node_ptr->hostname, id, node_ptr->alarms[id] ); + } + } + if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR ) + { + if (!active_alarms.empty()) + active_alarms.append(", "); + active_alarms.append(_getIdentity(id) + ":"); + active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id])); + } + } + /* else don't care about other alarm ids ; logs events etc */ + } + + /* manage logging of active alarms */ + if ( !active_alarms.empty() ) + { + if ( node_ptr->active_alarms != active_alarms ) + { + ilog ("%s active alarms: %s", + node_ptr->hostname.c_str(), + 
active_alarms.c_str()); + + node_ptr->active_alarms = active_alarms ; + } + /* else + * do nothing because there are active alarms + * that have not changed since the last audit. + */ + } + else if ( ! node_ptr->active_alarms.empty() ) + { + /* clear active alarm list since there 'were' active alarms + * but there are no longer active alarms */ + node_ptr->active_alarms.clear(); + ilog ("%s no active alarms", node_ptr->hostname.c_str()); + } + /* else + * no active alarms ; don't log */ +} + /************************* A L A R M I N G **************************/ +/* Raise the specified maintenance alarm severity */ +int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity ) +{ + switch ( severity ) + { + case FM_ALARM_SEVERITY_MINOR: + return (mtcAlarm_minor(hostname,id)); + case FM_ALARM_SEVERITY_MAJOR: + return (mtcAlarm_major(hostname,id)); + case FM_ALARM_SEVERITY_CRITICAL: + return (mtcAlarm_critical(hostname,id)); + default: + return (FAIL_BAD_PARM); + } +} + /* Clear the specified hosts's maintenance alarm */ int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id ) { diff --git a/mtce/src/maintenance/mtcAlarm.h b/mtce/src/maintenance/mtcAlarm.h index 25565d4f..6e93f659 100644 --- a/mtce/src/maintenance/mtcAlarm.h +++ b/mtce/src/maintenance/mtcAlarm.h @@ -95,6 +95,9 @@ string mtcAlarm_getId_str ( mtc_alarm_id_enum id ); /** Clear the specified maintenance alarm for specific host */ int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id ); +/** Raise specified severity level alarm for the specified host */ +int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity ); + /** Assert a specified mtce alarm against the specified host with a WARNING severity level */ int mtcAlarm_warning ( string hostname, mtc_alarm_id_enum id ); diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index 152217c6..5e180252 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ 
b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -1187,15 +1187,6 @@ int _self_provision ( void ) if ( my_identity.name == record_info.name ) { - /* If the active controller was 'locked' and is being auto-corrected - * to 'unlocked' then ensure that there is no locked alarm set for it */ - if ( record_info.admin != "locked" ) - { - mtcAlarm_clear ( my_identity.name, MTC_ALARM_ID__LOCK ); - /* this is not required because its already inited to clear */ - // node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_CLEAR - } - if ( my_identity.mac != record_info.mac ) { wlog ("%s mac address mismatch (%s - %s)\n", diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 8f6ce2f3..36aa10d2 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -6107,48 +6107,32 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) mtcInvApi_update_state ( node_ptr, "availability", "available" ); } - /* handle other cases */ - EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname, - MTC_ALARM_ID__ENABLE); + /* Query FM for existing Enable and Config alarm status */ + EFmAlarmSeverityT enable_alarm_severity = + mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE); + EFmAlarmSeverityT config_alarm_severity = + mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG); - if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) + /* Clear generic enable alarm over process restart. 
+ * Will get reasserted if the cause condition still exists */ + if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR ) { - node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ; - - /* If the node is locked then the Enable alarm - * should not be present */ - if ( sev != FM_ALARM_SEVERITY_CLEAR ) - { - mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); - sev = FM_ALARM_SEVERITY_CLEAR ; - } + ilog ("%s found enable alarm ; clearing %s", + node_ptr->hostname.c_str(), + alarmUtil_getSev_str(enable_alarm_severity).c_str()); + mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); } - /* Manage enable alarm over process restart. - * - * - clear the alarm in the active controller case - * - maintain the alarm, set degrade state in MAJOR and CRIT cases - * - clear alarm for all other severities. - */ - if ( THIS_HOST ) + /* The config alarm is maintained if it exists. + * The in-service test handler will clear the alarm + * if the config failure is gone */ + if ( config_alarm_severity != FM_ALARM_SEVERITY_CLEAR ) { - if ( sev != FM_ALARM_SEVERITY_CLEAR ) - { - mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); - } - } - else - { - if (( sev == FM_ALARM_SEVERITY_CRITICAL ) || - ( sev == FM_ALARM_SEVERITY_MAJOR )) - { - node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ; - node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ; - } - else if ( sev != FM_ALARM_SEVERITY_CLEAR ) - { - mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE ); - } + node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ; + node_ptr->alarms[MTC_ALARM_ID__CONFIG] = config_alarm_severity ; + ilog ("%s found config alarm ; loaded %s", + node_ptr->hostname.c_str(), + alarmUtil_getSev_str(config_alarm_severity).c_str()); } if ( is_controller(node_ptr) ) @@ -6188,7 +6172,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) { ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE ); - /* Work Around for issue: */ mtcInvApi_update_uptime ( 
node_ptr, node_ptr->uptime ); mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE ); @@ -6222,7 +6205,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD ); } } - if ( daemon_get_cfg_ptr()->debug_level & 1 ) nodeLinkClass::host_print (node_ptr); @@ -6357,6 +6339,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) } case MTC_ADD__WORKQUEUE_WAIT: { + rc = workQueue_done ( node_ptr ); if ( rc == RETRY ) { @@ -6444,6 +6427,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) } node_ptr->addStage = MTC_ADD__START; + plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime ); node_ptr->add_completed = true ; break ; @@ -7202,6 +7186,9 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr ) } } + /* audit alarms */ + mtcAlarm_audit (node_ptr ); + break ; } case MTC_OOS_TEST__WAIT: @@ -7600,7 +7587,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } } - /* Monitor the health of the host - no pass file */ + /* Monitor the health of the host */ if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) && ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) || @@ -7626,6 +7613,11 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str()); } + /* + * In-service Config Failure/Alarm handling + */ + + /* Detect new config failure condition */ if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) { /* not healthy .... 
*/ @@ -7637,16 +7629,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) { wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str()); if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD ) - { - node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ; - - /* threshold is reached so raise the config alarm if it is not already raised */ - if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL ) - { - mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG ); - node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ; - } - } + alarm_config_failure ( node_ptr ); } } else @@ -7666,6 +7649,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } } } + /* or correct an alarmed config failure that has cleared */ + else if ( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG ) + { + if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY ) + alarm_config_clear ( node_ptr ); + } else { node_ptr->health_threshold_counter = 0 ;