From 031818e55bc255b59e486ebf6faadf4b784c93fe Mon Sep 17 00:00:00 2001
From: Eric MacDonald <eric.macdonald@windriver.com>
Date: Fri, 26 Mar 2021 13:05:51 -0400
Subject: [PATCH] Add in-service test to clear stale config failure alarm

A configuration failure alarm can get stuck asserted if
the node experiences an uncontrolled reboot and recovers
without a configuration failure.

This update adds an in-service test that audits host health
while there is a configuration failure alarm raised and
clears that alarm if the failure condition goes away. This
could be a result of an in-service manifest that runs and
corrects the configuration or if the node reboots and comes
back up in a healthy (properly configured) state.

Fixed a bug that cleared the config alarm severity state
when a heartbeat clear event was received.

This update also goes a step further and introduces an
alarm state audit that detects and corrects maintenance
alarm state mismatches.

Test Plan:

PASS: Verify the add handler loads config alarm state
PASS: Verify in-service test clears stale config alarm
PASS: Verify in-service test acts on new config failure
      ... degrade - active controller
      ... fail    - other hosts
PASS: Verify audit fixes mtce alarm state mismatches
PASS: Verify audit handles fm not running case
PASS: Verify audit handling behavior with valid alarm cases
PASS: Verify locked alarm management over process restart
PASS: Verify audit only logs active alarms list changes
PASS: Verify audit runs for both locked/unlocked nodes
PASS: Verify update as a patch

Regression:

PASS: Verify enable sequence config failure handling
PASS: ... active controller     - recoverable degrade
PASS: ... other nodes           - threshold fail
PASS: ... auto recovery disable - config failure
PASS: Verify mtcAgent process logging
PASS: Verify heartbeat handling and alarming
PASS: Verify Standard system install
PASS: Verify AIO system install

Change-Id: If9957229810435e9faeb08374f2b5fbcb5b0f826
Closes-Bug: 1918195
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
---
 mtce/src/common/nodeClass.cpp         |  75 ++++++++----
 mtce/src/common/nodeClass.h           |  14 ++-
 mtce/src/maintenance/mtcAlarm.cpp     | 162 ++++++++++++++++++++++++++
 mtce/src/maintenance/mtcAlarm.h       |   3 +
 mtce/src/maintenance/mtcNodeCtrl.cpp  |   9 --
 mtce/src/maintenance/mtcNodeHdlrs.cpp |  89 +++++++-------
 6 files changed, 265 insertions(+), 87 deletions(-)

diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp
index f705f095..0e4cc1ca 100755
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@@ -660,7 +660,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
     {
         ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
     }
-    ptr->alarms_loaded   = false ;
+    ptr->active_alarms = "" ; /* no active alarms */
 
     ptr->cfgEvent.base   = NULL ;
     ptr->sysinvEvent.base= NULL ;
@@ -778,6 +778,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
     return ptr ;
 }
 
+
 struct nodeLinkClass::node* nodeLinkClass::getNode ( string hostname )
 {
    /* check for empty list condition */
@@ -5088,6 +5089,15 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
     }
 }
 
+/****************************************************************************
+ *
+ * Name       : manage_heartbeat_clear
+ *
+ * Description: Manage clearing heartbeat failure status
+ *
+ * Assumptions: Called by both hbsAgent and mtcAgent
+ *
+ ***************************************************************************/
 void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
 {
     nodeLinkClass::node * node_ptr = nodeLinkClass::getNode ( hostname );
@@ -5103,13 +5113,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
             node_ptr->heartbeat_failed[i] = false ;
             if ( i == MGMNT_IFACE )
             {
-                node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
-                node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
+                if ( heartbeat )
+                    node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
+                if ( maintenance )
+                    node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
             }
             if ( i == CLSTR_IFACE )
             {
-                node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
-                node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
+                if ( heartbeat )
+                    node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
+                if ( maintenance )
+                    node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
             }
         }
     }
@@ -5118,13 +5132,17 @@ void nodeLinkClass::manage_heartbeat_clear ( string hostname, iface_enum iface )
         node_ptr->heartbeat_failed[iface] = false ;
         if ( iface == MGMNT_IFACE )
         {
-            node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
-            node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
+            if ( heartbeat )
+                node_ptr->alarms[HBS_ALARM_ID__HB_MGMNT] = FM_ALARM_SEVERITY_CLEAR ;
+            if ( maintenance )
+                node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
         }
         else if ( iface == CLSTR_IFACE )
         {
-            node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
-            node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
+            if ( heartbeat )
+                node_ptr->alarms[HBS_ALARM_ID__HB_CLSTR] = FM_ALARM_SEVERITY_CLEAR ;
+            if ( maintenance )
+                node_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
         }
     }
 }
@@ -9068,21 +9086,21 @@ void nodeLinkClass::mem_log_mtcalive ( struct nodeLinkClass::node * node_ptr )
 {
     char str[MAX_MEM_LOG_DATA] ;
 
-    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n", 
-                node_ptr->hostname.c_str(), 
+    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tmtcAlive: online:%c offline:%c Cnt:%d Gate:%s Misses:%d\n",
+                node_ptr->hostname.c_str(),
                 node_ptr->mtcAlive_online ? 'Y' : 'N',
                 node_ptr->mtcAlive_offline ? 'Y' : 'N',
                 node_ptr->mtcAlive_count,
                 node_ptr->mtcAlive_gate ? "closed" : "open",
-                node_ptr->mtcAlive_misses); 
+                node_ptr->mtcAlive_misses);
     mem_log (str);
 }
 
 void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
 {
     char str[MAX_MEM_LOG_DATA] ;
-    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n", 
-               node_ptr->hostname.c_str(), 
+    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAlarm List:%s%s%s%s%s%s\n",
+               node_ptr->hostname.c_str(),
                node_ptr->alarms[MTC_ALARM_ID__LOCK    ] ? " Locked"   : " .",
                node_ptr->alarms[MTC_ALARM_ID__CONFIG  ] ? " Config"   : " .",
                node_ptr->alarms[MTC_ALARM_ID__ENABLE  ] ? " Enable"   : " .",
@@ -9092,6 +9110,18 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
     mem_log (str);
 }
 
+void nodeLinkClass::mem_log_alarm2 ( struct nodeLinkClass::node * node_ptr )
+{
+    if ( ! node_ptr->active_alarms.empty() )
+    {
+        char str[MAX_MEM_LOG_DATA] ;
+        snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tActive Alarms:%s\n",
+                   node_ptr->hostname.c_str(),
+                   node_ptr->active_alarms.c_str());
+        mem_log (str);
+    }
+}
+
 void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr )
 {
     char str[MAX_MEM_LOG_DATA] ;
@@ -9142,8 +9172,8 @@ void nodeLinkClass::mem_log_network ( struct nodeLinkClass::node * node_ptr )
 {
     char str[MAX_MEM_LOG_DATA] ;
     snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s %s cluster_host_ip: %s Uptime: %u\n",
-                node_ptr->hostname.c_str(), 
-                node_ptr->mac.c_str(), 
+                node_ptr->hostname.c_str(),
+                node_ptr->mac.c_str(),
                 node_ptr->ip.c_str(),
                 node_ptr->clstr_ip.c_str(),
                 node_ptr->uptime );
@@ -9155,11 +9185,11 @@ void nodeLinkClass::mem_log_heartbeat ( struct nodeLinkClass::node * node_ptr )
     char str[MAX_MEM_LOG_DATA] ;
     for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
     {
-        snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s  Monitor:%s\n", 
-                   node_ptr->hostname.c_str(), 
+        snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\t%s Minor:%s Degrade:%s Failed:%s  Monitor:%s\n",
+                   node_ptr->hostname.c_str(),
                    get_iface_name_str (iface),
-                   node_ptr->hbs_minor[iface] ? "true " : "false", 
-                   node_ptr->hbs_degrade[iface] ? "true " : "false", 
+                   node_ptr->hbs_minor[iface] ? "true " : "false",
+                   node_ptr->hbs_degrade[iface] ? "true " : "false",
                    node_ptr->hbs_failure[iface] ? "true " : "false",
                    node_ptr->monitor[iface] ? "YES" : "no"  );
         mem_log (str);
@@ -9188,8 +9218,8 @@ void nodeLinkClass::mem_log_hbs_cnts ( struct nodeLinkClass::node * node_ptr )
 void nodeLinkClass::mem_log_test_info ( struct nodeLinkClass::node * node_ptr )
 {
     char str[MAX_MEM_LOG_DATA] ;
-    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n", 
-                node_ptr->hostname.c_str(), 
+    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tOOS Stage:%s Runs:%d - INSV Stage:%s Runs:%d\n",
+                node_ptr->hostname.c_str(),
                 get_oosTestStages_str(node_ptr->oosTestStage).c_str(),
                 node_ptr->oos_test_count,
                 get_insvTestStages_str(node_ptr->insvTestStage).c_str(),
@@ -9261,6 +9291,7 @@ void nodeLinkClass::memDumpNodeState ( string hostname )
             // mem_log_reset_info ( node_ptr );
             mem_log_power_info ( node_ptr );
             mem_log_alarm1     ( node_ptr );
+            mem_log_alarm2     ( node_ptr );
             mem_log_mtcalive   ( node_ptr );
             mem_log_stage      ( node_ptr );
             mem_log_bm         ( node_ptr );
diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h
index 9b667316..225847a7 100755
--- a/mtce/src/common/nodeClass.h
+++ b/mtce/src/common/nodeClass.h
@@ -652,12 +652,12 @@ private:
 
         /** @} private_monitoring_services_variables */
 
-        /* List of alarms and current severity */
-        #define MAX_ALARMS           (10)
+        /* Current severity of each alarm ; indexed by alarm id */
         EFmAlarmSeverityT alarms[MAX_ALARMS];
 
-        /* tracks whether the alarms for this host have been loaded already or not */
-        bool alarms_loaded ;
+        /* string containing active alarms and their severity
+         * ... for logging purposes only */
+        string active_alarms ;
 
         /** true if this host has recovered before the mnfa timeout period.
          *  This bool flags the graceful recovery handler that this node
@@ -665,8 +665,6 @@ private:
          *  and uptime accordingly */
         bool mnfa_graceful_recovery ;
 
-        int stress_iteration ;
-
         /* BMC Protocol Learning Controls and State */
 
         /* specifies what BMC protocol is selected for this host
@@ -843,6 +841,9 @@ private:
     /* server specific power state query handler */
     bool (*is_poweron_handler) (string hostname, string query_response );
 
+    /* Audit that monitors and auto corrects alarm state mismatches */
+    void mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr );
+
     /* Calculate the overall reset progression timeout */
     int calc_reset_prog_timeout ( struct nodeLinkClass::node * node_ptr, int retries );
 
@@ -1304,6 +1305,7 @@ private:
     void mem_log_state1    ( struct nodeLinkClass::node * node_ptr );
     void mem_log_state2    ( struct nodeLinkClass::node * node_ptr );
     void mem_log_alarm1    ( struct nodeLinkClass::node * node_ptr );
+    void mem_log_alarm2    ( struct nodeLinkClass::node * node_ptr );
     void mem_log_mtcalive  ( struct nodeLinkClass::node * node_ptr );
     void mem_log_stage     ( struct nodeLinkClass::node * node_ptr );
     void mem_log_test_info ( struct nodeLinkClass::node * node_ptr );
diff --git a/mtce/src/maintenance/mtcAlarm.cpp b/mtce/src/maintenance/mtcAlarm.cpp
index 8262da9f..28d1b6bc 100644
--- a/mtce/src/maintenance/mtcAlarm.cpp
+++ b/mtce/src/maintenance/mtcAlarm.cpp
@@ -26,6 +26,7 @@ using namespace std;
 #include "daemon_common.h" /*                                           */
 
 #include "nodeBase.h"      /*                                           */
+#include "nodeClass.h"     /*                                           */
 #include "nodeTimers.h"    /*                                           */
 #include "nodeUtil.h"      /*                                           */
 #include "mtcAlarm.h"      /* for ... this module header                */
@@ -379,8 +380,169 @@ void mtcAlarm_clear_all ( string hostname )
     }
 }
 
+/****************************************************************************
+ *
+ * Name       : mtcAlarm_audit
+ *
+ * Purpose    : Monitor and Auto-Correct maintenance alarms
+ *
+ * Description: Query locked state alarm (raw)
+ *              if successful
+ *                 - Query alarms
+ *                 - compare to running state
+ *                 - correct mismatches ; internal state takes precedence
+ *                 - log all alarm state changes
+ *
+ ****************************************************************************/
+
+void nodeLinkClass::mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr )
+{
+   /*
+    * Read locked state alarm directly to detect fm access failures.
+    * If successful further reads are done using a wrapper utility.
+    */
+    SFmAlarmDataT alarm_query  ;
+    AlarmFilter   alarm_filter ;
+    EFmErrorT     rc           ;
+
+    memset(&alarm_query, 0, sizeof(alarm_query));
+    memset(&alarm_filter, 0, sizeof(alarm_filter));
+    snprintf ( &alarm_filter.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s",
+               LOCK_ALARM_ID);
+    snprintf ( &alarm_filter.entity_instance_id[0], FM_MAX_BUFFER_LENGTH, "%s%s",
+                    ENTITY_PREFIX, node_ptr->hostname.data());
+    rc = fm_get_fault ( &alarm_filter, &alarm_query );
+    if (( rc != FM_ERR_OK ) && ( rc != FM_ERR_ENTITY_NOT_FOUND ))
+    {
+        wlog("%s alarm query failure ; code:%d",
+                 node_ptr->hostname.c_str(),
+                 rc );
+        return ;
+    }
+
+    /* With FM comms proven working, let's check the other mtc alarms */
+    string active_alarms = "";
+    for ( int i = 0 ; i < MAX_ALARMS ; i++ )
+    {
+        mtc_alarm_id_enum id = (mtc_alarm_id_enum)i ;
+        if ( id == MTC_ALARM_ID__LOCK )
+        {
+            /* Unexpected severity case */
+            if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
+            {
+                if ( alarm_query.severity != FM_ALARM_SEVERITY_WARNING )
+                {
+                    node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ;
+
+                    wlog("%s %s alarm mismatch ; %s -> %s",
+                             node_ptr->hostname.c_str(),
+                             _getIdentity(id).c_str(),
+                             alarmUtil_getSev_str(alarm_query.severity).c_str(),
+                             alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
+
+                    mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
+
+                }
+                if (!active_alarms.empty())
+                    active_alarms.append(", ");
+                active_alarms.append(_getIdentity(id) + ":");
+                active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
+            }
+            /* Unexpected assertion case */
+            else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
+                     (  alarm_query.severity != FM_ALARM_SEVERITY_CLEAR ))
+            {
+                node_ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
+
+                wlog("%s %s alarm mismatch ; %s -> %s",
+                         node_ptr->hostname.c_str(),
+                         _getIdentity(id).c_str(),
+                         alarmUtil_getSev_str(alarm_query.severity).c_str(),
+                         alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
+
+                mtcAlarm_clear ( node_ptr->hostname, id );
+            }
+        }
+        else if (( id == MTC_ALARM_ID__CONFIG ) ||
+                 ( id == MTC_ALARM_ID__ENABLE ) ||
+                 ( id == MTC_ALARM_ID__BM     ) ||
+                 ( id == MTC_ALARM_ID__CH_CONT) ||
+                 ( id == MTC_ALARM_ID__CH_COMP))
+        {
+            EFmAlarmSeverityT severity = mtcAlarm_state ( node_ptr->hostname, id);
+            if ( severity != node_ptr->alarms[id] )
+            {
+                ilog ("%s %s alarm mismatch ; %s -> %s",
+                          node_ptr->hostname.c_str(),
+                          _getIdentity(id).c_str(),
+                           alarmUtil_getSev_str(severity).c_str(),
+                           alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
+
+                if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR )
+                {
+                    mtcAlarm_clear ( node_ptr->hostname, id );
+                }
+                else
+                {
+                    mtcAlarm_raise ( node_ptr->hostname, id, node_ptr->alarms[id] );
+                }
+            }
+            if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR )
+            {
+                if (!active_alarms.empty())
+                    active_alarms.append(", ");
+                active_alarms.append(_getIdentity(id) + ":");
+                active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
+            }
+        }
+        /* else don't care about other alarm ids ; logs events etc */
+    }
+
+    /* manage logging of active alarms */
+    if ( !active_alarms.empty() )
+    {
+        if ( node_ptr->active_alarms != active_alarms )
+        {
+            ilog ("%s active alarms: %s",
+                      node_ptr->hostname.c_str(),
+                      active_alarms.c_str());
+
+            node_ptr->active_alarms = active_alarms ;
+        }
+        /* else
+         *    do nothing because there are active alarms
+         *    that have not changed since the last audit.
+         */
+    }
+    else if ( ! node_ptr->active_alarms.empty() )
+    {
+        /* clear active alarm list since there 'were' active alarms
+         * but there are no longer active alarms */
+        node_ptr->active_alarms.clear();
+        ilog ("%s no active alarms", node_ptr->hostname.c_str());
+    }
+    /* else
+     *    no active alarms ; don't log */
+}
+
 /*************************   A L A R M I N G   **************************/
 
+/* Raise the specified maintenance alarm severity */
+int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity )
+{
+    switch ( severity )
+    {
+        case FM_ALARM_SEVERITY_MINOR:
+            return (mtcAlarm_minor(hostname,id));
+        case FM_ALARM_SEVERITY_MAJOR:
+            return (mtcAlarm_major(hostname,id));
+        case FM_ALARM_SEVERITY_CRITICAL:
+            return (mtcAlarm_critical(hostname,id));
+        default:
+            return (FAIL_BAD_PARM);
+    }
+}
+
 /* Clear the specified hosts's maintenance alarm */
 int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id )
 {
diff --git a/mtce/src/maintenance/mtcAlarm.h b/mtce/src/maintenance/mtcAlarm.h
index 25565d4f..6e93f659 100644
--- a/mtce/src/maintenance/mtcAlarm.h
+++ b/mtce/src/maintenance/mtcAlarm.h
@@ -95,6 +95,9 @@ string mtcAlarm_getId_str ( mtc_alarm_id_enum id );
 /** Clear the specified maintenance alarm for specific host */
 int  mtcAlarm_clear    ( string hostname, mtc_alarm_id_enum id );
 
+/** Raise specified severity level alarm for the specified host */
+int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity );
+
 /** Assert a specified mtce alarm against the specified host with a WARNING severity level */
 int  mtcAlarm_warning  ( string hostname, mtc_alarm_id_enum id );
 
diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp
index 152217c6..5e180252 100644
--- a/mtce/src/maintenance/mtcNodeCtrl.cpp
+++ b/mtce/src/maintenance/mtcNodeCtrl.cpp
@@ -1187,15 +1187,6 @@ int _self_provision ( void )
 
             if ( my_identity.name == record_info.name )
             {
-                /* If the active controller was 'locked' and is being auto-corrected
-                 * to 'unlocked' then ensure that there is no locked alarm set for it */
-                if ( record_info.admin != "locked" )
-                {
-                        mtcAlarm_clear ( my_identity.name, MTC_ALARM_ID__LOCK );
-                        /* this is not required because its already inited to clear */
-                        // node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_CLEAR
-                }
-
                 if ( my_identity.mac != record_info.mac )
                 {
                     wlog ("%s mac address mismatch (%s - %s)\n",
diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp
index 8f6ce2f3..36aa10d2 100755
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@@ -6107,48 +6107,32 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                 mtcInvApi_update_state ( node_ptr, "availability", "available" );
             }
 
-            /* handle other cases */
-            EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,
-                                                     MTC_ALARM_ID__ENABLE);
+            /* Query FM for existing Enable and Config alarm status */
+            EFmAlarmSeverityT enable_alarm_severity =
+                mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__ENABLE);
+            EFmAlarmSeverityT config_alarm_severity =
+                mtcAlarm_state ( node_ptr->hostname, MTC_ALARM_ID__CONFIG);
 
-            if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
+            /* Clear generic enable alarm over process restart.
+             * Will get reasserted if the cause condition still exists */
+            if ( enable_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
             {
-                node_ptr->alarms[MTC_ALARM_ID__LOCK] = FM_ALARM_SEVERITY_WARNING ;
-
-                /* If the node is locked then the Enable alarm
-                 * should not be present */
-                if ( sev != FM_ALARM_SEVERITY_CLEAR )
-                {
-                    mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
-                    sev = FM_ALARM_SEVERITY_CLEAR ;
-                }
+                ilog ("%s found enable alarm ; clearing %s",
+                          node_ptr->hostname.c_str(),
+                          alarmUtil_getSev_str(enable_alarm_severity).c_str());
+                mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
             }
 
-            /* Manage enable alarm over process restart.
-             *
-             * - clear the alarm in the active controller case
-             * - maintain the alarm, set degrade state in MAJOR and CRIT cases
-             * - clear alarm for all other severities.
-             */
-            if ( THIS_HOST )
+            /* The config alarm is maintained if it exists.
+             * The in-service test handler will clear the alarm
+             * if the config failure is gone */
+            if ( config_alarm_severity != FM_ALARM_SEVERITY_CLEAR )
             {
-                if ( sev != FM_ALARM_SEVERITY_CLEAR )
-                {
-                    mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
-                }
-            }
-            else
-            {
-                if (( sev == FM_ALARM_SEVERITY_CRITICAL ) ||
-                    ( sev == FM_ALARM_SEVERITY_MAJOR ))
-                {
-                    node_ptr->alarms[MTC_ALARM_ID__ENABLE] = sev ;
-                    node_ptr->degrade_mask |= DEGRADE_MASK_ENABLE ;
-                }
-                else if ( sev != FM_ALARM_SEVERITY_CLEAR )
-                {
-                    mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
-                }
+                node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
+                node_ptr->alarms[MTC_ALARM_ID__CONFIG] = config_alarm_severity ;
+                ilog ("%s found config alarm ; loaded %s",
+                          node_ptr->hostname.c_str(),
+                          alarmUtil_getSev_str(config_alarm_severity).c_str());
             }
 
             if ( is_controller(node_ptr) )
@@ -6188,7 +6172,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                     {
                         ilog ("%s %s\n",node_ptr->hostname.c_str(), MTC_TASK_SWACT_COMPLETE );
 
-                        /* Work Around for issue: */
                         mtcInvApi_update_uptime ( node_ptr, node_ptr->uptime );
 
                         mtcInvApi_update_task ( node_ptr, MTC_TASK_SWACT_COMPLETE );
@@ -6222,7 +6205,6 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
                     mtcSmgrApi_request ( node_ptr, state , SWACT_FAIL_THRESHOLD );
                 }
             }
-
             if ( daemon_get_cfg_ptr()->debug_level & 1 )
                 nodeLinkClass::host_print (node_ptr);
 
@@ -6357,6 +6339,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
         }
         case MTC_ADD__WORKQUEUE_WAIT:
         {
+
             rc = workQueue_done ( node_ptr );
             if ( rc == RETRY )
             {
@@ -6444,6 +6427,7 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
             }
 
             node_ptr->addStage = MTC_ADD__START;
+
             plog ("%s Host Add Completed (uptime:%d)\n", node_ptr->hostname.c_str(), node_ptr->uptime );
             node_ptr->add_completed = true ;
             break ;
@@ -7202,6 +7186,9 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
                 }
             }
 
+            /* audit alarms */
+            mtcAlarm_audit (node_ptr );
+
             break ;
         }
         case MTC_OOS_TEST__WAIT:
@@ -7600,7 +7587,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
                 }
             }
 
-            /* Monitor the health of the host - no pass file */
+            /* Monitor the health of the host */
             if ((  node_ptr->adminState  == MTC_ADMIN_STATE__UNLOCKED ) &&
                 (  node_ptr->operState   == MTC_OPER_STATE__ENABLED   ) &&
                 (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
@@ -7626,6 +7613,11 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
                     ilog ("%s sm degrade clear\n", node_ptr->hostname.c_str());
                 }
 
+                /*
+                 * In-service Config Failure/Alarm handling
+                 */
+
+                /* Detect new config failure condition */
                 if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)
                 {
                     /* not healthy .... */
@@ -7637,16 +7629,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
                         {
                             wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
                             if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
-                            {
-                                node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;
-
-                                /* threshold is reached so raise the config alarm if it is not already raised */
-                                if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CRITICAL )
-                                {
-                                    mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
-                                    node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CRITICAL ;
-                                }
-                            }
+                                alarm_config_failure ( node_ptr );
                         }
                     }
                     else
@@ -7666,6 +7649,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
                         }
                     }
                 }
+                /* or correct an alarmed config failure that has cleared */
+                else if ( node_ptr->degrade_mask & DEGRADE_MASK_CONFIG )
+                {
+                    if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_HEALTHY )
+                        alarm_config_clear ( node_ptr );
+                }
                 else
                 {
                     node_ptr->health_threshold_counter = 0 ;