Mtce: Make Multi-Node Failure Avoidance Configurable

The maintenance system implements a high availability (HA) feature designed to detect the simultaneous heartbeat failure of a group of hosts and avoid failing all those hosts until heartbeat resumes or a set period of time has elapsed. This feature is called Multi-Node Failure Avoidance, aka MNFA, and currently has the host threshold set to 3 and the timeout set to 100 secs. This update implements enhancements to that existing feature by making the 'number-of-hosts threshold' and 'timeout period' customer configurable service parameters. The new service parameters are listed under platform:maintenance and are displayed with the following command > system service-parameter-list mnfa_threshold: This new label and value are added to the puppet managed /etc/mtc.ini and represent the number of hosts that are required to fail heartbeat as a group, within the heartbeat failure window (heartbeat_failure_threshold), after which maintenance activates MNFA Mode. This update changes the default number of failing hosts from 3 to 2 while allowing a configurable range from 2 to 100. mnfa_timeout: This new label and value are added to the puppet managed /etc/mtc.ini. While MNFA mode is active, it will remain active until the number of failing hosts drops below the mnfa_threshold or this timer expires. The MNFA mode deactivates on the first occurrence of either case. Upon deactivation the remaining failed hosts are no longer treated as a failure group but instead are all Gracefully Recovered individually. A value of zero imposes no timeout, making the deactivation criteria solely host based. This update changes the default 100 second timer to 0 (no timeout) while permitting a valid timeout range from 100 to 86400 secs (1 day). 
Test Plan: PASS - Verify duplex and 4 compute DOR PASS - Verify default MNFA - 1 inactive controller and 4 computes PASS - Verify default MNFA - 4 computes PASS - Verify default MNFA - 1 active controller and 3 computes and failed host PASS - Verify Single host heartbeat failure handling - fail host PASS - Verify Multi Node failure below mnfa_threshold - fail hosts PASS - Verify MNFA handling with timeout of zero and threshold of 3 PASS - Verify MNFA timeout handling with timeout set at 100 sec PASS - Verify MNFA service parameter listing, default value and mtc.ini PASS - Verify MNFA service parameter change and in-service apply PASS - Verify MNFA timeout service parameter change from value to 0 PASS - Verify MNFA timeout service parameter change from 0 to in-range value PASS - Verify MNFA service parameter out of range change handling PASS - Verify MNFA timeout change from No-Timeout to 100 sec (while active) DocImpact Story: 2003576 Task: 24903 Change-Id: Ib56dd79b38c3726e042cf34aae361f229c89940b Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
2018-08-29 15:19:20 -04:00 · 2018-08-29 15:19:20 -04:00 · 82e851d651
commit 82e851d651
parent 482d1acea8
10 changed files with 121 additions and 208 deletions
--- a/mtce-common/centos/build_srpm.data
+++ b/mtce-common/centos/build_srpm.data
@ -1,3 +1,3 @@
 SRC_DIR="cgts-mtce-common-1.0"
-TIS_PATCH_VER=135
+TIS_PATCH_VER=136
 BUILD_IS_SLOW=5
--- a/mtce-common/cgts-mtce-common-1.0/common/logMacros.h
+++ b/mtce-common/cgts-mtce-common-1.0/common/logMacros.h
@ -147,21 +147,6 @@ typedef struct

    int   latency_thld          ; /**< scheduling latency threshold in msec b4 log */

-    /** Multi Node Failure Avoidance Controls                                 */
-    char * mnfa_threshold_type     ; /**< value used in multi node failure
-                                          avoidance calculation ;
-                                          'number' / 'percent'age of hosts */
-    int    mnfa_threshold_percent  ; /**< number of hosts simultaneously
-                                          failing heartbeat                */
-    int    mnfa_threshold_number   ; /**< percentage of pool
-                                          simultanepously failing heartbeat*/
-    int    mnfa_recovery_threshold ; /**< Multi-Node-Failure Avoidance Recovery Threshold
-                                          Similar to the LOC above for graceful recovery
-                                          hosts that have LOC for longer than this time in
-                                          seconds are failed and sent into the enable_handler
-                                          FSM while those that recover before this period are
-                                          sent into the graceful recovery_handler FSM. */
-
    /** Configurable Timeouts ; unit is 'seconds'                             */
    int   controller_mtcalive_timeout  ; /**< mtcAlive wait timeout           */
    int   compute_mtcalive_timeout     ; /**< mtcAlive wait timeout           */
@ -172,7 +157,6 @@ typedef struct
    int   sysinv_noncrit_timeout       ; /**< sysinv nonc request timeout     */
    int   work_queue_timeout           ; /**< end of action workq complete TO */
    int   loc_recovery_timeout         ; /**< loss of comms recovery timeout  */
-    int   mnfa_recovery_timeout        ; /**< mnfa recovery timeout           */
    int   node_reinstall_timeout       ; /**< node reinstall timeout          */
    int   dor_mode_timeout             ; /**< dead office recovery timeout    */
    int   dor_recovery_timeout_ext     ; /**< dor recovery timeout extension  */
--- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp
+++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.cpp
@ -218,7 +218,6 @@ nodeLinkClass::nodeLinkClass()
    this->controller_mtcalive_timeout = 0;
    this->goenabled_timeout           = 0;
    this->loc_recovery_timeout        = 0;
-    this->mnfa_recovery_timeout       = 0;
    this->node_reinstall_timeout      = 0;
    this->token_refresh_rate          = 0;
    this->autorecovery_enabled        = false ;
@ -270,16 +269,16 @@ nodeLinkClass::nodeLinkClass()
    active_controller_hostname.clear() ;
    inactive_controller_hostname.clear() ;

+    /* MNFA Activity Controls */
+    mnfa_threshold  = 2 ; /* 2 hosts    */
+    mnfa_timeout    = 0 ; /* no timeout */
+
    /* Start with no failures */
    mnfa_awol_list.clear();
    mnfa_host_count[MGMNT_IFACE] = 0 ;
    mnfa_host_count[INFRA_IFACE] = 0 ;
    mnfa_occurances = 0 ;
    mnfa_active     = false ;
-    mnfa_threshold_type    = MNFA_NUMBER ;
-    mnfa_threshold_percent = 5 ;
-    mnfa_threshold_number  = 3 ;
-    mnfa_threshold = mnfa_threshold_number ;

    mgmnt_link_up_and_running = false ;
    infra_link_up_and_running = false ;
@ -4309,10 +4308,9 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
        {
            /* If we are mnfa_active AND now below the threshold
             * then trigger mnfa_exit */
-            if (( --mnfa_host_count[iface] < mnfa_calculate_threshold( node_ptr->hostname ) ) &&
+            if (( --mnfa_host_count[iface] < mnfa_threshold) &&
                   ( mnfa_active == true ))
            {
-    
                wlog ("%s MNFA exit with graceful recovery (%s:%d)\n",
                          node_ptr->hostname.c_str(),
                          get_iface_name_str(iface),
@ -4468,6 +4466,8 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
    }
    else
    {
+        alarm_enabled_failure (node_ptr);
+
        mnfa_add_host ( node_ptr , iface );

        if ( mnfa_active == false )
@ -4481,17 +4481,13 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
            {
                node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
            }
-            if ( mnfa_host_count[iface] < mnfa_calculate_threshold( hostname ))
+            if (mnfa_host_count[iface] < this->mnfa_threshold)
            {
-
                elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));

                nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
-                if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
-                {
-                    mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
-                    node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL;
-                }
+
+                alarm_enabled_failure (node_ptr);

                if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
                    ( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
@ -8296,19 +8292,12 @@ void nodeLinkClass::mem_log_dor ( struct nodeLinkClass::node * node_ptr )
 void nodeLinkClass::mem_log_mnfa ( void )
 {
    char str[MAX_MEM_LOG_DATA] ;
-
-    int temp = mnfa_threshold_number ;
-    if ( mnfa_threshold_type == MNFA_PERCENT )
-        temp = mnfa_threshold_percent ;
-
-    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s MNFA: Mode:%s:%d State:%s Hosts:%d:%d Cases:%d Threshold:%d\n", 
+    snprintf (&str[0], MAX_MEM_LOG_DATA, "%s MNFA: State:%s Hosts:%d:%d Threshold:%d Occurances:%d\n",
                my_hostname.c_str(),
-                mnfa_threshold_type ? "Percent" : "Number",
-                temp,
                mnfa_active ? "ACTIVE" : "inactive",
                mnfa_host_count[MGMNT_IFACE],
                mnfa_host_count[INFRA_IFACE],
-                mnfa_calculate_threshold( "" ),
+                mnfa_threshold,
                mnfa_occurances);
    mem_log (str);
 }
--- a/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h
+++ b/mtce-common/cgts-mtce-common-1.0/common/nodeClass.h
@ -1149,11 +1149,6 @@ private:
    /** Tracks the number of times multi failure avoidance was exited */
    int mnfa_occurances ;

-    /** true when the multi node failure count exceeds the multi
-     *  node failure avoidance threshold and until there are no more
-     *  in service trouble hosts */
-    bool mnfa_active ;
-
    /** Recover or exit from the muli-node failure avoidance state
     *  This involves restarting the heartbeat on all the nodes
     *  that remain hbs_minor and clearing any heartbneat degrade
@ -1428,6 +1423,11 @@ public:
    std::list<string> hostname_inventory ;
    std::list<string>::iterator host ;

+    /** true when the multi node failure count exceeds the multi
+     *  node failure avoidance threshold and until there are no more
+     *  in service trouble hosts */
+    bool mnfa_active ;
+
    std::list<string>           mnfa_awol_list ;
    void                        mnfa_timeout_handler ( void );

@ -1722,23 +1722,17 @@ public:
    int  inotify_shadow_file_fd ;
    int  inotify_shadow_file_wd ;

-    /** The multi node failure avoidance type */
-    #define MNFA_NUMBER  0
-    #define MNFA_PERCENT 1
-    int mnfa_threshold_type    ;
+    /* MNFA Timeout
+     *
+     * Time in secs MNFA can remain active.
+     * If 0 then there is no timeout. */
+    int mnfa_timeout ;

-    /** % of hosts that need to simultaneously fail before 'mnfa' kicks in */
-    int mnfa_threshold_percent ;
-
-    /** # of hosts that need to simultaneously fail before 'mnfa' kicks in */
-    int mnfa_threshold_number  ;
-
-    /** the calculated threshold */
+    /* MNFA Host Involvement Threshold
+     * Number of hosts simultaneously failing heartbeat
+     * upon which feature will activate */
    int mnfa_threshold ;

-    /** Calculates and returns the mnfa threshold based on enabled hosts */
-    int mnfa_calculate_threshold ( string hostname );
-
    /* collectd event handler */
    int collectd_notify_handler ( string & hostname,
                                  string & resource,
@ -1997,7 +1991,6 @@ public:
    int sysinv_timeout         ;
    int sysinv_noncrit_timeout ;
    int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout        */
-    int mnfa_recovery_timeout; /**< Multi-Node-Failure Avoidance Recovery Timeout */
    int work_queue_timeout   ;
    int node_reinstall_timeout ;

--- a/mtce-common/cgts-mtce-common-1.0/daemon/daemon_config.cpp
+++ b/mtce-common/cgts-mtce-common-1.0/daemon/daemon_config.cpp
@ -46,7 +46,6 @@ void daemon_config_default ( daemon_config_type* config_ptr )
    config_ptr->sysinv_api_bind_ip    = strdup("none");
    config_ptr->mode                  = strdup("none");
    config_ptr->fit_host              = strdup("none");
-    config_ptr->mnfa_threshold_type   = strdup("none");
    config_ptr->multicast             = strdup("none");

    config_ptr->debug_all    = 0 ;
@ -174,14 +173,6 @@ int timeout_config_handler (       void * user,
        config_ptr->loc_recovery_timeout = atoi(value);
        ilog ("LOC  Timeout: %3d secs\n", config_ptr->loc_recovery_timeout );
    }
-    else if (MATCH("timeouts", "mnfa_recovery_timeout"))
-    {
-        config_ptr->mnfa_recovery_timeout = atoi(value);
-        if (( config_ptr->mnfa_recovery_timeout > 300 ) ||
-            ( config_ptr->mnfa_recovery_timeout == 0 ))
-              config_ptr->mnfa_recovery_timeout = 5 ;
-        ilog ("MNFA Timeout: %3d secs\n", config_ptr->mnfa_recovery_timeout );
-    }
    else if (MATCH("timeouts", "node_reinstall_timeout"))
    {
        config_ptr->node_reinstall_timeout = atoi(value);
@ -254,7 +245,6 @@ void daemon_dump_cfg ( void )
    if ( strcmp(ptr->infra_iface, "none" )) { ilog ("infra_iface           = %s\n", ptr->infra_iface    );}
    if ( strcmp(ptr->multicast, "none"   )) { ilog ("multicast             = %s\n", ptr->multicast );}

-    
    if ( ptr->ha_port        ) { ilog ("ha_port               = %d\n", ptr->ha_port               );}
    if ( ptr->vim_cmd_port   ) { ilog ("vim_cmd_port          = %d\n", ptr->vim_cmd_port          );}
    if ( ptr->vim_event_port ) { ilog ("vim_event_port        = %d\n", ptr->vim_event_port        );}
@ -316,10 +306,6 @@ void daemon_dump_cfg ( void )
    if ( ptr->stall_rec_thld    ) { ilog ("stall_rec_thld        = %d\n", ptr->stall_rec_thld       );}

    /* mtcAgent */
-    if ( ptr->mnfa_threshold_type        ) { ilog ("mnfa_threshold_type   = %s\n", ptr->mnfa_threshold_type         );}
-    if ( ptr->mnfa_threshold_percent     ) { ilog ("mnfa_threshold_percent= %d\n", ptr->mnfa_threshold_percent      );}
-    if ( ptr->mnfa_threshold_number      ) { ilog ("mnfa_threshold_number = %d\n", ptr->mnfa_threshold_number       );}
-    if ( ptr->mnfa_recovery_threshold    ) { ilog ("mnfa_recovery_threshod= %d\n", ptr->mnfa_recovery_threshold     );}
    if ( ptr->controller_mtcalive_timeout) { ilog ("controller_mtcalive_to= %d\n", ptr->controller_mtcalive_timeout );}
    if ( ptr->compute_mtcalive_timeout   ) { ilog ("compute_mtcalive_to   = %d\n", ptr->compute_mtcalive_timeout    );}
    if ( ptr->goenabled_timeout          ) { ilog ("goenabled_timeout     = %d\n", ptr->goenabled_timeout           );}
@ -328,7 +314,6 @@ void daemon_dump_cfg ( void )
    if ( ptr->sysinv_noncrit_timeout     ) { ilog ("sysinv_noncrit_timeout= %d\n", ptr->sysinv_noncrit_timeout      );}
    if ( ptr->work_queue_timeout         ) { ilog ("work_queue_timeout    = %d\n", ptr->work_queue_timeout          );}
    if ( ptr->loc_recovery_timeout       ) { ilog ("loc_recovery_timeout  = %d\n", ptr->loc_recovery_timeout        );}
-    if ( ptr->mnfa_recovery_timeout      ) { ilog ("mnfa_recovery_timeout = %d\n", ptr->mnfa_recovery_timeout       );}
    if ( ptr->node_reinstall_timeout     ) { ilog ("node_reinstall_timeout= %d\n", ptr->node_reinstall_timeout      );}
    if ( ptr->uptime_period              ) { ilog ("uptime_period         = %d\n", ptr->uptime_period               );}
    if ( ptr->online_period              ) { ilog ("online_period         = %d\n", ptr->online_period               );}
--- a/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsStubs.cpp
+++ b/mtce-common/cgts-mtce-common-1.0/heartbeat/hbsStubs.cpp
@ -270,12 +270,6 @@ int daemon_log_message ( const char * hostname,
    return(PASS);
 }

-
-int  nodeLinkClass::mnfa_calculate_threshold ( string hostname )
-{
-    UNUSED(hostname);
-    return(PASS) ;
-}
 void nodeLinkClass::mnfa_add_host     ( struct nodeLinkClass::node * node_ptr, iface_enum iface )
 { node_ptr = node_ptr ; iface = iface ; }
 void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
--- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeCtrl.cpp
+++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeCtrl.cpp
@ -295,18 +295,6 @@ static int mtc_config_handler ( void * user,
        config_ptr->mask |= CONFIG_AGENT_API_RETRIES ;
        mtcInv.api_retries = config_ptr->api_retries ;
    }
-    else if (MATCH("agent", "mnfa_threshold_type"))
-    {
-        config_ptr->mnfa_threshold_type = strdup(value);
-    }
-    else if (MATCH("agent", "mnfa_threshold_percent"))
-    {
-        config_ptr->mnfa_threshold_percent = atoi(value);
-    }
-    else if (MATCH("agent", "mnfa_threshold_number"))
-    {
-        config_ptr->mnfa_threshold_number = atoi(value);
-    }
    else if (MATCH("timeouts", "failsafe_shutdown_delay"))
    {
        config_ptr->failsafe_shutdown_delay = atoi(value);
@ -335,6 +323,55 @@ static int mtc_config_handler ( void * user,
    return (FAIL);
 }

+/*****************************************************************************
+ *
+ * Name       : mtc_ini_handler
+ *
+ * Description: ini_parse callback that loads the Multi-Node Failure
+ *              Avoidance (MNFA) settings into mtcInv.
+ *
+ *   [agent]    mnfa_threshold : number of hosts that must fail heartbeat
+ *                               together before MNFA activates.
+ *   [timeouts] mnfa_timeout   : seconds MNFA may stay active ; 0 = never.
+ *
+ * A change of mnfa_timeout while MNFA is already active is applied right
+ * away by resetting the MNFA timer and, for a non-zero value, restarting
+ * it with the new period.
+ *
+ * Params     : user    - unused
+ *              section - ini section of the current entry
+ *              name    - ini label of the current entry
+ *              value   - ini value of the current entry, as a string
+ *
+ * Returns    : PASS, always ; labels that do not match are ignored.
+ *
+ * NOTE(review): the inih library documents a zero handler return as an
+ *               error for that line ; confirm the PASS return is intended.
+ *
+ *****************************************************************************/
+static int mtc_ini_handler   ( void * user,
+                         const char * section,
+                         const char * name,
+                         const char * value)
+{
+    UNUSED(user);
+
+    if (MATCH("agent", "mnfa_threshold"))
+    {
+        /* Don't allow the multi-node failure avoidance threshold
+         * to ever be 1 (or less) or we would never fail a host.
+         * An unparsable value yields 0 from atoi and is caught here. */
+        const int mnfa_threshold_min = 2 ;
+        int threshold = atoi(value);
+        if ( threshold < mnfa_threshold_min )
+        {
+            wlog ("MNFA Threshd: %d is invalid ; using %d\n",
+                   threshold, mnfa_threshold_min );
+            threshold = mnfa_threshold_min ;
+        }
+        mtcInv.mnfa_threshold = threshold ;
+        ilog ("MNFA Threshd: %d\n", mtcInv.mnfa_threshold);
+    }
+    else if (MATCH("timeouts", "mnfa_timeout"))
+    {
+        const int previous = mtcInv.mnfa_timeout ;
+        mtcInv.mnfa_timeout = atoi(value);
+        if ( mtcInv.mnfa_timeout == 0 )
+        {
+            ilog ("MNFA Timeout: Never\n");
+        }
+        else
+        {
+            ilog ("MNFA Timeout: %3d secs\n", mtcInv.mnfa_timeout );
+        }
+
+        /* handle a change in mnfa timeout while MNFA is active */
+        if (( mtcInv.mnfa_active  == true ) &&
+            ( mtcInv.mnfa_timeout != previous ))
+        {
+            /* drop whatever timer is running before applying the new value */
+            mtcTimer_reset ( mtcInv.mtcTimer_mnfa );
+            if ( mtcInv.mnfa_timeout != 0 )
+            {
+                wlog ("MNFA Auto-Recovery in %d seconds\n",
+                       mtcInv.mnfa_timeout);
+
+                mtcTimer_start ( mtcInv.mtcTimer_mnfa,
+                                 mtcTimer_handler,
+                                 mtcInv.mnfa_timeout);
+            }
+            else
+            {
+                /* new value is 0 so the previous one was a real timeout */
+                ilog ("MNFA timer set to no-timeout ; previous %d sec timer cancelled\n", previous );
+            }
+        }
+    }
+    return (PASS);
+}
+
+
 /* Read and process mtc.ini file settings into the daemon configuration */
 int daemon_configure ( void )
 {
@ -350,6 +387,12 @@ int daemon_configure ( void )
        return (FAIL_LOAD_INI);
    }

+    if (ini_parse(MTCE_INI_FILE, mtc_ini_handler, &mtc_config) < 0)
+    {
+        elog ("Can't load '%s'\n", MTCE_INI_FILE );
+        return (FAIL_LOAD_INI);
+    }
+
    if (ini_parse(MTCE_INI_FILE, keystone_config_handler, &mtc_config) < 0)
    {
        elog ("Can't load '%s'\n", MTCE_INI_FILE );
@ -406,14 +449,12 @@ int daemon_configure ( void )
        mtcInv.goenabled_timeout = DEFAULT_GOENABLE_TIMEOUT ;

    mtcInv.loc_recovery_timeout  = mtc_config.loc_recovery_timeout ;
-    mtcInv.mnfa_recovery_timeout = mtc_config.mnfa_recovery_timeout ;

    if ( mtc_config.node_reinstall_timeout )
        mtcInv.node_reinstall_timeout = mtc_config.node_reinstall_timeout ;
    else
        mtcInv.node_reinstall_timeout = MTC_REINSTALL_TIMEOUT_DEFAULT ;

-
    if ( mtc_config.dor_mode_timeout <= 0 )
    {
        slog ("DOR Mode Timeout is invalid (%d), setting to default (%d)\n",
@ -423,25 +464,6 @@ int daemon_configure ( void )
        mtc_config.dor_mode_timeout = DEFAULT_DOR_MODE_TIMEOUT ;
    }

-    /* validate and auto correct manage multi node failure avoidance thresholds */
-    if (( mtc_config.mnfa_threshold_type != NULL ) &&
-        ( !strncmp (mtc_config.mnfa_threshold_type, "percent", strlen("percent"))))
-    {
-        if ( mtc_config.mnfa_threshold_percent > 100 )
-        {
-             mtc_config.mnfa_threshold_percent = 100 ;
-        }
-        mtcInv.mnfa_threshold_type = MNFA_PERCENT ;
-        ilog ("mnfAvoidance: %d%c\n", mtc_config.mnfa_threshold_percent, '%' );
-        mtcInv.mnfa_threshold_percent = mtc_config.mnfa_threshold_percent ;
-    }
-    else
-    {
-        mtcInv.mnfa_threshold_type = MNFA_NUMBER ;
-        ilog ("mnfAvoidance: %d hosts\n", mtc_config.mnfa_threshold_number );
-        mtcInv.mnfa_threshold_number = mtc_config.mnfa_threshold_number ;
-    }
-
    if ( mtc_config.swact_timeout )
    {
        if ( mtc_config.swact_timeout < (MTC_SWACT_POLL_TIMER*2))
--- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeHdlrs.cpp
@ -1690,11 +1690,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                        /* Go to the goEnabled stage */
                        recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );

-                        if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
-                        {
-                            mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
-                            node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
-                        }
+                        alarm_enabled_failure(node_ptr);
                        break ;
                    }
                }
@ -1732,11 +1728,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                    /* Go to the goEnabled stage */
                    recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );

-                    if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
-                    {
-                        mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
-                        node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
-                    }
+                    alarm_enabled_failure (node_ptr);
                }
            }
            /* A timer ring indicates that the host is not up */
@ -1780,11 +1772,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
                /* Inform the VIM that this host has failed */
                mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );

-                if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
-                {
-                    mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
-                    node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
-                }
+                alarm_enabled_failure(node_ptr);
+
                /* Clear all degrade flags except for the HWMON one */
                clear_host_degrade_causes ( node_ptr->degrade_mask );
                node_ptr->degraded_resources_list.clear();
--- a/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeMnfa.cpp
+++ b/mtce-common/cgts-mtce-common-1.0/maintenance/mtcNodeMnfa.cpp
@ -41,48 +41,6 @@ void log_mnfa_pool ( std::list<string> & mnfa_awol_list )
    ilog ("MNFA POOL:%s\n", pool_list.c_str());
 }

-/*******************************************************************************
- *
- * Name       : mnfa_calculate_threshold
- *
- * Description: Calculates and returns the mnfa threshold based
- *              on enabled hosts.
- *
- * Auto corrects the value to a min number.
- *
- * Calculate the multi-node failure avoidance handling threshold
- * This is the number of hosts than need to fail simultaneously
- * in order to trigger mode ; i.e. mnfa_active=true
- *
- *******************************************************************************/
-int nodeLinkClass::mnfa_calculate_threshold ( string hostname )
-{
-    int mnfa_enabled_nodes = enabled_nodes ();
-
-    /* Calculate the threshold */
-    if ( mnfa_threshold_type == MNFA_PERCENT )
-        mnfa_threshold = mnfa_enabled_nodes / mnfa_threshold_percent ;
-    else
-        mnfa_threshold = mnfa_threshold_number ;
-
-    /* Don't allow the multi-node failure avoidance
-     * to ever be 1 or we would never fail a host */
-    if ( mnfa_threshold < mnfa_threshold_number )
-    {
-        ilog ("%s MNFA threshold rounded to %d from %d\n",
-               hostname.c_str(),
-               mnfa_threshold_number,
-               mnfa_enabled_nodes / mnfa_threshold_percent );
-        mnfa_threshold = mnfa_threshold_number ;
-    }
-
-    if ( mnfa_awol_list.size() )
-    {
-        log_mnfa_pool ( mnfa_awol_list );
-    }
-    return (mnfa_threshold);
-}
-
 /*****************************************************************************
 *
 * Name       : mnfa_add_host
@ -105,6 +63,8 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
        /* if we are active then add the node to the awol list */
        if ( mnfa_active == true )
        {
+            alarm_enabled_failure (node_ptr);
+
            /* once we are mnfa_active we need to give all the
             * hbs_minor=true hosts a graceful recovery token
             * mnfa_graceful_recovery = true and add to the awol list */
@ -116,7 +76,7 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
                mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
        }
        else if (( mnfa_active == false ) &&
-                 ( mnfa_host_count[iface] >= mnfa_calculate_threshold( node_ptr->hostname )))
+                 ( mnfa_host_count[iface] >= this->mnfa_threshold))
        {
            enter = true ;
        }
@ -134,6 +94,11 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
                 get_iface_name_str(INFRA_IFACE),
                 node_ptr->hbs_minor_count[INFRA_IFACE]);

+        if ( mnfa_awol_list.size() )
+        {
+            log_mnfa_pool ( mnfa_awol_list );
+        }
+
        if ( enter == true )
        {
            mnfa_enter ();
@ -196,7 +161,7 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
 *     mnfa_graceful_recovery token
 *
 *  5. Start the MNFA Auto-Recovery timer with time based on the config
- *     setting mnfa_recovery_timeout
+ *     setting mnfa_timeout
 *
 ****************************************************************************/
 void nodeLinkClass::mnfa_enter ( void )
@ -211,8 +176,7 @@ void nodeLinkClass::mnfa_enter ( void )
      * previous mnfa but the failure case occurs again. If that
      * happens we need to cancel the timer that will issue
      * the period recovery command. */
-     if ( mtcTimer_mnfa.tid )
-         mtcTimer_stop ( mtcTimer_mnfa );
+     mtcTimer_reset ( mtcTimer_mnfa );

     /* Loop through inventory and recover each host that
      * remains in the hbs_minor state.
@ -232,6 +196,7 @@ void nodeLinkClass::mnfa_enter ( void )
             if ( ptr->task != MTC_TASK_RECOVERY_WAIT )
                mtcInvApi_update_task ( ptr, MTC_TASK_RECOVERY_WAIT );

+             alarm_enabled_failure (ptr);
         }
         if (( ptr->next == NULL ) || ( ptr == tail ))
             break ;
@ -239,14 +204,20 @@ void nodeLinkClass::mnfa_enter ( void )

     mnfa_awol_list.unique();

-     /* Start the timer that will eventually send the MTC_RECOVER_HBS command */
-     wlog ("MNFA Auto-Recovery in %d seconds\n",       mnfa_recovery_timeout);
-     mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, mnfa_recovery_timeout);
+     if ( this->mnfa_timeout )
+     {
+         wlog ("MNFA Auto-Recovery in %d seconds\n",       this->mnfa_timeout);
+         mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout);
+     }
+     if ( mnfa_awol_list.size() )
+     {
+         log_mnfa_pool ( mnfa_awol_list );
+     }
 }

 /****************************************************************************
 *
- * Name       : mnfa_enter
+ * Name       : mnfa_exit
 *
 * Description: Perform the operations required to exit mnfa mode
 * These include ...
@ -266,7 +237,7 @@ void nodeLinkClass::mnfa_enter ( void )
 *     hosts that remain in the hbs_minor state.
 *
 * if ( force == true )
- *    The mnfa_recovery_timeout has expired
+ *    The mnfa_timeout has expired
 *    All hosts in the awol list are forced failed and into the
 *       enable_handler FSM.
 * else
@ -279,18 +250,20 @@ void nodeLinkClass::mnfa_exit ( bool force )
 {
    if ( mnfa_active == true )
    {
-        wlog ("MNFA EXIT <-- Exiting Multi-Node Failure Avoidance %s\n",
-                     force ? "(Auto-Recover)" : "");
-
-        mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT );
        mnfa_occurances++ ;
        mnfa_active = false ;
-
        if ( force == true )
        {
            elog ("... MNFA %d sec timeout - forcing full enable on ... \n",
-                       mnfa_recovery_timeout);
+                       this->mnfa_timeout);
+        }

+        wlog ("MNFA EXIT <-- Exiting Multi-Node Failure Avoidance %s\n",
+                     force ? "(Auto-Recover)" : "");
+        mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT );
+
+        if ( mnfa_awol_list.size() )
+        {
            log_mnfa_pool ( mnfa_awol_list );
        }

@ -342,8 +315,7 @@ void nodeLinkClass::mnfa_exit ( bool force )
        }

        /* Stop the ... failure -> full enable ... window timer if it is active */
-        if ( mtcTimer_mnfa.tid )
-            mtcTimer_stop ( mtcTimer_mnfa );
+        mtcTimer_reset ( mtcTimer_mnfa );

        /* Start the timer that will eventually send the MTC_RECOVER_HBS command */
        mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER );
--- a/mtce-common/cgts-mtce-common-1.0/scripts/mtc.conf
+++ b/mtce-common/cgts-mtce-common-1.0/scripts/mtc.conf
@ -18,14 +18,6 @@ offline_threshold = 46        ; number of back to back mtcAlive requests missed
                              ; 100:46 will yield a typical 5 sec holdoff from
                              ; failed to offline

-mnfa_threshold_type = number  ; Two different types are supported
-                              ;  'number' or 'percent' of simultaneous
-                              ;  failures that enable multi-node
-                              ;  failure avoidance handling
-                              ;
-mnfa_threshold_percent = 10   ; if ( mnfa_threshold_type == percent )
-mnfa_threshold_number = 3     ; if ( mnfa_threshold_type == number )
-
 inventory_port = 6385         ; The Inventory Port Number
 keystone_port = 5000          ; The Keystone Port Number
 ha_port = 7777                ; The Inventory Port Number
@ -86,13 +78,6 @@ loc_recovery_timeout = 5      ; Loss Of Communication Recovery Timeout
                              ;  the max number of seconds that a host can be in
                              ;  loss of communication state without failing the unit

-mnfa_recovery_timeout = 100   ; Multi-Node-Failure Avoidance Recovery Threshold
-                              ;  Similar to the LOC above for graceful recovery
-                              ;  hosts that have LOC for longer than this time in
-                              ;  seconds are failed and sent into the enable_handler
-                              ;  FSM while those that recover before this period are
-                              ;  sent into the graceful recovery_handler FSM.
-
 dor_mode_timeout = 20           ; The default base time in seconds for how long
                                ; maintenance DOR mode is active. This number
                                ; is extended by the number of enabled hosts.