Merge "Mtce: Add Thresholded Maintenance Enable Recovery support"

This commit is contained in:
Zuul 2018-12-13 15:57:44 +00:00 committed by Gerrit Code Review
commit 8eb55b2b03
10 changed files with 723 additions and 405 deletions

View File

@ -1,3 +1,3 @@
SRC_DIR="src"
TIS_PATCH_VER=140
TIS_PATCH_VER=141
BUILD_IS_SLOW=5

View File

@ -166,6 +166,18 @@ typedef struct
int kernwd_update_period ; /**< expect kernel watchdog to be updated */
int autorecovery_threshold ; /**< AIO stop autorecovery threshold */
/**< Auto Recovery Thresholds */
int ar_config_threshold ; /**< Configuration Failure Threshold */
int ar_goenable_threshold ; /**< GoEnable Failure Threshold */
int ar_hostservices_threshold ; /**< Host Services Failure Threshold */
int ar_heartbeat_threshold ; /**< Heartbeat Soak Failure Threshold*/
/**< Auto Recovery Retry Intervals */
int ar_config_interval ; /**< Configuration Failure Interval */
int ar_goenable_interval ; /**< GoEnable Failure Interval */
int ar_hostservices_interval ; /**< Host Services Failure Interval */
int ar_heartbeat_interval ; /**< Heartbeat Soak Failure Interval */
int debug_all ;
int debug_json ; /**< Enable jlog (json string ) output if not false */
int debug_timer ; /**< Enable tlog (timer logs ) output if not false */

View File

@ -229,28 +229,35 @@ void daemon_exit ( void );
#define MTC_TASK_INIT_FAIL "Initialization Failed, recovering"
#define MTC_TASK_START_SERVICE_FAIL "Start Services Failed"
#define MTC_TASK_START_SERVICE_TO "Start Services Timeout"
#define MTC_TASK_ENABLING "Enabling"
#define MTC_TASK_ENABLING_SUBF "Enabling Compute Service"
#define MTC_TASK_ENABLING_SUBF_FAIL "Enabling Compute Service Failed"
#define MTC_TASK_ENABLING_SUBF_TO "Enabling Compute Service Timeout"
#define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed, re-enabling"
#define MTC_TASK_ENABLE_WORK_FAIL_ "Enable Action Failed"
#define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout, re-enabling"
#define MTC_TASK_ENABLE_WORK_TO_ "Enable Action Timeout"
#define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed"
#define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout"
#define MTC_TASK_ENABLE_FAIL_HB "Enable Heartbeat Failure, re-enabling"
#define MTC_TASK_RECOVERY_FAIL "Graceful Recovery Failed, re-enabling"
#define MTC_TASK_RECOVERY_WAIT "Graceful Recovery Wait"
#define MTC_TASK_RECOVERED "Gracefully Recovered"
#define MTC_TASK_MAIN_CONFIG_FAIL "Configuration Failed, re-enabling"
#define MTC_TASK_SUBF_CONFIG_FAIL "Compute Configuration Failed, re-enabling"
#define MTC_TASK_SUBF_CONFIG_FAIL_ "Compute Configuration Failed"
#define MTC_TASK_MAIN_CONFIG_TO "Configuration Timeout, re-enabling"
#define MTC_TASK_MAIN_INTEST_FAIL "In-Test Failed, re-enabling"
#define MTC_TASK_MAIN_INTEST_TO "In-Test Timeout, re-enabling"
#define MTC_TASK_MAIN_SERVICE_FAIL "Start Services Failed, re-enabling"
#define MTC_TASK_MAIN_SERVICE_TO "Start Services Timeout, re-enabling"
#define MTC_TASK_SUBF_CONFIG_TO "Compute Configuration Timeout, re-enabling"
#define MTC_TASK_SUBF_CONFIG_TO_ "Compute Configuration Timeout"
#define MTC_TASK_INTEST_FAIL "In-Test Failed, re-enabling"
#define MTC_TASK_INTEST_FAIL_ "In-Test Failed"
#define MTC_TASK_INTEST_FAIL_TO "In-Test Timeout, re-enabling"
#define MTC_TASK_INTEST_FAIL_TO_ "In-Test Timeout"
#define MTC_TASK_SUBF_INTEST_FAIL "Compute In-Test Failed, re-enabling"
#define MTC_TASK_SUBF_INTEST_TO "Compute In-Test Timeout, re-enabling"
#define MTC_TASK_SUBF_SERVICE_FAIL "Compute Start Services Failed, re-enabling"
#define MTC_TASK_SUBF_SERVICE_TO "Compute Start Services Timeout, re-enabling"
#define MTC_TASK_AR_DISABLED_CONFIG "Configuration failure, threshold reached, Lock/Unlock to retry"
#define MTC_TASK_AR_DISABLED_GOENABLE "In-Test Failure, threshold reached, Lock/Unlock to retry"
#define MTC_TASK_AR_DISABLED_SERVICES "Service Failure, threshold reached, Lock/Unlock to retry"
#define MTC_TASK_AR_DISABLED_ENABLE "Enable Failure, threshold reached, Lock/Unlock to retry"
#define MTC_TASK_AR_DISABLED_HEARTBEAT "Heartbeat Failure, threshold reached, Lock/Unlock to retry"
#define MTC_TASK_RESET_FAIL "Reset Failed"
#define MTC_TASK_RESET_QUEUE "Reset Failed, retrying (%d of %d)"
#define MTC_TASK_POWERON_FAIL "Power-On Failed"
@ -275,8 +282,6 @@ void daemon_exit ( void );
#define MTC_TASK_RESETTING_HOST "Resetting Host, critical sensor"
#define MTC_TASK_CPE_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots"
#define MTC_TASK_SELF_UNLOCK_MSG "Unlocking active controller, please stand-by while it reboots"
#define MTC_TASK_AUTO_RECOVERY "Critical failure. Auto-recovery enabled, re-enabling"
#define MTC_TASK_AUTO_RECOVERY_DISABLED "Critical failure. Auto-recovery disabled, threshold reached"
#define MTC_TASK_FAILED_SWACT_REQ "Critical failure. Requesting SWACT to enabled standby controller"
#define MTC_TASK_FAILED_NO_BACKUP "Critical failure. Please provision/enable standby controller"
@ -1176,13 +1181,6 @@ typedef enum
MTC_STRESS_TEST__STAGES = 6,
} mtc_stressStages_enum ;
typedef union
{
mtc_enableStages_enum enable ;
mtc_disableStages_enum disable ;
int raw ;
} mtc_stages_union ;
typedef struct
{
mtc_nodeAdminAction_enum adminAction ;
@ -1210,6 +1208,24 @@ typedef enum
MAX_IFACES = 2
} iface_enum ;
/* Auto recovery Disable Causes */
typedef enum
{
MTC_AR_DISABLE_CAUSE__CONFIG,
MTC_AR_DISABLE_CAUSE__GOENABLE,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_AR_DISABLE_CAUSE__HEARTBEAT,
MTC_AR_DISABLE_CAUSE__LAST,
MTC_AR_DISABLE_CAUSE__NONE,
} autorecovery_disable_cause_enum ;
/* Service Based Auto Recovery Control Structure */
typedef struct
{
unsigned int count ; /* running back-2-back failure count */
bool disabled ; /* true if autorecovery is disabled */
} autorecovery_cause_ctrl_type ;
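The cause enum above is ordered so that MTC_AR_DISABLE_CAUSE__LAST can size the per-cause arrays introduced elsewhere in this change (ar_count, ar_threshold, ar_interval) while MTC_AR_DISABLE_CAUSE__NONE stays outside the valid index range. A minimal standalone sketch of that bookkeeping pattern, with assumed threshold values (the real ones are loaded from mtc config):

    #include <cstdio>

    enum autorecovery_disable_cause_enum
    {
        MTC_AR_DISABLE_CAUSE__CONFIG,
        MTC_AR_DISABLE_CAUSE__GOENABLE,
        MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
        MTC_AR_DISABLE_CAUSE__HEARTBEAT,
        MTC_AR_DISABLE_CAUSE__LAST,  /* sizes the per-cause arrays       */
        MTC_AR_DISABLE_CAUSE__NONE,  /* 'no failure' ; not a valid index */
    };

    int main ()
    {
        /* per-cause failure counts and thresholds, indexed by cause code ;
         * the threshold values here are demo assumptions only */
        unsigned int ar_count    [MTC_AR_DISABLE_CAUSE__LAST] = { 0 } ;
        unsigned int ar_threshold[MTC_AR_DISABLE_CAUSE__LAST] = { 2, 2, 2, 2 } ;
        bool ar_disabled = false ;

        /* simulate back-to-back goenable failures */
        autorecovery_disable_cause_enum cause = MTC_AR_DISABLE_CAUSE__GOENABLE ;
        for ( int failure = 0 ; failure < 3 && ! ar_disabled ; failure++ )
        {
            if ( ++ar_count[cause] >= ar_threshold[cause] )
                ar_disabled = true ; /* give up ; Lock/Unlock to retry */
        }
        printf ("goenable count:%u disabled:%d\n", ar_count[cause], ar_disabled );
        return 0 ;
    }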
/** Returns true if the specified admin state string is valid */
bool adminStateOk ( string admin );

View File

@ -236,9 +236,6 @@ nodeLinkClass::nodeLinkClass()
this->loc_recovery_timeout = 0;
this->node_reinstall_timeout = 0;
this->token_refresh_rate = 0;
this->autorecovery_enabled = false ;
this->autorecovery_disabled = false ;
head = tail = NULL;
memory_allocs = 0 ;
@ -313,6 +310,11 @@ nodeLinkClass::nodeLinkClass()
sysinv_noncrit_timeout = HTTP_SYSINV_NONC_TIMEOUT ;
work_queue_timeout = MTC_WORKQUEUE_TIMEOUT ;
/* Init the auto recovery threshold and intervals to zero until
* modified by daemon config */
memset (&ar_threshold, 0, sizeof(ar_threshold));
memset (&ar_interval, 0, sizeof(ar_interval));
/* Inservice test periods in seconds - 0 = disabled */
insv_test_period = 0 ;
oos_test_period = 0 ;
@ -340,7 +342,6 @@ nodeLinkClass::nodeLinkClass()
tokenEvent.buf = NULL ;
unknown_host_throttle = 0 ;
invalid_arg_throttle = 0 ;
testmode = 0 ;
module_init( );
@ -564,6 +565,11 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->was_dor_recovery_mode= false ;
ptr->dor_recovery_time = 0 ;
ptr->ar_disabled = false ;
ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
memset (&ptr->ar_count, 0, sizeof(ptr->ar_count));
ptr->ar_log_throttle = 0 ;
mtcTimer_init ( ptr->mtcTimer, hostname, "mtc timer"); /* Init node's general mtc timer */
mtcTimer_init ( ptr->insvTestTimer, hostname, "insv test timer");
mtcTimer_init ( ptr->oosTestTimer, hostname, "oos test timer"); /* Init node's oos test timer */
@ -603,7 +609,8 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->subStage = MTC_SUBSTAGE__DONE ;
ptr->reinstallStage = MTC_REINSTALL__DONE ;
ptr->resetStage = MTC_RESET__START ;
ptr->handlerStage.enable = MTC_ENABLE__START ; /* Enable and Disable */
ptr->enableStage = MTC_ENABLE__START ;
ptr->disableStage = MTC_DISABLE__START ;
ptr->oos_test_count = 0 ;
ptr->insv_test_count = 0 ;
@ -613,9 +620,8 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->uptime_refresh_counter = 0 ;
ptr->node_unlocked_counter = 0 ;
/* Default to a healthy config until mtcAlive messages prove otherwise */
ptr->mtce_flags = ( MTC_FLAG__I_AM_CONFIGURED |
MTC_FLAG__I_AM_HEALTHY ) ;
/* Good health needs to be learned */
ptr->mtce_flags = 0 ;
ptr->graceful_recovery_counter = 0 ;
ptr->health_threshold_counter = 0 ;
@ -746,8 +752,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
ptr->adminAction = MTC_ADMIN_ACTION__NONE ;
ptr->adminAction_todo_list.clear();
ptr->handlerStage.enable = MTC_ENABLE__START;
hosts++ ;
/* (re)build the Resource Reference Array */
@ -4504,16 +4508,31 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
}
else
{
//bool want_degrade = true ;
//if ( this->hbs_failure_action == HBS_FAILURE_ACTION__ALARM )
// want_degrade = false ;
// alarm_enabled_failure (node_ptr, want_degrade);
/* handle auto recovery for heartbeat failure during enable */
if ( node_ptr->ar_cause == MTC_AR_DISABLE_CAUSE__HEARTBEAT )
return ;
else if ( node_ptr->enableStage == MTC_ENABLE__HEARTBEAT_SOAK )
{
elog ("%s %s *** Heartbeat Loss *** (during enable soak)\n",
hostname.c_str(),
get_iface_name_str(iface));
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__HEARTBEAT,
MTC_TASK_AR_DISABLED_HEARTBEAT ) == PASS )
{
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
}
return ;
}
mnfa_add_host ( node_ptr , iface );
if ( mnfa_active == false )
{
elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
if ( iface == INFRA_IFACE )
{
node_ptr->heartbeat_failed[INFRA_IFACE] = true ;
@ -4546,6 +4565,7 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
{
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
}
}
}
@ -5485,7 +5505,7 @@ int nodeLinkClass::critical_process_failed( string & hostname,
/* Special critical process failure handling for AIO system */
if ( THIS_HOST && ( is_inactive_controller_main_insv() == false ))
{
if ( this->autorecovery_disabled == true )
if ( node_ptr->ar_disabled == true )
{
dlog ("%s bypassing persistent critical process failure\n",
node_ptr->hostname.c_str());
@ -5510,7 +5530,7 @@ int nodeLinkClass::critical_process_failed( string & hostname,
dlog ("%s adminState:%s EnableStage:%s\n",
node_ptr->hostname.c_str(),
adminAction_enum_to_str(node_ptr->adminAction).c_str(),
get_enableStages_str(node_ptr->handlerStage.enable).c_str());
get_enableStages_str(node_ptr->enableStage).c_str());
}
return (PASS);
}
@ -5843,7 +5863,7 @@ int nodeLinkClass::set_enableStage ( string & hostname,
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr != NULL )
{
node_ptr->handlerStage.enable = stage ;
node_ptr->enableStage = stage ;
return (PASS);
}
return (FAIL);
@ -5867,7 +5887,7 @@ mtc_enableStages_enum nodeLinkClass::get_enableStage ( string & hostname)
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
if ( node_ptr != NULL )
{
return ( node_ptr->handlerStage.enable ) ;
return ( node_ptr->enableStage ) ;
}
return (MTC_ENABLE__STAGES);
}
@ -6124,16 +6144,15 @@ int nodeLinkClass::adminActionChange ( struct nodeLinkClass::node * node_ptr,
node_ptr->node_unlocked_counter++ ;
}
if ( is_controller ( node_ptr ) )
autorecovery_clear (node_ptr->hostname);
ar_enable (node_ptr);
node_ptr->handlerStage.enable = MTC_ENABLE__START ;
node_ptr->enableStage = MTC_ENABLE__START ;
break ;
}
case MTC_ADMIN_ACTION__LOCK:
case MTC_ADMIN_ACTION__FORCE_LOCK:
{
node_ptr->handlerStage.disable = MTC_DISABLE__START ;
node_ptr->disableStage = MTC_DISABLE__START ;
break ;
}
case MTC_ADMIN_ACTION__RESET:
@ -6423,26 +6442,26 @@ int nodeLinkClass::enableStageChange ( struct nodeLinkClass::node * node_ptr,
{
/* TODO: Consider converting stage to strings ... */
if (( newHdlrStage >= MTC_ENABLE__STAGES ) ||
( node_ptr->handlerStage.enable >= MTC_ENABLE__STAGES ))
( node_ptr->enableStage >= MTC_ENABLE__STAGES ))
{
slog ("%s has invalid Enable stage (%d:%d)\n",
node_ptr->hostname.c_str(),
node_ptr->handlerStage.enable,
node_ptr->enableStage,
newHdlrStage );
node_ptr->handlerStage.enable = MTC_ENABLE__FAILURE ;
node_ptr->enableStage = MTC_ENABLE__FAILURE ;
/* TODO: cause failed or degraded state ? */
return (FAIL);
}
else if ( node_ptr->handlerStage.enable != newHdlrStage )
else if ( node_ptr->enableStage != newHdlrStage )
{
clog ("%s %s -> %s\n",
node_ptr->hostname.c_str(),
get_enableStages_str(node_ptr->handlerStage.enable).c_str(),
get_enableStages_str(node_ptr->enableStage).c_str(),
get_enableStages_str(newHdlrStage).c_str());
node_ptr->handlerStage.enable = newHdlrStage ;
node_ptr->enableStage = newHdlrStage ;
return (PASS);
}
else
@ -6450,7 +6469,7 @@ int nodeLinkClass::enableStageChange ( struct nodeLinkClass::node * node_ptr,
/* No state change */
dlog1 ("%s %s -> %s\n",
node_ptr->hostname.c_str(),
get_enableStages_str(node_ptr->handlerStage.enable).c_str(),
get_enableStages_str(node_ptr->enableStage).c_str(),
get_enableStages_str(newHdlrStage).c_str());
return (PASS);
}
@ -6461,15 +6480,15 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_disableStages_enum newHdlrStage )
{
/* TODO: Consider converting stage to strings ... */
if (( newHdlrStage >= MTC_DISABLE__STAGES ) ||
( node_ptr->handlerStage.disable >= MTC_DISABLE__STAGES ))
( node_ptr->disableStage >= MTC_DISABLE__STAGES ))
{
slog ("%s has invalid disable stage (%d:%d)\n",
node_ptr->hostname.c_str(),
node_ptr->handlerStage.disable,
node_ptr->disableStage,
newHdlrStage );
node_ptr->handlerStage.disable = MTC_DISABLE__DISABLED ;
node_ptr->disableStage = MTC_DISABLE__DISABLED ;
/* TODO: cause failed or degraded state ? */
return (FAIL);
@ -6478,10 +6497,10 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr,
{
clog ("%s %s -> %s\n",
node_ptr->hostname.c_str(),
get_disableStages_str(node_ptr->handlerStage.disable).c_str(),
get_disableStages_str(node_ptr->disableStage).c_str(),
get_disableStages_str(newHdlrStage).c_str());
node_ptr->handlerStage.disable = newHdlrStage ;
node_ptr->disableStage = newHdlrStage ;
return (PASS);
}
}
@ -7053,61 +7072,131 @@ struct nodeLinkClass::node * nodeLinkClass::get_insvTestTimer ( timer_t tid )
return static_cast<struct node *>(NULL);
}
/*****************************************************************************
*
* Name : autorecovery_clear
* Name : ar_enable
*
* Assumptions: Applies when simplex.
*
* Description: Removes the auto recovery count file if it exists.
*
* Auto recovery count is tracked/preserved in a host named auto recovery
* counter file /tmp/hostname_ar_count.
*
*****************************************************************************/
void autorecovery_clear ( string hostname )
{
string ar_file = TMP_DIR_PATH + hostname + AUTO_RECOVERY_FILE_SUFFIX ;
if ( daemon_is_file_present (ar_file.data()))
{
wlog ("%s clearing autorecovery counter\n", hostname.c_str());
daemon_remove_file (ar_file.data());
}
}
/*****************************************************************************
*
* Name : manage_autorecovery
*
* Assumptions: Applies to the active controller only while simplex.
*
* Description: Issues an immediate lazy reboot if the autorecovery threshold
* is reached. Otherwise it disables autorecovery and returns
so we don't get a rolling boot loop.
* Description: Clears all auto recovery state for the specified host and
* removes the auto recovery count file if it exists.
*
* Auto recovery count is tracked/preserved in a host named auto recovery
* counter file /etc/mtc/tmp/hostname_ar_count.
*
* In the event of a persistent autorecovery failure that results in a
* disable, the active controller goes enabled-degraded with a Horizon
* status that indicates the active controller has a critical failure but
* auto recovery is disabled. The enable alarm is raised.
*
*****************************************************************************/
void nodeLinkClass::manage_autorecovery ( struct nodeLinkClass::node * node_ptr )
void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr )
{
/* manage removing the auto recovery threshold count file */
if ( ( THIS_HOST ) &&
( this->autorecovery_enabled == true ) &&
( this->autorecovery_disabled == false ) &&
( is_inactive_controller_main_insv() == false ))
string ar_file = TMP_DIR_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ;
if ( daemon_is_file_present (ar_file.data()))
{
int value = 0 ;
int threshold = daemon_get_cfg_ptr()->autorecovery_threshold ;
wlog ("%s clearing autorecovery file counter\n", node_ptr->hostname.c_str());
daemon_remove_file (ar_file.data());
}
if (( node_ptr->ar_disabled ) ||
( node_ptr->ar_cause != MTC_AR_DISABLE_CAUSE__NONE ))
{
wlog ("%s re-enabling autorecovery\n", node_ptr->hostname.c_str());
}
node_ptr->ar_disabled = false ;
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
memset (&node_ptr->ar_count, 0, sizeof(node_ptr->ar_count));
node_ptr->ar_log_throttle = 0 ;
}
/*****************************************************************************
*
* Name : ar_manage
*
* Purpose : Manage Auto Recovery state.
*
* Description: The following checks and operations are performed ...
*
* Pre Checks:
*
* Validate auto recovery cause code
* Return if already in ar_disabled state. Unlikely, but a safeguard.
*
* Manage Auto Recovery:
*
* Case 1: Failed active controller with no enabled inactive controller.
*
* Requires persistent count file and self reboot until threshold
* is reached.
*
* Issues an immediate lazy reboot if the autorecovery threshold
* is not reached. Otherwise it disables autorecovery and returns
* so we don't get a rolling boot loop.
*
* Auto recovery count is tracked/preserved in a host named auto
* recovery counter file /etc/mtc/tmp/hostname_ar_count.
*
* Case 2: All other cases
*
* Case 2a: No auto recovery thresholding of the active controller in
* non-AIO SX, where the user can't lock and unlock the active
* controller.
*
* Maintain auto recovery count and set ar_disabled for the host when
* the threshold is reached.
*
* Parameters:
*
* node_ptr nodeLinkClass ptr of failing host.
*
* cause autorecovery_disable_cause_enum failure cause code.
*
* ar_disable_banner host status string to display when auto recovery
* threshold is reached and autorecovery is disabled.
*
* Returns:
*
* FAIL tells the caller to break out of its FSM at the earliest
* opportunity because the auto recovery threshold is reached and
* auto recovery is disabled.
*
* PASS tells the caller that the threshold is not reached and to
* continue handling the failure.
*
******************************************************************************/
int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
autorecovery_disable_cause_enum cause,
string ar_disable_banner )
{
int rc = FAIL ;
/* Auto recovery only applies for hosts that are unlocked
* and not already in ar_disabled state */
if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
( node_ptr->ar_disabled ))
{
return (rc);
}
/* check for invalid call case */
if ( cause >= MTC_AR_DISABLE_CAUSE__LAST )
{
slog ("%s called with invalid auto recovery cause (%d)",
node_ptr->hostname.c_str(), cause );
return (rc);
}
/* update cause code */
if ( node_ptr->ar_cause != cause )
node_ptr->ar_cause = cause ;
/* Case 1 check */
if ( ( THIS_HOST ) && ( is_inactive_controller_main_insv() == false ))
{
/* manage the auto recovery threshold count file */
unsigned int value = 0 ;
string ar_file = TMP_DIR_PATH +
node_ptr->hostname +
AUTO_RECOVERY_FILE_SUFFIX ;
if ( daemon_is_file_present (ar_file.data()))
{
@ -7119,48 +7208,75 @@ void nodeLinkClass::manage_autorecovery ( struct nodeLinkClass::node * node_ptr
/* Save the new value in the file */
daemon_log_value ( ar_file.data(), value );
value = daemon_get_file_int ( ar_file.data() );
/* set rc to reflect what the caller should do */
if ( value > threshold )
if ( value > this->ar_threshold[node_ptr->ar_cause] )
{
elog ("%s auto recovery threshold exceeded (%d)\n",
node_ptr->hostname.c_str(), threshold );
node_ptr->hostname.c_str(),
this->ar_threshold[node_ptr->ar_cause] );
this->autorecovery_disabled = true ;
if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
{
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL ) ;
}
else
{
alarm_enabled_failure ( node_ptr , true ) ;
}
node_ptr->ar_disabled = true ;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
allStateChange ( node_ptr, node_ptr->adminState,
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
mtcInvApi_update_task ( node_ptr,
MTC_TASK_AUTO_RECOVERY_DISABLED );
mtcInvApi_update_task ( node_ptr, ar_disable_banner );
return ;
return (rc);
}
wlog ("%s auto recovery (try %d of %d)\n",
node_ptr->hostname.c_str(), value , threshold );
wlog ("%s auto recovery (try %d of %d) (%d)",
node_ptr->hostname.c_str(),
value,
this->ar_threshold[node_ptr->ar_cause],
node_ptr->ar_cause);
mtcInvApi_update_states_now ( node_ptr,
"unlocked",
"disabled",
"failed",
"disabled",
"failed" );
mtcInvApi_update_task_now ( node_ptr,
MTC_TASK_AUTO_RECOVERY );
mtcInvApi_update_states_now ( node_ptr, "unlocked",
"disabled", "failed",
"disabled", "failed" );
lazy_graceful_fs_reboot ( node_ptr );
}
else /* Case 2 */
{
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" );
if (( NOT_THIS_HOST ) &&
( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ))
{
if ( ++node_ptr->ar_count[node_ptr->ar_cause] >=
this->ar_threshold [node_ptr->ar_cause] )
{
elog ("%s auto recovery threshold exceeded (%d)\n",
node_ptr->hostname.c_str(),
this->ar_threshold[node_ptr->ar_cause] );
node_ptr->ar_disabled = true ;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
mtcInvApi_update_task ( node_ptr, ar_disable_banner );
rc = FAIL ;
}
else
{
wlog ("%s auto recovery (try %d of %d) (%d)",
node_ptr->hostname.c_str(),
node_ptr->ar_count[node_ptr->ar_cause],
this->ar_threshold[node_ptr->ar_cause],
node_ptr->ar_cause);
rc = PASS ;
}
}
else
{
wlog ("%s auto recovery\n", node_ptr->hostname.c_str());
rc = PASS ;
}
}
return (rc);
}
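Case 1 above survives its own lazy reboot by keeping the failure count in a per-host counter file through mtce's daemon_is_file_present(), daemon_get_file_int() and daemon_log_value() helpers. A rough standalone equivalent of that read-increment-write cycle, assuming the file simply holds an ASCII integer (the path below is hypothetical, modeled on the /etc/mtc/tmp/hostname_ar_count name used above):

    #include <fstream>
    #include <iostream>
    #include <string>

    static unsigned int bump_ar_count ( const std::string & ar_file )
    {
        unsigned int value = 0 ;

        std::ifstream in ( ar_file.c_str() );
        if ( in.good() )
            in >> value ;             /* previous count, if the file exists */
        in.close();

        value++ ;                     /* account for this failure */

        std::ofstream out ( ar_file.c_str(), std::ios::trunc );
        out << value << std::endl ;   /* persist across the reboot */
        return value ;
    }

    int main ()
    {
        unsigned int count = bump_ar_count ("/tmp/controller-0_ar_count");
        std::cout << "auto recovery try " << count << std::endl ;
        return 0 ;
    }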
/****************************************************************************
@ -7198,13 +7314,8 @@ void nodeLinkClass::report_dor_recovery ( struct nodeLinkClass::node * node_ptr,
void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
{
/* don't do a full enable if active controller in simplex mode */
if ( THIS_HOST && SIMPLEX )
{
wlog ("%s avoiding full enable of simplex system\n", node_ptr->hostname.c_str());
wlog ("%s ... lock and unlock host to force recovery\n", node_ptr->hostname.c_str());
if ( node_ptr->ar_disabled == true )
return ;
}
if ( node_ptr->was_dor_recovery_mode )
{
@ -8599,13 +8710,14 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr )
{
char str[MAX_MEM_LOG_DATA] ;
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAdd:%d Offline:%d: Swact:%d Recovery:%d Able:%d\n",
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAdd:%d Offline:%d: Swact:%d Recovery:%d Enable:%d Disable:%d\n",
node_ptr->hostname.c_str(),
node_ptr->addStage,
node_ptr->offlineStage,
node_ptr->swactStage,
node_ptr->recoveryStage,
node_ptr->handlerStage.raw);
node_ptr->enableStage,
node_ptr->disableStage);
mem_log (str);
}

View File

@ -254,13 +254,9 @@ private:
mtc_nodeOperState_enum operState_dport ; /**< Data Port Operational State */
mtc_nodeAvailStatus_enum availStatus_dport; /**< Data Port Availability Status */
/** Maintains the current handler stage.
* This is a union of all handler types such as enable,
* disable, degrade etc. See nodeBase.h for list of union members */
mtc_stages_union handlerStage;
/* Individual FSM handler stages */
mtc_enableStages_enum enableStage ;
mtc_disableStages_enum disableStage ;
mtc_offlineStages_enum offlineStage ;
mtc_onlineStages_enum onlineStage ;
mtc_swactStages_enum swactStage ;
@ -380,6 +376,24 @@ private:
/** when true requests the task for this host be cleared at first opportunity */
bool clear_task ;
/******* Auto Recovery Control Structure and member Functions ********/
/* reason/cause based host level enable failure counter */
unsigned int ar_count[MTC_AR_DISABLE_CAUSE__LAST] ;
/* The last enable failure reason/cause.
* Note: MTC_AR_DISABLE_CAUSE__NONE is no failure (default) */
autorecovery_disable_cause_enum ar_cause ;
/* when true indicates that a host has reached its enable failure
* threshold and is left in the unlocked-disabled state */
bool ar_disabled ;
/* throttles the ar_disabled log to periodically indicate auto
* recovery disabled state but avoid flooding that same message. */
#define AR_LOG_THROTTLE_THRESHOLD (100000)
unsigned int ar_log_throttle ;
/** Host's mtc timer struct. Use to time handler stages.
*
* reset -> reset command response
@ -870,9 +884,10 @@ private:
int update_dport_states ( struct nodeLinkClass::node * node_ptr, int event );
/* manage deciding to return or issue an immediate reboot if the
* auto recovery threshold is exceeded. */
void manage_autorecovery ( struct nodeLinkClass::node * node_ptr );
/* manage auto recovery */
int ar_manage ( struct nodeLinkClass::node * node_ptr,
autorecovery_disable_cause_enum cause,
string ar_disable_banner );
/** ***********************************************************************
*
@ -1041,6 +1056,12 @@ private:
void clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr );
void clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr );
/* Enables/Clears dynamic auto recovery state. start fresh !
* called in disabled_handler (lock) and in the DONE stages
* of the enable handler. */
void ar_enable ( struct nodeLinkClass::node * node_ptr );
/** Find the node that has this timerID in its general mtc timer */
struct nodeLinkClass::node * get_mtcTimer_timer ( timer_t tid );
struct nodeLinkClass::node * get_mtcConfig_timer ( timer_t tid );
@ -2005,22 +2026,36 @@ public:
int compute_mtcalive_timeout;
int controller_mtcalive_timeout ;
int goenabled_timeout ;
/** /etc/mtc.conf configurable audit intervals */
int swact_timeout ;
int sysinv_timeout ;
int sysinv_noncrit_timeout ;
int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout */
int work_queue_timeout ;
int node_reinstall_timeout ;
/** /etc/mtc.ini configurable audit intervals */
int insv_test_period ;
int oos_test_period ;
int uptime_period ;
int online_period ;
int token_refresh_rate;
/* Service specific max failures before autorecovery is disabled.
*
* ... values for each service are loaded from mtc config
* file at daemon startup
*/
unsigned int ar_threshold[MTC_AR_DISABLE_CAUSE__LAST] ;
/* service specific secs between autorecovery retries.
*
* ... values for each service are loaded from mtc config
* file at daemon startup
*/
unsigned int ar_interval[MTC_AR_DISABLE_CAUSE__LAST] ;
int unknown_host_throttle ;
int invalid_arg_throttle ;
};
/**
@ -2052,7 +2087,6 @@ const char * get_adminAction_str ( mtc_nodeAdminAction_enum action );
string bmc_get_ip ( string hostname, string mac , string & current_bm_ip );
void clear_host_degrade_causes ( unsigned int & degrade_mask );
bool sensor_monitoring_supported ( string hostname );
void autorecovery_clear ( string hostname );
void log_mnfa_pool ( std::list<string> & mnfa_awol_list );
#endif /* __INCLUDE_NODECLASS_H__ */

View File

@ -316,6 +316,25 @@ static int mtc_config_handler ( void * user,
mtcInv.offline_threshold = atoi(value);
ilog ("OfflineThrsh: %d\n", mtcInv.offline_threshold );
}
else if (MATCH("agent", "ar_config_threshold"))
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__CONFIG] = atoi(value);
else if (MATCH("agent", "ar_goenable_threshold"))
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__GOENABLE] = atoi(value);
else if (MATCH("agent", "ar_hostservices_threshold"))
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value);
else if (MATCH("agent", "ar_heartbeat_threshold"))
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value);
else if (MATCH("agent", "ar_config_interval"))
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__CONFIG] = atoi(value);
else if (MATCH("agent", "ar_goenable_interval"))
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__GOENABLE] = atoi(value);
else if (MATCH("agent", "ar_hostservices_interval"))
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value);
else if (MATCH("agent", "ar_heartbeat_interval"))
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value);
else
{
return (PASS);
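The MATCH(section, key) chain above follows the common inih-style section/name/value callback pattern. A standalone sketch of that pattern for one of the new [agent] keys; the MATCH macro body and the example value are assumptions here, only the section and key names come from this diff:

    #include <cstdlib>
    #include <cstring>

    #define MATCH(s, n) ((strcmp(section, (s)) == 0) && (strcmp(name, (n)) == 0))

    struct agent_cfg { unsigned int ar_config_threshold ; };

    static int config_handler_sketch ( void * user, const char * section,
                                       const char * name, const char * value )
    {
        agent_cfg * cfg = static_cast<agent_cfg *>(user);
        if (MATCH("agent", "ar_config_threshold"))
            cfg->ar_config_threshold = atoi(value);
        return 1 ; /* non-zero tells the parser to keep going */
    }

    int main ()
    {
        agent_cfg cfg = { 0 } ;
        /* simulate the parser delivering one assumed mtc.conf entry :
         *   [agent]
         *   ar_config_threshold = 2 */
        config_handler_sketch ( &cfg, "agent", "ar_config_threshold", "2" );
        return ( cfg.ar_config_threshold == 2 ) ? 0 : 1 ;
    }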
@ -635,6 +654,20 @@ int daemon_configure ( void )
ilog("hwmond : %d (port)\n", mtc_config.hwmon_cmd_port );
ilog("auth_host : %s \n", mtc_config.keystone_auth_host );
/* log system wide service based auto recovery control values */
ilog("AR Config : %d (threshold) %d sec (retry interval)",
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__CONFIG],
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__CONFIG]);
ilog("AR GoEnable : %d (threshold) %d sec (retry interval)",
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__GOENABLE],
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__GOENABLE]);
ilog("AR Host Svcs: %d (threshold) %d sec (retry interval)",
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HOST_SERVICES],
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__HOST_SERVICES]);
ilog("AR Heartbeat: %d (threshold) %d sec (retry interval)",
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT],
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__HEARTBEAT]);
/* Get this Controller Activity State */
mtc_config.active = daemon_get_run_option ("active") ;
ilog ("Controller : %s\n",

View File

@ -118,7 +118,8 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
* the insv_test_handler gets run as soon as a host's main function is enabled.
****************************************************************************
*/
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
if (( node_ptr->ar_disabled == false ) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
((node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
(node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
@ -265,7 +266,7 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
{
flog ("%s -> Running SubFunction Enable handler (%d)\n",
node_ptr->hostname.c_str(),
node_ptr->handlerStage.enable );
node_ptr->enableStage );
nodeLinkClass::enable_subf_handler ( node_ptr );
}

View File

@ -446,6 +446,15 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
{
int rc = PASS ;
if ( node_ptr->ar_disabled == true )
{
wlog_throttled ( node_ptr->ar_log_throttle,
AR_LOG_THROTTLE_THRESHOLD,
"%s auto recovery disabled cause:%d",
node_ptr->hostname.c_str(), node_ptr->ar_cause );
return (RETRY);
}
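wlog_throttled above is an mtce logging helper whose definition is not part of this diff. A rough standalone sketch of the counter-based throttle it implies, assuming it logs the first occurrence and then one occurrence per threshold-many calls, so an ar_disabled host polled on every FSM pass does not flood the log:

    #include <cstdio>

    static void throttled_log ( unsigned int & counter,
                                unsigned int threshold,
                                const char * msg )
    {
        if (( counter++ % threshold ) == 0 )
            printf ("%s\n", msg );
    }

    int main ()
    {
        unsigned int ar_log_throttle = 0 ;
        for ( int pass = 0 ; pass < 250000 ; pass++ )
            throttled_log ( ar_log_throttle, 100000,
                            "controller-0 auto recovery disabled cause:1" );
        return 0 ; /* logs three times : passes 0, 100000 and 200000 */
    }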
if ( THIS_HOST )
{
/******************************************************************
@ -476,7 +485,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
}
switch ( (int)node_ptr->handlerStage.enable )
switch ( (int)node_ptr->enableStage )
{
case MTC_ENABLE__FAILURE:
{
@ -539,7 +548,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
if ( is_inactive_controller_main_insv() == true )
{
wlog ("%s has critical failure\n", node_ptr->hostname.c_str());
wlog ("%s ... requesting swact to in-service inactive controller\n", node_ptr->hostname.c_str());
wlog ("%s ... requesting swact to peer controller",
node_ptr->hostname.c_str());
mtcInvApi_update_task_now ( node_ptr, MTC_TASK_FAILED_SWACT_REQ );
@ -587,19 +597,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
this->autorecovery_enabled = true ;
/* use thresholded auto recovery for simplex failure case */
manage_autorecovery ( node_ptr );
if ( this->autorecovery_disabled == false )
{
wlog ("%s has critical failure.\n", node_ptr->hostname.c_str());
wlog ("%s ... downgrading to degrade with auto recovery disabled\n", node_ptr->hostname.c_str());
wlog ("%s ... to avoid disabling only enabled controller\n", node_ptr->hostname.c_str());
this->autorecovery_disabled = true ;
}
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
/* Raise Critical Compute Function Alarm */
@ -620,7 +617,22 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
MTC_AVAIL_STATUS__FAILED );
}
if ( degrade_only == true )
/* if we get here in controller simplex mode then go degraded
* if we are not already degraded. Otherwise, fail. */
if ( THIS_HOST && ( is_inactive_controller_main_insv() == false ))
{
if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
( node_ptr->operState != MTC_OPER_STATE__ENABLED ) ||
( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED))
{
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
}
/* adminAction state is already changed to NONE. */
}
else if ( degrade_only == true )
{
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__ENABLED,
@ -636,25 +648,28 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
/* Inform the VIM of the failure */
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
/* if we get here in controller simplex mode then go degraded
* if we are not already degraded. Otherwise, fail. */
if ( THIS_HOST && ( is_inactive_controller_main_insv() == false ))
/* handle thresholded auto recovery retry delay interval */
if ( node_ptr->ar_cause < MTC_AR_DISABLE_CAUSE__LAST )
{
/* autorecovery must be disabled */
if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
( node_ptr->operState != MTC_OPER_STATE__ENABLED ) ||
( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED))
unsigned int interval = this->ar_interval[node_ptr->ar_cause] ;
if ( interval )
{
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
/* Wait this failure cause's retry delay */
mtcTimer_start ( node_ptr->mtcTimer,
mtcTimer_handler,
interval );
wlog ("%s waiting %d secs before enable sequence retry (%d)",
node_ptr->hostname.c_str(),
interval, node_ptr->ar_cause );
}
/* adminAction state is already changed to NONE. */
else
node_ptr->mtcTimer.ring = true ;
}
else
{
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_WAIT );
}
node_ptr->mtcTimer.ring = true ;
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_WAIT );
break;
}
@ -717,6 +732,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
mtcCmd_workQ_purge ( node_ptr );
mtcCmd_doneQ_purge ( node_ptr );
node_ptr->mtce_flags = 0 ;
/* Assert the mtc alive gate */
node_ptr->mtcAlive_gate = true ;
@ -739,8 +756,9 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
/* enable auto recovery if the inactive controller
* is out of service */
if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST ))
this->autorecovery_enabled = true ;
//if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST ))
// node_ptr->ar_disabled = false ;
// this->autorecovery_enabled = true ;
/* fall through */
@ -757,20 +775,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
get_availStatus_str(node_ptr->availStatus).c_str());
mtcInvApi_update_task ( node_ptr, "" );
/* Special case */
// alarm_enabled_clear ( node_ptr, false );
//mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
//node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ;
//allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
// MTC_OPER_STATE__ENABLED,
// MTC_AVAIL_STATUS__DEGRADED );
// adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
// return (PASS);
}
else
{
@ -986,6 +990,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->mtcAlive_online = false ;
node_ptr->mtcAlive_offline = true ;
node_ptr->goEnabled = false ;
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
clear_service_readies ( node_ptr );
@ -1038,17 +1043,24 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_reset ( node_ptr->mtcTimer );
/* Check to see if the host is/got configured correctly */
if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED) == 0 )
if ((( node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED ) == 0 ) ||
(( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
{
elog ("%s configuration incomplete or failed (oob:%x:%x)\n",
elog ("%s configuration failed or incomplete (oob:%x)\n",
node_ptr->hostname.c_str(),
node_ptr->mtce_flags,
MTC_FLAG__I_AM_CONFIGURED);
node_ptr->mtce_flags);
/* raise an alarm for the failure of the config */
alarm_config_failure ( node_ptr );
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_CONFIG_FAIL );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__CONFIG,
MTC_TASK_AR_DISABLED_CONFIG ) != PASS )
break ;
}
else
{
@ -1152,6 +1164,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_ENABLE__GOENABLED_WAIT:
{
bool goenable_failed = false ;
/* The healthy code comes from the host in the mtcAlive message.
* This 'if' clause was introduced to detected failure of host
* without having to wait for the GOENABLED phase to timeout.
@ -1162,27 +1175,22 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
* be gracefully recovered to enabled in that case. Instead
* we want to recover the card through a reset as quickly as
* possible. */
if ( node_ptr->health == NODE_UNHEALTHY )
{
elog ("%s is UNHEALTHY\n", node_ptr->hostname.c_str());
mtcTimer_reset ( node_ptr->mtcTimer );
this->force_full_enable ( node_ptr );
}
/* search for the Go Enable message */
else if ( node_ptr->goEnabled_failed == true )
if (( node_ptr->health == NODE_UNHEALTHY ) ||
(( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)) ||
( node_ptr->goEnabled_failed == true ))
{
elog ("%s got GOENABLED Failed\n", node_ptr->hostname.c_str());
mtcTimer_reset ( node_ptr->mtcTimer );
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
goenable_failed = true ;
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL );
}
/* search for the Go Enable message */
else if ( node_ptr->goEnabled == true )
{
mtcTimer_reset ( node_ptr->mtcTimer );
plog ("%s got GOENABLED\n", node_ptr->hostname.c_str());
// plog ("%s main configured OK\n", node_ptr->hostname.c_str());
/* O.K. clearing the state now that we got it */
node_ptr->goEnabled = false ;
@ -1194,26 +1202,28 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
else if ( mtcTimer_expired ( node_ptr->mtcTimer ))
{
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
ilog ("%s ... the out-of-service tests took too long to complete\n",
node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL_TO_ );
elog ("%s has GOENABLED Timeout", node_ptr->hostname.c_str());
node_ptr->mtcTimer.ring = false ;
/* raise an alarm for the enable failure */
alarm_enabled_failure ( node_ptr , true );
/* go back and issue reboot again */
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
/* no longer In-Test ; we are 'Failed' again" */
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__FAILED );
goenable_failed = true ;
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO );
}
else
{
; /* wait some more */
}
if ( goenable_failed )
{
alarm_enabled_failure ( node_ptr, true );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__GOENABLE,
MTC_TASK_AR_DISABLED_GOENABLE ) != PASS )
break ;
}
break ;
}
@ -1224,14 +1234,20 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
plog ("%s Starting Host Services\n", node_ptr->hostname.c_str());
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
{
node_ptr->hostservices_failed = true ;
elog ("%s %s failed ; launch\n",
node_ptr->hostname.c_str(),
node_ptr->host_services_req.name.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_START_SERVICE_FAIL );
alarm_enabled_failure ( node_ptr, true );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_SERVICE_FAIL );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
break ;
}
else
{
@ -1261,6 +1277,10 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
else if ( rc != PASS )
{
node_ptr->hostservices_failed = true ;
alarm_enabled_failure ( node_ptr, true );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
/* distinguish 'timeout' from other 'execution' failures */
if ( rc == FAIL_TIMEOUT )
{
@ -1269,7 +1289,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->host_services_req.name.c_str());
mtcInvApi_update_task ( node_ptr,
MTC_TASK_START_SERVICE_TO );
MTC_TASK_MAIN_SERVICE_TO );
}
else
{
@ -1279,9 +1299,14 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
rc);
mtcInvApi_update_task ( node_ptr,
MTC_TASK_START_SERVICE_FAIL );
MTC_TASK_MAIN_SERVICE_FAIL );
}
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
break ;
}
else /* success path */
{
@ -1321,8 +1346,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_reset ( node_ptr->mtcTimer );
}
/* Start Monitoring Services - heartbeat, process and hardware */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
@ -1338,11 +1361,13 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
/* allow heartbeat to run for MTC_HEARTBEAT_SOAK_BEFORE_ENABLE
* seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
/* Start Monitoring Services - heartbeat, process and hardware */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
break ;
}
@ -1351,6 +1376,11 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->mtcTimer.ring == true )
{
plog ("%s heartbeating\n", node_ptr->hostname.c_str() );
/* handle auto recovery clear for this potential cause */
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
node_ptr->ar_count[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = 0 ;
/* if heartbeat is not working then we will
* never get here and enable the host */
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
@ -1490,7 +1520,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
node_ptr->enabled_count++ ;
/* Inform the VIM that this host is enabled */
@ -1505,6 +1534,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
node_ptr->health_threshold_counter = 0 ;
ar_enable ( node_ptr );
}
break ;
@ -2103,7 +2134,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
{
elog ("%s got GOENABLED Failed\n", node_ptr->hostname.c_str());
mtcTimer_reset ( node_ptr->mtcTimer );
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL );
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL );
this->force_full_enable ( node_ptr );
}
@ -2121,6 +2152,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
else if ( node_ptr->mtcTimer.ring == true )
{
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO );
node_ptr->mtcTimer.ring = false ;
@ -2640,7 +2672,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
{
int rc = PASS ;
switch ( (int)node_ptr->handlerStage.disable )
switch ( (int)node_ptr->disableStage )
{
case MTC_DISABLE__START:
{
@ -2657,6 +2689,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
clear_subf_failed_bools ( node_ptr );
clear_hostservices_ctls ( node_ptr );
enableStageChange ( node_ptr, MTC_ENABLE__START ) ;
disableStageChange ( node_ptr, MTC_DISABLE__DIS_SERVICES_WAIT) ;
stop_offline_handler ( node_ptr );
@ -2758,7 +2791,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
/* If the stage is still MTC_DISABLE__DIS_SERVICES_WAIT then the
* host should already be powered on so lets send the stop
* services command */
if ( node_ptr->handlerStage.disable == MTC_DISABLE__DIS_SERVICES_WAIT )
if ( node_ptr->disableStage == MTC_DISABLE__DIS_SERVICES_WAIT )
{
bool start = false ;
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
@ -3042,6 +3075,9 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
recovery_ctrl_init ( node_ptr->hwmon_reset );
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
/* re-enable auto recovery */
ar_enable ( node_ptr );
/* Load configured mtcAlive and goEnabled timers */
LOAD_NODETYPE_TIMERS ;
@ -3055,7 +3091,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
default:
{
elog ("%s Bad Case (%d)\n", node_ptr->hostname.c_str(),
node_ptr->handlerStage.disable );
node_ptr->disableStage );
rc = FAIL_BAD_CASE ;
}
}
@ -3273,14 +3309,20 @@ int nodeLinkClass::online_handler ( struct nodeLinkClass::node * node_ptr )
int rc = PASS ;
/* don't need to manage the offline or online state
* for the following availability states */
if (( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
* for the following states
* ... auto recovery state
* ... enable stages
* ... availability states */
if (( node_ptr->ar_disabled == true ) ||
( node_ptr->enableStage == MTC_ENABLE__FAILURE ) ||
( node_ptr->enableStage == MTC_ENABLE__FAILURE_WAIT ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFDUTY ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__INTEST ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__NOT_INSTALLED ))
{
return (PASS);
return (rc);
}
switch ( (int)node_ptr->onlineStage )
@ -4648,6 +4690,8 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
recovery_ctrl_init ( node_ptr->hwmon_reset );
recovery_ctrl_init ( node_ptr->hwmon_powercycle );
ar_enable ( node_ptr );
mtcInvApi_force_task ( node_ptr, "" );
break ;
}
@ -5537,25 +5581,69 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
case MTC_ADD__CLEAR_TASK:
{
if ( is_controller(node_ptr) )
/* Check for hosts that were in the auto recovery disabled state */
if ( !node_ptr->task.empty () )
{
if ( node_ptr->mtcTimer.ring == true )
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
(( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG)) ||
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_GOENABLE))||
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_SERVICES))||
( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT))))
{
if ( !node_ptr->task.empty () )
if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_CONFIG ))
{
mtcInvApi_force_task ( node_ptr, "" );
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__CONFIG ;
alarm_config_failure ( node_ptr );
}
else if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_GOENABLE ))
{
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__GOENABLE ;
alarm_enabled_failure ( node_ptr, true );
}
else if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_SERVICES ))
{
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__HOST_SERVICES ;
alarm_enabled_failure ( node_ptr, true );
}
else if ( !node_ptr->task.compare(MTC_TASK_AR_DISABLED_HEARTBEAT ))
{
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__HEARTBEAT ;
}
node_ptr->ar_disabled = true ;
if ( THIS_HOST )
mtcInvApi_update_states ( node_ptr, "unlocked", "enabled", "degraded" );
else
mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" );
node_ptr->addStage = MTC_ADD__START;
node_ptr->add_completed = true ;
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
plog ("%s Host Add Completed ; auto recovery disabled state (uptime:%d)\n",
node_ptr->hostname.c_str(), node_ptr->uptime );
break ;
}
else
{
break ;
if ( is_controller(node_ptr) )
{
if ( node_ptr->mtcTimer.ring == true )
{
mtcInvApi_force_task ( node_ptr, "" );
}
else
{
break ;
}
}
else
{
/* do it immediately for all other server types */
mtcInvApi_force_task ( node_ptr, "" );
}
}
}
else
{
/* do it immediately for all other server types */
mtcInvApi_force_task ( node_ptr, "" );
}
/* default retries counter to zero before START_SERVICES */
node_ptr->retries = 0 ;
node_ptr->addStage = MTC_ADD__START_SERVICES ;
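The MTC_ADD__CLEAR_TASK stage above re-learns a host's auto-recovery-disabled state after an mtcAgent restart purely from the task banner persisted in the inventory. A standalone sketch of just that banner-to-cause mapping, using the MTC_TASK_AR_DISABLED_* strings defined earlier in this diff (the alarm raising and state updates are omitted):

    #include <iostream>
    #include <string>

    enum autorecovery_disable_cause_enum
    {
        MTC_AR_DISABLE_CAUSE__CONFIG,
        MTC_AR_DISABLE_CAUSE__GOENABLE,
        MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
        MTC_AR_DISABLE_CAUSE__HEARTBEAT,
        MTC_AR_DISABLE_CAUSE__LAST,
        MTC_AR_DISABLE_CAUSE__NONE,
    };

    static autorecovery_disable_cause_enum
    ar_cause_from_task ( const std::string & task )
    {
        if ( ! task.compare("Configuration failure, threshold reached, Lock/Unlock to retry") )
            return MTC_AR_DISABLE_CAUSE__CONFIG ;
        if ( ! task.compare("In-Test Failure, threshold reached, Lock/Unlock to retry") )
            return MTC_AR_DISABLE_CAUSE__GOENABLE ;
        if ( ! task.compare("Service Failure, threshold reached, Lock/Unlock to retry") )
            return MTC_AR_DISABLE_CAUSE__HOST_SERVICES ;
        if ( ! task.compare("Heartbeat Failure, threshold reached, Lock/Unlock to retry") )
            return MTC_AR_DISABLE_CAUSE__HEARTBEAT ;
        return MTC_AR_DISABLE_CAUSE__NONE ; /* not an AR-disabled banner */
    }

    int main ()
    {
        std::string task = "In-Test Failure, threshold reached, Lock/Unlock to retry" ;
        std::cout << "cause:" << ar_cause_from_task ( task ) << std::endl ;
        return 0 ;
    }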
@ -6017,55 +6105,6 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->start_services_running_main);
}
}
#endif
/* Avoid forcing the states to the database when on the first & second pass.
* This is because it is likely we just read all the states and
* if coming out of a DOR or a SWACT we don't need to unnecessarily
* produce that extra sysinv traffic.
* Also, no point forcing the states while there is an admin action
* or enable or graceful recovery going on as well because state changes
* are being done in the FSM already */
if (( node_ptr->oos_test_count > 1 ) &&
( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
( !node_ptr->handlerStage.raw ) &&
( !node_ptr->recoveryStage ))
{
/* Change the oper and avail states in the database */
allStateChange ( node_ptr, node_ptr->adminState,
node_ptr->operState,
node_ptr->availStatus );
}
#ifdef WANT_CLEAR_ALARM_AUDIT
/* TODO: Obsolete with new Alarm Strategy */
/* Self Correct Stuck Failure Alarms */
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
(( node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
{
if ( node_ptr->alarms[MTC_ALARM_ID__CONFIG] != FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ;
}
alarm_enabled_clear ( node_ptr , false);
}
#endif
/* Make sure the locked status on the host itself is set */
if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__DISABLED ) &&
( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) &&
( !(node_ptr->mtce_flags & MTC_FLAG__I_AM_LOCKED) ))
{
ilog ("%s setting 'locked' status\n", node_ptr->hostname.c_str());
/* Tell the host that it is locked */
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE);
}
if (( daemon_is_file_present ( MTC_CMD_FIT__GOENABLE_AUDIT )) &&
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
@ -6078,6 +6117,43 @@ int nodeLinkClass::oos_test_handler ( struct nodeLinkClass::node * node_ptr )
send_mtc_cmd ( node_ptr->hostname, MTC_REQ_SUBF_GOENABLED, MGMNT_INTERFACE );
}
}
#endif
if ( node_ptr->ar_disabled == true )
{
elog ( "%s auto recovery disabled cause:%d",
node_ptr->hostname.c_str(),
node_ptr->ar_cause );
}
/* Avoid forcing the states to the database when on the first & second pass.
* This is because it is likely we just read all the states and
* if coming out of a DOR or a SWACT we don't need to unnecessarily
* produce that extra sysinv traffic.
* Also, no point forcing the states while there is an admin action
* or enable or graceful recovery going on as well because state changes
* are being done in the FSM already */
if (( node_ptr->oos_test_count > 1 ) &&
( node_ptr->adminAction == MTC_ADMIN_ACTION__NONE ) &&
( !node_ptr->enableStage ) && ( !node_ptr->recoveryStage ))
{
/* Change the oper and avail states in the database */
allStateChange ( node_ptr, node_ptr->adminState,
node_ptr->operState,
node_ptr->availStatus );
}
/* Make sure the locked status on the host itself is set */
if (( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__DISABLED ) &&
( node_ptr->availStatus == MTC_AVAIL_STATUS__ONLINE ) &&
( !(node_ptr->mtce_flags & MTC_FLAG__I_AM_LOCKED) ))
{
ilog ("%s setting 'locked' status\n", node_ptr->hostname.c_str());
/* Tell the host that it is locked */
send_mtc_cmd ( node_ptr->hostname , MTC_MSG_LOCKED, MGMNT_INTERFACE);
}
break ;
}
@ -6140,7 +6216,8 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
/* manage degrade state and alarms */
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
( node_ptr->ar_disabled == false ))
{
/************************************************************
* Manage In-Service Alarms *
@ -6170,7 +6247,6 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_INSV_TEST__RUN:
{
#ifdef WANT_FIT_TESTING
daemon_load_fit ();
@ -6229,22 +6305,20 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
* controller autorecovery. Otherwise enable it but in this case
* don't change the disable bool as that is used to gate auto
* recovery once the threshold is reached */
if ( is_controller ( node_ptr ) && NOT_THIS_HOST )
{
if (( this->autorecovery_enabled == true ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
{
autorecovery_clear ( CONTROLLER_0 );
autorecovery_clear ( CONTROLLER_1 );
this->autorecovery_enabled = false ;
this->autorecovery_disabled = false ;
}
else if (( this->autorecovery_enabled == false ) &&
( node_ptr->operState != MTC_OPER_STATE__ENABLED ))
{
this->autorecovery_enabled = true ;
}
}
// if ( is_controller ( node_ptr ) && NOT_THIS_HOST )
// {
// if (( node_ptr->ar_disabled == false ) &&
// ( node_ptr->operState == MTC_OPER_STATE__ENABLED ))
// {
// autorecovery_clear ( CONTROLLER_0 );
// autorecovery_clear ( CONTROLLER_1 );
// }
//else if (( node_ptr->ar_disabled == true ) &&
// ( node_ptr->operState != MTC_OPER_STATE__ENABLED ))
//{
// node_ptr->ar_disabled = false ;
//}
// }
/* Monitor the health of the host - no pass file */
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
@ -6440,7 +6514,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
*
**/
if (( node_ptr->operState_subf == MTC_OPER_STATE__DISABLED ) &&
( this->autorecovery_disabled == false ) &&
( node_ptr->ar_disabled == false ) &&
( node_ptr->start_services_needed == false ))
{
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE_SUBF ) &&
@ -6527,7 +6601,7 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
if (( daemon_is_file_present ( CONFIG_COMPLETE_FILE )) &&
( daemon_is_file_present ( CONFIG_FAIL_FILE )))
{
wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*3), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
wlog_throttled ( node_ptr->health_threshold_counter, (MTC_UNHEALTHY_THRESHOLD*10), "%s is UNHEALTHY\n", node_ptr->hostname.c_str());
if ( node_ptr->health_threshold_counter >= MTC_UNHEALTHY_THRESHOLD )
{
node_ptr->degrade_mask |= DEGRADE_MASK_CONFIG ;

View File

@ -43,20 +43,17 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
{
int rc = PASS ;
if ( node_ptr->ar_disabled == true )
{
enableStageChange ( node_ptr, MTC_ENABLE__START );
return (rc);
}
/* Setup the log prefix */
string name = node_ptr->hostname ;
name.append("-compute");
bool simplex = false ;
if (( SIMPLEX ) ||
(( THIS_HOST ) &&
(( this->is_inactive_controller_main_insv() == false ) ||
( this->is_inactive_controller_subf_insv() == false ))))
{
simplex = true ;
}
switch ( (int)node_ptr->handlerStage.enable )
switch ( (int)node_ptr->enableStage )
{
case MTC_ENABLE__FAILURE_WAIT:
{
@ -77,29 +74,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
workQueue_purge ( node_ptr );
doneQueue_purge ( node_ptr );
enableStageChange ( node_ptr, MTC_ENABLE__START );
/* avoid failing this controller if there is no inactive to
* take over and avoid thrashing back and forth if the sub
* function on the inactive is disabled */
if ( simplex )
{
/* if autorecovery is enabled then handle it that way. */
if ( this->autorecovery_enabled == true )
{
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
enableStageChange ( node_ptr, MTC_ENABLE__START );
manage_autorecovery ( node_ptr );
}
wlog ("%s is ENABLED-degraded (failed subfunction)\n", name.c_str());
}
else
{
/* if there is another controller enabled then just force a full enable of this one */
force_full_enable ( node_ptr ) ;
}
force_full_enable ( node_ptr ) ;
break ;
}
@ -139,22 +114,26 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
enableStageChange ( node_ptr, MTC_ENABLE__GOENABLED_TIMER );
alarm_config_clear ( node_ptr );
}
else if ( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )
if (( ! ( node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
    (   node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY   ))
{
mtcTimer_reset (node_ptr->mtcTimer);
elog ("%s configuration failed (oob:%x:%x)\n",
name.c_str(),
node_ptr->mtce_flags,
MTC_FLAG__I_AM_NOT_HEALTHY);
elog ("%s configuration failed or incomplete (oob:%x)\n",
name.c_str(), node_ptr->mtce_flags);
alarm_config_failure ( node_ptr );
if ( simplex )
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_FAIL_ );
else
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_FAIL );
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_FAIL );
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__CONFIG,
MTC_TASK_AR_DISABLED_CONFIG ) != PASS )
break ;
}
/* timeout handling */
@ -166,12 +145,15 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
alarm_config_failure ( node_ptr );
if ( simplex )
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_TO_ );
else
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_TO );
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_CONFIG_TO );
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__CONFIG,
MTC_TASK_AR_DISABLED_CONFIG ) != PASS )
break ;
}
else
{
@ -227,19 +209,19 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_ENABLE__GOENABLED_WAIT:
{
bool goenable_failed = false ;
/* search for the Go Enable message */
if ( node_ptr->goEnabled_failed_subf == true )
if (( node_ptr->health == NODE_UNHEALTHY ) ||
( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY) ||
( node_ptr->goEnabled_failed_subf == true ))
{
mtcTimer_reset ( node_ptr->mtcTimer );
elog ("%s one or more out-of-service tests failed\n", name.c_str());
mtcInvApi_update_task ( node_ptr, simplex ? MTC_TASK_INTEST_FAIL_ : MTC_TASK_INTEST_FAIL );
/* Need thresholded auto recovery for this failure mode */
if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
this->autorecovery_enabled = true ;
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_FAIL );
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
goenable_failed = true ;
}
/* search for the Go Enable message */
@ -248,6 +230,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_reset ( node_ptr->mtcTimer );
alarm_enabled_clear ( node_ptr, false );
alarm_compute_clear ( node_ptr, true );
plog ("%s passed out-of-service tests\n", name.c_str());
@ -275,18 +258,25 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
{
elog ("%s out-of-service test execution timeout\n", name.c_str());
mtcInvApi_update_task ( node_ptr, simplex ? MTC_TASK_INTEST_FAIL_TO_ : MTC_TASK_INTEST_FAIL_TO );
/* Need thresholded auto recovery for this failure mode */
if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
this->autorecovery_enabled = true ;
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_INTEST_TO );
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
goenable_failed = true ;
}
else
{
; /* wait some more */
}
if ( goenable_failed == true )
{
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__GOENABLE,
MTC_TASK_AR_DISABLED_GOENABLE ) != PASS )
break ;
}
break ;
}
case MTC_ENABLE__HOST_SERVICES_START:
@ -312,17 +302,20 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
else if ( launch_host_services_cmd ( node_ptr, start, subf ) != PASS )
{
node_ptr->hostservices_failed_subf = true ;
wlog ("%s %s failed ; launch\n",
name.c_str(),
node_ptr->host_services_req.name.c_str());
/* Need thresholded auto recovery for this failure mode */
if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
this->autorecovery_enabled = true ;
node_ptr->hostservices_failed_subf = true ;
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
break ;
}
else
{
@ -343,11 +336,12 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
}
else if ( rc != PASS )
{
/* Need thresholded auto recovery for this failure mode */
if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
this->autorecovery_enabled = true ;
node_ptr->hostservices_failed_subf = true ;
alarm_compute_failure ( node_ptr, FM_ALARM_SEVERITY_CRITICAL );
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
if ( rc == FAIL_TIMEOUT )
{
elog ("%s %s failed ; timeout\n",
@ -355,7 +349,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->host_services_req.name.c_str());
/* Report "Enabling Compute Service Timeout" to sysinv/horizon */
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF_TO );
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_TO );
}
else
{
@ -365,9 +359,14 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
rc);
/* Report "Enabling Compute Service Failed" to sysinv/horizon */
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLING_SUBF_FAIL );
mtcInvApi_update_task ( node_ptr, MTC_TASK_SUBF_SERVICE_FAIL );
}
enableStageChange ( node_ptr, MTC_ENABLE__SUBF_FAILED );
/* handle auto recovery for this failure */
if ( ar_manage ( node_ptr,
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
break ;
}
else /* success path */
{
@ -409,8 +408,6 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_reset ( node_ptr->mtcTimer );
}
/* Start Monitoring heartbeat */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
{
@ -426,6 +423,9 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
/* allow heartbeat to run for 10 seconds before we declare enable */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
/* Start Monitoring heartbeat */
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
break ;
}
@ -434,6 +434,11 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->mtcTimer.ring == true )
{
plog ("%s heartbeating\n", name.c_str() );
/* clear auto recovery state for this potential cause */
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
node_ptr->ar_count[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = 0 ;
/* if heartbeat is not working then we would
 * never get here to enable the host */
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
@ -472,7 +477,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
{
elog ("%s enable failed ; Enable workQueue timeout, purging ...\n", name.c_str());
mtcInvApi_update_task ( node_ptr, simplex ? MTC_TASK_ENABLE_WORK_TO_ : MTC_TASK_ENABLE_WORK_TO );
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_WORK_TO );
fail = true ;
}
@ -480,7 +485,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
{
elog ("%s enable failed ; Enable doneQueue has failed commands\n", name.c_str());
mtcInvApi_update_task ( node_ptr, simplex ? MTC_TASK_ENABLE_WORK_FAIL_ : MTC_TASK_ENABLE_WORK_FAIL );
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_WORK_FAIL );
fail = true ;
}
@ -656,6 +661,8 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->dor_recovery_mode = false ;
this->dor_mode_active = false ;
ar_enable ( node_ptr );
mtcInvApi_force_task ( node_ptr, "" );
break ;
}
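Every subfunction failure path above (configuration, goenable, host services) now funnels through ar_manage with a cause code and a cause-specific "auto recovery disabled" task string, and bails out of the handler when it returns non-PASS. A minimal sketch of that thresholded pattern, assuming per-cause ar_threshold and ar_interval arrays loaded from the config options shown in the next file (the helper name and those arrays are assumptions, not the shipped signature):

/* Sketch of the thresholded auto recovery pattern ; illustrative only */
int ar_manage_sketch ( struct nodeLinkClass::node * node_ptr,
                       int cause, string disabled_task )
{
    /* count this failure against its cause-specific threshold */
    node_ptr->ar_cause = cause ;
    node_ptr->ar_count[cause]++ ;

    if ( node_ptr->ar_count[cause] >= ar_threshold[cause] )
    {
        /* threshold reached ; latch auto recovery off for this host.
         * Only a lock/unlock re-arms it and resets the counter. */
        node_ptr->ar_disabled = true ;
        mtcInvApi_update_task ( node_ptr, disabled_task );
        return ( FAIL );
    }

    /* below threshold ; wait the cause-specific retry interval
     * before the next enable attempt */
    mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler,
                     ar_interval[cause] );
    return ( PASS );
}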

View File

@ -34,6 +34,35 @@ autorecovery_threshold = 3 ; The number of times maintenance will try to
; while there are no backup controllers to fail
; over to before giving up.
; Service specific Auto Recovery failure thresholds.
;
; ar_<service>_threshold = <max_retries>
;
; If a host fails to enable due to a particular service failure, for
; example configuration or goenabled, then the mtcAgent will stop
; retrying after that service's threshold is reached. While at
; threshold, auto recovery for the host is disabled. The host remains
; in the unlocked-disabled-failed state with the horizon host status
; showing that auto recovery is disabled and that a lock/unlock is
; required to trigger another enable attempt and reset the auto
; recovery counter.
ar_config_threshold = 2
ar_goenable_threshold = 2
ar_hostservices_threshold = 2
ar_heartbeat_threshold = 2
; Service specific Auto Recovery retry interval.
;
; ar_<service>_interval = <retry delay in seconds>
;
; When a host fails to enable due to a particular service failure, the
; mtcAgent waits for that service's specified interval before retrying
; the enable sequence (a sketch of how these values are consumed
; follows this file excerpt).
ar_config_interval = 30
ar_goenable_interval = 30
ar_hostservices_interval = 30
ar_heartbeat_interval = 600
api_retries = 10 ; number of API retries before failure
[client] ; Client Configuration
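For illustration, the four per-service thresholds and intervals above pair naturally with the auto recovery cause codes used by ar_manage in the handler diff. A sketch of one way to index them by cause (the enum ordering, array layout, and config-struct name are assumptions beyond what this excerpt shows):

/* Sketch ; maps the ini options above to per-cause lookup arrays */
enum { AR_CAUSE__CONFIG = 0, AR_CAUSE__GOENABLE,
       AR_CAUSE__HOST_SERVICES, AR_CAUSE__HEARTBEAT, AR_CAUSE__LAST };

static int ar_threshold [AR_CAUSE__LAST] ;
static int ar_interval  [AR_CAUSE__LAST] ;

void load_ar_config ( daemon_config_type * cfg )  /* struct name assumed */
{
    ar_threshold[AR_CAUSE__CONFIG]        = cfg->ar_config_threshold       ;
    ar_threshold[AR_CAUSE__GOENABLE]      = cfg->ar_goenable_threshold     ;
    ar_threshold[AR_CAUSE__HOST_SERVICES] = cfg->ar_hostservices_threshold ;
    ar_threshold[AR_CAUSE__HEARTBEAT]     = cfg->ar_heartbeat_threshold    ;

    ar_interval[AR_CAUSE__CONFIG]         = cfg->ar_config_interval        ;
    ar_interval[AR_CAUSE__GOENABLE]       = cfg->ar_goenable_interval      ;
    ar_interval[AR_CAUSE__HOST_SERVICES]  = cfg->ar_hostservices_interval  ;
    ar_interval[AR_CAUSE__HEARTBEAT]      = cfg->ar_heartbeat_interval     ;
}

With the defaults above, a goenable failure is retried after 30 seconds until its threshold of 2 is reached, while a heartbeat soak failure waits 600 seconds between attempts.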