Fix Graceful Recovery handling while in Graceful Recovery handling

The current Graceful Recovery handler is not properly handling back-to-back Multi Node Failure Avoidance (MNFA) events. There are two phases to MNFA phase 1: waiting for number of failed nodes to fall below mnfa_threahold as each affected node's heartbeat is recovered. phase 2: then a Graceful Recovery Wait period which is an 11 second heartbeat soak to verify that a stable heartbeat is regained before declaring the NMFA event complete. The Graceful Recovery Wait status of one or more affected nodes has been seen to be left uncleared (stuck) on one or more of the affected nodes if phase 2 of MNFA is interrupted by another MNFA event ; aka MNFA Nesting. Although this stuck status is not service affecting it does leave one or more nodes' host.task field, as observed under host-show, with "Graceful Recovery Wait" rather than empty. This update makes Multi Node Failure Avoidance (MNFA) handling changes to ensure that, upon MNFA exit, the recovery handler is properly restarted if MNFA Nesting occurs. Two additional Graceful Recovery phase issues were identified and fixed by this update. 1. Cut Graceful recovery handling in half - Found and removed a redundant 11 second heartbeat soak at the very end of the recovery handler. - This cuts the graceful recovery handling time down from 22 to 11 seconds thereby cutting potential for nesting in half. 2. Increased supported Graceful Recovery nesting from 3 to 5 - Found that some links bounce more than others so a nesting count of 3 can lead to an occasional single node failure. - This adds a bit more resiliency to MNFA handling of cases that exhibit more link messaging bounce. Test Plan: Verified 60+ MNFA occurrences across 4 different system types including AIO plus, Standard and Storage PASS: Verify Single Node Graceful Recovery Handling PASS: Verify Multi Node Graceful Recovery Handling PASS: Verify Single Node Graceful Recovery Nesting Handling PASS: Verify Multi Node Graceful Recovery Nesting Handling PASS: Verify MNFA of up to 5 nests can be gracefully recovered PASS: Verify MNFA of 6 nests lead to full enable of affected nodes PASS: Verify update as a patch PASS: Verify mtcAgent logging Regression: PASS: Verify standard system install PASS: Verify product verification maintenance regression (4 runs) PASS: Verify MNFA threshold increase and below threshold behavior PASS: Verify MNFA with reduced timeout behavior for ... nested case that does not timeout ... case that does not timeout ... case that does timeout Closes Bug: 1892877 Change-Id: I6b7d4478b5cae9521583af78e1370dadacd9536e Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
2021-03-16 17:03:49 -04:00 · 2021-03-16 17:03:49 -04:00 · 5c83453fdf
commit 5c83453fdf
parent 84ba5f693a
5 changed files with 71 additions and 93 deletions
--- a/mtce-common/src/common/nodeBase.cpp
+++ b/mtce-common/src/common/nodeBase.cpp
@ -397,10 +397,9 @@ void mtc_stages_init ( void )
   recoveryStages_str[MTC_RECOVERY__HEARTBEAT_START    ] = "Heartbeat-Start";
   recoveryStages_str[MTC_RECOVERY__HEARTBEAT_SOAK     ] = "Heartbeat-Soak";
   recoveryStages_str[MTC_RECOVERY__STATE_CHANGE       ] = "State Change";
-   recoveryStages_str[MTC_RECOVERY__ENABLE_START       ] = "Enable-Start";
   recoveryStages_str[MTC_RECOVERY__FAILURE            ] = "Failure";
   recoveryStages_str[MTC_RECOVERY__WORKQUEUE_WAIT     ] = "WorkQ-Wait";
-   recoveryStages_str[MTC_RECOVERY__ENABLE_WAIT        ] = "Enable-Wait";
+   recoveryStages_str[MTC_RECOVERY__ENABLE             ] = "Enable";
   recoveryStages_str[MTC_RECOVERY__STAGES             ] = "unknown";

   disableStages_str [MTC_DISABLE__START               ] = "Disable-Start";
--- a/mtce-common/src/common/nodeBase.h
+++ b/mtce-common/src/common/nodeBase.h
@ -948,7 +948,7 @@ typedef enum
 string get_delStages_str ( mtc_delStages_enum stage );


-#define MTC_MAX_FAST_ENABLES (3)
+#define MTC_MAX_FAST_ENABLES (5)
 typedef enum
 {
    MTC_RECOVERY__START =  0,
@ -974,10 +974,9 @@ typedef enum
    MTC_RECOVERY__HEARTBEAT_START,
    MTC_RECOVERY__HEARTBEAT_SOAK,
    MTC_RECOVERY__STATE_CHANGE,
-    MTC_RECOVERY__ENABLE_START,
    MTC_RECOVERY__FAILURE,
    MTC_RECOVERY__WORKQUEUE_WAIT,
-    MTC_RECOVERY__ENABLE_WAIT,
+    MTC_RECOVERY__ENABLE,
    MTC_RECOVERY__STAGES,
 } mtc_recoveryStages_enum ;

--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@ -4927,17 +4927,17 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
    }

     if ( temp_count != mnfa_host_count[iface] )
-     {    
+     {
         slog ("%s MNFA host tally (%s:%d incorrect - expected %d) ; correcting\n",
                   node_ptr->hostname.c_str(),
                   get_iface_name_str(iface),
                   mnfa_host_count[iface], temp_count );
                   mnfa_host_count[iface] = temp_count ;
         mnfa_host_count[iface] = temp_count ;
-     }    
+     }
     else
     {
-         wlog ("%s MNFA host tally (%s:%d)\n",
+         dlog ("%s MNFA host tally (%s:%d)\n",
                   node_ptr->hostname.c_str(),
                   get_iface_name_str(iface),
                   mnfa_host_count[iface] );
@ -5903,9 +5903,6 @@ int nodeLinkClass::critical_process_failed( string & hostname,
                      node_ptr->hostname.c_str()); /* dlog */
        }

-        /* Start fresh the next time we enter graceful recovery handler */
-        node_ptr->graceful_recovery_counter = 0 ;
-
        /* Set node as unlocked-disabled-failed */
        allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
                                   MTC_OPER_STATE__DISABLED,
@ -6863,7 +6860,7 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr,
 }

 /** Validate and log Recovery stage changes */
-int nodeLinkClass::recoveryStageChange  ( struct nodeLinkClass::node * node_ptr, 
+int nodeLinkClass::recoveryStageChange  ( struct nodeLinkClass::node * node_ptr,
                                          mtc_recoveryStages_enum newHdlrStage )
 {
    int rc = PASS ;
@ -6871,14 +6868,14 @@ int nodeLinkClass::recoveryStageChange  ( struct nodeLinkClass::node * node_ptr,
    if (( newHdlrStage >= MTC_RECOVERY__STAGES ) ||
        ( node_ptr->recoveryStage >= MTC_RECOVERY__STAGES ))
    {
-        slog ("%s Invalid recovery stage (%d:%d)\n", 
+        slog ("%s Invalid recovery stage (%d:%d)\n",
                  node_ptr->hostname.c_str(),
-                  node_ptr->recoveryStage, 
+                  node_ptr->recoveryStage,
                  newHdlrStage );

        if ( newHdlrStage < MTC_RECOVERY__STAGES )
        {
-            clog ("%s ? -> %s\n", 
+            clog ("%s ? -> %s\n",
               node_ptr->hostname.c_str(),
               get_recoveryStages_str(newHdlrStage).c_str());

@ -6890,11 +6887,11 @@ int nodeLinkClass::recoveryStageChange  ( struct nodeLinkClass::node * node_ptr,
            rc = FAIL ;
        }
    }
-    else 
+    else
    {
-        clog ("%s %s -> %s\n", 
+        clog ("%s %s -> %s\n",
               node_ptr->hostname.c_str(),
-               get_recoveryStages_str(node_ptr->recoveryStage).c_str(), 
+               get_recoveryStages_str(node_ptr->recoveryStage).c_str(),
               get_recoveryStages_str(newHdlrStage).c_str());

        node_ptr->recoveryStage = newHdlrStage  ;
--- a/mtce/src/maintenance/mtcNodeHdlrs.cpp
+++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp
@ -1660,7 +1660,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
             *   2. Setting the node operational state to Disabled
             *   3. Setting the Enable action
             */
-            if ( ++node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
+            node_ptr->graceful_recovery_counter++ ;
+            if ( node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
            {
                /* gate off further mtcAlive messaging timme the offline
                * handler runs. This prevents stale messages from making it
@ -2555,7 +2556,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            else if ( rc == PASS )
            {
                /* Start Graceful Recovery */
-                recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_START ) ;
+                recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE ) ;
                break ;
            }
            else if ( rc == FAIL_WORKQ_TIMEOUT )
@ -2571,51 +2572,37 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
            nodeLinkClass::force_full_enable ( node_ptr );
            break ;
        }
-        case MTC_RECOVERY__ENABLE_START:
+        case MTC_RECOVERY__ENABLE:
        {
-            /* Create the recovery enable timer. This timer is short.
-             * A node need to stay enabled with the hartbeat service
-             * running for a period of time before declaring it enabled */
-            mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
-
-            recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_WAIT ) ;
-            break;
-        }
-        case MTC_RECOVERY__ENABLE_WAIT:
-        {
-            /* When this timer fires the host has been up for enough time */
-            if ( node_ptr->mtcTimer.ring == true )
+            if ( is_controller(node_ptr) )
            {
-                if ( is_controller(node_ptr) )
+                if ( mtcSmgrApi_request ( node_ptr,
+                                          CONTROLLER_ENABLED,
+                                          SMGR_MAX_RETRIES ) != PASS )
                {
-                    if ( mtcSmgrApi_request ( node_ptr,
-                                              CONTROLLER_ENABLED,
-                                              SMGR_MAX_RETRIES ) != PASS )
-                    {
-                        wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager ; allowing enable\n",
-                              node_ptr->hostname.c_str());
-                    }
+                    wlog ("%s Failed to send 'unlocked-enabled' to HA Service Manager ; allowing enable\n",
+                          node_ptr->hostname.c_str());
                }
-                /* Node Has Recovered */
-                node_ptr->graceful_recovery_counter = 0 ;
-                recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
-                adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__NONE );
-                node_ptr->health_threshold_counter = 0 ;
-                node_ptr->enabled_count++ ;
-                node_ptr->http_retries_cur = 0 ;
-
-                doneQueue_purge ( node_ptr );
-                if ( node_ptr->was_dor_recovery_mode )
-                {
-                    report_dor_recovery (  node_ptr , "is ENABLED" );
-                }
-                else
-                {
-                    plog ("%s is ENABLED (Gracefully Recovered)\n",
-                              node_ptr->hostname.c_str());
-                }
-                alarm_enabled_clear ( node_ptr, false );
            }
+            /* Node Has Recovered */
+            node_ptr->graceful_recovery_counter = 0 ;
+            recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
+            adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__NONE );
+            node_ptr->health_threshold_counter = 0 ;
+            node_ptr->enabled_count++ ;
+            node_ptr->http_retries_cur = 0 ;
+
+            doneQueue_purge ( node_ptr );
+            if ( node_ptr->was_dor_recovery_mode )
+            {
+                report_dor_recovery (  node_ptr , "is ENABLED" );
+            }
+            else
+            {
+                plog ("%s is ENABLED (Gracefully Recovered)\n",
+                          node_ptr->hostname.c_str());
+            }
+            alarm_enabled_clear ( node_ptr, false );
            break ;
        }
        default:
--- a/mtce/src/maintenance/mtcNodeMnfa.cpp
+++ b/mtce/src/maintenance/mtcNodeMnfa.cpp
@ -159,19 +159,20 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )

    if ( node_ptr->mnfa_graceful_recovery == true )
    {
-        /* Restart the heartbeat for this recovered host */
-        // send_hbs_command ( node_ptr->hostname, MTC_RESTART_HBS );
-
        if ( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER )
        {
-            ilog ("%s graceful recovery from MNFA\n", node_ptr->hostname.c_str());
-            recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
-            adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
+            ilog ("%s graceful recovery (graceful recover count:%d)",
+                      node_ptr->hostname.c_str(),
+                      node_ptr->graceful_recovery_counter);
        }
        else
        {
-            wlog ("%s already gracefully recovering\n", node_ptr->hostname.c_str() );
+            wlog ("%s graceful recovery restart (graceful recover count:%d)",
+                      node_ptr->hostname.c_str(),
+                      node_ptr->graceful_recovery_counter );
        }
+        recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
+        adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
    }
 }

@ -298,43 +299,38 @@ void nodeLinkClass::mnfa_exit ( bool force )
         * Clear heartbeat degrades */
        for ( struct node * ptr = head ;  ; ptr = ptr->next )
        {
-            if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) ||
-                 ( ptr->hbs_minor[MGMNT_IFACE] == true )) &&
-                 ( ptr->operState == MTC_OPER_STATE__ENABLED ))
+            std::list<string>::iterator mnfa_awol_ptr  ;
+            for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
+                  mnfa_awol_ptr != mnfa_awol_list.end() ;
+                  mnfa_awol_ptr++ )
            {
-                ptr->hbs_minor[MGMNT_IFACE] = false ;
-                ptr->hbs_minor[CLSTR_IFACE] = false ;
+                /* skip host if not in the mnfa pool */
+                if ( ptr->hostname.compare(*(mnfa_awol_ptr)) )
+                   continue ;

-                if ( force == true )
+                if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) ||
+                     ( ptr->hbs_minor[MGMNT_IFACE] == true )) &&
+                     ( ptr->operState == MTC_OPER_STATE__ENABLED ))
                {
-                    elog ("... %s failed ; auto-recovering\n",
-                               ptr->hostname.c_str());
+                    ptr->hbs_minor[MGMNT_IFACE] = false ;
+                    ptr->hbs_minor[CLSTR_IFACE] = false ;

-                    /* Set node as failed */
-                    availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
-                    enableStageChange ( ptr, MTC_ENABLE__START );
-                    adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
-                }
-                else
-                {
-                    if ( ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
+                    if ( force == true )
                    {
-                        if ( ptr->degrade_mask == 0 )
-                        {
-                            availStatusChange ( ptr, MTC_AVAIL_STATUS__AVAILABLE );
-                        }
-                    }
+                        elog ("... %s failed ; auto-recovering\n",
+                                   ptr->hostname.c_str());

-                    if ( ptr->adminAction != MTC_ADMIN_ACTION__RECOVER )
-                    {
-                        recoveryStageChange ( ptr, MTC_RECOVERY__START );
-                        adminActionChange   ( ptr, MTC_ADMIN_ACTION__RECOVER );
+                        /* Set node as failed */
+                        availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
+                        enableStageChange ( ptr, MTC_ENABLE__START );
+                        adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
                    }
                    else
                    {
-                        wlog ("%s already gracefully recovering\n", ptr->hostname.c_str() );
+                        mnfa_recover_host ( ptr );
                    }
                }
+                break ;
            }
            if (( ptr->next == NULL ) || ( ptr == tail ))
                break ;