Fix Graceful Recovery handling while in Graceful Recovery handling

The current Graceful Recovery handler is not properly handling
back-to-back Multi Node Failure Avoidance (MNFA) events.

There are two phases to MNFA

 phase 1: waiting for number of failed nodes to fall below
          mnfa_threahold as each affected node's heartbeat
          is recovered.
 phase 2: then a Graceful Recovery Wait period which is an
          11 second heartbeat soak to verify that a stable
          heartbeat is regained before declaring the NMFA
          event complete.

The Graceful Recovery Wait status of one or more affected nodes
has been seen to be left uncleared (stuck) on one or more of the
affected nodes if phase 2 of MNFA is interrupted by another MNFA
event ; aka MNFA Nesting.

Although this stuck status is not service affecting it does leave
one or more nodes' host.task field, as observed under host-show,
with "Graceful Recovery Wait" rather than empty.

This update makes Multi Node Failure Avoidance (MNFA) handling
changes to ensure that, upon MNFA exit, the recovery handler
is properly restarted if MNFA Nesting occurs.

Two additional Graceful Recovery phase issues were identified
and fixed by this update.

 1. Cut Graceful recovery handling in half

    - Found and removed a redundant 11 second heartbeat soak
      at the very end of the recovery handler.
    - This cuts the graceful recovery handling time down from
      22 to 11 seconds thereby cutting potential for nesting
      in half.

 2. Increased supported Graceful Recovery nesting from 3 to 5

    - Found that some links bounce more than others so a nesting
      count of 3 can lead to an occasional single node failure.
    - This adds a bit more resiliency to MNFA handling of cases
      that exhibit more link messaging bounce.

Test Plan: Verified 60+ MNFA occurrences across 4 different
           system types including AIO plus, Standard and Storage

PASS: Verify Single Node Graceful Recovery Handling
PASS: Verify Multi Node Graceful Recovery Handling
PASS: Verify Single Node Graceful Recovery Nesting Handling
PASS: Verify Multi Node Graceful Recovery Nesting Handling
PASS: Verify MNFA of up to 5 nests can be gracefully recovered
PASS: Verify MNFA of 6 nests lead to full enable of affected nodes
PASS: Verify update as a patch
PASS: Verify mtcAgent logging

Regression:

PASS: Verify standard system install
PASS: Verify product verification maintenance regression (4 runs)
PASS: Verify MNFA threshold increase and below threshold behavior
PASS: Verify MNFA with reduced timeout behavior for
      ... nested case that does not timeout
      ... case that does not timeout
      ... case that does timeout

Closes Bug: 1892877
Change-Id: I6b7d4478b5cae9521583af78e1370dadacd9536e
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2021-03-16 17:03:49 -04:00
parent 84ba5f693a
commit 5c83453fdf
5 changed files with 71 additions and 93 deletions

View File

@ -397,10 +397,9 @@ void mtc_stages_init ( void )
recoveryStages_str[MTC_RECOVERY__HEARTBEAT_START ] = "Heartbeat-Start"; recoveryStages_str[MTC_RECOVERY__HEARTBEAT_START ] = "Heartbeat-Start";
recoveryStages_str[MTC_RECOVERY__HEARTBEAT_SOAK ] = "Heartbeat-Soak"; recoveryStages_str[MTC_RECOVERY__HEARTBEAT_SOAK ] = "Heartbeat-Soak";
recoveryStages_str[MTC_RECOVERY__STATE_CHANGE ] = "State Change"; recoveryStages_str[MTC_RECOVERY__STATE_CHANGE ] = "State Change";
recoveryStages_str[MTC_RECOVERY__ENABLE_START ] = "Enable-Start";
recoveryStages_str[MTC_RECOVERY__FAILURE ] = "Failure"; recoveryStages_str[MTC_RECOVERY__FAILURE ] = "Failure";
recoveryStages_str[MTC_RECOVERY__WORKQUEUE_WAIT ] = "WorkQ-Wait"; recoveryStages_str[MTC_RECOVERY__WORKQUEUE_WAIT ] = "WorkQ-Wait";
recoveryStages_str[MTC_RECOVERY__ENABLE_WAIT ] = "Enable-Wait"; recoveryStages_str[MTC_RECOVERY__ENABLE ] = "Enable";
recoveryStages_str[MTC_RECOVERY__STAGES ] = "unknown"; recoveryStages_str[MTC_RECOVERY__STAGES ] = "unknown";
disableStages_str [MTC_DISABLE__START ] = "Disable-Start"; disableStages_str [MTC_DISABLE__START ] = "Disable-Start";

View File

@ -948,7 +948,7 @@ typedef enum
string get_delStages_str ( mtc_delStages_enum stage ); string get_delStages_str ( mtc_delStages_enum stage );
#define MTC_MAX_FAST_ENABLES (3) #define MTC_MAX_FAST_ENABLES (5)
typedef enum typedef enum
{ {
MTC_RECOVERY__START = 0, MTC_RECOVERY__START = 0,
@ -974,10 +974,9 @@ typedef enum
MTC_RECOVERY__HEARTBEAT_START, MTC_RECOVERY__HEARTBEAT_START,
MTC_RECOVERY__HEARTBEAT_SOAK, MTC_RECOVERY__HEARTBEAT_SOAK,
MTC_RECOVERY__STATE_CHANGE, MTC_RECOVERY__STATE_CHANGE,
MTC_RECOVERY__ENABLE_START,
MTC_RECOVERY__FAILURE, MTC_RECOVERY__FAILURE,
MTC_RECOVERY__WORKQUEUE_WAIT, MTC_RECOVERY__WORKQUEUE_WAIT,
MTC_RECOVERY__ENABLE_WAIT, MTC_RECOVERY__ENABLE,
MTC_RECOVERY__STAGES, MTC_RECOVERY__STAGES,
} mtc_recoveryStages_enum ; } mtc_recoveryStages_enum ;

View File

@ -4927,17 +4927,17 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
} }
if ( temp_count != mnfa_host_count[iface] ) if ( temp_count != mnfa_host_count[iface] )
{ {
slog ("%s MNFA host tally (%s:%d incorrect - expected %d) ; correcting\n", slog ("%s MNFA host tally (%s:%d incorrect - expected %d) ; correcting\n",
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
get_iface_name_str(iface), get_iface_name_str(iface),
mnfa_host_count[iface], temp_count ); mnfa_host_count[iface], temp_count );
mnfa_host_count[iface] = temp_count ; mnfa_host_count[iface] = temp_count ;
mnfa_host_count[iface] = temp_count ; mnfa_host_count[iface] = temp_count ;
} }
else else
{ {
wlog ("%s MNFA host tally (%s:%d)\n", dlog ("%s MNFA host tally (%s:%d)\n",
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
get_iface_name_str(iface), get_iface_name_str(iface),
mnfa_host_count[iface] ); mnfa_host_count[iface] );
@ -5903,9 +5903,6 @@ int nodeLinkClass::critical_process_failed( string & hostname,
node_ptr->hostname.c_str()); /* dlog */ node_ptr->hostname.c_str()); /* dlog */
} }
/* Start fresh the next time we enter graceful recovery handler */
node_ptr->graceful_recovery_counter = 0 ;
/* Set node as unlocked-disabled-failed */ /* Set node as unlocked-disabled-failed */
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__DISABLED, MTC_OPER_STATE__DISABLED,
@ -6863,7 +6860,7 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr,
} }
/** Validate and log Recovery stage changes */ /** Validate and log Recovery stage changes */
int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr, int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
mtc_recoveryStages_enum newHdlrStage ) mtc_recoveryStages_enum newHdlrStage )
{ {
int rc = PASS ; int rc = PASS ;
@ -6871,14 +6868,14 @@ int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
if (( newHdlrStage >= MTC_RECOVERY__STAGES ) || if (( newHdlrStage >= MTC_RECOVERY__STAGES ) ||
( node_ptr->recoveryStage >= MTC_RECOVERY__STAGES )) ( node_ptr->recoveryStage >= MTC_RECOVERY__STAGES ))
{ {
slog ("%s Invalid recovery stage (%d:%d)\n", slog ("%s Invalid recovery stage (%d:%d)\n",
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
node_ptr->recoveryStage, node_ptr->recoveryStage,
newHdlrStage ); newHdlrStage );
if ( newHdlrStage < MTC_RECOVERY__STAGES ) if ( newHdlrStage < MTC_RECOVERY__STAGES )
{ {
clog ("%s ? -> %s\n", clog ("%s ? -> %s\n",
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
get_recoveryStages_str(newHdlrStage).c_str()); get_recoveryStages_str(newHdlrStage).c_str());
@ -6890,11 +6887,11 @@ int nodeLinkClass::recoveryStageChange ( struct nodeLinkClass::node * node_ptr,
rc = FAIL ; rc = FAIL ;
} }
} }
else else
{ {
clog ("%s %s -> %s\n", clog ("%s %s -> %s\n",
node_ptr->hostname.c_str(), node_ptr->hostname.c_str(),
get_recoveryStages_str(node_ptr->recoveryStage).c_str(), get_recoveryStages_str(node_ptr->recoveryStage).c_str(),
get_recoveryStages_str(newHdlrStage).c_str()); get_recoveryStages_str(newHdlrStage).c_str());
node_ptr->recoveryStage = newHdlrStage ; node_ptr->recoveryStage = newHdlrStage ;

View File

@ -1660,7 +1660,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
* 2. Setting the node operational state to Disabled * 2. Setting the node operational state to Disabled
* 3. Setting the Enable action * 3. Setting the Enable action
*/ */
if ( ++node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES ) node_ptr->graceful_recovery_counter++ ;
if ( node_ptr->graceful_recovery_counter > MTC_MAX_FAST_ENABLES )
{ {
/* gate off further mtcAlive messaging timme the offline /* gate off further mtcAlive messaging timme the offline
* handler runs. This prevents stale messages from making it * handler runs. This prevents stale messages from making it
@ -2555,7 +2556,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
else if ( rc == PASS ) else if ( rc == PASS )
{ {
/* Start Graceful Recovery */ /* Start Graceful Recovery */
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_START ) ; recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE ) ;
break ; break ;
} }
else if ( rc == FAIL_WORKQ_TIMEOUT ) else if ( rc == FAIL_WORKQ_TIMEOUT )
@ -2571,51 +2572,37 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
nodeLinkClass::force_full_enable ( node_ptr ); nodeLinkClass::force_full_enable ( node_ptr );
break ; break ;
} }
case MTC_RECOVERY__ENABLE_START: case MTC_RECOVERY__ENABLE:
{ {
/* Create the recovery enable timer. This timer is short. if ( is_controller(node_ptr) )
* A node need to stay enabled with the hartbeat service
* running for a period of time before declaring it enabled */
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
recoveryStageChange ( node_ptr, MTC_RECOVERY__ENABLE_WAIT ) ;
break;
}
case MTC_RECOVERY__ENABLE_WAIT:
{
/* When this timer fires the host has been up for enough time */
if ( node_ptr->mtcTimer.ring == true )
{ {
if ( is_controller(node_ptr) ) if ( mtcSmgrApi_request ( node_ptr,
CONTROLLER_ENABLED,
SMGR_MAX_RETRIES ) != PASS )
{ {
if ( mtcSmgrApi_request ( node_ptr, wlog ("%s Failed to send 'unlocked-enabled' to HA Service Manager ; allowing enable\n",
CONTROLLER_ENABLED, node_ptr->hostname.c_str());
SMGR_MAX_RETRIES ) != PASS )
{
wlog ("%s Failed to send 'unlocked-disabled' to HA Service Manager ; allowing enable\n",
node_ptr->hostname.c_str());
}
} }
/* Node Has Recovered */
node_ptr->graceful_recovery_counter = 0 ;
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
node_ptr->health_threshold_counter = 0 ;
node_ptr->enabled_count++ ;
node_ptr->http_retries_cur = 0 ;
doneQueue_purge ( node_ptr );
if ( node_ptr->was_dor_recovery_mode )
{
report_dor_recovery ( node_ptr , "is ENABLED" );
}
else
{
plog ("%s is ENABLED (Gracefully Recovered)\n",
node_ptr->hostname.c_str());
}
alarm_enabled_clear ( node_ptr, false );
} }
/* Node Has Recovered */
node_ptr->graceful_recovery_counter = 0 ;
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
node_ptr->health_threshold_counter = 0 ;
node_ptr->enabled_count++ ;
node_ptr->http_retries_cur = 0 ;
doneQueue_purge ( node_ptr );
if ( node_ptr->was_dor_recovery_mode )
{
report_dor_recovery ( node_ptr , "is ENABLED" );
}
else
{
plog ("%s is ENABLED (Gracefully Recovered)\n",
node_ptr->hostname.c_str());
}
alarm_enabled_clear ( node_ptr, false );
break ; break ;
} }
default: default:

View File

@ -159,19 +159,20 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->mnfa_graceful_recovery == true ) if ( node_ptr->mnfa_graceful_recovery == true )
{ {
/* Restart the heartbeat for this recovered host */
// send_hbs_command ( node_ptr->hostname, MTC_RESTART_HBS );
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) if ( node_ptr->adminAction != MTC_ADMIN_ACTION__RECOVER )
{ {
ilog ("%s graceful recovery from MNFA\n", node_ptr->hostname.c_str()); ilog ("%s graceful recovery (graceful recover count:%d)",
recoveryStageChange ( node_ptr, MTC_RECOVERY__START ); node_ptr->hostname.c_str(),
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER ); node_ptr->graceful_recovery_counter);
} }
else else
{ {
wlog ("%s already gracefully recovering\n", node_ptr->hostname.c_str() ); wlog ("%s graceful recovery restart (graceful recover count:%d)",
node_ptr->hostname.c_str(),
node_ptr->graceful_recovery_counter );
} }
recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
} }
} }
@ -298,43 +299,38 @@ void nodeLinkClass::mnfa_exit ( bool force )
* Clear heartbeat degrades */ * Clear heartbeat degrades */
for ( struct node * ptr = head ; ; ptr = ptr->next ) for ( struct node * ptr = head ; ; ptr = ptr->next )
{ {
if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) || std::list<string>::iterator mnfa_awol_ptr ;
( ptr->hbs_minor[MGMNT_IFACE] == true )) && for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
( ptr->operState == MTC_OPER_STATE__ENABLED )) mnfa_awol_ptr != mnfa_awol_list.end() ;
mnfa_awol_ptr++ )
{ {
ptr->hbs_minor[MGMNT_IFACE] = false ; /* skip host if not in the mnfa pool */
ptr->hbs_minor[CLSTR_IFACE] = false ; if ( ptr->hostname.compare(*(mnfa_awol_ptr)) )
continue ;
if ( force == true ) if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) ||
( ptr->hbs_minor[MGMNT_IFACE] == true )) &&
( ptr->operState == MTC_OPER_STATE__ENABLED ))
{ {
elog ("... %s failed ; auto-recovering\n", ptr->hbs_minor[MGMNT_IFACE] = false ;
ptr->hostname.c_str()); ptr->hbs_minor[CLSTR_IFACE] = false ;
/* Set node as failed */ if ( force == true )
availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
enableStageChange ( ptr, MTC_ENABLE__START );
adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
}
else
{
if ( ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )
{ {
if ( ptr->degrade_mask == 0 ) elog ("... %s failed ; auto-recovering\n",
{ ptr->hostname.c_str());
availStatusChange ( ptr, MTC_AVAIL_STATUS__AVAILABLE );
}
}
if ( ptr->adminAction != MTC_ADMIN_ACTION__RECOVER ) /* Set node as failed */
{ availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
recoveryStageChange ( ptr, MTC_RECOVERY__START ); enableStageChange ( ptr, MTC_ENABLE__START );
adminActionChange ( ptr, MTC_ADMIN_ACTION__RECOVER ); adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
} }
else else
{ {
wlog ("%s already gracefully recovering\n", ptr->hostname.c_str() ); mnfa_recover_host ( ptr );
} }
} }
break ;
} }
if (( ptr->next == NULL ) || ( ptr == tail )) if (( ptr->next == NULL ) || ( ptr == tail ))
break ; break ;