Debian: Make Mtce offline handler more resilient to slow shutdowns

The current offline handler declares the node offline once
'offline_search_count' reaches the 'offline_threshold' count,
regardless of whether mtcAlive messages were received during
the search window.

The offline algorithm requires that no mtcAlive messages
be seen for the full offline_threshold count.

During a slow shutdown the mtcClient runs longer than it
should, which can lead maintenance to see the node as
recovered before it actually is.

This update manages the offline search counter to ensure that
it only reaches the count threshold after seeing no mtcAlive
messages for the full search count. Any mtcAlive message seen
during the count triggers a count reset.
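
The intended counting behavior can be summarized with the minimal C++
sketch below. It is illustrative only: the OfflineSearch struct, the
audit() helper, the mtcAlive_seen flag and the threshold value are
assumptions made for this sketch, not the actual offline_handler code,
which operates on node_ptr state as shown in the diff further down.

#include <cstdio>

/* Hypothetical stand-in for the per-node offline search state. */
struct OfflineSearch
{
    int  offline_search_count = 0     ;
    int  offline_threshold    = 46    ; /* assumed value, for illustration */
    bool offline              = false ;

    /* Called once per mtcAlive search window. */
    void audit ( bool mtcAlive_seen )
    {
        if ( mtcAlive_seen )
        {
            /* any mtcAlive during the search restarts the count so the
             * node is only declared offline after a full uninterrupted
             * run of missed messages */
            offline_search_count = 0 ;
        }
        else if ( ++offline_search_count >= offline_threshold )
        {
            offline = true ;
        }
    }
};

int main ()
{
    OfflineSearch search ;

    /* a late mtcAlive part way through the search resets the count ... */
    for ( int i = 0 ; i < 10 ; i++ ) search.audit ( false );
    search.audit ( true );

    /* ... so offline is only declared after a further full threshold run */
    for ( int i = 0 ; i < search.offline_threshold ; i++ ) search.audit ( false );

    printf ("offline=%s count=%d\n", search.offline ? "yes" : "no",
                                     search.offline_search_count );
    return 0 ;
}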

This update also:
1. Adjusts the reset retry cadence from 7 to 12 secs
   to prevent unnecessary reboot thrash while the
   current shutdown is still in progress
   (see the delay sketch after this list).
2. Clears the hbsClient ready event at the start of the
   subfunction handler so the heartbeat soak is only
   started after seeing heartbeat client ready events
   that follow the main config.
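
For context on item 1, the reboot retry delay in the cmd_handler is
derived from offline_period and offline_threshold (see the '+3' to '*3'
change in the diff below). The sketch here is illustrative only; the
offline_period and offline_threshold values are assumptions chosen to
reproduce the 7 to 12 second cadence change.

#include <cstdio>

int main ()
{
    /* assumed mtc.conf values, for illustration only */
    int offline_period    = 100 ; /* msecs per mtcAlive search window  */
    int offline_threshold = 40  ; /* consecutive misses before offline */

    /* previous cadence: ((100*40)/1000)+3 = 7 seconds */
    int old_delay = ((offline_period*offline_threshold)/1000)+3 ;

    /* updated cadence: ((100*40)/1000)*3 = 12 seconds */
    int new_delay = ((offline_period*offline_threshold)/1000)*3 ;

    printf ("reboot retry delay: old=%d secs ; new=%d secs\n",
             old_delay, new_delay );
    return 0 ;
}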

Test Plan:

PASS: Debian and CentOS Build and DX install
PASS: Verify search count management
PASS: Verify issue does not occur over lock/unlock soak (100+)
      - where the same test without this update did show the issue.
PASS: Monitor alive logs for behavioral correctness
PASS: Verify recovery reset occurs after expected extended time.

Closes-Bug: 1993656
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Change-Id: If10bb75a1fb01d0ecd3f88524d74c232658ca29e
Eric MacDonald 2022-10-20 15:45:49 +00:00
parent b88d7456b1
commit da398e0c5f
5 changed files with 66 additions and 31 deletions

View File

@@ -151,7 +151,7 @@ EFmAlarmSeverityT alarmUtil_query ( string & hostname,
ENTITY_PREFIX, hostname.data(), instance.data());
}
alog ("entity_instance:%s\n", alarm_filter.entity_instance_id );
alog1 ("entity_instance:%s\n", alarm_filter.entity_instance_id );
if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK )
{
dlog ("Found with Severity: %d\n", alarm_query.severity );
@@ -183,7 +183,7 @@ int alarmUtil_query_identity ( string identity, SFmAlarmDataT * alarm_list_ptr,
memset(&alarm_filter, 0, sizeof(alarm_filter));
snprintf ( alarm_filter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", identity.data());
rc = fm_get_faults_by_id ( &alarm_filter.alarm_id, alarm_list_ptr, &max_alarms );
alog ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc );
alog1 ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc );
if ( rc == FM_ERR_OK )
{
return (PASS);
@@ -242,7 +242,7 @@ int alarmUtil ( string & hostname,
if (( alarm.alarm_state != FM_ALARM_STATE_MSG ) &&
( curr_sev == alarm.severity ))
{
alog ("%s %s %s already at desired (%s) severity level\n",
alog1 ("%s %s %s already at desired (%s) severity level\n",
hostname.c_str(),
identity.c_str(),
instance.c_str(),
@@ -259,18 +259,18 @@ int alarmUtil ( string & hostname,
{
if ( alarm.alarm_state == FM_ALARM_STATE_SET )
{
alog ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
alog1 ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
}
else
{
alog ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
alog1 ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
}
/* Debug Logs */
alog ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text );
alog ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action );
alog ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id );
alog ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n",
alog1 ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text );
alog1 ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action );
alog1 ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id );
alog1 ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n",
hostname.c_str(),
alarm.alarm_state,
alarm.severity,
@@ -322,7 +322,7 @@ int alarmUtil ( string & hostname,
// send_log_message ( mtclogd_ptr, hostname.data(), &__alarmObject.varlog_filename[0],
// &__alarmObject.temp_str[0] );
alog ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id);
alog1 ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id);
nodeUtil_latency_log ( hostname, NODEUTIL_LATENCY_MON_START , 0 );
if ( ( rc = fm_clear_fault_async ( &filter )) != FM_ERR_OK )
{

View File

@@ -3902,7 +3902,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in
{
if ( node_ptr->mtcAlive_clstr == false )
{
alog2 ("%s %s mtcAlive received",
alog ("%s %s mtcAlive received",
node_ptr->hostname.c_str(),
get_iface_name_str(interface));
node_ptr->mtcAlive_clstr = true ;
@@ -3912,7 +3912,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in
{
if ( node_ptr->mtcAlive_mgmnt == false )
{
alog2 ("%s %s mtcAlive received",
alog ("%s %s mtcAlive received",
node_ptr->hostname.c_str(),
get_iface_name_str(interface));
node_ptr->mtcAlive_mgmnt = true ;

View File

@@ -454,7 +454,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
int delay = (((offline_period*offline_threshold)/1000)+3);
int delay = (((offline_period*offline_threshold)/1000)*3);
ilog ("%s searching for offline ; next reboot attempt in %d seconds\n",
node_ptr->hostname.c_str(), delay);
@@ -507,7 +507,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
wlog ("%s Board Management Interface not accessible\n", node_ptr->hostname.c_str());
}
}
int delay = (((offline_period*offline_threshold)/1000)+3);
int delay = (((offline_period*offline_threshold)/1000)*3);
mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, delay );
node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
break ;
@@ -516,7 +516,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr->mtcCmd_timer.ring == true )
{
int delay = (((offline_period*offline_threshold)/1000)+3);
int delay = (((offline_period*offline_threshold)/1000)*3);
/* bmc power control reset by bmc */
rc = bmc_command_recv ( node_ptr );

View File

@@ -3221,6 +3221,7 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_OFFLINE__START:
{
node_ptr->mtcAlive_count = 0 ;
node_ptr->mtcAlive_mgmnt = false ;
node_ptr->mtcAlive_clstr = false ;
node_ptr->offline_log_throttle = 0 ;
@@ -3245,6 +3246,27 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
availStatus_enum_to_str(node_ptr->availStatus).c_str());
this->ctl_mtcAlive_gate ( node_ptr, false ) ;
/**
* Handle the race condition case where the
* mtcAlive was received after the last check
* while in MTC_OFFLINE__WAIT below and here when
* the node_ptr->mtcAlive_<iface> state variables
* are cleared. Need to also clear the
* offline_search_count here as well.
**/
if (( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr ) && node_ptr->offline_search_count )
{
node_ptr->mtcAlive_online = true ;
ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_clstr ? 'Y' : 'n',
node_ptr->offline_search_count,
offline_threshold );
node_ptr->offline_search_count = 0 ; /* reset the count */
}
node_ptr->mtcAlive_mgmnt = false ;
node_ptr->mtcAlive_clstr = false ;
@@ -3299,24 +3321,36 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
offline_threshold );
}
}
else
else if ( node_ptr->offline_search_count )
{
/**
* This algorithm was assuming the node is offline after
* offline_search_count reached offline_threshold count.
*
* Note: The mtcClient sends periodic mtcAlive messages
* until it is shutdown.
* This algorithm also explicitly 'requests' the message.
* The algorithm depends on not receiving the message, even
* when requested for offline_threshold counts 'in a row'.
*
* When shutdown is slowed or delayed, a late mtcAlive
* can trick this FSM into seeing the node as recovered
* when in fact it's still shutting down.
*
* To maintain the intended absence of mtcAlive messages
* count 'in a row', this check resets the search count
* if a mtcAlive is seen during the search.
**/
node_ptr->mtcAlive_online = true ;
if ( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr )
{
ilog_throttled ( node_ptr->offline_log_throttle, 10,
"%s still seeing mtcAlive (%c:%c)\n",
ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_clstr ? 'Y' : 'n');
}
else
{
alog ("%s still seeing mtcAlive (%c:%c)\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_clstr ? 'Y' : 'n');
}
node_ptr->mtcAlive_clstr ? 'Y' : 'n',
node_ptr->offline_search_count,
offline_threshold );
node_ptr->offline_search_count = 0 ; /* reset the search count */
}
if ( node_ptr->offlineStage == MTC_OFFLINE__IDLE )

View File

@@ -84,6 +84,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->unknown_health_reported = false ;
node_ptr->goEnabled_failed_subf = false ;
node_ptr->hbsClient_ready = false ;
/* load worker subfunction alarm state */
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,