Debian: Make Mtce offline handler more resilient to slow shutdowns

The current offline handler declares the node offline once
'offline_search_count' reaches the 'offline_threshold' count,
regardless of whether mtcAlive messages were received during
the search window.

The offline algorithm requires that no mtcAlive messages
be seen for the full offline_threshold count.

During a slow shutdown the mtcClient runs longer than it
should, which can lead maintenance to see the node as
recovered before it actually is.

This update manages the offline search counter to ensure that
it only reaches the count threshold after seeing no mtcAlive
messages for the full search count. Any mtcAlive message seen
during the count triggers a count reset.
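
The intended counting behavior can be summarized with the minimal C++
sketch below. It is illustrative only: the OfflineSearch struct, the
audit() helper, the mtcAlive_seen flag and the threshold value are
assumptions made for this sketch, not the actual offline_handler code,
which operates on node_ptr state as shown in the diff further down.

#include <cstdio>

/* Hypothetical stand-in for the per-node offline search state. */
struct OfflineSearch
{
    int  offline_search_count = 0     ;
    int  offline_threshold    = 46    ; /* assumed value, for illustration */
    bool offline              = false ;

    /* Called once per mtcAlive search window. */
    void audit ( bool mtcAlive_seen )
    {
        if ( mtcAlive_seen )
        {
            /* any mtcAlive during the search restarts the count so the
             * node is only declared offline after a full uninterrupted
             * run of missed messages */
            offline_search_count = 0 ;
        }
        else if ( ++offline_search_count >= offline_threshold )
        {
            offline = true ;
        }
    }
};

int main ()
{
    OfflineSearch search ;

    /* a late mtcAlive part way through the search resets the count ... */
    for ( int i = 0 ; i < 10 ; i++ ) search.audit ( false );
    search.audit ( true );

    /* ... so offline is only declared after a further full threshold run */
    for ( int i = 0 ; i < search.offline_threshold ; i++ ) search.audit ( false );

    printf ("offline=%s count=%d\n", search.offline ? "yes" : "no",
                                     search.offline_search_count );
    return 0 ;
}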

This update also:
1. Adjusts the reset retry cadence from 7 to 12 secs
   to prevent unnecessary reboot thrash while the
   current shutdown is still in progress
   (see the delay sketch after this list).
2. Clears the hbsClient ready event at the start of the
   subfunction handler so the heartbeat soak is only
   started after seeing heartbeat client ready events
   that follow the main config.
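
For context on item 1, the reboot retry delay in the cmd_handler is
derived from offline_period and offline_threshold (see the '+3' to '*3'
change in the diff below). The sketch here is illustrative only; the
offline_period and offline_threshold values are assumptions chosen to
reproduce the 7 to 12 second cadence change.

#include <cstdio>

int main ()
{
    /* assumed mtc.conf values, for illustration only */
    int offline_period    = 100 ; /* msecs per mtcAlive search window  */
    int offline_threshold = 40  ; /* consecutive misses before offline */

    /* previous cadence: ((100*40)/1000)+3 = 7 seconds */
    int old_delay = ((offline_period*offline_threshold)/1000)+3 ;

    /* updated cadence: ((100*40)/1000)*3 = 12 seconds */
    int new_delay = ((offline_period*offline_threshold)/1000)*3 ;

    printf ("reboot retry delay: old=%d secs ; new=%d secs\n",
             old_delay, new_delay );
    return 0 ;
}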

Test Plan:

PASS: Debian and CentOS Build and DX install
PASS: Verify search count management
PASS: Verify issue does not occur over lock/unlock soak (100+)
      - where the same test without this update did show the issue.
PASS: Monitor alive logs for behavioral correctness
PASS: Verify recovery reset occurs after expected extended time.

Closes-Bug: 1993656
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Change-Id: If10bb75a1fb01d0ecd3f88524d74c232658ca29e
Eric MacDonald 2022-10-20 15:45:49 +00:00
parent b88d7456b1
commit da398e0c5f
5 changed files with 66 additions and 31 deletions

View File

@@ -151,7 +151,7 @@ EFmAlarmSeverityT alarmUtil_query ( string & hostname,
ENTITY_PREFIX, hostname.data(), instance.data());
}
alog ("entity_instance:%s\n", alarm_filter.entity_instance_id );
alog1 ("entity_instance:%s\n", alarm_filter.entity_instance_id );
if (( rc = fm_get_fault ( &alarm_filter, &alarm_query )) == FM_ERR_OK )
{
dlog ("Found with Severity: %d\n", alarm_query.severity );
@@ -183,7 +183,7 @@ int alarmUtil_query_identity ( string identity, SFmAlarmDataT * alarm_list_ptr,
memset(&alarm_filter, 0, sizeof(alarm_filter));
snprintf ( alarm_filter.alarm_id, FM_MAX_BUFFER_LENGTH, "%s", identity.data());
rc = fm_get_faults_by_id ( &alarm_filter.alarm_id, alarm_list_ptr, &max_alarms );
alog ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc );
alog1 ("%s fm_get_faults_by_id rc = %d\n", alarm_filter.alarm_id, rc );
if ( rc == FM_ERR_OK )
{
return (PASS);
@@ -242,7 +242,7 @@ int alarmUtil ( string & hostname,
if (( alarm.alarm_state != FM_ALARM_STATE_MSG ) &&
( curr_sev == alarm.severity ))
{
alog ("%s %s %s already at desired (%s) severity level\n",
alog1 ("%s %s %s already at desired (%s) severity level\n",
hostname.c_str(),
identity.c_str(),
instance.c_str(),
@@ -259,18 +259,18 @@ int alarmUtil ( string & hostname,
{
if ( alarm.alarm_state == FM_ALARM_STATE_SET )
{
alog ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
alog1 ("%s setting %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
}
else
{
alog ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
alog1 ("%s creating %s %s log\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id );
}
/* Debug Logs */
alog ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text );
alog ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action );
alog ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id );
alog ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n",
alog1 ("%s Alarm Reason: %s\n", hostname.c_str(), alarm.reason_text );
alog1 ("%s Alarm Action: %s\n", hostname.c_str(), alarm.proposed_repair_action );
alog1 ("%s Alarm Ident : %s : %s\n", hostname.c_str(), alarm.entity_type_id, alarm.entity_instance_id );
alog1 ("%s Alarm State : state:%d sev:%d type:%d cause:%d sa:%c supp:%c\n",
hostname.c_str(),
alarm.alarm_state,
alarm.severity,
@@ -322,7 +322,7 @@ int alarmUtil ( string & hostname,
// send_log_message ( mtclogd_ptr, hostname.data(), &__alarmObject.varlog_filename[0],
// &__alarmObject.temp_str[0] );
alog ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id);
alog1 ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id);
nodeUtil_latency_log ( hostname, NODEUTIL_LATENCY_MON_START , 0 );
if ( ( rc = fm_clear_fault_async ( &filter )) != FM_ERR_OK )
{

View File

@@ -3902,7 +3902,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in
{
if ( node_ptr->mtcAlive_clstr == false )
{
alog2 ("%s %s mtcAlive received",
alog ("%s %s mtcAlive received",
node_ptr->hostname.c_str(),
get_iface_name_str(interface));
node_ptr->mtcAlive_clstr = true ;
@@ -3912,7 +3912,7 @@ void nodeLinkClass::set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int in
{
if ( node_ptr->mtcAlive_mgmnt == false )
{
alog2 ("%s %s mtcAlive received",
alog ("%s %s mtcAlive received",
node_ptr->hostname.c_str(),
get_iface_name_str(interface));
node_ptr->mtcAlive_mgmnt = true ;

View File

@@ -454,7 +454,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
int delay = (((offline_period*offline_threshold)/1000)+3);
int delay = (((offline_period*offline_threshold)/1000)*3);
ilog ("%s searching for offline ; next reboot attempt in %d seconds\n",
node_ptr->hostname.c_str(), delay);
@@ -507,7 +507,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
wlog ("%s Board Management Interface not accessible\n", node_ptr->hostname.c_str());
}
}
int delay = (((offline_period*offline_threshold)/1000)+3);
int delay = (((offline_period*offline_threshold)/1000)*3);
mtcTimer_start ( node_ptr->mtcCmd_timer, mtcTimer_handler, delay );
node_ptr->mtcCmd_work_fifo_ptr->stage = MTC_CMD_STAGE__OFFLINE_CHECK ;
break ;
@@ -516,7 +516,7 @@ int nodeLinkClass::cmd_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( node_ptr->mtcCmd_timer.ring == true )
{
int delay = (((offline_period*offline_threshold)/1000)+3);
int delay = (((offline_period*offline_threshold)/1000)*3);
/* bmc power control reset by bmc */
rc = bmc_command_recv ( node_ptr );

View File

@@ -3221,6 +3221,7 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
}
case MTC_OFFLINE__START:
{
node_ptr->mtcAlive_count = 0 ;
node_ptr->mtcAlive_mgmnt = false ;
node_ptr->mtcAlive_clstr = false ;
node_ptr->offline_log_throttle = 0 ;
@@ -3245,6 +3246,27 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
availStatus_enum_to_str(node_ptr->availStatus).c_str());
this->ctl_mtcAlive_gate ( node_ptr, false ) ;
/**
* Handle the race condition case where the
* mtcAlive was received after the last check
* while in MTC_OFFLINE__WAIT below and here when
* the node_ptr->mtcAlive_<iface> state variables
* are cleared. Need to also clear the
* offline_search_count here as well.
**/
if (( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr ) && node_ptr->offline_search_count )
{
node_ptr->mtcAlive_online = true ;
ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_clstr ? 'Y' : 'n',
node_ptr->offline_search_count,
offline_threshold );
node_ptr->offline_search_count = 0 ; /* reset the count */
}
node_ptr->mtcAlive_mgmnt = false ;
node_ptr->mtcAlive_clstr = false ;
@@ -3299,24 +3321,36 @@ int nodeLinkClass::offline_handler ( struct nodeLinkClass::node * node_ptr )
offline_threshold );
}
}
else
else if ( node_ptr->offline_search_count )
{
/**
* This algorithm was assuming the node is offline after
* offline_search_count reached offline_threshold count.
*
* Note: The mtcClient sends periodic mtcAlive messages
* until it is shutdown.
* This algorithm also explicitly 'requests' the message.
* The algorithm depends on not receiving the message, even
* when requested for offline_threshold counts 'in a row'.
*
* When shutdown is slowed or delayed, a late mtcAlive
* can trick this FSM into seeing the node as recovered
* when in fact it's still shutting down.
*
* To maintain the intended absence of mtcAlive messages
* count 'in a row', this check resets the search count
* if a mtcAlive is seen during the search.
**/
node_ptr->mtcAlive_online = true ;
if ( node_ptr->mtcAlive_mgmnt || node_ptr->mtcAlive_clstr )
{
ilog_throttled ( node_ptr->offline_log_throttle, 10,
"%s still seeing mtcAlive (%c:%c)\n",
ilog ("%s still seeing mtcAlive (%d) (%c:%c) ; reset offline_search_count=%d of %d\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_count,
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_clstr ? 'Y' : 'n');
}
else
{
alog ("%s still seeing mtcAlive (%c:%c)\n",
node_ptr->hostname.c_str(),
node_ptr->mtcAlive_mgmnt ? 'Y' : 'n',
node_ptr->mtcAlive_clstr ? 'Y' : 'n');
}
node_ptr->mtcAlive_clstr ? 'Y' : 'n',
node_ptr->offline_search_count,
offline_threshold );
node_ptr->offline_search_count = 0 ; /* reset the search count */
}
if ( node_ptr->offlineStage == MTC_OFFLINE__IDLE )

View File

@@ -84,6 +84,7 @@ int nodeLinkClass::enable_subf_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->unknown_health_reported = false ;
node_ptr->goEnabled_failed_subf = false ;
node_ptr->hbsClient_ready = false ;
/* load worker subfunction alarm state */
EFmAlarmSeverityT sev = mtcAlarm_state ( node_ptr->hostname,