Debounce SM Unhealthy state notification

Maintenance doesn't give an unhealthy Service Management (SM) process enough time to attempt a self recovery before failing and rebooting the controller it runs on. This update adds a small debounce to delay mtce's reaction to SM's unhealthy state notification. Only if the failure state persists for longer than 6 back-to-back mtcAlive messages, approximately 30 secs, will maintenance fail and recover the node through reboot.

Change-Id: Ica1b0925f0c767001d80e6a3b9928a6761b0c00f
Closes-Bug: #1892789
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
2020-08-24 19:31:42 -04:00 · 2020-08-24 19:31:42 -04:00 · 08c8c65795
commit 08c8c65795
parent b4e935a631
1 changed files with 71 additions and 48 deletions
--- a/mtce/src/maintenance/mtcCompMsg.cpp
+++ b/mtce/src/maintenance/mtcCompMsg.cpp
@ -707,6 +707,8 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
 ****************************************************************************/
 int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int interface )
 {
+    static int _sm_unhealthy_debounce_counter [MAX_IFACES] = {0,0} ;
+
    struct timespec ts ;
    clock_gettime (CLOCK_MONOTONIC, &ts );

@ -765,8 +767,29 @@ int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int

    if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) )
        msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ;
+
    if ( daemon_is_file_present ( SMGMT_UNHEALTHY_FILE ) )
+    {
+        /* debounce 6 mtcAlive messages = ~25-30 second debounce */
+        #define MAX_SM_UNHEALTHY_DEBOUNCE (6)
+        if ( ++_sm_unhealthy_debounce_counter[interface] > MAX_SM_UNHEALTHY_DEBOUNCE )
+        {
+            wlog("SM Unhealthy flag set (%s)",
+                  get_iface_name_str(interface));
            msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_UNHEALTHY ;
+        }
+        else
+        {
+            wlog("SM Unhealthy debounce %d of %d (%s)",
+                  _sm_unhealthy_debounce_counter[interface],
+                  MAX_SM_UNHEALTHY_DEBOUNCE,
+                  get_iface_name_str(interface));
+        }
+    }
+    else
+    {
+        _sm_unhealthy_debounce_counter[interface] = 0 ;
+    }

    /* add the interface and sequence number to the mtcAlice message */
    identity.append ( ",\"interface\":\"");