From 5ab03b5222f223e93ee299ed91a70a2df95647c4 Mon Sep 17 00:00:00 2001
From: Eric MacDonald <eric.macdonald@windriver.com>
Date: Fri, 8 Jan 2021 09:59:24 -0500
Subject: [PATCH] Mtce heartbeat cluster state change notification improvement

The current heartbeat cluster state change notification
needs to be sent when heartbeat pulses begin to be missed
rather than only after the host has reached the Heartbeat
Loss threshold. This buys SM more time, almost a full
second, and in doing so provides more accurate data for
it to make its SM heartbeat failure handling decisions.

This update also begins sending maintenance heartbeat
cluster state change notifications just before the next
multicast pulse request but after the cluster vault is
updated from the last pulse period. This ensures that
SM gets the most up-to-date cluster information.

This update also changes the hbsAgent's service file
to be ordered after the local hbsClient
(After=hbsClient.service). Because systemd stops units
in the reverse of their start order, the hbsAgent now
shuts down earlier over a graceful reboot, thereby
preventing the hbsAgent from continuing to report a
healthy response to the inactive controller during
active controller shutdown.

This way the inactive SM sees the failed active
controller when it queries the cluster in its
fail-pending state resulting in an inactive SM
take-over rather than stand-down.

Additional hbsAgent service file changes prevent systemd
from auto-recovering a failed hbsAgent process, as it is
monitored and managed by pmond, and fix the ExecStop
command line, which invoked 'start' instead of 'stop'.

Test Plan:

PASS: Verify active controller graceful reboot.
      Standby controller takes over rather than shutting down
      - 30 of 30 iterations
PASS: Verify active controller forced reboot
PASS: Verify enabled standby controller graceful reboot
PASS: Verify Standard System install
PASS: Verify AIO DX system install

Regression:

PASS: Verify SM Uncontrolled Swact if active
      controller Mgmnt link drops.
PASS: Verify handling of downed cluster interface in
      - AIO DX (fail) and Standard (degrade) system
PASS: Verify no coredumps
PASS: Verify update as a patch

Change-Id: I6869631e091eb28a3cbb6f15d9a8ccd939c54410
Closes-Bug: 1906556
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
---
 mtce-control/src/scripts/hbsAgent.service | 13 +----
 mtce/src/common/nodeClass.cpp             | 65 ++++++++++-------------
 mtce/src/heartbeat/hbsAgent.cpp           |  1 +
 mtce/src/heartbeat/hbsBase.h              |  6 ++-
 mtce/src/heartbeat/hbsCluster.cpp         | 47 ++++++++++++++--
 5 files changed, 78 insertions(+), 54 deletions(-)

diff --git a/mtce-control/src/scripts/hbsAgent.service b/mtce-control/src/scripts/hbsAgent.service
index 7e111707..bd4bcd63 100644
--- a/mtce-control/src/scripts/hbsAgent.service
+++ b/mtce-control/src/scripts/hbsAgent.service
@@ -1,22 +1,13 @@
 [Unit]
 Description=StarlingX Maintenance Heartbeat Agent
-After=network.target syslog.service config.service
+After=hbsClient.service
 Before=pmon.service
 
 [Service]
 Type=forking
 ExecStart=/etc/rc.d/init.d/hbsAgent start
-ExecStop=/etc/rc.d/init.d/hbsAgent start
+ExecStop=/etc/rc.d/init.d/hbsAgent stop
 PIDFile=/var/run/hbsAgent.pid
-KillMode=process
-SendSIGKILL=no
-
-# Process recovery is handled by pmond if its running.
-# Delay 10 seconds to give pmond a chance to recover
-# before systemd kicks in to do it as a backup plan.
-Restart=always
-RestartSec=10
 
 [Install]
 WantedBy=multi-user.target
-
diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp
index ae43fe64..3a6cef81 100755
--- a/mtce/src/common/nodeClass.cpp
+++ b/mtce/src/common/nodeClass.cpp
@@ -8511,7 +8511,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
 
 
 
-#define HBS_LOSS_REPORT_THROTTLE (100)
+#define HBS_LOSS_REPORT_THROTTLE (100000)
 int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
 {
     int lost = 0  ;
@@ -8551,6 +8551,13 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
 
             if ( pulse_ptr->b2b_misses_count[iface] > 1 )
             {
+                if ( pulse_ptr->b2b_misses_count[iface] < hbs_failure_threshold )
+                {
+                    hbs_cluster_change ( pulse_ptr->hostname + " " +
+                            get_iface_name_str(iface) +
+                            " heartbeat miss " +
+                            itos(pulse_ptr->b2b_misses_count[iface]));
+                }
                 if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
                 {
                     if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
@@ -8657,57 +8664,43 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
                 }
             }
 
-            /* Turn the cluster-host heartbeat loss into a degrade only
-             * condition if the clstr_degrade_only flag is set */
-            if (( iface == CLSTR_IFACE ) &&
-                ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
-                ( clstr_degrade_only == true ))
-            {
-                /* Only print the log at the threshold boundary */
-                if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
-                {
-                    if ( this->active_controller )
-                    {
-                        manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
-                    }
-
-                    wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
-                               pulse_ptr->hostname.c_str(),
-                               get_iface_name_str(iface) );
-                    hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
-                }
-            }
-
             /* Turn the clstr heartbeat loss into a degrade only
              * condition for inactive controller on normal system. */
-            else if (( iface == CLSTR_IFACE ) &&
-                     ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold ) &&
-                     ( this->system_type == SYSTEM_TYPE__NORMAL ) &&
-                     (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
+            if (( iface == CLSTR_IFACE ) &&
+                ((( this->system_type == SYSTEM_TYPE__NORMAL ) &&
+                 (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) ||
+                 ( clstr_degrade_only == true )))
             {
                 /* Only print the log at the threshold boundary */
-                if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
+                if ( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE == hbs_failure_threshold )
                 {
                     if ( this->active_controller )
                     {
                         manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
                     }
-                    wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
+                    wlog ( "%s %s *** Heartbeat Loss *** (degrade only due to %s)\n",
                                pulse_ptr->hostname.c_str(),
-                               get_iface_name_str(iface));
-                    hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
+                               get_iface_name_str(iface),
+                               clstr_degrade_only ? "config option" : "system type");
+                    hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
                 }
             }
 
             else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
+            // else if ( pulse_ptr->hbs_failure[iface] == false )
             {
-                elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
-                                                        get_iface_name_str(iface) );
+                elog ("%s %s *** Heartbeat Loss *** (b2b_misses:0x%x)\n",
+                          pulse_ptr->hostname.c_str(),
+                          get_iface_name_str(iface),
+                          pulse_ptr->b2b_misses_count[iface]);
+                hbs_cluster_change ( pulse_ptr->hostname + " " + get_iface_name_str(iface) + " heartbeat loss" );
 
                 if ( this->active_controller )
                 {
-                    manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
-
+                    if ( pulse_ptr->hbs_failure[iface] == false )
+                    {
+                        manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
+                    }
                     /* report this host as failed */
                     if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
                     {
@@ -8715,10 +8708,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
                     }
                 }
                 else
-                {
                     pulse_ptr->hbs_failure[iface] = true ;
-                }
-                hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
+
                 pulse_ptr->hbs_failure_count[iface]++ ;
             }
             if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp
index c8cb0305..bfd13f20 100644
--- a/mtce/src/heartbeat/hbsAgent.cpp
+++ b/mtce/src/heartbeat/hbsAgent.cpp
@@ -2368,6 +2368,7 @@ void daemon_service_run ( void )
                     arrival_histogram[iface] = "" ;
                     unexpected_pulse_list[iface] = "" ;
 
+
                     rc = hbs_pulse_request ( (iface_enum)iface, seq_num, ri, rri );
                     if ( rc != 0 )
                     {
diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h
index bfa8f1d1..b9f067f7 100755
--- a/mtce/src/heartbeat/hbsBase.h
+++ b/mtce/src/heartbeat/hbsBase.h
@@ -326,7 +326,7 @@ void hbs_cluster_log  ( string & hostname, mtce_hbs_cluster_type & cluster, stri
 void hbs_sm_handler ( void );
 
 /* send the cluster vault to SM */
-void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
+int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason );
 
 /* copy cluster data from src to dst */
 void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
@@ -338,6 +338,10 @@ void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
 /* Heartbeat service state audit */
 void hbs_state_audit ( void );
 
+/* Send state change message to SM if there has been a
+ * state change in the last period */
+void hbs_cluster_change_notifier ( void );
+
 /**
  * @} hbs_base
  */
diff --git a/mtce/src/heartbeat/hbsCluster.cpp b/mtce/src/heartbeat/hbsCluster.cpp
index 780fa8e3..1f82a4e3 100644
--- a/mtce/src/heartbeat/hbsCluster.cpp
+++ b/mtce/src/heartbeat/hbsCluster.cpp
@@ -69,6 +69,8 @@ typedef struct
 
     msgClassSock * sm_socket_ptr ;
 
+    string cluster_change_reason ;
+
 } hbs_cluster_ctrl_type ;
 
 /* Cluster control structire construct allocation. */
@@ -122,6 +124,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
     {
         ctrl.sm_socket_ptr = sm_socket_ptr ;
     }
+    ctrl.cluster_change_reason = "";
+
     ctrl.log_throttle = 0 ;
 }
 
@@ -173,7 +177,30 @@ void hbs_cluster_nums ( unsigned short this_controller,
 
 void hbs_cluster_change ( string cluster_change_reason )
 {
-    hbs_cluster_send( ctrl.sm_socket_ptr, 0, cluster_change_reason );
+    ilog ("reason: %s", cluster_change_reason.c_str());
+    if ( ctrl.cluster_change_reason.empty() )
+        ctrl.cluster_change_reason = cluster_change_reason ;
+    else
+        ctrl.cluster_change_reason.append("," + cluster_change_reason) ;
+}
+
+/****************************************************************************
+ *
+ * Name        : hbs_cluster_change_notifier
+ *
+ * Description : Send SM the cluster info if there has been a state change.
+ *
+ ***************************************************************************/
+void hbs_cluster_change_notifier ( void )
+{
+    if ( ! ctrl.cluster_change_reason.empty () )
+    {
+        if ( hbs_cluster_send( ctrl.sm_socket_ptr, 0,
+                               ctrl.cluster_change_reason ) == PASS )
+        {
+            ctrl.cluster_change_reason.clear();
+        }
+    }
 }
 
 /****************************************************************************
@@ -444,6 +471,7 @@ void hbs_cluster_update ( iface_enum iface,
             wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
                              "Unable to store history beyond %d ",
                              ctrl.cluster.histories );
+            hbs_cluster_change_notifier ();
             return ;
         }
         else
@@ -544,6 +572,8 @@ void hbs_cluster_update ( iface_enum iface,
     else
         history_ptr->oldest_entry_index++ ;
 
+    hbs_cluster_change_notifier ();
+
     /* clear the log throttle if we are updating history ok. */
     ctrl.log_throttle = 0 ;
 }
@@ -647,12 +677,12 @@ unsigned short hbs_cluster_unused_bytes ( void )
  *
  ***************************************************************************/
 
-void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
+int hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason )
 {
+    int rc = FAIL_SOCKET_SENDTO ;
     ctrl.cluster.reqid = (unsigned short)reqid ;
     if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
     {
-        ilog ("cluster state notification Reason: %s", reason.c_str());
         int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
         int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
         if ( bytes <= 0 )
@@ -660,12 +690,19 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
              elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
                     bytes , errno, strerror(errno));
         }
-        hbs_cluster_dump ( ctrl.cluster );
+        else
+        {
+            /* limit the string length */
+            ilog ("reason: %s", reason.substr(0,80).c_str());
+            hbs_cluster_dump ( ctrl.cluster );
+            rc = PASS ;
+        }
     }
     else
     {
         wlog ("cannot send cluster info due to socket error");
     }
+    return(rc);
 }
 
 /****************************************************************************
@@ -689,7 +726,7 @@ void hbs_history_save ( string hostname,
         {
             if ( hbs_cluster_cmp( sample, ctrl.cluster.history[h] ) )
             {
-                 hbs_cluster_change ("peer controller cluster event " +
+                 hbs_cluster_change ("peer cluster delta " +
                  hbs_cluster_network_name((mtce_hbs_network_enum)sample.network));
             }