Increase Maintenance Heartbeat period from 100 to 1000 msecs

This update adds support for a default Maintenance Heartbeat period of 1 second (1000 msecs). Test Plan: PASS: Verify full deployment of WRCP AIO DX Plus 1 Worker PASS: Verify full deployment of 2+4+2 Standard System PASS: Verify heartbeat period default is 1 second Regression: PASS: Verify AIO DX enable handler heartbeat soak PASS: Verify AIO DX add handler heartbeat soak PASS: Verify Standard controller enable handler heartbeat soak PASS: Verify Standard controller add handler heartbeat soak PASS: Verify Standard worker node enable handler heartbeat soak PASS: Verify Standard worker node add handler heartbeat soak PASS: Verify Standard worker node recovery handler heartbeat soak PASS: Verify heartbeat loss handling with new default heartbeat period PASS: Verify MNFA handling with the new default heartbeat period PASS: Verify DOR cases PASS: - 8 of 8 nodes recover ENABLED (full success path) PASS: - 4 of 8 nodes require power on - all storage nodes PASS: - 3 of 8 nodes require graceful recovery - 1 ctrl and 2 computes PASS: Verify hostwd quorum process failure fault detection and handling timing is not affected by new default heartbeat period. PASS: Run WRCP DX Sanity on AIO DX and 2+4+2 Standard system PASS: Verify silent fault detection handling (FIT) PASS: Verify hbsAgent logging Partial-Fix: 2117252 Depends-On: https://review.opendev.org/c/starlingx/config/+/955893 Change-Id: Ic689fff86594793fb7a5693657b592dfb7d01a6c Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
2025-07-25 19:41:44 -04:00
parent f231177a6e
commit 90541258a3
2 changed files with 17 additions and 6 deletions
--- a/mtce-common/src/common/nodeTimers.h
+++ b/mtce-common/src/common/nodeTimers.h
@@ -80,7 +80,7 @@
 #define MTC_POWERCYCLE_COOLDOWN_DELAY  (MTC_MINS_5)
 #define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5)
 #define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11)
-#define MTC_HEARTBEAT_SOAK_DURING_ADD    (10)
+#define MTC_HEARTBEAT_SOAK_DURING_ADD    (11)
 #define MTC_REINSTALL_TIMEOUT_DEFAULT  (MTC_MINS_40)
 #define MTC_REINSTALL_TIMEOUT_BMC_ACC  (MTC_MINS_10)
 #define MTC_REINSTALL_TIMEOUT_MIN      (MTC_MINS_1)
--- a/mtce/src/heartbeat/hbsAgent.cpp
+++ b/mtce/src/heartbeat/hbsAgent.cpp
@@ -207,8 +207,9 @@ void daemon_exit ( void )
 #define HBS_SOCKET_NSEC    (HBS_SOCKET_MSEC*1000)
 #define HBS_MIN_PERIOD     (100)
 #define HBS_MAX_PERIOD     (1000)
-#define HBS_VIRT_PERIOD    (500)
-#define HBS_BACKOFF_FACTOR (4) /* period*this during backoff */
+#define HBS_DEF_PERIOD     (HBS_MAX_PERIOD)
+#define HBS_VIRT_PERIOD    (HBS_DEF_PERIOD)
+#define HBS_BACKOFF_FACTOR (4) /* period is multiplied by this during backoff */

 /** Control Config Mask */
 #define CONFIG_AGENT_MASK   (CONFIG_AGENT_HBS_PERIOD      |\
@@ -2324,6 +2325,12 @@ void daemon_service_run ( void )
                        some_progress = true ;
                    // }
                    hbsInv.pulse_requests[iface] = 0 ;
+                    if ( hbsInv.hbs_silent_fault_logged == true )
+                    {
+                        ilog ("Heartbeat service is now making forward process again");
+                        hbsInv.hbs_silent_fault_logged   = false ;
+                        hbsInv.hbs_silent_fault_detector = 0     ;
+                    }
                }
            }
            if ( some_progress == false )
@@ -2332,10 +2339,14 @@ void daemon_service_run ( void )
                {
                    hbsInv.hbs_silent_fault_logged = true;

+                    /* Generate a customer log that indicates the heartbeat service
+                     * is not making forward progress. This is an event, not an alarm.
+                     * The data of the log indicates the time the issue was detected.
+                     * There is no automatic recovery method. If the issue does not
+                     * resolve on its own then manually restarting the hbsAgent or
+                     * locking/unlocking the named controller may resolve it. */
                    alarm_warning_log ( hbsInv.my_hostname, SERVICESTATUS_LOG_ID,
-                            "maintenance heartbeat service is not making forward progress ; "
-                            "recommend process restart by controller switchover "
-                            "at earliest convenience" , "service=heartbeat");
+                            "maintenance heartbeat service is not making forward progress", "service=heartbeat");
                }
                hbsInv.hbs_silent_fault_detector = 0 ;
            }