Increase Maintenance Heartbeat period from 100 to 1000 msecs
This update changes the default Maintenance Heartbeat period
to 1 second (1000 msecs).
Test Plan:
PASS: Verify full deployment of WRCP AIO DX Plus 1 Worker
PASS: Verify full deployment of 2+4+2 Standard System
PASS: Verify heartbeat period default is 1 second
Regression:
PASS: Verify AIO DX enable handler heartbeat soak
PASS: Verify AIO DX add handler heartbeat soak
PASS: Verify Standard controller enable handler heartbeat soak
PASS: Verify Standard controller add handler heartbeat soak
PASS: Verify Standard worker node enable handler heartbeat soak
PASS: Verify Standard worker node add handler heartbeat soak
PASS: Verify Standard worker node recovery handler heartbeat soak
PASS: Verify heartbeat loss handling with new default heartbeat period
PASS: Verify MNFA handling with the new default heartbeat period
PASS: Verify DOR cases
PASS: - 8 of 8 nodes recover ENABLED (full success path)
PASS: - 4 of 8 nodes require power on - all storage nodes
PASS: - 3 of 8 nodes require graceful recovery - 1 ctrl and 2 computes
PASS: Verify hostwd quorum process failure fault detection and
handling timing is not affected by new default heartbeat period.
PASS: Run WRCP DX Sanity on AIO DX and 2+4+2 Standard system
PASS: Verify silent fault detection handling (FIT)
PASS: Verify hbsAgent logging
Partial-Fix: 2117252
Depends-On: https://review.opendev.org/c/starlingx/config/+/955893
Change-Id: Ic689fff86594793fb7a5693657b592dfb7d01a6c
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
@@ -80,7 +80,7 @@
|
||||
#define MTC_POWERCYCLE_COOLDOWN_DELAY (MTC_MINS_5)
|
||||
#define MTC_POWERCYCLE_BACK2BACK_DELAY (MTC_MINS_5)
|
||||
#define MTC_HEARTBEAT_SOAK_BEFORE_ENABLE (11)
|
||||
#define MTC_HEARTBEAT_SOAK_DURING_ADD (10)
|
||||
#define MTC_HEARTBEAT_SOAK_DURING_ADD (11)
|
||||
#define MTC_REINSTALL_TIMEOUT_DEFAULT (MTC_MINS_40)
|
||||
#define MTC_REINSTALL_TIMEOUT_BMC_ACC (MTC_MINS_10)
|
||||
#define MTC_REINSTALL_TIMEOUT_MIN (MTC_MINS_1)
|
||||
|
||||
@@ -207,8 +207,9 @@ void daemon_exit ( void )
|
||||
#define HBS_SOCKET_NSEC (HBS_SOCKET_MSEC*1000)
|
||||
#define HBS_MIN_PERIOD (100)
|
||||
#define HBS_MAX_PERIOD (1000)
|
||||
#define HBS_VIRT_PERIOD (500)
|
||||
#define HBS_BACKOFF_FACTOR (4) /* period*this during backoff */
|
||||
#define HBS_DEF_PERIOD (HBS_MAX_PERIOD)
|
||||
#define HBS_VIRT_PERIOD (HBS_DEF_PERIOD)
|
||||
#define HBS_BACKOFF_FACTOR (4) /* period during backoff */
|
||||
|
||||
/** Control Config Mask */
|
||||
#define CONFIG_AGENT_MASK (CONFIG_AGENT_HBS_PERIOD |\
|
||||
@@ -2324,6 +2325,12 @@ void daemon_service_run ( void )
|
||||
some_progress = true ;
|
||||
// }
|
||||
hbsInv.pulse_requests[iface] = 0 ;
|
||||
if ( hbsInv.hbs_silent_fault_logged == true )
|
||||
{
|
||||
ilog ("Heartbeat service is now making forward process again");
|
||||
hbsInv.hbs_silent_fault_logged = false ;
|
||||
hbsInv.hbs_silent_fault_detector = 0 ;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( some_progress == false )
|
||||
@@ -2332,10 +2339,14 @@ void daemon_service_run ( void )
|
||||
{
|
||||
hbsInv.hbs_silent_fault_logged = true;
|
||||
|
||||
/* Generate a customer log that indicates the heartbeat service
|
||||
* is not making forward progress. This is an event, not an alarm.
|
||||
* The data of the log indicates the time the issue was detected.
|
||||
* There is no automatic recovery method. If the issue does not
|
||||
* resolve on its own then manually restarting the hbsAgent OR
|
||||
* lock/unlock of the named controller may resolve it. */
|
||||
alarm_warning_log ( hbsInv.my_hostname, SERVICESTATUS_LOG_ID,
|
||||
"maintenance heartbeat service is not making forward progress ; "
|
||||
"recommend process restart by controller switchover "
|
||||
"at earliest convenience" , "service=heartbeat");
|
||||
"maintenance heartbeat service is not making forward progress", "service=heartbeat");
|
||||
}
|
||||
hbsInv.hbs_silent_fault_detector = 0 ;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user