From df50847580735b84dd68c56ea16b24bfc6f64f5c Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Tue, 15 Oct 2019 10:30:14 -0400 Subject: [PATCH] Ensure hbsClient ready event is cleared over a reboot. A host sometimes (rarely) fails heartbeat immediately following unlock. The hbsClient sends its ready event every 5 seconds. Mtce uses this event message as a clue that the target host is ready to start heartbeat following Graceful Recovery or in this case the Enable sequence. This update fixes a potential race condition where the hbsClient ready event snuck through immediately following the unlock reboot. This tricked mtc into starting heartbeat too early following the online event that follows a reboot which led to a heartbeat failure. Test Plan: PASS: compute system install PASS: standby controller lock/unlock soak (25 loops) PASS: 2 compute async locked/unlock soak (50 loops each) Regression: PASS: inservice heartbeat failure detection and handling Change-Id: I21699dbb2f0ab7355a9384d78b47a1fd1cea496d Closes-Bug: 1847656 Signed-off-by: Eric MacDonald --- mtce/src/maintenance/mtcNodeHdlrs.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 4d96f826..40e70052 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -1042,8 +1042,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) node_ptr->goEnabled = false ; node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ; - clear_service_readies ( node_ptr ); - /* Set uptime to zero in mtce and in the database */ node_ptr->uptime_save = 0 ; set_uptime ( node_ptr, 0 , false ); @@ -1083,6 +1081,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) * have come in while we were purging */ node_ptr->mtcAlive_online = false ; node_ptr->mtcAlive_offline = true ; + clear_service_readies ( node_ptr ); break ; } case MTC_ENABLE__MTCALIVE_WAIT: 
@@ -1090,6 +1089,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) /* search for the mtc alive message */ if ( node_ptr->mtcAlive_online == true ) { + node_ptr->hbsClient_ready = false ; mtcTimer_reset ( node_ptr->mtcTimer ); /* Check to see if the host is/got configured correctly */