Debounce SM Unhealthy state notification

Maintenance doesn't give an unhealthy Service Management (SM)
process enough time to attempt a self recovery before
failing and rebooting the controller it runs on.

This update adds a small debounce to delay mtce's
reaction to SM's unhealthy state notification.

Only if the failure state persists for longer than 6
back-to-back mtcAlive messages, approximately 30 secs,
will maintenance fail and recover the node through
reboot.

Change-Id: Ica1b0925f0c767001d80e6a3b9928a6761b0c00f
Closes-Bug: #1892789
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-08-24 19:31:42 -04:00
parent b4e935a631
commit 08c8c65795

View File

@ -707,6 +707,8 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
****************************************************************************/
int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int interface )
{
static int _sm_unhealthy_debounce_counter [MAX_IFACES] = {0,0} ;
struct timespec ts ;
clock_gettime (CLOCK_MONOTONIC, &ts );
@ -765,8 +767,29 @@ int create_mtcAlive_msg ( mtc_message_type & msg, int cmd, string identity, int
if ( daemon_is_file_present ( SMGMT_DEGRADED_FILE ) )
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_DEGRADED ;
if ( daemon_is_file_present ( SMGMT_UNHEALTHY_FILE ) )
{
/* debounce 6 mtcAlive messages = ~25-30 second debounce */
#define MAX_SM_UNHEALTHY_DEBOUNCE (6)
if ( ++_sm_unhealthy_debounce_counter[interface] > MAX_SM_UNHEALTHY_DEBOUNCE )
{
wlog("SM Unhealthy flag set (%s)",
get_iface_name_str(interface));
msg.parm[MTC_PARM_FLAGS_IDX] |= MTC_FLAG__SM_UNHEALTHY ;
}
else
{
wlog("SM Unhealthy debounce %d of %d (%s)",
_sm_unhealthy_debounce_counter[interface],
MAX_SM_UNHEALTHY_DEBOUNCE,
get_iface_name_str(interface));
}
}
else
{
_sm_unhealthy_debounce_counter[interface] = 0 ;
}
/* add the interface and sequence number to the mtcAlive message */
identity.append ( ",\"interface\":\"");