Add SM process heartbeat and status to the hbs cluster

This update implements the mtce hbsAgent side of a
new SM -> hbsAgent heartbeat algorithm for
detecting peer SM process stalls.

This update adds an 'SM Heartbeat status' bit to
the cluster view the hbsAgent injects into its
multicast heartbeat requests.

The peer controller can read this ongoing hbsAgent/SM
heartbeat status through the cluster.

The status bit reads 'ok' while the hbsAgent sees
the SM heartbeat as steady.

The status bit reads 'not ok' once the SM heartbeat
has been lost for longer than 800 msecs.

Change-Id: I0f2079b0fafd7bce0b97ee26d29899659d66f81d
Partial-Fix: #1895350
Co-Authored-By: Bin Qian <Bin.Qian@windriver.com>
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
Eric MacDonald 2020-09-11 20:43:11 -04:00
parent 1350502720
commit 8c81914ea5
8 changed files with 234 additions and 43 deletions

View File

@@ -245,6 +245,7 @@ typedef struct
long msecs ;
} time_delta_type ;
int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta );
int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta );
int gettime ( time_debug_type & nowtime ) ;
unsigned long long gettime_monotonic_nsec ( void );

View File

@@ -26,20 +26,20 @@ unsigned long long gettime_monotonic_nsec ( void )
{
struct timespec ts;
clock_gettime (CLOCK_MONOTONIC, &ts);
return ((unsigned long long) ts.tv_sec) * 1000000000ULL + ts.tv_nsec;
}
int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_type & delta )
{
/* Subtract before from after */
if ((after.ts.tv_sec < before.ts.tv_sec) ||
((after.ts.tv_sec == before.ts.tv_sec) &&
(after.ts.tv_nsec <= before.ts.tv_nsec)))
{
delta.secs = delta.msecs = 1 ;
}
else
{
delta.secs = after.ts.tv_sec - before.ts.tv_sec ;
if (after.ts.tv_nsec < before.ts.tv_nsec)
@@ -51,7 +51,34 @@ int timedelta ( time_debug_type & before , time_debug_type & after, time_delta_t
{
delta.msecs = after.ts.tv_nsec - before.ts.tv_nsec ;
}
/* convert nsec time to msec time */
delta.msecs = (delta.msecs/1000000);
}
return (PASS) ;
}
int timedelta ( struct timespec & before , struct timespec & after, time_delta_type & delta )
{
/* Subtract before from after */
if ((after.tv_sec < before.tv_sec) || ((after.tv_sec == before.tv_sec) && (after.tv_nsec <= before.tv_nsec)))
{
delta.secs = delta.msecs = 0 ;
}
else
{
delta.secs = after.tv_sec - before.tv_sec ;
if (after.tv_nsec < before.tv_nsec)
{
delta.msecs = after.tv_nsec + 1000000000L - before.tv_nsec ;
delta.secs-- ;
}
else
{
delta.msecs = after.tv_nsec - before.tv_nsec ;
}
/* convert nsec time to msec time */
delta.msecs = (delta.msecs/1000000);
}
return (PASS) ;
}
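
A quick sanity check of the nsec-to-msec conversion fixed above (dividing by 1000000, not 1000). This is a minimal standalone sketch; time_delta_type and PASS are stand-ins for the mtce definitions, not the real headers:

    #include <cstdio>
    #include <ctime>

    /* stand-ins for the mtce types used above (assumptions) */
    typedef struct { long secs ; long msecs ; } time_delta_type ;
    #define PASS (0)

    /* same logic as the timespec overload above */
    static int timedelta_demo ( struct timespec & before, struct timespec & after,
                                time_delta_type & delta )
    {
        if ((after.tv_sec < before.tv_sec) ||
            ((after.tv_sec == before.tv_sec) && (after.tv_nsec <= before.tv_nsec)))
        {
            delta.secs = delta.msecs = 0 ;
        }
        else
        {
            delta.secs = after.tv_sec - before.tv_sec ;
            if (after.tv_nsec < before.tv_nsec)
            {
                delta.msecs = after.tv_nsec + 1000000000L - before.tv_nsec ;
                delta.secs-- ;
            }
            else
                delta.msecs = after.tv_nsec - before.tv_nsec ;
            delta.msecs /= 1000000 ; /* nsec -> msec ; /1000 was the original bug */
        }
        return (PASS);
    }

    int main ()
    {
        struct timespec before = { 10, 900000000L } ; /* 10.900 secs */
        struct timespec after  = { 11, 150000000L } ; /* 11.150 secs */
        time_delta_type delta ;
        timedelta_demo ( before, after, delta );
        printf ("delta %ld.%03ld secs\n", delta.secs, delta.msecs); /* 0.250 */
        return 0 ;
    }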

View File

@@ -69,6 +69,19 @@ using namespace std;
#define MAX_LEN 1000
/* Stores the MONOTONIC time the last SM heartbeat was received */
static struct timespec sm_heartbeat_timestamp_last = { 0 , 0 } ;
static struct timespec sm_heartbeat_timestamp_restart = { 0 , 0 } ;
/* Heartbeat metrics */
static int sm_heartbeat_count_b2b_misses = 0 ;
static int sm_heartbeat_count = 0 ;
/* SM heartbeat period definitions */
const int SM_HEARTBEAT_PULSE_INTERVAL_MSEC = 100;
const int SM_HEARTBEAT_PULSE_PERIOD_MSECS = 800;
const int SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC = SM_HEARTBEAT_PULSE_PERIOD_MSECS * 2;
const int SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER =
    SM_HEARTBEAT_PULSE_RECOVER_DURATION_MSEC / SM_HEARTBEAT_PULSE_INTERVAL_MSEC;
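/* Derived from the constants above: a loss is declared after
 * 800 msecs, i.e. 8 consecutive missed 100 msec pulses, and
 * recovery requires 1600 / 100 = 16 back-to-back pulses (two
 * full loss periods) before 'ok' is re-asserted. */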
/* Historical String data for mem_logs */
static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
@@ -1398,6 +1411,7 @@ int daemon_init ( string iface, string nodetype )
*
****************************************************************************/
static int _hbs_sm_handler_log_throttle = 0 ;
static int _hbs_sm_heartbeat_log_throttle = 0 ;
void hbs_sm_handler ( void )
{
#define _MAX_MSG_LEN (80)
@@ -1438,9 +1452,26 @@ void hbs_sm_handler ( void )
if (( service == SUPPERTED_SERVICE ) &&
( request == SUPPORTED_REQUEST ))
{
/* SM heartbeat pulses have a reqid = 0 and do not require a response */
if ( reqid == 0 )
{
time_delta_type delta ;
struct timespec ts = sm_heartbeat_timestamp_last ;
clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
if(sm_heartbeat_count_b2b_misses && sm_heartbeat_timestamp_restart.tv_sec == 0)
{
sm_heartbeat_timestamp_restart = sm_heartbeat_timestamp_last;
}
timedelta (ts, sm_heartbeat_timestamp_last, delta );
sm_heartbeat_count++ ;
ilog_throttled(_hbs_sm_heartbeat_log_throttle, 100, "SM Heartbeat %d (%ld.%03ld secs)",
sm_heartbeat_count, delta.secs, delta.msecs);
}
else
{
/* success path ... */
hbs_cluster_send( hbs_sock.sm_client_sock, reqid, "query" );
}
/* reset log throttle */
_hbs_sm_handler_log_throttle = 0 ;
}
@@ -1476,6 +1507,92 @@ void hbs_sm_handler ( void )
dlog ("... %s", sm_mesg );
}
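
For context, a minimal sketch of the SM-side pulse transmitter the handler above consumes. The port, address, and message layout below are illustrative assumptions, not taken from the SM source; only the reqid = 0 convention and the 100 msec interval come from this change:

    #include <arpa/inet.h>
    #include <cstring>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #define HBS_AGENT_ADDR "127.0.0.1" /* hypothetical */
    #define HBS_AGENT_PORT 2210        /* hypothetical */

    int main ()
    {
        int s = socket ( AF_INET, SOCK_DGRAM, 0 );
        struct sockaddr_in addr ;
        memset ( &addr, 0, sizeof(addr) );
        addr.sin_family = AF_INET ;
        addr.sin_port   = htons(HBS_AGENT_PORT);
        inet_pton ( AF_INET, HBS_AGENT_ADDR, &addr.sin_addr );

        /* reqid:0 marks a heartbeat pulse ; no response is expected */
        const char * pulse =
            "{\"service\":\"heartbeat\",\"request\":\"cluster_info\",\"reqid\":0}" ;

        for ( ;; )
        {
            sendto ( s, pulse, strlen(pulse), 0,
                     (struct sockaddr*)&addr, sizeof(addr) );
            usleep ( 100 * 1000 ); /* SM_HEARTBEAT_PULSE_INTERVAL_MSEC */
        }
    }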
/****************************************************************************
*
* Name : manage_sm_heartbeat
*
* Purpose : Determine if we received an SM heartbeat message within
* the last SM_HEARTBEAT_PULSE_PERIOD_MSECS
*
* Description: Compare the monotonic now time to the monotonic time
* of the last received SM heartbeat pulse.
*
* Returns : True if time delta is less than SM_HEARTBEAT_PULSE_PERIOD_MSECS
* False if time delta is greater
*
***************************************************************************/
bool manage_sm_heartbeat ( void )
{
struct timespec ts ;
time_delta_type delta ;
clock_gettime (CLOCK_MONOTONIC, &ts );
timedelta (sm_heartbeat_timestamp_last, ts, delta );
int64_t delta_in_ms = delta.secs * 1000 + delta.msecs;
bool heartbeat_ok;
if ( delta_in_ms > SM_HEARTBEAT_PULSE_PERIOD_MSECS )
{
sm_heartbeat_count = 0;
if (( ++sm_heartbeat_count_b2b_misses < 20 )||
(!( sm_heartbeat_count_b2b_misses % 100 )))
{
wlog("SM Heartbeat missing since %ld.%03ld secs ago ; HBS Period Misses:%3d ; Running HB Count:%4d",
delta.secs, delta.msecs,
sm_heartbeat_count_b2b_misses,
sm_heartbeat_count);
}
heartbeat_ok = false;
}
else
{
if(sm_heartbeat_count_b2b_misses)
{
int expected_beeps = delta_in_ms / SM_HEARTBEAT_PULSE_INTERVAL_MSEC - 1;
if(sm_heartbeat_count >= expected_beeps)
{
if(sm_heartbeat_count >= SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER)
{
ilog("SM Heartbeat recovered (%d:%dbeeps/%ldms) after %d missing",
sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
delta_in_ms,
sm_heartbeat_count_b2b_misses);
sm_heartbeat_count_b2b_misses = 0;
sm_heartbeat_count = 0;
sm_heartbeat_timestamp_restart.tv_sec = 0;
sm_heartbeat_timestamp_restart.tv_nsec = 0;
heartbeat_ok = true;
}
else
{
ilog("SM Heartbeat recover continue (%d:%dbeeps/%ldms) after %d missing",
sm_heartbeat_count, SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER,
delta_in_ms,
sm_heartbeat_count_b2b_misses);
heartbeat_ok = false; // not good enough to declare recovered yet
}
}
else
{
ilog("SM Heartbeat recover is interrupted after %ldms, missing %d beeps. "
"Counting will restart.",
delta_in_ms, expected_beeps - sm_heartbeat_count);
sm_heartbeat_timestamp_restart = ts;
sm_heartbeat_count = 1;
heartbeat_ok = false; // recover is interrupted by further missing beep
}
}
else
{
if(delta_in_ms >= SM_HEARTBEAT_PULSE_INTERVAL_MSEC * 2)
{
ilog("SM Heartbeat missing for %ldms:%dms. Not yet declare stall.",
delta_in_ms, SM_HEARTBEAT_PULSE_PERIOD_MSECS
);
}
heartbeat_ok = true; // not bad enough to declare heartbeat failed yet
}
}
return heartbeat_ok;
}
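
Note the asymmetry in the logic above: failure is declared after a single 800 msec gap, but recovery is withheld until SM_HEARTBEAT_PULSE_CONTINUE_BEEP_TO_RECOVER (16) consecutive pulses arrive, roughly 1.6 seconds of steady 100 msec heartbeats. Any shortfall during that window restarts the count, so a flapping SM process cannot toggle the cluster status bit on every returning pulse.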
/****************************************************************************
*
* Name : daemon_service_run
@@ -1552,7 +1669,8 @@ void daemon_service_run ( void )
hbs_state_audit ();
}
/* The first audit is run after 30 seconds ; thereafter
 * the audit runs every hour */
mtcTimer_start ( hbsTimer_audit, hbsTimer_handler, MTC_HRS_1 );
}
@@ -1719,6 +1837,9 @@ void daemon_service_run ( void )
sockets_init = true ;
monitor_scheduling ( this_time, prev_time, 0, NODEUTIL_LATENCY_MON_START );
/* Update the SM heartbeat timestamp to the current time */
clock_gettime (CLOCK_MONOTONIC, &sm_heartbeat_timestamp_last );
/* no need for the heartbeat audit in a simplex system */
if ( hbsInv.system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX )
{
@@ -2363,6 +2484,8 @@ void daemon_service_run ( void )
*/
else
{
bool heartbeat_ok = manage_sm_heartbeat();
/* manage vault wrt peer controller */
hbs_cluster_peer();
@@ -2396,9 +2519,14 @@ void daemon_service_run ( void )
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
{
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding, heartbeat_ok );
}
}
/* log cluster throttled */
if (( heartbeat_ok == false ) && ( !( sm_heartbeat_count_b2b_misses % 100 )))
{
hbs_state_audit ( );
}
hbsTimer.ring = false ;
heartbeat_request = true ;
seq_num++ ;

View File

@@ -290,7 +290,8 @@ void hbs_cluster_peer ( void );
*/
void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts,
bool storage_0_responding,
bool sm_heartbeat_ok);
/* Called by the hbsAgent pulse transmitter to append this controllers
* running cluster view in the next multicast pulse request.

View File

@@ -217,7 +217,7 @@ void cluster_list ( void )
void cluster_storage0_state ( bool enabled )
{
if ( (bool)ctrl.cluster.storage0_enabled != enabled )
{
ctrl.cluster.storage0_enabled = enabled ;
ilog ("storage-0 heartbeat state changed to %s",
@@ -305,36 +305,40 @@ void hbs_cluster_add ( string & hostname )
void hbs_cluster_del ( string & hostname )
{
std::list<string>::iterator hostname_ptr ;
bool found = false ;
for ( hostname_ptr = ctrl.monitored_hostname_list.begin();
hostname_ptr != ctrl.monitored_hostname_list.end() ;
hostname_ptr++ )
{
if ( hostname_ptr->compare(hostname) == 0 )
{
found = true ;
break ;
}
}
if ( found == true )
{
ctrl.monitored_hostname_list.remove(hostname) ;
ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();
/* Manage storage-0 state. */
if ( hostname.compare(STORAGE_0) == 0 )
cluster_storage0_state ( false );
/* If we get down to 0 monitored hosts then just start fresh */
if (( ctrl.monitored_hosts ) == 0 )
hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
ilog ("%s deleted from cluster", hostname.c_str());
cluster_list ();
hbs_cluster_change ( hostname + " deleted" );
}
else
{
slog("%s not found in cluster list", hostname.c_str());
}
}
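
The rework above separates the search from the removal, so the list is no longer modified inside the iteration loop and the not-found case now gets logged. The same find-then-remove pattern in minimal standalone form (names here are illustrative):

    #include <algorithm>
    #include <iostream>
    #include <list>
    #include <string>

    int main ()
    {
        std::list<std::string> hosts = { "controller-0", "storage-0", "worker-1" } ;
        std::string hostname = "storage-0" ;

        /* search first ; mutate only after the loop is done */
        auto it = std::find ( hosts.begin(), hosts.end(), hostname );
        if ( it != hosts.end() )
        {
            hosts.erase ( it );
            std::cout << hostname << " deleted from cluster" << std::endl ;
        }
        else
        {
            std::cout << hostname << " not found in cluster list" << std::endl ;
        }
        return 0 ;
    }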
/****************************************************************************
@@ -371,7 +375,7 @@ void hbs_cluster_period_start ( void )
* 4. Maintain a back to back non-responding count for storage-0.
* Once the count reaches the minimum threshold of
* STORAGE_0_NR_THRESHOLD then the specific network history
* is updated to indicate storage-0 is not responding. Once
* storage-0 starts responding again with a single response
* then that network history is updated to indicate storage-0
* is responding.
@@ -389,7 +393,8 @@ void hbs_cluster_period_start ( void )
void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts,
bool storage_0_responding,
bool sm_heartbeat_ok)
{
if ( ctrl.monitored_hosts == 0 )
return ;
@@ -407,6 +412,11 @@ void hbs_cluster_update ( iface_enum iface,
else
return ;
if ( ctrl.monitored_hosts < not_responding_hosts )
{
slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
}
if ( not_responding_hosts )
{
clog ("controller-%d %s enabled:%d not responding:%d",
@@ -454,13 +464,23 @@ void hbs_cluster_update ( iface_enum iface,
}
}
/* update sm heartbeat status */
if ( sm_heartbeat_ok == true )
{
history_ptr->sm_heartbeat_fail = false ;
}
else
{
history_ptr->sm_heartbeat_fail = true ;
}
/* Manage storage-0 status. */
if ( ctrl.cluster.storage0_enabled )
{
/* Handle storage-0 status change from not responding to responding. */
if ( storage_0_responding == true )
{
if ((bool)history_ptr->storage0_responding == false)
{
history_ptr->storage0_responding = true ;
ilog ("controller-%d %s heartbeat ; storage-0 is ok",
@@ -483,7 +503,7 @@ void hbs_cluster_update ( iface_enum iface,
}
/* Handle storage-0 status change from responding to not responding. */
if (( (bool)history_ptr->storage0_responding == true ) &&
( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
{
history_ptr->storage0_responding = false ;
@@ -495,7 +515,7 @@ void hbs_cluster_update ( iface_enum iface,
else
{
/* Typical path for storage-0 disabled or normal non-storage system case */
if ( (bool)history_ptr->storage0_responding == true )
history_ptr->storage0_responding = false ;
/* Handle clearing threshold count when storage-0 is not enabled. */
@@ -507,6 +527,10 @@ void hbs_cluster_update ( iface_enum iface,
if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
history_ptr->entries++ ;
if ( ctrl.monitored_hosts < not_responding_hosts )
{
slog("Monitored Hosts of %d is less than the number of Not Responding hosts %d" , ctrl.monitored_hosts, not_responding_hosts );
}
/* Update the history with this data. */
history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;

View File

@@ -328,17 +328,19 @@ void hbs_cluster_dump ( mtce_hbs_cluster_history_type & history, bool storage0_e
}
if ( storage0_enabled )
{
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s %s",
syslog ( LOG_INFO, "Cluster Vault : C%d %s S:%s SM:%s %s",
history.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
history.storage0_responding ? "y" : "n",
history.sm_heartbeat_fail ? "miss":" ok",
str);
}
else
{
syslog ( LOG_INFO, "Cluster Vault : C%d %s %s",
syslog ( LOG_INFO, "Cluster Vault : C%d %s SM:%s %s",
history.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)history.network).c_str(),
history.sm_heartbeat_fail ? "miss":" ok ",
str);
}
}

View File

@@ -71,7 +71,10 @@ typedef struct
{
unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */
unsigned short network :4 ; /* see mtce_hbs_network_enum */
unsigned short reserved_bits :6 ; /* future - initted to 0 */
unsigned short sm_heartbeat_fail :1 ; /* SM Health
0 = SM Heartbeat OK,
1 = SM Heartbeat Failure */
unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */
unsigned short entries ; /* # of valid values in .entry */
unsigned short entries_max ; /* max size of the entry array */
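
Since the new flag is carved out of reserved_bits (7 -> 6), the bitfield word stays at 16 bits and the on-wire size of the cluster history record is unchanged. A minimal compile-time guard for that invariant; the struct below is a trimmed stand-in for the real mtceHbsCluster.h type, and bitfield packing is ABI-specific (this holds on the GCC/Linux targets mtce builds for):

    typedef struct
    {
        unsigned short controller          :4 ;
        unsigned short network             :4 ;
        unsigned short reserved_bits       :6 ;
        unsigned short sm_heartbeat_fail   :1 ;
        unsigned short storage0_responding :1 ;
    } demo_history_header_type ;

    /* 4+4+6+1+1 = 16 bits ; must still pack into a single unsigned short */
    static_assert ( sizeof(demo_history_header_type) == sizeof(unsigned short),
                    "cluster history header grew ; wire format would change" );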

View File

@@ -30,6 +30,11 @@ barbican_port = 9311 ; The Barbican Port Number
token_refresh_rate = 1200 ; Authentication token refresh rate in seconds.
; A value of zero means no refresh.
; range is 0 or 600-22800
sm_heartbeat_loss_thld = 800 ; number of msecs the SM heartbeat may be missing
; before declaring an SM process heartbeat loss in
; the heartbeat cluster.
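; note: this default matches SM_HEARTBEAT_PULSE_PERIOD_MSECS (800)
; in hbsAgent.cpp, i.e. eight missed 100 msec SM pulses.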
autorecovery_threshold = 3 ; The number of times maintenance will try to
; auto recover a critically failed controller
; while there are no backup controllers to fail