Merge "Mtce: Improve robustness of heartbeat Loss reporting"

This commit is contained in:
Zuul 2018-12-21 14:59:11 +00:00 committed by Gerrit Code Review
commit 7512c6b105
5 changed files with 104 additions and 152 deletions

View File

@ -8249,7 +8249,7 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
#define HBS_LOSS_REPORT_THROTTLE (100)
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding ) int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
{ {
int lost = 0 ; int lost = 0 ;
@ -8408,15 +8408,15 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
( infra_degrade_only == true )) ( infra_degrade_only == true ))
{ {
/* Only print the log at the threshold boundary */ /* Only print the log at the threshold boundary */
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) if (( pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{ {
if ( this->active_controller ) if ( this->active_controller )
{ {
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
} }
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500, wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(), pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) ); get_iface_name_str(iface) );
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
} }
@ -8430,21 +8430,20 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
(( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE )) (( pulse_ptr->nodetype & CONTROLLER_TYPE) == CONTROLLER_TYPE ))
{ {
/* Only print the log at the threshold boundary */ /* Only print the log at the threshold boundary */
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) if ( (pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
{ {
if ( this->active_controller ) if ( this->active_controller )
{ {
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface ); manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
} }
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500, wlog ( "%s %s *** Heartbeat Loss *** (degrade only)\n",
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(), pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) ); get_iface_name_str(iface));
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" ); hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
} }
} }
else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) && else if ((pulse_ptr->b2b_misses_count[iface]%HBS_LOSS_REPORT_THROTTLE) == hbs_failure_threshold )
( pulse_ptr->hbs_failure[iface] == false ))
{ {
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(), elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface) ); get_iface_name_str(iface) );

View File

@ -1183,11 +1183,11 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface )
memset (&event, 0 , sizeof(mtc_message_type)); memset (&event, 0 , sizeof(mtc_message_type));
if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS ) if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS )
{ {
// daemon_dump_membuf_banner (); if ( hbs_config.debug_state == 2 )
{
hbsInv.print_node_info (); hbsInv.print_node_info ();
}
hbs_cluster_log ( hbsInv.my_hostname, "event", true ); hbs_cluster_log ( hbsInv.my_hostname, "event", true );
// daemon_dump_membuf ();
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header()); snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header());
} }
else if ( event_cmd == MTC_EVENT_LOOPBACK ) else if ( event_cmd == MTC_EVENT_LOOPBACK )
@ -1504,9 +1504,6 @@ void daemon_service_run ( void )
daemon_exit (); daemon_exit ();
} }
/* set this controller as provisioned */
hbs_manage_controller_state ( hbsInv.my_hostname , true );
/* Run heartbeat service forever or until stop condition */ /* Run heartbeat service forever or until stop condition */
for ( hbsTimer.ring = false , hbsTimer_audit.ring = false ; ; ) for ( hbsTimer.ring = false , hbsTimer_audit.ring = false ; ; )
{ {
@ -1518,7 +1515,6 @@ void daemon_service_run ( void )
if ( sockets_init == true ) if ( sockets_init == true )
{ {
hbsInv.print_node_info(); hbsInv.print_node_info();
hbs_state_audit (); hbs_state_audit ();
} }
@ -1725,7 +1721,7 @@ void daemon_service_run ( void )
{ {
hbsInv.hbs_disabled = true ; hbsInv.hbs_disabled = true ;
hbsInv.hbs_state_change = true ; hbsInv.hbs_state_change = true ;
hbs_cluster_lock(); hbs_controller_lock ();
ilog ("heartbeat service going disabled (locked)"); ilog ("heartbeat service going disabled (locked)");
/* force the throttle 'still disabled' log to wait for /* force the throttle 'still disabled' log to wait for
@ -1904,17 +1900,12 @@ void daemon_service_run ( void )
} }
else if ( msg.cmd == MTC_CMD_STOP_HOST ) else if ( msg.cmd == MTC_CMD_STOP_HOST )
{ {
if ( hostname == hbsInv.my_hostname ) if ( hostname != hbsInv.my_hostname )
{
ilog ("%s heartbeat service disabled by stop command",
hostname.c_str());
hbs_manage_controller_state( hostname, false );
}
else
{ {
hbsInv.mon_host ( hostname, false, true ); hbsInv.mon_host ( hostname, false, true );
hbs_cluster_del ( hostname ); hbs_cluster_del ( hostname );
ilog ("%s heartbeat service disabled by stop command",
hostname.c_str());
} }
} }
else if ( msg.cmd == MTC_CMD_START_HOST ) else if ( msg.cmd == MTC_CMD_START_HOST )
@ -2309,8 +2300,11 @@ void daemon_service_run ( void )
*/ */
bool storage_0_responding = true ; bool storage_0_responding = true ;
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding); int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
if ( !hbs_ctrl.locked && !hbsInv.hbs_disabled )
{
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding); hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
} }
}
hbsTimer.ring = false ; hbsTimer.ring = false ;
heartbeat_request = true ; heartbeat_request = true ;
seq_num++ ; seq_num++ ;

View File

@ -250,8 +250,9 @@ void hbs_cluster_change ( string cluster_change_reason );
* Automatically adjusts the numbers in the cluster vault. */ * Automatically adjusts the numbers in the cluster vault. */
void hbs_cluster_add ( string & hostname ); void hbs_cluster_add ( string & hostname );
void hbs_cluster_del ( string & hostname ); void hbs_cluster_del ( string & hostname );
void hbs_cluster_rem ( unsigned short controller );
void hbs_cluster_lock ( void ); /* do actions when this controller is detected as locked */
void hbs_controller_lock ( void );
/* Do stuff in preparation for another pulse period start */ /* Do stuff in preparation for another pulse period start */
void hbs_cluster_period_start ( void ); void hbs_cluster_period_start ( void );
@ -263,9 +264,6 @@ void hbs_cluster_storage0_status ( iface_enum iface , bool responding );
int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1, int hbs_cluster_cmp( mtce_hbs_cluster_history_type h1,
mtce_hbs_cluster_history_type h2 ); mtce_hbs_cluster_history_type h2 );
/* Manage the enabled state of the controllers */
void hbs_manage_controller_state ( string & hostname, bool enabled );
/* Set the number of monitored hosts and this controller's /* Set the number of monitored hosts and this controller's
* number in the cluster vault. */ * number in the cluster vault. */
void hbs_cluster_nums ( unsigned short this_controller, void hbs_cluster_nums ( unsigned short this_controller,

View File

@ -750,6 +750,7 @@ static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ;
static int rx_error_count[MAX_IFACES] = {0,0} ; static int rx_error_count[MAX_IFACES] = {0,0} ;
static int tx_error_count[MAX_IFACES] = {0,0} ; static int tx_error_count[MAX_IFACES] = {0,0} ;
static int missing_history_count[MAX_IFACES] = {0,0} ;
#define ERROR_LOG_THRESHOLD (200) #define ERROR_LOG_THRESHOLD (200)
@ -961,18 +962,21 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
} }
else else
{ {
int debug_state = daemon_get_cfg_ptr()->debug_state ;
clog ("controller-%d %s cluster info added to response (%d)", clog ("controller-%d %s cluster info added to response (%d)",
controller?0:1, controller?0:1,
get_iface_name_str(iface), missed_controller_summary_tracker[controller?0:1] ); get_iface_name_str(iface),
missed_controller_summary_tracker[controller?0:1] );
/* Now copy the other controller's cached cluster info into /* Now copy the other controller's cached cluster info into
* this controller's response */ * this controller's response */
hbs_cluster_copy ( controller_cluster_cache[controller?0:1], hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
hbs_sock.rx_mesg[iface].cluster ); hbs_sock.rx_mesg[iface].cluster );
if ( daemon_get_cfg_ptr()->debug_state & 4 )
{
string dump_banner = "" ; string dump_banner = "" ;
if ( debug_state )
{
dump_banner.append("controller-") ; dump_banner.append("controller-") ;
dump_banner.append(itos(controller?0:1)); dump_banner.append(itos(controller?0:1));
dump_banner.append(" cluster info from cache injected into controller-"); dump_banner.append(" cluster info from cache injected into controller-");
@ -980,11 +984,31 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
dump_banner.append(":"); dump_banner.append(":");
dump_banner.append(get_iface_name_str(iface)); dump_banner.append(get_iface_name_str(iface));
dump_banner.append(" pulse response"); dump_banner.append(" pulse response");
}
if ( debug_state & 4 )
{
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner ); hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner );
} }
else
{
clog ("%s", dump_banner.c_str());
} }
} }
} }
if (missing_history_count[iface])
{
ilog ("controller-%d %s providing cluster history",
controller, get_iface_name_str(iface));
missing_history_count[iface] = 0 ;
}
}
else
{
wlog_throttled ( missing_history_count[iface], 5000,
"controller-%d %s proividing no cluster history",
controller, get_iface_name_str(iface));
}
} }
/* Cluster Data management end */ /* Cluster Data management end */

View File

@ -30,17 +30,17 @@ typedef struct
/* Contains the controller number (0 or 1) for this controller. */ /* Contains the controller number (0 or 1) for this controller. */
unsigned short this_controller ; unsigned short this_controller ;
/* Preserves which controllers are enabled. */ /* Used to manage the cluster based on this and peer controller state */
bool controller_0_enabled ;
bool controller_1_enabled ;
#ifdef THREE_CONTROLLER_SYSTEM
bool controller_2_enabled ;
#endif
bool peer_controller_enabled ; bool peer_controller_enabled ;
/* Used to prevent log flooding in presence of back to back errors. */ /* Used to prevent log flooding in presence of back to back errors. */
unsigned int log_throttle ; unsigned int log_throttle ;
/* Used to log when
* - peer history goes missing (false -> true change)
* - peer history starts being received ( true -> false change ) */
bool peer_history_missing ;
/* Used to threshold storage-0 not responding state */ /* Used to threshold storage-0 not responding state */
unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS]; unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS];
@ -130,12 +130,8 @@ void hbs_cluster_init ( unsigned short period, msgClassSock * sm_socket_ptr )
void hbs_cluster_ctrl_init ( void ) void hbs_cluster_ctrl_init ( void )
{ {
ctrl.this_controller = 0xffff ; ctrl.this_controller = 0xffff ;
ctrl.controller_0_enabled = false ;
ctrl.controller_1_enabled = false ;
#ifdef THREE_CONTROLLER_SYSTEM
ctrl.controller_2_enabled = false ;
#endif
ctrl.peer_controller_enabled = false ; ctrl.peer_controller_enabled = false ;
ctrl.peer_history_missing = true ;
ctrl.log_throttle = 0 ; ctrl.log_throttle = 0 ;
ctrl.monitored_networks = 0 ; ctrl.monitored_networks = 0 ;
ctrl.monitored_hosts = 0 ; ctrl.monitored_hosts = 0 ;
@ -236,64 +232,6 @@ void cluster_storage0_state ( bool enabled )
} }
} }
/****************************************************************************
*
* Name : hbs_manage_controller_state
*
* Description : Track the monitored enabled state of the controllers.
*
***************************************************************************/
void hbs_manage_controller_state ( string & hostname, bool enabled )
{
int controller = -1 ;
/* track controller state */
if ( hostname == CONTROLLER_0 )
{
controller = 0 ;
ctrl.controller_0_enabled = enabled ;
}
else if ( hostname == CONTROLLER_1 )
{
controller = 1 ;
ctrl.controller_1_enabled = enabled ;
}
else
{
/* ignore all other host names */
return ;
}
/* manage the state of the peer controller */
if ( ctrl.this_controller != controller )
{
/* Clear peer controller cluster history when the peer
* controller goes disabled */
if (( ctrl.peer_controller_enabled == true ) &&
( enabled == false ))
{
hbs_cluster_rem ( controller );
}
if ( enabled == false )
{
hbs_cluster_change ( "peer controller disabled" ) ;
}
else
{
hbs_cluster_change ( "peer controller enabled" ) ;
}
ctrl.peer_controller_enabled = enabled ;
}
else if ( enabled == false )
{
hbs_cluster_change ( "this controller locked" ) ;
hbs_cluster_lock();
}
}
/**************************************************************************** /****************************************************************************
* *
* Name : hbs_cluster_add * Name : hbs_cluster_add
@ -346,8 +284,11 @@ void hbs_cluster_add ( string & hostname )
hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
} }
/* Manage controller state ; true means enabled in this case. */ /* Catch enable/provisioning of the peer controller */
hbs_manage_controller_state ( hostname, true ); if (( hostname == CONTROLLER_0 ) && ( ctrl.this_controller != 0 ))
ctrl.peer_controller_enabled = true ;
if (( hostname == CONTROLLER_1 ) && ( ctrl.this_controller != 1 ))
ctrl.peer_controller_enabled = true ;
} }
/**************************************************************************** /****************************************************************************
@ -391,9 +332,6 @@ void hbs_cluster_del ( string & hostname )
hbs_cluster_init ( ctrl.cluster.period_msec, NULL ); hbs_cluster_init ( ctrl.cluster.period_msec, NULL );
} }
/* Manage controller state ; false means not enabled in this case. */
hbs_manage_controller_state ( hostname , false );
ilog ("%s deleted from cluster", hostname.c_str()); ilog ("%s deleted from cluster", hostname.c_str());
cluster_list (); cluster_list ();
@ -455,7 +393,6 @@ void hbs_cluster_period_start ( void )
* *
***************************************************************************/ ***************************************************************************/
void hbs_cluster_update ( iface_enum iface, void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts, unsigned short not_responding_hosts,
bool storage_0_responding ) bool storage_0_responding )
@ -636,7 +573,7 @@ void hbs_cluster_append ( hbs_message_type & msg )
} }
msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories); msg.cluster.bytes = BYTES_IN_CLUSTER_VAULT(msg.cluster.histories);
clog2 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)", clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes ); ctrl.this_controller, ctrl.monitored_networks, ctrl.cluster.histories, msg.cluster.bytes );
} }
@ -649,12 +586,22 @@ void hbs_cluster_peer ( void )
if (( ctrl.got_peer_controller_history == false ) && if (( ctrl.got_peer_controller_history == false ) &&
( ctrl.peer_controller_enabled == true )) ( ctrl.peer_controller_enabled == true ))
{ {
ilog ("missing peer controller cluster view" ); /* ERIK: DEBUG */ if ( ctrl.peer_history_missing == false )
{
wlog ( "missing peer controller cluster view" );
ctrl.peer_history_missing = true ;
}
/* if no nodes have reported peer controller history then inject /* if no nodes have reported peer controller history then inject
* a 0:0 value in for this pulse period for that controller. */ * a 0:0 value in for this pulse period for that controller. */
hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 ); hbs_cluster_inject ( ctrl.this_controller?0:1, 0, 0 );
} }
else if (( ctrl.got_peer_controller_history == true ) &&
( ctrl.peer_controller_enabled == true ) &&
( ctrl.peer_history_missing == true ))
{
wlog ( "receiving peer controller cluster view" );
ctrl.peer_history_missing = false ;
}
} }
/**************************************************************************** /****************************************************************************
@ -720,7 +667,9 @@ void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid , string reason
* *
***************************************************************************/ ***************************************************************************/
void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample ) void hbs_history_save ( string hostname,
mtce_hbs_network_enum network,
mtce_hbs_cluster_history_type & sample )
{ {
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{ {
@ -736,12 +685,12 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
memcpy( &ctrl.cluster.history[h], &sample, memcpy( &ctrl.cluster.history[h], &sample,
sizeof(mtce_hbs_cluster_history_type)); sizeof(mtce_hbs_cluster_history_type));
clog1 ("controller-%d updated vault with controller-%d:%s network history through %s (histories:%d)", clog1 ("controller-%d vault update from controller-%d %s reply with %d histories (this:%s)",
ctrl.this_controller, ctrl.this_controller,
sample.controller, sample.controller,
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(), hbs_cluster_network_name(network).c_str(),
hostname.c_str(), ctrl.cluster.histories,
ctrl.cluster.histories); hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str());
return ; return ;
} }
} }
@ -756,15 +705,16 @@ void hbs_history_save ( string hostname, mtce_hbs_cluster_history_type & sample
ctrl.cluster.histories++ ; ctrl.cluster.histories++ ;
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories); ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
ilog ("controller-%d added new controller-%d:%s history to vault ; now have %d network views", ilog ("controller-%d added new %s:%s history to vault ; now have %d network views",
ctrl.this_controller, ctrl.this_controller,
sample.controller, hostname.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(), hbs_cluster_network_name((mtce_hbs_network_enum)sample.network).c_str(),
ctrl.cluster.histories); ctrl.cluster.histories);
} }
void hbs_state_audit ( void ) void hbs_state_audit ( void )
{ {
if ( ctrl.monitored_hosts )
hbs_cluster_dump ( ctrl.cluster, "Audit" ); hbs_cluster_dump ( ctrl.cluster, "Audit" );
} }
@ -872,7 +822,7 @@ int hbs_cluster_save ( string & hostname,
{ {
/* set that we got some history and save it */ /* set that we got some history and save it */
ctrl.got_peer_controller_history = true ; ctrl.got_peer_controller_history = true ;
hbs_history_save ( hostname, msg.cluster.history[h] ); hbs_history_save ( hostname, network, msg.cluster.history[h] );
} }
hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) ); hbs_cluster_log( hostname, ctrl.cluster, hbs_cluster_network_name(network) );
} }
@ -915,39 +865,26 @@ void hbs_cluster_inject ( unsigned short controller, unsigned short hosts_enable
} }
} }
/****************************************************************************
*
* Name : hbs_controller_lock
*
* Description : Clear all history for this controller.
* Called when this controller is detected as locked.
*
***************************************************************************/
void hbs_cluster_rem ( unsigned short controller ) void hbs_controller_lock ( void )
{ {
int removed = 0 ; if ( ctrl.cluster.histories )
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{ {
if ( ctrl.cluster.history[h].controller == controller ) ilog ("controller-%d locked ; clearing all cluster info", ctrl.this_controller );
{
removed++ ;
wlog ("controller-%d %s network history removed from cluster (slot %d)",
controller,
hbs_cluster_network_name((mtce_hbs_network_enum)ctrl.cluster.history[h].network).c_str(),
h );
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
}
}
if ( removed )
{
hbs_cluster_change ( "removed controller history" ) ;
}
ctrl.cluster.histories -= removed ;
ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
}
/* remove all cluster history on a lock operation */
void hbs_cluster_lock( void )
{
ilog ("controller-%d lock ; clearing all cluster info", ctrl.this_controller );
for ( int h = 0 ; h < ctrl.cluster.histories ; h++ ) for ( int h = 0 ; h < ctrl.cluster.histories ; h++ )
{ {
memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type)); memset ( &ctrl.cluster.history[h], 0, sizeof(mtce_hbs_cluster_history_type));
} }
ctrl.cluster.histories = 0 ; ctrl.cluster.histories = 0 ;
hbs_cluster_change ( "this controller locked" ) ;
}
} }