Make mtce ignore heartbeat events from the inactive controller.
There is the potential for a race condition that can lead to mtce incorrectly failing hosts due to heartbeat failure event messages sourced from the inactive controller. During a split-brain recovery scenario, a swact left the hbsAgent on the new standby controller believing it was still on the active controller. In this particular split-brain failure mode, the originally active (and, after the swact, standby) controller was failing heartbeat to its peer and to other nodes in the system, even though the new active controller saw heartbeat working fine. The problem was that the inactive controller detected and sent a heartbeat loss message to mtce before mtce was able to update that controller's heartbeat activity status, which would have gated the sending of the loss event. This update adds an additional layer of protection by intentionally ignoring heartbeat events from the inactive controller that might slip through due to this activity state change race condition. Also fixed a flooding log in the hbsAgent for big systems. Change-Id: I825a801166b3e80cbf67945c7f587851f4e0d90b Closes-Bug: 1813976 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
c46e7d1a2c
commit
5c043f7ca9
@ -1,3 +1,3 @@
|
||||
SRC_DIR="src"
|
||||
TIS_PATCH_VER=150
|
||||
TIS_PATCH_VER=151
|
||||
BUILD_IS_SLOW=5
|
||||
|
@ -8299,8 +8299,8 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
else
|
||||
{
|
||||
pulse_ptr->hbs_failure[iface] = true ;
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||
}
|
||||
hbs_cluster_change ( pulse_ptr->hostname + " heartbeat loss" );
|
||||
pulse_ptr->hbs_failure_count[iface]++ ;
|
||||
}
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
|
||||
|
@ -128,9 +128,9 @@ void monitor_scheduling ( unsigned long long & this_time, unsigned long long & p
|
||||
this_time = gettime_monotonic_nsec () ;
|
||||
if ( label_ptr && strncmp ( label_ptr, NODEUTIL_LATENCY_MON_START, strlen(NODEUTIL_LATENCY_MON_START)))
|
||||
{
|
||||
if ( ! strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > 10 ))
|
||||
if ( ! strcmp (SCHED_MONITOR__RECEIVER, label_ptr ) && ( data > (int)hostname_inventory.size() ))
|
||||
{
|
||||
ilog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld );
|
||||
wlog ("===> receive latency : batch of %d pulses in under scheduling threshold of %d msec\n", data , hbs_config.latency_thld );
|
||||
}
|
||||
else if ( this_time > (prev_time + (NSEC_TO_MSEC*(hbs_config.latency_thld))))
|
||||
{
|
||||
|
@ -833,6 +833,27 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr )
|
||||
return (RETRY) ;
|
||||
}
|
||||
|
||||
string hostaddr = sock_ptr->mtc_event_rx_sock->get_src_str();
|
||||
string hostname = obj_ptr->get_hostname ( hostaddr ) ;
|
||||
if ( hostname.empty() )
|
||||
{
|
||||
wlog ("%s ignoring service event from unknown host (%s)",
|
||||
obj_ptr->my_hostname.c_str(), hostaddr.c_str());
|
||||
return (PASS);
|
||||
}
|
||||
if (( hostname != obj_ptr->my_hostname ) &&
|
||||
(( msg.cmd == MTC_EVENT_HEARTBEAT_LOSS ) ||
|
||||
( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_SET ) ||
|
||||
( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_CLR ) ||
|
||||
( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET )||
|
||||
( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_CLR )))
|
||||
{
|
||||
wlog ("%s %s from %s heartbeat service",
|
||||
&msg.buf[0],
|
||||
get_mtcNodeCommand_str(msg.cmd),
|
||||
hostname.c_str());
|
||||
return (PASS);
|
||||
}
|
||||
if ( msg.cmd == MTC_EVENT_LOOPBACK )
|
||||
{
|
||||
const char * event_hdr_ptr = get_loopback_header() ;
|
||||
|
Loading…
Reference in New Issue
Block a user