Fix heartbeat messaging when interface is set to 'lo'

The maintenance heartbeat service should not use multicast
messaging over an 'lo' interface, which in IPv6 leads
to socket failures, log flooding and the inability to
detect and report pmond process failure.

To fix that, this update:
 - configures pulse messaging to unicast for monitored
   networks configured as 'lo'.
 - prevents heartbeating over the cluster network if
   it and the management network are both configured on
   the 'lo' interface.
 - improves logging to avoid flooding in the presence of
   socket setup or access errors.
 - stops logging netlink events (interface state changes)
   on unmonitored network interfaces.
 - maintains heartbeat disabled state until the management
   network is up.
 - modifies hbsAgent socket failure handling and its pmon
   conf file so that a persistent socket failure during
   startup is alarmed as an hbsAgent process failure.

Test Plan:

PASS: Verify logging over system install and socket errors
PASS: Verify unicast messaging when cluster is set to 'lo'
PASS: Verify no cluster network heartbeat when it and mgmnt
      are set to 'lo'.

Regression:

PASS: Verify heartbeat messaging and cluster info
PASS: Verify pmond process failure alarm management
PASS: Verify heartbeat failure detection and graceful recovery
PASS: Verify AIO SX IPv6 system install and run
PASS: Verify AIO DX IPv6 system install and run
PASS: Verify Standard IPv6 system install and run
PASS: Verify Storage system IPv6 install and run
PASS: Verify Storage system IPv4 install and run
PASS: Verify MNFA handling in IPv6 storage system

Change-Id: I5a2a0b2dee0c690617c4e0b0e2ab8b1172b2dc49
Closes-Bug: 1884585
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-06-24 15:53:33 -04:00
parent fe8dd6d6f4
commit 55d5f43edb
5 changed files with 225 additions and 117 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2015 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -187,6 +187,7 @@ void log_link_events ( int netlink_sock,
iter_curr_ptr != links_gone_down.end() ;
iter_curr_ptr++ )
{
bool care = false ;
dlog3 ( "downed link: %s (running:%d:%d)\n",
iter_curr_ptr->c_str(),
mgmnt_link_up_and_running,
@ -194,6 +195,7 @@ void log_link_events ( int netlink_sock,
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
{
care = true ;
if ( mgmnt_link_up_and_running == true )
{
mgmnt_link_up_and_running = false ;
@ -202,6 +204,7 @@ void log_link_events ( int netlink_sock,
}
if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data()))
{
care = true ;
if ( clstr_link_up_and_running == true )
{
clstr_link_up_and_running = false ;
@ -209,6 +212,8 @@ void log_link_events ( int netlink_sock,
}
}
if ( care == true )
{
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
{
wlog ("%s is down (oper:%s) (%ld)\n",
@ -224,6 +229,7 @@ void log_link_events ( int netlink_sock,
}
}
}
}
if ( !links_gone_up.empty() )
{
dlog3 ("%ld links have recovered\n", links_gone_up.size());
@ -233,6 +239,7 @@ void log_link_events ( int netlink_sock,
iter_curr_ptr != links_gone_up.end() ;
iter_curr_ptr++ )
{
bool care = false ;
dlog3 ( "recovered link: %s (running:%d:%d)\n",
iter_curr_ptr->c_str(),
mgmnt_link_up_and_running,
@ -240,15 +247,19 @@ void log_link_events ( int netlink_sock,
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
{
care = true ;
mgmnt_link_up_and_running = true ;
wlog ("Mgmnt link %s is up\n", mgmnt_iface_ptr );
}
if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data()))
{
care = true ;
clstr_link_up_and_running = true ;
wlog ("Cluster-host link %s is up\n", clstr_iface_ptr );
}
if ( care == true )
{
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
{
wlog ("%s is up (oper:%s) (len:%ld)\n",
@ -266,6 +277,7 @@ void log_link_events ( int netlink_sock,
}
}
}
}

View File

@ -1,7 +1,7 @@
#ifndef __INCLUDE_NODEBASE_HH__
#define __INCLUDE_NODEBASE_HH__
/*
* Copyright (c) 2013-2016 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -161,10 +161,10 @@ typedef enum
/** 'lo' interface IP address - TODO: get it from the interface */
#define LOCALHOST "localhost"
#define LOOPBACK_IP "127.0.0.1"
#define LOOPBACK_IPV6 "::1"
#define LOCALHOST "localhost"
#define LOOPBACK_IF "lo"
#define CLUSTER_HOST_SUFFIX ((const char*)("-cluster-host"))

View File

@ -5,8 +5,8 @@ pidfile = /var/run/hbsAgent.pid
style = lsb ; ocf or lsb
severity = major ; minor, major, critical
restarts = 1 ; restart retries before error assertion
interval = 10 ; number of seconds to wait between restarts
debounce = 10 ; number of seconds that a process needs to remain
interval = 5 ; number of seconds to wait between restarts
debounce = 20 ; number of seconds that a process needs to remain
; running before degrade is removed and retry count
; is cleared.
startuptime = 5 ; Seconds to wait after process start before starting the debounce monitor

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -75,6 +75,10 @@ static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
static string mtcAgent_ip = "" ;
static std::list<string> hostname_inventory ;
/* Used to throttle warning messages that report
* an error transmitting the pulse request */
static int pulse_request_fail_log_counter[MAX_IFACES] ;
/** This heartbeat service inventory is tracked by
* the same nodeLinkClass that maintenance uses.
*
@ -449,7 +453,15 @@ int daemon_configure ( void )
ilog ("Clstr Addr : %s\n", hbsInv.my_clstr_ip.c_str());
}
if (!strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface))
/* The cluster host network is considered unprovisioned
* for heartbeat while ...
* ... its interface is 'lo' ... */
if (!strcmp(hbs_config.clstr_iface, LOOPBACK_IF))
{
hbsInv.clstr_network_provisioned = false ;
}
/* ... or it and the management interface are the same. */
else if (!strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface))
{
hbsInv.clstr_network_provisioned = false ;
}
@ -551,20 +563,7 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
int rc = PASS ;
char * iface = NULL ;
/* Load up the interface name */
if ( i == MGMNT_IFACE )
{
iface = hbs_config.mgmnt_iface ;
}
else if (( i == CLSTR_IFACE ) && ( hbs_config.clstr_iface != NULL ))
{
iface = hbs_config.clstr_iface ;
}
else
{
wlog ("No Cluster-host Interface\n");
return (RETRY);
}
pulse_request_fail_log_counter[i] = 0 ;
/* Start by closing existing sockets just in case this is a (re)initialization */
if ( hbs_sock.rx_sock[i] )
@ -579,13 +578,60 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
hbs_sock.tx_sock[i] = 0 ;
}
/* Load up the interface name */
if ( i == MGMNT_IFACE )
{
if ( hbsInv.mgmnt_link_up_and_running == false )
{
wlog("Cannot setup Mgmnt pulse messaging when '%s' interface is down", hbs_config.clstr_iface );
return(FAIL_BAD_STATE);
}
else
{
iface = hbs_config.mgmnt_iface ;
if (strcmp(iface, LOOPBACK_IF))
{
hbs_sock.tx_sock[i] =
new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP,iface);
}
else
{
hbs_sock.tx_sock[i] =
new msgClassTx(hbsInv.my_local_ip.data(), hbs_sock.tx_port[i],IPPROTO_UDP,iface);
}
}
}
else if (( i == CLSTR_IFACE ) &&
( hbsInv.clstr_network_provisioned == true ) &&
( hbs_config.clstr_iface != NULL ))
{
if ( hbsInv.clstr_link_up_and_running == false )
{
wlog("Cannot setup Clstr pulse messaging when '%s' interface is down", hbs_config.clstr_iface);
return(FAIL_BAD_STATE);
}
else
{
iface = hbs_config.clstr_iface ;
hbs_sock.tx_sock[i] =
new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP,iface);
}
}
else
{
ilog("no heartbeat on %s network", get_iface_name_str(i) );
return (PASS);
}
/* Create transmit socket */
hbs_sock.tx_sock[i] = new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP,iface);
if ( hbs_sock.tx_sock[i] )
{
if ( hbs_sock.tx_sock[i]->return_status != PASS )
{
elog("Cannot open multicast transmit socket - rc:%d (%d:%m)\n", hbs_sock.tx_sock[i]->return_status, errno );
elog("Failed to create %s pulse transmit socket (%d:%d:%m)\n",
get_iface_name_str(i),
hbs_sock.tx_sock[i]->return_status,
errno );
delete (hbs_sock.tx_sock[i]);
hbs_sock.tx_sock[i] = 0 ;
return (FAIL_SOCKET_CREATE);
@ -597,10 +643,10 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
}
else
{
elog("Cannot open multicast transmit socket - null object (%d:%m)\n", errno );
elog("Failed to create %s pulse transmit socket (%d:%m)\n",
get_iface_name_str(i), errno );
return (FAIL_SOCKET_CREATE);
}
dlog("Opened multicast transmit socket\n" );
/* In order to avoid multicast packets being routed wrong, force sending from that socket */
hbs_sock.tx_sock[i]->interfaceBind();
@ -614,8 +660,10 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
hbs_sock.rx_sock[i] = new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true);
if (( hbs_sock.rx_sock[i] == NULL ) || (hbs_sock.rx_sock[i]->return_status != PASS ))
{
elog("Failed opening pulse receive socket (%d:%s)\n",
errno, strerror (errno));
elog("Failed to create %s pulse receive socket (%d:%d:%m)\n",
get_iface_name_str(i),
hbs_sock.rx_sock[i]->return_status,
errno );
rc = FAIL_SOCKET_CREATE ;
}
else
@ -948,7 +996,11 @@ int hbs_pulse_request ( iface_enum iface,
if ( (bytes = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], bytes)) < 0 )
{
elog("Failed to send Pulse request: %d:%s to %s.%d (rc:%i ; %d:%s)\n",
/* Throttle this error log. */
elog_throttled( pulse_request_fail_log_counter[iface], 100,
"Failed to send %s Pulse request: " \
"%d:%s to %s.%d (rc:%i ; %d:%s)\n",
get_iface_name_str(iface),
hbs_sock.tx_mesg[iface].s,
&hbs_sock.tx_mesg[iface].m[0],
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
@ -959,7 +1011,9 @@ int hbs_pulse_request ( iface_enum iface,
}
else
{
wlog("Unable to send pulse request - null tx object - auto re-init pending\n");
elog_throttled( pulse_request_fail_log_counter[iface], 100,
"Unable to send %s pulse request on null socket",
get_iface_name_str(iface));
return (FAIL_SOCKET_SENDTO);
}
@ -1448,10 +1502,6 @@ void daemon_service_run ( void )
* ultimately triggers an exit if that retry count gets too big */
int socket_init_fail_count = 0 ;
/* Used to throttle warning messages that report
* an error transmitting the pulse request */
int pulse_request_fail_log_counter[MAX_IFACES] ;
/* throttle initialization wait logs */
int wait_log_throttle = 0 ;
@ -1561,38 +1611,6 @@ void daemon_service_run ( void )
daemon_exit();
}
/* Setup the heartbeat sockets */
if ( (rc = hbs_socket_init ()) != PASS )
{
if ( socket_init_fail_count++ == 10 )
{
elog ("Failed socket initialization (rc:%d) max retries ; exiting ...\n", rc );
daemon_exit ();
}
else
{
elog ("Failed socket initialization (rc:%d) ; will retry in 5 secs ...\n", rc );
sleep (5);
}
}
else
{
ilog ("Sending ready event to maintenance\n");
do
{
/* Wait for maintenance */
rc = send_event ( hbsInv.my_hostname, MTC_EVENT_HEARTBEAT_READY, MGMNT_IFACE ) ;
if ( rc == RETRY )
{
mtcWait_secs ( 3 );
}
} while ( rc == RETRY ) ;
if ( rc == FAIL )
{
elog ("Unrecoverable heartbeat startup error (rc=%d)\n", rc );
daemon_exit ();
}
if ( get_link_state ( hbs_sock.ioctl_sock, hbs_config.mgmnt_iface, &hbsInv.mgmnt_link_up_and_running ) )
{
hbsInv.mgmnt_link_up_and_running = false ;
@ -1616,6 +1634,45 @@ void daemon_service_run ( void )
}
}
/* Setup the heartbeat sockets */
if ( (rc = hbs_socket_init ()) != PASS )
{
#define HBS_SOCKET_INIT_RETRY_THRESHOLD (3)
#define HBS_SOCKET_INIT_RETRY_INTERVAL (2)
if ( socket_init_fail_count++ == HBS_SOCKET_INIT_RETRY_THRESHOLD )
{
elog ("Failed socket initialization (rc:%d) "
"max retries ; exiting ...", rc );
daemon_exit ();
}
else
{
elog ("Failed socket initialization (rc:%d) ; "
"will retry in %d secs ...\n",
rc, HBS_SOCKET_INIT_RETRY_INTERVAL);
sleep (HBS_SOCKET_INIT_RETRY_INTERVAL);
}
}
else
{
ilog ("Sending ready event to maintenance\n");
do
{
/* Wait for maintenance */
rc = send_event ( hbsInv.my_hostname, MTC_EVENT_HEARTBEAT_READY, MGMNT_IFACE ) ;
if ( rc == RETRY )
{
// TODO: Threshold this loop and exit or this
// could be a silent process failure loop.
mtcWait_secs ( 3 );
}
} while ( rc == RETRY ) ;
if ( rc == FAIL )
{
elog ("Unrecoverable heartbeat startup error (rc=%d)\n", rc );
daemon_exit ();
}
/* Make the main loop schedule in real-time */
{
struct sched_param param ;
@ -1720,7 +1777,8 @@ void daemon_service_run ( void )
counter = 1 ;
}
}
else if ( hbsInv.hbs_disabled == true )
else if (( hbsInv.hbs_disabled == true ) &&
( hbsInv.mgmnt_link_up_and_running == true ))
{
hbs_ctrl.locked = false ;
hbsInv.hbs_disabled = false;
@ -2191,12 +2249,11 @@ void daemon_service_run ( void )
if ( rc != 0 )
{
/* TODO: Fix this with an alarm */
wlog_throttled ( pulse_request_fail_log_counter[iface], 100,
"%s hbs_pulse_request failed - rc:%d\n", get_iface_name_str(iface), rc);
if ( pulse_request_fail_log_counter[iface] == INTERFACE_ERRORS_FOR_REINIT )
if ( pulse_request_fail_log_counter[iface] > INTERFACE_ERRORS_FOR_REINIT )
{
_setup_pulse_messaging ( (iface_enum)iface , daemon_get_rmem_max ()) ;
rc = _setup_pulse_messaging ( (iface_enum)iface , daemon_get_rmem_max ()) ;
if ( rc )
continue ;
}
}
else

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -436,11 +436,14 @@ int daemon_configure ( void )
if ( strlen(hbs_config.clstr_iface) )
{
if (strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface))
{
if (strcmp(hbs_config.clstr_iface, LOOPBACK_IF))
{
clstr_network_provisioned = true ;
ilog ("Cluster-host Name : %s\n", hbs_config.clstr_iface );
}
}
}
if ( clstr_network_provisioned == true )
{
ilog("Cluster-host Port : %d (rx)", hbs_config.hbs_client_clstr_port );
@ -476,44 +479,80 @@ int _setup_pulse_messaging ( iface_enum i, int rmem )
/* client sockets are not modified */
UNUSED(rmem);
/* Load up the interface name */
if ( i == MGMNT_IFACE )
{
iface = hbs_config.mgmnt_iface ;
}
else if (( i == CLSTR_IFACE ) && ( hbs_config.clstr_iface != NULL ))
{
iface = hbs_config.clstr_iface ;
}
else
{
wlog ("No Cluster-host Interface\n");
return (RETRY);
}
_close_pulse_rx_sock (i);
_close_pulse_tx_sock (i);
/********************************************************************/
/* Setup multicast Pulse Request Receive Socket */
/********************************************************************/
/* Load up the interface name */
if ( i == MGMNT_IFACE )
{
iface = hbs_config.mgmnt_iface ;
if (strcmp(iface, LOOPBACK_IF))
{
hbs_sock.rx_sock[i] =
new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true,true);
}
else
{
// Default to unicast heartbeat on management 'lo' interface
hbs_sock.rx_sock[i] =
new msgClassRx(my_address.data(),hbs_sock.rx_port[i],IPPROTO_UDP,iface,false, false);
}
}
else if (( i == CLSTR_IFACE ) &&
( clstr_network_provisioned == true ) &&
( hbs_config.clstr_iface != NULL ))
{
iface = hbs_config.clstr_iface ;
hbs_sock.rx_sock[i] =
new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true,true);
}
else
{
ilog("Cluster host interface not used.");
return (PASS);
}
if ( hbs_sock.rx_sock[i] )
{
if (hbs_sock.rx_sock[i]->return_status != PASS)
{
elog("Cannot create socket (%d) (%d:%m)\n", i, errno );
elog("Failed to create %s pulse receiver socket (%d:%d:%m)\n",
get_iface_name_str(i),
hbs_sock.rx_sock[i]->return_status,
errno );
_close_pulse_rx_sock (i);
return (FAIL_SOCKET_CREATE);
}
hbs_sock.rx_sock[i]->sock_ok(true);
}
else
{
elog("Failed to create %s pulse receiver socket (%d:%m)\n",
get_iface_name_str(i), errno );
return (FAIL_SOCKET_CREATE);
}
/********************************************************************/
/* Setup unicast transmit (reply) socket */
/********************************************************************/
hbs_sock.tx_sock[i] =
new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP, iface);
if ( hbs_sock.tx_sock[i] == NULL )
{
elog("Failed to create %s pulse reply socket (%d:%m)\n",
get_iface_name_str(i), errno );
return (FAIL_SOCKET_CREATE);
}
if (hbs_sock.tx_sock[i]->return_status != PASS)
{
elog("Cannot create unicast transmit socket (%d) (%d:%m)\n", i, errno );
elog("Failed to create %s pulse reply socket (%d:%d:%m)\n",
get_iface_name_str(i),
hbs_sock.tx_sock[i]->return_status,
errno );
_close_pulse_tx_sock(i);
return (FAIL_SOCKET_CREATE);
}
@ -1234,7 +1273,7 @@ int daemon_init ( string iface, string nodeType_str )
}
/* Setup the heartbeat service messaging sockets */
else if ( hbs_socket_init () != PASS )
else if (( rc = hbs_socket_init ()) != PASS )
{
elog ("socket initialization failed (rc:%d)\n", rc );
rc = FAIL_SOCKET_INIT;