Fix heartbeat messaging when interface is set to 'lo'
The maintenance heartbeat service should not use multicast messaging over an 'lo' interface, which in IPv6 leads to socket failures, log flooding and the inability to detect and report pmond process failure.

To fix that, this update:
- configures pulse messaging to unicast for monitored networks configured as 'lo'.
- prevents heartbeating over the cluster network if it and the management network are both configured on the 'lo' interface.
- improves logging to avoid flooding in the presence of socket setup or access errors.
- stops logging netlink events (interface state changes) on unmonitored network interfaces.
- maintains heartbeat disabled state until the management network is up.
- modifies hbsAgent socket failure handling and its pmon conf file so that a persistent socket failure during startup is alarmed as an hbsAgent process failure.

Test Plan:
PASS: Verify logging over system install and socket errors
PASS: Verify unicast messaging when cluster is set to 'lo'
PASS: Verify no cluster network heartbeat when it and mgmnt are set to 'lo'.

Regression:
PASS: Verify heartbeat messaging and cluster info
PASS: Verify pmond process failure alarm management
PASS: Verify heartbeat failure detection and graceful recovery
PASS: Verify AIO SX IPv6 system install and run
PASS: Verify AIO DX IPv6 system install and run
PASS: Verify Standard IPv6 system install and run
PASS: Verify Storage system IPv6 install and run
PASS: Verify Storage system IPv4 install and run
PASS: Verify MNFA handling in IPv6 storage system

Change-Id: I5a2a0b2dee0c690617c4e0b0e2ab8b1172b2dc49
Closes-Bug: 1884585
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
fe8dd6d6f4
commit
55d5f43edb
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2013, 2015 Wind River Systems, Inc.
|
* Copyright (c) 2013-2020 Wind River Systems, Inc.
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*
|
*
|
||||||
@ -187,6 +187,7 @@ void log_link_events ( int netlink_sock,
|
|||||||
iter_curr_ptr != links_gone_down.end() ;
|
iter_curr_ptr != links_gone_down.end() ;
|
||||||
iter_curr_ptr++ )
|
iter_curr_ptr++ )
|
||||||
{
|
{
|
||||||
|
bool care = false ;
|
||||||
dlog3 ( "downed link: %s (running:%d:%d)\n",
|
dlog3 ( "downed link: %s (running:%d:%d)\n",
|
||||||
iter_curr_ptr->c_str(),
|
iter_curr_ptr->c_str(),
|
||||||
mgmnt_link_up_and_running,
|
mgmnt_link_up_and_running,
|
||||||
@ -194,6 +195,7 @@ void log_link_events ( int netlink_sock,
|
|||||||
|
|
||||||
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
|
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
|
||||||
{
|
{
|
||||||
|
care = true ;
|
||||||
if ( mgmnt_link_up_and_running == true )
|
if ( mgmnt_link_up_and_running == true )
|
||||||
{
|
{
|
||||||
mgmnt_link_up_and_running = false ;
|
mgmnt_link_up_and_running = false ;
|
||||||
@ -202,6 +204,7 @@ void log_link_events ( int netlink_sock,
|
|||||||
}
|
}
|
||||||
if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data()))
|
if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data()))
|
||||||
{
|
{
|
||||||
|
care = true ;
|
||||||
if ( clstr_link_up_and_running == true )
|
if ( clstr_link_up_and_running == true )
|
||||||
{
|
{
|
||||||
clstr_link_up_and_running = false ;
|
clstr_link_up_and_running = false ;
|
||||||
@ -209,18 +212,21 @@ void log_link_events ( int netlink_sock,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ( care == true )
|
||||||
|
{
|
||||||
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
|
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
|
||||||
{
|
{
|
||||||
wlog ("%s is down (oper:%s) (%ld)\n",
|
wlog ("%s is down (oper:%s) (%ld)\n",
|
||||||
iter_curr_ptr->c_str(),
|
iter_curr_ptr->c_str(),
|
||||||
running ? "up" : "down",
|
running ? "up" : "down",
|
||||||
iter_curr_ptr->length() );
|
iter_curr_ptr->length());
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
wlog ("%s is down (driver query failed) (len:%ld)\n",
|
wlog ("%s is down (driver query failed) (len:%ld)\n",
|
||||||
iter_curr_ptr->c_str(),
|
iter_curr_ptr->c_str(),
|
||||||
iter_curr_ptr->length() );
|
iter_curr_ptr->length());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -233,6 +239,7 @@ void log_link_events ( int netlink_sock,
|
|||||||
iter_curr_ptr != links_gone_up.end() ;
|
iter_curr_ptr != links_gone_up.end() ;
|
||||||
iter_curr_ptr++ )
|
iter_curr_ptr++ )
|
||||||
{
|
{
|
||||||
|
bool care = false ;
|
||||||
dlog3 ( "recovered link: %s (running:%d:%d)\n",
|
dlog3 ( "recovered link: %s (running:%d:%d)\n",
|
||||||
iter_curr_ptr->c_str(),
|
iter_curr_ptr->c_str(),
|
||||||
mgmnt_link_up_and_running,
|
mgmnt_link_up_and_running,
|
||||||
@ -240,15 +247,19 @@ void log_link_events ( int netlink_sock,
|
|||||||
|
|
||||||
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
|
if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data()))
|
||||||
{
|
{
|
||||||
|
care = true ;
|
||||||
mgmnt_link_up_and_running = true ;
|
mgmnt_link_up_and_running = true ;
|
||||||
wlog ("Mgmnt link %s is up\n", mgmnt_iface_ptr );
|
wlog ("Mgmnt link %s is up\n", mgmnt_iface_ptr );
|
||||||
}
|
}
|
||||||
if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data()))
|
if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data()))
|
||||||
{
|
{
|
||||||
|
care = true ;
|
||||||
clstr_link_up_and_running = true ;
|
clstr_link_up_and_running = true ;
|
||||||
wlog ("Cluster-host link %s is up\n", clstr_iface_ptr );
|
wlog ("Cluster-host link %s is up\n", clstr_iface_ptr );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ( care == true )
|
||||||
|
{
|
||||||
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
|
if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS )
|
||||||
{
|
{
|
||||||
wlog ("%s is up (oper:%s) (len:%ld)\n",
|
wlog ("%s is up (oper:%s) (len:%ld)\n",
|
||||||
@ -265,6 +276,7 @@ void log_link_events ( int netlink_sock,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#ifndef __INCLUDE_NODEBASE_HH__
|
#ifndef __INCLUDE_NODEBASE_HH__
|
||||||
#define __INCLUDE_NODEBASE_HH__
|
#define __INCLUDE_NODEBASE_HH__
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2013-2016 Wind River Systems, Inc.
|
* Copyright (c) 2013-2020 Wind River Systems, Inc.
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*
|
*
|
||||||
@ -161,10 +161,10 @@ typedef enum
|
|||||||
|
|
||||||
|
|
||||||
/** 'lo' interface IP address - TODO: get it from the interface */
|
/** 'lo' interface IP address - TODO: get it from the interface */
|
||||||
|
#define LOCALHOST "localhost"
|
||||||
#define LOOPBACK_IP "127.0.0.1"
|
#define LOOPBACK_IP "127.0.0.1"
|
||||||
#define LOOPBACK_IPV6 "::1"
|
#define LOOPBACK_IPV6 "::1"
|
||||||
#define LOCALHOST "localhost"
|
#define LOOPBACK_IF "lo"
|
||||||
|
|
||||||
|
|
||||||
#define CLUSTER_HOST_SUFFIX ((const char*)("-cluster-host"))
|
#define CLUSTER_HOST_SUFFIX ((const char*)("-cluster-host"))
|
||||||
|
|
||||||
|
@ -5,8 +5,8 @@ pidfile = /var/run/hbsAgent.pid
|
|||||||
style = lsb ; ocf or lsb
|
style = lsb ; ocf or lsb
|
||||||
severity = major ; minor, major, critical
|
severity = major ; minor, major, critical
|
||||||
restarts = 1 ; restart retries before error assertion
|
restarts = 1 ; restart retries before error assertion
|
||||||
interval = 10 ; number of seconds to wait between restarts
|
interval = 5 ; number of seconds to wait between restarts
|
||||||
debounce = 10 ; number of seconds that a process needs to remain
|
debounce = 20 ; number of seconds that a process needs to remain
|
||||||
; running before degrade is removed and retry count
|
; running before degrade is removed and retry count
|
||||||
; is cleared.
|
; is cleared.
|
||||||
startuptime = 5 ; Seconds to wait after process start before starting the debounce monitor
|
startuptime = 5 ; Seconds to wait after process start before starting the debounce monitor
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
|
* Copyright (c) 2013-2020 Wind River Systems, Inc.
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*
|
*
|
||||||
@ -75,6 +75,10 @@ static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
|
|||||||
static string mtcAgent_ip = "" ;
|
static string mtcAgent_ip = "" ;
|
||||||
static std::list<string> hostname_inventory ;
|
static std::list<string> hostname_inventory ;
|
||||||
|
|
||||||
|
/* Used to throttle warning messages that report
|
||||||
|
* an error transmitting the pulse request */
|
||||||
|
static int pulse_request_fail_log_counter[MAX_IFACES] ;
|
||||||
|
|
||||||
/** This heartbeat service inventory is tracked by
|
/** This heartbeat service inventory is tracked by
|
||||||
* the same nodeLinkClass that maintenance uses.
|
* the same nodeLinkClass that maintenance uses.
|
||||||
*
|
*
|
||||||
@ -449,7 +453,15 @@ int daemon_configure ( void )
|
|||||||
ilog ("Clstr Addr : %s\n", hbsInv.my_clstr_ip.c_str());
|
ilog ("Clstr Addr : %s\n", hbsInv.my_clstr_ip.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface))
|
/* The cluster host network is considered unprovisioned
|
||||||
|
* for heartbeat while ...
|
||||||
|
* ... its interface is 'lo' ... */
|
||||||
|
if (!strcmp(hbs_config.clstr_iface, LOOPBACK_IF))
|
||||||
|
{
|
||||||
|
hbsInv.clstr_network_provisioned = false ;
|
||||||
|
}
|
||||||
|
/* ... or it and the management interface are the same. */
|
||||||
|
else if (!strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface))
|
||||||
{
|
{
|
||||||
hbsInv.clstr_network_provisioned = false ;
|
hbsInv.clstr_network_provisioned = false ;
|
||||||
}
|
}
|
||||||
@ -551,20 +563,7 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
|
|||||||
int rc = PASS ;
|
int rc = PASS ;
|
||||||
char * iface = NULL ;
|
char * iface = NULL ;
|
||||||
|
|
||||||
/* Load up the interface name */
|
pulse_request_fail_log_counter[i] = 0 ;
|
||||||
if ( i == MGMNT_IFACE )
|
|
||||||
{
|
|
||||||
iface = hbs_config.mgmnt_iface ;
|
|
||||||
}
|
|
||||||
else if (( i == CLSTR_IFACE ) && ( hbs_config.clstr_iface != NULL ))
|
|
||||||
{
|
|
||||||
iface = hbs_config.clstr_iface ;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
wlog ("No Cluster-host Interface\n");
|
|
||||||
return (RETRY);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Start by closing existing sockets just in case this is a (re)initialization */
|
/* Start by closing existing sockets just in case this is a (re)initialization */
|
||||||
if ( hbs_sock.rx_sock[i] )
|
if ( hbs_sock.rx_sock[i] )
|
||||||
@ -579,13 +578,60 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
|
|||||||
hbs_sock.tx_sock[i] = 0 ;
|
hbs_sock.tx_sock[i] = 0 ;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Load up the interface name */
|
||||||
|
if ( i == MGMNT_IFACE )
|
||||||
|
{
|
||||||
|
if ( hbsInv.mgmnt_link_up_and_running == false )
|
||||||
|
{
|
||||||
|
wlog("Cannot setup Mgmnt pulse messaging when '%s' interface is down", hbs_config.clstr_iface );
|
||||||
|
return(FAIL_BAD_STATE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
iface = hbs_config.mgmnt_iface ;
|
||||||
|
if (strcmp(iface, LOOPBACK_IF))
|
||||||
|
{
|
||||||
|
hbs_sock.tx_sock[i] =
|
||||||
|
new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP,iface);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
hbs_sock.tx_sock[i] =
|
||||||
|
new msgClassTx(hbsInv.my_local_ip.data(), hbs_sock.tx_port[i],IPPROTO_UDP,iface);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (( i == CLSTR_IFACE ) &&
|
||||||
|
( hbsInv.clstr_network_provisioned == true ) &&
|
||||||
|
( hbs_config.clstr_iface != NULL ))
|
||||||
|
{
|
||||||
|
if ( hbsInv.clstr_link_up_and_running == false )
|
||||||
|
{
|
||||||
|
wlog("Cannot setup Clstr pulse messaging when '%s' interface is down", hbs_config.clstr_iface);
|
||||||
|
return(FAIL_BAD_STATE);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
iface = hbs_config.clstr_iface ;
|
||||||
|
hbs_sock.tx_sock[i] =
|
||||||
|
new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP,iface);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ilog("no heartbeat on %s network", get_iface_name_str(i) );
|
||||||
|
return (PASS);
|
||||||
|
}
|
||||||
|
|
||||||
/* Create transmit socket */
|
/* Create transmit socket */
|
||||||
hbs_sock.tx_sock[i] = new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP,iface);
|
|
||||||
if ( hbs_sock.tx_sock[i] )
|
if ( hbs_sock.tx_sock[i] )
|
||||||
{
|
{
|
||||||
if ( hbs_sock.tx_sock[i]->return_status != PASS )
|
if ( hbs_sock.tx_sock[i]->return_status != PASS )
|
||||||
{
|
{
|
||||||
elog("Cannot open multicast transmit socket - rc:%d (%d:%m)\n", hbs_sock.tx_sock[i]->return_status, errno );
|
elog("Failed to create %s pulse transmit socket (%d:%d:%m)\n",
|
||||||
|
get_iface_name_str(i),
|
||||||
|
hbs_sock.tx_sock[i]->return_status,
|
||||||
|
errno );
|
||||||
delete (hbs_sock.tx_sock[i]);
|
delete (hbs_sock.tx_sock[i]);
|
||||||
hbs_sock.tx_sock[i] = 0 ;
|
hbs_sock.tx_sock[i] = 0 ;
|
||||||
return (FAIL_SOCKET_CREATE);
|
return (FAIL_SOCKET_CREATE);
|
||||||
@ -597,10 +643,10 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
elog("Cannot open multicast transmit socket - null object (%d:%m)\n", errno );
|
elog("Failed to create %s pulse transmit socket (%d:%m)\n",
|
||||||
|
get_iface_name_str(i), errno );
|
||||||
return (FAIL_SOCKET_CREATE);
|
return (FAIL_SOCKET_CREATE);
|
||||||
}
|
}
|
||||||
dlog("Opened multicast transmit socket\n" );
|
|
||||||
|
|
||||||
/* In order to avoid multicast packets being routed wrong, force sending from that socket */
|
/* In order to avoid multicast packets being routed wrong, force sending from that socket */
|
||||||
hbs_sock.tx_sock[i]->interfaceBind();
|
hbs_sock.tx_sock[i]->interfaceBind();
|
||||||
@ -614,8 +660,10 @@ int _setup_pulse_messaging ( iface_enum i, int rmem_max )
|
|||||||
hbs_sock.rx_sock[i] = new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true);
|
hbs_sock.rx_sock[i] = new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true);
|
||||||
if (( hbs_sock.rx_sock[i] == NULL ) || (hbs_sock.rx_sock[i]->return_status != PASS ))
|
if (( hbs_sock.rx_sock[i] == NULL ) || (hbs_sock.rx_sock[i]->return_status != PASS ))
|
||||||
{
|
{
|
||||||
elog("Failed opening pulse receive socket (%d:%s)\n",
|
elog("Failed to create %s pulse receive socket (%d:%d:%m)\n",
|
||||||
errno, strerror (errno));
|
get_iface_name_str(i),
|
||||||
|
hbs_sock.rx_sock[i]->return_status,
|
||||||
|
errno );
|
||||||
rc = FAIL_SOCKET_CREATE ;
|
rc = FAIL_SOCKET_CREATE ;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -948,7 +996,11 @@ int hbs_pulse_request ( iface_enum iface,
|
|||||||
|
|
||||||
if ( (bytes = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], bytes)) < 0 )
|
if ( (bytes = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], bytes)) < 0 )
|
||||||
{
|
{
|
||||||
elog("Failed to send Pulse request: %d:%s to %s.%d (rc:%i ; %d:%s)\n",
|
/* Throttle this error log. */
|
||||||
|
elog_throttled( pulse_request_fail_log_counter[iface], 100,
|
||||||
|
"Failed to send %s Pulse request: " \
|
||||||
|
"%d:%s to %s.%d (rc:%i ; %d:%s)\n",
|
||||||
|
get_iface_name_str(iface),
|
||||||
hbs_sock.tx_mesg[iface].s,
|
hbs_sock.tx_mesg[iface].s,
|
||||||
&hbs_sock.tx_mesg[iface].m[0],
|
&hbs_sock.tx_mesg[iface].m[0],
|
||||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||||
@ -959,7 +1011,9 @@ int hbs_pulse_request ( iface_enum iface,
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
wlog("Unable to send pulse request - null tx object - auto re-init pending\n");
|
elog_throttled( pulse_request_fail_log_counter[iface], 100,
|
||||||
|
"Unable to send %s pulse request on null socket",
|
||||||
|
get_iface_name_str(iface));
|
||||||
return (FAIL_SOCKET_SENDTO);
|
return (FAIL_SOCKET_SENDTO);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1448,10 +1502,6 @@ void daemon_service_run ( void )
|
|||||||
* ultimately triggers an exit if that retry count gets too big */
|
* ultimately triggers an exit if that retry count gets too big */
|
||||||
int socket_init_fail_count = 0 ;
|
int socket_init_fail_count = 0 ;
|
||||||
|
|
||||||
/* Used to throttle warning messages that report
|
|
||||||
* an error transmitting the pulse request */
|
|
||||||
int pulse_request_fail_log_counter[MAX_IFACES] ;
|
|
||||||
|
|
||||||
/* throttle initialization wait logs */
|
/* throttle initialization wait logs */
|
||||||
int wait_log_throttle = 0 ;
|
int wait_log_throttle = 0 ;
|
||||||
|
|
||||||
@ -1561,38 +1611,6 @@ void daemon_service_run ( void )
|
|||||||
daemon_exit();
|
daemon_exit();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Setup the heartbeat sockets */
|
|
||||||
if ( (rc = hbs_socket_init ()) != PASS )
|
|
||||||
{
|
|
||||||
if ( socket_init_fail_count++ == 10 )
|
|
||||||
{
|
|
||||||
elog ("Failed socket initialization (rc:%d) max retries ; exiting ...\n", rc );
|
|
||||||
daemon_exit ();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
elog ("Failed socket initialization (rc:%d) ; will retry in 5 secs ...\n", rc );
|
|
||||||
sleep (5);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
ilog ("Sending ready event to maintenance\n");
|
|
||||||
do
|
|
||||||
{
|
|
||||||
/* Wait for maintenance */
|
|
||||||
rc = send_event ( hbsInv.my_hostname, MTC_EVENT_HEARTBEAT_READY, MGMNT_IFACE ) ;
|
|
||||||
if ( rc == RETRY )
|
|
||||||
{
|
|
||||||
mtcWait_secs ( 3 );
|
|
||||||
}
|
|
||||||
} while ( rc == RETRY ) ;
|
|
||||||
if ( rc == FAIL )
|
|
||||||
{
|
|
||||||
elog ("Unrecoverable heartbeat startup error (rc=%d)\n", rc );
|
|
||||||
daemon_exit ();
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( get_link_state ( hbs_sock.ioctl_sock, hbs_config.mgmnt_iface, &hbsInv.mgmnt_link_up_and_running ) )
|
if ( get_link_state ( hbs_sock.ioctl_sock, hbs_config.mgmnt_iface, &hbsInv.mgmnt_link_up_and_running ) )
|
||||||
{
|
{
|
||||||
hbsInv.mgmnt_link_up_and_running = false ;
|
hbsInv.mgmnt_link_up_and_running = false ;
|
||||||
@ -1616,6 +1634,45 @@ void daemon_service_run ( void )
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Setup the heartbeat sockets */
|
||||||
|
if ( (rc = hbs_socket_init ()) != PASS )
|
||||||
|
{
|
||||||
|
#define HBS_SOCKET_INIT_RETRY_THRESHOLD (3)
|
||||||
|
#define HBS_SOCKET_INIT_RETRY_INTERVAL (2)
|
||||||
|
if ( socket_init_fail_count++ == HBS_SOCKET_INIT_RETRY_THRESHOLD )
|
||||||
|
{
|
||||||
|
elog ("Failed socket initialization (rc:%d) "
|
||||||
|
"max retries ; exiting ...", rc );
|
||||||
|
daemon_exit ();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
elog ("Failed socket initialization (rc:%d) ; "
|
||||||
|
"will retry in %d secs ...\n",
|
||||||
|
rc, HBS_SOCKET_INIT_RETRY_INTERVAL);
|
||||||
|
sleep (HBS_SOCKET_INIT_RETRY_INTERVAL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ilog ("Sending ready event to maintenance\n");
|
||||||
|
do
|
||||||
|
{
|
||||||
|
/* Wait for maintenance */
|
||||||
|
rc = send_event ( hbsInv.my_hostname, MTC_EVENT_HEARTBEAT_READY, MGMNT_IFACE ) ;
|
||||||
|
if ( rc == RETRY )
|
||||||
|
{
|
||||||
|
// TODO: Threshold this loop and exit or this
|
||||||
|
// could be a silent process failure loop.
|
||||||
|
mtcWait_secs ( 3 );
|
||||||
|
}
|
||||||
|
} while ( rc == RETRY ) ;
|
||||||
|
if ( rc == FAIL )
|
||||||
|
{
|
||||||
|
elog ("Unrecoverable heartbeat startup error (rc=%d)\n", rc );
|
||||||
|
daemon_exit ();
|
||||||
|
}
|
||||||
|
|
||||||
/* Make the main loop schedule in real-time */
|
/* Make the main loop schedule in real-time */
|
||||||
{
|
{
|
||||||
struct sched_param param ;
|
struct sched_param param ;
|
||||||
@ -1720,7 +1777,8 @@ void daemon_service_run ( void )
|
|||||||
counter = 1 ;
|
counter = 1 ;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if ( hbsInv.hbs_disabled == true )
|
else if (( hbsInv.hbs_disabled == true ) &&
|
||||||
|
( hbsInv.mgmnt_link_up_and_running == true ))
|
||||||
{
|
{
|
||||||
hbs_ctrl.locked = false ;
|
hbs_ctrl.locked = false ;
|
||||||
hbsInv.hbs_disabled = false;
|
hbsInv.hbs_disabled = false;
|
||||||
@ -2191,12 +2249,11 @@ void daemon_service_run ( void )
|
|||||||
if ( rc != 0 )
|
if ( rc != 0 )
|
||||||
{
|
{
|
||||||
/* TODO: Fix this with an alarm */
|
/* TODO: Fix this with an alarm */
|
||||||
wlog_throttled ( pulse_request_fail_log_counter[iface], 100,
|
if ( pulse_request_fail_log_counter[iface] > INTERFACE_ERRORS_FOR_REINIT )
|
||||||
"%s hbs_pulse_request failed - rc:%d\n", get_iface_name_str(iface), rc);
|
|
||||||
|
|
||||||
if ( pulse_request_fail_log_counter[iface] == INTERFACE_ERRORS_FOR_REINIT )
|
|
||||||
{
|
{
|
||||||
_setup_pulse_messaging ( (iface_enum)iface , daemon_get_rmem_max ()) ;
|
rc = _setup_pulse_messaging ( (iface_enum)iface , daemon_get_rmem_max ()) ;
|
||||||
|
if ( rc )
|
||||||
|
continue ;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
|
* Copyright (c) 2013-2020 Wind River Systems, Inc.
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
*
|
*
|
||||||
@ -436,11 +436,14 @@ int daemon_configure ( void )
|
|||||||
if ( strlen(hbs_config.clstr_iface) )
|
if ( strlen(hbs_config.clstr_iface) )
|
||||||
{
|
{
|
||||||
if (strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface))
|
if (strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface))
|
||||||
|
{
|
||||||
|
if (strcmp(hbs_config.clstr_iface, LOOPBACK_IF))
|
||||||
{
|
{
|
||||||
clstr_network_provisioned = true ;
|
clstr_network_provisioned = true ;
|
||||||
ilog ("Cluster-host Name : %s\n", hbs_config.clstr_iface );
|
ilog ("Cluster-host Name : %s\n", hbs_config.clstr_iface );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
if ( clstr_network_provisioned == true )
|
if ( clstr_network_provisioned == true )
|
||||||
{
|
{
|
||||||
ilog("Cluster-host Port : %d (rx)", hbs_config.hbs_client_clstr_port );
|
ilog("Cluster-host Port : %d (rx)", hbs_config.hbs_client_clstr_port );
|
||||||
@ -476,44 +479,80 @@ int _setup_pulse_messaging ( iface_enum i, int rmem )
|
|||||||
/* client sockets are not modified */
|
/* client sockets are not modified */
|
||||||
UNUSED(rmem);
|
UNUSED(rmem);
|
||||||
|
|
||||||
/* Load up the interface name */
|
|
||||||
if ( i == MGMNT_IFACE )
|
|
||||||
{
|
|
||||||
iface = hbs_config.mgmnt_iface ;
|
|
||||||
}
|
|
||||||
else if (( i == CLSTR_IFACE ) && ( hbs_config.clstr_iface != NULL ))
|
|
||||||
{
|
|
||||||
iface = hbs_config.clstr_iface ;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
wlog ("No Cluster-host Interface\n");
|
|
||||||
return (RETRY);
|
|
||||||
}
|
|
||||||
|
|
||||||
_close_pulse_rx_sock (i);
|
_close_pulse_rx_sock (i);
|
||||||
_close_pulse_tx_sock (i);
|
_close_pulse_tx_sock (i);
|
||||||
|
|
||||||
/********************************************************************/
|
/********************************************************************/
|
||||||
/* Setup multicast Pulse Request Receive Socket */
|
/* Setup multicast Pulse Request Receive Socket */
|
||||||
/********************************************************************/
|
/********************************************************************/
|
||||||
|
/* Load up the interface name */
|
||||||
|
if ( i == MGMNT_IFACE )
|
||||||
|
{
|
||||||
|
iface = hbs_config.mgmnt_iface ;
|
||||||
|
if (strcmp(iface, LOOPBACK_IF))
|
||||||
|
{
|
||||||
hbs_sock.rx_sock[i] =
|
hbs_sock.rx_sock[i] =
|
||||||
new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true,true);
|
new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true,true);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Default to unicast heartbeat on management 'lo' interface
|
||||||
|
hbs_sock.rx_sock[i] =
|
||||||
|
new msgClassRx(my_address.data(),hbs_sock.rx_port[i],IPPROTO_UDP,iface,false, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
else if (( i == CLSTR_IFACE ) &&
|
||||||
|
( clstr_network_provisioned == true ) &&
|
||||||
|
( hbs_config.clstr_iface != NULL ))
|
||||||
|
{
|
||||||
|
iface = hbs_config.clstr_iface ;
|
||||||
|
hbs_sock.rx_sock[i] =
|
||||||
|
new msgClassRx(hbs_config.multicast,hbs_sock.rx_port[i],IPPROTO_UDP,iface,true,true);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ilog("Cluster host interface not used.");
|
||||||
|
return (PASS);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( hbs_sock.rx_sock[i] )
|
||||||
|
{
|
||||||
if (hbs_sock.rx_sock[i]->return_status != PASS)
|
if (hbs_sock.rx_sock[i]->return_status != PASS)
|
||||||
{
|
{
|
||||||
elog("Cannot create socket (%d) (%d:%m)\n", i, errno );
|
elog("Failed to create %s pulse receiver socket (%d:%d:%m)\n",
|
||||||
|
get_iface_name_str(i),
|
||||||
|
hbs_sock.rx_sock[i]->return_status,
|
||||||
|
errno );
|
||||||
_close_pulse_rx_sock (i);
|
_close_pulse_rx_sock (i);
|
||||||
return (FAIL_SOCKET_CREATE);
|
return (FAIL_SOCKET_CREATE);
|
||||||
}
|
}
|
||||||
hbs_sock.rx_sock[i]->sock_ok(true);
|
hbs_sock.rx_sock[i]->sock_ok(true);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
elog("Failed to create %s pulse receiver socket (%d:%m)\n",
|
||||||
|
get_iface_name_str(i), errno );
|
||||||
|
return (FAIL_SOCKET_CREATE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/********************************************************************/
|
||||||
/* Setup unicast transmit (reply) socket */
|
/* Setup unicast transmit (reply) socket */
|
||||||
|
/********************************************************************/
|
||||||
hbs_sock.tx_sock[i] =
|
hbs_sock.tx_sock[i] =
|
||||||
new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP, iface);
|
new msgClassTx(hbs_config.multicast,hbs_sock.tx_port[i],IPPROTO_UDP, iface);
|
||||||
|
if ( hbs_sock.tx_sock[i] == NULL )
|
||||||
|
{
|
||||||
|
elog("Failed to create %s pulse reply socket (%d:%m)\n",
|
||||||
|
get_iface_name_str(i), errno );
|
||||||
|
return (FAIL_SOCKET_CREATE);
|
||||||
|
}
|
||||||
if (hbs_sock.tx_sock[i]->return_status != PASS)
|
if (hbs_sock.tx_sock[i]->return_status != PASS)
|
||||||
{
|
{
|
||||||
elog("Cannot create unicast transmit socket (%d) (%d:%m)\n", i, errno );
|
elog("Failed to create %s pulse reply socket (%d:%d:%m)\n",
|
||||||
|
get_iface_name_str(i),
|
||||||
|
hbs_sock.tx_sock[i]->return_status,
|
||||||
|
errno );
|
||||||
_close_pulse_tx_sock(i);
|
_close_pulse_tx_sock(i);
|
||||||
return (FAIL_SOCKET_CREATE);
|
return (FAIL_SOCKET_CREATE);
|
||||||
}
|
}
|
||||||
@ -1234,7 +1273,7 @@ int daemon_init ( string iface, string nodeType_str )
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Setup the heartbeat service messaging sockets */
|
/* Setup the heartbeat service messaging sockets */
|
||||||
else if ( hbs_socket_init () != PASS )
|
else if (( rc = hbs_socket_init ()) != PASS )
|
||||||
{
|
{
|
||||||
elog ("socket initialization failed (rc:%d)\n", rc );
|
elog ("socket initialization failed (rc:%d)\n", rc );
|
||||||
rc = FAIL_SOCKET_INIT;
|
rc = FAIL_SOCKET_INIT;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user