Fix BMC access loss handling

Recent refactoring of the BMC handler FSM introduced a code change that
prevents the BMC Access alarm from being raised after initial BMC
accessibility was established and is then lost.

This update ensures BMC access alarm management is working properly.

This update also implements ping failure debounce so that a single ping
failure does not trigger full reconnection handling. Instead that now
requires 3 ping failures in a row. This has the effect of adding a minute
to ping failure action handling before the usual 2 minute BMC access failure
alarm is raised. ping failure logging is reduced/improved.

Test Plan: for both hwmond and mtcAgent

PASS: Verify BMC access alarm due to bad provisioning (un, pw, ip, type)
PASS: Verify BMC ping failure debounce handling, recovery and logging
PASS: Verify BMC ping persistent failure handling
PASS: Verify BMC ping periodic miss handling
PASS: Verify BMC ping and access failure recovery timing
PASS: Verify BMC ping failure and recovery handling over BMC link pull/plug
PASS: Verify BMC sensor monitoring stops/resumes over ping failure/recovery

Regression:

PASS: Verify IPv6 System Install using provisioned BMCs (wp8-12)
PASS: Verify BMC power-off request handling with BMC ping failing & recovering
PASS: Verify BMC power-on request handling with BMC ping failing & recovering
PASS: Verify BMC reset request handling with BMC ping failing & recovering
PASS: Verify BMC sensor group read failure handling & recovery
PASS: Verify sensor monitoring after ping failure handling & recovery

Change-Id: I74870816930ef6cdb11f987424ffed300ff8affe
Closes-Bug: 1858110
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
(cherry picked from commit 9bf231a286)
This commit is contained in:
Eric MacDonald 2020-01-03 09:34:37 -05:00
parent be3cf4eeb5
commit eaf1f0c93d
10 changed files with 82 additions and 92 deletions

View File

@ -1041,19 +1041,25 @@ int msgClassRx::initSocket(bool allow_any, bool is_multicast)
* @param IP protocol
* @param name of interface to bind to, or null otherwise
*/
msgClassTx::msgClassTx(const char* address, int port, int proto, const char* interface)
msgClassTx::msgClassTx(const char* address, int port, int proto, const char* interface, bool quiet)
{
this->dst_addr = new msgClassAddr(address, port, proto);
this->src_addr = new msgClassAddr(address, port, proto);
if(interface)
{
ilog ("Creating %s socket on port %d with address: %s\n", interface, port, address);
if ( !quiet)
{
ilog ("Creating %s socket on port %d with address: %s\n", interface, port, address);
}
this->interface = new char[strlen(interface)+1];
snprintf(this->interface, strlen(interface)+1, "%s", interface);
}
else
{
ilog ("Creating socket on port %d with address: %s\n", port, address);
if ( !quiet)
{
ilog ("Creating socket on port %d with address: %s\n", port, address);
}
this->interface = NULL;
}
this->return_status = initSocket();

View File

@ -207,7 +207,7 @@ private:
class msgClassTx : public msgClassSock
{
public:
msgClassTx(const char* address, int port, int proto, const char* interface=NULL);
msgClassTx(const char* address, int port, int proto, const char* interface=NULL, bool quiet=false);
msgClassTx(const msgClassAddr& addr, const char* interface=NULL);
private:
int initSocket();

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017 Wind River Systems, Inc.
* Copyright (c) 2015-2017,2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -89,6 +89,7 @@ int pingUtil_init ( string hostname, ping_info_type & ping_info, const char * ip
ping_info.sequence = getpid() ;
ping_info.recv_retries = 0 ;
ping_info.send_retries = 0 ;
ping_info.fail_debounce = 0 ;
ping_info.requested= false ;
ping_info.received = false ;
ping_info.recv_flush_highwater=2;
@ -97,7 +98,7 @@ int pingUtil_init ( string hostname, ping_info_type & ping_info, const char * ip
ping_info.monitoring = false ;
ping_info.stage = PINGUTIL_MONITOR_STAGE__OPEN ;
ping_info.sock = new msgClassTx(ip_address, 0, IPPROTO_RAW, NULL );
ping_info.sock = new msgClassTx(ip_address, 0, IPPROTO_RAW, NULL, true);
/* Validate the socket setup */
if ( ping_info.sock == NULL )
@ -194,7 +195,6 @@ int pingUtil_send ( ping_info_type & ping_info )
}
ping_info.sequence++ ;
ping_info.recv_retries = 0;
if ( ping_info.ipv6_mode == false )
{
@ -238,8 +238,6 @@ int pingUtil_send ( ping_info_type & ping_info )
bytes = ping_info.sock->write( (const char*)&ping6_tx, sizeof(ping6_tx_message_type));
}
ping_info.recv_retries = 0;
if ( bytes <= 0 )
{
wlog ("%s ping %s send failed (rc:%d) (%d:%m)\n", ping_info.hostname.c_str(), ping_info.ip.c_str(), bytes, errno );
@ -508,10 +506,19 @@ int pingUtil_recv ( ping_info_type & ping_info,
if ( ping_info.monitoring == false )
{
/* ... only want the log when we are first connecting */
ilog ("%s ping to %s ok ; (try %d)",
if ( ping_info.recv_retries )
{
ilog ("%s ping %s ok ; (try %d)",
ping_info.hostname.c_str(),
ping_info.ip.c_str(),
ping_info.recv_retries+1);
}
else
{
ilog ("%s ping %s ok",
ping_info.hostname.c_str(),
ping_info.ip.c_str(),
ping_info.recv_retries+1);
ping_info.ip.c_str());
}
}
else
{
@ -575,6 +582,7 @@ void pingUtil_fini ( ping_info_type & ping_info )
ping_info.recv_retries = 0;
ping_info.send_retries = 0;
ping_info.fail_debounce = 0;
ping_info.sequence = 0;
ping_info.identity = 0;
@ -585,10 +593,7 @@ void pingUtil_fini ( ping_info_type & ping_info )
void pingUtil_restart ( ping_info_type & ping_info )
{
ilog ("%s ping monitor restart", ping_info.hostname.c_str());
ping_info.ok = false ;
ping_info.send_retries = 0 ;
ping_info.monitoring = false ;
dlog ("%s ping monitor restart", ping_info.hostname.c_str());
pingUtil_fini (ping_info);
pingUtil_init (ping_info.hostname, ping_info, ping_info.ip.data());
@ -668,7 +673,7 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
if ( ++ping_info.send_retries > PING_MAX_SEND_RETRIES )
{
elog ("%s ping to %s failed\n",
dlog ("%s ping %s failed ; max send retries\n",
ping_info.hostname.c_str(),
ping_info.ip.c_str());
@ -676,7 +681,9 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
}
else if ( pingUtil_send ( ping_info ) )
{
elog ("%s failed to send bmc ping\n", ping_info.hostname.c_str());
wlog ("%s ping %s send failed\n",
ping_info.hostname.c_str(),
ping_info.ip.c_str());
ping_info.stage = PINGUTIL_MONITOR_STAGE__FAIL ;
}
else
@ -694,6 +701,10 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
mtcTimer_reset ( ping_info.timer );
}
mtcTimer_start_msec ( ping_info.timer, ping_info.timer_handler, PING_WAIT_TIMER_MSEC );
/* send was ok so clear its retry counter */
ping_info.send_retries = 0 ;
ping_info.stage = PINGUTIL_MONITOR_STAGE__RECV ;
}
}
@ -711,25 +722,17 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
{
if ( ++ping_info.recv_retries > (PING_MAX_RECV_RETRIES) )
{
/* only print this log once on the resend attempt */
if ( ping_info.send_retries >= PING_MAX_SEND_RETRIES )
{
mlog ("%s ping recv from %s missed ; identity:%04x sequence:%04x (try %d of %d)\n",
ping_info.hostname.c_str(),
ping_info.ip.c_str(),
ping_info.identity,
ping_info.sequence,
ping_info.recv_retries-1,
PING_MAX_RECV_RETRIES);
}
ping_info.stage = PINGUTIL_MONITOR_STAGE__SEND ;
break ;
ping_info.recv_retries = 0;
dlog ("%s ping %s failed ; max recv retries\n",
ping_info.hostname.c_str(),
ping_info.ip.c_str());
ping_info.stage = PINGUTIL_MONITOR_STAGE__FAIL ;
}
else
{
blog1 ("%s retrying ping\n", ping_info.hostname.c_str());
blog1 ("%s retrying ping\n", ping_info.hostname.c_str());
mtcTimer_start_msec ( ping_info.timer, ping_info.timer_handler, PING_RETRY_DELAY_MSECS );
}
mtcTimer_start_msec ( ping_info.timer, ping_info.timer_handler, PING_RETRY_DELAY_MSECS );
}
else
{
@ -746,13 +749,7 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
ping_info.send_retries = 0 ;
ping_info.recv_retries = 0 ;
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__FAST_PING_AUDIT_HOST, ping_info.hostname ) == true )
interval = 3 ;
if ( daemon_want_fit ( FIT_CODE__FAST_PING_AUDIT_ALL ) == true )
interval = 3 ;
#endif
ping_info.fail_debounce = 0 ;
mtcTimer_start ( ping_info.timer, ping_info.timer_handler, interval );
ping_info.stage = PINGUTIL_MONITOR_STAGE__WAIT ;
}
@ -766,14 +763,30 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
}
case PINGUTIL_MONITOR_STAGE__FAIL:
{
ping_info.ok = false ;
ping_info.send_retries = 0 ;
ping_info.monitoring = false ;
pingUtil_fini (ping_info);
pingUtil_init (ping_info.hostname, ping_info, ping_info.ip.data());
// Assume socket reinit unless thresholding */
bool reinit = true ;
if ( ping_info.ok == true )
{
if ( ++ping_info.fail_debounce < PING_FAIL_DEBOUNCE_THLD )
{
reinit = false ;
wlog("%s ping %s miss (%d of %d)",
ping_info.hostname.c_str(),
ping_info.ip.c_str(),
ping_info.fail_debounce,
PING_FAIL_DEBOUNCE_THLD );
}
}
if ( reinit == true )
{
elog("%s ping %s fail", ping_info.hostname.c_str(),
ping_info.ip.c_str());
pingUtil_fini (ping_info);
pingUtil_init (ping_info.hostname, ping_info, ping_info.ip.data());
}
mtcTimer_reset ( ping_info.timer );
mtcTimer_start ( ping_info.timer, ping_info.timer_handler, PING_MONITOR_INTERVAL );
mtcTimer_start ( ping_info.timer, ping_info.timer_handler, PING_FAIL_RETRY_DELAY );
ping_info.stage = PINGUTIL_MONITOR_STAGE__WAIT;
break ;
}

View File

@ -2,7 +2,7 @@
#define __INCLUDE_PINGUTIL_H__
/*
* Copyright (c) 2015-2017 Wind River Systems, Inc.
* Copyright (c) 2015-2017,2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -47,8 +47,8 @@ using namespace std;
#define PING_WAIT_TIMER_MSEC (200)
#define PING_RETRY_DELAY_MSECS (200)
#define PING_MONITOR_INTERVAL (60)
#define PING_MISS_RETRY_DELAY (5)
#define PING_FAIL_RETRY_DELAY (30)
#define PING_FAIL_DEBOUNCE_THLD (3)
#define PING_MESSAGE_LEN (80)
typedef enum
@ -73,6 +73,7 @@ typedef struct
unsigned short sequence ;
int send_retries ;
int recv_retries ;
int fail_debounce ;
bool ipv6_mode ;
bool received ;
bool requested ;

View File

@ -1,3 +1,3 @@
SRC_DIR="src"
TIS_PATCH_VER=156
TIS_PATCH_VER=157
BUILD_IS_SLOW=5

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2016 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -4322,7 +4322,9 @@ void nodeLinkClass::bmc_access_data_init ( struct nodeLinkClass::node * node_ptr
node_ptr->bm_ping_info.stage = PINGUTIL_MONITOR_STAGE__OPEN ;
mtcTimer_reset ( node_ptr->bm_ping_info.timer );
node_ptr->bm_ping_info.timer_handler = &mtcTimer_handler ;
node_ptr->bm_ping_info.ip = node_ptr->bm_ip ;
node_ptr->bm_ping_info.hostname = node_ptr->hostname ;
node_ptr->bmc_protocol_learning = false ;

View File

@ -85,6 +85,9 @@ void hwmonHostClass::bmc_data_init ( struct hwmonHostClass::hwmon_host * host_pt
host_ptr->accessible = false;
host_ptr->degraded = false ;
host_ptr->ping_info.ip = host_ptr->bm_ip ;
host_ptr->ping_info.hostname = host_ptr->hostname ;
hwmon_del_groups ( host_ptr );
hwmon_del_sensors ( host_ptr );

View File

@ -96,25 +96,9 @@ void hwmonHostClass::hwmon_fsm ( void )
pingUtil_acc_monitor ( host_ptr->ping_info );
if ( host_ptr->ping_info.ok == false )
{
/* Auto correct key ping information ; should never occur but if it does ... */
if (( host_ptr->ping_info.hostname.empty()) ||
( hostUtil_is_valid_ip_addr(host_ptr->ping_info.ip ) == false ))
{
slog ("%s host ping info missing ; (%d:%d)\n",
host_ptr->hostname.c_str(),
host_ptr->ping_info.hostname.empty(),
host_ptr->ping_info.ip.empty());
host_ptr->ping_info.hostname = host_ptr->hostname ;
host_ptr->ping_info.ip = host_ptr->bm_ip ;
}
}
/* Check to see if sensor monitoring for this host is
* disabled or the bm password has not yet been learned */
else if (( host_ptr->monitor == false ) || ( host_ptr->bm_pw.empty()))
if (( host_ptr->monitor == false ) || ( host_ptr->bm_pw.empty()))
{
/* ... make sure the thread sits in the
* idle state while disabled or there

View File

@ -1165,12 +1165,12 @@ int hwmonHostClass::bmc_sensor_monitor ( struct hwmonHostClass::hwmon_host * hos
{
if ( host_ptr->bmc_thread_info.status == FAIL_SYSTEM_CALL )
{
elog ("%s '%s' system call failed (retry %d of %d) (%s)",
elog ("%s '%s' system call failed (retry %d of %d)",
host_ptr->bmc_thread_info.log_prefix,
bmcUtil_getCmd_str(
host_ptr->bmc_thread_info.command).c_str(),
host_ptr->bmc_thread_ctrl.retries,
MAX_THREAD_RETRIES, host_ptr->thread_extra_info.bm_pw.c_str());
MAX_THREAD_RETRIES);
}
else if (( host_ptr->bmc_thread_ctrl.retries == (MAX_THREAD_RETRIES-1)) ||
( host_ptr->bmc_thread_ctrl.retries == 1 ))

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2018 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -3901,7 +3901,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->bmc_accessible == false )
{
wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds \n",
wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n",
node_ptr->hostname.c_str(),
MTC_POWER_ACTION_RETRY_DELAY);
@ -6185,29 +6185,10 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
pingUtil_acc_monitor ( node_ptr->bm_ping_info );
}
/*****************************************************************
* Manage bmc creds refresh
****************************************************************/
if ( node_ptr->bm_ping_info.ok == false )
{
/* Auto correct key ping information ;
* should never occur but if it does ... */
if (( node_ptr->bm_ping_info.hostname.empty()) ||
( node_ptr->bm_ping_info.ip.empty()))
{
node_ptr->bm_ping_info.hostname = node_ptr->hostname ;
node_ptr->bm_ping_info.ip = node_ptr->bm_ip ;
}
if ( ! node_ptr->bm_pw.empty() )
{
node_ptr->bm_pw.clear();
}
}
/*****************************************************************
* Manage getting the bm password but only when ping is ok
****************************************************************/
else if ( node_ptr->bm_pw.empty() )
if ( node_ptr->bm_pw.empty() )
{
barbicanSecret_type * secret = secretUtil_manage_secret( node_ptr->secretEvent,
node_ptr->hostname,