Merge "Fix BMC access loss handling"

This commit is contained in:
Zuul 2020-01-13 22:29:15 +00:00 committed by Gerrit Code Review
commit 46c6b245df
10 changed files with 82 additions and 92 deletions

View File

@ -1041,19 +1041,25 @@ int msgClassRx::initSocket(bool allow_any, bool is_multicast)
* @param IP protocol
* @param name of interface to bind to, or null otherwise
*/
msgClassTx::msgClassTx(const char* address, int port, int proto, const char* interface)
msgClassTx::msgClassTx(const char* address, int port, int proto, const char* interface, bool quiet)
{
this->dst_addr = new msgClassAddr(address, port, proto);
this->src_addr = new msgClassAddr(address, port, proto);
if(interface)
{
ilog ("Creating %s socket on port %d with address: %s\n", interface, port, address);
if ( !quiet)
{
ilog ("Creating %s socket on port %d with address: %s\n", interface, port, address);
}
this->interface = new char[strlen(interface)+1];
snprintf(this->interface, strlen(interface)+1, "%s", interface);
}
else
{
ilog ("Creating socket on port %d with address: %s\n", port, address);
if ( !quiet)
{
ilog ("Creating socket on port %d with address: %s\n", port, address);
}
this->interface = NULL;
}
this->return_status = initSocket();

View File

@ -207,7 +207,7 @@ private:
class msgClassTx : public msgClassSock
{
public:
msgClassTx(const char* address, int port, int proto, const char* interface=NULL);
msgClassTx(const char* address, int port, int proto, const char* interface=NULL, bool quiet=false);
msgClassTx(const msgClassAddr& addr, const char* interface=NULL);
private:
int initSocket();

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017 Wind River Systems, Inc.
* Copyright (c) 2015-2017,2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -89,6 +89,7 @@ int pingUtil_init ( string hostname, ping_info_type & ping_info, const char * ip
ping_info.sequence = getpid() ;
ping_info.recv_retries = 0 ;
ping_info.send_retries = 0 ;
ping_info.fail_debounce = 0 ;
ping_info.requested= false ;
ping_info.received = false ;
ping_info.recv_flush_highwater=2;
@ -97,7 +98,7 @@ int pingUtil_init ( string hostname, ping_info_type & ping_info, const char * ip
ping_info.monitoring = false ;
ping_info.stage = PINGUTIL_MONITOR_STAGE__OPEN ;
ping_info.sock = new msgClassTx(ip_address, 0, IPPROTO_RAW, NULL );
ping_info.sock = new msgClassTx(ip_address, 0, IPPROTO_RAW, NULL, true);
/* Validate the socket setup */
if ( ping_info.sock == NULL )
@ -194,7 +195,6 @@ int pingUtil_send ( ping_info_type & ping_info )
}
ping_info.sequence++ ;
ping_info.recv_retries = 0;
if ( ping_info.ipv6_mode == false )
{
@ -238,8 +238,6 @@ int pingUtil_send ( ping_info_type & ping_info )
bytes = ping_info.sock->write( (const char*)&ping6_tx, sizeof(ping6_tx_message_type));
}
ping_info.recv_retries = 0;
if ( bytes <= 0 )
{
wlog ("%s ping %s send failed (rc:%d) (%d:%m)\n", ping_info.hostname.c_str(), ping_info.ip.c_str(), bytes, errno );
@ -508,10 +506,19 @@ int pingUtil_recv ( ping_info_type & ping_info,
if ( ping_info.monitoring == false )
{
/* ... only want the log when we are first connecting */
ilog ("%s ping to %s ok ; (try %d)",
if ( ping_info.recv_retries )
{
ilog ("%s ping %s ok ; (try %d)",
ping_info.hostname.c_str(),
ping_info.ip.c_str(),
ping_info.recv_retries+1);
}
else
{
ilog ("%s ping %s ok",
ping_info.hostname.c_str(),
ping_info.ip.c_str(),
ping_info.recv_retries+1);
ping_info.ip.c_str());
}
}
else
{
@ -575,6 +582,7 @@ void pingUtil_fini ( ping_info_type & ping_info )
ping_info.recv_retries = 0;
ping_info.send_retries = 0;
ping_info.fail_debounce = 0;
ping_info.sequence = 0;
ping_info.identity = 0;
@ -585,10 +593,7 @@ void pingUtil_fini ( ping_info_type & ping_info )
void pingUtil_restart ( ping_info_type & ping_info )
{
ilog ("%s ping monitor restart", ping_info.hostname.c_str());
ping_info.ok = false ;
ping_info.send_retries = 0 ;
ping_info.monitoring = false ;
dlog ("%s ping monitor restart", ping_info.hostname.c_str());
pingUtil_fini (ping_info);
pingUtil_init (ping_info.hostname, ping_info, ping_info.ip.data());
@ -668,7 +673,7 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
if ( ++ping_info.send_retries > PING_MAX_SEND_RETRIES )
{
elog ("%s ping to %s failed\n",
dlog ("%s ping %s failed ; max send retries\n",
ping_info.hostname.c_str(),
ping_info.ip.c_str());
@ -676,7 +681,9 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
}
else if ( pingUtil_send ( ping_info ) )
{
elog ("%s failed to send bmc ping\n", ping_info.hostname.c_str());
wlog ("%s ping %s send failed\n",
ping_info.hostname.c_str(),
ping_info.ip.c_str());
ping_info.stage = PINGUTIL_MONITOR_STAGE__FAIL ;
}
else
@ -694,6 +701,10 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
mtcTimer_reset ( ping_info.timer );
}
mtcTimer_start_msec ( ping_info.timer, ping_info.timer_handler, PING_WAIT_TIMER_MSEC );
/* send was ok so clear its retry counter */
ping_info.send_retries = 0 ;
ping_info.stage = PINGUTIL_MONITOR_STAGE__RECV ;
}
}
@ -711,25 +722,17 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
{
if ( ++ping_info.recv_retries > (PING_MAX_RECV_RETRIES) )
{
/* only print this log once on the resend attempt */
if ( ping_info.send_retries >= PING_MAX_SEND_RETRIES )
{
mlog ("%s ping recv from %s missed ; identity:%04x sequence:%04x (try %d of %d)\n",
ping_info.hostname.c_str(),
ping_info.ip.c_str(),
ping_info.identity,
ping_info.sequence,
ping_info.recv_retries-1,
PING_MAX_RECV_RETRIES);
}
ping_info.stage = PINGUTIL_MONITOR_STAGE__SEND ;
break ;
ping_info.recv_retries = 0;
dlog ("%s ping %s failed ; max recv retries\n",
ping_info.hostname.c_str(),
ping_info.ip.c_str());
ping_info.stage = PINGUTIL_MONITOR_STAGE__FAIL ;
}
else
{
blog1 ("%s retrying ping\n", ping_info.hostname.c_str());
blog1 ("%s retrying ping\n", ping_info.hostname.c_str());
mtcTimer_start_msec ( ping_info.timer, ping_info.timer_handler, PING_RETRY_DELAY_MSECS );
}
mtcTimer_start_msec ( ping_info.timer, ping_info.timer_handler, PING_RETRY_DELAY_MSECS );
}
else
{
@ -746,13 +749,7 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
ping_info.send_retries = 0 ;
ping_info.recv_retries = 0 ;
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__FAST_PING_AUDIT_HOST, ping_info.hostname ) == true )
interval = 3 ;
if ( daemon_want_fit ( FIT_CODE__FAST_PING_AUDIT_ALL ) == true )
interval = 3 ;
#endif
ping_info.fail_debounce = 0 ;
mtcTimer_start ( ping_info.timer, ping_info.timer_handler, interval );
ping_info.stage = PINGUTIL_MONITOR_STAGE__WAIT ;
}
@ -766,14 +763,30 @@ int pingUtil_acc_monitor ( ping_info_type & ping_info )
}
case PINGUTIL_MONITOR_STAGE__FAIL:
{
ping_info.ok = false ;
ping_info.send_retries = 0 ;
ping_info.monitoring = false ;
pingUtil_fini (ping_info);
pingUtil_init (ping_info.hostname, ping_info, ping_info.ip.data());
// Assume socket reinit unless thresholding */
bool reinit = true ;
if ( ping_info.ok == true )
{
if ( ++ping_info.fail_debounce < PING_FAIL_DEBOUNCE_THLD )
{
reinit = false ;
wlog("%s ping %s miss (%d of %d)",
ping_info.hostname.c_str(),
ping_info.ip.c_str(),
ping_info.fail_debounce,
PING_FAIL_DEBOUNCE_THLD );
}
}
if ( reinit == true )
{
elog("%s ping %s fail", ping_info.hostname.c_str(),
ping_info.ip.c_str());
pingUtil_fini (ping_info);
pingUtil_init (ping_info.hostname, ping_info, ping_info.ip.data());
}
mtcTimer_reset ( ping_info.timer );
mtcTimer_start ( ping_info.timer, ping_info.timer_handler, PING_MONITOR_INTERVAL );
mtcTimer_start ( ping_info.timer, ping_info.timer_handler, PING_FAIL_RETRY_DELAY );
ping_info.stage = PINGUTIL_MONITOR_STAGE__WAIT;
break ;
}

View File

@ -2,7 +2,7 @@
#define __INCLUDE_PINGUTIL_H__
/*
* Copyright (c) 2015-2017 Wind River Systems, Inc.
* Copyright (c) 2015-2017,2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -47,8 +47,8 @@ using namespace std;
#define PING_WAIT_TIMER_MSEC (200)
#define PING_RETRY_DELAY_MSECS (200)
#define PING_MONITOR_INTERVAL (60)
#define PING_MISS_RETRY_DELAY (5)
#define PING_FAIL_RETRY_DELAY (30)
#define PING_FAIL_DEBOUNCE_THLD (3)
#define PING_MESSAGE_LEN (80)
typedef enum
@ -73,6 +73,7 @@ typedef struct
unsigned short sequence ;
int send_retries ;
int recv_retries ;
int fail_debounce ;
bool ipv6_mode ;
bool received ;
bool requested ;

View File

@ -1,3 +1,3 @@
SRC_DIR="src"
TIS_PATCH_VER=156
TIS_PATCH_VER=157
BUILD_IS_SLOW=5

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2016 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -4322,7 +4322,9 @@ void nodeLinkClass::bmc_access_data_init ( struct nodeLinkClass::node * node_ptr
node_ptr->bm_ping_info.stage = PINGUTIL_MONITOR_STAGE__OPEN ;
mtcTimer_reset ( node_ptr->bm_ping_info.timer );
node_ptr->bm_ping_info.timer_handler = &mtcTimer_handler ;
node_ptr->bm_ping_info.ip = node_ptr->bm_ip ;
node_ptr->bm_ping_info.hostname = node_ptr->hostname ;
node_ptr->bmc_protocol_learning = false ;

View File

@ -85,6 +85,9 @@ void hwmonHostClass::bmc_data_init ( struct hwmonHostClass::hwmon_host * host_pt
host_ptr->accessible = false;
host_ptr->degraded = false ;
host_ptr->ping_info.ip = host_ptr->bm_ip ;
host_ptr->ping_info.hostname = host_ptr->hostname ;
hwmon_del_groups ( host_ptr );
hwmon_del_sensors ( host_ptr );

View File

@ -96,25 +96,9 @@ void hwmonHostClass::hwmon_fsm ( void )
pingUtil_acc_monitor ( host_ptr->ping_info );
if ( host_ptr->ping_info.ok == false )
{
/* Auto correct key ping information ; should never occur but if it does ... */
if (( host_ptr->ping_info.hostname.empty()) ||
( hostUtil_is_valid_ip_addr(host_ptr->ping_info.ip ) == false ))
{
slog ("%s host ping info missing ; (%d:%d)\n",
host_ptr->hostname.c_str(),
host_ptr->ping_info.hostname.empty(),
host_ptr->ping_info.ip.empty());
host_ptr->ping_info.hostname = host_ptr->hostname ;
host_ptr->ping_info.ip = host_ptr->bm_ip ;
}
}
/* Check to see if sensor monitoring for this host is
* disabled or the bm password has not yet been learned */
else if (( host_ptr->monitor == false ) || ( host_ptr->bm_pw.empty()))
if (( host_ptr->monitor == false ) || ( host_ptr->bm_pw.empty()))
{
/* ... make sure the thread sits in the
* idle state while disabled or there

View File

@ -1165,12 +1165,12 @@ int hwmonHostClass::bmc_sensor_monitor ( struct hwmonHostClass::hwmon_host * hos
{
if ( host_ptr->bmc_thread_info.status == FAIL_SYSTEM_CALL )
{
elog ("%s '%s' system call failed (retry %d of %d) (%s)",
elog ("%s '%s' system call failed (retry %d of %d)",
host_ptr->bmc_thread_info.log_prefix,
bmcUtil_getCmd_str(
host_ptr->bmc_thread_info.command).c_str(),
host_ptr->bmc_thread_ctrl.retries,
MAX_THREAD_RETRIES, host_ptr->thread_extra_info.bm_pw.c_str());
MAX_THREAD_RETRIES);
}
else if (( host_ptr->bmc_thread_ctrl.retries == (MAX_THREAD_RETRIES-1)) ||
( host_ptr->bmc_thread_ctrl.retries == 1 ))

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2018 Wind River Systems, Inc.
* Copyright (c) 2013-2020 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -3901,7 +3901,7 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
if ( node_ptr->bmc_accessible == false )
{
wlog ("%s Power Off request rejected ; BMC not accessible ; retry in %d seconds \n",
wlog ("%s Reset request rejected ; BMC not accessible ; retry in %d seconds \n",
node_ptr->hostname.c_str(),
MTC_POWER_ACTION_RETRY_DELAY);
@ -6185,29 +6185,10 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
pingUtil_acc_monitor ( node_ptr->bm_ping_info );
}
/*****************************************************************
* Manage bmc creds refresh
****************************************************************/
if ( node_ptr->bm_ping_info.ok == false )
{
/* Auto correct key ping information ;
* should never occur but if it does ... */
if (( node_ptr->bm_ping_info.hostname.empty()) ||
( node_ptr->bm_ping_info.ip.empty()))
{
node_ptr->bm_ping_info.hostname = node_ptr->hostname ;
node_ptr->bm_ping_info.ip = node_ptr->bm_ip ;
}
if ( ! node_ptr->bm_pw.empty() )
{
node_ptr->bm_pw.clear();
}
}
/*****************************************************************
* Manage getting the bm password but only when ping is ok
****************************************************************/
else if ( node_ptr->bm_pw.empty() )
if ( node_ptr->bm_pw.empty() )
{
barbicanSecret_type * secret = secretUtil_manage_secret( node_ptr->secretEvent,
node_ptr->hostname,