Merge "Add support for peer controller reset via mtcClient"

This commit is contained in:
Zuul 2021-01-18 15:27:12 +00:00 committed by Gerrit Code Review
commit d73e6af5a3
21 changed files with 674 additions and 69 deletions

View File

@ -274,9 +274,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr,
*
*************************************************************************/
string bmcUtil_create_data_fn ( string & hostname,
string file_suffix,
bmc_protocol_enum protocol )
string bmcUtil_create_data_fn ( const string & hostname,
string file_suffix,
bmc_protocol_enum protocol )
{
/* create the output filename */
string datafile ;

View File

@ -82,6 +82,14 @@ typedef struct
} bmc_info_type ;
/* Access information for one host and its board management controller (BMC).
 * All members are plain strings ; the mtcClient initialises every member
 * to "none" until real values are loaded. */
typedef struct
{
string hostname; /* name of the host this access info describes */
string host_ip ; /* ip address of the host itself ; the address the mtcClient sends the sync command to */
string bm_ip ; /* BMC ip address ; passed to the ipmi request as the target ip */
string bm_un ; /* BMC username ; passed to the ipmi request */
string bm_pw ; /* BMC password ; written to a temporary password file for the ipmi request */
} bmcUtil_accessInfo_type ;
/* BMC commands */
typedef enum
@ -107,6 +115,7 @@ typedef enum
#define BMC_QUERY_FILE_SUFFIX ((const char *)("_root_query"))
#define BMC_INFO_FILE_SUFFIX ((const char *)("_bmc_info"))
#define BMC_POWER_CMD_FILE_SUFFIX ((const char *)("_power_cmd_result"))
#define BMC_RESET_CMD_FILE_SUFFIX ((const char *)("_reset"))
#define BMC_BOOTDEV_CMD_FILE_SUFFIX ((const char *)("_bootdev"))
#define BMC_RESTART_CAUSE_FILE_SUFFIX ((const char *)("_restart_cause"))
#define BMC_POWER_STATUS_FILE_SUFFIX ((const char *)("_power_status"))
@ -137,9 +146,9 @@ void bmcUtil_create_pw_file ( thread_info_type * info_ptr,
bmc_protocol_enum protocol );
/* create the output filename */
string bmcUtil_create_data_fn ( string & hostname,
string file_suffix,
bmc_protocol_enum protocol );
string bmcUtil_create_data_fn ( const string & hostname,
string file_suffix,
bmc_protocol_enum protocol );
/* Get power state from query response data. */
int bmcUtil_is_power_on ( string hostname,

View File

@ -130,6 +130,14 @@ bool hostUtil_is_valid_username ( string un )
return (false);
}
/* A password is treated as valid when it is not empty and
 * is not the NONE placeholder string. */
bool hostUtil_is_valid_pw ( string pw )
{
    const bool has_content = ( pw.empty() == false ) ;
    return ( has_content && ( pw.compare(NONE) != 0 ) ) ;
}
bool hostUtil_is_valid_mac_addr ( string mac )
{
if ( !mac.empty() )

View File

@ -46,6 +46,7 @@ string hostUtil_getPrefixPath ( void );
bool hostUtil_is_valid_uuid ( string uuid );
bool hostUtil_is_valid_ip_addr ( string ip );
bool hostUtil_is_valid_username ( string un );
bool hostUtil_is_valid_pw ( string pw );
bool hostUtil_is_valid_bm_type ( string bm_type );
int hostUtil_mktmpfile ( string hostname, string basename, string & filename, string data );

View File

@ -202,3 +202,66 @@ int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_ty
ipmiUtil_bmc_info_log ( hostname, bmc_info, rc );
return (rc);
}
/*****************************************************************************
 *
 * Name       : ipmiUtil_reset_host_now
 *
 * Purpose    : Issue a blocking ipmitool power reset request to the BMC
 *              described by 'accessInfo'.
 *
 * Description: Creates the BMC and ipmitool output directories if they are
 *              missing, writes the BMC password to a temporary password
 *              file, builds the IPMITOOL_POWER_RESET_CMD request and runs
 *              it with system(). The temporary password file is closed and
 *              removed before returning.
 *
 * Parameters : hostname        - log prefix used in the latency warning
 *              accessInfo      - BMC ip, username and password to use
 *              output_filename - handed to ipmiUtil_create_request as the
 *                                request output file
 *
 * Returns    : 0 when system() returned 0, otherwise FAIL_SYSTEM_CALL
 *
 *****************************************************************************/
int ipmiUtil_reset_host_now ( string hostname,
                              bmcUtil_accessInfo_type accessInfo,
                              string output_filename)
{
    dlog("%s %s BMC [IP:%s UN:%s]",
          accessInfo.hostname.c_str(),
          accessInfo.host_ip.c_str(),
          accessInfo.bm_ip.c_str(),
          accessInfo.bm_un.c_str());

    /* make sure the output directories exist */
    if (daemon_is_file_present ( BMC_OUTPUT_DIR ) == false )
        daemon_make_dir(BMC_OUTPUT_DIR) ;
    if (daemon_is_file_present ( IPMITOOL_OUTPUT_DIR ) == false )
        daemon_make_dir(IPMITOOL_OUTPUT_DIR) ;

    /* create temp password file */
    thread_info_type info ;
    info.hostname = accessInfo.hostname ;
    info.password_file = "" ;
    info.pw_file_fd = 0 ;

    /* Use common utility to create a temp pw file
     * NOTE(review): the outcome of this call is not checked here ;
     * confirm how bmcUtil_create_pw_file reports a failure. */
    bmcUtil_create_pw_file ( &info, accessInfo.bm_pw, BMC_PROTOCOL__IPMITOOL );

    /* create request */
    string request =
    ipmiUtil_create_request ( IPMITOOL_POWER_RESET_CMD,
                              accessInfo.bm_ip,
                              accessInfo.bm_un,
                              info.password_file,
                              output_filename );

    /* issue request
     *
     * Note: Could launch a thread to avoid any stall.
     *       However, mtcClient can withstand up to a 25 second stall
     *       before pmon will fail it due to active monitoring.
     *       UT showed that there is no stall at all. */
    unsigned long long latency_threshold_secs = DEFAULT_SYSTEM_REQUEST_LATENCY_SECS ;
    unsigned long long before_time = gettime_monotonic_nsec () ;
    int rc = system ( request.c_str()) ;
    unsigned long long after_time = gettime_monotonic_nsec () ;
    unsigned long long delta_time = after_time-before_time ;
    if ( rc )
    {
        wlog("system call failed ; rc:%d [%d:%s]", rc, errno, strerror(errno) );
        rc = FAIL_SYSTEM_CALL ;
    }

    /* warn when the request took longer than the latency threshold */
    if ( delta_time > (latency_threshold_secs*NSEC_TO_SEC))
    {
        /* the nanosecond remainder must be zero padded to 9 digits ;
         * otherwise 1.05 secs would be reported as 1.5 secs */
        wlog ("%s bmc system call took %2llu.%09llu sec", hostname.c_str(),
               delta_time/NSEC_TO_SEC,
               delta_time%NSEC_TO_SEC);
    }

    /* Cleanup ; never leave the password file behind */
    if ( info.pw_file_fd > 0 )
        close(info.pw_file_fd);
    daemon_remove_file ( info.password_file.data());

    return (rc);
}

View File

@ -57,6 +57,8 @@ int ipmiUtil_init ( void );
int ipmiUtil_bmc_info_load ( string hostname, const char * filename, bmc_info_type & mc_info );
int ipmiUtil_reset_host_now ( string hostname, bmcUtil_accessInfo_type accessInfo, string output_filename );
/* Create the ipmi request */
string ipmiUtil_create_request ( string cmd, string & ip, string & un, string & pw, string & out );

View File

@ -149,6 +149,8 @@ const char * get_mtcNodeCommand_str ( int cmd )
case MTC_REQ_MTCALIVE: return ("mtcAlive req");
case MTC_MSG_LOCKED: return ("locked msg");
case MTC_CMD_LAZY_REBOOT: return ("lazy reboot");
case MTC_MSG_INFO: return ("info msg");
case MTC_CMD_SYNC: return ("sync");
/* goenabled commands and messages */
case MTC_MSG_MAIN_GOENABLED: return ("goEnabled main msg");
@ -199,7 +201,8 @@ const char * get_mtcNodeCommand_str ( int cmd )
case MTC_EVENT_PMON_MAJOR: return("pmon major event");
case MTC_EVENT_PMON_MINOR: return("pmon minor event");
case MTC_EVENT_PMON_LOG: return("pmon log");
case MTC_EVENT_PMOND_RAISE: return("pmon raise");
case MTC_EVENT_PMOND_RAISE: return("pmond raise");
case MTC_EVENT_PMOND_CLEAR: return("pmond clear");
/* data port events */
case MTC_EVENT_AVS_CLEAR: return("AVS clear");

View File

@ -751,7 +751,9 @@ typedef struct
#define MTC_CMD_START_STORAGE_SVCS 19 /* to host */
#define MTC_CMD_LAZY_REBOOT 20 /* to host */
#define MTC_CMD_HOST_SVCS_RESULT 21 /* to host */
#define MTC_CMD_LAST 22
#define MTC_MSG_INFO 22 /* to host */
#define MTC_CMD_SYNC 23 /* to host */
#define MTC_CMD_LAST 24
#define RESET_PROG_MAX_REBOOTS_B4_RESET (5)
#define RESET_PROG_MAX_REBOOTS_B4_RETRY (RESET_PROG_MAX_REBOOTS_B4_RESET+2)
@ -1263,6 +1265,14 @@ typedef enum
MTC_AR_DISABLE_CAUSE__NONE,
} autorecovery_disable_cause_enum ;
/* code that represents a specific group of maintenance information
* ... typically for a specific feature */
typedef enum
{
/* controller host and BMC access info ; the only code handled by build_mtcInfo_dict */
MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO,
/* NOTE(review): presumably an end-of-list marker rather than a real info code - confirm */
MTC_INFO_CODE__LAST
} mtcInfo_enum ;
/* Service Based Auto Recovery Control Structure */
typedef struct
{

View File

@ -3295,6 +3295,102 @@ void nodeLinkClass::mtcInfo_log ( struct nodeLinkClass::node * node_ptr )
}
}
/***************************************************************************
*
* Name : build_mtcInfo_dict
*
* Purpose : Build a json dictionary for the specified info code enum
*
* Assumptions : Only MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO is supported
*
* Returns : Returns a json dictionary of mtcInfo.
*
* {
* "controller-0":{
* "mgmt_ip":"192.168.204.2",
* "bm_ip":"xxx.xxx.xx.23",
* "bm_un":"root",
* "bm_pw":"root"
* },
* "controller-1":{
* "mgmt_ip":"192.168.204.3",
* "bm_ip":"xxx.xxx.xx.24",
* "bm_un":"root",
* "bm_pw":"root"
* }
* }
*
**************************************************************************/
string nodeLinkClass::build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code )
{
    string mtcInfo_dict = "" ;

    /* should never happen but better to be safe */
    if ( head == NULL )
        return mtcInfo_dict ;

    /* Escape a value for use inside a json string.
     *
     * Provisioned values such as the BMC password may contain '"' or '\'.
     * Appending them verbatim would produce a dictionary the receiver
     * cannot parse, so quote, backslash and control characters are
     * escaped as required for json strings. */
    auto json_escape = [] ( const string & raw ) -> string
    {
        static const char hex_digits [] = "0123456789abcdef" ;
        string escaped ;
        escaped.reserve ( raw.length() );
        for ( size_t i = 0 ; i < raw.length() ; ++i )
        {
            const unsigned char uc = static_cast<unsigned char>(raw[i]) ;
            if (( uc == '"' ) || ( uc == '\\' ))
            {
                escaped.push_back('\\');
                escaped.push_back(raw[i]);
            }
            else if ( uc < 0x20 )
            {
                /* control characters become \u00XX */
                escaped.append("\\u00");
                escaped.push_back(hex_digits[(uc >> 4) & 0xf]);
                escaped.push_back(hex_digits[uc & 0xf]);
            }
            else
            {
                escaped.push_back(raw[i]);
            }
        }
        return escaped ;
    };

    /* number of controller entries added so far ; loop/exit control */
    int controllers_added = 0 ;

    /* force the update to be a dictionary */
    mtcInfo_dict = "{" ;

    for ( struct node * ptr = head ; ; ptr = ptr->next )
    {
        if (( ptr->nodetype & CONTROLLER_TYPE ) &&
            ( mtcInfo_code == MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ))
        {
            /* separate this entry from the previous one */
            if ( controllers_added )
                mtcInfo_dict.append(",");

            mtcInfo_dict.append("\"" + json_escape(ptr->hostname) + "\":{");
            mtcInfo_dict.append("\"mgmt_ip\":\"" + json_escape(ptr->ip) + "\",");
            mtcInfo_dict.append("\"bm_ip\":\"" + json_escape(ptr->bm_ip) + "\",");
            mtcInfo_dict.append("\"bm_un\":\"" + json_escape(ptr->bm_un) + "\",");
            mtcInfo_dict.append("\"bm_pw\":\"" + json_escape(ptr->bm_pw) + "\"}");

            /* at most two controller entries are reported */
            if ( ++controllers_added >= 2 )
                break ;
        }
        /* stop at the end of the inventory list */
        if (( ptr->next == NULL ) || ( ptr == tail ))
            break ;
    }
    mtcInfo_dict.append("}");
    return mtcInfo_dict ;
}
/**************************************************************************
*
* Name : mtcInfo_handler
*
* Purpose : Send mtcInfo update to provisioned controllers when
* the push flag is set.
*
**************************************************************************/
void nodeLinkClass::mtcInfo_handler ( void )
{
/* This is set in the bm_handler once access to the BMC using
* provisioned credentials have been verified. */
if ( this->want_mtcInfo_push )
{
/* handler will enhance when more codes are introduced */
mtcInfo_enum mtcInfo_code = MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO ;
string mtcInfo_dict = build_mtcInfo_dict(mtcInfo_code);
if ( ! mtcInfo_dict.empty() )
{
string temp = CONTROLLER_0 ;
send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict);
if ( this->controllers > 1 )
{
temp = CONTROLLER_1;
send_mtc_cmd ( temp, MTC_MSG_INFO, MGMNT_INTERFACE, mtcInfo_dict);
}
}
this->want_mtcInfo_push = false ;
}
}
/* Lock Rules
*
* 1. Cannot lock this controller
@ -4422,6 +4518,18 @@ string nodeLinkClass::get_bm_ip ( string hostname )
return ("");
}
/* Return the board management password stored for the named host,
 * or an empty string (after logging an error) if the host is not found. */
string nodeLinkClass::get_bm_pw ( string hostname )
{
    nodeLinkClass::node * host_ptr = nodeLinkClass::getNode ( hostname );
    if ( host_ptr == NULL )
    {
        elog ("%s bm pw lookup failed\n", hostname.c_str() );
        return ("");
    }
    return (host_ptr->bm_pw);
}
string nodeLinkClass::get_bm_un ( string hostname )
{
nodeLinkClass::node* node_ptr ;

View File

@ -828,10 +828,13 @@ private:
int oos_test_handler ( struct nodeLinkClass::node * node_ptr );
int insv_test_handler ( struct nodeLinkClass::node * node_ptr );
int stress_handler ( struct nodeLinkClass::node * node_ptr );
int bmc_handler ( struct nodeLinkClass::node * node_ptr );
int bmc_handler ( struct nodeLinkClass::node * node_ptr );
int degrade_handler ( struct nodeLinkClass::node * node_ptr );
int uptime_handler ( void );
void mtcInfo_handler ( void );
int host_services_handler ( struct nodeLinkClass::node * node_ptr );
/* Starts the specified 'reset or powercycle' recovery monitor */
@ -851,13 +854,22 @@ private:
void ctl_mtcAlive_gate ( struct nodeLinkClass::node * node_ptr, bool gate_state );
void set_mtcAlive ( struct nodeLinkClass::node * node_ptr, int interface );
/********* mtcInfo in the database ************/
int mtcInfo_set ( struct nodeLinkClass::node * node_ptr, string key, string value );
string mtcInfo_get ( struct nodeLinkClass::node * node_ptr, string key );
void mtcInfo_clr ( struct nodeLinkClass::node * node_ptr, string key );
void mtcInfo_log ( struct nodeLinkClass::node * node_ptr );
int set_mtcInfo ( struct nodeLinkClass::node * node_ptr, string & mtc_info );
/********* mtcInfo that gets pushed out to daemons ***********/
/* flag telling mtce when a mtcInfo push needs to be done */
bool want_mtcInfo_push = false ;
/* performs the mtcInfo push */
void push_mtcInfo ( void );
/*****************************************************************************
*
* Name : bmc_command_send
@ -1192,11 +1204,11 @@ private:
* Set to true when the autorecovery threshold is reached
* and we want to avoid taking further autorecovery action
* even though it may be requested. */
bool autorecovery_disabled ;
bool autorecovery_disabled = false ;
/* Set to true by fault detection methods that are
* autorecoverable when in simplex mode. */
bool autorecovery_enabled ;
bool autorecovery_enabled = false ;
/** Tracks the number of hosts that 'are currently' in service trouble
* wrt heartbeat (above minor threshold).
@ -1464,11 +1476,14 @@ public:
/***********************************************************/
/** Number of provisioned controllers */
int controllers = 0 ;
/** Number of provisioned hosts (nodes) */
int hosts ;
int hosts = 0 ;
/* Set to True while waiting for UNLOCK_READY_FILE in simplex mode */
bool unlock_ready_wait ;
bool unlock_ready_wait = false ;
/** Host has been deleted */
bool host_deleted ;
@ -1517,6 +1532,9 @@ public:
/** Return the number of inventoried hosts */
int num_hosts ( void );
/** Return the number of inventoried controllers */
int num_controllers ( void );
/** **********************************************************************
*
* Name : nodeLinkClass::workQueue_enqueue
@ -1664,6 +1682,9 @@ public:
/* Clear heartbeat failed flag for all interfaces */
void manage_heartbeat_clear ( string hostname, iface_enum iface );
/* Build a json dictionary containing the maintenance info for the specified code */
string build_mtcInfo_dict ( mtcInfo_enum mtcInfo_code );
/** Test and Debug Members and Variables */
/** Print node info banner */
@ -1789,6 +1810,7 @@ public:
string get_bm_ip ( string hostname );
string get_bm_un ( string hostname );
string get_bm_pw ( string hostname );
string get_bm_type ( string hostname );
string get_hostname_from_bm_ip ( string bm_ip );

View File

@ -13,7 +13,7 @@ LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -l
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
INCLUDES += -I../common -I../alarm -I../maintenance -I../public
CCFLAGS = -g -O2 -Wall -Wextra -Werror
CCFLAGS = -g -O2 -Wall -Wextra -Werror -std=c++11
STATIC_ANALYSIS_TOOL = cppcheck
STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0)

View File

@ -279,8 +279,14 @@ void nodeLinkClass::mnfa_enter ( void )
void nodeLinkClass::mnfa_exit ( bool force )
{ force = force ; }
int send_mtc_cmd ( string & hostname, int cmd, int interface )
{ UNUSED(hostname); UNUSED(cmd); UNUSED(interface); return PASS ; }
/* Stub implementation of send_mtc_cmd.
 * Every argument is ignored (marked UNUSED to keep the compiler quiet)
 * and PASS is always returned ; nothing is sent.
 * NOTE(review): presumably exists so this build can link code that
 * references send_mtc_cmd - confirm. */
int send_mtc_cmd ( string & hostname, int cmd, int interface, string json_dict)
{
UNUSED(hostname);
UNUSED(cmd);
UNUSED(interface);
UNUSED(json_dict);
return PASS ;
}
int nodeLinkClass::mtcInvApi_subf_states ( string hostname,
string oper_subf,

View File

@ -54,7 +54,7 @@ BINS = mtcAgent mtcClient
LDLIBS += -lstdc++ -ldaemon -lcommon -lthreadUtil -lbmcUtils -lfmcommon -lalarm -lpthread -lrt -levent -ljson-c -lamon -lcrypto -luuid
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
INCLUDES += -I../common -I../alarm -I../heartbeat -I../hwmon -I../public
CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces
CCFLAGS += -g -O2 -Wall -Wextra -Werror -Wno-missing-braces -std=c++11
STATIC_ANALYSIS_TOOL = cppcheck
STATIC_ANALYSIS_TOOL_EXISTS = $(shell [[ -e `which $(STATIC_ANALYSIS_TOOL)` ]] && echo 1 || echo 0)

View File

@ -20,7 +20,7 @@
#include <stdio.h>
#include <string.h>
#include <sys/un.h> /* for ... unix domain sockets */
#include <sys/un.h> /* for ... unix domain sockets */
#include <arpa/inet.h>
#include <sys/socket.h>
#include <net/if.h>
@ -29,8 +29,8 @@
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <list> /* for the list of conf file names */
#include <list> /* for ... list of conf file names */
#include <unistd.h> /* for ... sync */
using namespace std;
@ -204,6 +204,24 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
mlog1 ("mtcAlive request received (%s network)\n", interface_name.c_str());
return ( send_mtcAlive_msg ( sock_ptr, get_who_i_am(), interface ));
}
else if ( msg.cmd == MTC_MSG_INFO )
{
mlog1("mtc 'info' message received (%s network)\n", interface_name.c_str());
load_mtcInfo_msg ( msg );
return ( PASS ); /* no ack for this message */
}
else if ( msg.cmd == MTC_CMD_SYNC )
{
ilog ("mtc '%s' message received (%s network)\n",
get_mtcNodeCommand_str(msg.cmd),
interface_name.c_str());
ilog ("Sync Start");
sync ();
ilog ("Sync Done");
return ( PASS ); /* no ack for this message */
}
else if ( msg.cmd == MTC_MSG_LOCKED )
{
/* Only recreate the file if its not already present */
@ -603,7 +621,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface )
}
/** Send an event to the mtcAgent **/
int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr )
int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr )
{
mtc_message_type event ;
@ -619,6 +637,24 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
/* We don't use the buffer for mtce events to remove it from the size */
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE));
}
else if ( cmd == MTC_EVENT_MONITOR_READY )
{
string event_info = "{\"" ;
event_info.append(MTC_JSON_INV_NAME);
event_info.append("\":\"");
event_info.append(get_hostname());
event_info.append("\",\"");
event_info.append(MTC_JSON_SERVICE);
event_info.append("\":\"");
event_info.append(MTC_SERVICE_MTCCLIENT_NAME );
event_info.append("\"}");
size_t len = event_info.length()+1 ;
snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header());
snprintf ( &event.buf[0], len, "%s", event_info.data());
bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len));
ilog ("%s %s ready", get_hostname().c_str(), MTC_SERVICE_MTCCLIENT_NAME);
}
else if (( cmd == MTC_EVENT_AVS_CLEAR ) ||
( cmd == MTC_EVENT_AVS_MAJOR ) ||
( cmd == MTC_EVENT_AVS_CRITICAL ))
@ -666,7 +702,7 @@ int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_na
{
if ( bytes == 0 )
{
slog ("message send failed ; message size=0 for cmd:%d is 0\n", event.cmd );
slog ("message send failed ; message size=0 for cmd:0x%x is 0\n", event.cmd );
rc = FAIL_NO_DATA ;
}
else if ((rc = sock_ptr->mtc_client_tx_socket->write((char*)&event.hdr[0], bytes))!= bytes )
@ -933,32 +969,59 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
return (PASS) ;
}
/* Accelerated Virtual Switch 'events' socket
* - for receiving data port state change event
* Event strings are
*
* {"type":"port-state", "severity":"critical|major|clear"}
*
* type:port-state - the provider network data port status has changed to the supplied fault severity
*
* severity:
* critical - port has failed and is not part of an aggregate or is the last port in an aggregate (degrade, disable services)
* major - port has failed and is part of an aggregate with other inservice-ports (degrade only)
* clear - port has recovered from a failed state and is operational (clear degrade, enable services)
*
* NOTE: The port status can transition from any of the above states to any other state.
*
* The neutron agent monitors the vswitch ports at a 2 second interval.
* If a port changes link state during the polling period, it will
* raise/clear the alarm, but now also calculates the impact of that port
* failure on the provider network data interface.
*
* The overall aggregated state across all provider network interfaces will
* be reported to maintenance when ports enter a link down or up state.
* The agent will also periodically send the current provider network port
* status to maintenance every 30 seconds.
*
*/
/* Send a maintenance command from this mtcClient to the mtcClient at
 * 'address':'port' using the client tx socket.
 *
 * Only MTC_CMD_SYNC is supported ; any other command is rejected with
 * FAIL_BAD_CASE before anything is sent.
 *
 * Returns PASS when the write reported a positive byte count,
 *         FAIL_SOCKET_SENDTO when the write failed and
 *         FAIL_BAD_STATE when the tx socket is missing or not ok. */
int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port)
{
    mtc_message_type msg ;
    MEMSET_ZERO (msg);
    snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header());
    msg.cmd = cmd ;

    /* reject anything other than the sync command */
    if ( cmd != MTC_CMD_SYNC )
    {
        slog("Unsupported command ; %s:%d", get_mtcNodeCommand_str(cmd), cmd );
        return (FAIL_BAD_CASE);
    }

    ilog ("Sending '%s' command to %s:%s:%d",
           get_mtcNodeCommand_str(cmd),
           hostname.c_str(),
           address.c_str(), port);
    msg.num = 0 ;

    /* buffer not used in this message so it is left out of the size */
    const int bytes = ((sizeof(mtc_message_type))-(BUF_SIZE));

    /* the tx socket must exist and be healthy before sending */
    if (( sock_ptr->mtc_client_tx_socket == NULL ) ||
        ( sock_ptr->mtc_client_tx_socket->sock_ok() != true ))
    {
        elog("mtc_client_tx_socket not ok");
        return (FAIL_BAD_STATE) ;
    }

    print_mtc_message ( hostname, MTC_CMD_TX, msg, get_iface_name_str(MGMNT_INTERFACE), false );

    /* send to the requested address and port */
    const int sent = sock_ptr->mtc_client_tx_socket->write((char*)&msg.hdr[0], bytes, address.data(), port ) ;
    if ( sent <= 0 )
    {
        elog("failed to send command to mtcClient (%d) (%d:%s)", sent, errno, strerror(errno));
        return (FAIL_SOCKET_SENDTO) ;
    }
    return (PASS) ;
}
int mtcCompMsg_testhead ( void )
{

View File

@ -443,6 +443,34 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_HEARTBEAT );
return (PASS);
}
else if ( service == MTC_SERVICE_MTCCLIENT_NAME )
{
ilog ("%s %s ready", hostname.c_str(), MTC_SERVICE_MTCCLIENT_NAME);
/* if this ready event is from the mtcClient of a
* controller that has valid bmc access info then
* build the 'peer controller kill' mtcInfo and
* send it to that mtcClient */
if ( obj_ptr->get_nodetype ( hostname ) & CONTROLLER_TYPE )
{
string bm_pw = obj_ptr->get_bm_pw ( hostname ) ;
if ( !bm_pw.empty() && ( bm_pw != NONE ))
{
string bm_un = obj_ptr->get_bm_un ( hostname ) ;
string bm_ip = obj_ptr->get_bm_ip ( hostname ) ;
if (( hostUtil_is_valid_username ( bm_un )) &&
( hostUtil_is_valid_ip_addr ( bm_ip )))
{
send_mtc_cmd ( hostname,
MTC_MSG_INFO,
MGMNT_INTERFACE,
obj_ptr->build_mtcInfo_dict (
MTC_INFO_CODE__PEER_CONTROLLER_KILL_INFO));
}
}
}
return (PASS);
}
if ( service == MTC_SERVICE_HWMOND_NAME )
{
std::list<string>::iterator temp ;
@ -578,11 +606,12 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr,
return (rc);
}
int send_mtc_cmd ( string & hostname, int cmd , int interface )
int send_mtc_cmd ( string & hostname, int cmd , int interface, string json_dict )
{
int rc = FAIL ;
bool force = false ;
mtc_message_type mtc_cmd ;
string data = "" ;
mtc_socket_type * sock_ptr = get_sockPtr ();
memset (&mtc_cmd,0,sizeof(mtc_message_type));
@ -592,6 +621,16 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
switch ( cmd )
{
case MTC_MSG_INFO:
{
snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() );
mtc_cmd.cmd = cmd ;
mtc_cmd.num = 0 ;
data = "{\"mtcInfo\":" + json_dict + "}";
ilog("%s mtc info update", hostname.c_str());
rc = PASS ;
break ;
}
case MTC_REQ_MTCALIVE:
{
snprintf ( &mtc_cmd.hdr[0], MSG_HEADER_SIZE, "%s" , get_cmd_req_msg_header() );
@ -689,11 +728,20 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface )
* Note: the minus 1 is to overwrite the null */
snprintf ( &mtc_cmd.hdr[MSG_HEADER_SIZE-1], MSG_HEADER_SIZE, "%s", obj_ptr->get_hostIfaceMac(hostname, MGMNT_IFACE).data());
string data = "{\"address\":\"";
data.append(obj_ptr->my_float_ip) ;
data.append("\",\"interface\":\"");
data.append(get_iface_name_str(interface));
data.append("\"}");
/* If data is empty then at least add where the message came from */
if ( data.empty() )
{
data = "{\"address\":\"";
data.append(obj_ptr->my_float_ip) ;
data.append("\",\"interface\":\"");
data.append(get_iface_name_str(interface));
data.append("\"}");
}
else
{
; /* data is already pre loaded by the command case above */
}
/* copy data into message buffer */
snprintf ( &mtc_cmd.buf[0], data.length()+1, "%s", data.data());
bytes = (sizeof(mtc_message_type)-(BUF_SIZE-(data.length()+1)));

View File

@ -43,9 +43,9 @@
#include <signal.h>
#include <fcntl.h>
#include <errno.h>
//#include <syslog.h> /* for ... syslog */
#include <sys/stat.h>
#include <list>
#include <json-c/json.h> /* for ... json_tokener_parse */
using namespace std;
@ -56,6 +56,10 @@ using namespace std;
#include "nodeBase.h" /* for ... Common Definitions */
#include "nodeTimers.h" /* fpr ... Timer Service */
#include "nodeUtil.h" /* for ... Common Utilities */
#include "hostUtil.h" /* for ... hostUtil_is_valid_... */
#include "jsonUtil.h" /* for ... jsonUtil_get_key_value_string */
#include "bmcUtil.h" /* for ... bmcUtil_accessInfo_type */
#include "ipmiUtil.h" /* for ... ipmiUtil_reset_host_now */
#include "nodeMacro.h" /* for ... CREATE_NONBLOCK_INET_UDP_RX_SOCKET */
#include "mtcNodeMsg.h" /* for ... common maintenance messaging */
#include "mtcNodeComp.h" /* for ... this module header */
@ -96,7 +100,7 @@ string get_hostname ( void )
* Daemon Configuration Structure - The allocated struct
* @see daemon_common.h for daemon_config_type struct format.
*/
static daemon_config_type mtc_config ;
static daemon_config_type mtc_config ;
daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; }
/**
@ -106,6 +110,8 @@ daemon_config_type * daemon_get_cfg_ptr () { return &mtc_config ; }
static mtc_socket_type mtc_sock ;
static mtc_socket_type * sock_ptr ;
static bmcUtil_accessInfo_type peer_controller = {"none","none","none","none","none"};
static bmcUtil_accessInfo_type this_controller = {"none","none","none","none","none"};
int run_goenabled_scripts ( string type );
@ -138,6 +144,16 @@ void timer_handler ( int sig, siginfo_t *si, void *uc)
mtcTimer_stop_int_safe ( ctrl.hostservices.timer );
ctrl.hostservices.timer.ring = true ;
}
else if ( *tid_ptr == ctrl.peer_ctrlr_reset.sync_timer.tid )
{
ctrl.peer_ctrlr_reset.sync_timer.ring = true ;
mtcTimer_stop_int_safe ( ctrl.peer_ctrlr_reset.sync_timer );
}
else if ( *tid_ptr == ctrl.peer_ctrlr_reset.audit_timer.tid )
{
/* use auto restart */
ctrl.peer_ctrlr_reset.audit_timer.ring = true ;
}
else
{
mtcTimer_stop_tid_int_safe ( tid_ptr );
@ -207,9 +223,8 @@ void daemon_exit ( void )
exit (0) ;
}
/* Startup config read */
static int mtc_config_handler ( void * user,
static int mtc_config_handler ( void * user,
const char * section,
const char * name,
const char * value)
@ -236,11 +251,14 @@ static int mtc_config_handler ( void * user,
config_ptr->failsafe_shutdown_delay = atoi(value);
ilog ("Shutdown TO : %d secs\n", config_ptr->failsafe_shutdown_delay );
}
else
if (( ctrl.nodetype & CONTROLLER_TYPE ) &&
(MATCH("client", "sync_b4_peer_ctrlr_reset")))
{
return (PASS);
ctrl.peer_ctrlr_reset.sync = atoi(value);
ilog("SyncB4 Reset: %s",
ctrl.peer_ctrlr_reset.sync ? "Yes" : "No" );
}
return (FAIL);
return (PASS);
}
/* Read the mtc.ini file and load control */
@ -946,6 +964,65 @@ void _manage_goenabled_tests ( void )
_scripts_cleanup (ctrl.active_script_set) ;
}
/* Reset the peer controller through its BMC and, on success, remove the
 * RESET_PEER_NOW request file.
 *
 * Returns PASS              - reset request issued and request file removed
 *         FAIL              - peer BMC credentials are not valid
 *         FAIL_STRING_EMPTY - could not create the output filename
 *         FAIL_OPERATION    - the reset request failed
 *
 * The request file is left in place on every failure path. */
int issue_reset_and_cleanup ( void )
{
    const char peer_ctrlr [] = "Peer controller reset" ;

    ilog("SM %s request", peer_ctrlr );

    /* all three pieces of bmc access info must look valid */
    const bool creds_ok =
        (( hostUtil_is_valid_ip_addr  ( peer_controller.bm_ip )) &&
         ( hostUtil_is_valid_username ( peer_controller.bm_un )) &&
         ( hostUtil_is_valid_pw       ( peer_controller.bm_pw )));
    if ( creds_ok == false )
    {
        elog("%s cannot reset peer BMC host at %s due to invalid credentials",
                 ctrl.hostname, peer_controller.bm_ip.c_str());
        return (FAIL);
    }

    /* create output filename - no need to delete after operation */
    string output_filename = bmcUtil_create_data_fn ( ctrl.hostname,
                                                      BMC_RESET_CMD_FILE_SUFFIX,
                                                      BMC_PROTOCOL__IPMITOOL );
    if ( output_filename.empty() )
    {
        elog("%s ; failed to create output filename", peer_ctrlr);
        return (FAIL_STRING_EMPTY) ;
    }

    if ( ipmiUtil_reset_host_now ( ctrl.hostname,
                                   peer_controller,
                                   output_filename ) != PASS )
    {
        elog("%s failed", peer_ctrlr);
        return (FAIL_OPERATION) ;
    }

    string result = daemon_get_file_str ( output_filename.data() );
    ilog("%s succeeded", peer_ctrlr);

    /* don't fail the operation if the result is unexpected ; but log it */
    if ( result.compare( IPMITOOL_POWER_RESET_RESP ) != 0 )
    {
        dlog("... but reset command output was unexpected ; %s",
                  result.c_str());
    }

    /* give the host a chance to reset before
     * telling SM the reset is done */
    sleep (2) ;

    /* the request file is only removed after a successful reset */
    dlog("removing %s", RESET_PEER_NOW );
    daemon_remove_file ( RESET_PEER_NOW );

    return (PASS);
}
/* The main service loop */
int daemon_init ( string iface, string nodetype_str )
@ -963,6 +1040,7 @@ int daemon_init ( string iface, string nodetype_str )
ctrl.subfunction = 0 ;
ctrl.system_type = daemon_system_type ();
ctrl.clstr_iface_provisioned = false ;
ctrl.peer_ctrlr_reset.sync = false ;
/* convert node type to integer */
ctrl.nodetype = get_host_function_mask ( nodetype_str ) ;
@ -1018,6 +1096,13 @@ int daemon_init ( string iface, string nodetype_str )
mtcTimer_init ( ctrl.goenabled.timer, &ctrl.hostname[0], "goenable timer" );
mtcTimer_init ( ctrl.hostservices.timer, &ctrl.hostname[0], "host services timer" );
/* initialize peer controller reset feature */
mtcTimer_init ( ctrl.peer_ctrlr_reset.audit_timer, &ctrl.hostname[0], "peer ctrlr reset audit timer" ),
mtcTimer_init ( ctrl.peer_ctrlr_reset.sync_timer, &ctrl.hostname[0], "peer ctrlr reset sync timer" ),
ctrl.peer_ctrlr_reset.sync_timer.ring = false ;
ctrl.peer_ctrlr_reset.audit_timer.ring = false ;
ctrl.peer_ctrlr_reset.audit_period = PEER_CTRLR_AUDIT_PERIOD ;
/* initialize the script group control structures */
script_ctrl_init ( &ctrl.goenabled );
script_ctrl_init ( &ctrl.hostservices );
@ -1073,6 +1158,17 @@ void daemon_service_run ( void )
/* Send first mtcAlive ASAP */
mtcTimer_start ( ctrl.timer, timer_handler, 1 );
/* Monitor for peer controller reset requests when this
* daemon runs on a controller */
if ( ctrl.nodetype & CONTROLLER_TYPE )
{
mtcTimer_start ( ctrl.peer_ctrlr_reset.audit_timer,
timer_handler,
ctrl.peer_ctrlr_reset.audit_period );
}
mtce_send_event ( sock_ptr, MTC_EVENT_MONITOR_READY, NULL );
/* lets go select so that the sock does not go crazy */
dlog ("%s running main loop with %d msecs socket timeout\n",
&ctrl.hostname[0], (SOCKET_WAIT/1000) );
@ -1384,7 +1480,51 @@ void daemon_service_run ( void )
}
}
}
/* service controller specific audits */
if ( ctrl.nodetype & CONTROLLER_TYPE )
{
/* peer controller reset service audit */
if ( ctrl.peer_ctrlr_reset.audit_timer.ring )
{
if ( daemon_is_file_present ( RESET_PEER_NOW ) )
{
if ( ctrl.peer_ctrlr_reset.sync )
{
if ( ctrl.peer_ctrlr_reset.sync_timer.ring )
{
issue_reset_and_cleanup ();
ctrl.peer_ctrlr_reset.sync_timer.ring = false ;
}
else if ( ctrl.peer_ctrlr_reset.sync_timer.tid == NULL )
{
if ( send_mtcClient_cmd ( &mtc_sock,
MTC_CMD_SYNC,
peer_controller.hostname,
peer_controller.host_ip,
mtc_config.mtc_rx_mgmnt_port) == PASS )
{
mtcTimer_start ( ctrl.peer_ctrlr_reset.sync_timer, timer_handler, MTC_SECS_10 );
ilog("... waiting for peer controller to sync - %d secs", MTC_SECS_10);
}
else
{
elog("failed to send 'sync' command to peer controller mtcClient");
ctrl.peer_ctrlr_reset.sync_timer.ring = true ;
}
}
else
{
; /* wait longer */
}
}
else
{
issue_reset_and_cleanup ();
}
}
ctrl.peer_ctrlr_reset.audit_timer.ring = false ;
}
}
daemon_signal_hdlr ();
}
daemon_exit();
@ -1750,7 +1890,6 @@ void daemon_sigchld_hdlr ( void )
}
default:
{
wlog ("child handler running with no active script set (%d)\n", ctrl.active_script_set );
return ;
}
}
@ -1820,6 +1959,84 @@ void daemon_sigchld_hdlr ( void )
}
}
/***************************************************************************
 *
 * Name       : load_mtcInfo_msg
 *
 * Description: Extract the mtc info from the MTC_MSG_INFO message.
 *
 *              The message buffer is expected to hold a json string with
 *              an "mtcInfo" dictionary keyed by controller hostname.
 *              The entry for the peer of this controller is loaded into
 *              'peer_controller' (host ip, bmc ip, bmc username and
 *              bmc password).
 *
 * Assumptions: So far only the peer controller reset feature uses this.
 *
 *              The message payload carries BMC passwords so it is never
 *              written to the logs, not even on parse errors.
 *
 * Returns    : Nothing
 *
 ***************************************************************************/
void load_mtcInfo_msg ( mtc_message_type & msg )
{
    /* only controllers are expected to receive this message */
    if ( !( ctrl.nodetype & CONTROLLER_TYPE ))
    {
        slog("%s got mtcInfo ; unexpected for this nodetype", ctrl.hostname);
        return ;
    }

    /* do not log msg.buf ; it contains bmc credentials */
    mlog1("%s mtcInfo payload received ; content not logged", ctrl.hostname);

    struct json_object *_obj = json_tokener_parse( &msg.buf[0] );
    if ( _obj == NULL )
    {
        wlog("%s message buffer tokenize error ; payload not logged",
                 ctrl.hostname);
        return ;
    }

    /* the peer is whichever controller this host is not */
    if ( strcmp(&ctrl.hostname[0], CONTROLLER_0 ))
        peer_controller.hostname = CONTROLLER_0 ;
    else
        peer_controller.hostname = CONTROLLER_1 ;

    /* get the top level mtcInfo dictionary */
    struct json_object *info_obj = (struct json_object *)(NULL);
    json_bool info_rc = json_object_object_get_ex( _obj,
                                                   "mtcInfo",
                                                   &info_obj );
    if (( info_rc == TRUE ) && ( info_obj ))
    {
        /* get the peer controller's entry from that dictionary */
        struct json_object *ctrl_obj = (struct json_object *)(NULL);
        json_bool peer_rc =
        json_object_object_get_ex( info_obj,
                                   peer_controller.hostname.data(),
                                   &ctrl_obj );

        if (( peer_rc == TRUE ) && ( ctrl_obj ))
        {
            peer_controller.host_ip = jsonUtil_get_key_value_string(ctrl_obj, MTC_JSON_INV_HOSTIP) ;
            peer_controller.bm_ip   = jsonUtil_get_key_value_string(ctrl_obj, MTC_JSON_INV_BMIP) ;
            peer_controller.bm_un   = jsonUtil_get_key_value_string(ctrl_obj, "bm_un");
            peer_controller.bm_pw   = jsonUtil_get_key_value_string(ctrl_obj, "bm_pw");

            /* log the mc info but not the bmc password ; only
             * indicate that it looks 'ok' or 'is 'none' */
            ilog ("%s is my peer [host:%s bmc:%s:%s:%s]",
                      peer_controller.hostname.c_str(),
                      peer_controller.host_ip.c_str(),
                      peer_controller.bm_ip.c_str(),
                      peer_controller.bm_un.c_str(),
                      hostUtil_is_valid_pw(peer_controller.bm_pw) ? "ok":"none");
        }
        else
        {
            /* the dictionary may still hold this controller's own
             * credentials so the payload is not logged */
            wlog("%s peer mtcInfo missing (rc:%d) ; payload not logged",
                     peer_controller.hostname.c_str(), peer_rc);
        }
    }
    else
    {
        wlog("mtcInfo label parse error (rc:%d) ; payload not logged",
                 info_rc);
    }

    /* release the parsed object tree */
    json_object_put(_obj);
}
/* Push daemon state to log file */
void daemon_dump_info ( void )
{
@ -1853,13 +2070,13 @@ int daemon_run_testhead ( void )
* STAGE 1: some test
************************************************/
printf ( "| Test %d : Maintenance Service Test ............. ", stage );
if ( rc != PASS )
if ( rc != PASS )
{
FAILED_STR ;
rc = FAIL ;
}
else
PASSED ;
PASSED ;
printf ("+---------------------------------------------------------+\n");
return PASS ;

View File

@ -17,6 +17,10 @@
#include <string.h>
#include <unistd.h>
using namespace std;
#include "nodeTimers.h" /* for ... Timer Service */
/** Compute Config mask */
#define CONFIG_CLIENT_MASK (CONFIG_AGENT_MTC_MGMNT_PORT |\
CONFIG_CLIENT_MTC_MGMNT_PORT |\
@ -59,6 +63,22 @@ typedef struct
} script_ctrl_type ;
void script_ctrl_init ( script_ctrl_type * script_ctrl_ptr );
/* peer controller reset control structure and associated definitions */
/* This is a flag file set by SM when SM wants maintenance to perform a
 * BMC reset of the other (peer) controller */
#define RESET_PEER_NOW "/var/run/.sm_reset_peer"
#define PEER_CTRLR_AUDIT_PERIOD (2)
/* Control data for the peer controller reset service. */
typedef struct
{
/* 'struct' on the next line is the elaborated type specifier for the
 * sync_timer declaration that follows it.
 *
 * sync_timer: started for MTC_SECS_10 once the 'sync' command has been
 * sent to the peer controller's mtcClient ; the peer reset is issued
 * when this timer rings. */
struct
mtc_timer sync_timer ;
/* the main loop services the peer controller reset audit each time
 * this timer rings */
mtc_timer audit_timer ;
/* presumably the audit_timer period in seconds, see
 * PEER_CTRLR_AUDIT_PERIOD - TODO confirm units against the timer start */
int audit_period ;
/* true  : send a 'sync' command to the peer and wait before the reset
 * false : issue the reset as soon as the RESET_PEER_NOW flag file is seen
 * presumably loaded from the 'sync_b4_peer_ctrlr_reset' config option */
bool sync ;
} peer_ctrlr_reset_type ;
typedef struct
{
char hostname [MAX_HOST_NAME_SIZE+1];
@ -76,7 +96,7 @@ typedef struct
unsigned int function ;
unsigned int subfunction ;
struct mtc_timer timer ; /* mtcAlive timer */
struct mtc_timer timer ; /* mtcAlive timer */
bool clstr_iface_provisioned ;
@ -102,6 +122,7 @@ typedef struct
/* Where to send events */
string mtcAgent_ip ;
peer_ctrlr_reset_type peer_ctrlr_reset;
} ctrl_type ;
ctrl_type * get_ctrl_ptr ( void );
@ -109,5 +130,6 @@ ctrl_type * get_ctrl_ptr ( void );
bool is_subfunction_worker ( void );
int run_goenabled_scripts ( mtc_socket_type * sock_ptr , string requestor );
int run_hostservices_scripts ( unsigned int cmd );
void load_mtcInfo_msg ( mtc_message_type & msg );
#endif

View File

@ -1326,6 +1326,7 @@ void nodeLinkClass::fsm ( void )
daemon_signal_hdlr ();
mtcHttpSvr_look ( mtce_event );
}
mtcInv.mtcInfo_handler();
}
}

View File

@ -6166,6 +6166,8 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
if ( is_controller(node_ptr) )
{
this->controllers++ ;
mtc_cmd_enum state = CONTROLLER_DISABLED ;
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
@ -6635,6 +6637,8 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
mtcInfo_set ( node_ptr, MTCE_INFO_KEY__BMC_PROTOCOL, BMC_PROTOCOL__IPMI_STR );
node_ptr->bmc_protocol = BMC_PROTOCOL__IPMITOOL ;
}
/* store mtcInfo, which specifies the selected BMC protocol,
* into the sysinv database */
mtcInvApi_update_mtcInfo ( node_ptr );
ilog ("%s bmc control using %s:%s",
@ -6751,8 +6755,15 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
node_ptr->bmc_thread_ctrl.done = true ;
node_ptr->bmc_thread_info.command = 0 ;
}
/* store mtcInfo, which specifies the selected BMC protocol,
* into the sysinv database */
mtcInvApi_update_mtcInfo ( node_ptr );
/* push the BMC access info out to the mtcClient when
* a controller's BMC connection is established/verified */
if ( node_ptr->nodetype & CONTROLLER_TYPE )
this->want_mtcInfo_push = true ;
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );
}
@ -6942,6 +6953,11 @@ int nodeLinkClass::bmc_handler ( struct nodeLinkClass::node * node_ptr )
}
} /* end power off detection handling */
/* push the BMC access info out to the mtcClient when
* a controller's BMC connection is established/verified */
if ( node_ptr->nodetype & CONTROLLER_TYPE )
this->want_mtcInfo_push = true ;
send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST );
send_hwmon_command ( node_ptr->hostname, MTC_CMD_START_HOST );

View File

@ -125,11 +125,13 @@ int send_mtcAlive_msg ( mtc_socket_type * sock_ptr, string identity, int interfa
int recv_mtc_reply_noblock ( void );
int send_mtc_cmd ( string & hostname, int cmd, int interface );
int send_mtc_cmd ( string & hostname, int cmd, int interface , string json_dict="" );
int mtc_service_command ( mtc_socket_type * sock_ptr , int interface );
int mtc_set_availStatus ( string & hostname, mtc_nodeAvailStatus_enum status );
int mtce_send_event ( mtc_socket_type * sock_ptr, int cmd , const char * mtce_name_ptr );
int mtce_send_event ( mtc_socket_type * sock_ptr, unsigned int cmd , const char * mtce_name_ptr );
int mtc_clstr_init ( mtc_socket_type * sock_ptr , char * iface );
string get_who_i_am ( void );
int send_mtcClient_cmd ( mtc_socket_type * sock_ptr, int cmd, string hostname, string address, int port);
#endif

View File

@ -87,6 +87,10 @@ sched_delay_threshold = 300 ; scheduler delay time in msecs that will trigger
daemon_log_port = 2121 ; daemon logger port
mtcalarm_req_port = 2122 ;
sync_b4_peer_ctrlr_reset = 0 ; issue a sync command to peer controller mtcClient
; before issuing BMC reset.
[timeouts] ; configurable maintenance timeout values in seconds
failsafe_shutdown_delay = 120;