From a4238c2a355899aa4d6d9bcd429a7f8614041416 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Tue, 18 Jun 2019 08:33:28 -0400 Subject: [PATCH] Add 50 byte hostname support to maintenance Hosts with hostnames longer than 31 characters do not go online (locked-disabled-online) after installation. This update enhances maintenance messaging to support up to 50 byte/character hostnames. System Install: --------------- PASS: Verify system install PASS: Verify AIO system install (regression) PASS: Verify system install with long hostnames, deployment-config.yaml PASS: Verify mtcAgent process startup/restart logs PASS: Verify hbsAgent process startup/restart logs (active controller) PASS: Verify hbsAgent process startup/restart logs (standby controller) PASS: Verify hwmond process startup/restart logs PASS: Verify guestAgent process startup/restart logs PASS: Verify all common maintenance daemons startup/restart logs PASS: Verify patch applies and removes cleanly PASS: Verify long hostname Add ; inventory distribution PASS: Verify short hostname Add ; inventory distribution Long Hostname Handling: ----------------------- PASS: Verify host name support for up to 50 and 51 byte hostnames Heartbeat Monitoring: --------------------- PASS: Verify cluster-host interface link down handling. PASS: Verify graceful recovery from host reboot. PASS: Verify pmond process failure and recovery cycle. 
Maintenance Actions: -------------------- PASS: Verify host install with 50 byte hostname PASS: Verify host lock PASS: Verify host unlock PASS: Verify host reboot PASS: Verify host reinstall PASS: Verify host delete (no core dump / all daemon logs) PASS: Verify host power-off PASS: Verify host power-on PASS: Verify BMC State Info PASS: Verify lock and unlock storage node PASS: Controller Swact over and Back PASS: Verify thresholded heartbeat failure handling PASS: Verify node locked flag file PASS: Verify no core dumps during testing Hardware Monitor: ----------------- PASS: Verify BMC Provisioning/Reprovisioning/Deprovisioning PASS: Verify Inventory Add/Delete/Modify PASS: Verify Sensor Model and Monitoring PASS: Verify Sensor Model Relearn PASS: Verify Alarming and Logs PASS: Verify Sensor Action, Interval modification PASS: Verify Critical Sensor Action handling (ignore, log, alarm, reset, power cycle) Guest Agent: ------------ PASS: Verify inventory add and delete Process Monitor: ---------------- PASS: Verify process monitor logs PASS: Verify process monitor events into mtcAgent PASS: Verify process monitor failure alarming and recovery clear. 
PASS: Verify process monitor regression script (test-pmon.sh -c restart) PASS: Verify process monitor regression script (test-pmon.sh -c kill) PASS: Verify process monitor regression script (test-pmon-action.sh) PASS: Verify critical process failure handling PASS: Verify major process failure handling Collectd Monitoring: ----------------- PASS: Verify collectd monitoring for long hostname hosts Regression: ----------- PASS: Verify mtce daemon signal handling (test-signals.sh) Change-Id: If22ab081397ec1e8b24f20aad8c99f8079cb98a5 Closes-Bug: 1824429 Signed-off-by: Eric MacDonald --- mtce-common/src/common/msgClass.cpp | 18 +- mtce-common/src/common/nlEvent.cpp | 50 +- mtce-common/src/common/nodeBase.cpp | 2 +- mtce-common/src/common/nodeBase.h | 34 +- mtce-common/src/common/nodeUtil.cpp | 3 +- mtce/src/common/nodeClass.cpp | 101 ++-- mtce/src/common/nodeClass.h | 3 +- mtce/src/heartbeat/hbsAgent.cpp | 251 ++++---- mtce/src/heartbeat/hbsBase.h | 2 +- mtce/src/heartbeat/hbsClient.cpp | 38 +- mtce/src/hwmon/hwmonInit.cpp | 2 +- mtce/src/hwmon/hwmonMsg.cpp | 108 ++-- mtce/src/maintenance/mtcCompMsg.cpp | 35 +- mtce/src/maintenance/mtcCtrlMsg.cpp | 853 +++++++++++++++------------ mtce/src/maintenance/mtcHttpSvr.cpp | 4 +- mtce/src/maintenance/mtcNodeComp.cpp | 2 +- mtce/src/maintenance/mtcNodeCtrl.cpp | 4 +- mtce/src/pmon/pmonHdlr.cpp | 4 +- mtce/src/pmon/pmonMsg.cpp | 135 ++--- 19 files changed, 915 insertions(+), 734 deletions(-) diff --git a/mtce-common/src/common/msgClass.cpp b/mtce-common/src/common/msgClass.cpp index b2359cc1..9f5bb78e 100644 --- a/mtce-common/src/common/msgClass.cpp +++ b/mtce-common/src/common/msgClass.cpp @@ -309,17 +309,17 @@ int msgClassAddr::getAddressFromInterface(const char* interface, char* address, return rc; } } - char hostname[MAX_HOST_NAME_SIZE+1] = {0}; - if (gethostname(hostname, - MAX_HOST_NAME_SIZE) < 0) { - elog("Failed to get system host name (err: %d)", errno); + char hostname[MAX_CHARS_HOSTNAME] = {0}; + if 
(gethostname(hostname, MAX_CHARS_HOSTNAME) < 0) + { + elog("Failed to gethostname (%d:%m)", errno); return rc; } // if hostname is localhost then resolution will give us // the interface loopback address. Detect this case and // return. - if (!strncmp(hostname, "localhost", 9)) { + if (!strcmp(hostname, LOCALHOST)) { wlog ("Detected localhost as system hostname." " Cannot resolve IP address"); return rc; @@ -328,19 +328,17 @@ int msgClassAddr::getAddressFromInterface(const char* interface, char* address, // if it is cluster-host then we need to determine the interface // host name. For management interface, the system hostname // is the intf hostname - const char* cluster_host_suffix = "-cluster-host"; - size_t cluster_host_suffix_len = sizeof(cluster_host_suffix); - char iface_hostname[MAX_HOST_NAME_SIZE+cluster_host_suffix_len]; + char iface_hostname[MAX_CHARS_HOSTNAME]; memset(iface_hostname, 0, sizeof(iface_hostname)); snprintf(iface_hostname, sizeof(iface_hostname), "%s%s", hostname, - (((interface_type == CLSTR_IFACE)) ? cluster_host_suffix : "")); + (((interface_type == CLSTR_IFACE)) ? 
CLUSTER_HOST_SUFFIX : "")); struct addrinfo *res = NULL; int ret = getaddrinfo(iface_hostname, NULL, NULL, &res); if(ret) { - elog("IP address resolution failed for %s (err: %s)", + elog("%s ip address resolution failed (err: %s)", iface_hostname, gai_strerror(ret)); return rc; } diff --git a/mtce-common/src/common/nlEvent.cpp b/mtce-common/src/common/nlEvent.cpp index 985a2ff5..d6e78e3a 100644 --- a/mtce-common/src/common/nlEvent.cpp +++ b/mtce-common/src/common/nlEvent.cpp @@ -165,8 +165,8 @@ int get_netlink_events ( int nl_socket , std::list & links_gone_down, void log_link_events ( int netlink_sock, - int ioctl_sock, - const char * mgmnt_iface_ptr, + int ioctl_sock, + const char * mgmnt_iface_ptr, const char * clstr_iface_ptr, bool & mgmnt_link_up_and_running, bool & clstr_link_up_and_running) @@ -175,28 +175,28 @@ void log_link_events ( int netlink_sock, std::list links_gone_up ; std::list::iterator iter_curr_ptr ; dlog3 ("logging for interfaces %s and %s\n", mgmnt_iface_ptr, clstr_iface_ptr); - if ( get_netlink_events ( netlink_sock, links_gone_down, links_gone_up )) + if ( get_netlink_events ( netlink_sock, links_gone_down, links_gone_up )) { bool running = false ; if ( !links_gone_down.empty() ) { dlog3 ("%ld links have dropped\n", links_gone_down.size() ); - + /* Look at the down list */ for ( iter_curr_ptr = links_gone_down.begin(); iter_curr_ptr != links_gone_down.end() ; iter_curr_ptr++ ) { - dlog3 ( "downed link: %s (running:%d:%d)\n", - iter_curr_ptr->c_str(), - mgmnt_link_up_and_running, + dlog3 ( "downed link: %s (running:%d:%d)\n", + iter_curr_ptr->c_str(), + mgmnt_link_up_and_running, clstr_link_up_and_running ); if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data())) { if ( mgmnt_link_up_and_running == true ) { - mgmnt_link_up_and_running = false ; + mgmnt_link_up_and_running = false ; wlog ("Mgmnt link %s is down\n", mgmnt_iface_ptr ); } } @@ -208,14 +208,19 @@ void log_link_events ( int netlink_sock, wlog ("Cluster-host link %s is down\n", 
clstr_iface_ptr ); } } - + if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS ) { - dlog ("%s is down (oper:%s)\n", iter_curr_ptr->c_str(), running ? "up" : "down" ); + wlog ("%s is down (oper:%s) (%ld)\n", + iter_curr_ptr->c_str(), + running ? "up" : "down", + iter_curr_ptr->length() ); } else { - wlog ("%s is down (driver query failed)\n", iter_curr_ptr->c_str() ); + wlog ("%s is down (driver query failed) (len:%ld)\n", + iter_curr_ptr->c_str(), + iter_curr_ptr->length() ); } } } @@ -228,14 +233,14 @@ void log_link_events ( int netlink_sock, iter_curr_ptr != links_gone_up.end() ; iter_curr_ptr++ ) { - dlog3 ( "recovered link: %s (running:%d:%d)\n", - iter_curr_ptr->c_str(), - mgmnt_link_up_and_running, + dlog3 ( "recovered link: %s (running:%d:%d)\n", + iter_curr_ptr->c_str(), + mgmnt_link_up_and_running, clstr_link_up_and_running ); if ( !strcmp (mgmnt_iface_ptr, iter_curr_ptr->data())) { - mgmnt_link_up_and_running = true ; + mgmnt_link_up_and_running = true ; wlog ("Mgmnt link %s is up\n", mgmnt_iface_ptr ); } if ( !strcmp (clstr_iface_ptr, iter_curr_ptr->data())) @@ -246,13 +251,16 @@ void log_link_events ( int netlink_sock, if ( get_link_state ( ioctl_sock, iter_curr_ptr->data(), &running ) == PASS ) { - dlog ("%s is up (oper:%s)\n", - iter_curr_ptr->c_str(), - running ? "up" : "down" ); + wlog ("%s is up (oper:%s) (len:%ld)\n", + iter_curr_ptr->c_str(), + running ? 
"up" : "down", + iter_curr_ptr->length() ); } else { - wlog ("%s is up (driver query failed)\n", iter_curr_ptr->c_str() ); + wlog ("%s is up (driver query failed) (len:%ld)\n", + iter_curr_ptr->c_str(), + iter_curr_ptr->length() ); } } } @@ -291,9 +299,9 @@ int open_netlink_socket ( int groups ) addr.nl_pid = getpid (); /* addr.nl_groups = RTMGRP_LINK | RTMGRP_IPV4_IFADDR | RTMGRP_IPV6_IFADDR; */ addr.nl_groups = groups ; /* allow the caller to specify the groups */ - + if (bind (nl_socket, (struct sockaddr *) &addr, sizeof (addr)) < 0) - { + { elog ( "Failed to bind netlink socket (%d:%s)\n", errno, strerror(errno)); close (nl_socket); nl_socket = 0 ; diff --git a/mtce-common/src/common/nodeBase.cpp b/mtce-common/src/common/nodeBase.cpp index e799815f..e8383abb 100755 --- a/mtce-common/src/common/nodeBase.cpp +++ b/mtce-common/src/common/nodeBase.cpp @@ -194,7 +194,7 @@ const char * get_mtcNodeCommand_str ( int cmd ) case MTC_EVENT_HOST_STALLED: return("host stalled event"); /* pmon events */ - case MTC_EVENT_PMON_CLEAR: return("pmon clear"); + case MTC_EVENT_PMON_CLEAR: return("pmon degrade clear"); case MTC_EVENT_PMON_CRIT: return("pmon critical event"); case MTC_EVENT_PMON_MAJOR: return("pmon major event"); case MTC_EVENT_PMON_MINOR: return("pmon minor event"); diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index 14136a26..94c80ca0 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -163,6 +163,9 @@ void daemon_exit ( void ); #define LOOPBACK_IPV6 "::1" #define LOCALHOST "localhost" + +#define CLUSTER_HOST_SUFFIX ((const char*)("-cluster-host")) + #define NONE (const char *)"none" /** Largest heartbeat pulse (req/resp) message size */ @@ -198,8 +201,10 @@ void daemon_exit ( void ); #define MTC_JSON_INV_BMIP "bm_ip" #define MTC_JSON_INV_BMTYPE "bm_type" #define MTC_JSON_INV_BMUN "bm_username" - +#define MTC_JSON_SERVICE "service" #define MTC_JSON_SEVERITY "severity" +#define 
MTC_JSON_SENSOR "sensor" +#define MTC_JSON_PROCESS "process" /* These Task strings should not be changed without * the corresponding change in Horizon. @@ -336,9 +341,9 @@ void daemon_exit ( void ); /* root@controller-0:~# getconf HOST_NAME_MAX * 64 */ -#define MAX_CHARS_HOSTNAME (32) /**< The largest hostname length */ -// #define MAX_CHARS_HOSTNAME (64) /**< The largest hostname length */ -#define MAX_CHARS_FILENAME (256) /**< The largest hostname length */ +#define MAX_CHARS_HOSTNAME_32 (32) +#define MAX_CHARS_HOSTNAME (256) /**< the largest hostname length */ +#define MAX_CHARS_FILENAME (256) /**< the largest filename length */ #define MAX_CHARS_ON_LINE (256) /**> max number of chars on a single line */ #define MAX_CHARS_IN_INT (65) /**> max number of chars in an integer */ @@ -389,6 +394,26 @@ void daemon_exit ( void ); /* This label will resolve to an IP on the management network */ #define CONTROLLER_NFS ((const char *)"controller-nfs") +/* Maintenance Daemon Services - actual names of the daemons */ +/* ... controller only service / daemons */ +#define MTC_SERVICE_MTCAGENT_NAME "mtcAgent" +#define MTC_SERVICE_HBSAGENT_NAME "hbsAgent" +#define MTC_SERVICE_HWMOND_NAME "hwmond" +#define MTC_SERVICE_GUESTAGENT_NAME "guestAgent" + +/* ... all nodes services / daemons */ +#define MTC_SERVICE_PMOND_NAME "pmond" +#define MTC_SERVICE_HBSCLIENT_NAME "hbsClient" +#define MTC_SERVICE_MTCCLIENT_NAME "mtcClient" +#define MTC_SERVICE_HOSTW_NAME "hostwd" +#define MTC_SERVICE_FSMON_NAME "fsmond" +#define MTC_SERVICE_LMON_NAME "lmond" +#define MTC_SERVICE_MTCLOG_NAME "mtclogd" + +/* ... 
compute only services / daemons */ +#define MTC_SERVICE_GUESTSERVER_NAME "guestServer" + + #define CGTS_NODE_TYPES 4 #define CGTS_NODE_TYPE_SIZE 12 #define CGTS_NODE_NULL (0x00) @@ -499,6 +524,7 @@ const char * get_heartbeat_ready_header( void ) ; #define MTC_CMD_REVISION (0) #define MTC_CMD_FEATURE_VER__MACADDR_IN_CMD (1) +#define MTC_CMD_FEATURE_VER__KEYVALUE_IN_BUF (2) typedef struct { diff --git a/mtce-common/src/common/nodeUtil.cpp b/mtce-common/src/common/nodeUtil.cpp index 2727158d..e69dbe61 100755 --- a/mtce-common/src/common/nodeUtil.cpp +++ b/mtce-common/src/common/nodeUtil.cpp @@ -110,6 +110,7 @@ void node_inv_init (node_inv_type & inv) inv.uuid.clear(); inv.name.clear(); inv.ip.clear(); + inv.clstr_ip.clear(); inv.mac.clear(); inv.admin.clear(); inv.oper.clear(); @@ -750,7 +751,7 @@ int get_iface_address ( const char * iface_ptr, string & ip_addr , bool retry ) if ( rc == PASS ) { ip_addr = ip_cstr; - ilog ("IP Address : %s\n", ip_addr.c_str() ); + dlog ("IP Address : %s\n", ip_addr.c_str() ); } else { diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index 8dbb755a..96cd1d89 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -281,6 +281,7 @@ nodeLinkClass::nodeLinkClass() my_hostname.clear() ; my_local_ip.clear() ; my_float_ip.clear() ; + my_clstr_ip.clear() ; active_controller_hostname.clear() ; inactive_controller_hostname.clear() ; @@ -2351,15 +2352,15 @@ int nodeLinkClass::mod_host ( node_inv_type & inv ) } if ( node_ptr->clstr_ip.compare ( inv.clstr_ip ) ) { - if (( hostUtil_is_valid_ip_addr ( inv.clstr_ip )) || ( hostUtil_is_valid_ip_addr ( node_ptr->clstr_ip ))) + if ( hostUtil_is_valid_ip_addr ( inv.clstr_ip )) { plog ("%s Modify 'clstr_ip' from %s -> %s\n", node_ptr->hostname.c_str(), node_ptr->clstr_ip.c_str(), inv.clstr_ip.c_str() ); modify = true ; /* we have a delta */ + node_ptr->clstr_ip = inv.clstr_ip ; } - node_ptr->clstr_ip = inv.clstr_ip ; } if ( (!inv.name.empty()) && 
(node_ptr->hostname.compare ( inv.name)) ) { @@ -2976,10 +2977,13 @@ int nodeLinkClass::add_heartbeat_host ( const node_inv_type & inv ) /* Handle the case where we are adding a node that is already */ /* present if so just update the inventory data not the mtc state */ node_ptr = nodeLinkClass::getNode(inv.name); - if ( node_ptr ) + if ( node_ptr ) { dlog ("%s already provisioned\n", node_ptr->hostname.c_str()); - rc = RETRY ; + node_ptr->nodetype = inv.nodetype ; + node_ptr->ip = inv.ip ; + node_ptr->clstr_ip = inv.clstr_ip ; + rc = PASS ; } /* Otherwise add it as a new node */ else @@ -2989,6 +2993,8 @@ int nodeLinkClass::add_heartbeat_host ( const node_inv_type & inv ) { node_ptr->hostname = inv.name ; node_ptr->nodetype = inv.nodetype ; + node_ptr->ip = inv.ip ; + node_ptr->clstr_ip = inv.clstr_ip ; dlog ("%s added to linked list\n", inv.name.c_str()); rc = PASS ; } @@ -3323,21 +3329,32 @@ int nodeLinkClass::set_clstr_hostaddr ( string & hostname, string & ip ) node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) { - node_ptr->clstr_ip = ip ; + if (( hostUtil_is_valid_ip_addr(ip)) && ( node_ptr->clstr_ip != ip )) + { + ilog ("%s cluster address provision change from %s to %s", + hostname.c_str(), + node_ptr->clstr_ip.empty() ? 
"none" : node_ptr->clstr_ip.c_str(), + ip.c_str()); + node_ptr->clstr_ip = ip ; + send_hbs_command ( node_ptr->hostname, MTC_CMD_MOD_HOST ); + } rc = PASS ; } return ( rc ); } -string nodeLinkClass::get_hostname ( string & hostaddr ) +string nodeLinkClass::get_hostname ( string hostaddr ) { if (( hostaddr == LOOPBACK_IPV6 ) || - ( hostaddr == LOOPBACK_IP ) || - ( hostaddr == LOCALHOST )) + ( hostaddr == LOOPBACK_IP ) || + ( hostaddr == LOCALHOST ) || + ( hostaddr == my_local_ip ) || + ( hostaddr == my_float_ip ) || + ( hostaddr == my_clstr_ip )) { - return(my_hostname); + return(this->my_hostname); } - else + else if ( this->hosts ) { nodeLinkClass::node* node_ptr ; node_ptr = nodeLinkClass::getNode ( hostaddr ); @@ -3345,8 +3362,8 @@ string nodeLinkClass::get_hostname ( string & hostaddr ) { return ( node_ptr->hostname ); } - return ( null_str ); } + return ( null_str ); } string nodeLinkClass::get_hostname_from_bm_ip ( string bm_ip ) @@ -4763,7 +4780,9 @@ int nodeLinkClass::declare_service_ready ( string & hostname, else if ( service == MTC_SERVICE_PMOND ) { node_ptr->pmond_ready = true ; - plog ("%s got pmond ready event\n", hostname.c_str()); + plog ("%s %s ready event\n", + hostname.c_str(), + MTC_SERVICE_PMOND_NAME); /* A ready event means that pmond pocess has started. * Any previous history is gone. Cleanup mtce. 
@@ -4775,7 +4794,9 @@ int nodeLinkClass::declare_service_ready ( string & hostname, else if ( service == MTC_SERVICE_HWMOND ) { node_ptr->hwmond_ready = true ; - plog ("%s got hwmond ready event\n", hostname.c_str()); + plog ("%s %s ready event\n", + hostname.c_str(), + MTC_SERVICE_HWMOND_NAME); if ( node_ptr->bm_provisioned == true ) { send_hwmon_command ( node_ptr->hostname, MTC_CMD_ADD_HOST ); @@ -4788,7 +4809,9 @@ int nodeLinkClass::declare_service_ready ( string & hostname, if ( node_ptr->hbsClient_ready == false ) { node_ptr->hbsClient_ready = true ; - plog ("%s got hbsClient ready event\n", hostname.c_str()); + plog ("%s %s ready event\n", + hostname.c_str(), + MTC_SERVICE_HBSCLIENT_NAME); } return (PASS); } @@ -4901,7 +4924,7 @@ int nodeLinkClass::node_degrade_control ( string & hostname, int state, string s slog ("%s service not specified", hostname.c_str()); return (FAIL_STRING_EMPTY); } - else if ( !service.compare("hwmon") ) + else if ( service == MTC_SERVICE_HWMOND_NAME ) { service_flag = DEGRADE_MASK_HWMON ; } @@ -5048,6 +5071,9 @@ int nodeLinkClass::invoke_hwmon_action ( string & hostname, int action, string mtcTimer_reset ( node_ptr->hwmon_reset.recovery_timer ); mtcTimer_start ( node_ptr->hwmon_reset.recovery_timer, mtcTimer_handler, MTC_MINS_15 ); + wlog ("%s invoking 'reset' due to critical '%s' sensor assertion\n", + hostname.c_str(), sensor.c_str()); + force_full_enable ( node_ptr ); } else @@ -7394,7 +7420,7 @@ bool nodeLinkClass::get_hwmond_monitor_state ( string & hostname ) { bool state = false ; if ( hostname.length() ) - { + { struct nodeLinkClass::node* node_ptr ; node_ptr = nodeLinkClass::getNode ( hostname ); if ( node_ptr != NULL ) @@ -7419,9 +7445,14 @@ bool nodeLinkClass::get_hbs_monitor_state ( string & hostname, int iface ) state = node_ptr->monitor[iface] ; if ( state == true ) { - wlog_throttled (node_ptr->no_rri_log_throttle, rri_max, - "%s Not Offering RRI (%d)\n", - hostname.c_str(), this->hosts ); + /* fast lookup not 
supported for hostnames longer than 31 + * chars so in those cases don't do Not Offering RRI log */ + if ( hostname.length() < MAX_CHARS_HOSTNAME_32 ) + { + wlog_throttled (node_ptr->no_rri_log_throttle, rri_max, + "%s Not Offering RRI (hosts:%d)\n", + hostname.c_str(), this->hosts ); + } } else { @@ -7450,7 +7481,7 @@ void nodeLinkClass::manage_pulse_flags ( string & hostname, unsigned int flags ) /* Manage the heartbeat pulse flags by pulse_ptr */ void nodeLinkClass::manage_pulse_flags ( struct nodeLinkClass::node * node_ptr, unsigned int flags ) { - /* Do nothing with the flags for missing pulse + /* Do nothing with the flags for missing pulse * responses (identified with flags=NULL_PULSE_FLAGS) */ if ( flags == NULL_PULSE_FLAGS ) { @@ -7473,9 +7504,9 @@ void nodeLinkClass::manage_pulse_flags ( struct nodeLinkClass::node * node_ptr, /* TODO: Does this need to be debounced ??? */ node_ptr->monitor[CLSTR_IFACE] = true ; } - + /* A host indicates that its process monitor is running by setting the - * PMOND_FLAG occasionally in its pulse response. + * PMOND_FLAG occasionally in its pulse response. * The following if/else if clauses manage raising an alarm and degrading * a host has stopped sending the PMOND_FLAG. 
*/ if ( flags & PMOND_FLAG ) @@ -7707,8 +7738,8 @@ struct nodeLinkClass::node* nodeLinkClass::getPulseNode ( string & hostname , if /* Find the node in the list of nodes being heartbeated and splice it out */ int nodeLinkClass::remPulse_by_index ( string hostname, int index, iface_enum iface, bool clear_b2b_misses_count, unsigned int flags ) { - int rc = FAIL ; - if (( index > 0 ) && ( !(index > hosts))) + int rc = RETRY ; + if (!(index > hosts)) { if ( hbs_rra[index] != NULL ) { @@ -7727,14 +7758,20 @@ int nodeLinkClass::remPulse_by_index ( string hostname, int index, iface_enum if } else { - wlog_throttled ( node_ptr->unexpected_pulse_log_throttle, 200, "%s is not being monitored\n", hostname.c_str()); + wlog_throttled ( node_ptr->unexpected_pulse_log_throttle, 200, + "%s is not being monitored\n", + hostname.c_str()); rc = PASS; } } else { rc = remPulse_by_name ( hostname, iface, clear_b2b_misses_count, flags ); - wlog_throttled ( node_ptr->lookup_mismatch_log_throttle, 200, "%s rri lookup mismatch (%s:%d) ; %s\n", hostname.c_str(), node_ptr->hostname.c_str(), index, rc ? "" : "removed by hostname" ); + wlog_throttled ( node_ptr->lookup_mismatch_log_throttle, 200, + "%s rri lookup mismatch (%s:%d) ; %s\n", + hostname.c_str(), + node_ptr->hostname.c_str(), + index, rc ? 
"" : "removed by hostname" ); return (rc); } } @@ -7984,13 +8021,6 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle * By index does not require a lookup whereas hostname does */ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags ) { - /* TODO: consider removing this check */ - if ( hostname == "localhost" ) - { - /* localhost is not a supported hostname and indicates - * an unconfigured host response ; return the ignore response */ - return(ENXIO); - } if ( index ) { int rc = remPulse_by_index ( hostname, index , iface, true , flags ); @@ -8001,8 +8031,11 @@ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index default: mlog ("%s RRI Miss (rri:%d) (rc:%d)\n", hostname.c_str(), index, rc ); } } - else + /* fast lookup not supported for hostnames longer than 31 + * chars so in those cases don't do Not Offering RRI log */ + if ( hostname.length() < MAX_CHARS_HOSTNAME_32 ) { + get_hbs_monitor_state ( hostname, iface ) ; } return ( remPulse_by_name ( hostname , iface, true, flags )); } diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 75e38694..c095f89a 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -1331,6 +1331,7 @@ public: string my_hostname ; /**< */ string my_local_ip ; /**< Primary IP address */ string my_float_ip ; /**< Secondary (floating) IP address */ + string my_clstr_ip ; /**< Cluster network IP address */ /********* New Public Constructs for IPMI Comamnd Handling ***********/ @@ -1371,7 +1372,7 @@ public: int set_clstr_hostaddr ( string & hostname, string & ip ); /** get hostname for any hostname */ - string get_hostname ( string & hostaddr ); + string get_hostname ( string hostaddr ); /******************************/ /* NODE TYPE Member Functions */ diff --git a/mtce/src/heartbeat/hbsAgent.cpp b/mtce/src/heartbeat/hbsAgent.cpp index dc90df40..890cc6c5 100644 --- 
a/mtce/src/heartbeat/hbsAgent.cpp +++ b/mtce/src/heartbeat/hbsAgent.cpp @@ -41,7 +41,7 @@ using namespace std; #include "hbsBase.h" /* Heartbeat Base Header File */ #include "hbsAlarm.h" /* for ... hbsAlarm_clear_all */ #include "alarm.h" /* for ... alarm send message to mtcalarmd */ -#include "jsonUtil.h" /* for ... jsonUtil_get_key_val */ +#include "jsonUtil.h" /* for ... jsonUtil_get_key_val */ /************************************************************** * Implementation Structure @@ -414,39 +414,6 @@ int daemon_configure ( void ) ilog("Mgmnt Port : %d (rx)", hbs_config.hbs_agent_mgmnt_port ); ilog("Mgmnt Port : %d (tx)\n", hbs_config.hbs_client_mgmnt_port ); - /* Fetch the cluster-host interface name. - * calls daemon_get_iface_master inside so the - * aggrigated name is returned if it exists */ - get_clstr_iface (&hbs_config.clstr_iface ); - if ( strlen(hbs_config.clstr_iface) ) - { - if (!strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface)) - { - hbsInv.clstr_network_provisioned = false ; - } - else - { - hbsInv.clstr_network_provisioned = true ; - ilog ("Clstr Name : %s", hbs_config.clstr_iface ); - ilog ("Clstr Port : %d (rx)", hbs_config.hbs_agent_clstr_port ); - ilog ("Clstr Port : %d (tx)", hbs_config.hbs_client_clstr_port ); - } - } - - ilog("Command Port: %d (rx)\n", hbs_config.mtc_to_hbs_cmd_port ); - ilog("Event Port : %d (tx)\n", hbs_config.hbs_to_mtc_event_port ); - ilog("Alarm Port : %d (tx)\n", hbs_config.mtcalarm_req_port ); - - hbsInv.hbs_state_change = true ; - - /* pull in the degrade only config option */ - hbsInv.clstr_degrade_only = hbs_config.clstr_degrade_only ; - - if ( hbsInv.hbs_degrade_threshold >= hbsInv.hbs_failure_threshold ) - { - wlog ("Degrade threshold should be smaller than Failure threshold\n"); - wlog ("Heartbeat 'degrade' state disabled ; see %s\n", MTCE_CONF_FILE); - } for ( ;; ) { get_ip_addresses ( hbsInv.my_hostname, hbsInv.my_local_ip , hbsInv.my_float_ip ); @@ -464,6 +431,39 @@ int daemon_configure ( void ) 
break ; } } + /* Fetch the cluster-host interface name. + * calls daemon_get_iface_master inside so the + * aggrigated name is returned if it exists */ + get_clstr_iface (&hbs_config.clstr_iface ); + if ( strlen(hbs_config.clstr_iface) ) + { + int rc = get_iface_address ( hbs_config.clstr_iface, + hbsInv.my_clstr_ip, false ); + if ( rc ) + { + elog ("get Clstr IP address failed '%s' (%d:%d:%m)\n", + hbs_config.clstr_iface, rc, errno ); + } + else + { + ilog ("Clstr Addr : %s\n", hbsInv.my_clstr_ip.c_str()); + } + + if (!strcmp(hbs_config.clstr_iface, hbs_config.mgmnt_iface)) + { + hbsInv.clstr_network_provisioned = false ; + } + else + { + hbsInv.clstr_network_provisioned = true ; + ilog ("Clstr Name : %s", hbs_config.clstr_iface ); + ilog ("Clstr Port : %d (rx)", hbs_config.hbs_agent_clstr_port ); + ilog ("Clstr Port : %d (tx)", hbs_config.hbs_client_clstr_port ); + } + } + ilog("Command Port: %d (rx)\n", hbs_config.mtc_to_hbs_cmd_port ); + ilog("Event Port : %d (tx)\n", hbs_config.hbs_to_mtc_event_port ); + ilog("Alarm Port : %d (tx)\n", hbs_config.mtcalarm_req_port ); /* Set Controller Activity State */ hbs_config.active = daemon_get_run_option ("active") ; @@ -476,6 +476,19 @@ int daemon_configure ( void ) else hbsInv.set_activity_state ( false ); + /* Start assuming a change */ + hbsInv.hbs_state_change = true ; + + /* pull in the degrade only config option */ + hbsInv.clstr_degrade_only = hbs_config.clstr_degrade_only ; + + if ( hbsInv.hbs_degrade_threshold >= hbsInv.hbs_failure_threshold ) + { + wlog ("Degrade threshold should be smaller than Failure threshold\n"); + wlog ("Heartbeat 'degrade' state disabled ; see %s\n", MTCE_CONF_FILE); + } + + return (PASS); } @@ -888,9 +901,6 @@ int hbs_pulse_request ( iface_enum iface, int bytes = 0 ; if ( hbs_sock.tx_sock[iface] ) { - // int unused_networks = 0 ; - memset ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], 0, MAX_CHARS_HOSTNAME ); - /* Add message version - 0 -> 1 with the acction of cluster information */ 
hbs_sock.tx_mesg[iface].v = HBS_MESSAGE_VERSION ; @@ -903,14 +913,21 @@ int hbs_pulse_request ( iface_enum iface, /* Add this controller's lookup_clue * ... aka RRI (Resource Reference Index) */ + + /* Fast lookup clue supported for hostnames less than 32 bytes */ + memset ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], 0, MAX_CHARS_HOSTNAME_32 ); if (( lookup_clue ) && - ( hostname_clue.length() <= MAX_CHARS_HOSTNAME )) + ( hostname_clue.length() < MAX_CHARS_HOSTNAME_32 )) { hbs_sock.tx_mesg[iface].c = lookup_clue ; memcpy ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], hostname_clue.data(), hostname_clue.length()); } + else + { + hbs_sock.tx_mesg[iface].c = 0; + } /* Append the cluster info to the pulse request */ hbs_cluster_append(hbs_sock.tx_mesg[iface]) ; @@ -973,19 +990,6 @@ hbs_pulse_request_out: return (PASS); } -string get_hostname_from_pulse ( char * msg_ptr ) -{ - char temp [MAX_HOST_NAME_SIZE]; - string hostname ; - - char * str_ptr = strstr ( msg_ptr, ":" ); - memset ( temp, 0 , MAX_HOST_NAME_SIZE ); - - sscanf ( ++str_ptr, "%31s", &temp[0] ); - hostname = temp ; - return (hostname); -} - int _pulse_receive ( iface_enum iface , unsigned int seq_num ) { int bytes = 0 ; @@ -1022,9 +1026,9 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) // (hbs_sock.rx_mesg[iface].f & CTRLX_MASK ) >> CTRLX_BIT); continue ; } - mlog ("%s Pulse Rsp: (%d) %s:%d: s:%d f:%x [%-27s] RRI:%d\n", + mlog ("%s Pulse Rsp: (%d) from:%s:%d: s:%d flags:%x [%-27s] RRI:%d\n", get_iface_name_str(iface), bytes, - hbs_sock.rx_sock[iface]->get_dst_addr()->toString(), + hbs_sock.rx_sock[iface]->get_src_str(), hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(), hbs_sock.rx_mesg[iface].s, hbs_sock.rx_mesg[iface].f, @@ -1035,7 +1039,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) if ( strstr ( hbs_sock.rx_mesg[iface].m, rsp_msg_header) ) { int rc = RETRY ; - string hostname = get_hostname_from_pulse (&hbs_sock.rx_mesg[iface].m[0]); + string hostname = 
hbsInv.get_hostname (hbs_sock.rx_sock[iface]->get_src_str()); #ifdef WANT_FIT_TESTING if ( hbs_config.testmode == 1 ) @@ -1169,11 +1173,6 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num ) int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface ) { - int bytes ; - int bytes_to_send ; - int rc = PASS ; - int retries = 0 ; - if ((hbs_sock.hbs_event_tx_sock == NULL ) || (hbs_sock.hbs_event_tx_sock->sock_ok() == false )) { @@ -1192,23 +1191,10 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface ) hbs_cluster_log ( hbsInv.my_hostname, "event", true ); snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header()); } - else if ( event_cmd == MTC_EVENT_LOOPBACK ) - { - snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_event_header()); - } - else if ( event_cmd == MTC_EVENT_HEARTBEAT_MINOR_SET ) - { - snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_event_header()); - } - else if ( event_cmd == MTC_EVENT_HEARTBEAT_MINOR_CLR ) - { - snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_event_header()); - } - else if ( event_cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET ) - { - snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_event_header()); - } - else if ( event_cmd == MTC_EVENT_HEARTBEAT_DEGRADE_CLR ) + else if (( event_cmd == MTC_EVENT_HEARTBEAT_MINOR_SET ) || + ( event_cmd == MTC_EVENT_HEARTBEAT_MINOR_CLR ) || + ( event_cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET ) || + ( event_cmd == MTC_EVENT_HEARTBEAT_DEGRADE_CLR )) { snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_event_header()); } @@ -1228,38 +1214,34 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface ) return ( FAIL_BAD_CASE ); } - /* Put the hostname in the buffer - as well */ - snprintf ( &event.buf[0] , MAX_CHARS_HOSTNAME, "%s", hostname.data()); - - /* TODO: obsolete this method in the future as it limits the host name lenth to 32 */ - snprintf 
( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME, "%s", hostname.data()); + snprintf ( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME_32, "%s", hostname.data()); event.cmd = event_cmd ; event.num = 1 ; event.parm[0] = iface ; + /* Support for 64 byte hostnames */ + event.ver = MTC_CMD_FEATURE_VER__KEYVALUE_IN_BUF ; + + /* Json string starts at the beginning of the buffer */ + event.res = 0 ; + + string buf_info = "{\"hostname\":\"" ; + buf_info.append(hostname); + buf_info.append("\",\"service\":\""); + buf_info.append(MTC_SERVICE_HBSAGENT_NAME); + buf_info.append("\"}"); + + /* copy the string into the buffer and add one to the length to + * accomodate for the null terminator snprintf automatically adds */ + snprintf ( &event.buf[event.res], buf_info.length()+1, "%s", buf_info.data()); print_mtc_message ( hostname, MTC_CMD_TX, event, get_iface_name_str(iface) , false ); - - /* remove the buffer as it is not needed for this message */ - bytes_to_send = ((sizeof(mtc_message_type))-(BUF_SIZE-hostname.length())) ; - do + if ( hbs_sock.hbs_event_tx_sock->write((char*)&event, sizeof(mtc_message_type)) <= 0 ) { - bytes = hbs_sock.hbs_event_tx_sock->write((char*)&event,bytes_to_send); - if ( bytes <= 0 ) - { - rc = FAIL_TO_TRANSMIT ; - - if ( retries++ > 3 ) - { - elog ("Cannot communicate with maintenance\n"); - return (RETRY); - } - } - else - rc = PASS ; - } while ( bytes <= 0 ) ; - - return rc ; + elog ("%s failed to send event to maintenance (%d:%m)", hostname.c_str(), errno ); + return ( FAIL_TO_TRANSMIT ) ; + } + return PASS ; } /* The main heartbeat service loop */ @@ -1830,13 +1812,47 @@ void daemon_service_run ( void ) bytes = hbs_sock.mtc_to_hbs_sock->read((char*)&msg,sizeof(mtc_message_type)); if ( bytes > 0 ) { - mlog ("Received Maintenance Command (%i)\n", bytes ); - mlog ("%s - cmd:0x%x\n", &msg.hdr[0], msg.cmd ); - if ( !strncmp ( get_hbs_cmd_req_header(), &msg.hdr[0], MSG_HEADER_SIZE )) { - string hostname = &msg.hdr[MSG_HEADER_SIZE] ; - if ( msg.cmd 
== MTC_CMD_ACTIVE_CTRL ) + string hostname ; + node_inv_type inv ; + node_inv_init(inv); + + /* 64 byte hostname support adds a json string to + * the message buffer containing the hostname as a + * key/value pair. */ + if (( msg.ver >= MTC_CMD_FEATURE_VER__KEYVALUE_IN_BUF ) && + ( msg.buf[msg.res] == '{' )) + { + if ( jsonUtil_get_key_val(&msg.buf[msg.res], + MTC_JSON_INV_NAME, hostname) == PASS ) + { + inv.name = hostname ; + if (( msg.cmd == MTC_CMD_ADD_HOST ) || + ( msg.cmd == MTC_CMD_MOD_HOST )) + { + jsonUtil_get_key_val(&msg.buf[msg.res], MTC_JSON_INV_HOSTIP, inv.ip); + if ( hbsInv.clstr_network_provisioned == true ) + { + jsonUtil_get_key_val(&msg.buf[msg.res], MTC_JSON_INV_CLSTRIP, inv.clstr_ip); + } + } + } + } + else if ( msg.hdr[MSG_HEADER_SIZE] != '\0' ) + { + /* get hostname by legacy method, + * ... from the header */ + hostname = &msg.hdr[MSG_HEADER_SIZE] ; + } + if ( hostname.empty() ) + { + /* no hostname ; no action to take */ + wlog ("unable to get hostname from %s command", + get_mtcNodeCommand_str(msg.cmd)); + } + + else if ( msg.cmd == MTC_CMD_ACTIVE_CTRL ) { bool logit = false ; if ( hostname == hbsInv.my_hostname ) @@ -1876,13 +1892,12 @@ void daemon_service_run ( void ) } else if ( msg.cmd == MTC_CMD_ADD_HOST ) { - node_inv_type inv ; - node_inv_init(inv); - inv.name = hostname ; inv.nodetype = msg.parm[0]; hbsInv.add_heartbeat_host ( inv ) ; - hostname_inventory.push_back ( hostname ); - ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), msg.parm[0] ); + hostname_inventory.push_back ( inv.name ); + ilog ("%s added to heartbeat service (%d)\n", + inv.name.c_str(), + inv.nodetype); /* clear any outstanding alarms on the ADD */ if (( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) && @@ -1892,6 +1907,20 @@ void daemon_service_run ( void ) hbsInv.clstr_network_provisioned ); } } + else if ( msg.cmd == MTC_CMD_MOD_HOST ) + { + inv.nodetype = msg.parm[0]; + hbsInv.add_heartbeat_host ( inv ) ; + ilog ("%s modified heartbeat 
info [%d]\n", + inv.name.c_str(), + inv.nodetype ); + + /* clear any outstanding alarms on the ADD */ + if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE ) + { + hbsAlarm_clear_all ( hostname, hbsInv.clstr_network_provisioned ); + } + } else if ( msg.cmd == MTC_CMD_DEL_HOST ) { hbsInv.mon_host ( hostname, false, false ); diff --git a/mtce/src/heartbeat/hbsBase.h b/mtce/src/heartbeat/hbsBase.h index ea3f7673..932ed543 100755 --- a/mtce/src/heartbeat/hbsBase.h +++ b/mtce/src/heartbeat/hbsBase.h @@ -61,7 +61,7 @@ const char req_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse req:"}; /** Heartbeat pulse response message header content */ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"}; -#define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME) +#define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME_32) #define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info diff --git a/mtce/src/heartbeat/hbsClient.cpp b/mtce/src/heartbeat/hbsClient.cpp index ea121ea2..f94e238b 100644 --- a/mtce/src/heartbeat/hbsClient.cpp +++ b/mtce/src/heartbeat/hbsClient.cpp @@ -875,6 +875,18 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags ) } } + /* Manage the Resource Reference Index (RRI) "lookup clue" + * Only supported for hostnames -lt 32 bytes */ + if (( strnlen(&my_hostname[0], MAX_CHARS_HOSTNAME) < MAX_CHARS_HOSTNAME_32) && + (!strncmp(&hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME_32))) + { + if( rri[controller] != hbs_sock.rx_mesg[iface].c ) + { + rri[controller] = hbs_sock.rx_mesg[iface].c ; + ilog ("Caching New RRI: %d (from controller-%d)\n", rri[controller], controller ); + } + } + /* Log the received cluster info * ... 
if the message version shows that it is supported */ if ( hbs_sock.rx_mesg[iface].v ) @@ -1070,7 +1082,6 @@ int hbs_send_event ( unsigned int event ) mtc_message_type msg ; int rc = FAIL_BAD_PARM ; - int bytes = 0 ; memset (&msg, 0 , sizeof(mtc_message_type)); @@ -1097,22 +1108,31 @@ int hbs_send_event ( unsigned int event ) /* build the message */ snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header()); - snprintf ( &msg.hdr[MSG_HEADER_SIZE], MAX_CHARS_HOSTNAME, "%s", &my_hostname[0]); - #define MAX_PROC_NAME_SIZE (64) - snprintf ( &msg.buf[0], MAX_PROC_NAME_SIZE, "%s", program_invocation_short_name); - - size_t len = strlen(program_invocation_short_name); - bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len)); msg.cmd = event ; + msg.ver = MTC_CMD_FEATURE_VER__KEYVALUE_IN_BUF ; + + string event_info = "{\"" ; + event_info.append(MTC_JSON_INV_NAME); + event_info.append("\":\""); + event_info.append(my_hostname); + event_info.append("\",\""); + event_info.append(MTC_JSON_SERVICE); + event_info.append("\":\""); + event_info.append(MTC_SERVICE_HBSCLIENT_NAME ); + event_info.append( "\"}"); + + size_t len = event_info.length()+1 ; + snprintf ( &msg.buf[0], len, "%s", event_info.data()); + int bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len)); if (( hbs_sock.hbs_ready_tx_sock ) && ( hbs_sock.hbs_ready_tx_sock->sock_ok() == true )) { - mlog ("Ready message\n"); + mlog ("%s sending ready event\n", my_hostname ); if ((rc = hbs_sock.hbs_ready_tx_sock->write((char*)&msg.hdr[0], bytes))!= bytes ) { - elog ("Ready message send failed (%d) (%d:%s)\n", rc, errno, strerror(errno) ); + elog ("... 
ready event send failed (%d) (%d:%s)\n", rc, errno, strerror(errno) ); rc = FAIL_SOCKET_SENDTO ; } else diff --git a/mtce/src/hwmon/hwmonInit.cpp b/mtce/src/hwmon/hwmonInit.cpp index 70f76ffb..fa84a972 100644 --- a/mtce/src/hwmon/hwmonInit.cpp +++ b/mtce/src/hwmon/hwmonInit.cpp @@ -357,7 +357,7 @@ void daemon_service_run ( void ) ilog ("Transmitting: Monitor READY Event\n" ); do { - rc = hwmon_send_event ( hwmon_ctrl.my_hostname, MTC_EVENT_MONITOR_READY, "hwmond" ); + rc = hwmon_send_event ( hwmon_ctrl.my_hostname, MTC_EVENT_MONITOR_READY, "" ); if ( rc == RETRY ) { mtcWait_secs ( 2 ); diff --git a/mtce/src/hwmon/hwmonMsg.cpp b/mtce/src/hwmon/hwmonMsg.cpp index 6572776b..6e0aaf11 100644 --- a/mtce/src/hwmon/hwmonMsg.cpp +++ b/mtce/src/hwmon/hwmonMsg.cpp @@ -131,9 +131,7 @@ int mtclogd_tx_port_init ( void ) int hwmon_send_event ( string hostname, unsigned int event_code , const char * sensor_ptr ) { mtc_message_type event ; - - int rc = FAIL ; - int bytes = 0 ; + int rc = PASS ; memset (&event, 0 , sizeof(mtc_message_type)); @@ -149,59 +147,55 @@ int hwmon_send_event ( string hostname, unsigned int event_code , const char * s ( event_code == MTC_DEGRADE_RAISE ) || ( event_code == MTC_DEGRADE_CLEAR )) { - mlog ("%s sending '%s' event to mtcAgent for '%s'\n", - hostname.c_str(), + string event_info = "" ; + + mlog ("%s sending '%s' event to mtcAgent for '%s'\n", + hostname.c_str(), get_event_str(event_code).c_str(), sensor_ptr ); snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header()); - - snprintf ( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME , "%s", hostname.data()); + + /* Limit the size of the hostname in the header to 32 bytes + * - legacy support */ + snprintf ( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME_32, "%s", hostname.data()); + + /* Add support for up to 64 byte hostnames as a + * json string in the buffer. 
*/ + event.ver = MTC_CMD_FEATURE_VER__KEYVALUE_IN_BUF ; + event_info.append( "{\"service\":\"hwmond\",\"hostname\":\"" ) ; + event_info.append( hostname ); if ( sensor_ptr ) { - size_t len = strnlen ( sensor_ptr, MAX_SENSOR_NAME_LEN ); - - /* We don't use the buffer for hwmon events to remove it from the size */ - bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len)); + event_info.append( "\",\"sensor\":\"" ) ; + event_info.append( sensor_ptr ); + } + event_info.append( "\"}"); + snprintf ( &event.buf[event.res] , event_info.length()+1, "%s", event_info.data()); - snprintf ( &event.buf[0], MAX_SENSOR_NAME_LEN, "%s", sensor_ptr ); + /* Update the event code */ + event.cmd = event_code ; + + /* Send the event */ + rc = hwmon_sock.event_sock->write((char*)&event.hdr[0],sizeof(mtc_message_type)); + if ( rc ) + { + rc = PASS ; + } + else + { + elog ("event send to %s:%d failed (%d:%d:%m)", + hwmon_sock.event_sock->get_dst_str(), + hwmon_sock.event_sock->get_dst_addr()->getPort(), rc, errno); + rc = FAIL_SOCKET_SENDTO ; } } - else if ( event_code == MTC_EVENT_LOOPBACK ) - { - snprintf ( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME , "%s", hostname.data()); - snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_loopback_header()); - - /* We don't use the buffer for hwmon events to remove it from the size */ - bytes = ((sizeof(mtc_message_type))-(BUF_SIZE)); - } else { - elog ("Unsupported process monitor event (%d)\n", event_code ); - return ( FAIL_BAD_PARM ); - } - - /* Update the event code */ - event.cmd = event_code ; - - /* Send the event */ - if ((rc = hwmon_sock.event_sock->write((char*)&event.hdr[0],bytes)) != bytes ) - { - elog ("Message send failed. 
(%d)\n", rc); - elog ("Message: %d bytes to <%s:%d>\n", bytes, - hwmon_sock.event_sock->get_dst_str(), - hwmon_sock.event_sock->get_dst_addr()->getPort()); - rc = FAIL_SOCKET_SENDTO ; - } - else - { - mlog ("Sending '%s' Event with %d bytes to %s:%d\n", - get_event_str (event.cmd).c_str(), bytes, - hwmon_sock.event_sock->get_dst_str(), - hwmon_sock.event_sock->get_dst_addr()->getPort()); - print_mtc_message (&event); - rc = PASS ; + elog ("Unsupported hardware monitor event (%d)\n", event_code ); + rc = FAIL_BAD_PARM ; } + print_mtc_message ( hostname, MTC_CMD_TX, event, get_iface_name_str(MGMNT_INTERFACE), rc ); return rc ; } @@ -213,7 +207,7 @@ int hwmon_service_inbox ( void ) int rc = PASS ; - /* clean the rx/tx buffer */ + /* clean the rx/tx buffer */ memset ((void*)&msg,0,sizeof(mtc_message_type)); bytes = hwmon_sock.cmd_sock->read((char*)&msg.hdr[0], sizeof(mtc_message_type)); if( bytes <= 0 ) @@ -238,8 +232,6 @@ int hwmon_service_inbox ( void ) hwmon_sock.cmd_sock->get_src_str(), hwmon_sock.cmd_sock->get_dst_addr()->getPort()); - print_mtc_message ( &msg ); - if ( !strnlen ( &msg.hdr[MSG_HEADER_SIZE], MAX_CHARS_HOSTNAME )) { wlog ("Mtce message (%x) did not specify target hostname\n", msg.cmd ); @@ -254,6 +246,7 @@ int hwmon_service_inbox ( void ) wlog ("%s failed to parse host info\n", inv.name.c_str()); return (FAIL_KEY_VALUE_PARSE); } + print_mtc_message ( inv.name, MTC_CMD_RX, msg, get_iface_name_str(MGMNT_IFACE) , false); rc = PASS; if ( msg.cmd == MTC_CMD_ADD_HOST ) @@ -269,7 +262,6 @@ int hwmon_service_inbox ( void ) { mlog ("%s add host message\n", inv.name.c_str()); } - } else if ( msg.cmd == MTC_CMD_DEL_HOST ) { @@ -306,17 +298,13 @@ int hwmon_service_inbox ( void ) { mlog ("%s query host message - NOT IMPLEMENTED YET !!!\n", inv.name.c_str()); } - else if ( msg.cmd == MTC_CMD_LOOPBACK ) - { - mlog ("Loopback command received\n"); - } else { rc = FAIL_BAD_PARM ; elog ( "Unsupported maintenance command (%d)\n", msg.cmd ); - } + } } - else + else { 
elog ("Unsupported Message\n"); print_mtc_message ( &msg ) ; @@ -324,15 +312,14 @@ int hwmon_service_inbox ( void ) } #ifdef WANT_COMMAND_RESPONSE - /* TODO: Test and enable reply message */ - // snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_rsp_msg_header()); + // snprintf ( &msg.hdr[0], MSG_HEADER_SIZE, "%s", get_cmd_rsp_msg_header()); if ( rc == PASS ) - { + { bytes = sizeof(mtc_message_type)-BUF_SIZE; - rc = sendto( hwmon_sock.mtc_client_tx_sock, + rc = sendto( hwmon_sock.mtc_client_tx_sock, (char*)&msg.hdr[0], bytes , 0, - (struct sockaddr *) &hwmon_sock.agent_addr, + (struct sockaddr *) &hwmon_sock.agent_addr, sizeof(hwmon_sock.agent_addr)); if (rc != bytes ) { @@ -345,9 +332,8 @@ int hwmon_service_inbox ( void ) { mlog ("Response: <%s> to %s:%d\n", &msg.hdr[0], inet_ntoa(hwmon_sock.client_addr.sin_addr), - ntohs(hwmon_sock.agent_addr.sin_port)); + ntohs(hwmon_sock.agent_addr.sin_port)); } - fflush(stdout); } #endif return (rc); diff --git a/mtce/src/maintenance/mtcCompMsg.cpp b/mtce/src/maintenance/mtcCompMsg.cpp index 8ed0b7a3..87c8cde2 100755 --- a/mtce/src/maintenance/mtcCompMsg.cpp +++ b/mtce/src/maintenance/mtcCompMsg.cpp @@ -61,10 +61,11 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) int bytes = 0 ; mtc_message_type msg ; int rc = FAIL ; + ctrl_type * ctrl_ptr = get_ctrl_ptr() ; if ( interface == CLSTR_INTERFACE ) { - if ( ! get_ctrl_ptr()->clstr_iface_provisioned ) + if ( ! 
ctrl_ptr->clstr_iface_provisioned ) { wlog ("cannot receive from unprovisioned %s interface\n", get_iface_name_str(interface) ); @@ -74,13 +75,14 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) /* clean the rx/tx buffer */ memset ((void*)&msg,0,sizeof(mtc_message_type)); - + string hostaddr = "" ; if ( interface == MGMNT_INTERFACE ) { if (( sock_ptr->mtc_client_rx_socket ) && ( sock_ptr->mtc_client_rx_socket->sock_ok() == true )) { - bytes = sock_ptr->mtc_client_rx_socket->read((char*)&msg.hdr[0], sizeof(mtc_message_type)); + rc = sock_ptr->mtc_client_rx_socket->read((char*)&msg.hdr[0], sizeof(mtc_message_type)); + hostaddr = sock_ptr->mtc_client_rx_socket->get_src_str(); } else { @@ -93,7 +95,8 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) if (( sock_ptr->mtc_client_clstr_rx_socket ) && ( sock_ptr->mtc_client_clstr_rx_socket->sock_ok() == true )) { - bytes = sock_ptr->mtc_client_clstr_rx_socket->read((char*)&msg.hdr[0], sizeof(mtc_message_type)); + rc = sock_ptr->mtc_client_clstr_rx_socket->read((char*)&msg.hdr[0], sizeof(mtc_message_type)); + hostaddr = sock_ptr->mtc_client_clstr_rx_socket->get_src_str(); } else { @@ -102,7 +105,7 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) } } - if( bytes <= 0 ) + if( rc <= 0 ) { if ( ( errno == EINTR ) || ( errno == EAGAIN )) { @@ -113,24 +116,34 @@ int mtc_service_command ( mtc_socket_type * sock_ptr, int interface ) return (FAIL_TO_RECEIVE); } } + rc = PASS ; - print_mtc_message ( get_hostname(), MTC_CMD_RX, msg, get_iface_name_str(interface), false ); + bool self = false ; + if (( hostaddr == ctrl_ptr->address ) || + ( hostaddr == ctrl_ptr->address_clstr )) + { + self = true ; + } /* Message version greater than zero have the hosts management * mac address appended to the header string */ - if ( msg.ver >= MTC_CMD_FEATURE_VER__MACADDR_IN_CMD ) + if (( !self ) && ( msg.ver >= MTC_CMD_FEATURE_VER__MACADDR_IN_CMD )) { /* the minus 1 is to back up 
from the null char that is accounted for in the hearder size */ - if ( strncmp ( &msg.hdr[MSG_HEADER_SIZE-1], get_ctrl_ptr()->macaddr.data(), MSG_HEADER_SIZE )) + if ( strncmp ( &msg.hdr[MSG_HEADER_SIZE-1], ctrl_ptr->macaddr.data(), MSG_HEADER_SIZE )) { wlog ("%s command not for this host (exp:%s det:%s) ; ignoring ...\n", get_mtcNodeCommand_str(msg.cmd), - get_ctrl_ptr()->macaddr.c_str(), + ctrl_ptr->macaddr.c_str(), &msg.hdr[MSG_HEADER_SIZE-1]); - print_mtc_message ( get_hostname(), MTC_CMD_RX, msg, get_iface_name_str(interface), true ); - return (FAIL_INVALID_DATA); + rc = FAIL_INVALID_DATA ; } } + + print_mtc_message ( hostaddr, MTC_CMD_RX, msg, get_iface_name_str(interface), rc ); + if ( rc ) + return rc; + /* Check for response messages */ if ( strstr ( &msg.hdr[0], get_cmd_req_msg_header() ) ) { diff --git a/mtce/src/maintenance/mtcCtrlMsg.cpp b/mtce/src/maintenance/mtcCtrlMsg.cpp index 6cbf86b1..b2ba4bea 100755 --- a/mtce/src/maintenance/mtcCtrlMsg.cpp +++ b/mtce/src/maintenance/mtcCtrlMsg.cpp @@ -193,15 +193,16 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(iface), false ); + /* handle messages that are not mtc_message_type + * but rather are simply a json string */ if ( msg.hdr[0] == '{' ) { - int rc1 ; string service ; mlog1 ("%s\n", &msg.hdr[0] ); - rc1 = jsonUtil_get_key_val(&msg.hdr[0],"service", service ); - if ( rc1 == PASS ) + rc = jsonUtil_get_key_val(&msg.hdr[0],"service", service ); + if ( rc == PASS ) { if ( service == "collectd_notifier" ) { @@ -215,23 +216,27 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, { elog ("failed to parse '%s' message\n", service.c_str()); wlog ("... 
%s\n", &msg.hdr[0] ); + rc = FAIL_JSON_PARSE ; } else { obj_ptr->collectd_notify_handler ( hostname, resource, state ); + return (PASS) ; } } /* future service requests */ else { wlog ("Unexpected service request: '%s'\n", service.c_str()); + rc = FAIL_BAD_PARM ; } } else { wlog("Unexpected json message: %s\n", &msg.hdr[0] ); + rc = FAIL_BAD_CASE ; } } @@ -246,112 +251,112 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, */ else if ( strstr ( &msg.hdr[0], get_worker_msg_header() ) ) { - if ( msg.cmd == MTC_MSG_MTCALIVE ) + if ( msg.cmd == MTC_MSG_MTCALIVE ) + { + string functions = "" ; + rc = jsonUtil_get_key_val ( &msg.buf[0], "personality", functions ); + if ( rc ) { - string functions = "" ; - rc = jsonUtil_get_key_val ( &msg.buf[0], "personality", functions ); - if ( rc ) - { - wlog ("%s failed to get personality from mtcAlive message\n", hostname.c_str()); - return (FAIL_KEY_VALUE_PARSE); - } - rc = obj_ptr->update_host_functions ( hostname, functions ); - dlog3 ("%s functions: %s\n", hostname.c_str(), functions.c_str()); - if ( rc ) - { - wlog ("%s failed to load functions from mtcAlive message\n", hostname.c_str()); - return (FAIL_NODETYPE); - } - obj_ptr->set_uptime ( hostname , msg.parm[MTC_PARM_UPTIME_IDX], false ); - obj_ptr->set_health ( hostname , msg.parm[MTC_PARM_HEALTH_IDX] ); - obj_ptr->set_mtce_flags ( hostname , msg.parm[MTC_PARM_FLAGS_IDX] ); - - obj_ptr->set_mtcAlive ( hostname, iface ); - - mlog1("%s Uptime:%d Health:%d Flags:0x%x mtcAlive:%s\n", - hostname.c_str(), - msg.parm[MTC_PARM_UPTIME_IDX], - msg.parm[MTC_PARM_HEALTH_IDX], - msg.parm[MTC_PARM_FLAGS_IDX], - obj_ptr->get_mtcAlive_gate ( hostname ) ? 
"gated" : "open"); - - string cluster_host_ip = ""; - /* Get the clstr ip address if it is provisioned */ - rc = jsonUtil_get_key_val ( &msg.buf[0], "cluster_host_ip", cluster_host_ip ); - if ( rc == PASS ) - { - obj_ptr->set_clstr_hostaddr ( hostname, cluster_host_ip ); - } - else - { - mlog ("%s null or missing 'cluster_host_ip' value (rc:%d)\n", hostname.c_str(), rc); - } + wlog ("%s failed to get personality from mtcAlive message\n", hostname.c_str()); + return (FAIL_KEY_VALUE_PARSE); } - else if ( msg.cmd == MTC_MSG_MAIN_GOENABLED ) + rc = obj_ptr->update_host_functions ( hostname, functions ); + dlog3 ("%s functions: %s\n", hostname.c_str(), functions.c_str()); + if ( rc ) { - if ( !obj_ptr->my_hostname.compare(hostname) ) - { - ilog ("%s received GOENABLED from self\n", hostname.c_str()); - } - rc = send_mtc_cmd ( hostname , msg.cmd, MGMNT_INTERFACE ); - if ( rc != PASS ) - { - elog ("%s GOENABLED send reply failed (rc:%d)\n", - hostname.c_str(), rc); + wlog ("%s failed to load functions from mtcAlive message\n", hostname.c_str()); + return (FAIL_NODETYPE); + } + obj_ptr->set_uptime ( hostname , msg.parm[MTC_PARM_UPTIME_IDX], false ); + obj_ptr->set_health ( hostname , msg.parm[MTC_PARM_HEALTH_IDX] ); + obj_ptr->set_mtce_flags ( hostname , msg.parm[MTC_PARM_FLAGS_IDX] ); - wlog ("%s ... 
need successful GOENABLED reply, dropping ...\n", - hostname.c_str() ); - } - else - { - mlog ("%s got GOENABLED (out-of-service tests passed) message\n", hostname.c_str()); - obj_ptr->set_goEnabled ( hostname ); - } - } - else if ( msg.cmd == MTC_MSG_MAIN_GOENABLED_FAILED ) - { - if ( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) - { - wlog ("%s failed out-of-service test: %s\n", hostname.c_str(), &msg.buf[0] ); - obj_ptr->set_goEnabled_failed ( hostname ); - } - /* We don't send a reply on a fail */ - } - else if ( msg.cmd == MTC_MSG_SUBF_GOENABLED ) - { - mlog ("%s-worker GOENABLED message\n", hostname.c_str()); - if ( !obj_ptr->my_hostname.compare(hostname) ) - { - ilog ("%s-worker received GOENABLED from self\n", hostname.c_str()); - } - rc = send_mtc_cmd ( hostname , msg.cmd, MGMNT_INTERFACE ); - if ( rc != PASS ) - { - elog ("%s-worker GOENABLED send reply failed (rc:%d)\n", - hostname.c_str(), rc); + obj_ptr->set_mtcAlive ( hostname, iface ); - wlog ("%s-worker ... need successful GOENABLED reply, dropping ...\n", - hostname.c_str() ); - } - else - { - mlog ("%s-worker got GOENABLED (out-of-service tests passed) message\n", hostname.c_str()); - obj_ptr->set_goEnabled_subf ( hostname ); - } - } - else if ( msg.cmd == MTC_MSG_SUBF_GOENABLED_FAILED ) + mlog1("%s Uptime:%d Health:%d Flags:0x%x mtcAlive:%s\n", + hostname.c_str(), + msg.parm[MTC_PARM_UPTIME_IDX], + msg.parm[MTC_PARM_HEALTH_IDX], + msg.parm[MTC_PARM_FLAGS_IDX], + obj_ptr->get_mtcAlive_gate ( hostname ) ? 
"gated" : "open"); + + string cluster_host_ip = ""; + /* Get the clstr ip address if it is provisioned */ + rc = jsonUtil_get_key_val ( &msg.buf[0], "cluster_host_ip", cluster_host_ip ); + if ( rc == PASS ) { - if ( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) - { - wlog ("%s-worker failed GOENABLE test: %s\n", hostname.c_str(), &msg.buf[0] ); - obj_ptr->set_goEnabled_failed_subf ( hostname ); - } - /* We don't send a reply on a fail */ + obj_ptr->set_clstr_hostaddr ( hostname, cluster_host_ip ); } else { - wlog ("Unexpected worker message (0x%x) from '%s'\n", msg.cmd, hostname.c_str()); + mlog ("%s null or missing 'cluster_host_ip' value (rc:%d)\n", hostname.c_str(), rc); } + } + else if ( msg.cmd == MTC_MSG_MAIN_GOENABLED ) + { + if ( !obj_ptr->my_hostname.compare(hostname) ) + { + ilog ("%s received GOENABLED from self\n", hostname.c_str()); + } + rc = send_mtc_cmd ( hostname , msg.cmd, MGMNT_INTERFACE ); + if ( rc != PASS ) + { + elog ("%s GOENABLED send reply failed (rc:%d)\n", + hostname.c_str(), rc); + + wlog ("%s ... 
need successful GOENABLED reply, dropping ...\n", + hostname.c_str() ); + } + else + { + mlog ("%s got GOENABLED (out-of-service tests passed) message\n", hostname.c_str()); + obj_ptr->set_goEnabled ( hostname ); + } + } + else if ( msg.cmd == MTC_MSG_MAIN_GOENABLED_FAILED ) + { + if ( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) + { + wlog ("%s failed out-of-service test: %s\n", hostname.c_str(), &msg.buf[0] ); + obj_ptr->set_goEnabled_failed ( hostname ); + } + /* We don't send a reply on a fail */ + } + else if ( msg.cmd == MTC_MSG_SUBF_GOENABLED ) + { + mlog ("%s-worker GOENABLED message\n", hostname.c_str()); + if ( !obj_ptr->my_hostname.compare(hostname) ) + { + ilog ("%s-worker received GOENABLED from self\n", hostname.c_str()); + } + rc = send_mtc_cmd ( hostname , msg.cmd, MGMNT_INTERFACE ); + if ( rc != PASS ) + { + elog ("%s-worker GOENABLED send reply failed (rc:%d)\n", + hostname.c_str(), rc); + + wlog ("%s-worker ... need successful GOENABLED reply, dropping ...\n", + hostname.c_str() ); + } + else + { + mlog ("%s-worker got GOENABLED (out-of-service tests passed) message\n", hostname.c_str()); + obj_ptr->set_goEnabled_subf ( hostname ); + } + } + else if ( msg.cmd == MTC_MSG_SUBF_GOENABLED_FAILED ) + { + if ( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) + { + wlog ("%s-worker failed GOENABLE test: %s\n", hostname.c_str(), &msg.buf[0] ); + obj_ptr->set_goEnabled_failed_subf ( hostname ); + } + /* We don't send a reply on a fail */ + } + else + { + wlog ("Unexpected worker message (0x%x) from '%s'\n", msg.cmd, hostname.c_str()); + } } /* @@ -359,183 +364,180 @@ int mtc_service_inbox ( nodeLinkClass * obj_ptr, */ else if ( strstr ( &msg.hdr[0], get_mtce_event_header() ) ) { - rc = PASS ; - if ( hostname.empty() ) - { - mlog2 ( "Received mtce event from unknown host\n"); - rc = FAIL_UNKNOWN_HOSTNAME ; - } - else if ( !hostname.compare("localhost") ) - { - mlog2 ("localhost event (%x) ignored", msg.cmd); - } - 
else - { - string event = "" ; + string service = "" ; + string sensor = "" ; + string process = "" ; + hostname = "unknown" ; - /* TODO: fix this hostname setting */ - if (( msg.cmd == MTC_DEGRADE_CLEAR ) || - ( msg.cmd == MTC_DEGRADE_RAISE ) || - ( msg.cmd == MTC_EVENT_HWMON_CLEAR ) || - ( msg.cmd == MTC_EVENT_HWMON_MINOR ) || - ( msg.cmd == MTC_EVENT_HWMON_MAJOR ) || - ( msg.cmd == MTC_EVENT_HWMON_CRIT ) || - ( msg.cmd == MTC_EVENT_HWMON_RESET ) || - ( msg.cmd == MTC_EVENT_HWMON_POWERDOWN ) || - ( msg.cmd == MTC_EVENT_HWMON_POWERCYCLE) || - ( msg.cmd == MTC_EVENT_HWMON_CONFIG )) + int rc1 = FAIL ; + if ( ( rc = jsonUtil_get_key_val(&msg.buf[0], MTC_JSON_INV_NAME, hostname )) == PASS ) + { + if ( ( rc1 = jsonUtil_get_key_val(&msg.buf[0], MTC_JSON_SERVICE, service )) == PASS ) { - hostname = &msg.hdr[MSG_HEADER_SIZE] ; + if (( msg.cmd == MTC_EVENT_HWMON_CLEAR ) || + ( msg.cmd == MTC_EVENT_HWMON_MINOR ) || + ( msg.cmd == MTC_EVENT_HWMON_MAJOR ) || + ( msg.cmd == MTC_EVENT_HWMON_CRIT ) || + ( msg.cmd == MTC_EVENT_HWMON_RESET ) || + ( msg.cmd == MTC_EVENT_HWMON_POWERDOWN )|| + ( msg.cmd == MTC_EVENT_HWMON_POWERCYCLE )) + { + jsonUtil_get_key_val(&msg.buf[0], MTC_JSON_SENSOR, sensor ); + } + else if (( msg.cmd == MTC_EVENT_PMON_CLEAR ) || + ( msg.cmd == MTC_EVENT_PMON_CRIT ) || + ( msg.cmd == MTC_EVENT_PMON_MAJOR ) || + ( msg.cmd == MTC_EVENT_PMON_MINOR ) || + ( msg.cmd == MTC_EVENT_PMON_LOG )) + { + jsonUtil_get_key_val(&msg.buf[0], MTC_JSON_PROCESS, process ); + } } - /* the mtce event (process or resource) that causes this raised event is at the - * head of the message buffer. Load it into an 'event' - * string to be passed into the individual handlers for - * convenience. Safer to pass reference to a string than - * the raw buffer pointer. 
*/ - if ( strnlen ( &msg.buf[0] , MAX_MTCE_EVENT_NAME_LEN ) ) + } + if (( rc | rc1 ) != PASS ) + { + elog ("received invalid event [rc:%d:%d]", rc, rc1); + print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(iface), true ); + return ( FAIL_INVALID_OPERATION ); + } + switch ( msg.cmd ) + { + case MTC_EVENT_MONITOR_READY: { - event = msg.buf ; - } - - switch ( msg.cmd ) - { - /* TODO: Port other services to use this common code */ - case MTC_EVENT_MONITOR_READY: + if ( service == MTC_SERVICE_PMOND_NAME ) + { + obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_PMOND ); + return (PASS); + } + else if ( service == MTC_SERVICE_HBSCLIENT_NAME ) + { + obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_HEARTBEAT ); + return (PASS); + } + if ( service == MTC_SERVICE_HWMOND_NAME ) { std::list::iterator temp ; - // bool start_monitoring_flag = false ; - if ( !event.compare("pmond") ) - { - /* Notify mtcAgent that we got a pmond ready event */ - obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_PMOND ); - return (PASS); - } - else if ( !event.compare("hbsClient") ) - { - /* Notify mtcAgent that we got a hbsClient ready event */ - obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_HEARTBEAT ); - return (PASS); - } - - /* If the active controller got the ready event from a local service - * then push the inventory to that service and for each host that is - * enabled send the start monitoring command to it if the bm_ip is - * provisioned. - * Handles the daemon restart case */ - for ( temp = obj_ptr->hostname_inventory.begin () ; + /* push inventory to hardware hwmond. + * handles the daemon restart case. + */ + for ( temp = obj_ptr->hostname_inventory.begin () ; temp != obj_ptr->hostname_inventory.end () ; temp++ ) { hostname = temp->data(); - - /* Set the general start monitoring flag based on service state. 
- * This lag may be over ridden my individual services based on - * additional information */ - if (( obj_ptr->get_adminState ( hostname ) == MTC_ADMIN_STATE__UNLOCKED ) && - ( obj_ptr->get_operState ( hostname ) == MTC_OPER_STATE__ENABLED ) && - ((obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__AVAILABLE ) || - (obj_ptr->get_availStatus ( hostname ) == MTC_AVAIL_STATUS__DEGRADED ))) - { - ; // start_monitoring_flag = true ; - } - else - { - ; // start_monitoring_flag = false ; - } - - if ( !event.compare("hwmond") ) - { - obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_HWMOND ); - } - else - { - wlog ("%s Global Ready Event not supported for '%s' service\n", - hostname.c_str(), event.c_str()); - - return (FAIL_BAD_PARM); - } + obj_ptr->declare_service_ready ( hostname, MTC_SERVICE_HWMOND ); } - break ; } + else + { + wlog ("%s ready event not supported for '%s' service\n", + hostname.c_str(), service.c_str()); + return (FAIL_BAD_PARM); + } + break ; + } - /***************************************************************** - * Data Port Events * - *****************************************************************/ + /***************************************************************** + * Process Monitor Events + * ---------------------- + * + * service is the process name for this event. + * parm[0] is the nodetype the process serves. 
+ * + *****************************************************************/ + case MTC_EVENT_PMON_CLEAR: + { + mlog ("%s %s: '%s' recovered (clear)\n", + hostname.c_str(), + MTC_SERVICE_PMOND_NAME, + service.c_str()); - /***************************************************************** - * Process Monitor Events * - *****************************************************************/ - case MTC_EVENT_PMON_CLEAR: - { - mlog ("%s pmond: '%s' recovered (clear)\n", hostname.c_str(), event.c_str()); - obj_ptr->degrade_pmond_clear ( hostname ); - break ; - } - case MTC_EVENT_PMON_CRIT: - { - mlog ("%s pmond: '%s' failed (critical)\n", hostname.c_str(), event.c_str()); + obj_ptr->degrade_pmond_clear ( hostname ); + break ; + } + case MTC_EVENT_PMON_CRIT: + { + mlog ("%s %s: '%s' failed (critical)\n", + hostname.c_str(), + MTC_SERVICE_PMOND_NAME, + process.c_str()); - /** - * event is the process name that has failed - * parm[0] is the nodetype the process serves - **/ - obj_ptr->critical_process_failed ( hostname, event, msg.parm[0] ); - break ; - } - case MTC_EVENT_PMON_MAJOR: - { - mlog ("%s pmond: '%s' failed (major)\n", hostname.c_str(), event.c_str()); - obj_ptr->degrade_process_raise ( hostname, event ); - break ; - } - case MTC_EVENT_PMON_MINOR: - { - mlog ("%s pmond: '%s' failed (minor)\n", hostname.c_str(), event.c_str()); - obj_ptr->alarm_process_failure ( hostname, event ); - break ; - } - case MTC_EVENT_PMON_LOG: - { - mlog ("%s pmond: '%s' failed (log)\n", hostname.c_str(), event.c_str()); - obj_ptr->log_process_failure ( hostname, event ); - break ; - } + obj_ptr->critical_process_failed ( hostname, + process, + msg.parm[0] ); + break ; + } + case MTC_EVENT_PMON_MAJOR: + { + mlog ("%s %s: '%s' failed (major)\n", + hostname.c_str(), + MTC_SERVICE_PMOND_NAME, + process.c_str()); + obj_ptr->degrade_process_raise ( hostname, process ); + break ; + } + case MTC_EVENT_PMON_MINOR: + { + mlog ("%s %s: '%s' failed (minor)\n", + hostname.c_str(), + MTC_SERVICE_PMOND_NAME, 
+ process.c_str()); + obj_ptr->alarm_process_failure ( hostname, process ); + break ; + } + case MTC_EVENT_PMON_LOG: + { + mlog ("%s %s: '%s' failed (log)\n", + hostname.c_str(), + MTC_SERVICE_PMOND_NAME, + process.c_str()); + obj_ptr->log_process_failure ( hostname, process ); + break ; + } - case MTC_EVENT_HWMON_CLEAR: - case MTC_DEGRADE_CLEAR: - { - mlog ("%s hwmon requests to clear its degrade flag\n", hostname.c_str()); - obj_ptr->node_degrade_control ( hostname, MTC_DEGRADE_CLEAR , "hwmon" ); - break ; - } - case MTC_EVENT_HWMON_MINOR: - case MTC_EVENT_HWMON_MAJOR: - case MTC_EVENT_HWMON_CRIT: - case MTC_DEGRADE_RAISE: - { - mlog ("%s hwmon requested to set its degrade flag\n", hostname.c_str()); - obj_ptr->node_degrade_control ( hostname, MTC_DEGRADE_RAISE , "hwmon" ); - break ; - } - case MTC_EVENT_HWMON_RESET: - case MTC_EVENT_HWMON_POWERDOWN: - case MTC_EVENT_HWMON_POWERCYCLE: - { - mlog ("%s requires maintenance '%s' action due to failing '%s' sensor \n", - hostname.c_str(), - get_event_str(msg.cmd).c_str(), - event.c_str()); + case MTC_EVENT_HWMON_CLEAR: + case MTC_DEGRADE_CLEAR: + { + mlog ("%s %s degrade clear request", + hostname.c_str(), + service.c_str()); + obj_ptr->node_degrade_control ( hostname, + MTC_DEGRADE_CLEAR, + service ); + break ; + } + case MTC_EVENT_HWMON_MINOR: + case MTC_EVENT_HWMON_MAJOR: + case MTC_EVENT_HWMON_CRIT: + case MTC_DEGRADE_RAISE: + { + mlog ("%s %s degrade request %s", + hostname.c_str(), + service.c_str(), + sensor.empty() ? 
"" : sensor.c_str()); + obj_ptr->node_degrade_control ( hostname, + MTC_DEGRADE_RAISE, + sensor ); + break ; + } + case MTC_EVENT_HWMON_RESET: + case MTC_EVENT_HWMON_POWERDOWN: + case MTC_EVENT_HWMON_POWERCYCLE: + { + mlog ("%s '%s' action due to failing '%s' sensor", + hostname.c_str(), + get_event_str(msg.cmd).c_str(), + sensor.c_str()); - obj_ptr->invoke_hwmon_action ( hostname, msg.cmd, event ); - break ; - } - default: - { - wlog ("%s Unknown Event (%x)\n", hostname.c_str(), msg.cmd ); - rc = FAIL ; - break ; - } + obj_ptr->invoke_hwmon_action ( hostname, msg.cmd, sensor ); + break ; + } + default: + { + wlog ("%s Unknown Event (%x)\n", hostname.c_str(), msg.cmd ); + rc = FAIL ; + break ; } } } @@ -731,8 +733,6 @@ int send_mtc_cmd ( string & hostname, int cmd , int interface ) int send_hbs_command ( string hostname, int cmd, string controller ) { - int bytes = 0 ; - int bytes_to_send = 0 ; int rc = PASS ; nodeLinkClass * obj_ptr = get_mtcInv_ptr () ; @@ -741,11 +741,7 @@ int send_hbs_command ( string hostname, int cmd, string controller ) memset (&event, 0 , sizeof(mtc_message_type)); snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_hbs_cmd_req_header() ); - snprintf ( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME , "%s", hostname.data()); - - /* There is no buffer data in any of these messages */ - bytes_to_send = ((sizeof(mtc_message_type))-(BUF_SIZE)) ; - + snprintf ( &event.hdr[MSG_HEADER_SIZE] , MAX_CHARS_HOSTNAME_32 , "%s", hostname.data()); event.cmd = cmd ; event.num = 1 ; @@ -773,47 +769,105 @@ int send_hbs_command ( string hostname, int cmd, string controller ) { switch ( cmd ) { - case MTC_CMD_ACTIVE_CTRL: - mlog3 ("%s sending 'activity state' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); - break ; - case MTC_CMD_STOP_HOST: - ilog ("%s sending 'stop' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); - break ; - case MTC_CMD_START_HOST: - obj_ptr->manage_heartbeat_clear ( hostname , MAX_IFACES ); - ilog ("%s 
sending 'start' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); - break ; - case MTC_CMD_DEL_HOST: - ilog ("%s sending 'delete' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); - break ; - case MTC_CMD_ADD_HOST: - obj_ptr->manage_heartbeat_clear ( hostname, MAX_IFACES ); - ilog ("%s sending 'add' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); - break ; - case MTC_RESTART_HBS: - ilog ("%s sending 'restart' to %s heartbeat service\n", hostname.c_str(), unit->c_str()); - break ; - case MTC_BACKOFF_HBS: - ilog ("%s requesting %s heartbeat period backoff\n", hostname.c_str(), unit->c_str()); - break ; - case MTC_RECOVER_HBS: - ilog ("%s requesting %s heartbeat period recovery\n", hostname.c_str(), unit->c_str()); - break ; - default: - { - slog ("%s Unsupported command operation 0x%x\n", hostname.c_str(), cmd ); - rc = FAIL_BAD_PARM ; - continue ; - } + case MTC_CMD_ADD_HOST: + case MTC_CMD_MOD_HOST: + case MTC_CMD_START_HOST: + obj_ptr->manage_heartbeat_clear ( hostname, MAX_IFACES ); + break ; + case MTC_CMD_ACTIVE_CTRL: + case MTC_CMD_STOP_HOST: + case MTC_CMD_DEL_HOST: + case MTC_RESTART_HBS: + case MTC_BACKOFF_HBS: + case MTC_RECOVER_HBS: + break ; + default: + { + slog ("%s Unsupported command operation 0x%x\n", hostname.c_str(), cmd ); + rc = FAIL_BAD_PARM ; + continue ; + } } - ip = get_mtcInv_ptr()->get_hostaddr(*unit) ; - bytes = sock_ptr->mtc_to_hbs_sock->write((char*) &event, bytes_to_send, ip.data()); - if ( bytes <= 0 ) + /* the command */ + event.cmd = cmd ; + + /* add the node type */ + event.num = 1 ; + event.parm[0] = obj_ptr->get_nodetype(hostname); + + /* support for 64 byte hostnames */ + event.ver = MTC_CMD_FEATURE_VER__KEYVALUE_IN_BUF ; + + /* the json string with hostname starts at the beginning of the buffer */ + event.res = 0 ; + + /* build the message info */ + string hbs_info = "{\""; + hbs_info.append(MTC_JSON_INV_NAME); + hbs_info.append("\":\"") ; + hbs_info.append(hostname); + + 
hbs_info.append("\",\""); + hbs_info.append(MTC_JSON_INV_HOSTIP); + hbs_info.append("\":\""); + hbs_info.append(obj_ptr->get_hostaddr(hostname)); + + if ( obj_ptr->clstr_network_provisioned ) { - wlog ("%s failed to send command (0x%x) to heartbeat service at %s\n", unit->c_str(), cmd, ip.c_str() ); - rc = FAIL_TO_TRANSMIT ; + hbs_info.append("\",\""); + hbs_info.append(MTC_JSON_INV_CLSTRIP); + hbs_info.append("\":\""); + hbs_info.append(obj_ptr->get_clstr_hostaddr(hostname)); } + hbs_info.append("\"}"); + + /* copy the json info string into the buffer. + * + * add one to the length to accomodate for the null terminator + * snprintf automatically adds */ + snprintf ( &event.buf[event.res], hbs_info.length()+1, + "%s", hbs_info.data()); + + /* send to hbsAgent for the specific controller */ + string ip = get_mtcInv_ptr()->get_hostaddr(*unit) ; + if ( ! ip.empty() ) + { + rc = sock_ptr->mtc_to_hbs_sock->write((char*) &event, + sizeof(mtc_message_type), + ip.data()); + if ( rc <= 0 ) + { + wlog ("%s send command (0x%x) failed (%s)", + unit->c_str(), cmd, ip.c_str() ); + rc = FAIL_TO_TRANSMIT ; + } + else + { + if ( cmd == MTC_CMD_ACTIVE_CTRL ) + { + mlog3 ("%s %s sent to %s %s", + hostname.c_str(), + get_mtcNodeCommand_str(cmd), + unit->c_str(), + MTC_SERVICE_HBSAGENT_NAME); + } + else + { + ilog ("%s %s sent to %s %s", + hostname.c_str(), + get_mtcNodeCommand_str(cmd), + unit->c_str(), + MTC_SERVICE_HBSAGENT_NAME); + } + rc = PASS ; + } + } + else + { + rc = FAIL_STRING_EMPTY ; + } + print_mtc_message ( hostname, MTC_CMD_RX, event, get_iface_name_str(MGMNT_INTERFACE), rc ); } return rc ; } @@ -839,7 +893,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) { wlog ("%s ignoring service event from unknown host (%s)", obj_ptr->my_hostname.c_str(), hostaddr.c_str()); - return (PASS); + return (FAIL_UNKNOWN_HOSTNAME); } if (( hostname != obj_ptr->my_hostname ) && (( msg.cmd == MTC_EVENT_HEARTBEAT_LOSS ) || @@ -848,31 +902,64 @@ int 
service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET )|| ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_CLR ))) { - wlog ("%s %s from %s heartbeat service", - &msg.buf[0], - get_mtcNodeCommand_str(msg.cmd), - hostname.c_str()); + mlog3 ("%s '%s' heartbeat event for '%s' from inactive controller ... ignoring", + hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + msg.buf[0] ? &msg.buf[0] : "unknown host"); return (PASS); } - if ( msg.cmd == MTC_EVENT_LOOPBACK ) - { - const char * event_hdr_ptr = get_loopback_header() ; - /* Confirm header */ - if ( strncmp ( &msg.hdr[0], event_hdr_ptr, MSG_HEADER_SIZE ) ) + else if (( msg.cmd == MTC_EVENT_HEARTBEAT_LOSS ) || + ( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_SET ) || + ( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_CLR ) || + ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_SET ) || + ( msg.cmd == MTC_EVENT_HEARTBEAT_DEGRADE_CLR ) || + ( msg.cmd == MTC_EVENT_PMOND_CLEAR ) || + ( msg.cmd == MTC_EVENT_PMOND_RAISE ) || + ( msg.cmd == MTC_EVENT_HOST_STALLED )) + { + if (( msg.ver >= MTC_CMD_FEATURE_VER__KEYVALUE_IN_BUF ) && + ( msg.buf[msg.res] == '{' )) { - elog ("Invalid Event header\n"); + jsonUtil_get_key_val(&msg.buf[msg.res], MTC_JSON_INV_NAME, hostname) ; + } + else if ( msg.buf[0] != '\0' ) + { + hostname = &msg.buf[0] ; } else { - ilog ("Service ping\n"); - - /* Should send back a response */ + slog ("failed to get hostname from '%s' message", + get_mtcNodeCommand_str(msg.cmd)); + print_mtc_message ( "unknown", MTC_CMD_TX, msg, get_iface_name_str(MGMNT_INTERFACE), true ); + return (FAIL_UNKNOWN_HOSTNAME); } } - else if (( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_SET ) || - ( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_CLR )) + /* print the ready event log */ + if (( msg.cmd != MTC_EVENT_HEARTBEAT_READY ) && ( !hostname.empty () )) + { + string log_suffix = "" ; + if (msg.num) + { + log_suffix = "(" ; + log_suffix.append(get_iface_name_str((int)msg.parm[0])); + log_suffix.append(")") ; + } 
+ if ( msg.cmd != MTC_EVENT_MONITOR_READY ) + { + ilog ("%s %s %s", + hostname.c_str(), + get_mtcNodeCommand_str(msg.cmd), + log_suffix.c_str() ); + } + } + + /* handle the events */ + /* ----------------- */ + int rc = PASS ; + if (( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_SET ) || + ( msg.cmd == MTC_EVENT_HEARTBEAT_MINOR_CLR )) { const char * event_hdr_ptr = get_heartbeat_event_header() ; @@ -880,14 +967,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) if ( strncmp ( &msg.hdr[0], event_hdr_ptr, MSG_HEADER_SIZE ) ) { elog ("Invalid Heartbeat Event header\n"); + rc = FAIL_BAD_PARM ; } else { - string hostname = &msg.buf[0] ; - print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(MGMNT_INTERFACE), false ); - - /* The interface that the heartbeat loss occurred over is - * specified in parm[0 for this command + /* The interface that the heartbeat minor occurred over is + * specified in parm[0] for this command * 0 = MGMNT_IFACE * 1 = CLSTR_IFACE * else default to 0 (MGMNT_IFACE) to be backwards compatible @@ -922,14 +1007,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) if ( strncmp ( &msg.hdr[0], event_hdr_ptr, MSG_HEADER_SIZE ) ) { elog ("Invalid Heartbeat Event header\n"); + rc = FAIL_BAD_PARM ; } else { - string hostname = &msg.buf[0] ; - print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(MGMNT_INTERFACE), false ); - - /* The interface that the heartbeat loss occurred over is - * specified in parm[0 for this command + /* The interface that the heartbeatdegrade occurred over is + * specified in parm[0] for this command * 0 = MGMNT_IFACE * 1 = CLSTR_IFACE * else default to 0 (MGMNT_IFACE) to be backwards compatible @@ -976,6 +1059,7 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) if ( strncmp ( &msg.hdr[0], loss_hdr_ptr, MSG_HEADER_SIZE ) ) { elog ("Invalid Heartbeat Loss event header\n"); + rc = FAIL_BAD_PARM ; } else { @@ -994,8 +1078,6 @@ int 
service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) iface = CLSTR_IFACE ; } } - string hostname = &msg.buf[0] ; - print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(MGMNT_INTERFACE), false ); /* If heartbeat failure action is fail then call the fail handler */ if ( obj_ptr->hbs_failure_action == HBS_FAILURE_ACTION__FAIL ) @@ -1017,24 +1099,18 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) } else if ( msg.cmd == MTC_EVENT_PMOND_CLEAR ) { - string hostname = &msg.hdr[MSG_HEADER_SIZE] ; - string process = "pmond" ; + string process = MTC_SERVICE_PMOND_NAME ; ilog ("%s Degrade Clear Event for process '%s'\n", hostname.c_str(), process.c_str()); - print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(MGMNT_INTERFACE), false ); obj_ptr->degrade_pmond_clear ( hostname ); } else if ( msg.cmd == MTC_EVENT_PMOND_RAISE ) { - string hostname = &msg.hdr[MSG_HEADER_SIZE] ; - string process = "pmond" ; + string process = MTC_SERVICE_PMOND_NAME ; ilog ("%s Degrade Assert Event for process '%s'\n", hostname.c_str(), process.c_str()); - print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(MGMNT_INTERFACE), false ); obj_ptr->degrade_process_raise ( hostname , process ); } else if ( msg.cmd == MTC_EVENT_HOST_STALLED ) { - string hostname = &msg.hdr[MSG_HEADER_SIZE] ; - print_mtc_message ( hostname, MTC_CMD_RX, msg, get_iface_name_str(MGMNT_INTERFACE), false ); elog ("%s Stalled !!!\n", hostname.c_str()); } @@ -1042,10 +1118,12 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) { string daemon = &msg.hdr[MSG_HEADER_SIZE] ; - if ( !daemon.compare("guestAgent") ) + if ( !daemon.compare(MTC_SERVICE_GUESTAGENT_NAME) ) { std::list::iterator temp ; - int rc = PASS ; + rc = PASS ; + + ilog ("%s %s ready event", hostname.c_str(), MTC_SERVICE_GUESTAGENT_NAME ); /* If the active controller got the ready event from a local service * then push the inventory to that service and for 
each host that is @@ -1055,15 +1133,19 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) temp != obj_ptr->hostname_inventory.end () ; temp++ ) { - string hostname = temp->data(); + hostname = temp->data(); rc = send_guest_command ( hostname, MTC_CMD_ADD_HOST ); if ( rc ) { - elog ("%s host add to '%s' failed\n", hostname.c_str(), daemon.c_str()); + elog ("%s host add to '%s' failed", + hostname.c_str(), + daemon.c_str()); } else { - ilog ("%s added to guestAgent\n", hostname.c_str()); + ilog ("%s added to %s", + hostname.c_str(), + daemon.c_str()); } } /* Done sending the host info */ @@ -1076,40 +1158,41 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) else if ( msg.cmd == MTC_EVENT_HEARTBEAT_READY ) { - string controller = CONTROLLER ; - std::list::iterator temp ; - /* no heartbeating in simplex mode */ if ( obj_ptr->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX ) { return (PASS); } - /* get the controller that sent this ready event */ - if (( msg.buf[0] != '\0' ) && ( strnlen( msg.buf, BUF_SIZE) <= MAX_CHARS_HOSTNAME )) + /* Support for json formatted message in buffer */ + if (( msg.ver >= MTC_CMD_FEATURE_VER__KEYVALUE_IN_BUF ) && + ( msg.buf[msg.res] == '{' )) { - controller = msg.buf ; - ilog ("%s Heartbeat Service Ready Event (%s)\n", - msg.buf, sock_ptr->mtc_event_rx_sock->get_src_str()); + jsonUtil_get_key_val(&msg.buf[msg.res], MTC_JSON_INV_NAME, hostname) ; } - else - { - ilog ("Heartbeat Service Ready Event\n"); - } - obj_ptr->hbs_ready = true ; + ilog ("%s %s ready event", + hostname.c_str(), + MTC_SERVICE_HBSAGENT_NAME); - /* Run Maintenance on Inventory */ + obj_ptr->hbs_ready = true ; + /* Send inventory to the controller's hbsAgent that sent + * the ready request. Save controller hostname. */ + string controller = hostname ; + ilog ("%s %s inventory push ... 
start", + controller.c_str(), + MTC_SERVICE_HBSAGENT_NAME); + + std::list::iterator temp ; for ( temp = obj_ptr->hostname_inventory.begin () ; temp != obj_ptr->hostname_inventory.end () ; temp++ ) { - string hostname = "" ; - hostname.append( temp->c_str() ) ; + hostname = temp->data(); - /* Add all hosts, even the active controller, to - * the heartbeat service. This tell the heartbeat - * service about all the hosts so that it will - * send heartbeat oob flag events to mtce. */ + /* Add all hosts, even the active controller, to + * the heartbeat service. This tell the heartbeat + * service about all the hosts so that it will + * send heartbeat oob flag events to mtce. */ if ( send_hbs_command( hostname, MTC_CMD_ADD_HOST, controller ) != PASS ) { elog ("%s Failed to send inventory to heartbeat service\n", hostname.c_str()); @@ -1123,12 +1206,18 @@ int service_events ( nodeLinkClass * obj_ptr, mtc_socket_type * sock_ptr ) send_hbs_command ( hostname, MTC_CMD_START_HOST, controller ); } } + ilog ("%s %s inventory push ... 
done", + controller.c_str(), + MTC_SERVICE_HBSAGENT_NAME); } else { - wlog ("Unrecognized Event from Heartbeat Service (hbsAgent)\n"); + wlog ("Unrecognized Event from Heartbeat Service (hbsAgent)\n"); + rc = FAIL_BAD_PARM ; } - return PASS ; + /* print the message if there was an error */ + print_mtc_message ( hostname, MTC_CMD_TX, msg, get_iface_name_str(MGMNT_INTERFACE), rc ); + return rc ; } @@ -1158,7 +1247,10 @@ int send_hwmon_command ( string hostname, int command ) get_mtcInv_ptr()->set_hwmond_monitor_state ( hostname, false ); } - ilog ("%s sending '%s' to hwmond service\n", hostname.c_str(), get_event_str(command).c_str()); + ilog ("%s %s sent to %s", + hostname.c_str(), + get_mtcNodeCommand_str(command), + MTC_SERVICE_HWMOND_NAME); break ; } default: @@ -1173,23 +1265,31 @@ int send_hwmon_command ( string hostname, int command ) mtc_message_type cmd ; string hwmon_info = "" ; - int bytes = 0; mtc_socket_type * sock_ptr = get_sockPtr (); nodeLinkClass * obj_ptr = get_mtcInv_ptr (); memset ( &cmd, 0 , sizeof(mtc_message_type)); snprintf ( &cmd.hdr[0] , MSG_HEADER_SIZE, "%s", get_cmd_req_msg_header()); - snprintf ( &cmd.hdr[MSG_HEADER_SIZE], MAX_CHARS_HOSTNAME, "%s", hostname.data()); + snprintf ( &cmd.hdr[MSG_HEADER_SIZE], MAX_CHARS_HOSTNAME_32, "%s", hostname.data()); - /* Store the command, get the board management info and copy it into the message buffer */ + /* Support for 64 byte hostnames */ + cmd.ver = MTC_CMD_FEATURE_VER__KEYVALUE_IN_BUF ; + + /* Hostname starts at the beginning of the buffer */ + cmd.res = 0 ; + + /* Store the command */ cmd.cmd = command ; - hwmon_info = obj_ptr->get_hwmon_info ( hostname ); - memcpy ( &cmd.buf[0], hwmon_info.data(), hwmon_info.length()); - /* rc = message size */ - bytes = sizeof(mtc_message_type); - rc = sock_ptr->hwmon_cmd_sock->write((char *)&cmd, bytes, obj_ptr->my_float_ip.c_str(), 0); + /* Copy the board management info string into the buffer and add one + * to the length to accomodate for the null 
terminator snprintf + * automatically adds */ + + hwmon_info = obj_ptr->get_hwmon_info ( hostname ); + snprintf ( &cmd.buf[cmd.res] , hwmon_info.length()+1, "%s", hwmon_info.data()); + + rc = sock_ptr->hwmon_cmd_sock->write((char *)&cmd, sizeof(mtc_message_type), obj_ptr->my_float_ip.c_str(), 0); if ( 0 > rc ) { elog ("%s Failed sendto command to hwmond (%d:%s)\n", hostname.c_str(), errno, strerror(errno)); @@ -1197,12 +1297,9 @@ int send_hwmon_command ( string hostname, int command ) } else { - print_mtc_message ( hostname, MTC_CMD_TX, cmd, get_iface_name_str(MGMNT_INTERFACE), false ); rc = PASS ; } + print_mtc_message ( hostname, MTC_CMD_TX, cmd, get_iface_name_str(MGMNT_INTERFACE), rc ); } return rc ; } - - - diff --git a/mtce/src/maintenance/mtcHttpSvr.cpp b/mtce/src/maintenance/mtcHttpSvr.cpp index 309a11ce..e4cadb52 100644 --- a/mtce/src/maintenance/mtcHttpSvr.cpp +++ b/mtce/src/maintenance/mtcHttpSvr.cpp @@ -418,8 +418,7 @@ string mtcHttpSvr_inv_req ( char * request_ptr, { node_inv_type inv ; node_inv_init (inv); - - ilog ("%s %s : '%s'\n", obj_ptr->my_hostname.c_str(), key.c_str(), value.c_str()) ; + dlog ("%s %s : '%s'\n", obj_ptr->my_hostname.c_str(), key.c_str(), value.c_str()) ; rc = jsonUtil_load_host ( request_ptr, inv ); if ( rc == PASS ) @@ -451,6 +450,7 @@ string mtcHttpSvr_inv_req ( char * request_ptr, */ if ( rc == RETRY ) { + ilog ("%s Modify Operation\n", inv.name.c_str()); rc = obj_ptr->mod_host ( inv ); } diff --git a/mtce/src/maintenance/mtcNodeComp.cpp b/mtce/src/maintenance/mtcNodeComp.cpp index e0be0756..1f3e1514 100644 --- a/mtce/src/maintenance/mtcNodeComp.cpp +++ b/mtce/src/maintenance/mtcNodeComp.cpp @@ -289,7 +289,7 @@ void setup_mgmnt_rx_socket ( void ) ilog("Mgmnt iface : %s\n", ctrl.mgmnt_iface.c_str() ); get_iface_macaddr ( ctrl.mgmnt_iface.data(), ctrl.macaddr ); get_iface_address ( ctrl.mgmnt_iface.data(), ctrl.address , true ); - get_hostname ( &ctrl.hostname[0], MAX_HOST_NAME_SIZE ); + get_hostname ( &ctrl.hostname[0], 
MAX_HOST_NAME_SIZE ); _close_mgmnt_rx_socket (); mtc_sock.mtc_client_rx_socket = new msgClassRx(ctrl.address.c_str(),mtc_sock.mtc_cmd_port, IPPROTO_UDP, ctrl.mgmnt_iface.data(), false ); diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index 1cd96655..d01de614 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -649,15 +649,13 @@ int daemon_configure ( void ) ilog("Inv Port : %d (tx)\n", mtc_config.sysinv_api_port ); ilog("Inv Address : %s (tx)\n", mtc_config.sysinv_api_bind_ip ); ilog("Inv Event : %d (rx)\n", mtc_config.inv_event_port ); - ilog("Keystone Port: %d (rx)\n", mtc_config.keystone_port ); + ilog("Keystone Prt: %d (rx)\n", mtc_config.keystone_port ); ilog("Mtce Logger : %d (tx)\n", mtc_config.daemon_log_port ); ilog("nfv-vim-api : %d (port)\n", mtc_config.vim_cmd_port ); ilog("hbsAgent : %d (port)\n", mtc_config.mtc_to_hbs_cmd_port ); ilog("guestAgent : %d (port)\n", mtc_config.mtc_to_guest_cmd_port ); ilog("hwmond : %d (port)\n", mtc_config.hwmon_cmd_port ); ilog("auth_host : %s \n", mtc_config.keystone_auth_host ); - ilog("Barbican Port: %d (rx)\n", mtc_config.barbican_api_port ); - ilog("Barbican Address : %s (tx)\n", mtc_config.barbican_api_host ); /* log system wide service based auto recovery control values */ ilog("AR Config : %d (threshold) %d sec (retry interval)", diff --git a/mtce/src/pmon/pmonHdlr.cpp b/mtce/src/pmon/pmonHdlr.cpp index 2024bb48..ed3fb603 100644 --- a/mtce/src/pmon/pmonHdlr.cpp +++ b/mtce/src/pmon/pmonHdlr.cpp @@ -423,7 +423,7 @@ void load_processes ( void ) } } - pmon_send_event ( MTC_EVENT_PMON_CLEAR, &process_config[0] ) ; + pmon_send_event ( MTC_EVENT_PMON_CLEAR, NULL ) ; ilog ("Registering Processes With Kernel\n"); ilog ("---------------------------------------------------------------\n"); @@ -1945,7 +1945,7 @@ void pmon_service ( pmon_ctrl_type * ctrl_ptr ) if ( want_degrade_clear () == true ) { dlog ("sending degrade clear\n"); - 
pmon_send_event ( MTC_EVENT_PMON_CLEAR, &process_config[0] ) ; + pmon_send_event ( MTC_EVENT_PMON_CLEAR, NULL ) ; } else { diff --git a/mtce/src/pmon/pmonMsg.cpp b/mtce/src/pmon/pmonMsg.cpp index 2b231e6f..1acf189f 100644 --- a/mtce/src/pmon/pmonMsg.cpp +++ b/mtce/src/pmon/pmonMsg.cpp @@ -249,116 +249,87 @@ int pmon_send_pulse ( void ) int pmon_send_event ( unsigned int event_cmd , process_config_type * ptr ) { mtc_message_type event ; - int rc = PASS ; - int bytes = 0 ; /* Don't report events while we are in reset mode */ if ( daemon_is_file_present ( NODE_RESET_FILE ) ) return ( PASS ); + pmon_ctrl_type * ctrl_ptr = get_ctrl_ptr () ; memset (&event, 0 , sizeof(mtc_message_type)); + snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header()); - if (( event_cmd == MTC_EVENT_MONITOR_READY) || - ( event_cmd == MTC_EVENT_PMON_LOG) || - ( event_cmd == MTC_EVENT_PMON_MINOR) || - ( event_cmd == MTC_EVENT_PMON_MAJOR) || - ( event_cmd == MTC_EVENT_PMON_CRIT ) || - ( event_cmd == MTC_EVENT_PMON_CLEAR )) + event.cmd = event_cmd ; + event.ver = MTC_CMD_FEATURE_VER__KEYVALUE_IN_BUF ; + event.num = 1 ; + event.parm[0] = ctrl_ptr->nodetype ; /* default to node type */ + + string event_info = "{\"" ; + event_info.append(MTC_JSON_INV_NAME); + event_info.append("\":\""); + event_info.append(ctrl_ptr->my_hostname); + event_info.append("\",\""); + event_info.append(MTC_JSON_SERVICE); + event_info.append("\":\""); + event_info.append(MTC_SERVICE_PMOND_NAME ); + if ( ( ptr != NULL ) && ( ptr->process ) ) { - pmon_ctrl_type * ctrl_ptr = get_ctrl_ptr () ; + event_info.append("\",\""); + event_info.append(MTC_JSON_PROCESS); + event_info.append("\":\""); + event_info.append(ptr->process); + } + event_info.append( "\"}"); - snprintf ( &event.hdr[0], MSG_HEADER_SIZE, "%s", get_mtce_event_header()); + size_t len = event_info.length()+1 ; + snprintf ( &event.buf[0], len, "%s", event_info.data()); + int bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-len)); - /* Set the 
version/revision for PMON messages. */ - event.ver = MTC_MSG_VERSION_15_12_GA_PMON ; - event.rev = MTC_MSG_REVISION_15_12_GA_PMON ; - - if ( ptr->process ) + /* override with subfunction case */ + if (( ptr != NULL ) && + ( ctrl_ptr->subfunction != 0 ) && + ( ctrl_ptr->subfunction != ctrl_ptr->function )) + { + if ( ptr->subfunction != NULL ) { - /* We don't use the buffer for pmon events to remove it from the size */ - bytes = ((sizeof(mtc_message_type))-(BUF_SIZE-MAX_FILENAME_LEN)); - - snprintf( &event.buf[0], MAX_PROCESS_NAME_LEN, "%s", ptr->process ); - - /* Put the process function in parm zero of the event message */ - event.num = 1 ; - event.parm[0] = ctrl_ptr->nodetype ; /* default to node type */ - - if ( event_cmd == MTC_EVENT_PMON_CLEAR ) + string temp = ptr->subfunction ; + event.parm[0]= get_host_function_mask (temp) ; + if ( ( event_cmd == MTC_EVENT_PMON_MINOR) || + ( event_cmd == MTC_EVENT_PMON_MAJOR) || + ( event_cmd == MTC_EVENT_PMON_LOG) || + ( event_cmd == MTC_EVENT_PMON_CRIT ) ) { - dlog ("pmond degrade clear\n" ); - snprintf( &event.buf[0], MAX_PROCESS_NAME_LEN, "%s", "pmond" ); + mlog ("%s process failed\n", ptr->process ); } - else if (( event_cmd == MTC_EVENT_PMON_CRIT ) || - ( event_cmd == MTC_EVENT_PMON_MAJOR )) + else if (( event_cmd == MTC_EVENT_PMON_CLEAR ) && ( ptr->was_failed == true )) { - wlog ("%s caused degrade assert\n", ptr->process ); - } - else if ( event_cmd == MTC_EVENT_PMON_MINOR ) - { - slog ("degrade does not apply to minor\n" ); - rc = FAIL_BAD_CASE ; - } - - /* override with subfunction case */ - if (( ctrl_ptr->subfunction != 0 ) && - ( ctrl_ptr->subfunction != ctrl_ptr->function )) - { - if ( ptr->subfunction != NULL ) - { - string temp = ptr->subfunction ; - event.parm[0]= get_host_function_mask (temp) ; - if ( ( event_cmd == MTC_EVENT_PMON_MINOR) || - ( event_cmd == MTC_EVENT_PMON_MAJOR) || - ( event_cmd == MTC_EVENT_PMON_LOG) || - ( event_cmd == MTC_EVENT_PMON_CRIT ) ) - { - mlog ("%s process failed\n", 
ptr->process ); - } - else if (( event_cmd == MTC_EVENT_PMON_CLEAR ) && ( ptr->was_failed == true )) - { - ilog ("%s process recovered\n", ptr->process ); - ptr->was_failed = false ; - } - } + ilog ("%s process recovered\n", ptr->process ); + ptr->was_failed = false ; } } } - else if ( event_cmd == MTC_EVENT_LOOPBACK ) - { - snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_loopback_header()); - - /* We don't use the buffer for pmon events to remove it from the size */ - bytes = ((sizeof(mtc_message_type))-(BUF_SIZE)); - } - else - { - elog ("Unsupported process monitor event (%d)\n", event_cmd ); - return ( FAIL_BAD_CASE ); - } - - event.cmd = event_cmd ; - - print_mtc_message ( LOCALHOST, MTC_CMD_TX, event, get_iface_name_str(MGMNT_INTERFACE), false ); /* Send the event */ if ((rc = pmon_sock.event_sock->write((char*)&event.hdr[0], bytes)) != bytes ) { - elog ("Message send failed. (%d)\n", rc); - elog ("Message: %d bytes to <%s:%d>\n", bytes, - pmon_sock.event_sock->get_dst_addr()->toString(), - pmon_sock.event_sock->get_dst_addr()->getPort()); + elog ("event message send failed (%d) (%d) (%d:%m) (%s:%d)\n", + bytes, rc, errno, + pmon_sock.event_sock->get_dst_addr()->toString(), + pmon_sock.event_sock->get_dst_addr()->getPort()); } else { string severity = get_event_str ( event.cmd ); - mlog ("Sending '%s' event for process '%s' to %s:%d (bytes:%d)\n", - severity.c_str(), event.buf, - pmon_sock.event_sock->get_dst_addr()->toString(), - pmon_sock.event_sock->get_dst_addr()->getPort(), bytes); + if ( ptr ) + { + /* Only log the clear event for a specified process. + * Avoid logging the periodic degrade clear event. */ + ilog ("%s %s sent", ctrl_ptr->my_hostname, + get_mtcNodeCommand_str(event_cmd)); + } rc = PASS ; } + print_mtc_message ( ctrl_ptr->my_hostname, MTC_CMD_TX, event, get_iface_name_str(MGMNT_INTERFACE), rc ); return rc ; }