Mtce: Add heartbeat cluster information for SM query

This is part one of a two-part HA Improvements feature that introduces
the collection of heartbeat health at the system level.

The full feature is intended to provide service management (SM)
with the last 2 seconds of maintenance's heartbeat health view that
is reflective of each controller's connectivity to each host
including its peer controller.

The heartbeat cluster summary information is additional information
for SM to draw on when needing to make a choice of which controller
is healthier, if/when to switch over and to ultimately avoid split
brain scenarios in a two controller system.

Feature Behavior: A common heartbeat cluster data structure is
introduced and published to the sysroot for SM. The heartbeat
service populates and maintains a local copy of this structure
with data that reflects the responsiveness for each monitored
network of all the monitored hosts for the last 20 heartbeat
periods. Mtce sends the current cluster summary to SM upon request.

General flow of cluster feature wrt hbsAgent:

  hbs_cluster_init: general data init
  hbs_cluster_nums: set controller and network numbers
  forever:

    select:
      hbs_cluster_add / hbs_cluster_del: - add/del hosts from mtcAgent
      hbs_sm_handler -> hbs_cluster_send: - send cluster to SM

    heartbeating:
      hbs_cluster_append: add controller cluster to pulse request
      hbs_cluster_update: get controller cluster data from pulse responses
      hbs_cluster_save: save other controller cluster view in cluster vault
      hbs_cluster_log: log cluster state changes (clog)

Test Plan:

  PASS: Verify compute system install
  PASS: Verify storage system install
  PASS: Verify cluster data ; all members of structure
  PASS: Verify storage-0 state management
  PASS: Verify add of second controller
  PASS: Verify add of storage-0 node
  PASS: Verify behavior over Swact
  PASS: Verify lock/unlock of second controller ; overall behavior
  PASS: Verify lock/unlock of storage-0 ; overall behavior
  PASS: Verify lock/unlock of storage-1 ; overall behavior
  PASS: Verify lock/unlock of compute nodes ; overall behavior
  PASS: Verify heartbeat failure and recovery of compute node
  PASS: Verify heartbeat failure and recovery of storage-0
  PASS: Verify heartbeat failure and recovery of controller
  PASS: Verify delete of controller node
  PASS: Verify delete of storage-0
  PASS: Verify delete of compute node
  PASS: Verify cluster when controller-1 active / controller-0 disabled
  PASS: Verify MNFA and recovery handling
  PASS: Verify handling in presence of multiple failure conditions
  PASS: Verify hbsAgent memory leak soak test with continuous SM query.
  PASS: Verify active controller-1 infra network failure behavior.
  PASS: Verify inactive controller-1 infra network failure behavior.

Change-Id: I4154287f6dcf5249be5ab3180f2752ab47c5da3c
Story: 2003576
Task: 24907
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2018-10-03 07:45:35 -04:00
parent 3f337d5edb
commit 8a223f395d
23 changed files with 2095 additions and 248 deletions

View File

@ -249,6 +249,44 @@ int jsonUtil_get_key_val ( char * json_str_ptr,
return (PASS);
}
/**
 * Parse a json character string and load 'value' with the integer
 * found at the top level 'key'.
 *
 * @param json_str_ptr - pointer to the NUL terminated json string to parse
 * @param key          - label of the integer value to fetch
 * @param value        - reference that is loaded with the fetched integer ;
 *                       left untouched on failure
 *
 * @return PASS if the string was tokenized and 'value' was loaded.
 *         FAIL if the string is NULL, empty, starts with "(null)"
 *         or cannot be tokenized.
 */
int jsonUtil_get_key_val_int ( char   * json_str_ptr,
                               string   key,
                               int    & value )
{
    if ((json_str_ptr == NULL) || ( *json_str_ptr == '\0' ) || ( ! strncmp ( json_str_ptr, "(null)" , 6 )))
    {
        elog ("Cannot tokenize a null json string\n");

        /* never hand a NULL pointer to a %s conversion */
        elog ("... json string: %s\n", json_str_ptr ? json_str_ptr : "(null)" );
        return (FAIL);
    }

    /* length is sampled before and after the parse attempt so that the
     * failure log shows whether the string changed during tokenizing */
    size_t len_before = strlen (json_str_ptr);

    jlog2 ("String: %s\n", json_str_ptr );

    struct json_object * raw_obj = json_tokener_parse( json_str_ptr );
    if ( raw_obj == NULL )
    {
        size_t len_after = strlen (json_str_ptr);
        elog ("Unable to tokenize string (before:%zu after:%zu);\n", len_before, len_after);
        elog ("... json string: %s\n", json_str_ptr );

        /* 'value' was not loaded ; don't report success to the caller */
        return (FAIL);
    }

    value = jsonUtil_get_key_value_int ( raw_obj, key.c_str() ) ;
    jlog1 ("%s:%d\n", key.c_str(), value);

    /* release the reference taken by the tokener */
    json_object_put(raw_obj);

    return (PASS);
}
/** This utility freads the passed in inventory GET request
* response json character string and performes the following

View File

@ -69,6 +69,10 @@ int jsonUtil_get_key_val ( char * json_str_ptr,
string key,
string & value );
/** Parse the json string and load 'value' with the integer found at 'key'.
 *  Returns FAIL if the json string is NULL, empty or starts with "(null)". */
int jsonUtil_get_key_val_int ( char * json_str_ptr,
string key,
int & value );
/** Submit a request to get an authorization token and nova URL */
int jsonApi_auth_request ( string & hostname, string & payload );

View File

@ -114,6 +114,8 @@ typedef struct
int event_port ; /**< daemon specific event tx port */
int cmd_port ; /**< daemon specific command rx port */
int sensor_port ; /**< sensor read value port */
int sm_server_port ; /**< port mtce uses to receive data from SM */
int sm_client_port ; /**< port mtce uses to send SM data */
int start_delay ; /**< startup delay, added for pmon */
int api_retries ; /**< api retries before failure */
int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */
@ -243,6 +245,19 @@ extern char *program_invocation_short_name;
else { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
}
/** Error logger macro with throttling.
 *
 *  'cnt' is pre-incremented on every invocation and the error log is only
 *  emitted when the incremented count equals 1. Once 'cnt' reaches 'max'
 *  it is reset to 0 so that the following invocation logs again ; the net
 *  effect is one log per 'max' invocations.
 *
 *  'cnt' must be a modifiable lvalue owned by the caller ; it is evaluated
 *  and modified more than once by this macro. */
#define elog_throttled(cnt,max,format,args...) { \
if ( ++cnt == 1 ) \
{ \
if (ltc()) { printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
else { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Error : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
} \
if ( cnt >= max ) \
{ \
cnt = 0 ; \
} \
}
/** Warning logger macro */
#define wlog(format, args...) { \
if ( ltc() ) { printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Warn : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
@ -387,7 +402,9 @@ extern char *program_invocation_short_name;
#define flog(format, args...) { if(daemon_get_cfg_ptr()->debug_fsm) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: FSM : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define tlog(format, args...) { if(daemon_get_cfg_ptr()->debug_timer) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Timer: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define clog1(format, args...) { if(daemon_get_cfg_ptr()->debug_state&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define log_event(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Event: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
#define log_stress(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Stress: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }

View File

@ -23,7 +23,9 @@ using namespace std;
#include "returnCodes.h"
#include "nodeTimers.h"
/* Apply the packed attribute to declaration 'x'.
 * Guarded so that an earlier definition of ALIGN_PACK is not redefined here. */
#ifndef ALIGN_PACK
#define ALIGN_PACK(x) __attribute__((packed)) x
#endif
/* Out-Of-Service Stress tests */
#define WANT_SYSINV_API_STRESS 0x00000001
@ -359,8 +361,12 @@ void daemon_exit ( void );
#define CONTROLLER_0 ((const char *)"controller-0")
#define CONTROLLER_1 ((const char *)"controller-1")
#define CONTROLLER_2 ((const char *)"controller-2")
#define CONTROLLER ((const char *)"controller")
#define STORAGE_0 ((const char *)"storage-0")
#define STORAGE_1 ((const char *)"storage-1")
/* The infrastructure networking floating IP
*
* Note: If there is no infra then this label will resolve

View File

@ -267,7 +267,7 @@ bool is_goenabled ( int nodeType, bool pass )
return daemon_is_file_present ( file );
}
#define LOG_MEMORY(buf) ilog ("%s", buf ); \
#define LOG_MEMORY(buf) syslog ( LOG_INFO, "%s", buf ); \
buf_ptr = &buf[0]; \
MEMSET_ZERO ( buf );
@ -279,7 +279,7 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
char buf[0x1024] ;
char * buf_ptr = &buf[0];
MEMSET_ZERO ( buf );
ilog ("Dumping Memory:\n");
syslog ( LOG_INFO, "Dumping Memory: %ld bytes", bytes );
if ( format == 4 )
{
int loops = bytes/format ;
@ -294,7 +294,6 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
buf_ptr += sprintf ( buf_ptr, "%c", *byte_ptr) ;
else
buf_ptr += sprintf ( buf_ptr, "%c", '.');
byte_ptr++ ;
}
LOG_MEMORY(buf);
@ -315,7 +314,6 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
buf_ptr += sprintf ( buf_ptr , "%c", *byte_ptr) ;
else
buf_ptr += sprintf ( buf_ptr , "%c", '.');
byte_ptr++ ;
}
LOG_MEMORY(buf);
@ -336,21 +334,12 @@ void dump_memory ( void * raw_ptr , int format, size_t bytes )
buf_ptr += sprintf ( buf_ptr , "%c", *byte_ptr) ;
else
buf_ptr += sprintf ( buf_ptr , "%c", '.');
byte_ptr++ ;
}
LOG_MEMORY(buf);
word_ptr += 4 ;
}
}
byte_ptr = (uint8_t*)raw_ptr ;
ilog ("Raw Hex Dump : %ld\n", bytes );
for ( unsigned int x = 0 ; x < bytes ; x++ )
{
buf_ptr += sprintf ( buf_ptr, " %02x", *byte_ptr );
byte_ptr++ ;
}
// printf ("\n\n");
}

View File

@ -93,7 +93,7 @@
#define FAIL_INVALID_DATA (71)
#define FAIL_BAD_STATE (72)
#define FAIL_KEY_VALUE_PARSE (73)
#define FAIL____UNUSED____74 (74)
#define FAIL_DATA_SIZE (74)
#define FAIL_NOT_FOUND (75)
#define FAIL_WORKQ_TIMEOUT (76)
#define FAIL_HTTP_DELETE (77)

View File

@ -207,7 +207,7 @@ int daemon_run_testhead ( void );
#define CONFIG_AGENT_INV_PORT 0x00000100 /**< Inventory Port Number */
#define CONFIG_AGENT_HA_PORT 0x00000200 /**< HA Framework Port Number */
#define CONFIG_CLIENT_MTCALARM_PORT 0x00000400 /**< Send alarm requests to */
#define CONFIG_RESERVED_800 0x00000800 /**< */
#define CONFIG_AGENT_SM_CLIENT_PORT 0x00000800 /**< Port to Send SM data on */
#define CONFIG_MTC_TO_HWMON_CMD_PORT 0x00001000 /**< HWmon Port Number */
#define CONFIG_AGENT_KEY_PORT 0x00002000 /**< Keystone HTTP port */
#define CONFIG_AGENT_HBS_MTC_PORT 0x00004000 /**< Heartbeat Service Port */
@ -217,8 +217,8 @@ int daemon_run_testhead ( void );
#define CONFIG_AGENT_MTC_MGMNT_PORT 0x00040000 /**< Agent Infr network port */
#define CONFIG_AGENT_TOKEN_REFRESH 0x00080000 /**< Token refresh rate mask */
#define CONFIG_CLIENT_MTC_INFRA_PORT 0x00100000 /**< Client Infra nwk mtc port */
#define CONFIG_CLIENT_MTC_MGMNT_PORT 0x00200000 /**< Client mgmnt nwk mtc port */
#define CONFIG_AGENT_VIM_CMD_PORT 0x00400000 /**< VIM Command Port Mask */
#define CONFIG_CLIENT_MTC_MGMNT_PORT 0x00200000 /**< Client mgmnt nwk mtc port */
#define CONFIG_AGENT_SM_SERVER_PORT 0x00400000 /**< Port to RX data from SM */
#define CONFIG_CLIENT_HBS_INFRA_PORT 0x00800000 /**< Infrastructure ntwk Port */
#define CONFIG_CLIENT_HBS_MGMNT_PORT 0x01000000 /**< Management network Port */
#define CONFIG_CLIENT_HBS_EVENT_PORT 0x02000000 /**< Heartbeat Event Messaging */

View File

@ -1,3 +1,3 @@
SRC_DIR="src"
TIS_PATCH_VER=139
TIS_PATCH_VER=140
BUILD_IS_SLOW=5

View File

@ -90,6 +90,15 @@ of spec operating conditions that can reduce outage time through automated
notification and recovery thereby improving overall platform availability
for the customer.
%package -n mtce-dev
Summary: Titanuim Server Maintenance Software Development Package
Group: base
Provides: mtce-dev = %{version}-%{release}
%description -n mtce-dev
Titanuim Cloud Maintenance. This package contains header files,
and related items necessary for software development.
%package -n mtce-pmon
Summary: Titanuim Server Maintenance Process Monitor Package
Group: base
@ -424,6 +433,9 @@ install -m 644 -p -D %{_buildsubdir}/fsmon/scripts/fsmon.logrotate %{buildroot}%
install -m 644 -p -D %{_buildsubdir}/hwmon/scripts/hwmon.logrotate %{buildroot}%{local_etc_logrotated}/hwmon.logrotate
install -m 644 -p -D %{_buildsubdir}/alarm/scripts/mtcalarm.logrotate %{buildroot}%{local_etc_logrotated}/mtcalarm.logrotate
# software development files
install -m 644 -p -D %{_buildsubdir}/heartbeat/mtceHbsCluster.h %{buildroot}/%{_includedir}/mtceHbsCluster.h
install -m 755 -p -D %{_buildsubdir}/public/libamon.so.$MAJOR %{buildroot}%{_libdir}/libamon.so.$MAJOR
cd %{buildroot}%{_libdir} ; ln -s libamon.so.$MAJOR libamon.so.$MAJOR.$MINOR
cd %{buildroot}%{_libdir} ; ln -s libamon.so.$MAJOR libamon.so
@ -621,3 +633,10 @@ install -m 755 -d %{buildroot}/var/run
%{_sysconfdir}/init.d/hostw
%{local_bindir}/hostwd
###############################
# Maintenance Software Development RPM
###############################
%files -n mtce-dev
%defattr(-,root,root,-)
%{_includedir}/mtceHbsCluster.h

View File

@ -269,7 +269,7 @@ nodeLinkClass::nodeLinkClass()
hbs_ready = false ;
hbs_state_change = false ;
hbs_disabled = true ;
hbs_pulse_period = hbs_pulse_period_save = 200 ;
hbs_pulse_period = hbs_pulse_period_save = 0 ;
hbs_minor_threshold = HBS_MINOR_THRESHOLD ;
hbs_degrade_threshold = HBS_DEGRADE_THRESHOLD ;
hbs_failure_threshold = HBS_FAILURE_THRESHOLD ;
@ -7325,18 +7325,40 @@ int nodeLinkClass::launch_host_services_cmd ( struct nodeLinkClass::node * node_
int send_event ( string & hostname, unsigned int cmd, iface_enum iface );
int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear )
int nodeLinkClass::mon_host ( const string & hostname, bool true_false, bool send_clear )
{
int rc = FAIL ;
if ( ! hostname.empty() )
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
{
nodeLinkClass::node* node_ptr ;
node_ptr = nodeLinkClass::getNode ( hostname );
if ( node_ptr != NULL )
bool want_log = true ;
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
node_ptr->monitor[iface] = true_false ;
if ( node_ptr->monitor[iface] == true_false )
continue ;
if ( iface == INFRA_IFACE )
{
if ( this->infra_network_provisioned == false )
continue ;
if ( node_ptr->monitor[MGMNT_IFACE] == true_false )
want_log = false ;
}
if ( send_clear == true )
{
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, (iface_enum)iface ) ;
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, (iface_enum)iface ) ;
}
if ( true_false == true )
{
if ( want_log )
{
ilog ("%s starting heartbeat service \n",
hostname.c_str());
}
node_ptr->no_work_log_throttle = 0 ;
node_ptr->b2b_misses_count[iface] = 0 ;
node_ptr->hbs_misses_count[iface] = 0 ;
@ -7345,16 +7367,20 @@ int nodeLinkClass::mon_host ( const string & hostname, iface_enum iface, bool tr
node_ptr->hbs_failure[iface] = false ;
node_ptr->hbs_minor[iface] = false ;
node_ptr->hbs_degrade[iface] = false ;
if ( send_clear == true )
}
else
{
if ( want_log )
{
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_CLR, iface ) ;
send_event ( node_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_CLR, iface ) ;
ilog ("%s stopping heartbeat service\n",
hostname.c_str());
}
}
return PASS ;
node_ptr->monitor[iface] = true_false ;
}
return PASS ;
}
return ( rc );
return ( FAIL );
}
/* store the current hardware monitor monitoring state */
@ -7887,11 +7913,11 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
}
else
{
pulse_list[iface].head_ptr = pulse_list[iface].head_ptr->pulse_link[iface].next_ptr ;
pulse_list[iface].head_ptr->pulse_link[iface].prev_ptr = NULL ;
pulse_list[iface].head_ptr = pulse_list[iface].head_ptr->pulse_link[iface].next_ptr ;
pulse_list[iface].head_ptr->pulse_link[iface].prev_ptr = NULL ;
}
}
}
}
else if ( pulse_list[iface].tail_ptr == pulse_ptr )
{
qlog2 ("%s Pulse: Multiple Node -> Tail Case : %d of %d\n", node_ptr->hostname.c_str(), pulse_ptr->linknum[iface], pulses[iface] );
@ -7906,19 +7932,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
}
else
{
pulse_list[iface].tail_ptr = pulse_list[iface].tail_ptr->pulse_link[iface].prev_ptr ;
pulse_list[iface].tail_ptr->pulse_link[iface].next_ptr = NULL ;
}
pulse_list[iface].tail_ptr = pulse_list[iface].tail_ptr->pulse_link[iface].prev_ptr ;
pulse_list[iface].tail_ptr->pulse_link[iface].next_ptr = NULL ;
}
}
else
{
/* July 1 emacdona: Make failure path case more robust */
if ( pulse_ptr == NULL ) { slog ("Internal Err 1\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 2\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].prev_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 3\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 4\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].next_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 5\n"); rc = FAIL; }
if ( pulse_ptr == NULL ) { slog ("Internal Err 1\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].prev_ptr == NULL ) { slog ("Internal Err 2\n"); rc = FAIL; }
else if ( pulse_ptr->pulse_link[iface].next_ptr == NULL ) { slog ("Internal Err 3\n"); rc = FAIL; }
if ( rc == FAIL )
{
slog ("%s Null pointer error splicing %s out of pulse list with %d pulses remaining (Monitoring:%s)\n",
@ -7935,7 +7958,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
}
if ( rc == PASS )
{
pulse_ptr->linknum[iface]-- ; // = 0 ;
pulse_ptr->linknum[iface]-- ;
}
pulses[iface]-- ;
}
@ -8082,14 +8105,26 @@ void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_p
int nodeLinkClass::lost_pulses ( iface_enum iface )
int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
{
int rc = PASS ;
int lost = 0 ;
/*
* Assume storage-0 is responding until proven otherwise.
* Keep in mind that this interface counts nodes that have not responded ;
* not those that have.
*/
storage_0_responding = true ;
/*
* Loop over the pulse_list which now only contains a list of hosts
* that have not responded in this heartbeat period.
*/
for ( ; pulse_list[iface].head_ptr != NULL ; )
{
daemon_signal_hdlr ();
pulse_ptr = pulse_list[iface].head_ptr ;
lost++ ;
if ( active )
{
string flat = "Flat Line:" ;
@ -8098,6 +8133,15 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
pulse_ptr->b2b_pulses_count[iface] = 0 ;
// pulse_ptr->max_count[iface]++ ;
/*
* Update storage_0_responding reference to false if storage-0
* is found in the pulse lost list.
*/
if ( pulse_ptr->hostname == STORAGE_0 )
{
storage_0_responding = false ;
}
/* Don't log single misses unless in debug mode */
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
{
@ -8156,8 +8200,9 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
get_iface_name_str(iface),
pulse_ptr->b2b_misses_count[iface] );
}
#ifdef WANT_HBS_MEM_LOGS
mem_log ( flat, pulse_ptr->b2b_misses_count[iface], pulse_ptr->hostname.c_str());
#endif
if ( iface == MGMNT_IFACE )
{
if ( pulse_ptr->b2b_misses_count[iface] == hbs_minor_threshold )
@ -8252,8 +8297,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ;
}
rc = remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS );
if ( rc != PASS )
if ( remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS ))
{
elog ("%s %s not in pulse list\n", pulse_ptr->hostname.c_str(),
get_iface_name_str(iface));
@ -8266,7 +8310,7 @@ int nodeLinkClass::lost_pulses ( iface_enum iface )
break ;
}
}
return (rc);
return (lost);
}
/* Return true if the specified interface is being monitored for this host */
@ -8301,7 +8345,7 @@ void nodeLinkClass::print_pulse_list ( iface_enum iface )
if ( pulse_list[iface].head_ptr != NULL )
{
for ( pulse_ptr = pulse_list[iface].head_ptr ;
for ( pulse_ptr = pulse_list[iface].head_ptr ;
pulse_ptr != NULL ;
pulse_ptr = pulse_ptr->pulse_link[iface].next_ptr )
{
@ -8310,12 +8354,15 @@ void nodeLinkClass::print_pulse_list ( iface_enum iface )
}
dlog ("Patients: %s\n", pulse_host_list.c_str());
}
#ifdef WANT_HBS_MEM_LOGS
if ( pulses[iface] && !pulse_host_list.empty() )
{
string temp = get_iface_name_str(iface) ;
temp.append(" Patients :") ;
mem_log ( temp, pulses[iface], pulse_host_list );
}
#endif
}

View File

@ -1940,7 +1940,7 @@ public:
void manage_pulse_flags ( string & hostname, unsigned int flags );
/** Control the heartbeat monitoring state of a host */
int mon_host ( const string & hostname, iface_enum iface, bool true_false, bool send_clear );
int mon_host ( const string & hostname, bool true_false, bool send_clear );
/** Return true if the pulse list is empty */
bool pulse_list_empty ( iface_enum iface );
@ -1956,7 +1956,7 @@ public:
* that exceed preset thresholds.
*
*/
int lost_pulses ( iface_enum iface );
int lost_pulses ( iface_enum iface, bool & storage_0_responding );
bool monitored_pulse ( string hostname , iface_enum iface );

View File

@ -4,10 +4,10 @@
# SPDX-License-Identifier: Apache-2.0
#
SRCS = hbsAlarm.cpp hbsClient.cpp hbsAgent.cpp hbsPmon.cpp hbsStubs.cpp
SRCS = hbsAlarm.cpp hbsClient.cpp hbsAgent.cpp hbsPmon.cpp hbsUtil.cpp hbsCluster.cpp hbsStubs.cpp
OBJS = $(SRCS:.cpp=.o)
LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -lrt -lamon -lcrypto -luuid
LDLIBS = -lstdc++ -ldaemon -lcommon -lthreadUtil -lpthread -lfmcommon -lalarm -lrt -lamon -lcrypto -luuid -ljson-c
INCLUDES = -I. -I/usr/include/mtce-daemon -I/usr/include/mtce-common
INCLUDES += -I../common -I../alarm -I../maintenance -I../public
@ -31,8 +31,8 @@ endif
all: static_analysis common agent client
build: static_analysis $(OBJS)
$(CXX) $(CCFLAGS) hbsAlarm.o hbsAgent.o hbsStubs.o ../common/nodeClass.o -L../public -L../alarm $(LDLIBS) -o hbsAgent
$(CXX) $(CCFLAGS) hbsClient.o hbsPmon.o -L../public -L../alarm $(LDLIBS) -o hbsClient
$(CXX) $(CCFLAGS) hbsAlarm.o hbsAgent.o hbsUtil.o hbsCluster.o hbsStubs.o ../common/nodeClass.o -L../public -L../alarm $(LDLIBS) -o hbsAgent
$(CXX) $(CCFLAGS) hbsClient.o hbsPmon.o hbsUtil.o -L../public -L../alarm $(LDLIBS) -o hbsClient
common:
( cd ../common ; make clean ; make lib VER=$(VER) VER_MJR=$(VER_MJR))

View File

@ -41,6 +41,7 @@ using namespace std;
#include "hbsBase.h" /* Heartbeat Base Header File */
#include "hbsAlarm.h" /* for ... hbsAlarm_clear_all */
#include "alarm.h" /* for ... alarm send message to mtcalarmd */
#include "jsonUtil.h" /* for ... jsonUtil_get_key_val */
/**************************************************************
* Implementation Structure
@ -68,6 +69,8 @@ using namespace std;
/* Number of back to back interface errors before the interface is re-initialized. */
#define INTERFACE_ERRORS_FOR_REINIT (8)
#define MAX_LEN 1000
/* Historical String data for mem_logs */
static string unexpected_pulse_list[MAX_IFACES] = { "" , "" } ;
static string arrival_histogram[MAX_IFACES] = { "" , "" } ;
@ -90,6 +93,8 @@ int module_init ( void )
return (PASS);
}
static unsigned int controller_number = 0 ;
void daemon_sigchld_hdlr ( void )
{
; /* dlog("Received SIGCHLD ... no action\n"); */
@ -184,14 +189,16 @@ void daemon_exit ( void )
CONFIG_AGENT_HBS_DEGRADE |\
CONFIG_AGENT_HBS_FAILURE |\
CONFIG_AGENT_MULTICAST |\
CONFIG_SCHED_PRIORITY |\
CONFIG_SCHED_PRIORITY |\
CONFIG_MTC_TO_HBS_CMD_PORT |\
CONFIG_HBS_TO_MTC_EVENT_PORT |\
CONFIG_AGENT_HBS_MGMNT_PORT |\
CONFIG_AGENT_HBS_INFRA_PORT |\
CONFIG_CLIENT_HBS_MGMNT_PORT |\
CONFIG_CLIENT_MTCALARM_PORT |\
CONFIG_CLIENT_HBS_INFRA_PORT )
CONFIG_CLIENT_HBS_INFRA_PORT |\
CONFIG_AGENT_SM_SERVER_PORT |\
CONFIG_AGENT_SM_CLIENT_PORT)
/* Startup config read */
static int hbs_config_handler ( void * user,
@ -203,6 +210,8 @@ static int hbs_config_handler ( void * user,
if (MATCH("agent", "heartbeat_period"))
{
int curr_period = hbsInv.hbs_pulse_period ;
config_ptr->hbs_pulse_period = atoi(value);
hbsInv.hbs_pulse_period = atoi(value);
hbsInv.hbs_state_change = true ;
@ -227,10 +236,14 @@ static int hbs_config_handler ( void * user,
}
}
}
hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ;
if ( curr_period != hbsInv.hbs_pulse_period )
{
/* initialize cluster info */
hbs_cluster_init ( hbsInv.hbs_pulse_period );
}
}
hbsInv.hbs_pulse_period_save = hbsInv.hbs_pulse_period ;
if (MATCH("agent", "hbs_minor_threshold"))
{
config_ptr->hbs_minor_threshold =
@ -312,6 +325,16 @@ static int hbs_config_handler ( void * user,
config_ptr->hbs_agent_mgmnt_port = atoi(value);
config_ptr->mask |= CONFIG_AGENT_HBS_MGMNT_PORT ;
}
else if (MATCH("agent", "sm_server_port"))
{
config_ptr->sm_server_port = atoi(value);
config_ptr->mask |= CONFIG_AGENT_SM_SERVER_PORT ;
}
else if (MATCH("agent", "sm_client_port"))
{
config_ptr->sm_client_port = atoi(value);
config_ptr->mask |= CONFIG_AGENT_SM_CLIENT_PORT ;
}
else if (MATCH("client", "hbs_client_mgmnt_port"))
{
config_ptr->hbs_client_mgmnt_port = atoi(value);
@ -617,6 +640,34 @@ int alarm_port_init ( void )
return ( hbs_sock.alarm_sock->return_status ) ;
}
/* Create the pair of loopback UDP sockets used to talk to SM ;
 * sm_server_sock receives SM requests, sm_client_sock sends data to SM.
 *
 * Both sockets are always attempted, even if the first one fails.
 *
 * Returns PASS when both socket objects exist, in which case both are
 * marked ok, or FAIL_SOCKET_CREATE if either one is missing. */
int hbs_sm_sockets_init ( void )
{
    /* UDP RX Message Socket for SM Requests ; LO interface only */
    hbs_sock.sm_server_sock =
        new msgClassRx(LOOPBACK_IP, hbs_config.sm_server_port, IPPROTO_UDP);
    bool have_server = ( hbs_sock.sm_server_sock ) ? true : false ;
    if ( have_server == false )
    {
        elog ("Failed to setup SM receive socket");
    }

    /* UDP TX Message Socket for SM Requests ; LO interface only */
    hbs_sock.sm_client_sock =
        new msgClassTx(LOOPBACK_IP, hbs_config.sm_client_port,IPPROTO_UDP);
    bool have_client = ( hbs_sock.sm_client_sock ) ? true : false ;
    if ( have_client == false )
    {
        elog ("Failed to setup SM transmit socket");
    }

    /* a missing socket object on either side fails the whole init */
    if (( have_server == false ) || ( have_client == false ))
    {
        return (FAIL_SOCKET_CREATE);
    }

    hbs_sock.sm_server_sock->sock_ok(true);
    hbs_sock.sm_client_sock->sock_ok(true);
    return (PASS);
}
/* Init the internal/local sockets ; the ones that will not change.
* This way we don't miss add and start commands from maintenance. */
@ -654,6 +705,9 @@ int hbs_int_socket_init ( void )
{
elog ("Alarm port setup or registration failed (rc:%d)\n", rc );
}
rc = hbs_sm_sockets_init () ;
return (rc);
}
@ -697,26 +751,36 @@ int hbs_pulse_request ( iface_enum iface,
string hostname_clue,
unsigned int lookup_clue)
{
int rc = PASS ;
#define MAX_LEN 1000
#ifdef WANT_HBS_MEM_LOGS
char str[MAX_LEN] ;
/* Add the sequence number */
hbs_sock.tx_mesg[iface].s = seq_num ;
memset ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], 0, MAX_CHARS_HOSTNAME );
if (( lookup_clue ) &&
( hostname_clue.length() <= MAX_CHARS_HOSTNAME ))
{
hbs_sock.tx_mesg[iface].c = lookup_clue ;
memcpy ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE],
hostname_clue.data(),
hostname_clue.length());
}
/* Message length is the size of the sequence number, the clue and the buffer */
int msg_len = (HBS_MAX_MSG+(sizeof(unsigned int)*2)) ;
#endif
int bytes = 0 ;
if ( hbs_sock.tx_sock[iface] )
{
// int unused_networks = 0 ;
memset ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE], 0, MAX_CHARS_HOSTNAME );
/* Add message version - 0 -> 1 with the addition of cluster information */
hbs_sock.tx_mesg[iface].v = HBS_MESSAGE_VERSION ;
/* Add the sequence number */
hbs_sock.tx_mesg[iface].s = seq_num ;
if (( lookup_clue ) &&
( hostname_clue.length() <= MAX_CHARS_HOSTNAME ))
{
hbs_sock.tx_mesg[iface].c = lookup_clue ;
memcpy ( &hbs_sock.tx_mesg[iface].m[HBS_HEADER_SIZE],
hostname_clue.data(),
hostname_clue.length());
}
/* Append the cluster info to the pulse request */
hbs_cluster_append(hbs_sock.tx_mesg[iface]) ;
/* Calculate the total message size */
bytes = sizeof(hbs_message_type)-hbs_cluster_unused_bytes();
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__NO_PULSE_REQUEST, "any" , get_iface_name_str(iface) ) )
{
@ -727,14 +791,15 @@ int hbs_pulse_request ( iface_enum iface,
goto hbs_pulse_request_out ;
}
#endif
if ( (rc = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], msg_len)) < 0 )
if ( (bytes = hbs_sock.tx_sock[iface]->write((char*)&hbs_sock.tx_mesg[iface], bytes)) < 0 )
{
elog("Failed to send Pulse request: %d:%s to %s.%d (rc:%i ; %d:%s)\n",
hbs_sock.tx_mesg[iface].s,
&hbs_sock.tx_mesg[iface].m[0],
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
rc, errno, strerror(errno) );
bytes, errno, strerror(errno) );
return (FAIL_SOCKET_SENDTO);
}
}
@ -748,16 +813,17 @@ int hbs_pulse_request ( iface_enum iface,
hbs_pulse_request_out:
#endif
mlog1("%s Pulse Req: (%5d): %17s:%5d: %d:%d:%x:%s\n",
get_iface_name_str(iface), rc,
mlog("%s Pulse Req: (%5d): %17s:%5d: %d:%d:%d:%x:%s\n",
get_iface_name_str(iface), bytes,
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.tx_mesg[iface].v,
hbs_sock.tx_mesg[iface].s,
hbs_sock.tx_mesg[iface].c,
hbs_sock.tx_mesg[iface].f,
hbs_sock.tx_mesg[iface].m);
#ifdef WANT_HBS_MEM_LOGS
snprintf ( &str[0], MAX_LEN, "%s Pulse Req: %17s:%5d: %u:%u:%s\n",
get_iface_name_str(iface),
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
@ -766,6 +832,7 @@ hbs_pulse_request_out:
hbs_sock.tx_mesg[iface].c,
hbs_sock.tx_mesg[iface].m);
mem_log (&str[0]);
#endif
return (PASS);
}
@ -785,7 +852,7 @@ string get_hostname_from_pulse ( char * msg_ptr )
int _pulse_receive ( iface_enum iface , unsigned int seq_num )
{
int n = 0 ;
int bytes = 0 ;
int detected_pulses = 0 ;
@ -796,7 +863,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
do
{
/* Clean the receive buffer */
memset ( hbs_sock.rx_mesg[iface].m, 0, HBS_MAX_MSG );
memset ( hbs_sock.rx_mesg[iface].m, 0, sizeof(hbs_message_type) );
hbs_sock.rx_mesg[iface].s = 0 ;
hbs_sock.rx_mesg[iface].c = 0 ;
if ( hbs_sock.rx_sock[iface] == NULL )
@ -804,10 +871,10 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
elog ("%s cannot receive pulses - null object\n", get_iface_name_str(iface) );
return (0);
}
if ( (n = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type))) != -1 )
if ( (bytes = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type))) != -1 )
{
mlog1 ("%s Pulse Rsp: (%5d): %17s:%5d: %d:%d:%x:%s\n",
get_iface_name_str(iface), n,
get_iface_name_str(iface), bytes,
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
@ -839,7 +906,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
}
#endif
mlog ("%s Pulse Rsp from (%s)\n", get_iface_name_str(iface), hostname.c_str());
// mlog ("%s Pulse Rsp from (%s)\n", get_iface_name_str(iface), hostname.c_str());
if ( !hostname.compare("localhost") )
{
mlog3 ("%s Pulse Rsp (local): %17s:%5d: %d:%d:%x:%s\n",
@ -868,7 +935,6 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
{
if ( hbsInv.monitored_pulse ( hostname , iface ) == true )
{
#define MAX_LEN 1000
char str[MAX_LEN] ;
string extra = "Rsp" ;
@ -880,25 +946,42 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
{
rc = hbsInv.remove_pulse ( hostname, iface, hbs_sock.rx_mesg[iface].c, hbs_sock.rx_mesg[iface].f ) ;
}
snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%5d): %17s:%5d: %u:%u:%x:%s\n",
get_iface_name_str(iface), extra.c_str(), n,
snprintf (&str[0], MAX_LEN, "%s Pulse %s: (%5d): %s:%d: %u:%u:%x:%s\n",
get_iface_name_str(iface), extra.c_str(), bytes,
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].c,
hbs_sock.rx_mesg[iface].f,
hbs_sock.rx_mesg[iface].m);
mlog1 ("%s", &str[0]);
mlog ("%s", &str[0]);
#ifdef WANT_HBS_MEM_LOGS
mem_log (str);
#endif
if ( extra.empty())
{
detected_pulses++ ;
}
/* don't save data from self */
if ( hostname != hbsInv.my_hostname )
{
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
{
if ( iface == MGMNT_IFACE )
hbs_cluster_save ( hostname, MTCE_HBS_NETWORK_MGMT , hbs_sock.rx_mesg[iface]);
else
hbs_cluster_save ( hostname, MTCE_HBS_NETWORK_INFRA , hbs_sock.rx_mesg[iface]);
}
}
else
{
ilog ("skipping my hostname");
}
}
else
{
mlog3 ("%s Pulse Dis: (%5d): %17s:%5d: %d:%d:%x:%s\n",
get_iface_name_str(iface), n,
get_iface_name_str(iface), bytes,
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
@ -934,7 +1017,7 @@ int _pulse_receive ( iface_enum iface , unsigned int seq_num )
hbs_sock.rx_mesg[iface].m) ;
}
}
} while ( n > 0 ) ;
} while ( bytes > 0 ) ;
monitor_scheduling ( after_rx_time, before_rx_time, detected_pulses, SCHED_MONITOR__RECEIVER );
return (detected_pulses);
}
@ -951,6 +1034,8 @@ int send_event ( string & hostname, unsigned int event_cmd, iface_enum iface )
if ( event_cmd == MTC_EVENT_HEARTBEAT_LOSS )
{
daemon_dump_membuf_banner ();
hbsInv.print_node_info ();
hbs_cluster_log( hbsInv.my_hostname, "event");
daemon_dump_membuf ();
snprintf ( &event.hdr[0] , MSG_HEADER_SIZE, "%s", get_heartbeat_loss_header());
}
@ -1038,6 +1123,9 @@ int daemon_init ( string iface, string nodetype )
/* Initialize the hbs control struct */
MEMSET_ZERO ( hbs_ctrl );
/* init the utility module */
hbs_utils_init ();
/* initialize the timer */
mtcTimer_init ( hbsTimer, "controller", "heartbeat" );
@ -1091,9 +1179,123 @@ int daemon_init ( string iface, string nodetype )
return (rc);
}
/*****************************************************************************
 *
 * Name       : hbs_sm_handler
 *
 * Description: Try to receive a Service Management (SM) request from
 *              sm_server_sock.
 *
 *              Expecting a request in the following form:
 *              ~66 bytes with moderate spacing
 *
 *                 {
 *                    "origin" :"sm",
 *                    "service":"heartbeat",
 *                    "request":"cluster_info",
 *                    "reqid"  : number
 *                 }
 *
 *              A successfully parsed request results in a call to
 *              hbs_cluster_send which sends the latest snapshot of
 *              the heartbeat cluster info to SM.
 *
 *              Every failure path is reported through a throttled
 *              warning log ; the throttle is reset on the success path.
 *
 * Assumptions: log flooding is avoided.
 *
 * Returns    : Nothing
 *
 ****************************************************************************/
static int _hbs_sm_handler_log_throttle = 0 ;
void hbs_sm_handler ( void )
{
    #define _MAX_MSG_LEN (80)
    #define _MAX_LOG_CNT (1000)
    #define PRIMARY_LABEL "origin"
    #define SERVICE_LABEL "service"
    #define REQUEST_LABEL "request"
    #define REQID_LABEL "reqid"
    #define SUPPORTED_ORIGIN "sm"
    #define SUPPERTED_SERVICE "heartbeat"
    #define SUPPORTED_REQUEST "cluster_info"

    /* nothing to service without a receive socket */
    if ( hbs_sock.sm_server_sock == NULL )
        return ;

    char sm_mesg[_MAX_MSG_LEN] ;
    MEMSET_ZERO(sm_mesg);

    /* Read one byte less than the buffer size so that the zeroed buffer
     * always keeps a terminating NUL for the string based json parsing
     * below, even if the sender fills the whole read request. */
    int bytes = hbs_sock.sm_server_sock->read((char*)&sm_mesg, _MAX_MSG_LEN-1);

    /* Only a positive byte count is a received message ;
     * -1 is a receive error and must not be parsed as a request. */
    if ( bytes > 0 )
    {
        /* Expecting request in the following form:
         * { "origin":"sm" ... } */
        if ( sm_mesg[0] == '{' )
        {
            int reqid = 0 ;
            string origin  = "" ;
            string service = "" ;
            string request = "" ;
            if ( jsonUtil_get_key_val ( sm_mesg, PRIMARY_LABEL, origin ) != PASS )
            {
                wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                                 "missing primary label 'origin' in request.");
            }
            else if (( origin == SUPPORTED_ORIGIN ) &&
                     ( jsonUtil_get_key_val ( sm_mesg, SERVICE_LABEL, service ) == PASS ) &&
                     ( jsonUtil_get_key_val ( sm_mesg, REQUEST_LABEL, request ) == PASS ) &&
                     ( jsonUtil_get_key_val_int ( sm_mesg, REQID_LABEL, reqid ) == PASS ))
            {
                if (( service == SUPPERTED_SERVICE ) &&
                    ( request == SUPPORTED_REQUEST ))
                {
                    /* success path ... */
                    hbs_cluster_send( hbs_sock.sm_client_sock, reqid );

                    /* reset log throttle */
                    _hbs_sm_handler_log_throttle = 0 ;
                }
                else
                {
                    wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                                     "missing service or request labels in request.");
                }
            }
            else
            {
                wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                                 "failed to parse one or more request labels.");
            }
        }
        else
        {
            wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                             "improperly formatted json string request.");
        }
    }
    else if ( bytes == -1 )
    {
        wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                         "message receive error (%d:%s)",
                         errno, strerror(errno));
    }
    else
    {
        /* zero (or any other non-positive) byte count */
        wlog_throttled ( _hbs_sm_handler_log_throttle, _MAX_LOG_CNT,
                         "unknown error Error (rc:%d)", bytes );
    }
    dlog ("... %s", sm_mesg );
}
/****************************************************************************
*
* Name : daemon_service_run
*
* Description: Daemon's main loop
*
***************************************************************************/
void daemon_service_run ( void )
{
#ifdef WANT_HBS_MEM_LOGS
int exp_pulses[MAX_IFACES] ;
#endif
int rc = PASS ;
int counter = 0 ;
int goenabled_wait_log_throttle = 0 ;
@ -1154,6 +1356,8 @@ void daemon_service_run ( void )
daemon_exit ();
}
/* set this controller as provisioned */
hbs_manage_controller_state ( hbsInv.my_hostname , true );
/* CGTS 4114: Small Footprint: Alarm 200.005 remains active after connectivity restored
*
@ -1195,6 +1399,16 @@ void daemon_service_run ( void )
/* enable the base level signal handler latency monitor */
daemon_latency_monitor (true);
/* load this controller index number - used for cluster stuff */
if ( hbsInv.my_hostname == CONTROLLER_0 )
controller_number = 0 ;
else
controller_number = 1 ;
/* tell the cluster which controller this is and
* how many networks are being monitored */
hbs_cluster_nums (controller_number,hbsInv.infra_network_provisioned ?2:1);
/* Run heartbeat service forever or until stop condition */
for ( hbsTimer.ring = false ; ; )
{
@ -1315,6 +1529,14 @@ void daemon_service_run ( void )
FD_SET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds);
}
/* Add the sm request receiver to the select list */
if (( hbs_sock.sm_server_sock ) &&
( hbs_sock.sm_server_sock->getFD()))
{
socks.push_front (hbs_sock.sm_server_sock->getFD());
FD_SET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds);
}
/* Add the netlink event listener to the select list */
if ( hbs_sock.netlink_sock )
{
@ -1379,6 +1601,11 @@ void daemon_service_run ( void )
hbs_sock.fired[INFRA_INTERFACE] = true ;
}
if ((hbs_sock.sm_server_sock != NULL ) &&
( FD_ISSET(hbs_sock.sm_server_sock->getFD(), &hbs_sock.readfds)))
{
hbs_sm_handler();
}
if ((hbs_sock.mtc_to_hbs_sock != NULL ) &&
( FD_ISSET(hbs_sock.mtc_to_hbs_sock->getFD(), &hbs_sock.readfds)))
{
@ -1404,7 +1631,7 @@ void daemon_service_run ( void )
inv.nodetype = msg.parm[0];
hbsInv.add_heartbeat_host ( inv ) ;
hostname_inventory.push_back ( hostname );
ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), inv.nodetype );
ilog ("%s added to heartbeat service (%d)\n", hostname.c_str(), msg.parm[0] );
/* clear any outstanding alarms on the ADD */
if ( hbsInv.hbs_failure_action != HBS_FAILURE_ACTION__NONE )
@ -1415,10 +1642,7 @@ void daemon_service_run ( void )
}
else if ( msg.cmd == MTC_CMD_DEL_HOST )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false, false );
}
hbsInv.mon_host ( hostname, false, false );
hostname_inventory.remove ( hostname );
hbsInv.del_host ( hostname );
ilog ("%s deleted from heartbeat service\n", hostname.c_str());
@ -1432,27 +1656,24 @@ void daemon_service_run ( void )
}
else if ( msg.cmd == MTC_CMD_STOP_HOST )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false, true );
}
ilog ("%s stopping heartbeat service\n", hostname.c_str());
hbsInv.mon_host ( hostname, false, true );
hbs_cluster_del ( hostname );
ilog ("%s stopping heartbeat service\n",
hostname.c_str());
}
else if ( msg.cmd == MTC_CMD_START_HOST )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, true, true );
}
ilog ("%s starting heartbeat service\n", hostname.c_str());
hbsInv.mon_host ( hostname, true, true );
hbs_cluster_add ( hostname );
ilog ("%s starting heartbeat service\n",
hostname.c_str());
}
else if ( msg.cmd == MTC_RESTART_HBS )
{
for ( int iface = 0 ; iface < MAX_IFACES ; iface++ )
{
hbsInv.mon_host ( hostname, (iface_enum)iface, false, false );
hbsInv.mon_host ( hostname, (iface_enum)iface, true, false );
}
hbsInv.mon_host ( hostname, false, false );
hbsInv.mon_host ( hostname, true, false );
ilog ("%s restarting heartbeat service\n", hostname.c_str());
hbsInv.print_node_info();
}
@ -1616,7 +1837,9 @@ void daemon_service_run ( void )
int rri = 0 ;
string lf = "\n" ;
#ifdef WANT_HBS_MEM_LOGS
mem_log ((char*)lf.data());
#endif
/* Get the next Resource Reference Identifier
* and its Resource Identifier. These values
@ -1630,7 +1853,9 @@ void daemon_service_run ( void )
if (( iface == INFRA_IFACE ) && ( hbsInv.infra_network_provisioned == false ))
continue ;
#ifdef WANT_HBS_MEM_LOGS
exp_pulses[iface] =
#endif
hbsInv.hbs_expected_pulses[iface] =
hbsInv.create_pulse_list((iface_enum)iface);
@ -1759,28 +1984,33 @@ void daemon_service_run ( void )
if (( iface == INFRA_IFACE ) && ( hbsInv.infra_network_provisioned != true ))
continue ;
#define MAX_LEN 1000
#ifdef WANT_HBS_MEM_LOGS
char str[MAX_LEN] ;
snprintf (&str[0], MAX_LEN, "%s Histogram: %d - %s\n",
get_iface_name_str(iface),
exp_pulses[iface],
arrival_histogram[iface].c_str());
mem_log (str);
if ( !unexpected_pulse_list[iface].empty() )
{
snprintf ( &str[0], MAX_LEN, "%s Others : %s\n",
get_iface_name_str(iface),
unexpected_pulse_list[iface].c_str());
mem_log(str);
}
hbsInv.lost_pulses ( (iface_enum)iface );
#endif
/*
* Assume storage-0 is responding until otherwise proven
* its not. Keep in mind that the 'lost_pulses' interface
* only counts nodes that have not responded.
*/
bool storage_0_responding = true ;
int lost = hbsInv.lost_pulses ((iface_enum)iface, storage_0_responding);
hbs_cluster_update ((iface_enum)iface, lost, storage_0_responding);
}
hbsTimer.ring = false ;
heartbeat_request = true ;
// hbs_cluster_log ( hbsInv.my_hostname, "->") ;
seq_num++ ;
}
daemon_load_fit ();
@ -1796,7 +2026,9 @@ void daemon_dump_info ( void )
hbsInv.print_node_info ();
hbsInv.memDumpAllState ();
#ifdef WANT_HBS_MEM_LOGS
daemon_dump_membuf (); /* write mem_logs to log file and clear log list */
#endif
}
const char MY_DATA [100] = { "eieio\n" } ;

View File

@ -27,6 +27,8 @@
#include <signal.h>
#include <list>
#include "msgClass.h"
#include "mtceHbsCluster.h"
#include "hbsCluster.h"
/**
* @addtogroup hbs_base
@ -38,6 +40,8 @@
#endif
#define __AREA__ "hbs"
// #define WANT_CLUSTER_DEBUG
#define ALIGN_PACK(x) __attribute__((packed)) x
/** Maximum service fail count before action */
@ -56,15 +60,18 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"};
#define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME)
#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info
/* Heartbeat control structure */
typedef struct
{
unsigned int nodetype ;
bool clear_alarms ;
} hbs_ctrl_type ;
hbs_ctrl_type * get_hbs_ctrl_ptr ( void );
/* A heartbeat service message
* if this structire is changed then
* if this structure is changed then
* hbs_pulse_request needs to be looked at
*/
typedef struct
@ -76,7 +83,7 @@ typedef struct
unsigned int s ;
/* Fast Lookup Clue Info */
unsigned int c ;
unsigned int c ;
/* Status Flags
* ------------
@ -89,6 +96,16 @@ typedef struct
/** message version number */
unsigned int v ;
/** Heartbeat cluster information that is put into heartbeat messages.
*
* Pulse Request : To hbsClient: Only 1 controller with up to 2 network types history.
* Pulse Response: From hbsClient: Can include up to 2 controllers with 2 networks each.
*
* This addition requires message version increment.
*
**/
mtce_hbs_cluster_type cluster ;
} ALIGN_PACK(hbs_message_type) ;
@ -104,6 +121,12 @@ typedef struct
/** Heartbeat Service Event Transmit Interface - hbsClient -> mtcAgent */
msgClassSock* hbs_ready_tx_sock;
/** Heartbeat Service SM Transmit Interface - hbsAgent -> sm */
msgClassSock* sm_client_sock;
/** Heartbeat Service SM Receive Interface - sm -> hbsAgent */
msgClassSock* sm_server_sock;
/** PMON Pulse Receive Interface - pmond -> hbsClient */
msgClassSock* pmon_pulse_sock;
@ -166,6 +189,9 @@ int hbs_refresh_pids ( std::list<procList> & proc_list );
int hbs_process_monitor ( std::list<procList> & pmon_list );
int hbs_self_recovery ( unsigned int cmd );
/* returns this controller's number ; 0 or 1 */
unsigned int hbs_get_controller_number ( void );
/* Setup the pulse messaging interfaces
* 'p' is a bool that indicates if the infrastructure network is provisioned
* 'p' = true means it is provisioned */
@ -184,6 +210,93 @@ int hbs_self_recovery ( unsigned int cmd );
} \
}
/*********** Common Heartbeat Utilities in hbsUtil.cpp ***************/
/* module init */
void hbs_utils_init ( void );
/* network enum to name lookup */
string hbs_cluster_network_name ( mtce_hbs_network_enum network );
/* Produce formatted clog's that characterize current and changing cluster
* history for a given network. Each log is controller/network specific. */
void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix );
/* Initialize the specified history array */
void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history );
/* Clear all history in the cluster vault */
void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster );
/******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/
/* Set the cluster vault to default state.
* Called upon daemon init or heartbeat period change. */
void hbs_cluster_init ( unsigned short period );
/* Calculate number of bytes that is unused in the cluster data structure.
* Primarily to know how many history elements are missing. */
unsigned short hbs_cluster_unused_bytes ( void );
/* Add and delete hosts from the monitored list.
* Automatically adjusts the numbers in the cluster vault. */
void hbs_cluster_add ( string & hostname );
void hbs_cluster_del ( string & hostname );
/* Report status of storage-0 */
void hbs_cluster_storage0_status ( iface_enum iface , bool responding );
/* Look for and clog changes in cluster state */
int hbs_cluster_cmp ( hbs_message_type & msg );
/* Manage the enabled state of the controllers */
void hbs_manage_controller_state ( string & hostname, bool enabled );
/* Set the number of monitored hosts and this controller's
* number in the cluster vault. */
void hbs_cluster_nums ( unsigned short this_controller,
unsigned short monitored_networks );
/* Copy/Save the peer controller's cluster info from the hbsClient's
* pulse response into the cluster vault so it's there and ready for
* an SM cluster_info request. */
int hbs_cluster_save ( string & hostname,
mtce_hbs_network_enum network,
hbs_message_type & msg );
/*
* Called by the hbsAgent pulse receiver to create a network specific
* history update entry consisting of
*
* 1. the number of monitored hosts
* 2. how many of those that responded in the last heartbeat period.
* 3. threshold storage-0 responding count and manage that state in that
* network's history header.
*/
void hbs_cluster_update ( iface_enum iface,
unsigned short not_responding_hosts,
bool storage_0_responding );
/* Called by the hbsAgent pulse transmitter to append this controller's
* running cluster view in the next multicast pulse request.
* The hbsClient is expected to loop this data and any other like data from
* the other controller back in its response. */
void hbs_cluster_append ( hbs_message_type & msg );
/* Produce formatted clog's that characterize current and changing cluster
* history for a given network. Each log is controller/network specific. */
void hbs_cluster_log ( string & hostname, string prefix );
/* Service SM cluster info request */
void hbs_sm_handler ( void );
/* send the cluster vault to SM */
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid );
/* print the contents of the vault */
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
/**
* @} hbs_base
*/

View File

@ -20,7 +20,6 @@
* daemon_files_init
* daemon_configure
* daemon_signal_init
* hbs_message_init
* hbs_socket_init
*
* daemon_service_run
@ -59,7 +58,7 @@ using namespace std;
#include "daemon_option.h" /* Common options for daemons */
#include "nodeTimers.h" /* for ... maintenance timers */
#include "nodeMacro.h" /* for ... CREATE_NONBLOCK_INET_UDP_RX_SOCKET */
#include "nlEvent.h" /* for ... open_netlink_socket */
#include "nlEvent.h" /* for ... open_netlink_socket */
#include "hbsBase.h" /* Heartbeat Base Header File */
extern "C"
@ -95,8 +94,9 @@ typedef struct
std::list<procList>::iterator proc_ptr ;
} stallMon_type ;
static char pulse_resp_tx_hdr [HBS_MAX_MSG];
static char my_hostname [MAX_HOST_NAME_SIZE+1];
static char my_hostname_length ;
static string my_macaddr = "" ;
static string my_address = "" ;
static unsigned int my_nodetype= CGTS_NODE_NULL ;
@ -360,6 +360,12 @@ static int hbs_config_handler ( void * user,
config_ptr->pmon_pulse_port = atoi(value);
config_ptr->mask |= CONFIG_CLIENT_PULSE_PORT ;
}
#ifdef WANT_CLUSTER_DEBUG
else if (MATCH("agent", "sm_client_port"))
{
config_ptr->sm_client_port = atoi(value);
}
#endif
else
{
return (PASS);
@ -446,20 +452,6 @@ int daemon_configure ( void )
/* Initialization Utilities */
/****************************/
/* Initialize the unicast pulse response message */
/* One time thing ; tx same message all the time. */
int hbs_message_init ( void )
{
/* Build the transmit pulse response message for each interface */
for ( int i = 0 ; i < MAX_IFACES ; i++ )
{
memset ( &hbs_sock.tx_mesg[i], 0, sizeof (hbs_message_type));
memcpy ( &hbs_sock.tx_mesg[i].m[0], &rsp_msg_header[0], HBS_HEADER_SIZE );
memcpy ( &hbs_sock.tx_mesg[i].m[HBS_HEADER_SIZE], my_hostname, strlen(my_hostname));
}
return (PASS);
}
/* Initialize pulse messaging for the specified interface
* This is called by a macro defined in hbsBase.h */
int _setup_pulse_messaging ( iface_enum i, int rmem )
@ -621,6 +613,11 @@ int hbs_socket_init ( void )
return (FAIL_SOCKET_NOBLOCK);
}
#ifdef WANT_CLUSTER_DEBUG
hbs_sock.sm_client_sock = new msgClassRx(LOOPBACK_IP,hbs_config.sm_client_port,IPPROTO_UDP);
if ( rc ) return (rc) ;
hbs_sock.sm_client_sock->sock_ok(true);
#endif
return (PASS);
}
@ -648,7 +645,7 @@ int get_pmon_pulses ( void )
if ( !strncmp ( &msg.hdr[0] , get_pmond_pulse_header(), MSG_HEADER_SIZE ))
{
pulses++ ;
mlog ("Pmon Pulse (%s) (%d)\n", msg.hdr, pulses );
mlog1 ("Pmon Pulse (%s) (%d)\n", msg.hdr, pulses );
}
else
{
@ -710,92 +707,87 @@ static unsigned int my_rri = 0 ;
static int rx_error_count[MAX_IFACES] = {0,0} ;
static int tx_error_count[MAX_IFACES] = {0,0} ;
#define ERROR_LOG_THRESHOLD (200)
int _service_pulse_request ( iface_enum iface , unsigned int flags )
{
unsigned int s = 0 ; /* Sequence number */
int n = 0 ; /* message size */
int rc = 0 ;
if (( iface != MGMNT_IFACE ) && ( iface != INFRA_IFACE ))
return (FAIL_BAD_CASE);
memset ( (char*) &hbs_sock.rx_mesg[iface], 0, sizeof(hbs_message_type));
if ( ! hbs_sock.rx_sock[iface] )
{
elog ("cannot receive from null rx_mesg[%s] socket\n", get_iface_name_str(iface) );
elog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
"cannot receive from null rx_mesg[%s] socket\n",
get_iface_name_str(iface) );
return (FAIL_TO_RECEIVE);
}
else if ( hbs_sock.rx_sock[iface]->sock_ok() == false )
else if ( ! hbs_sock.tx_sock[iface] )
{
elog ("cannot receive from failed rx_mesg[%s] socket\n", get_iface_name_str(iface) );
elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
"cannot send to null mesg[%s] socket\n",
get_iface_name_str(iface) );
return (FAIL_TO_TRANSMIT);
}
else if ( ! hbs_sock.rx_sock[iface]->sock_ok() )
{
elog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
"cannot receive from failed rx_mesg[%s] socket\n",
get_iface_name_str(iface) );
return (FAIL_TO_RECEIVE);
}
n = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type));
if( n < HBS_HEADER_SIZE )
else if ( ! hbs_sock.tx_sock[iface]->sock_ok() )
{
rx_error_count[iface]++ ;
elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
"cannot send to failed mesg[%s] socket\n",
get_iface_name_str(iface) );
return (FAIL_TO_TRANSMIT);
}
/* throtle the log so that if they come back-to-back we avoid flooding */
if ( n == -1 )
// MEMSET_ZERO(hbs_sock.rx_mesg[iface]);
int rx_bytes = hbs_sock.rx_sock[iface]->read((char*)&hbs_sock.rx_mesg[iface], sizeof(hbs_message_type));
if ( rx_bytes < HBS_HEADER_SIZE )
{
if ( rx_bytes == -1 )
{
if ( rx_error_count[iface] > 1 )
{
wlog_throttled ( rx_error_count[iface], 500, "%s receive error (%d:%m)\n", get_iface_name_str(iface), errno );
}
wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
"%s receive error (%d:%m)\n",
get_iface_name_str(iface), errno );
}
else
{
wlog_throttled ( rx_error_count[iface], 500, "%s message underrun (expected %ld but got %d)\n",
get_iface_name_str(iface), sizeof(hbs_message_type), n );
}
if ( rx_error_count[iface] == 100 )
{
wlog ( "%s is getting a lot of receive errors (%d:%m)\n", get_iface_name_str(iface), errno );
wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
"%s message underrun (expected %ld but got %d)\n",
get_iface_name_str(iface),
sizeof(hbs_message_type), rx_bytes );
}
return (FAIL_TO_RECEIVE);
}
/* Clear the error count since we got a good receive */
rx_error_count[iface] = 0 ;
#ifdef WANT_NO_SELF_HEARTBEAT_REPLY
/* Don't reply to the heartbeat if the request came from myself */
if ( ! strncmp ( my_address.data(),
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
MAX_CHARS_IN_IP_ADDR ))
daemon_config_type * cfg_ptr = daemon_get_cfg_ptr();
if ( cfg_ptr->debug_msg )
{
ilog ("%s Refusing to send heartbeat response to self\n", hbs_sock.rx_sock[iface]->get_dst_addr()->toString());
return (PASS);
mlog ("\n");
mlog ("%s Pulse Req: %s:%5d: %d:%s RRI:%d\n",
get_iface_name_str(iface),
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].m,
hbs_sock.rx_mesg[iface].c);
}
#else
/* We use this to monitor pmond on active controller */
#endif
/* Save the sequence number */
s = hbs_sock.rx_mesg[iface].s ;
mlog ("\n");
mlog ("%s Pulse Req: %s:%5d: %d: :%s RRI:%d\n", get_iface_name_str(iface),
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].m,
hbs_sock.rx_mesg[iface].c);
/* verify the message header */
if ( strncmp ( (const char *)&hbs_sock.rx_mesg[iface].m, (const char *)&req_msg_header, HBS_HEADER_SIZE ))
{
wlog_throttled ( rx_error_count[iface], 200, "%s Invalid header (%d:%s)\n",
get_iface_name_str(iface),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].m );
mlog ("Detected: %d <%s>\n", HBS_HEADER_SIZE,hbs_sock.rx_mesg[iface].m);
mlog ("Expected: %d <%s>\n", HBS_HEADER_SIZE,req_msg_header);
wlog_throttled ( rx_error_count[iface], ERROR_LOG_THRESHOLD,
"%s Invalid header (%d:%s)\n",
get_iface_name_str(iface),
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].m );
return (FAIL_MSG_HEADER) ;
}
/* Manage the Resource Reference Index (RRI) "lookup clue" */
if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME ))
{
@ -807,32 +799,31 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
}
/* Add my RRI to the response message */
hbs_sock.tx_mesg[iface].c = my_rri ;
hbs_sock.rx_mesg[iface].c = my_rri ;
/* Clear struct */
hbs_sock.tx_mesg[iface].s = s ;
hbs_sock.tx_mesg[iface].f = flags ;
/* Manage OOB flags */
hbs_sock.rx_mesg[iface].f = flags ;
if ( pmonPulse_counter )
{
hbs_sock.tx_mesg[iface].f |= ( PMOND_FLAG ) ;
hbs_sock.rx_mesg[iface].f |= ( PMOND_FLAG ) ;
}
if ( infra_network_provisioned == true )
{
hbs_sock.tx_mesg[iface].f |= INFRA_FLAG ;
hbs_sock.rx_mesg[iface].f |= INFRA_FLAG ;
}
n = (int)sizeof(hbs_message_type) ;
if ( ! hbs_sock.tx_sock[iface] )
#define WANT_CLUSTER_INFO_LOG
#ifdef WANT_CLUSTER_INFO_LOG
/* Log the received cluster info */
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
{
elog ("cannot send to null tx_mesg[%s] socket\n", get_iface_name_str(iface) );
return (FAIL_TO_TRANSMIT);
}
else if ( hbs_sock.tx_sock[iface]->sock_ok() == false )
{
elog ("cannot send to failed tx_mesg[%s] socket\n", get_iface_name_str(iface) );
return (FAIL_TO_TRANSMIT);
char str[100] ;
// hbs_cluster_log (hbs_sock.rx_mesg[iface].cluster, hbs_sock.rx_mesg[iface].s );
snprintf ( &str[0], 100, " seq %6d with %d bytes from %s ", hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface));
string hostname = my_hostname ;
hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str );
}
#endif
#ifdef WANT_PULSE_RESPONSE_FIT
if (( iface == INFRA_IFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_INFRA_RSP )))
@ -848,44 +839,69 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
}
#endif
/* Send pulse response message with sequence number, flags and resource referecen index */
rc = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.tx_mesg[iface], n);
if ( rc == -1 )
int rc = PASS ;
/* replace the request header with the response header */
memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG );
/* Deal with the cluster info if it exists.
* ... Introduced in messaging version 1 */
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
{
elog ("Failed to sendto socket %d through %s:%d len:%d (%s) (%d:%s)\n",
hbs_sock.tx_sock[iface]->getFD(),
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getSockLen(),
get_iface_name_str(iface), errno, strerror(errno));
if ( hbs_sock.rx_mesg[iface].cluster.version < MTCE_HBS_CLUSTER_VERSION )
{
ilog ("Bad cluster verison (%d)", hbs_sock.rx_mesg[iface].cluster.version);
}
// if ( hbs_sock.rx_mesg[iface].cluster.revision != MTCE_HBS_CLUSTER_REVISION )
// {
// ilog ("Bad cluster revision (%d)", hbs_sock.rx_mesg[iface].cluster.revision);
// }
/* Add peer controller cluster data to this controller's response */
// hbs_cluster_loop(hbs_sock.rx_mesg[iface]);
}
else if ( rc != n)
/* send pulse response message */
int tx_bytes = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.rx_mesg[iface], rx_bytes);
if ( tx_bytes == -1 )
{
/* Avoid log flooding
elog ("unicast send failed. (%d)\n", rc); */
wlog_throttled ( tx_error_count[iface], 200,
"%s Pulse Rsp: %d:%d bytes < %d:%s > to <%s>\n",
get_iface_name_str(iface), n, rc,
hbs_sock.tx_mesg[iface].s,
&hbs_sock.tx_mesg[iface].m[0],
elog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
"pulse tx failed %d:%s:%d len:%d (%s) (%d:%s)\n",
hbs_sock.tx_sock[iface]->getFD(),
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getSockLen(),
get_iface_name_str(iface), errno, strerror(errno));
}
else if ( tx_bytes != rx_bytes)
{
wlog_throttled ( tx_error_count[iface], ERROR_LOG_THRESHOLD,
"%s Pulse Rsp: %d:%d bytes < %d:%s >",
get_iface_name_str(iface), rx_bytes, tx_bytes,
hbs_sock.rx_mesg[iface].s,
&hbs_sock.rx_mesg[iface].m[0]);
return (rc);
rc = FAIL_DATA_SIZE ;
}
else
{
mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d)\n",
mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d:%d:%d)\n",
get_iface_name_str(iface),
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
hbs_sock.tx_mesg[iface].s,
hbs_sock.tx_mesg[iface].f,
hbs_sock.tx_mesg[iface].m,
hbs_sock.tx_mesg[iface].c,
pmonPulse_counter);
/* Clear the error count since we got a good transmit */
tx_error_count[iface] = 0 ;
hbs_sock.rx_mesg[iface].s,
hbs_sock.rx_mesg[iface].f,
hbs_sock.rx_mesg[iface].m,
hbs_sock.rx_mesg[iface].c,
pmonPulse_counter, rx_bytes, tx_bytes);
}
return PASS;
/* Clear the error count since we got a good receive */
if ( rx_error_count[iface] )
rx_error_count[iface] = 0 ;
if ( tx_error_count[iface] )
tx_error_count[iface] = 0 ;
return rc ;
}
#ifdef WANT_FIT_TESTING
@ -968,6 +984,9 @@ int daemon_init ( string iface, string nodeType_str )
/* Initialize socket construct and pointer to it */
memset ( &hbs_sock, 0, sizeof(hbs_sock));
/* init the utility module */
hbs_utils_init ();
/* Defaults */
hbs_config.stall_pmon_thld = -1 ;
hbs_config.stall_mon_period = MTC_HRS_8 ;
@ -1025,12 +1044,6 @@ int daemon_init ( string iface, string nodeType_str )
rc = FAIL_DAEMON_CONFIG ;
}
/* Init the heartbeat transmit pulse response message */
else if ( hbs_message_init () != PASS )
{
elog ("Failed to initialize pulse response message\n");
rc = FAIL_MESSAGE_INIT ;
}
/* Setup the heartbeat service messaging sockets */
else if ( hbs_socket_init () != PASS )
{
@ -1119,6 +1132,11 @@ void daemon_service_run ( void )
ilog ("Sending Heartbeat Ready Event\n");
hbs_send_event ( MTC_EVENT_MONITOR_READY );
my_hostname_length = strlen(my_hostname) ;
memset ( &pulse_resp_tx_hdr[0], 0, HBS_MAX_MSG );
memcpy ( &pulse_resp_tx_hdr[0], &rsp_msg_header[0], HBS_HEADER_SIZE );
memcpy ( &pulse_resp_tx_hdr[HBS_HEADER_SIZE], my_hostname, my_hostname_length );
/* Run heartbeat service forever or until stop condition */
for ( ; ; )
{
@ -1153,7 +1171,9 @@ void daemon_service_run ( void )
FD_SET(hbs_sock.pmon_pulse_sock->getFD(),&hbs_sock.readfds);
FD_SET(hbs_sock.amon_socket, &hbs_sock.readfds);
FD_SET(hbs_sock.netlink_sock, &hbs_sock.readfds);
#ifdef WANT_CLUSTER_DEBUG
FD_SET(hbs_sock.sm_client_sock->getFD(), &hbs_sock.readfds);
#endif
rc = select( socks.back()+1,
&hbs_sock.readfds, NULL, NULL,
&hbs_sock.waitd);
@ -1176,6 +1196,19 @@ void daemon_service_run ( void )
/* Only service sockets for the rc > 0 case */
else if ( rc )
{
#ifdef WANT_CLUSTER_DEBUG
if ( hbs_sock.sm_client_sock && FD_ISSET(hbs_sock.sm_client_sock->getFD(), &hbs_sock.readfds ) )
{
mtce_hbs_cluster_type msg ;
/* Receive event messages */
memset ( &msg , 0, sizeof(mtce_hbs_cluster_type));
int bytes = hbs_sock.sm_client_sock->read((char*)&msg, sizeof(mtce_hbs_cluster_type));
if ( bytes )
{
hbs_cluster_dump (msg);
}
}
#endif
if (hbs_sock.rx_sock[MGMNT_IFACE]&&FD_ISSET(hbs_sock.rx_sock[MGMNT_IFACE]->getFD(), &hbs_sock.readfds))
{
/* Receive pulse request and send a response */

View File

@ -0,0 +1,748 @@
/*
* Copyright (c) 2018 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* @file Maintenance Heartbeat Agent Cluster Manager Module
*
*************************************************************************
*
* This module provides the heartbeat cluster implementation member
* functions that the hbsAgent service calls to collect, store and
* send heartbeat cluster information to SM upon request.
*
* See mtceHbsCluster.h for formal API between SM and Mtce.
*
*************************************************************************/
using namespace std;
#include "nodeBase.h" /* common maintenance constructs and definitions */
#include "daemon_common.h" /* common daemon constructs and definitions */
#include "hbsBase.h" /* mtce heartbeat constructs and definitions */
/* Error log throttle counter. */
#define THROTTLE_COUNT (500)
/* Private Heartbeat Cluster Control Structure. */
/* Module-private bookkeeping for the heartbeat cluster feature: which
 * controller this is, which controllers are enabled, the monitored
 * host/network counts and the working copy of the cluster data. */
typedef struct
{
/* Contains the controller number (0 or 1) for this controller. */
unsigned short this_controller ;
/* Preserves which controllers are enabled. */
bool controller_0_enabled ;
bool controller_1_enabled ;
#ifdef THREE_CONTROLLER_SYSTEM
bool controller_2_enabled ;
#endif
/* Used to prevent log flooding in presence of back to back errors. */
unsigned int log_throttle ;
/* Used to threshold storage-0 not responding state ;
* one counter per monitored network. */
unsigned int storage_0_not_responding_count[MTCE_HBS_NETWORKS];
/* Contains the number of monitored networks in the system.
* Management only = 1
* Management and Infrastructure = 2 */
unsigned short monitored_networks ;
/* This contains the current number of heartbeat enabled hosts.
*
* Used to improve performance.
*
* Performance: This value is included in each history entry so
* rather than do the size calculation of monitored_hostname_list
* each time, this variable is updated from monitored_hostname_list
* after each add/del operation. */
unsigned short monitored_hosts ;
/* List of host names being monitored. */
std::list<string>monitored_hostname_list ;
/* The working heartbeat cluster data vault. */
mtce_hbs_cluster_type cluster ;
} hbs_cluster_ctrl_type ;
/* Cluster control structire construct allocation. */
static hbs_cluster_ctrl_type ctrl ;
/****************************************************************************
 *
 * Name       : hbs_cluster_init
 *
 * Description: Initialize the cluster structure to default values.
 *
 *              Empties the monitored host list, rewrites the vault header,
 *              zeroes the dynamic vault data, clears the per network
 *              storage-0 miss counters and re-initializes every history
 *              element.
 *
 * Assumptions: Called by hbsAgent.cpp before entering the main loop.
 *              Also called by hbs_cluster_del when the last monitored
 *              host is removed.
 *
 * Parameters : period - heartbeat period in milliseconds
 *
 * Returns    : None
 *
 ***************************************************************************/
void hbs_cluster_init ( unsigned short period )
{
    ctrl.monitored_hosts = 0;
    ctrl.monitored_hostname_list.clear();

    /* Init the cluster - header. */
    ctrl.cluster.version      = MTCE_HBS_CLUSTER_VERSION ;
    ctrl.cluster.revision     = MTCE_HBS_CLUSTER_REVISION ;
    ctrl.cluster.magic_number = MTCE_HBS_MAGIC_NUMBER ;

    /* Init the cluster - global / dynamic data. */
    ctrl.cluster.reqid            = 0 ;
    ctrl.cluster.period_msec      = period ;
    ctrl.cluster.storage0_enabled = false ;
    ctrl.cluster.histories        = 0 ;
    ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);

    /* The storage-0 thresholding counter for each network. */
    for ( int n = 0 ; n < MTCE_HBS_NETWORKS ; n++ )
        ctrl.storage_0_not_responding_count[n] = 0 ;

    for ( int h = 0 ; h < MTCE_HBS_MAX_HISTORY_ELEMENTS ; h++ )
        hbs_cluster_history_init ( ctrl.cluster.history[h] );

    /* sizeof yields a size_t ; %zu is the matching conversion. */
    ilog ("Cluster Info: v%d.%d sig:%x bytes:%d (%zu)",
           ctrl.cluster.version,
           ctrl.cluster.revision,
           ctrl.cluster.magic_number,
           ctrl.cluster.bytes,
           sizeof(mtce_hbs_cluster_history_type));

    ctrl.log_throttle = 0 ;
}
/****************************************************************************
 *
 * Name       : hbs_cluster_nums
 *
 * Description: Record which controller this is and how many networks
 *              are heartbeat monitored in this system.
 *
 *              These values do not change without a process restart.
 *
 * Assumptions: Called by hbsAgent.cpp before entering the main loop.
 *
 * Parameters : this_controller    - controller number of this controller
 *              monitored_networks - number of monitored networks
 *
 * Returns    : None
 *
 ***************************************************************************/
void hbs_cluster_nums ( unsigned short this_controller,
                        unsigned short monitored_networks )
{
    ctrl.monitored_networks = monitored_networks ;
    ctrl.this_controller    = this_controller ;
}
/****************************************************************************
 *
 * Name       : log_monitored_hosts_list
 *
 * Description: Log the number of monitored hosts followed by their
 *              space separated host names.
 *              Typically done on a list change.
 *
 * Returns    : None
 *
 ***************************************************************************/
void log_monitored_hosts_list ( void )
{
    string list = "" ;

    std::list<string>::const_iterator iter_ptr ;
    for ( iter_ptr  = ctrl.monitored_hostname_list.begin() ;
          iter_ptr != ctrl.monitored_hostname_list.end() ;
          ++iter_ptr )
    {
        list.append (*iter_ptr);
        list.append (" ");
    }

    /* size() yields a size_t ; %zu is the matching conversion. */
    ilog ("cluster of %zu: %s",
           ctrl.monitored_hostname_list.size(),
           list.c_str());
}
/****************************************************************************
 *
 * Name       : cluster_storage0_state
 *
 * Description: Record the heartbeat monitoring state of storage-0 in the
 *              cluster vault and log the transition. A call that does
 *              not change the recorded state is a silent no-op.
 *
 * Parameters : true  if storage-0 heartbeating is in the 'started' state.
 *              false if storage-0 heartbeating is in the 'stopped' state.
 *
 * Returns    : None
 *
 ***************************************************************************/
void cluster_storage0_state ( bool enabled )
{
    /* Nothing to record or log when the state is unchanged. */
    if ( ctrl.cluster.storage0_enabled == enabled )
        return ;

    ctrl.cluster.storage0_enabled = enabled ;

    const char * state_str = enabled ? "enabled" : "disabled" ;
    ilog ("storage-0 heartbeat state changed to %s", state_str );
}
/****************************************************************************
 *
 * Name       : hbs_manage_controller_state
 *
 * Description: Track the monitored enabled state of the controllers.
 *              Host names that are not a controller name are ignored.
 *
 * Parameters : hostname - name of the host whose state changed
 *              enabled  - true when the host is heartbeat monitored
 *
 * Returns    : None
 *
 ***************************************************************************/
void hbs_manage_controller_state ( string & hostname, bool enabled )
{
    if ( hostname == CONTROLLER_0 )
    {
        ctrl.controller_0_enabled = enabled ;
        return ;
    }

    if ( hostname == CONTROLLER_1 )
    {
        ctrl.controller_1_enabled = enabled ;
        return ;
    }

#ifdef THREE_CONTROLLER_SYSTEM
    if ( hostname == CONTROLLER_2 )
    {
        ctrl.controller_2_enabled = enabled ;
        return ;
    }
#endif
}
/****************************************************************************
 *
 * Name       : hbs_cluster_add
 *
 * Description: Add the specified hostname to the enabled hosts list.
 *
 *              If the added host is storage-0 then update its enabled
 *              status. If the added host is a controller then update the
 *              controller enabled state.
 *
 * Parameters : hostname string
 *
 * Updates    : monitored_hostname_list and monitored_hosts
 *
 * Returns    : None
 *
 ***************************************************************************/
void hbs_cluster_add ( string & hostname )
{
    /* Remove before add so that a repeated add never creates a
     * duplicate list entry. */
    ctrl.monitored_hostname_list.remove(hostname) ;
    ctrl.monitored_hostname_list.push_back(hostname) ;
    ctrl.monitored_hosts = (unsigned short)ctrl.monitored_hostname_list.size();

    /* The list holds at least this host at this point so there is no
     * 'zero monitored hosts' case to handle on the add path ; that
     * re-init is done by hbs_cluster_del. */

    /* Manage storage-0 state */
    if ( hostname == STORAGE_0 )
    {
        cluster_storage0_state ( true );
    }

    /* Manage controller state ; true means enabled in this case. */
    hbs_manage_controller_state ( hostname, true );

    ilog ("%s added to cluster", hostname.c_str());
    log_monitored_hosts_list ();
}
/****************************************************************************
 *
 * Name       : hbs_cluster_del
 *
 * Description: Delete the specified hostname from the enabled hosts list.
 *
 *              If the deleted host is storage-0 then update its enabled
 *              status. If the deleted host is a controller then update
 *              the controller enabled state. Removing the last monitored
 *              host re-initializes the whole cluster.
 *
 * Parameters : hostname string
 *
 * Updates    : monitored_hostname_list and monitored_hosts
 *
 * Returns    : None
 *
 ***************************************************************************/
void hbs_cluster_del ( string & hostname )
{
    ctrl.monitored_hostname_list.remove(hostname) ;

    /* Refresh the cached host count from the list. */
    const size_t hosts_left = ctrl.monitored_hostname_list.size() ;
    ctrl.monitored_hosts = (unsigned short)hosts_left ;

    /* Manage storage-0 state. */
    if ( hostname == STORAGE_0 )
        cluster_storage0_state ( false );

    /* If we get down to 0 monitored hosts then just start fresh */
    if ( ctrl.monitored_hosts == 0 )
        hbs_cluster_init ( ctrl.cluster.period_msec );

    /* Manage controller state ; false means not enabled in this case. */
    hbs_manage_controller_state ( hostname , false );

    ilog ("%s deleted from cluster", hostname.c_str());
    log_monitored_hosts_list ();
}
/****************************************************************************
 *
 * Name       : hbs_cluster_update
 *
 * Description: Update this controller's cluster info for the specified
 *              network with
 *
 *              1. The number of enabled hosts.
 *              2. The number of responding hosts.
 *              3. The oldest history index in the rotational history fifo.
 *              4. Maintain a back to back non-responding count for
 *                 storage-0. Once the count reaches the minimum threshold
 *                 of STORAGE_0_NR_THRESHOLD then the specific network
 *                 history is updated to indicate storage-0 is not
 *                 responding. Once storage-0 starts responding again with
 *                 a single response then that network history is updated
 *                 to indicate storage-0 is responding.
 *
 * Assumptions: Converts heartbeat interface number to cluster network
 *              number. Does nothing when there are no monitored hosts or
 *              the interface has no cluster network equivalent.
 *
 * Parameters : iface                - heartbeat interface ( iface_enum )
 *              not_responding_hosts - number of hosts that did not
 *                                     respond in this interval
 *              storage_0_responding - true if storage-0 responded in
 *                                     this interval
 *
 * Updates    : This network's history (a slot is allocated on first use)
 *              as well as the storage-0 not responding count.
 *
 * Returns    : None
 *
 ***************************************************************************/
#define STORAGE_0_NR_THRESHOLD (4)
void hbs_cluster_update ( iface_enum iface,
                          unsigned short not_responding_hosts,
                          bool storage_0_responding )
{
    if ( ctrl.monitored_hosts == 0 )
        return ;

    /* convert heartbeat iface enum to cluster network enum. */
    mtce_hbs_network_enum n ;
    if ( iface == MGMNT_IFACE )
        n = MTCE_HBS_NETWORK_MGMT ;
    else if ( iface == INFRA_IFACE )
        n = MTCE_HBS_NETWORK_INFRA ;
#ifdef MONITORED_OAM_NETWORK
    else if ( iface == OAM_IFACE )
        n = MTCE_HBS_NETWORK_OAM ;
#endif
    else
        return ;

    /* Never report more misses than there are monitored hosts ;
     * otherwise the unsigned 'hosts_responding' subtraction below would
     * wrap around and publish a huge responding count. */
    if ( not_responding_hosts > ctrl.monitored_hosts )
        not_responding_hosts = ctrl.monitored_hosts ;

    if ( not_responding_hosts )
    {
        clog1 ("controller-%d %s enabled:%d not responding:%d",
                ctrl.this_controller,
                hbs_cluster_network_name(n).c_str(),
                ctrl.monitored_hosts,
                not_responding_hosts);
    }
    else
    {
        clog1 ("controller-%d %s has %d monitored hosts and all are responding",
                ctrl.this_controller,
                hbs_cluster_network_name(n).c_str(),
                ctrl.monitored_hosts);
    }

    /* Look-up active history array for this network combination.
     * The macro assigns the local 'history_ptr' on a match. */
    mtce_hbs_cluster_history_type * history_ptr = NULL ;
    GET_CLUSTER_HISTORY_PTR(ctrl.cluster, ctrl.this_controller ,n);
    if ( history_ptr == NULL )
    {
        if ( ctrl.cluster.histories >= MTCE_HBS_MAX_HISTORY_ELEMENTS )
        {
            /* Should never happen but if it does then log without flooding */
            wlog_throttled ( ctrl.log_throttle, THROTTLE_COUNT,
                             "Unable to store history beyond %d ",
                             ctrl.cluster.histories );
            return ;
        }
        else
        {
            /* Adding a new history slot. */
            history_ptr = &ctrl.cluster.history[ctrl.cluster.histories] ;
            ctrl.cluster.histories++ ;
            ctrl.cluster.bytes = BYTES_IN_CLUSTER_VAULT(ctrl.cluster.histories);
            history_ptr->controller = ctrl.this_controller ;
            history_ptr->network = n ;

            /* Log new network history as its being started. */
            ilog ("controller-%d %s network history add",
                   ctrl.this_controller,
                   hbs_cluster_network_name(n).c_str());
        }
    }

    /* Manage storage-0 status. */
    if ( ctrl.cluster.storage0_enabled )
    {
        /* Handle storage-0 status change from not responding to responding. */
        if ( storage_0_responding == true )
        {
            if (history_ptr->storage0_responding == false)
            {
                history_ptr->storage0_responding = true ;
                ilog ("controller-%d %s heartbeat ; storage-0 is ok",
                       ctrl.this_controller,
                       hbs_cluster_network_name(n).c_str());
            }
            if (ctrl.storage_0_not_responding_count[n])
                ctrl.storage_0_not_responding_count[n] = 0 ;
        }
        /* Count the storage-0 not responding case for this network. */
        else
        {
            ctrl.storage_0_not_responding_count[n]++ ;
            if ( ctrl.storage_0_not_responding_count[n] == 2 )
            {
                ilog ("controller-%d %s heartbeat ; storage-0 has 2 misses",
                       ctrl.this_controller,
                       hbs_cluster_network_name(n).c_str() );
            }
        }

        /* Handle storage-0 status change from responding to not responding. */
        if (( history_ptr->storage0_responding == true ) &&
            ( ctrl.storage_0_not_responding_count[n] >= STORAGE_0_NR_THRESHOLD ))
        {
            history_ptr->storage0_responding = false ;
            ilog ("controller-%d %s heartbeat ; storage-0 is not responding",
                   ctrl.this_controller,
                   hbs_cluster_network_name(n).c_str() );
        }
    }
    else
    {
        /* Typical path for storage-0 disabled or normal non-storage system case */
        if ( history_ptr->storage0_responding == true )
            history_ptr->storage0_responding = false ;

        /* Handle clearing threshold count when storage-0 is not enabled. */
        if ( ctrl.storage_0_not_responding_count[n] )
            ctrl.storage_0_not_responding_count[n] = 0 ;
    }

    /*
     * Manage the history entry index.
     *
     * 'oldest_entry_index' is the slot that gets overwritten by this
     * update. The slot just before it (with wrap around) holds the
     * previous update and is only used for change detection below.
     */
    unsigned short last_entry_index ;
    if ( history_ptr->oldest_entry_index == 0 )
    {
        /* Go to the end of the array. */
        last_entry_index = MTCE_HBS_HISTORY_ENTRIES-1 ;
    }
    else
    {
        /* Otherwise, the previous index in the array */
        last_entry_index = history_ptr->oldest_entry_index - 1 ;
    }

    /* Update the history with this data. */
    history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled = ctrl.monitored_hosts ;
    history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding = ctrl.monitored_hosts - not_responding_hosts ;

    if (( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled !=
          history_ptr->entry[                last_entry_index].hosts_enabled ) ||
        ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding !=
          history_ptr->entry[                last_entry_index].hosts_responding))
    {
        /* Only log on change events. */
        if ( history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled ==
             history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding )
        {
            ilog ("controller-%d %s cluster of %d is healthy",
                   ctrl.this_controller,
                   hbs_cluster_network_name(n).c_str(),
                   history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled);
        }
        else
        {
            ilog ("controller-%d %s cluster of %d with %d responding",
                   ctrl.this_controller,
                   hbs_cluster_network_name(n).c_str(),
                   history_ptr->entry[history_ptr->oldest_entry_index].hosts_enabled,
                   history_ptr->entry[history_ptr->oldest_entry_index].hosts_responding);
        }
    }

    /* Increment the entries count till it reaches the max. */
    if ( history_ptr->entries < MTCE_HBS_HISTORY_ENTRIES )
        history_ptr->entries++ ;

    /* Manage the next entry update index ; aka the oldest index. */
    if ( history_ptr->oldest_entry_index == (MTCE_HBS_HISTORY_ENTRIES-1))
        history_ptr->oldest_entry_index = 0 ;
    else
        history_ptr->oldest_entry_index++ ;

    /* clear the log throttle if we are updating history ok. */
    ctrl.log_throttle = 0 ;
}
/****************************************************************************
 *
 * Name       : hbs_cluster_append
 *
 * Description: Add this controller's cluster info to this pulse
 *              request message.
 *
 *              The vault header fields are copied individually and then
 *              each history element that belongs to this controller is
 *              copied, in order, to the front of the message history
 *              array. msg.cluster.histories is set to the number of
 *              elements actually copied.
 *
 * Parameters : msg - the pulse request message to append to
 *
 * Returns    : None ; returns early, without touching msg, if the
 *              controller / network numbers fail the sanity check.
 *
 ***************************************************************************/
void hbs_cluster_append ( hbs_message_type & msg )
{
    unsigned short c = ctrl.this_controller ;

    CHECK_CTRL_NTWK_PARMS(c, ctrl.monitored_networks);

    msg.cluster.version          = ctrl.cluster.version ;
    msg.cluster.revision         = ctrl.cluster.revision ;
    msg.cluster.magic_number     = ctrl.cluster.magic_number ;
    msg.cluster.period_msec      = ctrl.cluster.period_msec ;
    msg.cluster.storage0_enabled = ctrl.cluster.storage0_enabled ;

    /* Copy this controller's history elements into the message.
     *
     * History slots are allocated in first use order, not by controller
     * number, so each slot is matched on its 'controller' field. Only
     * whole history elements are copied ; the vault header is not part
     * of the history array and must not be counted in the copy length. */
    unsigned short copied = 0 ;
    for ( int h = 0 ;
          ( h < ctrl.cluster.histories ) &&
          ( h < MTCE_HBS_MAX_HISTORY_ELEMENTS ) ;
          h++ )
    {
        if ( ctrl.cluster.history[h].controller != c )
            continue ;

        memcpy ( &msg.cluster.history[copied],
                 &ctrl.cluster.history[h],
                 sizeof(mtce_hbs_cluster_history_type));
        copied++ ;
    }
    msg.cluster.histories = copied ;

    int bytes = (int)(sizeof(mtce_hbs_cluster_history_type) * copied) ;
    clog1 ("controller-%d appending cluster info to heartbeat message (%d:%d:%d)",
            c, ctrl.monitored_networks, copied, bytes );
}
/****************************************************************************
 *
 * Name       : hbs_cluster_unused_bytes
 *
 * Description: Used to set how much data to send in the heartbeat pulse
 *              requests.
 *
 * Returns    : The number of bytes that are not used in the full
 *              history array cluster structure ; 0 if the history count
 *              exceeds the array size.
 *
 ***************************************************************************/
unsigned short hbs_cluster_unused_bytes ( void )
{
    /* An out of range history count leaves nothing to subtract. */
    if ( ctrl.cluster.histories > MTCE_HBS_MAX_HISTORY_ELEMENTS )
        return 0 ;

    unsigned short unused_elements =
        MTCE_HBS_MAX_HISTORY_ELEMENTS - ctrl.cluster.histories ;

    return ((unsigned short)
            (sizeof(mtce_hbs_cluster_history_type) * unused_elements)) ;
}
/****************************************************************************
 *
 * Name       : hbs_cluster_send
 *
 * Description: Send the cluster vault to SM.
 *
 *              While WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS is defined, and
 *              the peer controller is enabled, this controller's
 *              histories are temporarily duplicated as the peer
 *              controller's histories for the duration of the send and
 *              removed again before returning.
 *
 * Parameters : sm_client_sock - socket used to reach SM ; nothing is
 *                               sent if it is NULL or not ok
 *              reqid          - SM request id echoed back in the vault
 *
 * Returns    : Nothing
 *
 ***************************************************************************/
/* NOTE: All code wrapped in this directive will be removed once
 *       active/active heartbeating is delivered in next update */
#define WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid )
{
#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
    /* To assist SM with duplex integration ...
     *
     * This code emulates heartbeat redundancy by duplicating
     * controller history up to the number of provisioned
     * controllers until active-active heartbeat is delivered.
     */
    int  peer_controller ;
    bool copy_cluster = false ;
    if ( ctrl.this_controller == 0 )
    {
        peer_controller = 1 ;
        if ( ctrl.controller_1_enabled )
        {
            copy_cluster = true ;
        }
    }
    else
    {
        peer_controller = 0 ;
        if ( ctrl.controller_0_enabled )
        {
            copy_cluster = true ;
        }
    }

    /* Number of real histories to duplicate and the number of
     * emulated histories actually appended ; cleanup relies on it. */
    int networks = ctrl.cluster.histories ;
    int added    = 0 ;
    if ( copy_cluster )
    {
        for ( int n = 0 ; n < networks ; n++ )
        {
            /* Never write past the end of the history array. */
            if ( ctrl.cluster.histories >= MTCE_HBS_MAX_HISTORY_ELEMENTS )
            {
                wlog ("no room to emulate peer controller history (%d of %d used)",
                       ctrl.cluster.histories,
                       (int)MTCE_HBS_MAX_HISTORY_ELEMENTS );
                break ;
            }

            /* copy this controller history to create peer controller */
            ctrl.cluster.history[ctrl.cluster.histories] = ctrl.cluster.history[n] ;

            /* update the controller */
            ctrl.cluster.history[ctrl.cluster.histories].controller = peer_controller ;

            ctrl.cluster.bytes += sizeof(mtce_hbs_cluster_history_type) ;
            ctrl.cluster.histories++ ;
            added++ ;
        }
    }
#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS

    ctrl.cluster.reqid = (unsigned short)reqid ;
    if (( sm_client_sock ) && ( sm_client_sock->sock_ok() == true ))
    {
        /* Only send the populated part of the vault. */
        int len = sizeof(mtce_hbs_cluster_type)-hbs_cluster_unused_bytes();
        int bytes = sm_client_sock->write((char*)&ctrl.cluster, len);
        if ( bytes <= 0 )
        {
            elog ("failed to send cluster vault to SM (bytes=%d) (%d:%s)\n",
                   bytes , errno, strerror(errno));
        }
        else
        {
            ilog ("heartbeat cluster vault sent to SM (%d bytes)", len );
            hbs_cluster_dump ( ctrl.cluster );
        }
    }

#ifdef WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
    /* Remove the emulated peer histories, newest first, so the vault
     * again holds only this controller's own histories. */
    for ( ; added > 0 ; added-- )
    {
        hbs_cluster_history_init(ctrl.cluster.history[ctrl.cluster.histories-1]);
        ctrl.cluster.bytes -= sizeof(mtce_hbs_cluster_history_type);
        ctrl.cluster.histories-- ;
    }
#endif // WANT_ACTIVE_ACTIVE_HEARTBEAT_RESULTS
}
/* Log this controller's own cluster vault ; convenience overload that
 * supplies the module's working vault to the three argument
 * hbs_cluster_log. */
void hbs_cluster_log ( string & hostname, string prefix )
{
    mtce_hbs_cluster_type & working_vault = ctrl.cluster ;
    hbs_cluster_log ( hostname, working_vault, prefix );
}
/****************************************************************************
 *
 * Active Active Heartbeating and Debug Member Functions
 *
 ***************************************************************************/

/****************************************************************************
 *
 * Name       : hbs_cluster_cmp
 *
 * Description: Performs a sanity check of the cluster header carried in
 *              a message against this controller's cluster header. The
 *              first mismatch found is logged as a warning.
 *
 *              Checked in order: version (message older than local),
 *              revision, magic number, heartbeat period and storage-0
 *              enabled state.
 *
 * Assumptions: Debug tool, not called at runtime.
 *
 * Returns    : PASS or FAIL
 *
 ***************************************************************************/
int hbs_cluster_cmp( hbs_message_type & msg )
{
    if ( msg.cluster.version < ctrl.cluster.version )
    {
        wlog ("Unexpected version (%d:%d)",
               msg.cluster.version, ctrl.cluster.version );
        return (FAIL);
    }

    if ( msg.cluster.revision != ctrl.cluster.revision )
    {
        wlog ("Unexpected revision (%d:%d)",
               msg.cluster.revision, ctrl.cluster.revision );
        return (FAIL);
    }

    if ( msg.cluster.magic_number != ctrl.cluster.magic_number )
    {
        wlog ("Unexpected magic number (%d:%d)",
               msg.cluster.magic_number, ctrl.cluster.magic_number );
        return (FAIL);
    }

    if ( msg.cluster.period_msec != ctrl.cluster.period_msec )
    {
        wlog ("Cluster Heartbeat period delta (%d:%d)",
               msg.cluster.period_msec, ctrl.cluster.period_msec );
        return (FAIL);
    }

    if ( msg.cluster.storage0_enabled != ctrl.cluster.storage0_enabled )
    {
        wlog ("Cluster storage0 enabled state delta (%d:%d)",
               msg.cluster.storage0_enabled, ctrl.cluster.storage0_enabled );
        return (FAIL);
    }

    return (PASS);
}
/****************************************************************************
 *
 * Name       : hbs_cluster_save
 *
 * Description: Intended to copy the other controller's information from
 *              msg into the cluster.
 *
 *              NOTE: Does not do that right now. The message is not
 *                    examined ; when hosts are being monitored the local
 *                    cluster is logged with the network name as the log
 *                    prefix.
 *
 * Assumptions: Place holder until active/active heartbeating is
 *              implemented.
 *
 * Parameters : hostname - host name passed through to hbs_cluster_log
 *              network  - cluster network the message arrived on
 *              msg      - pulse response message ; currently unused
 *
 * Returns    : PASS
 *
 ***************************************************************************/
int hbs_cluster_save ( string & hostname,
                       mtce_hbs_network_enum network,
                       hbs_message_type & msg )
{
    /* The message content is not consumed yet. */
    UNUSED(msg);

    /* Nothing to log while no hosts are monitored. */
    if ( ctrl.monitored_hosts == 0 )
        return (PASS);

    string network_name = hbs_cluster_network_name(network) ;
    hbs_cluster_log( hostname, ctrl.cluster, network_name );
    return (PASS);
}

View File

@ -0,0 +1,86 @@
/*
* Copyright (c) 2018 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* @file StarlingX Maintenance Heartbeat Cluster Manager Module
*
*************************************************************************
*
* This module provides API for the hbsAgent service to call to
* collect, store and send heartbeat cluster information to SM
* upon request. See mtceHbsCluster.h for the formal API between SM and Mtce.
*
*************************************************************************/
#ifndef __HBSCLUSTER_H__
#define __HBSCLUSTER_H__
using namespace std;
#include "mtceHbsCluster.h" /* for ... the public API */
/****************************************************************************
 *
 * Name       : BYTES_IN_CLUSTER_VAULT
 *
 * Description: Calculates the number of bytes in the cluster vault based on
 *              the number of valid history array elements included ; that
 *              is the size of the full vault minus the size of the unused
 *              history elements.
 *
 * Parameters : e - number of populated history elements. The argument is
 *                  parenthesized so that expressions such as 'a+b' are
 *                  evaluated as a unit.
 *
 ***************************************************************************/
#define BYTES_IN_CLUSTER_VAULT(e) \
    (sizeof(mtce_hbs_cluster_type)-(sizeof(mtce_hbs_cluster_history_type)*(MTCE_HBS_MAX_HISTORY_ELEMENTS-(e))))
/****************************************************************************
 *
 * Name       : CHECK_CTRL_NTWK_PARMS
 *
 * Description: Sanity check a controller number and a monitored network
 *              count. On failure a software error log is produced and the
 *              calling function returns ; for use in void functions only.
 *
 * Parameters : c - controller number ; valid values are
 *                  0 .. MTCE_HBS_MAX_CONTROLLERS-1
 *              n - number of networks ; valid values are
 *                  0 .. MTCE_HBS_NETWORKS
 *
 ***************************************************************************/
#define CHECK_CTRL_NTWK_PARMS(c,n)                     \
    if (( (c) >= MTCE_HBS_MAX_CONTROLLERS ) ||         \
        ( (n) >  MTCE_HBS_NETWORKS ))                  \
    {                                                  \
        slog ("Invalid parameter: %d:%d", (c), (n));   \
        return ;                                       \
    }
/****************************************************************************
 *
 * Name       : GET_CLUSTER_HISTORY_PTR
 *
 * Description: Search the populated history elements of a cluster for
 *              the element that matches the specified controller and
 *              network. On a match the address of that element is
 *              assigned to a variable named 'history_ptr' that the caller
 *              must have in scope ; it is left untouched otherwise. The
 *              search stops at the first match.
 *
 * Parameters : cluster - the cluster (vault) to search ; an lvalue
 *              c       - controller number to match
 *              n       - network number to match
 *
 ***************************************************************************/
#define GET_CLUSTER_HISTORY_PTR(cluster, c,n)                  \
    for ( int h = 0 ; h < (cluster).histories ; h++ )          \
    {                                                          \
        if (( (cluster).history[h].controller == (c) ) &&      \
            ( (cluster).history[h].network    == (n) ))        \
        {                                                      \
            history_ptr = &(cluster).history[h] ;              \
            break ;                                            \
        }                                                      \
    }
/****************************************************************************
 *
 * Name       : SET_CONTROLLER_HOSTNAME
 *
 * Description: Convert a controller number to its host name. The result
 *              is assigned to a variable named 'controller' that the
 *              caller must have in scope ; "unknown" is assigned for any
 *              number other than 0, 1 or 2.
 *
 * Parameters : c - controller number
 *
 * Note       : The last line of the macro must not end with a line
 *              continuation or the line that follows the macro becomes
 *              part of it.
 *
 ***************************************************************************/
#define SET_CONTROLLER_HOSTNAME(c)          \
    do                                      \
    {                                       \
        if ( (c) == 0 )                     \
            controller = CONTROLLER_0 ;     \
        else if ( (c) == 1 )                \
            controller = CONTROLLER_1 ;     \
        else if ( (c) == 2 )                \
            controller = CONTROLLER_2 ;     \
        else                                \
            controller = "unknown" ;        \
    } while (0)
#endif // __HBSCLUSTER_H__

View File

@ -0,0 +1,346 @@
/*
* Copyright (c) 2018 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* @file Maintenance Heartbeat Utilities Module
*
*************************************************************************
*
* This module provides heartbeat utilities that are common to both
* hbsAgent and hbsClient.
*
*************************************************************************/
using namespace std;
#include "daemon_common.h" /* common daemon constructs and definitions */
#include "hbsBase.h" /* mtce heartbeat constructs and definitions */
/* hbs_cluster_log utility support ; per history element log control. */
bool first_log[MTCE_HBS_MAX_HISTORY_ELEMENTS]; /* has first history log out */
bool was_diff [MTCE_HBS_MAX_HISTORY_ELEMENTS]; /* was there a history diff  */

/****************************************************************************
 *
 * Name       : hbs_utils_init
 *
 * Description: Module Init function ; clears the hbs_cluster_log
 *              control flags of every history element.
 *
 * Returns    : Nothing
 *
 ***************************************************************************/
void hbs_utils_init ( void )
{
    for ( int h = 0 ; h < MTCE_HBS_MAX_HISTORY_ELEMENTS ; h++ )
    {
        first_log[h] = false ;
        was_diff [h] = false ;
    }
}
/****************************************************************************
 *
 * Name       : hbs_cluster_history_init
 *
 * Description: Initialize a cluster history element ; the element is
 *              cleared and its 'entries_max' is then set to the size of
 *              the entry array.
 *
 * Parameters : Reference to a mtce_hbs_cluster_history_type (history element)
 *
 * Returns    : Nothing
 *
 ***************************************************************************/
void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history )
{
    /* Clear first ; entries_max is the only non-zero default. */
    MEMSET_ZERO(history);

    const unsigned short max_entries = MTCE_HBS_HISTORY_ENTRIES ;
    history.entries_max = max_entries ;
}
/****************************************************************************
 *
 * Name       : hbs_cluster_history_clear
 *
 * Description: Clear all history in the cluster vault by
 *              re-initializing each of the 'histories' populated
 *              elements. The history count itself is not changed.
 *
 * Parameters : mtce_hbs_cluster_type instance : the vault.
 *
 * Returns    : Nothing
 *
 ***************************************************************************/
void hbs_cluster_history_clear ( mtce_hbs_cluster_type & cluster )
{
    /* A zero history count simply skips the loop. */
    int populated = cluster.histories ;
    for ( int index = 0 ; index < populated ; ++index )
    {
        hbs_cluster_history_init ( cluster.history[index] ) ;
    }
}
/****************************************************************************
 *
 * Name       : hbs_cluster_network_name
 *
 * Description: Converts a heartbeat cluster network id to its
 *              network name.
 *
 * Parameters : network id
 *
 * Returns    : network name as a string ; "unknown", along with a
 *              software error log, for an id that is not recognized.
 *
 ***************************************************************************/
string hbs_cluster_network_name ( mtce_hbs_network_enum network )
{
    if ( network == MTCE_HBS_NETWORK_MGMT )
        return ("Mgmnt");

    if ( network == MTCE_HBS_NETWORK_INFRA )
        return ("Infra");

#ifdef MONITORED_OAM_NETWORK
    if ( network == MTCE_HBS_NETWORK_OAM )
        return ("Oam");
#endif

    slog ("invalid network enum (%d)", network );
    return ("unknown");
}
/****************************************************************************
*
* Name : hbs_cluster_log
*
* Description : Logs changes to the heartbeat cluster.
*
* Only histories whose entry array is completely filled are
* considered. For each of those the entries are rendered
* oldest to newest as "enabled:responding " ; an entry that
* equals the one before it is rendered as ". ".
*
* A history line is logged when
* - an entry differs from the entry before it, or
* - nothing has been logged for that history slot yet, or
* - the previous call saw a difference and this call does
* not (one closing log).
*
* Parameters : hostname - host name shown in the log ; compared
* against the history's controller name to
* choose between the 'view' and 'view from'
* log formats
* cluster - the heartbeat cluster structure
* log_prefix - text included in each log line
*
* Updates : first_log[] and was_diff[] module flags.
*
* Returns : Nothing
*
***************************************************************************/
void hbs_cluster_log ( string & hostname,
mtce_hbs_cluster_type & cluster,
string log_prefix )
{
// bool want_log = false ;
clog1 ("log %d histories", cluster.histories );
for ( int h = 0 ; h < cluster.histories ; h++ )
{
/* Skip histories that are still filling up. */
if ( cluster.history[h].entries == MTCE_HBS_HISTORY_ENTRIES )
{
#define MAX_CLUSTER_LINE_LEN 100
/* NOTE(review): "9999:9999 " is 10 characters ; with this used as the
 * snprintf size the terminator takes the 10th byte, so a full width
 * entry would lose its trailing space - confirm intent. */
#define MAX_ENTRY_STR_LEN 10 /* "9999:9999 " */
/* 'e' holds the previously rendered entry for change detection. */
mtce_hbs_cluster_entry_type e = { 0, 0 } ;
char str[MAX_CLUSTER_LINE_LEN] ;
string line = "";
int start = 0 ; /* first entry number shown on the current line */
int stop = 0 ; /* count of entries rendered so far */
bool newline = false ; /* current line is full ; flush it */
bool logit = false ; /* something worth logging was found */
bool first = false ; /* at least one line was already flushed */
string controller = "" ; /* set by SET_CONTROLLER_HOSTNAME */
mtce_hbs_cluster_history_type * history_ptr = &cluster.history[h] ;
clog1 ("%s %s has %d entries (controller-%d view from %s)", hostname.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
history_ptr->entries,
history_ptr->controller,
log_prefix.c_str());
/* Manage local this_index for log display.
* Display oldest to newest ; left to right
*
* */
int this_index = history_ptr->oldest_entry_index ;
for ( int count = 0 ; count < history_ptr->entries ; count++ )
{
/* Flag a line flush once another entry may not fit. */
if (( line.length() + MAX_ENTRY_STR_LEN ) >=
MAX_CLUSTER_LINE_LEN )
{
newline = true ;
}
#ifdef WANT_MINIMAL_LOGS
/* TODO: enable in final update */
if (( first_log[h] == true ) && ( newline == false ) &&
( history_ptr->entry[this_index].hosts_enabled ==
history_ptr->entry[this_index].hosts_responding ))
{
line.append(". ");
continue ;
}
#endif
// want_log = true ;
/* The oldest entry is always rendered with its values. */
if ( count == 0 )
{
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
history_ptr->entry[this_index].hosts_enabled,
history_ptr->entry[this_index].hosts_responding ); // , this_index );
line.append (str);
str[0] = '\0' ;
}
//#ifdef WANT_DOTS
/* Same as the previous entry ; render as a dot. */
else if (( history_ptr->entry[this_index].hosts_enabled ==
e.hosts_enabled ) &&
( history_ptr->entry[this_index].hosts_responding ==
e.hosts_responding ))
{
line.append(". ");
}
//#endif
/* Entry differs from the previous one ; render the values and
 * remember that a difference was seen for this history slot. */
else
{
snprintf (&str[0], MAX_ENTRY_STR_LEN , "%d:%d ", // -%d",
history_ptr->entry[this_index].hosts_enabled,
history_ptr->entry[this_index].hosts_responding ); // , this_index );
line.append (str);
str[0] = '\0' ;
logit = true ;
was_diff[h] = true ;
}
/* Force the very first log for this history slot. */
if (( logit == false ) && ( first_log[h] == false ))
{
first_log[h] = true ;
logit = true ;
}
stop++ ;
/* Flush a full line and start a new one. */
if ( newline == true )
{
if ( logit )
{
SET_CONTROLLER_HOSTNAME(history_ptr->controller);
if ( hostname == controller )
{
clog ("%s view %s %s %02d..%02d: %s,",
hostname.c_str(),
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
else
{
clog ("%s view from %s %s %s %02d..%02d: %s,",
controller.c_str(),
hostname.c_str(),
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
}
start = stop + 1 ;
line.clear();
first = true ;
newline = false ;
}
/* Remember this entry for the next iteration's comparison. */
e = history_ptr->entry[this_index] ;
/* manage index tracking */
if ( this_index == (MTCE_HBS_HISTORY_ENTRIES-1))
this_index = 0 ;
else
this_index++ ;
}
/* Handle whatever is left on the last, partially filled, line. */
if (( newline == false ) && ( line.length() ))
{
/* A difference was seen on the previous call but not on this
 * one ; log once more and clear the difference flag. */
if (( logit == false ) && ( was_diff[h] == true ))
{
logit = true ;
was_diff[h] = false ;
}
if ( logit )
{
/* Continuation of an already flushed line. */
if ( first )
{
clog ("............ %s %s %02d..%02d: %s",
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
else
{
SET_CONTROLLER_HOSTNAME(history_ptr->controller);
if ( hostname == controller )
{
clog ("%s view %s %s %02d..%02d: %s",
hostname.c_str(),
log_prefix.c_str(),
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
else
{
clog ("%s view from %s %s %s %02d..%02d: %s",
controller.c_str(),
hostname.c_str(),
log_prefix.c_str(), /* Infra <- */
hbs_cluster_network_name((mtce_hbs_network_enum)history_ptr->network).c_str(),
start, stop, line.c_str());
}
}
}
else
{
was_diff[h] = false ;
}
}
}
}
}
/****************************************************************************
 *
 * Name       : hbs_cluster_dump
 *
 * Description: Formatted dump of the vault contents to the log file.
 *
 *              One summary line is logged for the vault followed by one
 *              line per history. Each history line lists every entry as
 *              "[enabled:responding]" with a '>' in front of the entry
 *              at 'oldest_entry_index'.
 *
 * Parameters : vault - the cluster vault to dump
 *
 * Returns    : Nothing
 *
 ***************************************************************************/
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault )
{
    syslog ( LOG_INFO, "Cluster Vault Dump: --------------------------------------------------------------------------------------------");
    syslog ( LOG_INFO, "Cluster Vault: v%d.%d %d msec period ; SM Reqid is %d with storage-0 %s and %d histories in %d bytes",
             vault.version,
             vault.revision,
             vault.period_msec,
             vault.reqid,
             vault.storage0_enabled ? "enabled" : "disabled",
             vault.histories,
             vault.bytes );

    /* The counts come from the vault itself ; bound both loops by the
     * real array sizes so that a bad count cannot cause an out of
     * bounds read. */
    for ( int h = 0 ;
          ( h < vault.histories ) && ( h < MTCE_HBS_MAX_HISTORY_ELEMENTS ) ;
          h++ )
    {
        #define MAX_LINE_LEN (500)
        char str[MAX_LINE_LEN] ;
        int  used = 0 ;

        /* Keep the string valid even if there are no entries to add. */
        str[0] = '\0' ;

        int entries = vault.history[h].entries_max ;
        if ( entries > MTCE_HBS_HISTORY_ENTRIES )
            entries = MTCE_HBS_HISTORY_ENTRIES ;

        for ( int e = 0 ; e < entries ; e++ )
        {
            /* Only offer snprintf the space that is left in the buffer. */
            int room = (int)sizeof(str) - used ;
            int rc = snprintf ( &str[used], room, "%c[%d:%d]" ,
                                vault.history[h].oldest_entry_index==e ? '>' : ' ',
                                vault.history[h].entry[e].hosts_enabled,
                                vault.history[h].entry[e].hosts_responding);

            /* Stop on an output error or once the buffer is full. */
            if (( rc < 0 ) || ( rc >= room ))
                break ;
            used += rc ;
        }
        syslog ( LOG_INFO, "Cluster Vault: C%d %s S:%s:%s (%d:%d) %s",
                 vault.history[h].controller,
                 hbs_cluster_network_name((mtce_hbs_network_enum)vault.history[h].network).c_str(),
                 vault.storage0_enabled ? "y" : "n",
                 vault.history[h].storage0_responding ? "y" : "n",
                 vault.history[h].entries_max,
                 vault.history[h].entries,
                 str);
    }
}

View File

@ -0,0 +1,109 @@
/*
* Copyright (c) 2018 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
* @file StarlingX Maintenance Heartbeat Cluster Manager Module
*
*************************************************************************
*
* This header defines the heartbeat cluster data structure that the
* hbsAgent service collects, stores and sends to SM upon request.
* It is the formal API between SM and Mtce.
*
*************************************************************************/
#ifndef __MTCEHBSCLUSTER_H__
#define __MTCEHBSCLUSTER_H__
#include <sys/types.h>
/**************************************************************
* Implementation Structure
*************************************************************/
/* Values written to the version, revision and magic_number
 * fields of the cluster header. */
#define MTCE_HBS_CLUSTER_VERSION (1)
#define MTCE_HBS_CLUSTER_REVISION (0)
#define MTCE_HBS_MAGIC_NUMBER (0x5aa5)
/* Heartbeat cluster network ids ; the final enumerator,
 * MTCE_HBS_NETWORKS, is the number of networks. */
typedef enum
{
MTCE_HBS_NETWORK_MGMT = 0,
MTCE_HBS_NETWORK_INFRA = 1,
#ifdef MONITORED_OAM_NETWORK
MTCE_HBS_NETWORK_OAM,
#endif
MTCE_HBS_NETWORKS
} mtce_hbs_network_enum ;
/* Maximum number of controllers that can report history. */
#ifdef THREE_CONTROLLER_SYSTEM
#define MTCE_HBS_MAX_CONTROLLERS (3)
#else
#define MTCE_HBS_MAX_CONTROLLERS (2)
#endif
/* Maximum number of monitored networks ; same value as the
 * MTCE_HBS_NETWORKS enumerator for either build option. */
#ifdef MONITORED_OAM_NETWORK
#define MTCE_HBS_MAX_NETWORKS (3)
#else
#define MTCE_HBS_MAX_NETWORKS (2)
#endif
/* value of 20 at 100 msec period is 2 seconds of history */
#define MTCE_HBS_HISTORY_ENTRIES (20)
/* maximum number of history elements permitted in a cluster history summary */
#define MTCE_HBS_MAX_HISTORY_ELEMENTS ((MTCE_HBS_MAX_CONTROLLERS)*(MTCE_HBS_NETWORKS))
/* Wraps a typedef name so that the structure is packed. */
#ifndef ALIGN_PACK
#define ALIGN_PACK(x) __attribute__((packed)) x
#endif
/* A single element of Heartbeat Cluster History for one heartbeat period */
typedef struct
{
unsigned short hosts_enabled ; /* # of hosts being hb monitored */
unsigned short hosts_responding ; /* # of hosts that responded to hb */
} ALIGN_PACK(mtce_hbs_cluster_entry_type);
/* Heartbeat Cluster History for one monitored network of one Controller.
 *
 * The four bit-fields share a single 16 bit unsigned short. */
typedef struct
{
unsigned short controller :4 ; /* value 0 or 1 (and 2 in future) */
unsigned short network :4 ; /* see mtce_hbs_network_enum */
unsigned short reserved_bits :7 ; /* future - initialized to 0 */
unsigned short storage0_responding:1 ; /* 1 = storage-0 is hb healthy */
unsigned short entries ; /* # of valid values in .entry */
unsigned short entries_max ; /* max size of the entry array */
unsigned short oldest_entry_index ; /* the oldest entry in the array */
/* historical array of entries for a specific network */
mtce_hbs_cluster_entry_type entry [MTCE_HBS_HISTORY_ENTRIES] ;
} ALIGN_PACK(mtce_hbs_cluster_history_type) ;
/* Heartbeat Cluster History for all monitored networks of all Controllers.
 * Packed: a 4 byte static header, 8 bytes of control data, then the
 * array of per controller/network histories. */
typedef struct
{
/* Header - Static Data - 4 bytes */
unsigned char version ; /* public API MTCE_HBS_CLUSTER_VERSION */
unsigned char revision ; /* public API MTCE_HBS_CLUSTER_REVISION */
unsigned short magic_number ; /* public API MTCE_HBS_MAGIC_NUMBER */
/* Control - Dynamic Data - 8 bytes */
unsigned short reqid ; /* added from SM cluster request */
unsigned short period_msec ; /* heartbeat period in milliseconds */
unsigned short bytes ; /* total struct size self check */
unsigned char storage0_enabled; /* bool containing true or false */
unsigned char histories ; /* How many history elements follow */
/* Array of Cluster History
*
* - histories above specifies how many
* elements of this array are populated.
*/
mtce_hbs_cluster_history_type history [MTCE_HBS_MAX_HISTORY_ELEMENTS] ;
} ALIGN_PACK(mtce_hbs_cluster_type) ;
#endif // __MTCEHBSCLUSTER_H__

View File

@ -23,6 +23,7 @@ SRCS += mtcKeyApi.cpp
SRCS += mtcCmdHdlr.cpp
SRCS += mtcNodeMnfa.cpp
SRCS += mtcVimApi.cpp
SRCS += mtcStubs.cpp
COMPUTE_OBJS = mtcNodeComp.o
COMPUTE_OBJS += mtcCompMsg.o

View File

@ -1935,8 +1935,10 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
* the host has not reset yet we have disabled services
* then now we need to reset the host to prevent VM duplication
* by forcing a full enable */
if (( node_ptr->uptime_save != 0 ) &&
( node_ptr->uptime >= node_ptr->uptime_save ))
if ((( node_ptr->uptime_save != 0 ) &&
( node_ptr->uptime >= node_ptr->uptime_save )) ||
(( node_ptr->uptime_save == 0 ) &&
( node_ptr->uptime > MTC_MINS_15 )))
{
ilog ("%s regained MTCALIVE from host that did not reboot (uptime:%d)\n",
node_ptr->hostname.c_str(), node_ptr->uptime );

View File

@ -0,0 +1,17 @@
/*
* Copyright (c) 2013, 2016 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
* @file
* Maintenance Agent Stubs
*/
using namespace std;
#include "nodeClass.h" /* The main link class */
/* No-op stub of hbs_cluster_log for the maintenance agent: takes no
 * arguments, returns nothing and does nothing.
 * NOTE(review): presumably present only to satisfy a reference from
 * code shared with the heartbeat service - confirm. */
void hbs_cluster_log ( void )
{
    /* intentionally empty */
}

40
mtce/src/scripts/hbs-query Executable file
View File

@ -0,0 +1,40 @@
#!/bin/bash
# Copyright (c) 2013-2016 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# This utility sends a heartbeat 'cluster_info' query, formatted as if it
# originated from service management (origin "sm"), over the loopback
# interface.
#
# Usage: hbs-query [reqid]
#
# The request is a JSON string carrying ${1} as the request id (123 when
# no argument is given). It is sent as a UDP datagram to the port
# configured as 'sm_server_port' in /etc/mtc.ini.
#
# Exit status: 0 on success, NOT_INSTALLED when socat is missing,
# GENERIC_ERROR when the port cannot be read from the config file,
# otherwise the exit status of socat.
#

# Linux Standard Base (LSB) Error Codes
RETVAL=0
GENERIC_ERROR=1
INVALID_ARGS=2
UNSUPPORTED_FEATURE=3
NOT_INSTALLED=5
NOT_RUNNING=7

PROTOCOL="UDP4-DATAGRAM"
ADDRESS="127.0.0.1"
CONFIG_FILE="/etc/mtc.ini"

# socat is the transport; nothing can be sent without it
socat_exec=$(command -v socat 2> /dev/null)
if [ -z "${socat_exec}" ] ; then
    logger "Error: $0 cannot find socat exec"
    exit ${NOT_INSTALLED}
fi

# request id: first argument when given and non-empty, otherwise 123
reqid=${1:-123}

# config lines look like '<key> <sep> <value> ...'; the value is field 3.
# Only the first matching line is used.
port=$(awk '$1 == "sm_server_port" { print $3; exit }' "${CONFIG_FILE}" 2> /dev/null)
if [ -z "${port}" ] ; then
    logger "Error: $0 cannot find sm_server_port in ${CONFIG_FILE}"
    exit ${GENERIC_ERROR}
fi

echo "{\"origin\":\"sm\", \"service\":\"heartbeat\", \"request\":\"cluster_info\", \"reqid\": $reqid }" | socat - "${PROTOCOL}:${ADDRESS}:${port}"

# report a failed send instead of always claiming success
RETVAL=$?

exit ${RETVAL}