Implement Active-Active Heartbeat as HA Improvement
This update introduces mtce changes to support Active-Active Heartbeating. The purpose of Active-Active Heartbeating is to help avoid Split-Brain. Active-Active heartbeating has each controller maintain a 5 second heartbeat response history cache of each network for all monitored hosts as well as the on-going health of storage-0 if provisioned and enabled. This is referred to as the 'heartbeat cluster history'. Each controller then includes its cluster history in each heartbeat pulse request message. The hbsClient, now modified to handle heartbeat from both controllers, saves each controller's heartbeat cluster history in a local cache and criss-crosses the data in its pulse responses. So when the hbsClient receives a pulse request from controller-0 it saves its reported history and then replaces that history information in its response to controller-0 with what it saved from controller-1's last pulse request ; i.e. its view of the system. Controller-0, receiving a host's pulse response, saves its peer's heartbeat cluster history so that it has a summary of heartbeat cluster history for the last 5 seconds for each monitored network of every monitored host in the system from both controllers' perspectives. Same for controller-1 with controller-0's history. The hbsAgent is then further enhanced to support a query request for this information. So now SM, when it needs to make a decision to avoid Split-Brain or otherwise, can query either controller for its heartbeat cluster history and get the last 5 second summary view of heartbeat (network) responsiveness from both controllers' perspectives to help decide which controller to make active. This involved removing the hbsAgent process from SM control and monitoring and adding a new hbsAgent LSB init script for process launch, a service file to run the init script and a pmon config file for hbsAgent process monitoring. 
With hbsAgent now running on both controllers, changes to maintenance were required to send inventory to hbsAgent on both controllers, listen for hbsAgent event messages over the management interface and inform both hbsAgents which controller is active. The hbsAgent running on the inactive controller: - does not send heartbeat events to maintenance - does not raise or clear alarms or produce customer logs. Test Plan: Feature: PASS: Verify hbsAgent runs on both controllers PASS: Verify hbsAgent as pmon monitored process (not SM) PASS: Verify system install and cluster collection in all system types (10+) PASS: Verify active controller hbsAgent detects and handles heartbeat loss PASS: Verify inactive controller hbsAgent detects and logs heartbeat loss PASS: Verify heartbeat cluster history collection functions properly. PASS: Verify storage-0 state tracking in cluster info. PASS: Verify storage-0 not responding handling PASS: Verify heartbeat response is sent back to only the requesting controller. PASS: Verify heartbeat history is correct from each controller PASS: Verify MNFA from active controller after install to controller-0 PASS: Verify MNFA from active controller after swact to controller-1 PASS: Verify MNFA for 80%+ of the hosts in the storage system PASS: Verify SM cluster query operation and content from both controllers PASS: Verify restart of inactive hbsAgent doesn't clear existing heartbeat alarms Logging: PASS: Verify cluster info logs. PASS: Verify feature design logging. 
PASS: Verify hbsAgent and hbsClient design logs on all hosts add value PASS: Verify design logging from both controllers in heartbeat loss case PASS: Verify design logging from both controllers in MNFA case PASS: Verify clog logs cluster info vault status and updates for controllers PASS: Verify clog1 logs full cluster state change for all hosts PASS: Verify clog2 logs cluster info save/append logs for controllers PASS: Verify clog3 memory dumps a cluster history PASS: Verify USR2 forces heartbeat and cluster info log dump PASS: Verify hourly heartbeat and cluster info log dump PASS: Verify loss events force heartbeat and cluster info log dump Regression: PASS: Verify Large System DOR PASS: Verify pmond regression test that now includes hbsAgent PASS: Verify Lock/Unlock of inactive controller (x3) PASS: Verify Swact behavior (x10) PASS: Verify compute Lock/Unlock PASS: Verify storage-0 Lock/Unlock PASS: Verify compute Host Failure and Graceful Recovery PASS: Verify Graceful Recovery Retry to Max:3 then Full Enable PASS: Verify Delete Host PASS: Verify Patching hbsAgent and hbsClient PASS: Verify event driven cluster push Story: 2003576 Task: 24907 Change-Id: I5baf5bcca23601a99473d039356d58250ffb01b5 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
07818aac5e
commit
0b922227ac
@ -39,7 +39,6 @@ typedef struct
|
||||
{
|
||||
int scheduling_priority ; /**< Scheduling priority of this daemon */
|
||||
bool active ; /**< Maintenance activity state true|false */
|
||||
int hbs_pulse_period ; /**< time (msec) between heartbeat requests */
|
||||
int token_refresh_rate ; /**< token refresh rate in seconds */
|
||||
int hbs_minor_threshold ; /**< heartbeat miss minor threshold */
|
||||
int hbs_degrade_threshold ; /**< heartbeat miss degrade threshold */
|
||||
@ -351,7 +350,7 @@ extern char *program_invocation_short_name;
|
||||
}
|
||||
|
||||
#define blog(format, args...) { \
|
||||
if ( ltc() ) { if(daemon_get_cfg_ptr()->debug_bmgmt) printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
|
||||
if ( ltc() ) { if(daemon_get_cfg_ptr()->debug_bmgmt&1) printf ( "%s [%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, pt(), getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
|
||||
else { if(daemon_get_cfg_ptr()->debug_bmgmt) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: BMgt : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; } \
|
||||
}
|
||||
|
||||
@ -380,22 +379,22 @@ extern char *program_invocation_short_name;
|
||||
#define mlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_msg&4 ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Msg4 : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define mlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_msg&8 ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Msg8 : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define jlog(format, args...) { if(daemon_get_cfg_ptr()->debug_json ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define jlog(format, args...) { if(daemon_get_cfg_ptr()->debug_json&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define jlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_json&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define jlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_json&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define jlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_json&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Json8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define hlog(format, args...) { if(daemon_get_cfg_ptr()->debug_http) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define hlog(format, args...) { if(daemon_get_cfg_ptr()->debug_http&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define hlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_http&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define hlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_http&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define hlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_http&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Http8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define alog(format, args...) { if(daemon_get_cfg_ptr()->debug_alive ) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define alog(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define alog1(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define alog2(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define alog3(format, args...) { if(daemon_get_cfg_ptr()->debug_alive&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Alive8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define qlog(format, args...) { if(daemon_get_cfg_ptr()->debug_work) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define qlog(format, args...) { if(daemon_get_cfg_ptr()->debug_work&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define qlog1(format, args...) { if(daemon_get_cfg_ptr()->debug_work&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define qlog2(format, args...) { if(daemon_get_cfg_ptr()->debug_work&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define qlog3(format, args...) { if(daemon_get_cfg_ptr()->debug_work&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Work8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
@ -403,8 +402,11 @@ extern char *program_invocation_short_name;
|
||||
#define flog(format, args...) { if(daemon_get_cfg_ptr()->debug_fsm) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: FSM : " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define tlog(format, args...) { if(daemon_get_cfg_ptr()->debug_timer) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Timer: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define clog(format, args...) { if(daemon_get_cfg_ptr()->debug_state&1) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Change: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define clog1(format, args...) { if(daemon_get_cfg_ptr()->debug_state&2) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang2: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define clog2(format, args...) { if(daemon_get_cfg_ptr()->debug_state&4) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang4: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define clog3(format, args...) { if(daemon_get_cfg_ptr()->debug_state&8) syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Chang8: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
||||
|
||||
#define log_event(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s: Event: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
#define log_stress(format, args...) { syslog(LOG_INFO, "[%d.%05d] %s %s %-3s %-18s(%4d) %-24s:Stress: " format, getpid(), lc(), _hn(), _pn, __AREA__, __FILE__, __LINE__, __FUNCTION__, ##args) ; }
|
||||
|
@ -233,6 +233,7 @@ const char * get_mtcNodeCommand_str ( int cmd )
|
||||
case MTC_CMD_QRY_HOST: return("query host");
|
||||
case MTC_CMD_START_HOST: return("start host service");
|
||||
case MTC_CMD_STOP_HOST: return("stop host service");
|
||||
case MTC_CMD_ACTIVE_CTRL: return("publish active controller");
|
||||
|
||||
/* VM Instance Commands */
|
||||
case MTC_CMD_ADD_INST: return("add instance");
|
||||
|
@ -359,6 +359,7 @@ void daemon_exit ( void );
|
||||
* a power-off to online transition */
|
||||
#define MTC_MTCALIVE_HITS_TO_GO_ONLINE (5)
|
||||
|
||||
#define CONTROLLER_X ((const char *)"controller-x")
|
||||
#define CONTROLLER_0 ((const char *)"controller-0")
|
||||
#define CONTROLLER_1 ((const char *)"controller-1")
|
||||
#define CONTROLLER_2 ((const char *)"controller-2")
|
||||
@ -526,7 +527,8 @@ typedef struct
|
||||
#define MTC_CMD_MOD_HOST (0x11110012) /* Modify Host */
|
||||
#define MTC_CMD_QRY_HOST (0x11110013) /* Query Host */
|
||||
#define MTC_CMD_START_HOST (0x11110014) /* Start Monitoring Host */
|
||||
#define MTC_CMD_STOP_HOST (0x11110015) /* Stop Moniroting Host */
|
||||
#define MTC_CMD_STOP_HOST (0x11110015) /* Stop Monitoring Host */
|
||||
#define MTC_CMD_ACTIVE_CTRL (0x11110016) /* Active Controller */
|
||||
|
||||
#define MTC_CMD_ADD_INST (0x11110020) /* Add Inst */
|
||||
#define MTC_CMD_DEL_INST (0x11110021) /* Delete Inst */
|
||||
@ -643,6 +645,9 @@ typedef struct
|
||||
#define PMOND_FLAG (0x00000001) /**< Process Monitor O.K. Flag */
|
||||
#define INFRA_FLAG (0x00000002) /**< Infrastructure iface provisioned Flag */
|
||||
|
||||
#define CTRLX_MASK (0x00000300) /**< From/To Controller-0/1/2/3 Number */
|
||||
#define CTRLX_BIT ((unsigned int)8) /**< used to shift right mask into bit 0 */
|
||||
|
||||
#define STALL_MON_FLAG (0x00010000) /**< Flag indicating hang monitor running */
|
||||
#define STALL_REC_FLAG (0x00020000) /**< Flag indicating hbsClient took action */
|
||||
#define STALL_ERR1_FLAG (0x00100000) /**< Error 1 Flag */
|
||||
@ -1217,15 +1222,15 @@ string get_availStatus_str ( mtc_nodeAvailStatus_enum availStatus );
|
||||
string get_operState_str ( mtc_nodeOperState_enum operState );
|
||||
string get_adminState_str ( mtc_nodeAdminState_enum adminState );
|
||||
|
||||
void log_adminAction ( string hostname,
|
||||
mtc_nodeAdminAction_enum currAction,
|
||||
void log_adminAction ( string hostname,
|
||||
mtc_nodeAdminAction_enum currAction,
|
||||
mtc_nodeAdminAction_enum newAction );
|
||||
|
||||
int send_hbs_command ( string hostname, int command );
|
||||
int send_hbs_command ( string hostname, int command, string controller=CONTROLLER );
|
||||
int send_hwmon_command ( string hostname, int command );
|
||||
int send_guest_command ( string hostname, int command );
|
||||
|
||||
int daemon_log_message ( const char * hostname,
|
||||
int daemon_log_message ( const char * hostname,
|
||||
const char * filename,
|
||||
const char * log_str );
|
||||
|
||||
|
@ -48,6 +48,7 @@
|
||||
#define MTC_MINS_20 (1200)
|
||||
#define MTC_MINS_30 (1800)
|
||||
#define MTC_MINS_40 (2400)
|
||||
#define MTC_HRS_1 (3600)
|
||||
#define MTC_HRS_4 (14400)
|
||||
#define MTC_HRS_8 (28800) /* old token refresh rate */
|
||||
|
||||
|
@ -269,7 +269,7 @@ void daemon_dump_cfg ( void )
|
||||
{
|
||||
daemon_config_type * ptr = daemon_get_cfg_ptr();
|
||||
|
||||
ilog ("Configuration Settings\n------------------------------\n");
|
||||
ilog ("Configuration Settings ...\n");
|
||||
if ( ptr->scheduling_priority ) { ilog ("scheduling_priority = %d\n", ptr->scheduling_priority ); }
|
||||
|
||||
if ( ptr->infra_degrade_only ) { ilog ("infra_degrade_only = %s\n", ptr->infra_degrade_only ? "Yes" : "No" );}
|
||||
@ -277,7 +277,6 @@ void daemon_dump_cfg ( void )
|
||||
if ( ptr->active ) { ilog ("active = %s\n", ptr->active ? "Yes" : "No" );}
|
||||
|
||||
/* hbsAgent */
|
||||
if ( ptr->hbs_pulse_period ) { ilog ("hbs_pulse_period = %d\n", ptr->hbs_pulse_period );}
|
||||
if ( ptr->token_refresh_rate ) { ilog ("token_refresh_rate = %d\n", ptr->token_refresh_rate );}
|
||||
if ( ptr->hbs_minor_threshold ) { ilog ("hbs_minor_threshold = %d\n", ptr->hbs_minor_threshold );}
|
||||
if ( ptr->hbs_degrade_threshold ) { ilog ("hbs_degrade_threshold = %d\n", ptr->hbs_degrade_threshold );}
|
||||
|
@ -78,6 +78,7 @@ void print_help ( void )
|
||||
printf ("\t-l --log - Log to file ; /var/log/<daemon>.log\n");
|
||||
printf ("\t-p --passive - Passive mode ; do not act on failures\n");
|
||||
printf ("\t-v --verbose - Show command line arguments\n");
|
||||
printf ("\t-V --Virtual - Running in virtual environment\n");
|
||||
printf ("\t-t --test - Run Test Head\n");
|
||||
printf ("\t-g --gap - Gap in seconds\n");
|
||||
printf ("\t-m --mode - Word string representing a run mode\n");
|
||||
@ -106,6 +107,9 @@ int daemon_get_run_option ( const char * option )
|
||||
}
|
||||
return (1);
|
||||
}
|
||||
else if ( !strcmp ( option, "Virtual" ) )
|
||||
return opts.Virtual ;
|
||||
|
||||
else if ( !strcmp ( option, "front" ) )
|
||||
return opts.front ;
|
||||
|
||||
@ -118,6 +122,7 @@ void opts_init ( void)
|
||||
opts.log = false ;
|
||||
opts.test = false ;
|
||||
opts.verbose = false ;
|
||||
opts.Virtual = false ;
|
||||
opts.active = false ;
|
||||
opts.front = false ;
|
||||
opts.front = false ;
|
||||
@ -152,8 +157,8 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr )
|
||||
int cmd_arg_count = 1 ; /* command args start at 1 */
|
||||
|
||||
/* A string listing of valid short options letters. */
|
||||
const char* const short_options = "u:c:p:g:i:m:n:d:hlfpvta";
|
||||
|
||||
const char* const short_options = "u:c:p:g:i:m:n:d:hlfpvVta";
|
||||
|
||||
/* An array listing of valid long options. */
|
||||
const struct option long_options[] =
|
||||
{
|
||||
@ -167,9 +172,10 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr )
|
||||
{ "username" , 1, NULL, 'u' },
|
||||
{ "help" , 0, NULL, 'h' },
|
||||
{ "active" , 0, NULL, 'a' },
|
||||
{ "foreground", 0, NULL, 'f' },
|
||||
{ "log" , 0, NULL, 'l' },
|
||||
{ "foreground", 0, NULL, 'f' },
|
||||
{ "log" , 0, NULL, 'l' },
|
||||
{ "verbose" , 0, NULL, 'v' },
|
||||
{ "Virtual" , 0, NULL, 'V' },
|
||||
{ "test" , 0, NULL, 't' },
|
||||
{ NULL , 0, NULL, 0 } /* Required at end of array. */
|
||||
};
|
||||
@ -254,19 +260,25 @@ int parseArg ( int argc, char * argv[], opts_type * opts_ptr )
|
||||
case 't': /* -t or --test */
|
||||
{
|
||||
opts_ptr->test = true ;
|
||||
cmd_arg_count++ ;
|
||||
cmd_arg_count++ ;
|
||||
break;
|
||||
}
|
||||
case 'v': /* -t or --verbose */
|
||||
case 'v': /* -v or --verbose */
|
||||
{
|
||||
opts_ptr->verbose = true ;
|
||||
cmd_arg_count++ ;
|
||||
cmd_arg_count++ ;
|
||||
break;
|
||||
}
|
||||
case 'V': /* -V or --Virtual */
|
||||
{
|
||||
opts_ptr->Virtual = true ;
|
||||
cmd_arg_count++ ;
|
||||
break;
|
||||
}
|
||||
case 'a': /* -a or --active */
|
||||
{
|
||||
opts_ptr->active = true ;
|
||||
cmd_arg_count++ ;
|
||||
cmd_arg_count++ ;
|
||||
break;
|
||||
}
|
||||
case '?':
|
||||
|
@ -33,6 +33,7 @@ typedef struct
|
||||
int test ; /**< Enable test mode */
|
||||
int info ; /**< Dump data module info */
|
||||
int verbose ; /**< Dump command line options */
|
||||
int Virtual ; /**< Set to non-zero when in virtual env */
|
||||
int active ; /**< Set daemon active */
|
||||
int debug ; /**< Set tracing debug mode "debug,"test","info","trace" */
|
||||
int front ; /**< run in the foreground ; do not daemonize */
|
||||
@ -43,7 +44,7 @@ typedef struct
|
||||
string username ;
|
||||
string command ;
|
||||
string password ;
|
||||
} opts_type ;
|
||||
} opts_type ;
|
||||
|
||||
opts_type * daemon_get_opts_ptr ( void );
|
||||
|
||||
|
@ -1,3 +1,3 @@
|
||||
SRC_DIR="$PKG_BASE/src"
|
||||
COPY_LIST="$SRC_DIR/*"
|
||||
TIS_PATCH_VER=6
|
||||
TIS_PATCH_VER=7
|
||||
|
@ -34,6 +34,7 @@ make install buildroot=%{buildroot} _sysconfdir=%{_sysconfdir} _unitdir=%{_unitd
|
||||
if [ $1 -eq 1 ] ; then
|
||||
/bin/systemctl enable lighttpd.service
|
||||
/bin/systemctl enable qemu_clean.service
|
||||
/bin/systemctl enable hbsAgent.service
|
||||
fi
|
||||
exit 0
|
||||
|
||||
@ -41,6 +42,9 @@ exit 0
|
||||
%defattr(-,root,root,-)
|
||||
%{_sysconfdir}/init.d/goenabledControl
|
||||
%license %{_datarootdir}/licenses/mtce-control-1.0/LICENSE
|
||||
%{_sysconfdir}/pmon.d/hbsAgent.conf
|
||||
%{_sysconfdir}/init.d/hbsAgent
|
||||
%{_unitdir}/hbsAgent.service
|
||||
|
||||
%clean
|
||||
rm -rf $RPM_BUILD_ROOT
|
||||
|
@ -1,19 +1,32 @@
|
||||
SOURCE1 = goenabled
|
||||
SOURCE2 = LICENSE
|
||||
SOURCE1 = LICENSE
|
||||
SOURCE2 = goenabled
|
||||
SOURCE3 = hbsAgent
|
||||
SOURCE4 = hbsAgent.conf
|
||||
SOURCE5 = hbsAgent.service
|
||||
|
||||
local_etc_pmond = $(_sysconfdir)/pmond.d
|
||||
local_etc_pmond = $(_sysconfdir)/pmon.d
|
||||
local_etc_goenabledd = $(_sysconfdir)/goenabled.d
|
||||
|
||||
.PHONY: default
|
||||
|
||||
install:
|
||||
# Controller-Only Init Scripts
|
||||
install -m 755 -p -D scripts/$(SOURCE1) $(buildroot)/$(_sysconfdir)/init.d/goenabledControl
|
||||
# Controller-Only Process Monitor Config files
|
||||
install -m 755 -d $(buildroot)/$(local_etc_pmond)
|
||||
# Controller-Only Go Enabled Test
|
||||
install -m 755 -d $(buildroot)/$(local_etc_goenabledd)
|
||||
|
||||
# for license
|
||||
install -m 755 -d $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0
|
||||
install -p -D -m 600 $(SOURCE2) $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0/LICENSE
|
||||
install -m 600 -p -D $(SOURCE1) $(buildroot)/$(_datarootdir)/licenses/mtce-control-1.0/LICENSE
|
||||
|
||||
# Controller-Only Init Scripts
|
||||
install -m 755 -d $(buildroot)/$(_sysconfdir)/init.d
|
||||
install -m 755 -p -D scripts/$(SOURCE2) $(buildroot)/$(_sysconfdir)/init.d/goenabledControl
|
||||
install -m 755 -p -D scripts/$(SOURCE3) $(buildroot)/$(_sysconfdir)/init.d/hbsAgent
|
||||
|
||||
# Controller-Only Process Monitor Config files
|
||||
install -m 755 -d $(buildroot)/$(local_etc_pmond)
|
||||
install -m 644 -p -D scripts/$(SOURCE4) $(buildroot)/$(local_etc_pmond)/hbsAgent.conf
|
||||
|
||||
# Controller-Only Heartbeat Service file
|
||||
install -m 644 -p -D scripts/$(SOURCE5) $(buildroot)/$(_unitdir)/hbsAgent.service
|
||||
|
||||
# Controller-Only Go Enabled Test
|
||||
install -m 755 -d $(buildroot)/$(local_etc_goenabledd)
|
||||
|
||||
|
117
mtce-control/src/scripts/hbsAgent
Normal file
117
mtce-control/src/scripts/hbsAgent
Normal file
@ -0,0 +1,117 @@
|
||||
#! /bin/sh
|
||||
#
|
||||
# Copyright (c) 2018 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
#
|
||||
# chkconfig: 2345 95 95
|
||||
#
|
||||
### BEGIN INIT INFO
|
||||
# Provides: hbsAgent
|
||||
# Default-Start: 3 5
|
||||
# Default-Stop: 0 1 2 6
|
||||
# Short-Description: Heartbeat Agent Daemon
|
||||
### END INIT INFO
|
||||
|
||||
. /etc/init.d/functions
|
||||
|
||||
DAEMON_NAME="hbsAgent"
|
||||
DAEMON="/usr/local/bin/${DAEMON_NAME}"
|
||||
PIDFILE="/var/run/${DAEMON_NAME}.pid"
|
||||
|
||||
VIRT_TOOL='virt-what'
|
||||
# controller-1:~$ sudo virt-what
|
||||
# virtualbox ... in virtualbox
|
||||
# kvm ... in qemu
|
||||
|
||||
# Linux Standard Base (LSB) Error Codes
|
||||
RETVAL=0
|
||||
GENERIC_ERROR=1
|
||||
INVALID_ARGS=2
|
||||
UNSUPPORTED_FEATURE=3
|
||||
NOT_INSTALLED=5
|
||||
NOT_RUNNING=7
|
||||
|
||||
PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin
|
||||
export PATH
|
||||
|
||||
if [ ! -e "${DAEMON}" ] ; then
|
||||
logger "${DAEMON} is missing"
|
||||
exit ${NOT_INSTALLED}
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
start)
|
||||
logger "Starting ${DAEMON_NAME}"
|
||||
echo -n "Starting ${DAEMON_NAME}: "
|
||||
if [ -n "`pidof ${DAEMON_NAME}`" ] ; then
|
||||
echo -n "is already running "
|
||||
RETVAL=0
|
||||
else
|
||||
tool=$(which ${VIRT_TOOL})
|
||||
if [ $? -eq 0 ] ; then
|
||||
virtual=`${VIRT_TOOL}`
|
||||
else
|
||||
virtual=""
|
||||
fi
|
||||
|
||||
if [ "${virtual}" = "virtualbox" ] || [ "${virtual}" = "kvm" ] ; then
|
||||
start-stop-daemon --start -b -x ${DAEMON} -- -l -a -V
|
||||
else
|
||||
start-stop-daemon --start -b -x ${DAEMON} -- -l -a
|
||||
fi
|
||||
RETVAL=$?
|
||||
fi
|
||||
if [ ${RETVAL} -eq 0 ] ; then
|
||||
pid=`pidof ${DAEMON_NAME}`
|
||||
echo "OK"
|
||||
logger "${DAEMON} (${pid})"
|
||||
else
|
||||
echo "FAIL"
|
||||
RETVAL=${GENERIC_ERROR}
|
||||
fi
|
||||
;;
|
||||
|
||||
stop)
|
||||
logger "Stopping ${DAEMON_NAME}"
|
||||
echo -n "Stopping ${DAEMON_NAME}: "
|
||||
if [ -n "`pidof ${DAEMON_NAME}`" ] ; then
|
||||
killproc ${DAEMON_NAME}
|
||||
fi
|
||||
if [ -n "`pidof ${DAEMON_NAME}`" ] ; then
|
||||
echo "FAIL"
|
||||
RETVAL=${NOT_RUNNING}
|
||||
else
|
||||
echo "OK"
|
||||
fi
|
||||
rm -f ${PIDFILE}
|
||||
;;
|
||||
|
||||
restart)
|
||||
$0 stop
|
||||
$0 start
|
||||
;;
|
||||
|
||||
status)
|
||||
pid=`pidof ${DAEMON_NAME}`
|
||||
RETVAL=$?
|
||||
if [ ${RETVAL} -eq 0 ] ; then
|
||||
echo "${DAEMON_NAME} is running"
|
||||
else
|
||||
echo "${DAEMON_NAME} is NOT running"
|
||||
RETVAL=${NOT_RUNNING}
|
||||
fi
|
||||
;;
|
||||
|
||||
condrestart)
|
||||
$0 restart
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "usage: $0 { start | stop | status | restart | condrestart }"
|
||||
RETVAL=${INVALID_ARGS}
|
||||
;;
|
||||
esac
|
||||
|
||||
exit ${RETVAL}
|
25
mtce-control/src/scripts/hbsAgent.conf
Normal file
25
mtce-control/src/scripts/hbsAgent.conf
Normal file
@ -0,0 +1,25 @@
|
||||
[process]
|
||||
process = hbsAgent
|
||||
service = hbsAgent
|
||||
pidfile = /var/run/hbsAgent.pid
|
||||
style = lsb ; ocf or lsb
|
||||
severity = major ; minor, major, critical
|
||||
restarts = 1 ; restart retries before error assertion
|
||||
interval = 10 ; number of seconds to wait between restarts
|
||||
debounce = 10 ; number of seconds that a process needs to remain
|
||||
; running before degrade is removed and retry count
|
||||
; is cleared.
|
||||
startuptime = 5 ; Seconds to wait after process start before starting the debounce monitor
|
||||
mode = passive ; Monitoring mode: passive (default) or active
|
||||
; passive: process death monitoring (default: always)
|
||||
; active : heartbeat monitoring, i.e. request / response messaging
|
||||
; ignore : do not monitor or stop monitoring
|
||||
quorum = 0 ; process is in the host watchdog quorum
|
||||
|
||||
; Active Monitoring Options
|
||||
|
||||
port = 2201
|
||||
period = 5 ; monitor period in seconds
|
||||
timeout = 4 ; Messaging timeout period in seconds, must be shorter than period
|
||||
threshold = 5 ; Number of back to back heartbeat failures before action
|
||||
|
22
mtce-control/src/scripts/hbsAgent.service
Normal file
22
mtce-control/src/scripts/hbsAgent.service
Normal file
@ -0,0 +1,22 @@
|
||||
[Unit]
|
||||
Description=Titanium Cloud Maintenance Heartbeat Agent
|
||||
After=network.target syslog.service config.service
|
||||
Before=pmon.service
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
ExecStart=/etc/rc.d/init.d/hbsAgent start
|
||||
ExecStop=/etc/rc.d/init.d/hbsAgent stop
|
||||
PIDFile=/var/run/hbsAgent.pid
|
||||
KillMode=process
|
||||
SendSIGKILL=no
|
||||
|
||||
# Process recovery is handled by pmond if its running.
|
||||
# Delay 10 seconds to give pmond a chance to recover
|
||||
# before systemd kicks in to do it as a backup plan.
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
@ -1,3 +1,3 @@
|
||||
SRC_DIR="src"
|
||||
TIS_PATCH_VER=140
|
||||
TIS_PATCH_VER=142
|
||||
BUILD_IS_SLOW=5
|
||||
|
@ -313,7 +313,6 @@ install -m 755 -d %{buildroot}/usr/lib/ocf
|
||||
install -m 755 -d %{buildroot}/usr/lib/ocf/resource.d
|
||||
install -m 755 -d %{buildroot}/usr/lib/ocf/resource.d/platform
|
||||
install -m 755 -p -D %{_buildsubdir}/scripts/mtcAgent %{buildroot}/usr/lib/ocf/resource.d/platform/mtcAgent
|
||||
install -m 755 -p -D %{_buildsubdir}/scripts/hbsAgent %{buildroot}/usr/lib/ocf/resource.d/platform/hbsAgent
|
||||
install -m 755 -p -D %{_buildsubdir}/hwmon/scripts/ocf/hwmon %{buildroot}/usr/lib/ocf/resource.d/platform/hwmon
|
||||
|
||||
# config files
|
||||
@ -482,7 +481,6 @@ install -m 755 -d %{buildroot}/var/run
|
||||
|
||||
# SM OCF Start/Stop/Monitor Scripts
|
||||
%{ocf_resourced}/platform/mtcAgent
|
||||
%{ocf_resourced}/platform/hbsAgent
|
||||
|
||||
# Config files
|
||||
%config(noreplace)/etc/mtc.ini
|
||||
|
@ -47,6 +47,11 @@ int alarm_register_user ( msgClassSock * sock_ptr )
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/* Clear user_sock_ptr ; while it is NULL, alarm_() logs "alarm socket is NULL" and returns FAIL_NULL_POINTER */
|
||||
void alarm_unregister_user ( void )
|
||||
{
|
||||
user_sock_ptr = NULL ;
|
||||
}
|
||||
|
||||
/* Construct an alarm request json string in the following form
|
||||
{\"mtcalarm\":[{\"alarmid\":\"200.009\",\"hostname\":\"compute-3\",\"operation\":\"set\",\"severity\":\"major\",\"entity\":\"Infrastructure\",\"prefix\":\"service=heartbeat\"}, {\"alarmid\":\"200.005\",\"hostname\":\"compute-3\",\"operation\":\"set\",\"severity\":\"major\",\"entity\":\"Management\",\"prefix\":\"service=heartbeat\"}]}"
|
||||
|
||||
@ -73,6 +78,17 @@ int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSev
|
||||
string msg_type ;
|
||||
string sev ;
|
||||
|
||||
if ( user_sock_ptr == NULL )
|
||||
{
|
||||
slog ("alarm socket is NULL");
|
||||
return (FAIL_NULL_POINTER );
|
||||
}
|
||||
else if ( ! user_sock_ptr->sock_ok() )
|
||||
{
|
||||
elog ("alarm socket is not ok");
|
||||
return (FAIL_OPERATION);
|
||||
}
|
||||
|
||||
if ( state == FM_ALARM_STATE_MSG )
|
||||
msg_type = "msg" ;
|
||||
else if ( state == FM_ALARM_STATE_SET )
|
||||
@ -127,7 +143,8 @@ int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSev
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("%s %s\n", hostname.c_str(), request);
|
||||
ilog ("%s %s %s %s %s", hostname.c_str(), entity, msg_type.c_str(), sev.c_str(), id);
|
||||
mlog ("%s %s\n", hostname.c_str(), request);
|
||||
return ( PASS ) ;
|
||||
}
|
||||
daemon_signal_hdlr ();
|
||||
|
@ -68,6 +68,7 @@ EFmAlarmSeverityT alarmUtil_getSev_enum ( string severity );
|
||||
#ifndef __MODULE_PRIVATE__
|
||||
|
||||
int alarm_register_user ( msgClassSock * sock_ptr );
|
||||
void alarm_unregister_user ( void );
|
||||
|
||||
/* Public API */
|
||||
int alarm_ ( string hostname, const char * id, EFmAlarmStateT state, EFmAlarmSeverityT severity, const char * entity, string prefix );
|
||||
|
@ -36,6 +36,7 @@ using namespace std;
|
||||
#include "mtcAlarm.h"
|
||||
#include "alarm.h"
|
||||
#include "hbsAlarm.h"
|
||||
#include "hbsBase.h"
|
||||
|
||||
/** Initialize the supplied command buffer */
|
||||
void mtcCmd_init ( mtcCmd & cmd )
|
||||
@ -263,7 +264,8 @@ nodeLinkClass::nodeLinkClass()
|
||||
/* Make no assumption on the service */
|
||||
maintenance = false ;
|
||||
heartbeat = false ;
|
||||
active = false ;
|
||||
active = false ; /* run active */
|
||||
active_controller = false ; /* true if this controller is active */
|
||||
|
||||
/* Set some defaults for the heartbeat service */
|
||||
hbs_ready = false ;
|
||||
@ -1156,26 +1158,26 @@ void nodeLinkClass::print_node_info ( void )
|
||||
if (( i == INFRA_IFACE ) && ( infra_network_provisioned == false ))
|
||||
continue ;
|
||||
|
||||
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
|
||||
syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" ,
|
||||
syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n");
|
||||
syslog ( LOG_INFO, "| %s: %3d | Mon | Mis | Max | Deg | Fail | Pulses Tot | Pulses | %s (%4d) |\n" ,
|
||||
get_iface_name_str ((iface_enum)i), hosts, hbs_disabled ? "DISABLED" : "Enabled ", hbs_pulse_period );
|
||||
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
|
||||
|
||||
syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n");
|
||||
|
||||
for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
|
||||
{
|
||||
syslog ( LOG_INFO, "| %-12s | %c | %3i | %4i | %3i | %4i | %8x | %7x | %d msec\n",
|
||||
syslog ( LOG_INFO, "| %-12s | %c | %5i | %5i | %5i | %5i | %10x | %8x | %d msec\n",
|
||||
ptr->hostname.c_str(),
|
||||
ptr->monitor[i] ? 'Y' : 'n',
|
||||
ptr->hbs_misses_count[i],
|
||||
ptr->max_count[i],
|
||||
ptr->hbs_degrade_count[i],
|
||||
ptr->hbs_failure_count[i],
|
||||
ptr->hbs_misses_count[i],
|
||||
ptr->max_count[i],
|
||||
ptr->hbs_degrade_count[i],
|
||||
ptr->hbs_failure_count[i],
|
||||
ptr->hbs_count[i],
|
||||
ptr->b2b_pulses_count[i],
|
||||
hbs_pulse_period );
|
||||
}
|
||||
}
|
||||
syslog ( LOG_INFO, "+--------------+-----+-----+------+-----+------+------------+---------+-----------------+\n");
|
||||
syslog ( LOG_INFO, "+--------------+-----+-------+-------+-------+-------+------------+----------+-----------------+\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -7778,7 +7780,7 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
|
||||
{
|
||||
/* This default RC allows the caller to filter out unexpected pulse responses */
|
||||
int rc = ENXIO ;
|
||||
|
||||
|
||||
if ( head == NULL )
|
||||
{
|
||||
return -ENODEV ;
|
||||
@ -7962,6 +7964,16 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
|
||||
}
|
||||
pulses[iface]-- ;
|
||||
}
|
||||
else if ( node_ptr )
|
||||
{
|
||||
dlog ("%s unexpected pulse response ; %s",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
}
|
||||
else
|
||||
{
|
||||
slog ("null pointer");
|
||||
}
|
||||
|
||||
return rc ;
|
||||
}
|
||||
@ -7972,6 +7984,13 @@ int nodeLinkClass::remPulse ( struct node * node_ptr, iface_enum iface, bool cle
|
||||
* By index does not require a lookup whereas hostname does */
|
||||
int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index, unsigned int flags )
|
||||
{
|
||||
/* TODO: consider removing this check */
|
||||
if ( hostname == "localhost" )
|
||||
{
|
||||
/* localhost is not a supported hostname and indicates
|
||||
* an unconfigured host response ; return the ignore response */
|
||||
return(ENXIO);
|
||||
}
|
||||
if ( index )
|
||||
{
|
||||
int rc = remPulse_by_index ( hostname, index , iface, true , flags );
|
||||
@ -7984,16 +8003,6 @@ int nodeLinkClass::remove_pulse ( string & hostname, iface_enum iface, int index
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( hostname.compare("localhost") )
|
||||
{
|
||||
get_hbs_monitor_state ( hostname , iface ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* localhost is not a supported hostname and indicates
|
||||
* an unconfigured host response ; return the ignore response */
|
||||
return(ENXIO);
|
||||
}
|
||||
}
|
||||
return ( remPulse_by_name ( hostname , iface, true, flags ));
|
||||
}
|
||||
@ -8016,7 +8025,6 @@ void nodeLinkClass::clear_pulse_list ( iface_enum iface )
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Runs in the hbsAgent to set or clear heartbeat alarms for all supported interfaces */
|
||||
void nodeLinkClass::manage_heartbeat_alarm ( struct nodeLinkClass::node * node_ptr, EFmAlarmSeverityT sev, int iface )
|
||||
{
|
||||
@ -8142,7 +8150,6 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
storage_0_responding = false ;
|
||||
}
|
||||
|
||||
/* Don't log single misses unless in debug mode */
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > 1 )
|
||||
{
|
||||
if ( pulse_ptr->b2b_misses_count[iface] >= hbs_failure_threshold )
|
||||
@ -8207,7 +8214,10 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
{
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_minor_threshold )
|
||||
{
|
||||
send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface );
|
||||
if ( this->active_controller )
|
||||
{
|
||||
send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_MINOR_SET, iface );
|
||||
}
|
||||
pulse_ptr->hbs_minor[iface] = true ;
|
||||
pulse_ptr->hbs_minor_count[iface]++ ;
|
||||
wlog ("%s %s -> MINOR\n", pulse_ptr->hostname.c_str(), get_iface_name_str(iface));
|
||||
@ -8215,10 +8225,17 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
}
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_degrade_threshold )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
|
||||
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
|
||||
{
|
||||
pulse_ptr->hbs_degrade[iface] = true ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pulse_ptr->hbs_degrade[iface] = true ;
|
||||
}
|
||||
@ -8231,11 +8248,17 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
( pulse_ptr->hbs_degrade[iface] == false ))
|
||||
{
|
||||
wlog ("%s -> DEGRADED - Auto-Correction\n", pulse_ptr->hostname.c_str());
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
|
||||
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_MAJOR, iface );
|
||||
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_DEGRADE_SET, iface ) == PASS )
|
||||
{
|
||||
pulse_ptr->hbs_degrade[iface] = true ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pulse_ptr->hbs_degrade[iface] = true ;
|
||||
}
|
||||
@ -8250,11 +8273,16 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
/* Only print the log at the threshold boundary */
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
|
||||
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
|
||||
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
this->print_node_info ();
|
||||
hbs_cluster_log ( this->my_hostname, "event", true );
|
||||
}
|
||||
}
|
||||
|
||||
@ -8268,35 +8296,46 @@ int nodeLinkClass::lost_pulses ( iface_enum iface, bool & storage_0_responding )
|
||||
/* Only print the log at the threshold boundary */
|
||||
if ( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
}
|
||||
wlog_throttled ( pulse_ptr->no_work_log_throttle, 500,
|
||||
"%s %s *** Heartbeat Loss *** (degrade only)\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
this->print_node_info ();
|
||||
hbs_cluster_log ( this->my_hostname, "event", true );
|
||||
}
|
||||
}
|
||||
|
||||
else if (( pulse_ptr->b2b_misses_count[iface] == hbs_failure_threshold ) &&
|
||||
( pulse_ptr->hbs_failure[iface] == false ))
|
||||
( pulse_ptr->hbs_failure[iface] == false ))
|
||||
{
|
||||
elog ("%s %s -> FAILED\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
elog ("%s %s *** Heartbeat Loss ***\n", pulse_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface) );
|
||||
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
if ( this->active_controller )
|
||||
{
|
||||
manage_heartbeat_alarm ( pulse_ptr, FM_ALARM_SEVERITY_CRITICAL, iface );
|
||||
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
|
||||
/* report this host as failed */
|
||||
if ( send_event ( pulse_ptr->hostname, MTC_EVENT_HEARTBEAT_LOSS , iface ) == PASS )
|
||||
{
|
||||
pulse_ptr->hbs_failure[iface] = true ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
pulse_ptr->hbs_failure[iface] = true ;
|
||||
this->print_node_info ();
|
||||
hbs_cluster_log ( this->my_hostname, "event", true );
|
||||
}
|
||||
|
||||
pulse_ptr->hbs_failure_count[iface]++ ;
|
||||
}
|
||||
if ( pulse_ptr->b2b_misses_count[iface] > pulse_ptr->max_count[iface] )
|
||||
pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ;
|
||||
pulse_ptr->max_count[iface] = pulse_ptr->b2b_misses_count[iface] ;
|
||||
}
|
||||
|
||||
if ( remPulse_by_name ( pulse_ptr->hostname, iface, false, NULL_PULSE_FLAGS ))
|
||||
{
|
||||
elog ("%s %s not in pulse list\n", pulse_ptr->hostname.c_str(),
|
||||
|
@ -1266,6 +1266,10 @@ public:
|
||||
bool maintenance ;
|
||||
bool heartbeat ;
|
||||
|
||||
/* Set to true if this controller is active.
|
||||
* Currently only used by heartbeat service. */
|
||||
bool active_controller ;
|
||||
|
||||
/* offline_handler tuning controls */
|
||||
int offline_threshold ; /* number of back to back mtcAlive misses before offline */
|
||||
int offline_period ; /* offline handler mtcAlive request period */
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -47,6 +47,9 @@
|
||||
/** Maximum service fail count before action */
|
||||
#define MAX_FAIL_COUNT (1)
|
||||
|
||||
/** Audit Rate/Count */
|
||||
#define AUDIT_RATE (9)
|
||||
|
||||
/** Heartbeat pulse request/response message header byte size */
|
||||
#define HBS_HEADER_SIZE (15)
|
||||
|
||||
@ -60,13 +63,16 @@ const char rsp_msg_header [HBS_HEADER_SIZE+1] = {"cgts pulse rsp:"};
|
||||
|
||||
#define HBS_MAX_MSG (HBS_HEADER_SIZE+MAX_CHARS_HOSTNAME)
|
||||
|
||||
#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info
|
||||
#define HBS_MESSAGE_VERSION (1) // 0 -> 1 with intro of cluster info
|
||||
|
||||
/* Heartbeat control structure */
|
||||
typedef struct
|
||||
{
|
||||
unsigned int controller ;
|
||||
unsigned int audit ;
|
||||
unsigned int nodetype ;
|
||||
bool clear_alarms ;
|
||||
bool locked ;
|
||||
} hbs_ctrl_type ;
|
||||
hbs_ctrl_type * get_hbs_ctrl_ptr ( void );
|
||||
|
||||
@ -218,22 +224,17 @@ void hbs_utils_init ( void );
|
||||
/* network enum to name lookup */
|
||||
string hbs_cluster_network_name ( mtce_hbs_network_enum network );
|
||||
|
||||
/* Produce formatted clog's that characterize current and changing cluster
|
||||
* history for a given network. Each log is controller/network specific. */
|
||||
void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix );
|
||||
|
||||
/* Initialize the specified history array */
|
||||
void hbs_cluster_history_init ( mtce_hbs_cluster_history_type & history );
|
||||
|
||||
/* Clear all history in the cluster vault */
|
||||
void hbs_cluster_history_clear( mtce_hbs_cluster_type & cluster );
|
||||
|
||||
|
||||
/******** Heartbeat Agent Cluster Functions in hbsCluster.cpp ********/
|
||||
|
||||
/* Set the cluster vault to default state.
|
||||
* Called upon daemon init or heartbeat period change. */
|
||||
void hbs_cluster_init ( unsigned short period );
|
||||
void hbs_cluster_init ( unsigned short period , msgClassSock * sm_socket_ptr );
|
||||
|
||||
/* Calculate number of bytes that is unused in the cluster data structure.
|
||||
* Primarily to know how many history elements are missing. */
|
||||
@ -286,7 +287,9 @@ void hbs_cluster_append ( hbs_message_type & msg );
|
||||
|
||||
/* Produce formatted clog's that characterize current and changing cluster
|
||||
* history for a given network. Each log is controller/network specific. */
|
||||
void hbs_cluster_log ( string & hostname, string prefix );
|
||||
void hbs_cluster_log ( string & hostname, string prefix, bool force=false );
|
||||
void hbs_cluster_log ( string & hostname, mtce_hbs_cluster_type & cluster, string prefix, bool force=false );
|
||||
|
||||
|
||||
/* Service SM cluster info request */
|
||||
void hbs_sm_handler ( void );
|
||||
@ -294,8 +297,14 @@ void hbs_sm_handler ( void );
|
||||
/* send the cluster vault to SM */
|
||||
void hbs_cluster_send ( msgClassSock * sm_client_sock, int reqid );
|
||||
|
||||
/* copy cluster data from src to dst */
|
||||
void hbs_cluster_copy ( mtce_hbs_cluster_type & src, mtce_hbs_cluster_type & dst );
|
||||
|
||||
/* print the contents of the vault */
|
||||
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault );
|
||||
void hbs_cluster_dump ( mtce_hbs_cluster_type & vault, string log_prefix, bool force );
|
||||
|
||||
/* Heartbeat service state audit */
|
||||
void hbs_state_audit ( void );
|
||||
|
||||
/**
|
||||
* @} hbs_base
|
||||
|
@ -66,6 +66,8 @@ extern "C"
|
||||
#include "amon.h" /* for ... active monitoring utilities */
|
||||
}
|
||||
|
||||
#define MAX_LEN (300)
|
||||
|
||||
/* Where to send events */
|
||||
string mtcAgent_ip = "" ;
|
||||
|
||||
@ -96,12 +98,17 @@ typedef struct
|
||||
|
||||
static char pulse_resp_tx_hdr [HBS_MAX_MSG];
|
||||
static char my_hostname [MAX_HOST_NAME_SIZE+1];
|
||||
static string hostname = "" ;
|
||||
static char my_hostname_length ;
|
||||
static string my_macaddr = "" ;
|
||||
static string my_address = "" ;
|
||||
static unsigned int my_nodetype= CGTS_NODE_NULL ;
|
||||
static stallMon_type stallMon ;
|
||||
|
||||
/* Cached Cluster view from controllers */
|
||||
mtce_hbs_cluster_type controller_cluster_cache[MTCE_HBS_MAX_CONTROLLERS];
|
||||
|
||||
|
||||
void daemon_sigchld_hdlr ( void )
|
||||
{
|
||||
; /* dlog("Received SIGCHLD ... no action\n"); */
|
||||
@ -407,16 +414,17 @@ int daemon_configure ( void )
|
||||
else
|
||||
{
|
||||
ilog("Realtime Pri: FIFO/%i \n", hbs_config.scheduling_priority );
|
||||
ilog("Multicast: %s\n", hbs_config.multicast );
|
||||
ilog("Multicast : %s\n", hbs_config.multicast );
|
||||
|
||||
hbs_config.mgmnt_iface = daemon_get_iface_master ( hbs_config.mgmnt_iface );
|
||||
ilog("Mgmnt iface : %s\n", hbs_config.mgmnt_iface );
|
||||
ilog("Mgmnt RxPort: %d\n", hbs_config.hbs_client_mgmnt_port );
|
||||
ilog("Mgmnt TxPort: %d\n", hbs_config.hbs_agent_mgmnt_port );
|
||||
ilog("Mgmnt Name : %s\n", hbs_config.mgmnt_iface );
|
||||
ilog("Mgmnt Port : %d (rx)", hbs_config.hbs_client_mgmnt_port );
|
||||
ilog("Mgmnt Port : %d (tx)", hbs_config.hbs_agent_mgmnt_port );
|
||||
|
||||
get_iface_macaddr ( hbs_config.mgmnt_iface, my_macaddr );
|
||||
get_iface_address ( hbs_config.mgmnt_iface, my_address, true );
|
||||
get_hostname ( &my_hostname[0], MAX_HOST_NAME_SIZE );
|
||||
hostname = my_hostname ;
|
||||
|
||||
/* Fetch the infrastructure interface name.
|
||||
* calls daemon_get_iface_master inside so the
|
||||
@ -427,11 +435,14 @@ int daemon_configure ( void )
|
||||
if (strcmp(hbs_config.infra_iface, hbs_config.mgmnt_iface))
|
||||
{
|
||||
infra_network_provisioned = true ;
|
||||
ilog ("Infra iface : %s\n", hbs_config.infra_iface );
|
||||
ilog ("Infra Name : %s\n", hbs_config.infra_iface );
|
||||
}
|
||||
}
|
||||
ilog("Infra RxPort: %d\n", hbs_config.hbs_client_infra_port );
|
||||
ilog("Infra TxPort: %d\n", hbs_config.hbs_agent_infra_port );
|
||||
if ( infra_network_provisioned == true )
|
||||
{
|
||||
ilog("Infra Port : %d (rx)", hbs_config.hbs_client_infra_port );
|
||||
ilog("Infra Port : %d (tx)", hbs_config.hbs_agent_infra_port );
|
||||
}
|
||||
|
||||
/* initialize the stall detection monitor */
|
||||
stallMon_init ();
|
||||
@ -663,7 +674,37 @@ int get_pmon_pulses ( void )
|
||||
return (pulses);
|
||||
}
|
||||
|
||||
static unsigned int my_rri = 0 ;
|
||||
/*************************************************************
|
||||
*
|
||||
* Name : have_other_controller_history
|
||||
*
|
||||
* Description: returns true if there is cached history for any
|
||||
* controller number other than the one supplied.
|
||||
*
|
||||
*************************************************************/
|
||||
|
||||
bool have_other_controller_history ( unsigned short controller )
|
||||
{
|
||||
if ( controller < MTCE_HBS_MAX_CONTROLLERS )
|
||||
{
|
||||
/* look for history for any controller other than the one specified */
|
||||
for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ )
|
||||
{
|
||||
/* skip specified controller */
|
||||
if ( c != controller )
|
||||
{
|
||||
if ( controller_cluster_cache[c].histories )
|
||||
{
|
||||
return true ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false ;
|
||||
}
|
||||
|
||||
|
||||
static unsigned int rri[MTCE_HBS_MAX_CONTROLLERS] = {0,0} ;
|
||||
|
||||
/*************************************************************
|
||||
*
|
||||
@ -766,12 +807,13 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
||||
daemon_config_type * cfg_ptr = daemon_get_cfg_ptr();
|
||||
if ( cfg_ptr->debug_msg )
|
||||
{
|
||||
mlog ("\n");
|
||||
mlog ("%s Pulse Req: %s:%5d: %d:%s RRI:%d\n",
|
||||
mlog (" ");
|
||||
mlog ("%s Pulse Req: %s:%d s:%d f:%x [%s] RRI:%d\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.rx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].f,
|
||||
hbs_sock.rx_mesg[iface].m,
|
||||
hbs_sock.rx_mesg[iface].c);
|
||||
}
|
||||
@ -787,19 +829,9 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
||||
return (FAIL_MSG_HEADER) ;
|
||||
}
|
||||
|
||||
|
||||
/* Manage the Resource Reference Index (RRI) "lookup clue" */
|
||||
if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME ))
|
||||
{
|
||||
if( my_rri!= hbs_sock.rx_mesg[iface].c )
|
||||
{
|
||||
my_rri = hbs_sock.rx_mesg[iface].c ;
|
||||
ilog ("%s Caching New RRI: %d\n", &my_hostname[0], my_rri );
|
||||
}
|
||||
}
|
||||
|
||||
/* Add my RRI to the response message */
|
||||
hbs_sock.rx_mesg[iface].c = my_rri ;
|
||||
/* Update local copy for the controller this pulse came from */
|
||||
/* ... before the flags are cleared and setup for the reply. */
|
||||
unsigned int controller = (hbs_sock.rx_mesg[iface].f & CTRLX_MASK ) >> CTRLX_BIT ;
|
||||
|
||||
/* Manage OOB flags */
|
||||
hbs_sock.rx_mesg[iface].f = flags ;
|
||||
@ -807,23 +839,102 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
||||
{
|
||||
hbs_sock.rx_mesg[iface].f |= ( PMOND_FLAG ) ;
|
||||
}
|
||||
|
||||
if ( infra_network_provisioned == true )
|
||||
{
|
||||
hbs_sock.rx_mesg[iface].f |= INFRA_FLAG ;
|
||||
}
|
||||
|
||||
#define WANT_CLUSTER_INFO_LOG
|
||||
#ifdef WANT_CLUSTER_INFO_LOG
|
||||
/* Log the received cluster info */
|
||||
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
|
||||
/*************************************************************************
|
||||
***** C L U S T E R D A T A M A N A G E M E N T ******
|
||||
* *
|
||||
* TODO: Add support for 3 controllers.
|
||||
* Only 2 supported by some of this code.
|
||||
***** ******/
|
||||
|
||||
if ( controller >= MTCE_HBS_MAX_CONTROLLERS )
|
||||
{
|
||||
char str[100] ;
|
||||
// hbs_cluster_log (hbs_sock.rx_mesg[iface].cluster, hbs_sock.rx_mesg[iface].s );
|
||||
snprintf ( &str[0], 100, " seq %6d with %d bytes from %s ", hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface));
|
||||
string hostname = my_hostname ;
|
||||
hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str );
|
||||
wlog ("invalid controller number: %d ; dropping message", controller );
|
||||
return ( FAIL_INVALID_DATA );
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Manage the Resource Reference Index (RRI) "lookup clue"
|
||||
* With the introduction of active-active heartbeating the hbsClient
|
||||
* is responsible for servicing pulses from both controllers.
|
||||
* This means that hbsClient needs to manage an rri for each controller. */
|
||||
if ( ! strncmp ( &hbs_sock.rx_mesg[iface].m[HBS_HEADER_SIZE], &my_hostname[0], MAX_CHARS_HOSTNAME ))
|
||||
{
|
||||
if( rri[controller] != hbs_sock.rx_mesg[iface].c )
|
||||
{
|
||||
rri[controller] = hbs_sock.rx_mesg[iface].c ;
|
||||
ilog ("Caching New RRI: %d (from controller-%d)\n", rri[controller], controller );
|
||||
}
|
||||
}
|
||||
|
||||
/* Log the received cluster info
|
||||
* ... if the message version shows that it is supported */
|
||||
if ( hbs_sock.rx_mesg[iface].v )
|
||||
{
|
||||
char str[MAX_LEN] ;
|
||||
snprintf ( &str[0], MAX_LEN, " seq %6d with %d bytes from %s ", (int)hbs_sock.rx_mesg[iface].s, rx_bytes, get_iface_name_str(iface));
|
||||
hbs_cluster_log ( hostname, hbs_sock.rx_mesg[iface].cluster, str );
|
||||
|
||||
/* add the controller back in */
|
||||
hbs_sock.rx_mesg[iface].f |= ( controller << CTRLX_BIT );
|
||||
|
||||
/* Add my RRI to the response message */
|
||||
hbs_sock.rx_mesg[iface].c = rri[controller] ;
|
||||
|
||||
if ( hbs_sock.rx_mesg[iface].cluster.histories > MTCE_HBS_MAX_NETWORKS )
|
||||
{
|
||||
slog ("controller-%d provided %d network histories ; max is %d per controller",
|
||||
controller,
|
||||
hbs_sock.rx_mesg[iface].cluster.histories,
|
||||
MTCE_HBS_MAX_NETWORKS );
|
||||
}
|
||||
else if ( hbs_sock.rx_mesg[iface].cluster.bytes != ( BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories)))
|
||||
{
|
||||
slog ("controller-%d provided %d bytes of history ; expected %d",
|
||||
controller,
|
||||
hbs_sock.rx_mesg[iface].cluster.bytes,
|
||||
(unsigned short)(BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories)));
|
||||
}
|
||||
else if ( hbs_sock.rx_mesg[iface].cluster.histories )
|
||||
{
|
||||
hbs_cluster_copy ( hbs_sock.rx_mesg[iface].cluster,
|
||||
controller_cluster_cache[controller] );
|
||||
clog1 ("controller-%d cluster info from %s pulse request saved to cache",
|
||||
controller, get_iface_name_str(iface));
|
||||
|
||||
hbs_sock.rx_mesg[iface].cluster.histories = 0 ;
|
||||
|
||||
if ( have_other_controller_history ( controller ) == true )
|
||||
{
|
||||
/* Now copy the other controller's cached cluster info into
|
||||
* this controller's response */
|
||||
hbs_cluster_copy ( controller_cluster_cache[controller?0:1],
|
||||
hbs_sock.rx_mesg[iface].cluster );
|
||||
|
||||
if ( daemon_get_cfg_ptr()->debug_state & 4 )
|
||||
{
|
||||
string dump_banner = "" ;
|
||||
dump_banner.append("controller-") ;
|
||||
dump_banner.append(itos(controller?0:1));
|
||||
dump_banner.append(" cluster info from cache injected into controller-");
|
||||
dump_banner.append(itos(controller));
|
||||
dump_banner.append(":");
|
||||
dump_banner.append(get_iface_name_str(iface));
|
||||
dump_banner.append(" pulse response");
|
||||
hbs_cluster_dump ( hbs_sock.rx_mesg[iface].cluster, dump_banner, true );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Cluster Data management end */
|
||||
|
||||
/* replace the request header with the response header */
|
||||
memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG );
|
||||
|
||||
#ifdef WANT_PULSE_RESPONSE_FIT
|
||||
if (( iface == INFRA_IFACE ) && ( daemon_is_file_present ( MTC_CMD_FIT__NO_INFRA_RSP )))
|
||||
@ -839,29 +950,11 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
||||
}
|
||||
#endif
|
||||
|
||||
int rc = PASS ;
|
||||
|
||||
/* replace the request header with the response header */
|
||||
memcpy ( &hbs_sock.rx_mesg[iface].m[0], &pulse_resp_tx_hdr[0], HBS_MAX_MSG );
|
||||
|
||||
/* Deal with the cluster info if it exists.
|
||||
* ... Introduced in messaging version 1 */
|
||||
if ( hbs_sock.rx_mesg[iface].v >= HBS_MESSAGE_VERSION )
|
||||
{
|
||||
if ( hbs_sock.rx_mesg[iface].cluster.version < MTCE_HBS_CLUSTER_VERSION )
|
||||
{
|
||||
ilog ("Bad cluster verison (%d)", hbs_sock.rx_mesg[iface].cluster.version);
|
||||
}
|
||||
// if ( hbs_sock.rx_mesg[iface].cluster.revision != MTCE_HBS_CLUSTER_REVISION )
|
||||
// {
|
||||
// ilog ("Bad cluster revision (%d)", hbs_sock.rx_mesg[iface].cluster.revision);
|
||||
// }
|
||||
|
||||
/* Add peer controller cluster data to this controller's response */
|
||||
// hbs_cluster_loop(hbs_sock.rx_mesg[iface]);
|
||||
}
|
||||
/* reuse the rx_bytes variable */
|
||||
rx_bytes = sizeof(hbs_message_type)-sizeof(mtce_hbs_cluster_type)+BYTES_IN_CLUSTER_VAULT(hbs_sock.rx_mesg[iface].cluster.histories);
|
||||
|
||||
/* send pulse response message */
|
||||
int rc = PASS ;
|
||||
int tx_bytes = hbs_sock.tx_sock[iface]->reply(hbs_sock.rx_sock[iface],(char*)&hbs_sock.rx_mesg[iface], rx_bytes);
|
||||
if ( tx_bytes == -1 )
|
||||
{
|
||||
@ -884,15 +977,15 @@ int _service_pulse_request ( iface_enum iface , unsigned int flags )
|
||||
}
|
||||
else
|
||||
{
|
||||
mlog ("%s Pulse Rsp: %s:%5d: %d:%d:%s RRI:%d (%d:%d:%d)\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].f,
|
||||
hbs_sock.rx_mesg[iface].m,
|
||||
hbs_sock.rx_mesg[iface].c,
|
||||
pmonPulse_counter, rx_bytes, tx_bytes);
|
||||
mlog ("%s Pulse Rsp: %s:%d: s:%d f:%x [%s] RRI:%d (%x:%d:%d)\n",
|
||||
get_iface_name_str(iface),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->toString(),
|
||||
hbs_sock.tx_sock[iface]->get_dst_addr()->getPort(),
|
||||
hbs_sock.rx_mesg[iface].s,
|
||||
hbs_sock.rx_mesg[iface].f,
|
||||
hbs_sock.rx_mesg[iface].m,
|
||||
hbs_sock.rx_mesg[iface].c,
|
||||
pmonPulse_counter, rx_bytes, tx_bytes);
|
||||
}
|
||||
|
||||
/* Clear the error count since we got a good receive */
|
||||
@ -984,6 +1077,10 @@ int daemon_init ( string iface, string nodeType_str )
|
||||
/* Initialize socket construct and pointer to it */
|
||||
memset ( &hbs_sock, 0, sizeof(hbs_sock));
|
||||
|
||||
/* Initialize the controller cluster view data bounce structure */
|
||||
for ( int c = 0 ; c < MTCE_HBS_MAX_CONTROLLERS ; c++ )
|
||||
memset ( &controller_cluster_cache[c], 0, sizeof(mtce_hbs_cluster_type)) ;
|
||||
|
||||
/* init the utility module */
|
||||
hbs_utils_init ();
|
||||
|
||||
@ -1007,6 +1104,11 @@ int daemon_init ( string iface, string nodeType_str )
|
||||
|
||||
/* convert node type to integer */
|
||||
my_nodetype = get_host_function_mask ( nodeType_str ) ;
|
||||
if ( my_nodetype & CONTROLLER_TYPE )
|
||||
{
|
||||
/* is controller but don't know what one yet. */
|
||||
set_hn((char*)CONTROLLER_X);
|
||||
}
|
||||
ilog ("Node Type : %s (%d)\n", nodeType_str.c_str(), my_nodetype );
|
||||
|
||||
/* Bind signal handlers */
|
||||
@ -1058,7 +1160,6 @@ int daemon_init ( string iface, string nodeType_str )
|
||||
|
||||
int stall_threshold_log = 0 ;
|
||||
int stall_times_threshold_log = 0 ;
|
||||
#define MAX_LEN 300
|
||||
void daemon_service_run ( void )
|
||||
{
|
||||
#ifdef WANT_DAEMON_DEBUG
|
||||
@ -1205,7 +1306,7 @@ void daemon_service_run ( void )
|
||||