Mtce: Make Multi-Node Failure Avoidance Configurable
The maintenance system implements a high availability (HA) feature designed to detect the simultaneous heartbeat failure of a group of hosts and avoid failing all those hosts until heartbeat resumes or after a set period of time. This feature is called Multi-Node Failure Avoidance, aka MNFA, and currently has the hosts threshold set to 3 and timeout set to 100 secs. This update implements enhancements to that existing feature by making the 'number-of-hosts threshold' and 'timeout period' customer configurable service parameters. The new service parameters are listed under platform:maintenance and are displayed with the following command > system service-parameter-list mnfa_threshold: This new label and value is added to the puppet managed /etc/mtc.ini and represents the number of hosts that are required to fail heartbeat as a group; within the heartbeat failure window (heartbeat_failure_threshold) after which maintenance activates MNFA Mode. This update changes the default number of failing hosts from 3 to 2 while allowing a configurable range from 2 to 100. mnfa_timeout: This new label and value is added to the puppet managed /etc/mtc.ini. While MNFA mode is active, it will remain active until the number of failing hosts drops below the mnfa_threshold or this timer expires. The MNFA mode deactivates on the first occurrence of either case. Upon deactivation the remaining failed hosts are no longer treated as a failure group but instead are all Gracefully Recovered individually. A value of zero imposes no timeout making the deactivation criteria solely host based. This update changes the default 100 second timer to 0 (no timeout) while permitting a valid timeout range from 100 to 86400 secs or 1 day. 
Test Plan: PASS - Verify duplex and 4 compute DOR PASS - Verify default MNFA - 1 inactive controller and 4 computes PASS - Verify default MNFA - 4 computes PASS - Verify default MNFA - 1 active controller and 3 computes and failed host PASS - Verify Single host heartbeat failure handling - fail host PASS - Verify Multi Node failure below mnfa_threshold - fail hosts PASS - Verify MNFA handling with timeout of zero and threshold of 3 PASS - Verify MNFA timeout handling with timeout set at 100 sec PASS - Verify MNFA service parameter listing, default value and mtc.ini PASS - Verify MNFA service parameter change and inservice apply PASS - Verify MNFA timeout service parameter change from value to 0 PASS - Verify MNFA timeout service parameter change from 0 to in-range value PASS - Verify MNFA service parameter out of range change handling PASS - Verify MNFA timeout change from No-Timeout to 100 sec (while active) DocImpact Story: 2003576 Task: 24903 Change-Id: Ib56dd79b38c3726e042cf34aae361f229c89940b Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
482d1acea8
commit
82e851d651
@ -1,3 +1,3 @@
|
||||
SRC_DIR="cgts-mtce-common-1.0"
|
||||
TIS_PATCH_VER=135
|
||||
TIS_PATCH_VER=136
|
||||
BUILD_IS_SLOW=5
|
||||
|
@ -147,21 +147,6 @@ typedef struct
|
||||
|
||||
int latency_thld ; /**< scheduling latency threshold in msec b4 log */
|
||||
|
||||
/** Multi Node Failure Avoidance Controls */
|
||||
char * mnfa_threshold_type ; /**< value used in multi node failure
|
||||
avoidance calculation ;
|
||||
'number' / 'percent'age of hosts */
|
||||
int mnfa_threshold_percent ; /**< number of hosts simultaneously
|
||||
failing heartbeat */
|
||||
int mnfa_threshold_number ; /**< percentage of pool
|
||||
simultanepously failing heartbeat*/
|
||||
int mnfa_recovery_threshold ; /**< Multi-Node-Failure Avoidance Recovery Threshold
|
||||
Similar to the LOC above for graceful recovery
|
||||
hosts that have LOC for longer than this time in
|
||||
seconds are failed and sent into the enable_handler
|
||||
FSM while those that recover before this period are
|
||||
sent into the graceful recovery_handler FSM. */
|
||||
|
||||
/** Configurable Timeouts ; unit is 'seconds' */
|
||||
int controller_mtcalive_timeout ; /**< mtcAlive wait timeout */
|
||||
int compute_mtcalive_timeout ; /**< mtcAlive wait timeout */
|
||||
@ -172,7 +157,6 @@ typedef struct
|
||||
int sysinv_noncrit_timeout ; /**< sysinv nonc request timeout */
|
||||
int work_queue_timeout ; /**< end of action workq complete TO */
|
||||
int loc_recovery_timeout ; /**< loss of comms recovery timeout */
|
||||
int mnfa_recovery_timeout ; /**< mnfa recovery timeout */
|
||||
int node_reinstall_timeout ; /**< node reinstall timeout */
|
||||
int dor_mode_timeout ; /**< dead office recovery timeout */
|
||||
int dor_recovery_timeout_ext ; /**< dor recovery timeout extension */
|
||||
|
@ -218,7 +218,6 @@ nodeLinkClass::nodeLinkClass()
|
||||
this->controller_mtcalive_timeout = 0;
|
||||
this->goenabled_timeout = 0;
|
||||
this->loc_recovery_timeout = 0;
|
||||
this->mnfa_recovery_timeout = 0;
|
||||
this->node_reinstall_timeout = 0;
|
||||
this->token_refresh_rate = 0;
|
||||
this->autorecovery_enabled = false ;
|
||||
@ -270,16 +269,16 @@ nodeLinkClass::nodeLinkClass()
|
||||
active_controller_hostname.clear() ;
|
||||
inactive_controller_hostname.clear() ;
|
||||
|
||||
/* MNFA Activity Controls */
|
||||
mnfa_threshold = 2 ; /* 2 hosts */
|
||||
mnfa_timeout = 0 ; /* no timeout */
|
||||
|
||||
/* Start with no failures */
|
||||
mnfa_awol_list.clear();
|
||||
mnfa_host_count[MGMNT_IFACE] = 0 ;
|
||||
mnfa_host_count[INFRA_IFACE] = 0 ;
|
||||
mnfa_occurances = 0 ;
|
||||
mnfa_active = false ;
|
||||
mnfa_threshold_type = MNFA_NUMBER ;
|
||||
mnfa_threshold_percent = 5 ;
|
||||
mnfa_threshold_number = 3 ;
|
||||
mnfa_threshold = mnfa_threshold_number ;
|
||||
|
||||
mgmnt_link_up_and_running = false ;
|
||||
infra_link_up_and_running = false ;
|
||||
@ -4303,16 +4302,15 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
|
||||
{
|
||||
/* clear it - possibly temporarily */
|
||||
node_ptr->hbs_minor[iface] = false ;
|
||||
|
||||
|
||||
/* manage counts over heartbeat failure */
|
||||
if ( mnfa_host_count[iface] )
|
||||
{
|
||||
/* If we are mnfa_active AND now below the threshold
|
||||
/* If we are mnfa_active AND now below the threshold
|
||||
* then trigger mnfa_exit */
|
||||
if (( --mnfa_host_count[iface] < mnfa_calculate_threshold( node_ptr->hostname ) ) &&
|
||||
if (( --mnfa_host_count[iface] < mnfa_threshold) &&
|
||||
( mnfa_active == true ))
|
||||
{
|
||||
|
||||
wlog ("%s MNFA exit with graceful recovery (%s:%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_iface_name_str(iface),
|
||||
@ -4468,6 +4466,8 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
||||
}
|
||||
else
|
||||
{
|
||||
alarm_enabled_failure (node_ptr);
|
||||
|
||||
mnfa_add_host ( node_ptr , iface );
|
||||
|
||||
if ( mnfa_active == false )
|
||||
@ -4481,17 +4481,13 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
||||
{
|
||||
node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
|
||||
}
|
||||
if ( mnfa_host_count[iface] < mnfa_calculate_threshold( hostname ))
|
||||
if (mnfa_host_count[iface] < this->mnfa_threshold)
|
||||
{
|
||||
|
||||
elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));
|
||||
|
||||
nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
|
||||
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
||||
{
|
||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL;
|
||||
}
|
||||
|
||||
alarm_enabled_failure (node_ptr);
|
||||
|
||||
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
|
||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
|
||||
@ -8296,19 +8292,12 @@ void nodeLinkClass::mem_log_dor ( struct nodeLinkClass::node * node_ptr )
|
||||
void nodeLinkClass::mem_log_mnfa ( void )
|
||||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
|
||||
int temp = mnfa_threshold_number ;
|
||||
if ( mnfa_threshold_type == MNFA_PERCENT )
|
||||
temp = mnfa_threshold_percent ;
|
||||
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s MNFA: Mode:%s:%d State:%s Hosts:%d:%d Cases:%d Threshold:%d\n",
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s MNFA: State:%s Hosts:%d:%d Threshold:%d Occurances:%d\n",
|
||||
my_hostname.c_str(),
|
||||
mnfa_threshold_type ? "Percent" : "Number",
|
||||
temp,
|
||||
mnfa_active ? "ACTIVE" : "inactive",
|
||||
mnfa_host_count[MGMNT_IFACE],
|
||||
mnfa_host_count[INFRA_IFACE],
|
||||
mnfa_calculate_threshold( "" ),
|
||||
mnfa_threshold,
|
||||
mnfa_occurances);
|
||||
mem_log (str);
|
||||
}
|
||||
@ -8316,7 +8305,7 @@ void nodeLinkClass::mem_log_mnfa ( void )
|
||||
void nodeLinkClass::mem_log_general_mtce_hosts ( void )
|
||||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d\n",
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s EnableHosts -> Cont:%d Comp:%d Stor:%d StorType:%d\n",
|
||||
my_hostname.c_str(),
|
||||
num_controllers_enabled(),
|
||||
enabled_compute_nodes(),
|
||||
|
@ -1149,11 +1149,6 @@ private:
|
||||
/** Tracks the number of times multi failure avoidance was exited */
|
||||
int mnfa_occurances ;
|
||||
|
||||
/** true when the multi node failure count exceeds the multi
|
||||
* node failure avoidance threshold and until there are no more
|
||||
* in service trouble hosts */
|
||||
bool mnfa_active ;
|
||||
|
||||
/** Recover or exit from the multi-node failure avoidance state
|
||||
* This involves restarting the heartbeat on all the nodes
|
||||
* that remain hbs_minor and clearing any heartbeat degrade
|
||||
@ -1428,6 +1423,11 @@ public:
|
||||
std::list<string> hostname_inventory ;
|
||||
std::list<string>::iterator host ;
|
||||
|
||||
/** true when the multi node failure count exceeds the multi
|
||||
* node failure avoidance threshold and until there are no more
|
||||
* in service trouble hosts */
|
||||
bool mnfa_active ;
|
||||
|
||||
std::list<string> mnfa_awol_list ;
|
||||
void mnfa_timeout_handler ( void );
|
||||
|
||||
@ -1722,23 +1722,17 @@ public:
|
||||
int inotify_shadow_file_fd ;
|
||||
int inotify_shadow_file_wd ;
|
||||
|
||||
/** The multi node failure avoidance type */
|
||||
#define MNFA_NUMBER 0
|
||||
#define MNFA_PERCENT 1
|
||||
int mnfa_threshold_type ;
|
||||
/* MNFA Timeout
|
||||
*
|
||||
* Time in secs MNFA can remain active.
|
||||
* If 0 then there is no timeout. */
|
||||
int mnfa_timeout ;
|
||||
|
||||
/** % of hosts that need to simultaneously fail before 'mnfa' kicks in */
|
||||
int mnfa_threshold_percent ;
|
||||
|
||||
/** # of hosts that need to simultaneously fail before 'mnfa' kicks in */
|
||||
int mnfa_threshold_number ;
|
||||
|
||||
/** the calculated threshold */
|
||||
/* MNFA Host Involvement Threshold
|
||||
* Number of hosts simultaneously failing heartbeat
|
||||
* upon which feature will activate */
|
||||
int mnfa_threshold ;
|
||||
|
||||
/** Calculates and returns the mnfa threshold based on enabled hosts */
|
||||
int mnfa_calculate_threshold ( string hostname );
|
||||
|
||||
/* collectd event handler */
|
||||
int collectd_notify_handler ( string & hostname,
|
||||
string & resource,
|
||||
@ -1997,7 +1991,6 @@ public:
|
||||
int sysinv_timeout ;
|
||||
int sysinv_noncrit_timeout ;
|
||||
int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout */
|
||||
int mnfa_recovery_timeout; /**< Multi-Node-Failure Avoidance Recovery Timeout */
|
||||
int work_queue_timeout ;
|
||||
int node_reinstall_timeout ;
|
||||
|
||||
|
@ -46,7 +46,6 @@ void daemon_config_default ( daemon_config_type* config_ptr )
|
||||
config_ptr->sysinv_api_bind_ip = strdup("none");
|
||||
config_ptr->mode = strdup("none");
|
||||
config_ptr->fit_host = strdup("none");
|
||||
config_ptr->mnfa_threshold_type = strdup("none");
|
||||
config_ptr->multicast = strdup("none");
|
||||
|
||||
config_ptr->debug_all = 0 ;
|
||||
@ -174,14 +173,6 @@ int timeout_config_handler ( void * user,
|
||||
config_ptr->loc_recovery_timeout = atoi(value);
|
||||
ilog ("LOC Timeout: %3d secs\n", config_ptr->loc_recovery_timeout );
|
||||
}
|
||||
else if (MATCH("timeouts", "mnfa_recovery_timeout"))
|
||||
{
|
||||
config_ptr->mnfa_recovery_timeout = atoi(value);
|
||||
if (( config_ptr->mnfa_recovery_timeout > 300 ) ||
|
||||
( config_ptr->mnfa_recovery_timeout == 0 ))
|
||||
config_ptr->mnfa_recovery_timeout = 5 ;
|
||||
ilog ("MNFA Timeout: %3d secs\n", config_ptr->mnfa_recovery_timeout );
|
||||
}
|
||||
else if (MATCH("timeouts", "node_reinstall_timeout"))
|
||||
{
|
||||
config_ptr->node_reinstall_timeout = atoi(value);
|
||||
@ -238,7 +229,7 @@ void daemon_dump_cfg ( void )
|
||||
|
||||
ilog ("Configuration Settings\n------------------------------\n");
|
||||
if ( ptr->scheduling_priority ) { ilog ("scheduling_priority = %d\n", ptr->scheduling_priority ); }
|
||||
|
||||
|
||||
if ( ptr->infra_degrade_only ) { ilog ("infra_degrade_only = %s\n", ptr->infra_degrade_only ? "Yes" : "No" );}
|
||||
if ( ptr->need_infra_poll_audit ) { ilog ("need_infra_poll_audit = %s\n", ptr->need_infra_poll_audit ? "Yes" : "No" );}
|
||||
if ( ptr->active ) { ilog ("active = %s\n", ptr->active ? "Yes" : "No" );}
|
||||
@ -254,7 +245,6 @@ void daemon_dump_cfg ( void )
|
||||
if ( strcmp(ptr->infra_iface, "none" )) { ilog ("infra_iface = %s\n", ptr->infra_iface );}
|
||||
if ( strcmp(ptr->multicast, "none" )) { ilog ("multicast = %s\n", ptr->multicast );}
|
||||
|
||||
|
||||
if ( ptr->ha_port ) { ilog ("ha_port = %d\n", ptr->ha_port );}
|
||||
if ( ptr->vim_cmd_port ) { ilog ("vim_cmd_port = %d\n", ptr->vim_cmd_port );}
|
||||
if ( ptr->vim_event_port ) { ilog ("vim_event_port = %d\n", ptr->vim_event_port );}
|
||||
@ -286,7 +276,7 @@ void daemon_dump_cfg ( void )
|
||||
if ( ptr->hwmon_cmd_port ) { ilog ("hwmon_cmd_port = %d\n", ptr->hwmon_cmd_port );}
|
||||
if ( ptr->hbs_to_mtc_event_port) { ilog ("hbs_to_mtc_event_port = %d\n", ptr->hbs_to_mtc_event_port);}
|
||||
if ( ptr->inv_event_port ) { ilog ("inv_event_port = %d\n", ptr->inv_event_port );}
|
||||
|
||||
|
||||
/* rmond */
|
||||
if ( ptr->per_node ) { ilog ("per_node = %d\n", ptr->per_node );}
|
||||
if ( ptr->audit_period ) { ilog ("audit_period = %d\n", ptr->audit_period );}
|
||||
@ -316,10 +306,6 @@ void daemon_dump_cfg ( void )
|
||||
if ( ptr->stall_rec_thld ) { ilog ("stall_rec_thld = %d\n", ptr->stall_rec_thld );}
|
||||
|
||||
/* mtcAgent */
|
||||
if ( ptr->mnfa_threshold_type ) { ilog ("mnfa_threshold_type = %s\n", ptr->mnfa_threshold_type );}
|
||||
if ( ptr->mnfa_threshold_percent ) { ilog ("mnfa_threshold_percent= %d\n", ptr->mnfa_threshold_percent );}
|
||||
if ( ptr->mnfa_threshold_number ) { ilog ("mnfa_threshold_number = %d\n", ptr->mnfa_threshold_number );}
|
||||
if ( ptr->mnfa_recovery_threshold ) { ilog ("mnfa_recovery_threshod= %d\n", ptr->mnfa_recovery_threshold );}
|
||||
if ( ptr->controller_mtcalive_timeout) { ilog ("controller_mtcalive_to= %d\n", ptr->controller_mtcalive_timeout );}
|
||||
if ( ptr->compute_mtcalive_timeout ) { ilog ("compute_mtcalive_to = %d\n", ptr->compute_mtcalive_timeout );}
|
||||
if ( ptr->goenabled_timeout ) { ilog ("goenabled_timeout = %d\n", ptr->goenabled_timeout );}
|
||||
@ -328,7 +314,6 @@ void daemon_dump_cfg ( void )
|
||||
if ( ptr->sysinv_noncrit_timeout ) { ilog ("sysinv_noncrit_timeout= %d\n", ptr->sysinv_noncrit_timeout );}
|
||||
if ( ptr->work_queue_timeout ) { ilog ("work_queue_timeout = %d\n", ptr->work_queue_timeout );}
|
||||
if ( ptr->loc_recovery_timeout ) { ilog ("loc_recovery_timeout = %d\n", ptr->loc_recovery_timeout );}
|
||||
if ( ptr->mnfa_recovery_timeout ) { ilog ("mnfa_recovery_timeout = %d\n", ptr->mnfa_recovery_timeout );}
|
||||
if ( ptr->node_reinstall_timeout ) { ilog ("node_reinstall_timeout= %d\n", ptr->node_reinstall_timeout );}
|
||||
if ( ptr->uptime_period ) { ilog ("uptime_period = %d\n", ptr->uptime_period );}
|
||||
if ( ptr->online_period ) { ilog ("online_period = %d\n", ptr->online_period );}
|
||||
|
@ -270,12 +270,6 @@ int daemon_log_message ( const char * hostname,
|
||||
return(PASS);
|
||||
}
|
||||
|
||||
|
||||
int nodeLinkClass::mnfa_calculate_threshold ( string hostname )
|
||||
{
|
||||
UNUSED(hostname);
|
||||
return(PASS) ;
|
||||
}
|
||||
void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr, iface_enum iface )
|
||||
{ node_ptr = node_ptr ; iface = iface ; }
|
||||
void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
|
||||
|
@ -295,18 +295,6 @@ static int mtc_config_handler ( void * user,
|
||||
config_ptr->mask |= CONFIG_AGENT_API_RETRIES ;
|
||||
mtcInv.api_retries = config_ptr->api_retries ;
|
||||
}
|
||||
else if (MATCH("agent", "mnfa_threshold_type"))
|
||||
{
|
||||
config_ptr->mnfa_threshold_type = strdup(value);
|
||||
}
|
||||
else if (MATCH("agent", "mnfa_threshold_percent"))
|
||||
{
|
||||
config_ptr->mnfa_threshold_percent = atoi(value);
|
||||
}
|
||||
else if (MATCH("agent", "mnfa_threshold_number"))
|
||||
{
|
||||
config_ptr->mnfa_threshold_number = atoi(value);
|
||||
}
|
||||
else if (MATCH("timeouts", "failsafe_shutdown_delay"))
|
||||
{
|
||||
config_ptr->failsafe_shutdown_delay = atoi(value);
|
||||
@ -335,6 +323,55 @@ static int mtc_config_handler ( void * user,
|
||||
return (FAIL);
|
||||
}
|
||||
|
||||
static int mtc_ini_handler ( void * user,
|
||||
const char * section,
|
||||
const char * name,
|
||||
const char * value)
|
||||
{
|
||||
UNUSED(user);
|
||||
|
||||
if (MATCH("agent", "mnfa_threshold"))
|
||||
{
|
||||
mtcInv.mnfa_threshold = atoi(value);
|
||||
ilog ("MNFA Threshd: %d\n", mtcInv.mnfa_threshold);
|
||||
}
|
||||
else if (MATCH("timeouts", "mnfa_timeout"))
|
||||
{
|
||||
int old = mtcInv.mnfa_timeout ;
|
||||
mtcInv.mnfa_timeout = atoi(value);
|
||||
if ( mtcInv.mnfa_timeout == 0 )
|
||||
{
|
||||
ilog ("MNFA Timeout: Never\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
ilog ("MNFA Timeout: %3d secs\n", mtcInv.mnfa_timeout );
|
||||
}
|
||||
|
||||
/* handle a change in mnfa timeout while MNFA is active */
|
||||
if (( mtcInv.mnfa_active == true ) &&
|
||||
( mtcInv.mnfa_timeout != old ))
|
||||
{
|
||||
mtcTimer_reset ( mtcInv.mtcTimer_mnfa );
|
||||
if (( old == 0 ) || mtcInv.mnfa_timeout != 0 )
|
||||
{
|
||||
wlog ("MNFA Auto-Recovery in %d seconds\n",
|
||||
mtcInv.mnfa_timeout);
|
||||
|
||||
mtcTimer_start ( mtcInv.mtcTimer_mnfa,
|
||||
mtcTimer_handler,
|
||||
mtcInv.mnfa_timeout);
|
||||
}
|
||||
else if ( mtcInv.mnfa_timeout == 0 )
|
||||
{
|
||||
ilog ("MNFA timer set to no-timeout ; previous %d sec timer cancelled", old );
|
||||
}
|
||||
}
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
|
||||
|
||||
/* Read and process mtc.ini file settings into the daemon configuration */
|
||||
int daemon_configure ( void )
|
||||
{
|
||||
@ -350,6 +387,12 @@ int daemon_configure ( void )
|
||||
return (FAIL_LOAD_INI);
|
||||
}
|
||||
|
||||
if (ini_parse(MTCE_INI_FILE, mtc_ini_handler, &mtc_config) < 0)
|
||||
{
|
||||
elog ("Can't load '%s'\n", MTCE_INI_FILE );
|
||||
return (FAIL_LOAD_INI);
|
||||
}
|
||||
|
||||
if (ini_parse(MTCE_INI_FILE, keystone_config_handler, &mtc_config) < 0)
|
||||
{
|
||||
elog ("Can't load '%s'\n", MTCE_INI_FILE );
|
||||
@ -406,14 +449,12 @@ int daemon_configure ( void )
|
||||
mtcInv.goenabled_timeout = DEFAULT_GOENABLE_TIMEOUT ;
|
||||
|
||||
mtcInv.loc_recovery_timeout = mtc_config.loc_recovery_timeout ;
|
||||
mtcInv.mnfa_recovery_timeout = mtc_config.mnfa_recovery_timeout ;
|
||||
|
||||
if ( mtc_config.node_reinstall_timeout )
|
||||
mtcInv.node_reinstall_timeout = mtc_config.node_reinstall_timeout ;
|
||||
else
|
||||
mtcInv.node_reinstall_timeout = MTC_REINSTALL_TIMEOUT_DEFAULT ;
|
||||
|
||||
|
||||
if ( mtc_config.dor_mode_timeout <= 0 )
|
||||
{
|
||||
slog ("DOR Mode Timeout is invalid (%d), setting to default (%d)\n",
|
||||
@ -423,25 +464,6 @@ int daemon_configure ( void )
|
||||
mtc_config.dor_mode_timeout = DEFAULT_DOR_MODE_TIMEOUT ;
|
||||
}
|
||||
|
||||
/* validate and auto correct manage multi node failure avoidance thresholds */
|
||||
if (( mtc_config.mnfa_threshold_type != NULL ) &&
|
||||
( !strncmp (mtc_config.mnfa_threshold_type, "percent", strlen("percent"))))
|
||||
{
|
||||
if ( mtc_config.mnfa_threshold_percent > 100 )
|
||||
{
|
||||
mtc_config.mnfa_threshold_percent = 100 ;
|
||||
}
|
||||
mtcInv.mnfa_threshold_type = MNFA_PERCENT ;
|
||||
ilog ("mnfAvoidance: %d%c\n", mtc_config.mnfa_threshold_percent, '%' );
|
||||
mtcInv.mnfa_threshold_percent = mtc_config.mnfa_threshold_percent ;
|
||||
}
|
||||
else
|
||||
{
|
||||
mtcInv.mnfa_threshold_type = MNFA_NUMBER ;
|
||||
ilog ("mnfAvoidance: %d hosts\n", mtc_config.mnfa_threshold_number );
|
||||
mtcInv.mnfa_threshold_number = mtc_config.mnfa_threshold_number ;
|
||||
}
|
||||
|
||||
if ( mtc_config.swact_timeout )
|
||||
{
|
||||
if ( mtc_config.swact_timeout < (MTC_SWACT_POLL_TIMER*2))
|
||||
|
@ -1690,11 +1690,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* Go to the goEnabled stage */
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
||||
|
||||
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
||||
{
|
||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
||||
}
|
||||
alarm_enabled_failure(node_ptr);
|
||||
break ;
|
||||
}
|
||||
}
|
||||
@ -1732,11 +1728,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* Go to the goEnabled stage */
|
||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
||||
|
||||
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
||||
{
|
||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
||||
}
|
||||
alarm_enabled_failure (node_ptr);
|
||||
}
|
||||
}
|
||||
/* A timer ring indicates that the host is not up */
|
||||
@ -1780,11 +1772,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* Inform the VIM that this host has failed */
|
||||
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
||||
|
||||
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
||||
{
|
||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
||||
}
|
||||
alarm_enabled_failure(node_ptr);
|
||||
|
||||
/* Clear all degrade flags except for the HWMON one */
|
||||
clear_host_degrade_causes ( node_ptr->degrade_mask );
|
||||
node_ptr->degraded_resources_list.clear();
|
||||
|
@ -41,48 +41,6 @@ void log_mnfa_pool ( std::list<string> & mnfa_awol_list )
|
||||
ilog ("MNFA POOL:%s\n", pool_list.c_str());
|
||||
}
|
||||
|
||||
/*******************************************************************************
|
||||
*
|
||||
* Name : mnfa_calculate_threshold
|
||||
*
|
||||
* Description: Calculates and returns the mnfa threshold based
|
||||
* on enabled hosts.
|
||||
*
|
||||
* Auto corrects the value to a min number.
|
||||
*
|
||||
* Calculate the multi-node failure avoidance handling threshold
|
||||
* This is the number of hosts than need to fail simultaneously
|
||||
* in order to trigger mode ; i.e. mnfa_active=true
|
||||
*
|
||||
*******************************************************************************/
|
||||
int nodeLinkClass::mnfa_calculate_threshold ( string hostname )
|
||||
{
|
||||
int mnfa_enabled_nodes = enabled_nodes ();
|
||||
|
||||
/* Calculate the threshold */
|
||||
if ( mnfa_threshold_type == MNFA_PERCENT )
|
||||
mnfa_threshold = mnfa_enabled_nodes / mnfa_threshold_percent ;
|
||||
else
|
||||
mnfa_threshold = mnfa_threshold_number ;
|
||||
|
||||
/* Don't allow the multi-node failure avoidance
|
||||
* to ever be 1 or we would never fail a host */
|
||||
if ( mnfa_threshold < mnfa_threshold_number )
|
||||
{
|
||||
ilog ("%s MNFA threshold rounded to %d from %d\n",
|
||||
hostname.c_str(),
|
||||
mnfa_threshold_number,
|
||||
mnfa_enabled_nodes / mnfa_threshold_percent );
|
||||
mnfa_threshold = mnfa_threshold_number ;
|
||||
}
|
||||
|
||||
if ( mnfa_awol_list.size() )
|
||||
{
|
||||
log_mnfa_pool ( mnfa_awol_list );
|
||||
}
|
||||
return (mnfa_threshold);
|
||||
}
|
||||
|
||||
/*****************************************************************************
|
||||
*
|
||||
* Name : mnfa_add_host
|
||||
@ -105,6 +63,8 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
|
||||
/* if we are active then add the node to the awol list */
|
||||
if ( mnfa_active == true )
|
||||
{
|
||||
alarm_enabled_failure (node_ptr);
|
||||
|
||||
/* once we are mnfa_active we need to give all the
|
||||
* hbs_minor=true hosts a graceful recovery token
|
||||
* mnfa_graceful_recovery = true and add to the awol list */
|
||||
@ -116,7 +76,7 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
|
||||
}
|
||||
else if (( mnfa_active == false ) &&
|
||||
( mnfa_host_count[iface] >= mnfa_calculate_threshold( node_ptr->hostname )))
|
||||
( mnfa_host_count[iface] >= this->mnfa_threshold))
|
||||
{
|
||||
enter = true ;
|
||||
}
|
||||
@ -134,6 +94,11 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
|
||||
get_iface_name_str(INFRA_IFACE),
|
||||
node_ptr->hbs_minor_count[INFRA_IFACE]);
|
||||
|
||||
if ( mnfa_awol_list.size() )
|
||||
{
|
||||
log_mnfa_pool ( mnfa_awol_list );
|
||||
}
|
||||
|
||||
if ( enter == true )
|
||||
{
|
||||
mnfa_enter ();
|
||||
@ -196,7 +161,7 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
|
||||
* mnfa_graceful_recovery token
|
||||
*
|
||||
* 5. Start the MNFA Auto-Recovery timer with time based on the config
|
||||
* setting mnfa_recovery_timeout
|
||||
* setting mnfa_timeout
|
||||
*
|
||||
****************************************************************************/
|
||||
void nodeLinkClass::mnfa_enter ( void )
|
||||
@ -211,8 +176,7 @@ void nodeLinkClass::mnfa_enter ( void )
|
||||
* previous mnfa but the failure case occurs again. If that
|
||||
* happens we need to cancel the timer that will issue
|
||||
* the period recovery command. */
|
||||
if ( mtcTimer_mnfa.tid )
|
||||
mtcTimer_stop ( mtcTimer_mnfa );
|
||||
mtcTimer_reset ( mtcTimer_mnfa );
|
||||
|
||||
/* Loop through inventory and recover each host that
|
||||
* remains in the hbs_minor state.
|
||||
@ -232,6 +196,7 @@ void nodeLinkClass::mnfa_enter ( void )
|
||||
if ( ptr->task != MTC_TASK_RECOVERY_WAIT )
|
||||
mtcInvApi_update_task ( ptr, MTC_TASK_RECOVERY_WAIT );
|
||||
|
||||
alarm_enabled_failure (ptr);
|
||||
}
|
||||
if (( ptr->next == NULL ) || ( ptr == tail ))
|
||||
break ;
|
||||
@ -239,14 +204,20 @@ void nodeLinkClass::mnfa_enter ( void )
|
||||
|
||||
mnfa_awol_list.unique();
|
||||
|
||||
/* Start the timer that will eventually send the MTC_RECOVER_HBS command */
|
||||
wlog ("MNFA Auto-Recovery in %d seconds\n", mnfa_recovery_timeout);
|
||||
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, mnfa_recovery_timeout);
|
||||
if ( this->mnfa_timeout )
|
||||
{
|
||||
wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout);
|
||||
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout);
|
||||
}
|
||||
if ( mnfa_awol_list.size() )
|
||||
{
|
||||
log_mnfa_pool ( mnfa_awol_list );
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* Name : mnfa_enter
|
||||
* Name : mnfa_exit
|
||||
*
|
||||
* Description: Perform the operations required to exit mnfa mode
|
||||
* These include ...
|
||||
@ -266,7 +237,7 @@ void nodeLinkClass::mnfa_enter ( void )
|
||||
* hosts that remain in the hbs_minor state.
|
||||
*
|
||||
* if ( force == true )
|
||||
* The mnfa_recovery_timeout has expired
|
||||
* The mnfa_timeout has expired
|
||||
* All hosts in the awol list are forced failed and into the
|
||||
* enable_handler FSM.
|
||||
* else
|
||||
@ -279,18 +250,20 @@ void nodeLinkClass::mnfa_exit ( bool force )
|
||||
{
|
||||
if ( mnfa_active == true )
|
||||
{
|
||||
wlog ("MNFA EXIT <-- Exiting Multi-Node Failure Avoidance %s\n",
|
||||
force ? "(Auto-Recover)" : "");
|
||||
|
||||
mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT );
|
||||
mnfa_occurances++ ;
|
||||
mnfa_active = false ;
|
||||
|
||||
if ( force == true )
|
||||
{
|
||||
elog ("... MNFA %d sec timeout - forcing full enable on ... \n",
|
||||
mnfa_recovery_timeout);
|
||||
this->mnfa_timeout);
|
||||
}
|
||||
|
||||
wlog ("MNFA EXIT <-- Exiting Multi-Node Failure Avoidance %s\n",
|
||||
force ? "(Auto-Recover)" : "");
|
||||
mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT );
|
||||
|
||||
if ( mnfa_awol_list.size() )
|
||||
{
|
||||
log_mnfa_pool ( mnfa_awol_list );
|
||||
}
|
||||
|
||||
@ -342,8 +315,7 @@ void nodeLinkClass::mnfa_exit ( bool force )
|
||||
}
|
||||
|
||||
/* Stop the ... failure -> full enable ... window timer if it is active */
|
||||
if ( mtcTimer_mnfa.tid )
|
||||
mtcTimer_stop ( mtcTimer_mnfa );
|
||||
mtcTimer_reset ( mtcTimer_mnfa );
|
||||
|
||||
/* Start the timer that will eventually send the MTC_RECOVER_HBS command */
|
||||
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER );
|
||||
|
@ -18,14 +18,6 @@ offline_threshold = 46 ; number of back to back mtcAlive requests missed
|
||||
; 100:46 will yield a typical 5 sec holdoff from
|
||||
; failed to offline
|
||||
|
||||
mnfa_threshold_type = number ; Two different types are supported
|
||||
; 'number' or 'percent' of simultaneous
|
||||
; failures that enable multi-node
|
||||
; failure avoidance handling
|
||||
;
|
||||
mnfa_threshold_percent = 10 ; if ( mnfa_threshold_type == percent )
|
||||
mnfa_threshold_number = 3 ; if ( mnfa_threshold_type == number )
|
||||
|
||||
inventory_port = 6385 ; The Inventory Port Number
|
||||
keystone_port = 5000 ; The Keystone Port Number
|
||||
ha_port = 7777 ; The Inventory Port Number
|
||||
@ -86,13 +78,6 @@ loc_recovery_timeout = 5 ; Loss Of Communication Recovery Timeout
|
||||
; the max number of seconds that a host can be in
|
||||
; loss of communication state without failing the unit
|
||||
|
||||
mnfa_recovery_timeout = 100 ; Multi-Node-Failure Avoidance Recovery Threshold
|
||||
; Similar to the LOC above for graceful recovery
|
||||
; hosts that have LOC for longer than this time in
|
||||
; seconds are failed and sent into the enable_handler
|
||||
; FSM while those that recover before this period are
|
||||
; sent into the graceful recovery_handler FSM.
|
||||
|
||||
dor_mode_timeout = 20 ; The default base time in seconds for how long
|
||||
; maintenance DOR mode is active. This number
|
||||
; is extended by the number of enabled hosts.
|
||||
|
Loading…
Reference in New Issue
Block a user