Mtce: Make Multi-Node Failure Avoidance Configurable
The maintenance system implements a high availability (HA) feature designed to detect the simultaneous heartbeat failure of a group of hosts and avoid failing all those hosts until heartbeat resumes or a set period of time elapses. This feature is called Multi-Node Failure Avoidance, aka MNFA, and currently has the hosts threshold set to 3 and the timeout set to 100 secs. This update implements enhancements to that existing feature by making the 'number-of-hosts threshold' and 'timeout period' customer configurable service parameters. The new service parameters are listed under platform:maintenance and are displayed with the following command > system service-parameter-list mnfa_threshold: This new label and value are added to the puppet managed /etc/mtc.ini and represent the number of hosts that are required to fail heartbeat as a group, within the heartbeat failure window (heartbeat_failure_threshold), after which maintenance activates MNFA Mode. This update changes the default number of failing hosts from 3 to 2 while allowing a configurable range from 2 to 100. mnfa_timeout: This new label and value are added to the puppet managed /etc/mtc.ini. While MNFA mode is active, it will remain active until the number of failing hosts drops below the mnfa_threshold or this timer expires. The MNFA mode deactivates on the first occurrence of either case. Upon deactivation the remaining failed hosts are no longer treated as a failure group but instead are all Gracefully Recovered individually. A value of zero imposes no timeout, making the deactivation criteria solely host based. This update changes the default 100 second timer to 0 (no timeout) while permitting a valid time range from 100 to 86400 secs (1 day). 
Test Plan: PASS - Verify duplex and 4 compute DOR PASS - Verify default MNFA - 1 inactive controller and 4 computes PASS - Verify default MNFA - 4 computes PASS - Verify default MNFA - 1 active controller and 3 computes and failed host PASS - Verify Single host heartbeat failure handling - fail host PASS - Verify Multi Node failure below mnfa_threshold - fail hosts PASS - Verify MNFA handling with timeout of zero and threshold of 3 PASS - Verify MNFA timeout handling with timeout set at 100 sec PASS - Verify MNFA service parameter listing, default value and mtc.ini PASS - Verify MNFA service parameter change and inservice apply PASS - Verify MNFA timeout service parameter change from value to 0 PASS - Verify MNFA timeout service parameter change from 0 to in-range value PASS - Verify MNFA service parameter out of range change handling PASS - Verify MNFA timeout change from No-Timeout to 100 sec (while active) DocImpact Story: 2003576 Task: 24903 Change-Id: Ib56dd79b38c3726e042cf34aae361f229c89940b Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
482d1acea8
commit
82e851d651
@ -1,3 +1,3 @@
|
|||||||
SRC_DIR="cgts-mtce-common-1.0"
|
SRC_DIR="cgts-mtce-common-1.0"
|
||||||
TIS_PATCH_VER=135
|
TIS_PATCH_VER=136
|
||||||
BUILD_IS_SLOW=5
|
BUILD_IS_SLOW=5
|
||||||
|
@ -147,21 +147,6 @@ typedef struct
|
|||||||
|
|
||||||
int latency_thld ; /**< scheduling latency threshold in msec b4 log */
|
int latency_thld ; /**< scheduling latency threshold in msec b4 log */
|
||||||
|
|
||||||
/** Multi Node Failure Avoidance Controls */
|
|
||||||
char * mnfa_threshold_type ; /**< value used in multi node failure
|
|
||||||
avoidance calculation ;
|
|
||||||
'number' / 'percent'age of hosts */
|
|
||||||
int mnfa_threshold_percent ; /**< number of hosts simultaneously
|
|
||||||
failing heartbeat */
|
|
||||||
int mnfa_threshold_number ; /**< percentage of pool
|
|
||||||
simultanepously failing heartbeat*/
|
|
||||||
int mnfa_recovery_threshold ; /**< Multi-Node-Failure Avoidance Recovery Threshold
|
|
||||||
Similar to the LOC above for graceful recovery
|
|
||||||
hosts that have LOC for longer than this time in
|
|
||||||
seconds are failed and sent into the enable_handler
|
|
||||||
FSM while those that recover before this period are
|
|
||||||
sent into the graceful recovery_handler FSM. */
|
|
||||||
|
|
||||||
/** Configurable Timeouts ; unit is 'seconds' */
|
/** Configurable Timeouts ; unit is 'seconds' */
|
||||||
int controller_mtcalive_timeout ; /**< mtcAlive wait timeout */
|
int controller_mtcalive_timeout ; /**< mtcAlive wait timeout */
|
||||||
int compute_mtcalive_timeout ; /**< mtcAlive wait timeout */
|
int compute_mtcalive_timeout ; /**< mtcAlive wait timeout */
|
||||||
@ -172,7 +157,6 @@ typedef struct
|
|||||||
int sysinv_noncrit_timeout ; /**< sysinv nonc request timeout */
|
int sysinv_noncrit_timeout ; /**< sysinv nonc request timeout */
|
||||||
int work_queue_timeout ; /**< end of action workq complete TO */
|
int work_queue_timeout ; /**< end of action workq complete TO */
|
||||||
int loc_recovery_timeout ; /**< loss of comms recovery timeout */
|
int loc_recovery_timeout ; /**< loss of comms recovery timeout */
|
||||||
int mnfa_recovery_timeout ; /**< mnfa recovery timeout */
|
|
||||||
int node_reinstall_timeout ; /**< node reinstall timeout */
|
int node_reinstall_timeout ; /**< node reinstall timeout */
|
||||||
int dor_mode_timeout ; /**< dead office recovery timeout */
|
int dor_mode_timeout ; /**< dead office recovery timeout */
|
||||||
int dor_recovery_timeout_ext ; /**< dor recovery timeout extension */
|
int dor_recovery_timeout_ext ; /**< dor recovery timeout extension */
|
||||||
|
@ -218,7 +218,6 @@ nodeLinkClass::nodeLinkClass()
|
|||||||
this->controller_mtcalive_timeout = 0;
|
this->controller_mtcalive_timeout = 0;
|
||||||
this->goenabled_timeout = 0;
|
this->goenabled_timeout = 0;
|
||||||
this->loc_recovery_timeout = 0;
|
this->loc_recovery_timeout = 0;
|
||||||
this->mnfa_recovery_timeout = 0;
|
|
||||||
this->node_reinstall_timeout = 0;
|
this->node_reinstall_timeout = 0;
|
||||||
this->token_refresh_rate = 0;
|
this->token_refresh_rate = 0;
|
||||||
this->autorecovery_enabled = false ;
|
this->autorecovery_enabled = false ;
|
||||||
@ -270,16 +269,16 @@ nodeLinkClass::nodeLinkClass()
|
|||||||
active_controller_hostname.clear() ;
|
active_controller_hostname.clear() ;
|
||||||
inactive_controller_hostname.clear() ;
|
inactive_controller_hostname.clear() ;
|
||||||
|
|
||||||
|
/* MNFA Activity Controls */
|
||||||
|
mnfa_threshold = 2 ; /* 2 hosts */
|
||||||
|
mnfa_timeout = 0 ; /* no timeout */
|
||||||
|
|
||||||
/* Start with no failures */
|
/* Start with no failures */
|
||||||
mnfa_awol_list.clear();
|
mnfa_awol_list.clear();
|
||||||
mnfa_host_count[MGMNT_IFACE] = 0 ;
|
mnfa_host_count[MGMNT_IFACE] = 0 ;
|
||||||
mnfa_host_count[INFRA_IFACE] = 0 ;
|
mnfa_host_count[INFRA_IFACE] = 0 ;
|
||||||
mnfa_occurances = 0 ;
|
mnfa_occurances = 0 ;
|
||||||
mnfa_active = false ;
|
mnfa_active = false ;
|
||||||
mnfa_threshold_type = MNFA_NUMBER ;
|
|
||||||
mnfa_threshold_percent = 5 ;
|
|
||||||
mnfa_threshold_number = 3 ;
|
|
||||||
mnfa_threshold = mnfa_threshold_number ;
|
|
||||||
|
|
||||||
mgmnt_link_up_and_running = false ;
|
mgmnt_link_up_and_running = false ;
|
||||||
infra_link_up_and_running = false ;
|
infra_link_up_and_running = false ;
|
||||||
@ -4309,10 +4308,9 @@ void nodeLinkClass::hbs_minor_clear ( struct nodeLinkClass::node * node_ptr, ifa
|
|||||||
{
|
{
|
||||||
/* If we are mnfa_active AND now below the threshold
|
/* If we are mnfa_active AND now below the threshold
|
||||||
* then trigger mnfa_exit */
|
* then trigger mnfa_exit */
|
||||||
if (( --mnfa_host_count[iface] < mnfa_calculate_threshold( node_ptr->hostname ) ) &&
|
if (( --mnfa_host_count[iface] < mnfa_threshold) &&
|
||||||
( mnfa_active == true ))
|
( mnfa_active == true ))
|
||||||
{
|
{
|
||||||
|
|
||||||
wlog ("%s MNFA exit with graceful recovery (%s:%d)\n",
|
wlog ("%s MNFA exit with graceful recovery (%s:%d)\n",
|
||||||
node_ptr->hostname.c_str(),
|
node_ptr->hostname.c_str(),
|
||||||
get_iface_name_str(iface),
|
get_iface_name_str(iface),
|
||||||
@ -4468,6 +4466,8 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
alarm_enabled_failure (node_ptr);
|
||||||
|
|
||||||
mnfa_add_host ( node_ptr , iface );
|
mnfa_add_host ( node_ptr , iface );
|
||||||
|
|
||||||
if ( mnfa_active == false )
|
if ( mnfa_active == false )
|
||||||
@ -4481,17 +4481,13 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
|||||||
{
|
{
|
||||||
node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
|
node_ptr->heartbeat_failed[MGMNT_IFACE] = true ;
|
||||||
}
|
}
|
||||||
if ( mnfa_host_count[iface] < mnfa_calculate_threshold( hostname ))
|
if (mnfa_host_count[iface] < this->mnfa_threshold)
|
||||||
{
|
{
|
||||||
|
|
||||||
elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));
|
elog ("%s %s network heartbeat failure\n", hostname.c_str(), get_iface_name_str(iface));
|
||||||
|
|
||||||
nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
|
nodeLinkClass::set_availStatus ( hostname, MTC_AVAIL_STATUS__FAILED );
|
||||||
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
|
||||||
{
|
alarm_enabled_failure (node_ptr);
|
||||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
||||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
|
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ) &&
|
||||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
|
( node_ptr->adminAction != MTC_ADMIN_ACTION__UNLOCK ))
|
||||||
@ -8296,19 +8292,12 @@ void nodeLinkClass::mem_log_dor ( struct nodeLinkClass::node * node_ptr )
|
|||||||
void nodeLinkClass::mem_log_mnfa ( void )
|
void nodeLinkClass::mem_log_mnfa ( void )
|
||||||
{
|
{
|
||||||
char str[MAX_MEM_LOG_DATA] ;
|
char str[MAX_MEM_LOG_DATA] ;
|
||||||
|
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s MNFA: State:%s Hosts:%d:%d Threshold:%d Occurances:%d\n",
|
||||||
int temp = mnfa_threshold_number ;
|
|
||||||
if ( mnfa_threshold_type == MNFA_PERCENT )
|
|
||||||
temp = mnfa_threshold_percent ;
|
|
||||||
|
|
||||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s MNFA: Mode:%s:%d State:%s Hosts:%d:%d Cases:%d Threshold:%d\n",
|
|
||||||
my_hostname.c_str(),
|
my_hostname.c_str(),
|
||||||
mnfa_threshold_type ? "Percent" : "Number",
|
|
||||||
temp,
|
|
||||||
mnfa_active ? "ACTIVE" : "inactive",
|
mnfa_active ? "ACTIVE" : "inactive",
|
||||||
mnfa_host_count[MGMNT_IFACE],
|
mnfa_host_count[MGMNT_IFACE],
|
||||||
mnfa_host_count[INFRA_IFACE],
|
mnfa_host_count[INFRA_IFACE],
|
||||||
mnfa_calculate_threshold( "" ),
|
mnfa_threshold,
|
||||||
mnfa_occurances);
|
mnfa_occurances);
|
||||||
mem_log (str);
|
mem_log (str);
|
||||||
}
|
}
|
||||||
|
@ -1149,11 +1149,6 @@ private:
|
|||||||
/** Tracks the number of times multi failure avoidance was exited */
|
/** Tracks the number of times multi failure avoidance was exited */
|
||||||
int mnfa_occurances ;
|
int mnfa_occurances ;
|
||||||
|
|
||||||
/** true when the multi node failure count exceeds the multi
|
|
||||||
* node failure avoidance threshold and until there are no more
|
|
||||||
* in service trouble hosts */
|
|
||||||
bool mnfa_active ;
|
|
||||||
|
|
||||||
/** Recover or exit from the muli-node failure avoidance state
|
/** Recover or exit from the muli-node failure avoidance state
|
||||||
* This involves restarting the heartbeat on all the nodes
|
* This involves restarting the heartbeat on all the nodes
|
||||||
* that remain hbs_minor and clearing any heartbneat degrade
|
* that remain hbs_minor and clearing any heartbneat degrade
|
||||||
@ -1428,6 +1423,11 @@ public:
|
|||||||
std::list<string> hostname_inventory ;
|
std::list<string> hostname_inventory ;
|
||||||
std::list<string>::iterator host ;
|
std::list<string>::iterator host ;
|
||||||
|
|
||||||
|
/** true when the multi node failure count exceeds the multi
|
||||||
|
* node failure avoidance threshold and until there are no more
|
||||||
|
* in service trouble hosts */
|
||||||
|
bool mnfa_active ;
|
||||||
|
|
||||||
std::list<string> mnfa_awol_list ;
|
std::list<string> mnfa_awol_list ;
|
||||||
void mnfa_timeout_handler ( void );
|
void mnfa_timeout_handler ( void );
|
||||||
|
|
||||||
@ -1722,23 +1722,17 @@ public:
|
|||||||
int inotify_shadow_file_fd ;
|
int inotify_shadow_file_fd ;
|
||||||
int inotify_shadow_file_wd ;
|
int inotify_shadow_file_wd ;
|
||||||
|
|
||||||
/** The multi node failure avoidance type */
|
/* MNFA Timeout
|
||||||
#define MNFA_NUMBER 0
|
*
|
||||||
#define MNFA_PERCENT 1
|
* Time in secs MNFA can remain active.
|
||||||
int mnfa_threshold_type ;
|
* If 0 then there is no timeout. */
|
||||||
|
int mnfa_timeout ;
|
||||||
|
|
||||||
/** % of hosts that need to simultaneously fail before 'mnfa' kicks in */
|
/* MNFA Host Involvement Threshold
|
||||||
int mnfa_threshold_percent ;
|
* Number of hosts simultaneously failing heartbeat
|
||||||
|
* upon which feature will activate */
|
||||||
/** # of hosts that need to simultaneously fail before 'mnfa' kicks in */
|
|
||||||
int mnfa_threshold_number ;
|
|
||||||
|
|
||||||
/** the calculated threshold */
|
|
||||||
int mnfa_threshold ;
|
int mnfa_threshold ;
|
||||||
|
|
||||||
/** Calculates and returns the mnfa threshold based on enabled hosts */
|
|
||||||
int mnfa_calculate_threshold ( string hostname );
|
|
||||||
|
|
||||||
/* collectd event handler */
|
/* collectd event handler */
|
||||||
int collectd_notify_handler ( string & hostname,
|
int collectd_notify_handler ( string & hostname,
|
||||||
string & resource,
|
string & resource,
|
||||||
@ -1997,7 +1991,6 @@ public:
|
|||||||
int sysinv_timeout ;
|
int sysinv_timeout ;
|
||||||
int sysinv_noncrit_timeout ;
|
int sysinv_noncrit_timeout ;
|
||||||
int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout */
|
int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout */
|
||||||
int mnfa_recovery_timeout; /**< Multi-Node-Failure Avoidance Recovery Timeout */
|
|
||||||
int work_queue_timeout ;
|
int work_queue_timeout ;
|
||||||
int node_reinstall_timeout ;
|
int node_reinstall_timeout ;
|
||||||
|
|
||||||
|
@ -46,7 +46,6 @@ void daemon_config_default ( daemon_config_type* config_ptr )
|
|||||||
config_ptr->sysinv_api_bind_ip = strdup("none");
|
config_ptr->sysinv_api_bind_ip = strdup("none");
|
||||||
config_ptr->mode = strdup("none");
|
config_ptr->mode = strdup("none");
|
||||||
config_ptr->fit_host = strdup("none");
|
config_ptr->fit_host = strdup("none");
|
||||||
config_ptr->mnfa_threshold_type = strdup("none");
|
|
||||||
config_ptr->multicast = strdup("none");
|
config_ptr->multicast = strdup("none");
|
||||||
|
|
||||||
config_ptr->debug_all = 0 ;
|
config_ptr->debug_all = 0 ;
|
||||||
@ -174,14 +173,6 @@ int timeout_config_handler ( void * user,
|
|||||||
config_ptr->loc_recovery_timeout = atoi(value);
|
config_ptr->loc_recovery_timeout = atoi(value);
|
||||||
ilog ("LOC Timeout: %3d secs\n", config_ptr->loc_recovery_timeout );
|
ilog ("LOC Timeout: %3d secs\n", config_ptr->loc_recovery_timeout );
|
||||||
}
|
}
|
||||||
else if (MATCH("timeouts", "mnfa_recovery_timeout"))
|
|
||||||
{
|
|
||||||
config_ptr->mnfa_recovery_timeout = atoi(value);
|
|
||||||
if (( config_ptr->mnfa_recovery_timeout > 300 ) ||
|
|
||||||
( config_ptr->mnfa_recovery_timeout == 0 ))
|
|
||||||
config_ptr->mnfa_recovery_timeout = 5 ;
|
|
||||||
ilog ("MNFA Timeout: %3d secs\n", config_ptr->mnfa_recovery_timeout );
|
|
||||||
}
|
|
||||||
else if (MATCH("timeouts", "node_reinstall_timeout"))
|
else if (MATCH("timeouts", "node_reinstall_timeout"))
|
||||||
{
|
{
|
||||||
config_ptr->node_reinstall_timeout = atoi(value);
|
config_ptr->node_reinstall_timeout = atoi(value);
|
||||||
@ -254,7 +245,6 @@ void daemon_dump_cfg ( void )
|
|||||||
if ( strcmp(ptr->infra_iface, "none" )) { ilog ("infra_iface = %s\n", ptr->infra_iface );}
|
if ( strcmp(ptr->infra_iface, "none" )) { ilog ("infra_iface = %s\n", ptr->infra_iface );}
|
||||||
if ( strcmp(ptr->multicast, "none" )) { ilog ("multicast = %s\n", ptr->multicast );}
|
if ( strcmp(ptr->multicast, "none" )) { ilog ("multicast = %s\n", ptr->multicast );}
|
||||||
|
|
||||||
|
|
||||||
if ( ptr->ha_port ) { ilog ("ha_port = %d\n", ptr->ha_port );}
|
if ( ptr->ha_port ) { ilog ("ha_port = %d\n", ptr->ha_port );}
|
||||||
if ( ptr->vim_cmd_port ) { ilog ("vim_cmd_port = %d\n", ptr->vim_cmd_port );}
|
if ( ptr->vim_cmd_port ) { ilog ("vim_cmd_port = %d\n", ptr->vim_cmd_port );}
|
||||||
if ( ptr->vim_event_port ) { ilog ("vim_event_port = %d\n", ptr->vim_event_port );}
|
if ( ptr->vim_event_port ) { ilog ("vim_event_port = %d\n", ptr->vim_event_port );}
|
||||||
@ -316,10 +306,6 @@ void daemon_dump_cfg ( void )
|
|||||||
if ( ptr->stall_rec_thld ) { ilog ("stall_rec_thld = %d\n", ptr->stall_rec_thld );}
|
if ( ptr->stall_rec_thld ) { ilog ("stall_rec_thld = %d\n", ptr->stall_rec_thld );}
|
||||||
|
|
||||||
/* mtcAgent */
|
/* mtcAgent */
|
||||||
if ( ptr->mnfa_threshold_type ) { ilog ("mnfa_threshold_type = %s\n", ptr->mnfa_threshold_type );}
|
|
||||||
if ( ptr->mnfa_threshold_percent ) { ilog ("mnfa_threshold_percent= %d\n", ptr->mnfa_threshold_percent );}
|
|
||||||
if ( ptr->mnfa_threshold_number ) { ilog ("mnfa_threshold_number = %d\n", ptr->mnfa_threshold_number );}
|
|
||||||
if ( ptr->mnfa_recovery_threshold ) { ilog ("mnfa_recovery_threshod= %d\n", ptr->mnfa_recovery_threshold );}
|
|
||||||
if ( ptr->controller_mtcalive_timeout) { ilog ("controller_mtcalive_to= %d\n", ptr->controller_mtcalive_timeout );}
|
if ( ptr->controller_mtcalive_timeout) { ilog ("controller_mtcalive_to= %d\n", ptr->controller_mtcalive_timeout );}
|
||||||
if ( ptr->compute_mtcalive_timeout ) { ilog ("compute_mtcalive_to = %d\n", ptr->compute_mtcalive_timeout );}
|
if ( ptr->compute_mtcalive_timeout ) { ilog ("compute_mtcalive_to = %d\n", ptr->compute_mtcalive_timeout );}
|
||||||
if ( ptr->goenabled_timeout ) { ilog ("goenabled_timeout = %d\n", ptr->goenabled_timeout );}
|
if ( ptr->goenabled_timeout ) { ilog ("goenabled_timeout = %d\n", ptr->goenabled_timeout );}
|
||||||
@ -328,7 +314,6 @@ void daemon_dump_cfg ( void )
|
|||||||
if ( ptr->sysinv_noncrit_timeout ) { ilog ("sysinv_noncrit_timeout= %d\n", ptr->sysinv_noncrit_timeout );}
|
if ( ptr->sysinv_noncrit_timeout ) { ilog ("sysinv_noncrit_timeout= %d\n", ptr->sysinv_noncrit_timeout );}
|
||||||
if ( ptr->work_queue_timeout ) { ilog ("work_queue_timeout = %d\n", ptr->work_queue_timeout );}
|
if ( ptr->work_queue_timeout ) { ilog ("work_queue_timeout = %d\n", ptr->work_queue_timeout );}
|
||||||
if ( ptr->loc_recovery_timeout ) { ilog ("loc_recovery_timeout = %d\n", ptr->loc_recovery_timeout );}
|
if ( ptr->loc_recovery_timeout ) { ilog ("loc_recovery_timeout = %d\n", ptr->loc_recovery_timeout );}
|
||||||
if ( ptr->mnfa_recovery_timeout ) { ilog ("mnfa_recovery_timeout = %d\n", ptr->mnfa_recovery_timeout );}
|
|
||||||
if ( ptr->node_reinstall_timeout ) { ilog ("node_reinstall_timeout= %d\n", ptr->node_reinstall_timeout );}
|
if ( ptr->node_reinstall_timeout ) { ilog ("node_reinstall_timeout= %d\n", ptr->node_reinstall_timeout );}
|
||||||
if ( ptr->uptime_period ) { ilog ("uptime_period = %d\n", ptr->uptime_period );}
|
if ( ptr->uptime_period ) { ilog ("uptime_period = %d\n", ptr->uptime_period );}
|
||||||
if ( ptr->online_period ) { ilog ("online_period = %d\n", ptr->online_period );}
|
if ( ptr->online_period ) { ilog ("online_period = %d\n", ptr->online_period );}
|
||||||
|
@ -270,12 +270,6 @@ int daemon_log_message ( const char * hostname,
|
|||||||
return(PASS);
|
return(PASS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int nodeLinkClass::mnfa_calculate_threshold ( string hostname )
|
|
||||||
{
|
|
||||||
UNUSED(hostname);
|
|
||||||
return(PASS) ;
|
|
||||||
}
|
|
||||||
void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr, iface_enum iface )
|
void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr, iface_enum iface )
|
||||||
{ node_ptr = node_ptr ; iface = iface ; }
|
{ node_ptr = node_ptr ; iface = iface ; }
|
||||||
void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
|
void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
|
||||||
|
@ -295,18 +295,6 @@ static int mtc_config_handler ( void * user,
|
|||||||
config_ptr->mask |= CONFIG_AGENT_API_RETRIES ;
|
config_ptr->mask |= CONFIG_AGENT_API_RETRIES ;
|
||||||
mtcInv.api_retries = config_ptr->api_retries ;
|
mtcInv.api_retries = config_ptr->api_retries ;
|
||||||
}
|
}
|
||||||
else if (MATCH("agent", "mnfa_threshold_type"))
|
|
||||||
{
|
|
||||||
config_ptr->mnfa_threshold_type = strdup(value);
|
|
||||||
}
|
|
||||||
else if (MATCH("agent", "mnfa_threshold_percent"))
|
|
||||||
{
|
|
||||||
config_ptr->mnfa_threshold_percent = atoi(value);
|
|
||||||
}
|
|
||||||
else if (MATCH("agent", "mnfa_threshold_number"))
|
|
||||||
{
|
|
||||||
config_ptr->mnfa_threshold_number = atoi(value);
|
|
||||||
}
|
|
||||||
else if (MATCH("timeouts", "failsafe_shutdown_delay"))
|
else if (MATCH("timeouts", "failsafe_shutdown_delay"))
|
||||||
{
|
{
|
||||||
config_ptr->failsafe_shutdown_delay = atoi(value);
|
config_ptr->failsafe_shutdown_delay = atoi(value);
|
||||||
@ -335,6 +323,55 @@ static int mtc_config_handler ( void * user,
|
|||||||
return (FAIL);
|
return (FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int mtc_ini_handler ( void * user,
|
||||||
|
const char * section,
|
||||||
|
const char * name,
|
||||||
|
const char * value)
|
||||||
|
{
|
||||||
|
UNUSED(user);
|
||||||
|
|
||||||
|
if (MATCH("agent", "mnfa_threshold"))
|
||||||
|
{
|
||||||
|
mtcInv.mnfa_threshold = atoi(value);
|
||||||
|
ilog ("MNFA Threshd: %d\n", mtcInv.mnfa_threshold);
|
||||||
|
}
|
||||||
|
else if (MATCH("timeouts", "mnfa_timeout"))
|
||||||
|
{
|
||||||
|
int old = mtcInv.mnfa_timeout ;
|
||||||
|
mtcInv.mnfa_timeout = atoi(value);
|
||||||
|
if ( mtcInv.mnfa_timeout == 0 )
|
||||||
|
{
|
||||||
|
ilog ("MNFA Timeout: Never\n");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ilog ("MNFA Timeout: %3d secs\n", mtcInv.mnfa_timeout );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* handle a change in mnfa timeout while MNFA is active */
|
||||||
|
if (( mtcInv.mnfa_active == true ) &&
|
||||||
|
( mtcInv.mnfa_timeout != old ))
|
||||||
|
{
|
||||||
|
mtcTimer_reset ( mtcInv.mtcTimer_mnfa );
|
||||||
|
if (( old == 0 ) || mtcInv.mnfa_timeout != 0 )
|
||||||
|
{
|
||||||
|
wlog ("MNFA Auto-Recovery in %d seconds\n",
|
||||||
|
mtcInv.mnfa_timeout);
|
||||||
|
|
||||||
|
mtcTimer_start ( mtcInv.mtcTimer_mnfa,
|
||||||
|
mtcTimer_handler,
|
||||||
|
mtcInv.mnfa_timeout);
|
||||||
|
}
|
||||||
|
else if ( mtcInv.mnfa_timeout == 0 )
|
||||||
|
{
|
||||||
|
ilog ("MNFA timer set to no-timeout ; previous %d sec timer cancelled", old );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return (PASS);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Read and process mtc.ini file settings into the daemon configuration */
|
/* Read and process mtc.ini file settings into the daemon configuration */
|
||||||
int daemon_configure ( void )
|
int daemon_configure ( void )
|
||||||
{
|
{
|
||||||
@ -350,6 +387,12 @@ int daemon_configure ( void )
|
|||||||
return (FAIL_LOAD_INI);
|
return (FAIL_LOAD_INI);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ini_parse(MTCE_INI_FILE, mtc_ini_handler, &mtc_config) < 0)
|
||||||
|
{
|
||||||
|
elog ("Can't load '%s'\n", MTCE_INI_FILE );
|
||||||
|
return (FAIL_LOAD_INI);
|
||||||
|
}
|
||||||
|
|
||||||
if (ini_parse(MTCE_INI_FILE, keystone_config_handler, &mtc_config) < 0)
|
if (ini_parse(MTCE_INI_FILE, keystone_config_handler, &mtc_config) < 0)
|
||||||
{
|
{
|
||||||
elog ("Can't load '%s'\n", MTCE_INI_FILE );
|
elog ("Can't load '%s'\n", MTCE_INI_FILE );
|
||||||
@ -406,14 +449,12 @@ int daemon_configure ( void )
|
|||||||
mtcInv.goenabled_timeout = DEFAULT_GOENABLE_TIMEOUT ;
|
mtcInv.goenabled_timeout = DEFAULT_GOENABLE_TIMEOUT ;
|
||||||
|
|
||||||
mtcInv.loc_recovery_timeout = mtc_config.loc_recovery_timeout ;
|
mtcInv.loc_recovery_timeout = mtc_config.loc_recovery_timeout ;
|
||||||
mtcInv.mnfa_recovery_timeout = mtc_config.mnfa_recovery_timeout ;
|
|
||||||
|
|
||||||
if ( mtc_config.node_reinstall_timeout )
|
if ( mtc_config.node_reinstall_timeout )
|
||||||
mtcInv.node_reinstall_timeout = mtc_config.node_reinstall_timeout ;
|
mtcInv.node_reinstall_timeout = mtc_config.node_reinstall_timeout ;
|
||||||
else
|
else
|
||||||
mtcInv.node_reinstall_timeout = MTC_REINSTALL_TIMEOUT_DEFAULT ;
|
mtcInv.node_reinstall_timeout = MTC_REINSTALL_TIMEOUT_DEFAULT ;
|
||||||
|
|
||||||
|
|
||||||
if ( mtc_config.dor_mode_timeout <= 0 )
|
if ( mtc_config.dor_mode_timeout <= 0 )
|
||||||
{
|
{
|
||||||
slog ("DOR Mode Timeout is invalid (%d), setting to default (%d)\n",
|
slog ("DOR Mode Timeout is invalid (%d), setting to default (%d)\n",
|
||||||
@ -423,25 +464,6 @@ int daemon_configure ( void )
|
|||||||
mtc_config.dor_mode_timeout = DEFAULT_DOR_MODE_TIMEOUT ;
|
mtc_config.dor_mode_timeout = DEFAULT_DOR_MODE_TIMEOUT ;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* validate and auto correct manage multi node failure avoidance thresholds */
|
|
||||||
if (( mtc_config.mnfa_threshold_type != NULL ) &&
|
|
||||||
( !strncmp (mtc_config.mnfa_threshold_type, "percent", strlen("percent"))))
|
|
||||||
{
|
|
||||||
if ( mtc_config.mnfa_threshold_percent > 100 )
|
|
||||||
{
|
|
||||||
mtc_config.mnfa_threshold_percent = 100 ;
|
|
||||||
}
|
|
||||||
mtcInv.mnfa_threshold_type = MNFA_PERCENT ;
|
|
||||||
ilog ("mnfAvoidance: %d%c\n", mtc_config.mnfa_threshold_percent, '%' );
|
|
||||||
mtcInv.mnfa_threshold_percent = mtc_config.mnfa_threshold_percent ;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
mtcInv.mnfa_threshold_type = MNFA_NUMBER ;
|
|
||||||
ilog ("mnfAvoidance: %d hosts\n", mtc_config.mnfa_threshold_number );
|
|
||||||
mtcInv.mnfa_threshold_number = mtc_config.mnfa_threshold_number ;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( mtc_config.swact_timeout )
|
if ( mtc_config.swact_timeout )
|
||||||
{
|
{
|
||||||
if ( mtc_config.swact_timeout < (MTC_SWACT_POLL_TIMER*2))
|
if ( mtc_config.swact_timeout < (MTC_SWACT_POLL_TIMER*2))
|
||||||
|
@ -1690,11 +1690,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
/* Go to the goEnabled stage */
|
/* Go to the goEnabled stage */
|
||||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
||||||
|
|
||||||
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
alarm_enabled_failure(node_ptr);
|
||||||
{
|
|
||||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
||||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
|
||||||
}
|
|
||||||
break ;
|
break ;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1732,11 +1728,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
/* Go to the goEnabled stage */
|
/* Go to the goEnabled stage */
|
||||||
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
recoveryStageChange ( node_ptr, MTC_RECOVERY__GOENABLED_TIMER );
|
||||||
|
|
||||||
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
alarm_enabled_failure (node_ptr);
|
||||||
{
|
|
||||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
||||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* A timer ring indicates that the host is not up */
|
/* A timer ring indicates that the host is not up */
|
||||||
@ -1780,11 +1772,8 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
|||||||
/* Inform the VIM that this host has failed */
|
/* Inform the VIM that this host has failed */
|
||||||
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
||||||
|
|
||||||
if ( node_ptr->alarms[MTC_ALARM_ID__ENABLE] != FM_ALARM_SEVERITY_CRITICAL )
|
alarm_enabled_failure(node_ptr);
|
||||||
{
|
|
||||||
mtcAlarm_critical ( node_ptr->hostname, MTC_ALARM_ID__ENABLE );
|
|
||||||
node_ptr->alarms[MTC_ALARM_ID__ENABLE] = FM_ALARM_SEVERITY_CRITICAL ;
|
|
||||||
}
|
|
||||||
/* Clear all degrade flags except for the HWMON one */
|
/* Clear all degrade flags except for the HWMON one */
|
||||||
clear_host_degrade_causes ( node_ptr->degrade_mask );
|
clear_host_degrade_causes ( node_ptr->degrade_mask );
|
||||||
node_ptr->degraded_resources_list.clear();
|
node_ptr->degraded_resources_list.clear();
|
||||||
|
@ -41,48 +41,6 @@ void log_mnfa_pool ( std::list<string> & mnfa_awol_list )
|
|||||||
ilog ("MNFA POOL:%s\n", pool_list.c_str());
|
ilog ("MNFA POOL:%s\n", pool_list.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
/*******************************************************************************
|
|
||||||
*
|
|
||||||
* Name : mnfa_calculate_threshold
|
|
||||||
*
|
|
||||||
* Description: Calculates and returns the mnfa threshold based
|
|
||||||
* on enabled hosts.
|
|
||||||
*
|
|
||||||
* Auto corrects the value to a min number.
|
|
||||||
*
|
|
||||||
* Calculate the multi-node failure avoidance handling threshold
|
|
||||||
* This is the number of hosts than need to fail simultaneously
|
|
||||||
* in order to trigger mode ; i.e. mnfa_active=true
|
|
||||||
*
|
|
||||||
*******************************************************************************/
|
|
||||||
int nodeLinkClass::mnfa_calculate_threshold ( string hostname )
|
|
||||||
{
|
|
||||||
int mnfa_enabled_nodes = enabled_nodes ();
|
|
||||||
|
|
||||||
/* Calculate the threshold */
|
|
||||||
if ( mnfa_threshold_type == MNFA_PERCENT )
|
|
||||||
mnfa_threshold = mnfa_enabled_nodes / mnfa_threshold_percent ;
|
|
||||||
else
|
|
||||||
mnfa_threshold = mnfa_threshold_number ;
|
|
||||||
|
|
||||||
/* Don't allow the multi-node failure avoidance
|
|
||||||
* to ever be 1 or we would never fail a host */
|
|
||||||
if ( mnfa_threshold < mnfa_threshold_number )
|
|
||||||
{
|
|
||||||
ilog ("%s MNFA threshold rounded to %d from %d\n",
|
|
||||||
hostname.c_str(),
|
|
||||||
mnfa_threshold_number,
|
|
||||||
mnfa_enabled_nodes / mnfa_threshold_percent );
|
|
||||||
mnfa_threshold = mnfa_threshold_number ;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( mnfa_awol_list.size() )
|
|
||||||
{
|
|
||||||
log_mnfa_pool ( mnfa_awol_list );
|
|
||||||
}
|
|
||||||
return (mnfa_threshold);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*****************************************************************************
|
/*****************************************************************************
|
||||||
*
|
*
|
||||||
* Name : mnfa_add_host
|
* Name : mnfa_add_host
|
||||||
@ -105,6 +63,8 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
|
|||||||
/* if we are active then add the node to the awol list */
|
/* if we are active then add the node to the awol list */
|
||||||
if ( mnfa_active == true )
|
if ( mnfa_active == true )
|
||||||
{
|
{
|
||||||
|
alarm_enabled_failure (node_ptr);
|
||||||
|
|
||||||
/* once we are mnfa_active we need to give all the
|
/* once we are mnfa_active we need to give all the
|
||||||
* hbs_minor=true hosts a graceful recovery token
|
* hbs_minor=true hosts a graceful recovery token
|
||||||
* mnfa_graceful_recovery = true and add to the awol list */
|
* mnfa_graceful_recovery = true and add to the awol list */
|
||||||
@ -116,7 +76,7 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
|
|||||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
|
mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
|
||||||
}
|
}
|
||||||
else if (( mnfa_active == false ) &&
|
else if (( mnfa_active == false ) &&
|
||||||
( mnfa_host_count[iface] >= mnfa_calculate_threshold( node_ptr->hostname )))
|
( mnfa_host_count[iface] >= this->mnfa_threshold))
|
||||||
{
|
{
|
||||||
enter = true ;
|
enter = true ;
|
||||||
}
|
}
|
||||||
@ -134,6 +94,11 @@ void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , ifac
|
|||||||
get_iface_name_str(INFRA_IFACE),
|
get_iface_name_str(INFRA_IFACE),
|
||||||
node_ptr->hbs_minor_count[INFRA_IFACE]);
|
node_ptr->hbs_minor_count[INFRA_IFACE]);
|
||||||
|
|
||||||
|
if ( mnfa_awol_list.size() )
|
||||||
|
{
|
||||||
|
log_mnfa_pool ( mnfa_awol_list );
|
||||||
|
}
|
||||||
|
|
||||||
if ( enter == true )
|
if ( enter == true )
|
||||||
{
|
{
|
||||||
mnfa_enter ();
|
mnfa_enter ();
|
||||||
@ -196,7 +161,7 @@ void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
|
|||||||
* mnfa_graceful_recovery token
|
* mnfa_graceful_recovery token
|
||||||
*
|
*
|
||||||
* 5. Start the MNFA Auto-Recovery timer with time based on the config
|
* 5. Start the MNFA Auto-Recovery timer with time based on the config
|
||||||
* setting mnfa_recovery_timeout
|
* setting mnfa_timeout
|
||||||
*
|
*
|
||||||
****************************************************************************/
|
****************************************************************************/
|
||||||
void nodeLinkClass::mnfa_enter ( void )
|
void nodeLinkClass::mnfa_enter ( void )
|
||||||
@ -211,8 +176,7 @@ void nodeLinkClass::mnfa_enter ( void )
|
|||||||
* previous mnfa but the failure case occurs again. If that
|
* previous mnfa but the failure case occurs again. If that
|
||||||
* happens we need to cancel the timer that will issue
|
* happens we need to cancel the timer that will issue
|
||||||
* the period recovery command. */
|
* the period recovery command. */
|
||||||
if ( mtcTimer_mnfa.tid )
|
mtcTimer_reset ( mtcTimer_mnfa );
|
||||||
mtcTimer_stop ( mtcTimer_mnfa );
|
|
||||||
|
|
||||||
/* Loop through inventory and recover each host that
|
/* Loop through inventory and recover each host that
|
||||||
* remains in the hbs_minor state.
|
* remains in the hbs_minor state.
|
||||||
@ -232,6 +196,7 @@ void nodeLinkClass::mnfa_enter ( void )
|
|||||||
if ( ptr->task != MTC_TASK_RECOVERY_WAIT )
|
if ( ptr->task != MTC_TASK_RECOVERY_WAIT )
|
||||||
mtcInvApi_update_task ( ptr, MTC_TASK_RECOVERY_WAIT );
|
mtcInvApi_update_task ( ptr, MTC_TASK_RECOVERY_WAIT );
|
||||||
|
|
||||||
|
alarm_enabled_failure (ptr);
|
||||||
}
|
}
|
||||||
if (( ptr->next == NULL ) || ( ptr == tail ))
|
if (( ptr->next == NULL ) || ( ptr == tail ))
|
||||||
break ;
|
break ;
|
||||||
@ -239,14 +204,20 @@ void nodeLinkClass::mnfa_enter ( void )
|
|||||||
|
|
||||||
mnfa_awol_list.unique();
|
mnfa_awol_list.unique();
|
||||||
|
|
||||||
/* Start the timer that will eventually send the MTC_RECOVER_HBS command */
|
if ( this->mnfa_timeout )
|
||||||
wlog ("MNFA Auto-Recovery in %d seconds\n", mnfa_recovery_timeout);
|
{
|
||||||
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, mnfa_recovery_timeout);
|
wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout);
|
||||||
|
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout);
|
||||||
|
}
|
||||||
|
if ( mnfa_awol_list.size() )
|
||||||
|
{
|
||||||
|
log_mnfa_pool ( mnfa_awol_list );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
*
|
*
|
||||||
* Name : mnfa_enter
|
* Name : mnfa_exit
|
||||||
*
|
*
|
||||||
* Description: Perform the operations required to exit mnfa mode
|
* Description: Perform the operations required to exit mnfa mode
|
||||||
* These include ...
|
* These include ...
|
||||||
@ -266,7 +237,7 @@ void nodeLinkClass::mnfa_enter ( void )
|
|||||||
* hosts that remain in the hbs_minor state.
|
* hosts that remain in the hbs_minor state.
|
||||||
*
|
*
|
||||||
* if ( force == true )
|
* if ( force == true )
|
||||||
* The mnfa_recovery_timeout has expired
|
* The mnfa_timeout has expired
|
||||||
* All hosts in the awol list are forced failed and into the
|
* All hosts in the awol list are forced failed and into the
|
||||||
* enable_handler FSM.
|
* enable_handler FSM.
|
||||||
* else
|
* else
|
||||||
@ -279,18 +250,20 @@ void nodeLinkClass::mnfa_exit ( bool force )
|
|||||||
{
|
{
|
||||||
if ( mnfa_active == true )
|
if ( mnfa_active == true )
|
||||||
{
|
{
|
||||||
wlog ("MNFA EXIT <-- Exiting Multi-Node Failure Avoidance %s\n",
|
|
||||||
force ? "(Auto-Recover)" : "");
|
|
||||||
|
|
||||||
mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT );
|
|
||||||
mnfa_occurances++ ;
|
mnfa_occurances++ ;
|
||||||
mnfa_active = false ;
|
mnfa_active = false ;
|
||||||
|
|
||||||
if ( force == true )
|
if ( force == true )
|
||||||
{
|
{
|
||||||
elog ("... MNFA %d sec timeout - forcing full enable on ... \n",
|
elog ("... MNFA %d sec timeout - forcing full enable on ... \n",
|
||||||
mnfa_recovery_timeout);
|
this->mnfa_timeout);
|
||||||
|
}
|
||||||
|
|
||||||
|
wlog ("MNFA EXIT <-- Exiting Multi-Node Failure Avoidance %s\n",
|
||||||
|
force ? "(Auto-Recover)" : "");
|
||||||
|
mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT );
|
||||||
|
|
||||||
|
if ( mnfa_awol_list.size() )
|
||||||
|
{
|
||||||
log_mnfa_pool ( mnfa_awol_list );
|
log_mnfa_pool ( mnfa_awol_list );
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -342,8 +315,7 @@ void nodeLinkClass::mnfa_exit ( bool force )
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Stop the ... failure -> full enable ... window timer if it is active */
|
/* Stop the ... failure -> full enable ... window timer if it is active */
|
||||||
if ( mtcTimer_mnfa.tid )
|
mtcTimer_reset ( mtcTimer_mnfa );
|
||||||
mtcTimer_stop ( mtcTimer_mnfa );
|
|
||||||
|
|
||||||
/* Start the timer that will eventually send the MTC_RECOVER_HBS command */
|
/* Start the timer that will eventually send the MTC_RECOVER_HBS command */
|
||||||
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER );
|
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER );
|
||||||
|
@ -18,14 +18,6 @@ offline_threshold = 46 ; number of back to back mtcAlive requests missed
|
|||||||
; 100:46 will yield a typical 5 sec holdoff from
|
; 100:46 will yield a typical 5 sec holdoff from
|
||||||
; failed to offline
|
; failed to offline
|
||||||
|
|
||||||
mnfa_threshold_type = number ; Two different types are supported
|
|
||||||
; 'number' or 'percent' of simultaneous
|
|
||||||
; failures that enable multi-node
|
|
||||||
; failure avoidance handling
|
|
||||||
;
|
|
||||||
mnfa_threshold_percent = 10 ; if ( mnfa_threshold_type == percent )
|
|
||||||
mnfa_threshold_number = 3 ; if ( mnfa_threshold_type == number )
|
|
||||||
|
|
||||||
inventory_port = 6385 ; The Inventory Port Number
|
inventory_port = 6385 ; The Inventory Port Number
|
||||||
keystone_port = 5000 ; The Keystone Port Number
|
keystone_port = 5000 ; The Keystone Port Number
|
||||||
ha_port = 7777 ; The Inventory Port Number
|
ha_port = 7777 ; The Inventory Port Number
|
||||||
@ -86,13 +78,6 @@ loc_recovery_timeout = 5 ; Loss Of Communication Recovery Timeout
|
|||||||
; the max number of seconds that a host can be in
|
; the max number of seconds that a host can be in
|
||||||
; loss of communication state without failing the unit
|
; loss of communication state without failing the unit
|
||||||
|
|
||||||
mnfa_recovery_timeout = 100 ; Multi-Node-Failure Avoidance Recovery Threshold
|
|
||||||
; Similar to the LOC above for graceful recovery
|
|
||||||
; hosts that have LOC for longer than this time in
|
|
||||||
; seconds are failed and sent into the enable_handler
|
|
||||||
; FSM while those that recover before this period are
|
|
||||||
; sent into the graceful recovery_handler FSM.
|
|
||||||
|
|
||||||
dor_mode_timeout = 20 ; The default base time in seconds for how long
|
dor_mode_timeout = 20 ; The default base time in seconds for how long
|
||||||
; maintenance DOR mode is active. This number
|
; maintenance DOR mode is active. This number
|
||||||
; is extended by the number of enabled hosts.
|
; is extended by the number of enabled hosts.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user