Merge "Mtce: Add Thresholded Maintenance Enable Recovery support"
This commit is contained in:
commit
8eb55b2b03
@ -1,3 +1,3 @@
|
||||
SRC_DIR="src"
|
||||
TIS_PATCH_VER=140
|
||||
TIS_PATCH_VER=141
|
||||
BUILD_IS_SLOW=5
|
||||
|
@ -166,6 +166,18 @@ typedef struct
|
||||
int kernwd_update_period ; /**< expect kernel watchdog to be updated */
|
||||
int autorecovery_threshold ; /**< AIO stop autorecovery threshold */
|
||||
|
||||
/* Auto Recovery Thresholds */
|
||||
int ar_config_threshold ; /**< Configuration Failure Threshold */
|
||||
int ar_goenable_threshold ; /**< GoEnable Failure Threshold */
|
||||
int ar_hostservices_threshold ; /**< Host Services Failure Threshold */
|
||||
int ar_heartbeat_threshold ; /**< Heartbeat Soak Failure Threshold*/
|
||||
|
||||
/* Auto Recovery Retry Intervals */
|
||||
int ar_config_interval ; /**< Configuration Failure Interval */
|
||||
int ar_goenable_interval ; /**< GoEnable Failure Interval */
|
||||
int ar_hostservices_interval ; /**< Host Services Failure Interval */
|
||||
int ar_heartbeat_interval ; /**< Heartbeat Soak Failure Interval */
|
||||
|
||||
int debug_all ;
|
||||
int debug_json ; /**< Enable jlog (json string ) output if not false */
|
||||
int debug_timer ; /**< Enable tlog (timer logs ) output if not false */
|
||||
|
@ -229,28 +229,35 @@ void daemon_exit ( void );
|
||||
#define MTC_TASK_INIT_FAIL "Initialization Failed, recovering"
|
||||
#define MTC_TASK_START_SERVICE_FAIL "Start Services Failed"
|
||||
#define MTC_TASK_START_SERVICE_TO "Start Services Timeout"
|
||||
#define MTC_TASK_ENABLING "Enabling"
|
||||
#define MTC_TASK_ENABLING_SUBF "Enabling Compute Service"
|
||||
#define MTC_TASK_ENABLING_SUBF_FAIL "Enabling Compute Service Failed"
|
||||
#define MTC_TASK_ENABLING_SUBF_TO "Enabling Compute Service Timeout"
|
||||
#define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed, re-enabling"
|
||||
#define MTC_TASK_ENABLE_WORK_FAIL_ "Enable Action Failed"
|
||||
#define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout, re-enabling"
|
||||
#define MTC_TASK_ENABLE_WORK_TO_ "Enable Action Timeout"
|
||||
#define MTC_TASK_ENABLE_WORK_FAIL "Enable Action Failed"
|
||||
#define MTC_TASK_ENABLE_WORK_TO "Enable Action Timeout"
|
||||
#define MTC_TASK_ENABLE_FAIL_HB "Enable Heartbeat Failure, re-enabling"
|
||||
#define MTC_TASK_RECOVERY_FAIL "Graceful Recovery Failed, re-enabling"
|
||||
#define MTC_TASK_RECOVERY_WAIT "Graceful Recovery Wait"
|
||||
#define MTC_TASK_RECOVERED "Gracefully Recovered"
|
||||
|
||||
#define MTC_TASK_ENABLING "Enabling"
|
||||
#define MTC_TASK_MAIN_CONFIG_FAIL "Configuration Failed, re-enabling"
|
||||
#define MTC_TASK_SUBF_CONFIG_FAIL "Compute Configuration Failed, re-enabling"
|
||||
#define MTC_TASK_SUBF_CONFIG_FAIL_ "Compute Configuration Failed"
|
||||
#define MTC_TASK_MAIN_CONFIG_TO "Configuration Timeout, re-enabling"
|
||||
#define MTC_TASK_MAIN_INTEST_FAIL "In-Test Failed, re-enabling"
|
||||
#define MTC_TASK_MAIN_INTEST_TO "In-Test Timeout, re-enabling"
|
||||
#define MTC_TASK_MAIN_SERVICE_FAIL "Start Services Failed, re-enabling"
|
||||
#define MTC_TASK_MAIN_SERVICE_TO "Start Services Timeout, re-enabling"
|
||||
|
||||
#define MTC_TASK_ENABLING_SUBF "Enabling Compute Service"
|
||||
#define MTC_TASK_SUBF_CONFIG_FAIL "Compute Configuration Failed, re-enabling"
|
||||
#define MTC_TASK_SUBF_CONFIG_TO "Compute Configuration Timeout, re-enabling"
|
||||
#define MTC_TASK_SUBF_CONFIG_TO_ "Compute Configuration Timeout"
|
||||
#define MTC_TASK_INTEST_FAIL "In-Test Failed, re-enabling"
|
||||
#define MTC_TASK_INTEST_FAIL_ "In-Test Failed"
|
||||
#define MTC_TASK_INTEST_FAIL_TO "In-Test Timeout, re-enabling"
|
||||
#define MTC_TASK_INTEST_FAIL_TO_ "In-Test Timeout"
|
||||
#define MTC_TASK_SUBF_INTEST_FAIL "Compute In-Test Failed, re-enabling"
|
||||
#define MTC_TASK_SUBF_INTEST_TO "Compute In-Test Timeout, re-enabling"
|
||||
#define MTC_TASK_SUBF_SERVICE_FAIL "Compute Start Services Failed, re-enabling"
|
||||
#define MTC_TASK_SUBF_SERVICE_TO "Compute Start Services Timeout, re-enabling"
|
||||
|
||||
#define MTC_TASK_AR_DISABLED_CONFIG "Configuration failure, threshold reached, Lock/Unlock to retry"
|
||||
#define MTC_TASK_AR_DISABLED_GOENABLE "In-Test Failure, threshold reached, Lock/Unlock to retry"
|
||||
#define MTC_TASK_AR_DISABLED_SERVICES "Service Failure, threshold reached, Lock/Unlock to retry"
|
||||
#define MTC_TASK_AR_DISABLED_ENABLE "Enable Failure, threshold reached, Lock/Unlock to retry"
|
||||
#define MTC_TASK_AR_DISABLED_HEARTBEAT "Heartbeat Failure, threshold reached, Lock/Unlock to retry"
|
||||
|
||||
#define MTC_TASK_RESET_FAIL "Reset Failed"
|
||||
#define MTC_TASK_RESET_QUEUE "Reset Failed, retrying (%d of %d)"
|
||||
#define MTC_TASK_POWERON_FAIL "Power-On Failed"
|
||||
@ -275,8 +282,6 @@ void daemon_exit ( void );
|
||||
#define MTC_TASK_RESETTING_HOST "Resetting Host, critical sensor"
|
||||
#define MTC_TASK_CPE_SX_UNLOCK_MSG "Unlocking, please stand-by while the system gracefully reboots"
|
||||
#define MTC_TASK_SELF_UNLOCK_MSG "Unlocking active controller, please stand-by while it reboots"
|
||||
#define MTC_TASK_AUTO_RECOVERY "Critical failure. Auto-recovery enabled, re-enabling"
|
||||
#define MTC_TASK_AUTO_RECOVERY_DISABLED "Critical failure. Auto-recovery disabled, threshold reached"
|
||||
#define MTC_TASK_FAILED_SWACT_REQ "Critical failure.Requesting SWACT to enabled standby controller"
|
||||
#define MTC_TASK_FAILED_NO_BACKUP "Critical failure.Please provision/enable standby controller"
|
||||
|
||||
@ -1176,13 +1181,6 @@ typedef enum
|
||||
MTC_STRESS_TEST__STAGES = 6,
|
||||
} mtc_stressStages_enum ;
|
||||
|
||||
typedef union
|
||||
{
|
||||
mtc_enableStages_enum enable ;
|
||||
mtc_disableStages_enum disable ;
|
||||
int raw ;
|
||||
} mtc_stages_union ;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
mtc_nodeAdminAction_enum adminAction ;
|
||||
@ -1210,6 +1208,24 @@ typedef enum
|
||||
MAX_IFACES = 2
|
||||
} iface_enum ;
|
||||
|
||||
/* Auto recovery disable causes: identifies which kind of enable failure is being counted ; values below __LAST index the per-cause ar_count[], ar_threshold[] and ar_interval[] arrays */
|
||||
typedef enum
|
||||
{
|
||||
MTC_AR_DISABLE_CAUSE__CONFIG, /* configuration failure ; threshold loaded from 'ar_config_threshold' */
|
||||
MTC_AR_DISABLE_CAUSE__GOENABLE, /* goenable (in-test) failure ; threshold loaded from 'ar_goenable_threshold' */
|
||||
MTC_AR_DISABLE_CAUSE__HOST_SERVICES, /* host services failure ; threshold loaded from 'ar_hostservices_threshold' */
|
||||
MTC_AR_DISABLE_CAUSE__HEARTBEAT, /* heartbeat failure during the enable soak ; threshold loaded from 'ar_heartbeat_threshold' */
|
||||
MTC_AR_DISABLE_CAUSE__LAST, /* count of the causes above ; used as the array dimension and as the upper bound ar_manage validates 'cause' against */
|
||||
MTC_AR_DISABLE_CAUSE__NONE, /* no failure ; the value a host's ar_cause is initialized and reset to. Sits after __LAST so it must never be used as an array index */
|
||||
} autorecovery_disable_cause_enum ;
|
||||
|
||||
/* Service Based Auto Recovery Control Structure: pairs a failure counter with a disabled flag. NOTE(review): no use of this type is visible in this chunk ; the per-host state shown here is held in separate ar_count[] and ar_disabled members - confirm this type is still needed */
|
||||
typedef struct
|
||||
{
|
||||
unsigned int count ; /* running count of back-to-back failures */
|
||||
bool disabled ; /* true if autorecovery is disabled */
|
||||
} autorecovery_cause_ctrl_type ;
|
||||
|
||||
/** Returns true if the specified admin state string is valid */
|
||||
bool adminStateOk ( string admin );
|
||||
|
||||
|
@ -236,9 +236,6 @@ nodeLinkClass::nodeLinkClass()
|
||||
this->loc_recovery_timeout = 0;
|
||||
this->node_reinstall_timeout = 0;
|
||||
this->token_refresh_rate = 0;
|
||||
this->autorecovery_enabled = false ;
|
||||
this->autorecovery_disabled = false ;
|
||||
|
||||
|
||||
head = tail = NULL;
|
||||
memory_allocs = 0 ;
|
||||
@ -313,6 +310,11 @@ nodeLinkClass::nodeLinkClass()
|
||||
sysinv_noncrit_timeout = HTTP_SYSINV_NONC_TIMEOUT ;
|
||||
work_queue_timeout = MTC_WORKQUEUE_TIMEOUT ;
|
||||
|
||||
/* Init the auto recovery threshold and intervals to zero until
|
||||
* modified by daemon config */
|
||||
memset (&ar_threshold, 0, sizeof(ar_threshold));
|
||||
memset (&ar_interval, 0, sizeof(ar_interval));
|
||||
|
||||
/* Inservice test periods in seconds - 0 = disabled */
|
||||
insv_test_period = 0 ;
|
||||
oos_test_period = 0 ;
|
||||
@ -340,7 +342,6 @@ nodeLinkClass::nodeLinkClass()
|
||||
tokenEvent.buf = NULL ;
|
||||
|
||||
unknown_host_throttle = 0 ;
|
||||
invalid_arg_throttle = 0 ;
|
||||
|
||||
testmode = 0 ;
|
||||
module_init( );
|
||||
@ -564,6 +565,11 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
||||
ptr->was_dor_recovery_mode= false ;
|
||||
ptr->dor_recovery_time = 0 ;
|
||||
|
||||
ptr->ar_disabled = false ;
|
||||
ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
|
||||
memset (&ptr->ar_count, 0, sizeof(ptr->ar_count));
|
||||
ptr->ar_log_throttle = 0 ;
|
||||
|
||||
mtcTimer_init ( ptr->mtcTimer, hostname, "mtc timer"); /* Init node's general mtc timer */
|
||||
mtcTimer_init ( ptr->insvTestTimer, hostname, "insv test timer");
|
||||
mtcTimer_init ( ptr->oosTestTimer, hostname, "oos test timer"); /* Init node's oos test timer */
|
||||
@ -603,7 +609,8 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
||||
ptr->subStage = MTC_SUBSTAGE__DONE ;
|
||||
ptr->reinstallStage = MTC_REINSTALL__DONE ;
|
||||
ptr->resetStage = MTC_RESET__START ;
|
||||
ptr->handlerStage.enable = MTC_ENABLE__START ; /* Enable and Disable */
|
||||
ptr->enableStage = MTC_ENABLE__START ;
|
||||
ptr->disableStage = MTC_DISABLE__START ;
|
||||
|
||||
ptr->oos_test_count = 0 ;
|
||||
ptr->insv_test_count = 0 ;
|
||||
@ -613,9 +620,8 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
||||
ptr->uptime_refresh_counter = 0 ;
|
||||
ptr->node_unlocked_counter = 0 ;
|
||||
|
||||
/* Default to a healthy config until mtcAlive messages prove otherwise */
|
||||
ptr->mtce_flags = ( MTC_FLAG__I_AM_CONFIGURED |
|
||||
MTC_FLAG__I_AM_HEALTHY ) ;
|
||||
/* Good health needs to be learned */
|
||||
ptr->mtce_flags = 0 ;
|
||||
|
||||
ptr->graceful_recovery_counter = 0 ;
|
||||
ptr->health_threshold_counter = 0 ;
|
||||
@ -746,8 +752,6 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
||||
ptr->adminAction = MTC_ADMIN_ACTION__NONE ;
|
||||
ptr->adminAction_todo_list.clear();
|
||||
|
||||
ptr->handlerStage.enable = MTC_ENABLE__START;
|
||||
|
||||
hosts++ ;
|
||||
|
||||
/* (re)build the Resource Reference Array */
|
||||
@ -4504,16 +4508,31 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
||||
}
|
||||
else
|
||||
{
|
||||
//bool want_degrade = true ;
|
||||
//if ( this->hbs_failure_action == HBS_FAILURE_ACTION__ALARM )
|
||||
// want_degrade = false ;
|
||||
// alarm_enabled_failure (node_ptr, want_degrade);
|
||||
/* handle auto recovery for heartbeat failure during enable */
|
||||
if ( node_ptr->ar_cause == MTC_AR_DISABLE_CAUSE__HEARTBEAT )
|
||||
return ;
|
||||
else if ( node_ptr->enableStage == MTC_ENABLE__HEARTBEAT_SOAK )
|
||||
{
|
||||
elog ("%s %s *** Heartbeat Loss *** (during enable soak)\n",
|
||||
hostname.c_str(),
|
||||
get_iface_name_str(iface));
|
||||
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__HEARTBEAT,
|
||||
MTC_TASK_AR_DISABLED_HEARTBEAT ) == PASS )
|
||||
{
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
mnfa_add_host ( node_ptr , iface );
|
||||
|
||||
if ( mnfa_active == false )
|
||||
{
|
||||
elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
|
||||
elog ("%s %s *** Heartbeat Loss ***\n", hostname.c_str(), get_iface_name_str(iface));
|
||||
if ( iface == INFRA_IFACE )
|
||||
{
|
||||
node_ptr->heartbeat_failed[INFRA_IFACE] = true ;
|
||||
@ -4546,6 +4565,7 @@ void nodeLinkClass::manage_heartbeat_failure ( string hostname, iface_enum iface
|
||||
{
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_ENABLE_FAIL_HB );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -5485,7 +5505,7 @@ int nodeLinkClass::critical_process_failed( string & hostname,
|
||||
/* Special critical process failure handling for AIO system */
|
||||
if ( THIS_HOST && ( is_inactive_controller_main_insv() == false ))
|
||||
{
|
||||
if ( this->autorecovery_disabled == true )
|
||||
if ( node_ptr->ar_disabled == true )
|
||||
{
|
||||
dlog ("%s bypassing persistent critical process failure\n",
|
||||
node_ptr->hostname.c_str());
|
||||
@ -5510,7 +5530,7 @@ int nodeLinkClass::critical_process_failed( string & hostname,
|
||||
dlog ("%s adminState:%s EnableStage:%s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
adminAction_enum_to_str(node_ptr->adminAction).c_str(),
|
||||
get_enableStages_str(node_ptr->handlerStage.enable).c_str());
|
||||
get_enableStages_str(node_ptr->enableStage).c_str());
|
||||
}
|
||||
return (PASS);
|
||||
}
|
||||
@ -5843,7 +5863,7 @@ int nodeLinkClass::set_enableStage ( string & hostname,
|
||||
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
|
||||
if ( node_ptr != NULL )
|
||||
{
|
||||
node_ptr->handlerStage.enable = stage ;
|
||||
node_ptr->enableStage = stage ;
|
||||
return (PASS);
|
||||
}
|
||||
return (FAIL);
|
||||
@ -5867,7 +5887,7 @@ mtc_enableStages_enum nodeLinkClass::get_enableStage ( string & hostname)
|
||||
nodeLinkClass::node * node_ptr = getNode ( hostname ) ;
|
||||
if ( node_ptr != NULL )
|
||||
{
|
||||
return ( node_ptr->handlerStage.enable ) ;
|
||||
return ( node_ptr->enableStage ) ;
|
||||
}
|
||||
return (MTC_ENABLE__STAGES);
|
||||
}
|
||||
@ -6124,16 +6144,15 @@ int nodeLinkClass::adminActionChange ( struct nodeLinkClass::node * node_ptr,
|
||||
node_ptr->node_unlocked_counter++ ;
|
||||
}
|
||||
|
||||
if ( is_controller ( node_ptr ) )
|
||||
autorecovery_clear (node_ptr->hostname);
|
||||
ar_enable (node_ptr);
|
||||
|
||||
node_ptr->handlerStage.enable = MTC_ENABLE__START ;
|
||||
node_ptr->enableStage = MTC_ENABLE__START ;
|
||||
break ;
|
||||
}
|
||||
case MTC_ADMIN_ACTION__LOCK:
|
||||
case MTC_ADMIN_ACTION__FORCE_LOCK:
|
||||
{
|
||||
node_ptr->handlerStage.disable = MTC_DISABLE__START ;
|
||||
node_ptr->disableStage = MTC_DISABLE__START ;
|
||||
break ;
|
||||
}
|
||||
case MTC_ADMIN_ACTION__RESET:
|
||||
@ -6423,26 +6442,26 @@ int nodeLinkClass::enableStageChange ( struct nodeLinkClass::node * node_ptr,
|
||||
{
|
||||
/* TODO: Consider converting stage to strings ... */
|
||||
if (( newHdlrStage >= MTC_ENABLE__STAGES ) ||
|
||||
( node_ptr->handlerStage.enable >= MTC_ENABLE__STAGES ))
|
||||
( node_ptr->enableStage >= MTC_ENABLE__STAGES ))
|
||||
{
|
||||
slog ("%s has invalid Enable stage (%d:%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->handlerStage.enable,
|
||||
node_ptr->enableStage,
|
||||
newHdlrStage );
|
||||
|
||||
node_ptr->handlerStage.enable = MTC_ENABLE__FAILURE ;
|
||||
node_ptr->enableStage = MTC_ENABLE__FAILURE ;
|
||||
|
||||
/* TODO: cause failed or degraded state ? */
|
||||
return (FAIL);
|
||||
}
|
||||
else if ( node_ptr->handlerStage.enable != newHdlrStage )
|
||||
else if ( node_ptr->enableStage != newHdlrStage )
|
||||
{
|
||||
clog ("%s %s -> %s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_enableStages_str(node_ptr->handlerStage.enable).c_str(),
|
||||
get_enableStages_str(node_ptr->enableStage).c_str(),
|
||||
get_enableStages_str(newHdlrStage).c_str());
|
||||
|
||||
node_ptr->handlerStage.enable = newHdlrStage ;
|
||||
node_ptr->enableStage = newHdlrStage ;
|
||||
return (PASS);
|
||||
}
|
||||
else
|
||||
@ -6450,7 +6469,7 @@ int nodeLinkClass::enableStageChange ( struct nodeLinkClass::node * node_ptr,
|
||||
/* No state change */
|
||||
dlog1 ("%s %s -> %s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_enableStages_str(node_ptr->handlerStage.enable).c_str(),
|
||||
get_enableStages_str(node_ptr->enableStage).c_str(),
|
||||
get_enableStages_str(newHdlrStage).c_str());
|
||||
return (PASS);
|
||||
}
|
||||
@ -6461,15 +6480,15 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr,
|
||||
mtc_disableStages_enum newHdlrStage )
|
||||
{
|
||||
/* TODO: Consider converting stage to strings ... */
|
||||
if (( newHdlrStage >= MTC_DISABLE__STAGES ) ||
|
||||
( node_ptr->handlerStage.disable >= MTC_DISABLE__STAGES ))
|
||||
if (( newHdlrStage >= MTC_DISABLE__STAGES ) ||
|
||||
( node_ptr->disableStage >= MTC_DISABLE__STAGES ))
|
||||
{
|
||||
slog ("%s has invalid disable stage (%d:%d)\n",
|
||||
slog ("%s has invalid disable stage (%d:%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->handlerStage.disable,
|
||||
node_ptr->disableStage,
|
||||
newHdlrStage );
|
||||
|
||||
node_ptr->handlerStage.disable = MTC_DISABLE__DISABLED ;
|
||||
node_ptr->disableStage = MTC_DISABLE__DISABLED ;
|
||||
|
||||
/* TODO: cause failed or degraded state ? */
|
||||
return (FAIL);
|
||||
@ -6478,10 +6497,10 @@ int nodeLinkClass::disableStageChange ( struct nodeLinkClass::node * node_ptr,
|
||||
{
|
||||
clog ("%s %s -> %s\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_disableStages_str(node_ptr->handlerStage.disable).c_str(),
|
||||
get_disableStages_str(node_ptr->disableStage).c_str(),
|
||||
get_disableStages_str(newHdlrStage).c_str());
|
||||
|
||||
node_ptr->handlerStage.disable = newHdlrStage ;
|
||||
node_ptr->disableStage = newHdlrStage ;
|
||||
return (PASS);
|
||||
}
|
||||
}
|
||||
@ -7053,61 +7072,131 @@ struct nodeLinkClass::node * nodeLinkClass::get_insvTestTimer ( timer_t tid )
|
||||
return static_cast<struct node *>(NULL);
|
||||
}
|
||||
|
||||
|
||||
/*****************************************************************************
|
||||
*
|
||||
* Name : autorecovery_clear
|
||||
* Name : ar_enable
|
||||
*
|
||||
* Assumptions: Applies when simplex.
|
||||
*
|
||||
* Description: Removes the auto recovery count file if it exists.
|
||||
*
|
||||
* Auto recovery count is tracked/preserved in a host named auto recovery
|
||||
* counter file /tmp/hostname_ar_count.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
void autorecovery_clear ( string hostname )
|
||||
{
|
||||
string ar_file = TMP_DIR_PATH + hostname + AUTO_RECOVERY_FILE_SUFFIX ;
|
||||
if ( daemon_is_file_present (ar_file.data()))
|
||||
{
|
||||
wlog ("%s clearing autorecovery counter\n", hostname.c_str());
|
||||
daemon_remove_file (ar_file.data());
|
||||
}
|
||||
}
|
||||
|
||||
/*****************************************************************************
|
||||
*
|
||||
* Name : manage_autorecovery
|
||||
*
|
||||
* Assumptions: Applies to the active controller only while simplex.
|
||||
*
|
||||
* Description: Issues an immediate lazy reboot if the autorecovery threshold
|
||||
* is reached. Otherwise it disables autorecovery and returns
|
||||
* do we don't get a rolling boot loop.
|
||||
* Description: Clears all auto recovery state for the specified host and
|
||||
* removes the auto recovery count file if it exists.
|
||||
*
|
||||
* Auto recovery count is tracked/preserved in a host named auto recovery
|
||||
* counter file /etc/mtc/tmp/hostname_ar_count.
|
||||
*
|
||||
* in the event of a persistent autorecovery failure that results in a
|
||||
* disable then the active controller goes enabled-degraded with a horizon
|
||||
* status that indicates the active controller has a critical failure but
|
||||
* auto recovery is disabled. The enable alarm is raised.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
void nodeLinkClass::manage_autorecovery ( struct nodeLinkClass::node * node_ptr )
|
||||
void nodeLinkClass::ar_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
/* manage removing the auto recovery threshold count file */
|
||||
if ( ( THIS_HOST ) &&
|
||||
( this->autorecovery_enabled == true ) &&
|
||||
( this->autorecovery_disabled == false ) &&
|
||||
( is_inactive_controller_main_insv() == false ))
|
||||
string ar_file = TMP_DIR_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ;
|
||||
if ( daemon_is_file_present (ar_file.data()))
|
||||
{
|
||||
int value = 0 ;
|
||||
string ar_file = TMP_DIR_PATH + node_ptr->hostname + AUTO_RECOVERY_FILE_SUFFIX ;
|
||||
int threshold = daemon_get_cfg_ptr()->autorecovery_threshold ;
|
||||
wlog ("%s clearing autorecovery file counter\n", node_ptr->hostname.c_str());
|
||||
daemon_remove_file (ar_file.data());
|
||||
}
|
||||
|
||||
if (( node_ptr->ar_disabled ) ||
|
||||
( node_ptr->ar_cause != MTC_AR_DISABLE_CAUSE__NONE ))
|
||||
{
|
||||
wlog ("%s re-enabling autorecovery\n", node_ptr->hostname.c_str());
|
||||
}
|
||||
|
||||
node_ptr->ar_disabled = false ;
|
||||
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
|
||||
memset (&node_ptr->ar_count, 0, sizeof(node_ptr->ar_count));
|
||||
|
||||
node_ptr->ar_log_throttle = 0 ;
|
||||
}
|
||||
|
||||
/*****************************************************************************
|
||||
*
|
||||
* Name : ar_manage
|
||||
*
|
||||
* Purpose : Manage Auto Recovery state.
|
||||
*
|
||||
* Description: the following checks and operations are performed ...
|
||||
*
|
||||
* Pre Checks:
|
||||
*
|
||||
* Validate auto recovery cause code
|
||||
* Return if already in ar_disabled state. Unlikely but safe guard.
|
||||
*
|
||||
* Manage Auto Recovery:
|
||||
*
|
||||
* Case 1: Failed active controller with no enabled inactive controller.
|
||||
*
|
||||
* Requires persistent count file and self reboot until threshold
|
||||
* is reached.
|
||||
*
|
||||
* Issues an immediate lazy reboot if the autorecovery threshold
|
||||
* is not reached. Otherwise it disables autorecovery and returns
|
||||
* so we don't get a rolling boot loop.
|
||||
*
|
||||
* Auto recovery count is tracked/preserved in a host named auto
|
||||
* recovery counter file /etc/mtc/tmp/hostname_ar_count.
|
||||
*
|
||||
* Case 2: All other cases
|
||||
*
|
||||
* Case 2a: No auto recovery thresholding of active controller in non AIO SX
|
||||
* where the user can't lock and unlock the active controller.
|
||||
*
|
||||
* Maintain auto recovery count and set ar_disabled for the host when
|
||||
* the threshold is reached.
|
||||
*
|
||||
* Parameters:
|
||||
*
|
||||
* node_ptr nodeLinkClass ptr of failing host.
|
||||
*
|
||||
* cause autorecovery_disable_cause_enum failure cause code.
|
||||
*
|
||||
* string host status string to display when auto recovery
|
||||
* threshold is reached and autorecovery is disabled.
|
||||
*
|
||||
* Returns:
|
||||
*
|
||||
* FAIL tells the caller to break from its FSM at earliest opportunity
|
||||
* because auto recovery threshold is reached and auto recovery
|
||||
* is disabled.
|
||||
*
|
||||
* PASS tells the caller that the threshold is not reached and to
|
||||
* continue handling the failure.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
int nodeLinkClass::ar_manage ( struct nodeLinkClass::node * node_ptr,
|
||||
autorecovery_disable_cause_enum cause,
|
||||
string ar_disable_banner )
|
||||
{
|
||||
int rc = FAIL ;
|
||||
|
||||
/* Auto recovery only applies for hosts that are unlocked
|
||||
* and not already in ar_disabled state */
|
||||
if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
|
||||
( node_ptr->ar_disabled ))
|
||||
{
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/* check for invalid call case */
|
||||
if ( cause >= MTC_AR_DISABLE_CAUSE__LAST )
|
||||
{
|
||||
slog ("%s called with invalid auto recovery cause (%d)",
|
||||
node_ptr->hostname.c_str(), cause );
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/* update cause code */
|
||||
if ( node_ptr->ar_cause != cause )
|
||||
node_ptr->ar_cause = cause ;
|
||||
|
||||
|
||||
/* Case 1 check */
|
||||
if ( ( THIS_HOST ) && ( is_inactive_controller_main_insv() == false ))
|
||||
{
|
||||
/* manage the auto recovery threshold count file */
|
||||
unsigned int value = 0 ;
|
||||
|
||||
string ar_file = TMP_DIR_PATH +
|
||||
node_ptr->hostname +
|
||||
AUTO_RECOVERY_FILE_SUFFIX ;
|
||||
|
||||
if ( daemon_is_file_present (ar_file.data()))
|
||||
{
|
||||
@ -7119,48 +7208,75 @@ void nodeLinkClass::manage_autorecovery ( struct nodeLinkClass::node * node_ptr
|
||||
/* Save the new value in the file */
|
||||
daemon_log_value ( ar_file.data(), value );
|
||||
|
||||
value = daemon_get_file_int ( ar_file.data() );
|
||||
|
||||
/* set rc to reflect what the caller should do */
|
||||
if ( value > threshold )
|
||||
if ( value > this->ar_threshold[node_ptr->ar_cause] )
|
||||
{
|
||||
elog ("%s auto recovery threshold exceeded (%d)\n",
|
||||
node_ptr->hostname.c_str(), threshold );
|
||||
node_ptr->hostname.c_str(),
|
||||
this->ar_threshold[node_ptr->ar_cause] );
|
||||
|
||||
this->autorecovery_disabled = true ;
|
||||
|
||||
if ( this->system_type == SYSTEM_TYPE__CPE_MODE__SIMPLEX )
|
||||
{
|
||||
alarm_compute_failure ( node_ptr , FM_ALARM_SEVERITY_CRITICAL ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
alarm_enabled_failure ( node_ptr , true ) ;
|
||||
}
|
||||
node_ptr->ar_disabled = true ;
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
|
||||
allStateChange ( node_ptr, node_ptr->adminState,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
|
||||
mtcInvApi_update_task ( node_ptr,
|
||||
MTC_TASK_AUTO_RECOVERY_DISABLED );
|
||||
mtcInvApi_update_task ( node_ptr, ar_disable_banner );
|
||||
|
||||
return ;
|
||||
return (rc);
|
||||
}
|
||||
|
||||
wlog ("%s auto recovery (try %d of %d)\n",
|
||||
node_ptr->hostname.c_str(), value , threshold );
|
||||
wlog ("%s auto recovery (try %d of %d) (%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
value,
|
||||
this->ar_threshold[node_ptr->ar_cause],
|
||||
node_ptr->ar_cause);
|
||||
|
||||
mtcInvApi_update_states_now ( node_ptr,
|
||||
"unlocked",
|
||||
"disabled",
|
||||
"failed",
|
||||
"disabled",
|
||||
"failed" );
|
||||
|
||||
mtcInvApi_update_task_now ( node_ptr,
|
||||
MTC_TASK_AUTO_RECOVERY );
|
||||
mtcInvApi_update_states_now ( node_ptr, "unlocked",
|
||||
"disabled", "failed",
|
||||
"disabled", "failed" );
|
||||
|
||||
lazy_graceful_fs_reboot ( node_ptr );
|
||||
}
|
||||
else /* Case 2 */
|
||||
{
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_STOP_HOST );
|
||||
mtcInvApi_update_states ( node_ptr, "unlocked", "disabled", "failed" );
|
||||
|
||||
if (( NOT_THIS_HOST ) &&
|
||||
( this->system_type != SYSTEM_TYPE__CPE_MODE__SIMPLEX ))
|
||||
{
|
||||
if ( ++node_ptr->ar_count[node_ptr->ar_cause] >=
|
||||
this->ar_threshold [node_ptr->ar_cause] )
|
||||
{
|
||||
elog ("%s auto recovery threshold exceeded (%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
this->ar_threshold[node_ptr->ar_cause] );
|
||||
node_ptr->ar_disabled = true ;
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
mtcInvApi_update_task ( node_ptr, ar_disable_banner );
|
||||
rc = FAIL ;
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s auto recovery (try %d of %d) (%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->ar_count[node_ptr->ar_cause],
|
||||
this->ar_threshold[node_ptr->ar_cause],
|
||||
node_ptr->ar_cause);
|
||||
rc = PASS ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s auto recovery\n", node_ptr->hostname.c_str());
|
||||
rc = PASS ;
|
||||
}
|
||||
}
|
||||
return (rc);
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
@ -7198,13 +7314,8 @@ void nodeLinkClass::report_dor_recovery ( struct nodeLinkClass::node * node_ptr,
|
||||
|
||||
void nodeLinkClass::force_full_enable ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
/* don't do a full enable if active controller in simplex mode */
|
||||
if ( THIS_HOST && SIMPLEX )
|
||||
{
|
||||
wlog ("%s avoiding full enable of simplex system\n", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... lock and unlock host to force recovery\n", node_ptr->hostname.c_str());
|
||||
if ( node_ptr->ar_disabled == true )
|
||||
return ;
|
||||
}
|
||||
|
||||
if ( node_ptr->was_dor_recovery_mode )
|
||||
{
|
||||
@ -8599,13 +8710,14 @@ void nodeLinkClass::mem_log_alarm1 ( struct nodeLinkClass::node * node_ptr )
|
||||
void nodeLinkClass::mem_log_stage ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
char str[MAX_MEM_LOG_DATA] ;
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAdd:%d Offline:%d: Swact:%d Recovery:%d Able:%d\n",
|
||||
snprintf (&str[0], MAX_MEM_LOG_DATA, "%s\tAdd:%d Offline:%d: Swact:%d Recovery:%d Enable:%d Disable:%d\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->addStage,
|
||||
node_ptr->offlineStage,
|
||||
node_ptr->swactStage,
|
||||
node_ptr->addStage,
|
||||
node_ptr->offlineStage,
|
||||
node_ptr->swactStage,
|
||||
node_ptr->recoveryStage,
|
||||
node_ptr->handlerStage.raw);
|
||||
node_ptr->enableStage,
|
||||
node_ptr->disableStage);
|
||||
mem_log (str);
|
||||
}
|
||||
|
||||
|
@ -254,13 +254,9 @@ private:
|
||||
mtc_nodeOperState_enum operState_dport ; /**< Data Port Operational State */
|
||||
mtc_nodeAvailStatus_enum availStatus_dport; /**< Data Port Availability Status */
|
||||
|
||||
|
||||
/** Maintains the current handler stage.
|
||||
* This is a union of all handler types such as enable,
|
||||
* disable, degrade etc. See nodeBase.h for list of union members */
|
||||
mtc_stages_union handlerStage;
|
||||
|
||||
/* Individual FSM handler stages */
|
||||
mtc_enableStages_enum enableStage ;
|
||||
mtc_disableStages_enum disableStage ;
|
||||
mtc_offlineStages_enum offlineStage ;
|
||||
mtc_onlineStages_enum onlineStage ;
|
||||
mtc_swactStages_enum swactStage ;
|
||||
@ -380,6 +376,24 @@ private:
|
||||
/** when true requests the task for this host be cleared at first opportunity */
|
||||
bool clear_task ;
|
||||
|
||||
/******* Auto Recovery Control Structure and member Functions ********/
|
||||
|
||||
/* reason/cause based host level enable failure counter */
|
||||
unsigned int ar_count[MTC_AR_DISABLE_CAUSE__LAST] ;
|
||||
|
||||
/* The last enable failure reason/cause.
|
||||
* Note: MTC_AR_DISABLE_CAUSE__NONE is no failure (default) */
|
||||
autorecovery_disable_cause_enum ar_cause ;
|
||||
|
||||
/* when true indicates that a host has reached its enable failure
|
||||
* threshold and is left in the unlocked-disabled state */
|
||||
bool ar_disabled ;
|
||||
|
||||
/* throttles the ar_disabled log to periodically indicate auto
|
||||
* recovery disabled state but avoid flooding that same message. */
|
||||
#define AR_LOG_THROTTLE_THRESHOLD (100000)
|
||||
unsigned int ar_log_throttle ;
|
||||
|
||||
/** Host's mtc timer struct. Use to time handler stages.
|
||||
*
|
||||
* reset -> reset command response
|
||||
@ -870,9 +884,10 @@ private:
|
||||
|
||||
int update_dport_states ( struct nodeLinkClass::node * node_ptr, int event );
|
||||
|
||||
/* manage deciding to return or issue an immediate reboot if the
|
||||
* auto recovery threshold is exceeded. */
|
||||
void manage_autorecovery ( struct nodeLinkClass::node * node_ptr );
|
||||
/* manage auto recovery */
|
||||
int ar_manage ( struct nodeLinkClass::node * node_ptr,
|
||||
autorecovery_disable_cause_enum cause,
|
||||
string ar_disable_banner );
|
||||
|
||||
/** ***********************************************************************
|
||||
*
|
||||
@ -1041,6 +1056,12 @@ private:
|
||||
void clear_main_failed_bools ( struct nodeLinkClass::node * node_ptr );
|
||||
void clear_hostservices_ctls ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
/* Enables/Clears dynamic auto recovery state. start fresh !
|
||||
* called in disabled_handler (lock) and in the DONE stages
|
||||
* of the enable handler. */
|
||||
void ar_enable ( struct nodeLinkClass::node * node_ptr );
|
||||
|
||||
|
||||
/** Find the node that has this timerID in its general mtc timer */
|
||||
struct nodeLinkClass::node * get_mtcTimer_timer ( timer_t tid );
|
||||
struct nodeLinkClass::node * get_mtcConfig_timer ( timer_t tid );
|
||||
@ -2005,22 +2026,36 @@ public:
|
||||
int compute_mtcalive_timeout;
|
||||
int controller_mtcalive_timeout ;
|
||||
int goenabled_timeout ;
|
||||
|
||||
/** /etc/mtc.conf configurable audit intervals */
|
||||
int swact_timeout ;
|
||||
int sysinv_timeout ;
|
||||
int sysinv_noncrit_timeout ;
|
||||
int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout */
|
||||
int work_queue_timeout ;
|
||||
int loc_recovery_timeout ; /**< Loss Of Communication Recovery Timeout */
|
||||
int work_queue_timeout ;
|
||||
int node_reinstall_timeout ;
|
||||
|
||||
/** /etc/mtc.ini configurable audit intervals */
|
||||
int insv_test_period ;
|
||||
int oos_test_period ;
|
||||
int uptime_period ;
|
||||
int online_period ;
|
||||
int token_refresh_rate;
|
||||
|
||||
/* Service specific max failures before autorecovery is disabled.
|
||||
*
|
||||
* ... values for each service are loaded from mtc config
|
||||
* file at daemon startup
|
||||
*/
|
||||
unsigned int ar_threshold[MTC_AR_DISABLE_CAUSE__LAST] ;
|
||||
|
||||
/* service specific secs between autorecovery retries.
|
||||
*
|
||||
* ... values for each service are loaded from mtc config
|
||||
* file at daemon startup
|
||||
*/
|
||||
unsigned int ar_interval[MTC_AR_DISABLE_CAUSE__LAST] ;
|
||||
|
||||
int unknown_host_throttle ;
|
||||
int invalid_arg_throttle ;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -2052,7 +2087,6 @@ const char * get_adminAction_str ( mtc_nodeAdminAction_enum action );
|
||||
string bmc_get_ip ( string hostname, string mac , string & current_bm_ip );
|
||||
void clear_host_degrade_causes ( unsigned int & degrade_mask );
|
||||
bool sensor_monitoring_supported ( string hostname );
|
||||
void autorecovery_clear ( string hostname );
|
||||
void log_mnfa_pool ( std::list<string> & mnfa_awol_list );
|
||||
|
||||
#endif /* __INCLUDE_NODECLASS_H__ */
|
||||
|
@ -316,6 +316,25 @@ static int mtc_config_handler ( void * user,
|
||||
mtcInv.offline_threshold = atoi(value);
|
||||
ilog ("OfflineThrsh: %d\n", mtcInv.offline_threshold );
|
||||
}
|
||||
|
||||
else if (MATCH("agent", "ar_config_threshold"))
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__CONFIG] = atoi(value);
|
||||
else if (MATCH("agent", "ar_goenable_threshold"))
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__GOENABLE] = atoi(value);
|
||||
else if (MATCH("agent", "ar_hostservices_threshold"))
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value);
|
||||
else if (MATCH("agent", "ar_heartbeat_threshold"))
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value);
|
||||
|
||||
else if (MATCH("agent", "ar_config_interval"))
|
||||
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__CONFIG] = atoi(value);
|
||||
else if (MATCH("agent", "ar_goenable_interval"))
|
||||
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__GOENABLE] = atoi(value);
|
||||
else if (MATCH("agent", "ar_hostservices_interval"))
|
||||
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HOST_SERVICES] = atoi(value);
|
||||
else if (MATCH("agent", "ar_heartbeat_interval"))
|
||||
mtcInv.ar_interval[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = atoi(value);
|
||||
|
||||
else
|
||||
{
|
||||
return (PASS);
|
||||
@ -635,6 +654,20 @@ int daemon_configure ( void )
|
||||
ilog("hwmond : %d (port)\n", mtc_config.hwmon_cmd_port );
|
||||
ilog("auth_host : %s \n", mtc_config.keystone_auth_host );
|
||||
|
||||
/* log system wide service based auto recovery control values */
|
||||
ilog("AR Config : %d (threshold) %d sec (retry interval)",
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__CONFIG],
|
||||
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__CONFIG]);
|
||||
ilog("AR GoEnable : %d (threshold) %d sec (retry interval)",
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__GOENABLE],
|
||||
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__GOENABLE]);
|
||||
ilog("AR Host Svcs: %d (threshold) %d sec (retry interval)",
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HOST_SERVICES],
|
||||
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__HOST_SERVICES]);
|
||||
ilog("AR Heartbeat: %d (threshold) %d sec (retry interval)",
|
||||
mtcInv.ar_threshold[MTC_AR_DISABLE_CAUSE__HEARTBEAT],
|
||||
mtcInv.ar_interval [MTC_AR_DISABLE_CAUSE__HEARTBEAT]);
|
||||
|
||||
/* Get this Controller Activity State */
|
||||
mtc_config.active = daemon_get_run_option ("active") ;
|
||||
ilog ("Controller : %s\n",
|
||||
|
@ -118,7 +118,8 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
|
||||
* the insv_test_handler gets run as soon as a host's main function is enabled.
|
||||
****************************************************************************
|
||||
*/
|
||||
if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
if (( node_ptr->ar_disabled == false ) &&
|
||||
( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
((node_ptr->availStatus == MTC_AVAIL_STATUS__AVAILABLE ) ||
|
||||
(node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED )))
|
||||
@ -265,7 +266,7 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
flog ("%s -> Running SubFunction Enable handler (%d)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->handlerStage.enable );
|
||||
node_ptr->enableStage );
|
||||
|
||||
nodeLinkClass::enable_subf_handler ( node_ptr );
|
||||
}
|
||||
|
@ -446,6 +446,15 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
int rc = PASS ;
|
||||
|
||||
if ( node_ptr->ar_disabled == true )
|
||||
{
|
||||
wlog_throttled ( node_ptr->ar_log_throttle,
|
||||
AR_LOG_THROTTLE_THRESHOLD,
|
||||
"%s auto recovery disabled cause:%d",
|
||||
node_ptr->hostname.c_str(), node_ptr->ar_cause );
|
||||
return (RETRY); ;
|
||||
}
|
||||
|
||||
if ( THIS_HOST )
|
||||
{
|
||||
/******************************************************************
|
||||
@ -476,7 +485,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
}
|
||||
|
||||
switch ( (int)node_ptr->handlerStage.enable )
|
||||
switch ( (int)node_ptr->enableStage )
|
||||
{
|
||||
case MTC_ENABLE__FAILURE:
|
||||
{
|
||||
@ -539,7 +548,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
if ( is_inactive_controller_main_insv() == true )
|
||||
{
|
||||
wlog ("%s has critical failure\n", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... requesting swact to in-service inactive controller\n", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... requesting swact to peer controller",
|
||||
node_ptr->hostname.c_str());
|
||||
|
||||
mtcInvApi_update_task_now ( node_ptr, MTC_TASK_FAILED_SWACT_REQ );
|
||||
|
||||
@ -587,19 +597,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
else
|
||||
{
|
||||
this->autorecovery_enabled = true ;
|
||||
|
||||
/* use thresholded auto recovery for simplex failure case */
|
||||
manage_autorecovery ( node_ptr );
|
||||
|
||||
if ( this->autorecovery_disabled == false )
|
||||
{
|
||||
wlog ("%s has critical failure.\n", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... downgrading to degrade with auto recovery disabled\n", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... to avoid disabling only enabled controller\n", node_ptr->hostname.c_str());
|
||||
this->autorecovery_disabled = true ;
|
||||
}
|
||||
|
||||
if (( CPE_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
/* Raise Critical Compute Function Alarm */
|
||||
@ -620,7 +617,22 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
MTC_AVAIL_STATUS__FAILED );
|
||||
}
|
||||
|
||||
if ( degrade_only == true )
|
||||
/* if we get here in controller simplex mode then go degraded
|
||||
* if we are not already degraded. Otherwise, fail. */
|
||||
if ( THIS_HOST && ( is_inactive_controller_main_insv() == false ))
|
||||
{
|
||||
if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
|
||||
( node_ptr->operState != MTC_OPER_STATE__ENABLED ) ||
|
||||
( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED))
|
||||
{
|
||||
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
/* adminAction state is already changed to NONE. */
|
||||
}
|
||||
|
||||
else if ( degrade_only == true )
|
||||
{
|
||||
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
@ -636,25 +648,28 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
/* Inform the VIM of the failure */
|
||||
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
||||
|
||||
/* if we get here in controller simplex mode then go degraded
|
||||
* if we are not already degraded. Otherwise, fail. */
|
||||
if ( THIS_HOST && ( is_inactive_controller_main_insv() == false ))
|
||||
/* handle thresholded auto recovery retry delay interval */
|
||||
if ( node_ptr->ar_cause < MTC_AR_DISABLE_CAUSE__LAST )
|
||||
{
|
||||
/* autorecovery must be disabled */
|
||||
if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
|
||||
( node_ptr->operState != MTC_OPER_STATE__ENABLED ) ||
|
||||
( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED))
|
||||
unsigned int interval = this->ar_interval[node_ptr->ar_cause] ;
|
||||
if ( interval )
|
||||
{
|
||||
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
/* Wait this failure cause's retry delay */
|
||||
mtcTimer_start ( node_ptr->mtcTimer,
|
||||
mtcTimer_handler,
|
||||
interval );
|
||||
|
||||
wlog ("%s waiting %d secs before enable sequence retry (%d)",
|
||||
node_ptr->hostname.c_str(),
|
||||
interval, node_ptr->ar_cause );
|
||||
}
|
||||
/* adminAction state is already changed to NONE. */
|
||||
else
|
||||
node_ptr->mtcTimer.ring = true ;
|
||||
}
|
||||
else
|
||||
{
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_WAIT );
|
||||
}
|
||||
node_ptr->mtcTimer.ring = true ;
|
||||
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_WAIT );
|
||||
|
||||
break;
|
||||
}
|
||||
@ -717,6 +732,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
mtcCmd_workQ_purge ( node_ptr );
|
||||
mtcCmd_doneQ_purge ( node_ptr );
|
||||
|
||||
node_ptr->mtce_flags = 0 ;
|
||||
|
||||
/* Assert the mtc alive gate */
|
||||
node_ptr->mtcAlive_gate = true ;
|
||||
|
||||
@ -739,8 +756,9 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
/* enable auto recovery if the inactive controller
|
||||
* is out of service */
|
||||
if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST ))
|
||||
this->autorecovery_enabled = true ;
|
||||
//if (( is_controller (node_ptr) ) && ( NOT_THIS_HOST ))
|
||||
// node_ptr->ar_disabled = false ;
|
||||
// this->autorecovery_enabled = true ;
|
||||
|
||||
/* fall through */
|
||||
|
||||
@ -757,20 +775,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
get_availStatus_str(node_ptr->availStatus).c_str());
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, "" );
|
||||
|
||||
/* Special case */
|
||||
// alarm_enabled_clear ( node_ptr, false );
|
||||
|
||||
//mtcAlarm_clear ( node_ptr->hostname, MTC_ALARM_ID__CONFIG );
|
||||
//node_ptr->alarms[MTC_ALARM_ID__CONFIG] = FM_ALARM_SEVERITY_CLEAR ;
|
||||
|
||||
//allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
||||
// MTC_OPER_STATE__ENABLED,
|
||||
// MTC_AVAIL_STATUS__DEGRADED );
|
||||
|
||||
// adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
|
||||
// return (PASS);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -986,6 +990,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->mtcAlive_online = false ;
|
||||
node_ptr->mtcAlive_offline = true ;
|
||||
node_ptr->goEnabled = false ;
|
||||
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
|
||||
|
||||
clear_service_readies ( node_ptr );
|
||||
|
||||
@ -1038,17 +1043,24 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
|
||||
/* Check to see if the host is/got configured correctly */
|
||||
if ( (node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED) == 0 )
|
||||
if ((( !node_ptr->mtce_flags & MTC_FLAG__I_AM_CONFIGURED )) ||
|
||||
(( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY )))
|
||||
{
|
||||
elog ("%s configuration incomplete or failed (oob:%x:%x)\n",
|
||||
elog ("%s configuration failed or incomplete (oob:%x)\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->mtce_flags,
|
||||
MTC_FLAG__I_AM_CONFIGURED);
|
||||
node_ptr->mtce_flags)
|
||||
|
||||
/* raise an alarm for the failure of the config */
|
||||
alarm_config_failure ( node_ptr );
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_CONFIG_FAIL );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
|
||||
/* handle auto recovery for this failure */
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__CONFIG,
|
||||
MTC_TASK_AR_DISABLED_CONFIG ) != PASS )
|
||||
break ;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1152,6 +1164,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
case MTC_ENABLE__GOENABLED_WAIT:
|
||||
{
|
||||
bool goenable_failed = false ;
|
||||
/* The healthy code comes from the host in the mtcAlive message.
|
||||
* This 'if' clause was introduced to detect failure of host
|
||||
* without having to wait for the GOENABLED phase to timeout.
|
||||
@ -1162,27 +1175,22 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
* be gracefully recovered to enabled in that case. Instead
|
||||
* we want to recover the card through a reset as quickly as
|
||||
* possible. */
|
||||
if ( node_ptr->health == NODE_UNHEALTHY )
|
||||
{
|
||||
elog ("%s is UNHEALTHY\n", node_ptr->hostname.c_str());
|
||||
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
/* search for the Go Enable message */
|
||||
else if ( node_ptr->goEnabled_failed == true )
|
||||
if (( node_ptr->health == NODE_UNHEALTHY ) ||
|
||||
(( node_ptr->mtce_flags & MTC_FLAG__I_AM_NOT_HEALTHY)) ||
|
||||
( node_ptr->goEnabled_failed == true ))
|
||||
{
|
||||
elog ("%s got GOENABLED Failed\n", node_ptr->hostname.c_str());
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
goenable_failed = true ;
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL );
|
||||
}
|
||||
/* search for the Go Enable message */
|
||||
else if ( node_ptr->goEnabled == true )
|
||||
{
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
plog ("%s got GOENABLED\n", node_ptr->hostname.c_str());
|
||||
// plog ("%s main configured OK\n", node_ptr->hostname.c_str());
|
||||
|
||||
/* O.K. clearing the state now that we got it */
|
||||
node_ptr->goEnabled = false ;
|
||||
@ -1194,26 +1202,28 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
else if ( mtcTimer_expired ( node_ptr->mtcTimer ))
|
||||
{
|
||||
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
|
||||
ilog ("%s ... the out-of-service tests took too long to complete\n",
|
||||
node_ptr->hostname.c_str());
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL_TO_ );
|
||||
elog ("%s has GOENABLED Timeout", node_ptr->hostname.c_str());
|
||||
node_ptr->mtcTimer.ring = false ;
|
||||
|
||||
/* raise an alarm for the enable failure */
|
||||
alarm_enabled_failure ( node_ptr , true );
|
||||
|
||||
/* go back and issue reboot again */
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
|
||||
/* no longer In-Test ; we are 'Failed' again */
|
||||
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__FAILED );
|
||||
goenable_failed = true ;
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO );
|
||||
}
|
||||
else
|
||||
{
|
||||
; /* wait some more */
|
||||
}
|
||||
|
||||
if ( goenable_failed )
|
||||
{
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
|
||||
/* handle auto recovery for this failure */
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__GOENABLE,
|
||||
MTC_TASK_AR_DISABLED_GOENABLE ) != PASS )
|
||||
break ;
|
||||
}
|
||||
break ;
|
||||
}
|
||||
|
||||
@ -1224,14 +1234,20 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
plog ("%s Starting Host Services\n", node_ptr->hostname.c_str());
|
||||
if ( this->launch_host_services_cmd ( node_ptr, start ) != PASS )
|
||||
{
|
||||
node_ptr->hostservices_failed = true ;
|
||||
|
||||
elog ("%s %s failed ; launch\n",
|
||||
node_ptr->hostname.c_str(),
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_START_SERVICE_FAIL );
|
||||
node_ptr->hostservices_failed = true ;
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_SERVICE_FAIL );
|
||||
|
||||
/* handle auto recovery for this failure */
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
|
||||
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
|
||||
break ;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1261,6 +1277,10 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
else if ( rc != PASS )
|
||||
{
|
||||
node_ptr->hostservices_failed = true ;
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
|
||||
|
||||
/* distinguish 'timeout' from other 'execution' failures */
|
||||
if ( rc == FAIL_TIMEOUT )
|
||||
{
|
||||
@ -1269,7 +1289,7 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
node_ptr->host_services_req.name.c_str());
|
||||
|
||||
mtcInvApi_update_task ( node_ptr,
|
||||
MTC_TASK_START_SERVICE_TO );
|
||||
MTC_TASK_MAIN_SERVICE_TO );
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1279,9 +1299,14 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
rc);
|
||||
|
||||
mtcInvApi_update_task ( node_ptr,
|
||||
MTC_TASK_START_SERVICE_FAIL );
|
||||
MTC_TASK_MAIN_SERVICE_FAIL );
|
||||
}
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE );
|
||||
|
||||
/* handle auto recovery for this failure */
|
||||
if ( ar_manage ( node_ptr,
|
||||
MTC_AR_DISABLE_CAUSE__HOST_SERVICES,
|
||||
MTC_TASK_AR_DISABLED_SERVICES ) != PASS )
|
||||
break ;
|
||||
}
|
||||
else /* success path */
|
||||
{
|
||||
@ -1321,8 +1346,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
}
|
||||
|
||||
/* Start Monitoring Services - heartbeat, process and hardware */
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
||||
|
||||
if ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE )
|
||||
{
|
||||
@ -1338,11 +1361,13 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
MTC_HEARTBEAT_SOAK_BEFORE_ENABLE,
|
||||
node_ptr->hbsClient_ready ? " ready event" : "out ready event" );
|
||||
|
||||
|
||||
/* allow heartbeat to run for MTC_HEARTBEAT_SOAK_BEFORE_ENABLE
|
||||
* seconds before we declare enable */
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_HEARTBEAT_SOAK_BEFORE_ENABLE );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__HEARTBEAT_SOAK );
|
||||
|
||||
/* Start Monitoring Services - heartbeat, process and hardware */
|
||||
send_hbs_command ( node_ptr->hostname, MTC_CMD_START_HOST );
|
||||
}
|
||||
break ;
|
||||
}
|
||||
@ -1351,6 +1376,11 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
if ( node_ptr->mtcTimer.ring == true )
|
||||
{
|
||||
plog ("%s heartbeating\n", node_ptr->hostname.c_str() );
|
||||
|
||||
/* handle auto recovery clear for this potential cause */
|
||||
node_ptr->ar_cause = MTC_AR_DISABLE_CAUSE__NONE ;
|
||||
node_ptr->ar_count[MTC_AR_DISABLE_CAUSE__HEARTBEAT] = 0 ;
|
||||
|
||||
/* if heartbeat is not working then we will
|
||||
* never get here and enable the host */
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__STATE_CHANGE );
|
||||
@ -1490,7 +1520,6 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
node_ptr->enabled_count++ ;
|
||||
|
||||
/* Inform the VIM that this host is enabled */
|
||||
@ -1505,6 +1534,8 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
|
||||
node_ptr->health_threshold_counter = 0 ;
|
||||
|
||||
ar_enable ( node_ptr );
|
||||
}
|
||||
|
||||
break ;
|
||||
@ -2103,7 +2134,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
elog ("%s got GOENABLED Failed\n", node_ptr->hostname.c_str());
|
||||
mtcTimer_reset ( node_ptr->mtcTimer );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_INTEST_FAIL );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_FAIL );
|
||||
this->force_full_enable ( node_ptr );
|
||||
}
|
||||
|
||||
@ -2121,6 +2152,7 @@ int nodeLinkClass::recovery_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
else if ( node_ptr->mtcTimer.ring == true )
|
||||
{
|
||||
elog ("%s has GOENABLED Timeout\n", node_ptr->hostname.c_str());
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_MAIN_INTEST_TO );
|
||||
|
||||
node_ptr->mtcTimer.ring = false ;
|
||||
|
||||
@ -2640,7 +2672,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
{
|
||||
int rc = PASS ;
|
||||
|
||||
switch ( (int)node_ptr->handlerStage.disable )
|
||||
switch ( (int)node_ptr->disableStage )
|
||||
{
|
||||
case MTC_DISABLE__START:
|
||||
{
|
||||
@ -2657,6 +2689,7 @@ int nodeLinkClass::disable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
clear_subf_failed_bools ( node_ptr );
|
||||
clear_hostservices_ctls ( node_ptr );
|
||||
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__START ) ;
|
||||