Fix mtcAgent AIO simplex subfunction failure handling over unlock
The mtcAgent is seen to get stuck handling a subfunction failure
detected over self (controller-0) unlock of an AIO simplex controller.
It gets stuck reporting that it is already handling the failure,
but isn't.
log flooding: 'controller-0 already handling force full enable'
This issue only exists in AIO simplex when the subfunction enable
handler detects the failure. This issue was introduced by the
following update:
Remove Start Host Service Launch in mtcAgent & enhance fault detection
https://opendev.org/starlingx/metal/commit/6106051f1c
Test Plan:
PASS: Verify an AIO simplex self unlock subfunction failure leads to
'degrade' state with 'enable failure' alarm.
PASS: Verify same issue for the standby controller leads to
'failure' state with 'enable failure' alarm.
Regression:
PASS: Verify spontaneous unhealthy active controller is degraded.
PASS: Verify spontaneous unhealthy standby controller is failed.
Closes-Bug: 2119449
Change-Id: I5ab5e6d85906f1923a0828211dbf94d2f82e73f8
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
@@ -2837,10 +2837,17 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
||||
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
|
||||
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
|
||||
|
||||
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
if ( is_controller(node_ptr) == true )
|
||||
{
|
||||
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
|
||||
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
|
||||
if ( node_ptr->hostname != this->my_hostname )
|
||||
{
|
||||
set_inactive_controller ( node_ptr->hostname ) ;
|
||||
}
|
||||
if ( AIO_SYSTEM )
|
||||
{
|
||||
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
|
||||
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
|
||||
}
|
||||
}
|
||||
|
||||
/* Send back a retry so that this add is converted to a modify */
|
||||
|
||||
@@ -694,37 +694,59 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
MTC_AVAIL_STATUS__FAILED );
|
||||
}
|
||||
|
||||
/* if we get here in controller simplex mode then go degraded
|
||||
* if we are not already degraded. Otherwise, fail. */
|
||||
if ( THIS_HOST && ( inactive_controller_insv() == false ))
|
||||
/* If we get here in controller non-simplex with no enabled
|
||||
* standby then raise critical enable alarm and go degraded. */
|
||||
if (( THIS_HOST) &&
|
||||
( NOT_SIMPLEX ) &&
|
||||
( degrade_only == false ) &&
|
||||
( is_controller(node_ptr) == true ) &&
|
||||
( inactive_controller_insv () == false ))
|
||||
{
|
||||
if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
|
||||
( node_ptr->operState != MTC_OPER_STATE__ENABLED ) ||
|
||||
( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED))
|
||||
{
|
||||
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
}
|
||||
/* adminAction state is already changed to NONE. */
|
||||
degrade_only = true ;
|
||||
}
|
||||
|
||||
else if ( degrade_only == true )
|
||||
if ( degrade_only == true )
|
||||
{
|
||||
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
if ( ( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
|
||||
{
|
||||
wlog ("%s active controller is %s-%s-%s with worker:%s-%s and no enabled standby controller - degrade only",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_adminState_str(node_ptr->adminState).c_str(),
|
||||
get_operState_str(node_ptr->operState).c_str(),
|
||||
get_availStatus_str(node_ptr->availStatus).c_str(),
|
||||
get_operState_str(node_ptr->operState_subf).c_str(),
|
||||
get_availStatus_str(node_ptr->availStatus_subf).c_str());
|
||||
}
|
||||
else
|
||||
{
|
||||
wlog ("%s active controller is %s-%s-%s and no enabled standby controller - degrade only",
|
||||
node_ptr->hostname.c_str(),
|
||||
get_adminState_str(node_ptr->adminState).c_str(),
|
||||
get_operState_str(node_ptr->operState).c_str(),
|
||||
get_availStatus_str(node_ptr->availStatus).c_str());
|
||||
}
|
||||
wlog ("%s ... critical enable alarm raised", node_ptr->hostname.c_str());
|
||||
wlog ("%s ... recommend enabling a standby controller.", node_ptr->hostname.c_str());
|
||||
|
||||
allStateChange ( node_ptr,
|
||||
node_ptr->adminState,
|
||||
MTC_OPER_STATE__ENABLED,
|
||||
MTC_AVAIL_STATUS__DEGRADED );
|
||||
alarm_enabled_failure ( node_ptr, true );
|
||||
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP);
|
||||
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__START );
|
||||
}
|
||||
else
|
||||
{
|
||||
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
|
||||
MTC_OPER_STATE__DISABLED,
|
||||
MTC_AVAIL_STATUS__FAILED );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_TIMER );
|
||||
}
|
||||
|
||||
/* Inform the VIM of the failure */
|
||||
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
|
||||
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_TIMER );
|
||||
break ;
|
||||
}
|
||||
case MTC_ENABLE__FAILURE_TIMER:
|
||||
@@ -8267,17 +8289,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Send out-of-service test command and wait for the
|
||||
* next audit interval to see the result.
|
||||
*
|
||||
* node_ptr->goEnabled_subf == true is pass
|
||||
* node_ptr->goEnabled_subf_failed == true is fail
|
||||
*
|
||||
**/
|
||||
if (( node_ptr->operState_subf == MTC_OPER_STATE__DISABLED ) &&
|
||||
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
|
||||
( node_ptr->forcing_full_enable == false ) &&
|
||||
( node_ptr->ar_disabled == false ))
|
||||
{
|
||||
/* Only force recovery if the node is not already in recovery */
|
||||
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE_SUBF ) &&
|
||||
( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ))
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user