Fix mtcAgent AIO simplex subfunction failure handling over unlock

The mtcAgent is seen to get stuck handling a subfunction failure
detected over self (controller-0) unlock of an AIO simplex controller.

It gets stuck reporting that it is already handling the failure,
but it isn't.

log flooding: 'controller-0 already handling force full enable'

This issue only exists in AIO simplex when the subfunction enable
handler detects the failure. This issue was introduced by the
following update:

Remove Start Host Service Launch in mtcAgent & enhance fault detection
https://opendev.org/starlingx/metal/commit/6106051f1c

Test Plan:

PASS: Verify an AIO simplex self unlock subfunction failure leads to
      'degrade' state with 'enable failure' alarm.
PASS: Verify same issue for the standby controller leads to
      'failure' state with 'enable failure' alarm.

Regression:

PASS: Verify spontaneous unhealthy active controller is degraded.
PASS: Verify spontaneous unhealthy standby controller is failed.

Closes-Bug: 2119449
Change-Id: I5ab5e6d85906f1923a0828211dbf94d2f82e73f8
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald
2025-08-05 10:06:02 -04:00
parent f86bb1c10e
commit 2cb728678f
2 changed files with 52 additions and 28 deletions

View File

@@ -2837,10 +2837,17 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
node_ptr->operState = operState_str_to_enum (inv.oper.data ());
node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data());
if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
if ( is_controller(node_ptr) == true )
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
if ( node_ptr->hostname != this->my_hostname )
{
set_inactive_controller ( node_ptr->hostname ) ;
}
if ( AIO_SYSTEM )
{
node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data());
node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data());
}
}
/* Send back a retry so that this add is converted to a modify */

View File

@@ -694,37 +694,59 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr )
MTC_AVAIL_STATUS__FAILED );
}
/* if we get here in controller simplex mode then go degraded
* if we are not already degraded. Otherwise, fail. */
if ( THIS_HOST && ( inactive_controller_insv() == false ))
/* If we get here in controller non-simplex with no enabled
* standby then raise critical enable alarm and go degraded. */
if (( THIS_HOST) &&
( NOT_SIMPLEX ) &&
( degrade_only == false ) &&
( is_controller(node_ptr) == true ) &&
( inactive_controller_insv () == false ))
{
if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) ||
( node_ptr->operState != MTC_OPER_STATE__ENABLED ) ||
( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED))
{
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
}
/* adminAction state is already changed to NONE. */
degrade_only = true ;
}
else if ( degrade_only == true )
if ( degrade_only == true )
{
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
if ( ( AIO_SYSTEM ) && ( is_controller(node_ptr) == true ))
{
wlog ("%s active controller is %s-%s-%s with worker:%s-%s and no enabled standby controller - degrade only",
node_ptr->hostname.c_str(),
get_adminState_str(node_ptr->adminState).c_str(),
get_operState_str(node_ptr->operState).c_str(),
get_availStatus_str(node_ptr->availStatus).c_str(),
get_operState_str(node_ptr->operState_subf).c_str(),
get_availStatus_str(node_ptr->availStatus_subf).c_str());
}
else
{
wlog ("%s active controller is %s-%s-%s and no enabled standby controller - degrade only",
node_ptr->hostname.c_str(),
get_adminState_str(node_ptr->adminState).c_str(),
get_operState_str(node_ptr->operState).c_str(),
get_availStatus_str(node_ptr->availStatus).c_str());
}
wlog ("%s ... critical enable alarm raised", node_ptr->hostname.c_str());
wlog ("%s ... recommend enabling a standby controller.", node_ptr->hostname.c_str());
allStateChange ( node_ptr,
node_ptr->adminState,
MTC_OPER_STATE__ENABLED,
MTC_AVAIL_STATUS__DEGRADED );
alarm_enabled_failure ( node_ptr, true );
mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP);
adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE );
enableStageChange ( node_ptr, MTC_ENABLE__START );
}
else
{
allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED,
MTC_OPER_STATE__DISABLED,
MTC_AVAIL_STATUS__FAILED );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_TIMER );
}
/* Inform the VIM of the failure */
mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 );
enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_TIMER );
break ;
}
case MTC_ENABLE__FAILURE_TIMER:
@@ -8267,17 +8289,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr )
}
}
}
/*
* Send out-of-service test command and wait for the
* next audit interval to see the result.
*
* node_ptr->goEnabled_subf == true is pass
* node_ptr->goEnabled_subf_failed == true is fail
*
**/
if (( node_ptr->operState_subf == MTC_OPER_STATE__DISABLED ) &&
( node_ptr->operState == MTC_OPER_STATE__ENABLED ) &&
( node_ptr->forcing_full_enable == false ) &&
( node_ptr->ar_disabled == false ))
{
/* Only force recovery if the node is not already in recovery */
if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE_SUBF ) &&
( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE ))
{