From 2cb728678f7999cfc02bbc088f8fe1545a792754 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Tue, 5 Aug 2025 10:06:02 -0400 Subject: [PATCH] Fix mtcAgent AIO simplex subfunction failure handling over unlock The mtcAgent is seen to get stuck handling a subfunction failure detected over self (controller-0) unlock of an AIO simplex controller. It gets stuck reporting that it is already handling the failure, but isn't. log flooding: 'controller-0 already handling force full enable' This issue only exists in AIO simplex when the subfunction enable handler detects the failure. This issue was introduced by the following update: Remove Start Host Service Launch in mtcAgent & enhance fault detection https://opendev.org/starlingx/metal/commit/ 6106051f1c442f9617d5643f032c3765c2d0f4ba Test Plan: PASS: Verify an AIO simplex self unlock subfunction failure leads to 'degrade' state with 'enable failure' alarm. PASS: Verify same issue for the standby controller leads to 'failure' state with 'enable failure' alarm. Regression: PASS: Verify spontaneous unhealthy active controller is degraded. PASS: Verify spontaneous unhealthy standby controller is failed. 
Closes-Bug: 2119449 Change-Id: I5ab5e6d85906f1923a0828211dbf94d2f82e73f8 Signed-off-by: Eric MacDonald --- mtce/src/common/nodeClass.cpp | 13 ++++-- mtce/src/maintenance/mtcNodeHdlrs.cpp | 67 +++++++++++++++++---------- 2 files changed, 52 insertions(+), 28 deletions(-) diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index a236acdc..de55dbc1 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -2837,10 +2837,17 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) node_ptr->operState = operState_str_to_enum (inv.oper.data ()); node_ptr->availStatus = availStatus_str_to_enum (inv.avail.data()); - if (( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) + if ( is_controller(node_ptr) == true ) { - node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data()); - node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data()); + if ( node_ptr->hostname != this->my_hostname ) + { + set_inactive_controller ( node_ptr->hostname ) ; + } + if ( AIO_SYSTEM ) + { + node_ptr->operState_subf = operState_str_to_enum (inv.oper_subf.data()); + node_ptr->availStatus_subf = availStatus_str_to_enum (inv.avail_subf.data()); + } } /* Send back a retry so that this add is converted to a modify */ diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 1495c056..929e617c 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -694,37 +694,59 @@ int nodeLinkClass::enable_handler ( struct nodeLinkClass::node * node_ptr ) MTC_AVAIL_STATUS__FAILED ); } - /* if we get here in controller simplex mode then go degraded - * if we are not already degraded. Otherwise, fail. */ - if ( THIS_HOST && ( inactive_controller_insv() == false )) + /* If we get here in controller non-simplex with no enabled + * standby then raise critical enable alarm and go degraded. 
*/ + if (( THIS_HOST) && + ( NOT_SIMPLEX ) && + ( degrade_only == false ) && + ( is_controller(node_ptr) == true ) && + ( inactive_controller_insv () == false )) { - if (( node_ptr->adminState != MTC_ADMIN_STATE__UNLOCKED ) || - ( node_ptr->operState != MTC_OPER_STATE__ENABLED ) || - ( node_ptr->availStatus != MTC_AVAIL_STATUS__DEGRADED)) - { - allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, - MTC_OPER_STATE__ENABLED, - MTC_AVAIL_STATUS__DEGRADED ); - } - /* adminAction state is already changed to NONE. */ + degrade_only = true ; } - else if ( degrade_only == true ) + if ( degrade_only == true ) { - allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, - MTC_OPER_STATE__ENABLED, - MTC_AVAIL_STATUS__DEGRADED ); + if ( ( AIO_SYSTEM ) && ( is_controller(node_ptr) == true )) + { + wlog ("%s active controller is %s-%s-%s with worker:%s-%s and no enabled standby controller - degrade only", + node_ptr->hostname.c_str(), + get_adminState_str(node_ptr->adminState).c_str(), + get_operState_str(node_ptr->operState).c_str(), + get_availStatus_str(node_ptr->availStatus).c_str(), + get_operState_str(node_ptr->operState_subf).c_str(), + get_availStatus_str(node_ptr->availStatus_subf).c_str()); + } + else + { + wlog ("%s active controller is %s-%s-%s and no enabled standby controller - degrade only", + node_ptr->hostname.c_str(), + get_adminState_str(node_ptr->adminState).c_str(), + get_operState_str(node_ptr->operState).c_str(), + get_availStatus_str(node_ptr->availStatus).c_str()); + } + wlog ("%s ... critical enable alarm raised", node_ptr->hostname.c_str()); + wlog ("%s ... 
recommend enabling a standby controller.", node_ptr->hostname.c_str()); + + allStateChange ( node_ptr, + node_ptr->adminState, + MTC_OPER_STATE__ENABLED, + MTC_AVAIL_STATUS__DEGRADED ); + alarm_enabled_failure ( node_ptr, true ); + mtcInvApi_update_task ( node_ptr, MTC_TASK_FAILED_NO_BACKUP); + adminActionChange ( node_ptr, MTC_ADMIN_ACTION__NONE ); + enableStageChange ( node_ptr, MTC_ENABLE__START ); } else { allStateChange ( node_ptr, MTC_ADMIN_STATE__UNLOCKED, MTC_OPER_STATE__DISABLED, MTC_AVAIL_STATUS__FAILED ); + enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_TIMER ); } /* Inform the VIM of the failure */ mtcVimApi_state_change ( node_ptr, VIM_HOST_FAILED, 3 ); - enableStageChange ( node_ptr, MTC_ENABLE__FAILURE_TIMER ); break ; } case MTC_ENABLE__FAILURE_TIMER: @@ -8267,17 +8289,12 @@ int nodeLinkClass::insv_test_handler ( struct nodeLinkClass::node * node_ptr ) } } } - /* - * Send out-of-service test command and wait for the - * next audit interval to see the result. - * - * node_ptr->goEnabled_subf == true is pass - * node_ptr->goEnabled_subf_failed == true is fail - * - **/ if (( node_ptr->operState_subf == MTC_OPER_STATE__DISABLED ) && + ( node_ptr->operState == MTC_OPER_STATE__ENABLED ) && + ( node_ptr->forcing_full_enable == false ) && ( node_ptr->ar_disabled == false )) { + /* Only force recovery if the node is not already in recovery */ if (( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE_SUBF ) && ( node_ptr->adminAction != MTC_ADMIN_ACTION__ENABLE )) {