From 0853bb3fcc48e8ed1e449c8c22e4ac819e836fd8 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Thu, 9 Jan 2025 19:51:01 -0500 Subject: [PATCH] Add configured add host delay to mtcAgent The active 'controller' domain name is used by the mtcAgent management interface to communicate with the mtcClient. The System Swact (Switch Activity) function dynamically migrates active controller services between controller-0 and controller-1. During this process, the mtcAgent, along with other services, is restarted on the newly active controller. When the mtcAgent starts, it reads the system inventory and adds the hosts to its internal control structure. During this "add" operation, the mtcAgent sends commands and expects responses from the local and remote mtcClients on individual nodes, using the controller domain name, which represents the management network's floating IP address. A new feature, the FQDN (Fully Qualified Domain Name) Resolution Manager, was introduced to handle domain name resolution in the StarlingX system. However, an issue was identified where the FQDN Resolution Manager does not yet have 'controller' domain name resolution support fully available (qualified) when the mtcAgent starts messaging with its mtcClients. As a result, messages between the mtcAgent and mtcClient can be silently lost. This issue can cause the "add host" operation to fail, which is potentially service affecting for that host. This update adds a small, manually configurable delay to the start of the mtcAgent host add operation. This gives the FQDN Resolution Manager time to complete setting up name resolution for the required 'controller' domain name. The default host_add_delay of 20 seconds was selected after seeing the occasional failure with a 10 second delay. This update can be removed in the future if the system makes changes to avoid starting the mtcAgent before all name resolution is ready. Test Plan: PASS: Verify issue in system, apply update, verify issue is resolved. 
PASS: Verify package/iso build along with AIO DX system install. PASS: Verify mtcAgent logging. Regression: PASS: Verify standby controller lock/unlock soak; 10+ loops. PASS: Verify Swact soak of 20+ swacts succeeds without reproducing the issue this update is designed to fix. PASS: Verify heart beating is enabled on all remote hosts on both controllers following an install and multiple Swacts. PASS: Verify sensor monitoring is enabled on all hosts that have their BMC provisioned over a Swact. PASS: Verify mtcClient, mtcAgent, hbsAgent and hbsClient logs for unexpected behavior. PASS: Verify default add hosts delay can be changed and a mtcAgent configuration reload or process restart uses the modified value. PASS: Verify no add host delay is imposed if the new configuration label is removed from the config file or set to 0. PASS: Verify host lock immediately following a swact and successful system host-list. Closes-Bug: 2093381 Change-Id: I694322eff0945c7c56bf21051b3d6cccacf829a2 Signed-off-by: Eric MacDonald --- mtce-common/src/common/logMacros.h | 1 + mtce/src/common/nodeClass.cpp | 14 ++++++++++++-- mtce/src/maintenance/mtcNodeCtrl.cpp | 5 +++++ mtce/src/maintenance/mtcNodeHdlrs.cpp | 7 ++++++- mtce/src/scripts/mtc.conf | 2 ++ 5 files changed, 26 insertions(+), 3 deletions(-) diff --git a/mtce-common/src/common/logMacros.h b/mtce-common/src/common/logMacros.h index 41de0aa7..a3227b5b 100644 --- a/mtce-common/src/common/logMacros.h +++ b/mtce-common/src/common/logMacros.h @@ -120,6 +120,7 @@ typedef struct int api_retries ; /**< api retries before failure */ int bmc_reset_delay ; /**< secs delay before bmc reset */ int http_retry_wait ; /**< secs to wait between http reg retries */ + int host_add_delay ; /**< secs to wait before adding hosts */ int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */ bool hostwd_reboot_on_err ; /**< should hostwd reboot on fault detected */ bool hostwd_kdump_on_stall ; /**< sysrq crash dump on quorum 
msg'ing stall */ diff --git a/mtce/src/common/nodeClass.cpp b/mtce/src/common/nodeClass.cpp index b46bfafe..3b983f10 100755 --- a/mtce/src/common/nodeClass.cpp +++ b/mtce/src/common/nodeClass.cpp @@ -639,7 +639,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname ) ptr->swactStage = MTC_SWACT__START ; ptr->offlineStage = MTC_OFFLINE__IDLE ; ptr->onlineStage = MTC_ONLINE__START ; - ptr->addStage = MTC_ADD__START ; + ptr->addStage = MTC_ADD__START_DELAY; ptr->delStage = MTC_DEL__START ; ptr->recoveryStage = MTC_RECOVERY__START ; ptr->insvTestStage = MTC_INSV_TEST__RUN ; /* Start wo initial delay */ @@ -3137,7 +3137,17 @@ int nodeLinkClass::add_host ( node_inv_type & inv ) if (( rc == PASS ) && ( node_ptr )) { - node_ptr->addStage = MTC_ADD__START ; + int delay = daemon_get_cfg_ptr()->host_add_delay ; + if ( delay > 0 ) + { + mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, delay ); + node_ptr->addStage = MTC_ADD__START_DELAY ; + } + else + { + node_ptr->addStage = MTC_ADD__START ; + } + ilog ("Host add delay is %d seconds", delay ); adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ADD ); } return (rc); diff --git a/mtce/src/maintenance/mtcNodeCtrl.cpp b/mtce/src/maintenance/mtcNodeCtrl.cpp index ba5dd369..900805d0 100644 --- a/mtce/src/maintenance/mtcNodeCtrl.cpp +++ b/mtce/src/maintenance/mtcNodeCtrl.cpp @@ -463,6 +463,11 @@ static int mtc_config_handler ( void * user, config_ptr->http_retry_wait = atoi(value); mtcInv.http_retry_wait = config_ptr->http_retry_wait ; } + else if (MATCH("agent", "host_add_delay")) + { + config_ptr->host_add_delay = atoi(value); + ilog ("Start Delay : %d secs", config_ptr->host_add_delay ); + } else if (MATCH("timeouts", "failsafe_shutdown_delay")) { config_ptr->failsafe_shutdown_delay = atoi(value); diff --git a/mtce/src/maintenance/mtcNodeHdlrs.cpp b/mtce/src/maintenance/mtcNodeHdlrs.cpp index 7084dbab..a00152d1 100755 --- a/mtce/src/maintenance/mtcNodeHdlrs.cpp +++ b/mtce/src/maintenance/mtcNodeHdlrs.cpp @@ -6215,8 
+6215,13 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr ) switch ( node_ptr->addStage ) { - case MTC_ADD__START: case MTC_ADD__START_DELAY: + { + if ( mtcTimer_expired (node_ptr->mtcTimer) ) + node_ptr->addStage = MTC_ADD__START ; + break ; + } + case MTC_ADD__START: { bool timer_set = false ; plog ("%s Host Add\n", node_ptr->hostname.c_str()); diff --git a/mtce/src/scripts/mtc.conf b/mtce/src/scripts/mtc.conf index 3bc57cba..84d6b3f6 100644 --- a/mtce/src/scripts/mtc.conf +++ b/mtce/src/scripts/mtc.conf @@ -80,6 +80,8 @@ bmc_reset_delay = 300 ; seconds to wait before issuing a bmc http_retry_wait = 10 ; secs to wait between http request retries +host_add_delay = 20 ; seconds to wait before adding hosts + [client] ; Client Configuration scheduling_priority = 45 ; realtime scheduling; range of 1 .. 99