Add configurable host add delay to mtcAgent
The active 'controller' domain name is used by the mtcAgent management interface to communicate with the mtcClient. The System Swact (Switch Activity) function dynamically migrates active controller services between controller-0 and controller-1. During this process, the mtcAgent, along with other services, is restarted on the newly active controller. When the mtcAgent starts, it reads the system inventory and adds the hosts to its internal control structure. During this "add" operation, the mtcAgent sends commands and expects responses from the local and remote mtcClients on individual nodes, using the controller domain name, which represents the management network's floating IP address. A new feature, the FQDN (Fully Qualified Domain Name) Resolution Manager, was introduced to handle domain name resolution in the StarlingX system. However, an issue was identified where the FQDN Resolution Manager does not have the 'controller' domain name resolution support fully available (qualified) when the mtcAgent starts messaging with its mtcClients. As a result, messages between the mtcAgent and mtcClient can be silently lost. This issue can cause the "add host" operation to fail, which is potentially service affecting for that host. This update adds a small, manually configurable delay to the start of the mtcAgent host add operation. This gives the FQDN Resolution Manager time to complete setting up name resolution for the required 'controller' domain name. The default host_add_delay of 20 seconds was selected after seeing the occasional failure with a 10-second delay. This update can be removed in the future if the system makes changes to avoid starting the mtcAgent before all name resolution is ready. Test Plan: PASS: Verify issue in system, apply update, verify issue is resolved. PASS: Verify package/iso build along with AIO DX system install. PASS: Verify mtcAgent logging. Regression: PASS: Verify standby controller lock/unlock soak; 10+ loops. 
PASS: Verify Swact soak of 20+ swacts succeeds without reproducing the issue this update is designed to fix. PASS: Verify heartbeating is enabled on all remote hosts on both controllers following an install and multiple Swacts. PASS: Verify sensor monitoring is enabled on all hosts that have their BMC provisioned over a Swact. PASS: Verify mtcClient, mtcAgent, hbsAgent and hbsClient logs for unexpected behavior. PASS: Verify default host add delay can be changed and an mtcAgent configuration reload or process restart uses the modified value. PASS: Verify no host add delay is imposed if the new configuration label is removed from the config file or set to 0. PASS: Verify host lock immediately following a swact and successful system host-list. Closes-Bug: 2093381 Change-Id: I694322eff0945c7c56bf21051b3d6cccacf829a2 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
3423e2efff
commit
0853bb3fcc
mtce-common/src/common
mtce/src
@ -120,6 +120,7 @@ typedef struct
|
||||
int api_retries ; /**< api retries before failure */
|
||||
int bmc_reset_delay ; /**< secs delay before bmc reset */
|
||||
int http_retry_wait ; /**< secs to wait between http reg retries */
|
||||
int host_add_delay ; /**< secs to wait before adding hosts */
|
||||
int hostwd_failure_threshold ; /**< allowed # of missed pmon/hostwd messages */
|
||||
bool hostwd_reboot_on_err ; /**< should hostwd reboot on fault detected */
|
||||
bool hostwd_kdump_on_stall ; /**< sysrq crash dump on quorum msg'ing stall */
|
||||
|
@ -639,7 +639,7 @@ nodeLinkClass::node* nodeLinkClass::addNode( string hostname )
|
||||
ptr->swactStage = MTC_SWACT__START ;
|
||||
ptr->offlineStage = MTC_OFFLINE__IDLE ;
|
||||
ptr->onlineStage = MTC_ONLINE__START ;
|
||||
ptr->addStage = MTC_ADD__START ;
|
||||
ptr->addStage = MTC_ADD__START_DELAY;
|
||||
ptr->delStage = MTC_DEL__START ;
|
||||
ptr->recoveryStage = MTC_RECOVERY__START ;
|
||||
ptr->insvTestStage = MTC_INSV_TEST__RUN ; /* Start wo initial delay */
|
||||
@ -3137,7 +3137,17 @@ int nodeLinkClass::add_host ( node_inv_type & inv )
|
||||
|
||||
if (( rc == PASS ) && ( node_ptr ))
|
||||
{
|
||||
node_ptr->addStage = MTC_ADD__START ;
|
||||
int delay = daemon_get_cfg_ptr()->host_add_delay ;
|
||||
if ( delay > 0 )
|
||||
{
|
||||
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, delay );
|
||||
node_ptr->addStage = MTC_ADD__START_DELAY ;
|
||||
}
|
||||
else
|
||||
{
|
||||
node_ptr->addStage = MTC_ADD__START ;
|
||||
}
|
||||
ilog ("Host add delay is %d seconds", delay );
|
||||
adminActionChange ( node_ptr , MTC_ADMIN_ACTION__ADD );
|
||||
}
|
||||
return (rc);
|
||||
|
@ -463,6 +463,11 @@ static int mtc_config_handler ( void * user,
|
||||
config_ptr->http_retry_wait = atoi(value);
|
||||
mtcInv.http_retry_wait = config_ptr->http_retry_wait ;
|
||||
}
|
||||
else if (MATCH("agent", "host_add_delay"))
|
||||
{
|
||||
config_ptr->host_add_delay = atoi(value);
|
||||
ilog ("Start Delay : %d secs", config_ptr->host_add_delay );
|
||||
}
|
||||
else if (MATCH("timeouts", "failsafe_shutdown_delay"))
|
||||
{
|
||||
config_ptr->failsafe_shutdown_delay = atoi(value);
|
||||
|
@ -6215,8 +6215,13 @@ int nodeLinkClass::add_handler ( struct nodeLinkClass::node * node_ptr )
|
||||
|
||||
switch ( node_ptr->addStage )
|
||||
{
|
||||
case MTC_ADD__START:
|
||||
case MTC_ADD__START_DELAY:
|
||||
{
|
||||
if ( mtcTimer_expired (node_ptr->mtcTimer) )
|
||||
node_ptr->addStage = MTC_ADD__START ;
|
||||
break ;
|
||||
}
|
||||
case MTC_ADD__START:
|
||||
{
|
||||
bool timer_set = false ;
|
||||
plog ("%s Host Add\n", node_ptr->hostname.c_str());
|
||||
|
@ -80,6 +80,8 @@ bmc_reset_delay = 300 ; seconds to wait before issuing a bmc
|
||||
|
||||
http_retry_wait = 10 ; secs to wait between http request retries
|
||||
|
||||
host_add_delay = 20 ; seconds to wait before adding hosts
|
||||
|
||||
[client] ; Client Configuration
|
||||
|
||||
scheduling_priority = 45 ; realtime scheduling; range of 1 .. 99
|
||||
|
Loading…
x
Reference in New Issue
Block a user