Improve mtcAgent interrupted thread cleanup

A BMC command send will be rejected if its thread
is not in the IDLE state going into the call.

This issue is seen to occur over a reprovisioning action
while the bmc access alarmable condition exists.

Maintenance will do retries. So the only visible side affect
of this issue is a failure to provision to 'redfish' over a
provisioning switch to 'dynamic' (learn mode). Instead
ipmi is selected.

The non-return to idle can occur when the bmc handler FSM
is interrupted by a reprovisioning request while a bmc
command is in flight.

This update enhances the thread management module by
introducing a thread consumption utility that is called
by the bmc command send utility. If the send finds that
its thread is not in the IDLE state it will either kill
the thread if it is running or free a completed but-not-
consumed thread result.

Note: Maintenance only supports the execution of
a single thread per host per process at one time.

Test Plan:

PASS: Verify BMC provisioning change from ipmi to dynamic
      while the ipmi provisioning was failing prior to
      re-provisioning. Verify the previous error is cleaned
      up and the reprovisioning request succeeds as expected.

PASS: Verify thread 'execution timeout kill' cleanup handling.
PASS: Verify thread 'complete but not consumed' cleanup handling.
PASS: Verify logging during regression soaks

Regression:

PASS: Verify bmc protocol reprovisioning script soak
PASS: Verify sensor monitoring following BMC reprovisioning
PASS: Verify product verification mtce regression test suite

Change-Id: Ie5e9e89ed2f8db6888c0fc7de03d494c75517178
Closes-Bug: 1864906
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2021-03-12 17:10:00 -05:00
parent 4f7d82308f
commit 4f5bf78f55
4 changed files with 76 additions and 1 deletions

View File

@ -309,6 +309,48 @@ bool thread_idle ( thread_ctrl_type & ctrl )
return (false);
}
/****************************************************************************
*
* Name : thread_done_consume
*
* Description: Return to IDLE stage.
*
****************************************************************************/
int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info )
{
if ( ctrl.stage == THREAD_STAGE__IDLE )
{
return PASS ;
}
else if ( ctrl.done == false )
{
if ( info.runcount > ctrl.runcount )
{
ilog("%s thread cleanup ; cmd:%d ; cnt:%d:%d",
info.hostname.c_str(),
info.command,
ctrl.runcount,
info.runcount);
ctrl.done = true ;
ctrl.stage = THREAD_STAGE__DONE ;
thread_handler (ctrl, info);
return PASS ;
}
else
{
thread_kill(ctrl, info);
return RETRY ;
}
}
else
{
ctrl.stage = THREAD_STAGE__DONE ;
thread_handler( ctrl, info );
return PASS ;
}
}
/****************************************************************************
*
* Name : thread_launch
@ -381,7 +423,7 @@ void thread_kill ( thread_ctrl_type & ctrl, thread_info_type & info )
( ctrl.stage != THREAD_STAGE__WAIT ) &&
( ctrl.stage != THREAD_STAGE__IDLE ))
{
blog ("%s kill request\n", ctrl.hostname.c_str() );
wlog ("%s kill request\n", ctrl.hostname.c_str() );
_stage_change ( ctrl, THREAD_STAGE__KILL );
}
}

View File

@ -284,6 +284,7 @@ bool thread_done ( thread_ctrl_type & ctrl );
bool thread_idle ( thread_ctrl_type & ctrl );
void thread_kill ( thread_ctrl_type & ctrl , thread_info_type & info );
string thread_stage ( thread_ctrl_type & ctrl );
int thread_done_consume ( thread_ctrl_type & ctrl, thread_info_type & info );
/* Cooperative service of cancel and exit requests from parent */
void pthread_signal_handler ( thread_info_type * info_ptr );

View File

@ -39,6 +39,26 @@ int nodeLinkClass::bmc_command_send ( struct nodeLinkClass::node * node_ptr,
{
int rc = PASS ;
/* handle 'kill of in-progress' thread or 'done but not consumed' thread */
if ( ! thread_idle ( node_ptr->bmc_thread_ctrl ))
{
if ( ! thread_done ( node_ptr->bmc_thread_ctrl ))
{
thread_kill ( node_ptr->bmc_thread_ctrl,
node_ptr->bmc_thread_info );
return (RETRY);
}
else
{
mtcTimer_reset ( node_ptr->bmc_thread_ctrl.timer );
if ( thread_done_consume ( node_ptr->bmc_thread_ctrl,
node_ptr->bmc_thread_info ) != PASS )
{
return (RETRY);
}
}
}
node_ptr->bmc_thread_info.command = command ;
/* Update / Setup the BMC access credentials */
@ -437,6 +457,13 @@ bmc_command_recv_cleanup:
if ( rc != RETRY )
{
ilog ("%s %s recv '%s' command (%s) (rc:%d)",
node_ptr->hostname.c_str(),
node_ptr->bmc_thread_ctrl.name.c_str(),
bmcUtil_getCmd_str(node_ptr->bmc_thread_info.command).c_str(),
bmcUtil_getProtocol_str(node_ptr->bmc_protocol).c_str(),
rc);
node_ptr->bmc_thread_ctrl.done = true ;
node_ptr->bmc_thread_ctrl.retries = 0 ;
node_ptr->bmc_thread_ctrl.id = 0 ;

View File

@ -63,6 +63,11 @@ int nodeLinkClass::fsm ( struct nodeLinkClass::node * node_ptr )
/* Monitor and Manage active threads */
thread_handler ( node_ptr->bmc_thread_ctrl, node_ptr->bmc_thread_info );
if ( node_ptr->bmc_thread_ctrl.stage == THREAD_STAGE__KILL )
{
/* do nothing while thread is being killed */
return RETRY ;
}
/* manage the host connected state and board management alarms */
nodeLinkClass::bmc_handler ( node_ptr );