Make Mtce Power-Off FSM verify power-off

If a host's BMC server accepts a power-off command without
error but does not actually power off the host, the power-off
FSM reports success even though the host is still powered on.

This update adds a verification component to the power-off
FSM. Once the power-off command is issued and succeeds at the
command level, the power-off FSM now queries the power status
and retries the power-off command until the server is verified
to be powered off or the retry maximum (10) is reached, at
which point the power-off operation is declared failed.

Test Plan:

PASS: Verify 200+ Mtce Power Off/On cycles (ipmi & redfish)
PASS: Verify 100+ Mtce Reinstalls with FIT (ipmi & redfish)

Change-Id: Iddd120d89d1152fc0b26915df123f586c38b909b
Closes-Bug: 1865087
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-11-19 12:46:52 -05:00
parent 9b82bf6f65
commit 1350502720
4 changed files with 147 additions and 12 deletions

View File

@ -425,6 +425,10 @@ void mtc_stages_init ( void )
powerStages_str [MTC_POWEROFF__START ] = "Power-Off-Start";
powerStages_str [MTC_POWEROFF__REQ_SEND ] = "Power-Off-Req-Send";
powerStages_str [MTC_POWEROFF__RESP_WAIT ] = "Power-Off-Resp-Wait";
powerStages_str [MTC_POWEROFF__OFFLINE_WAIT ] = "Power-Off-Offline-Wait";
powerStages_str [MTC_POWEROFF__POWERQRY ] = "Power-Off-Power-Query";
powerStages_str [MTC_POWEROFF__POWERQRY_WAIT ] = "Power-Off-Power-Query-Wait";
powerStages_str [MTC_POWEROFF__QUEUE ] = "Power-Off-Queue";
powerStages_str [MTC_POWEROFF__DONE ] = "Power-Off-Done";
powerStages_str [MTC_POWEROFF__FAIL ] = "Power-Off-Fail";
powerStages_str [MTC_POWEROFF__FAIL_WAIT ] = "Power-Off-Fail-Wait";

View File

@ -1102,9 +1102,11 @@ typedef enum
MTC_POWEROFF__FAIL,
MTC_POWEROFF__FAIL_WAIT,
MTC_POWEROFF__QUEUE,
MTC_POWEROFF__OFFLINE_WAIT,
MTC_POWEROFF__POWERQRY,
MTC_POWEROFF__POWERQRY_WAIT,
MTC_POWER__DONE, /* clear power action */
MTC_POWER__STAGES
MTC_POWER__STAGES
} mtc_powerStages_enum ;
/** Return the string representing the specified 'power' stage */

View File

@ -66,7 +66,7 @@
#define MTC_SWACT_POLL_TIMER (10)
#define MTC_TASK_UPDATE_DELAY (30)
#define MTC_BM_PING_TIMEOUT (30)
#define MTC_BM_POWEROFF_TIMEOUT (30)
#define MTC_BM_POWEROFF_TIMEOUT (60)
#define MTC_BM_POWERON_TIMEOUT (30)
#define MTC_RESET_PROG_TIMEOUT (20)
#define MTC_WORKQUEUE_TIMEOUT (60)

View File

@ -4311,7 +4311,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
{
wlog ("%s Reinstall power query receive failed ; retry %d of %d in %d seconds (rc:%d)",
node_ptr->hostname.c_str(),
node_ptr->power_action_retries,
MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries,
MTC_POWER_ACTION_RETRY_COUNT,
MTC_RETRY_WAIT, rc );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
@ -4917,30 +4917,159 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
}
else if ( rc )
{
node_ptr->power_action_retries--;
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
// Need to handle retries in this case since we don't
// go through the QUEUE stage.
if ( --node_ptr->power_action_retries > 0 )
{
char buffer[255] ;
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;
snprintf ( buffer, 255, MTC_TASK_POWEROFF_QUEUE, attempts, MTC_POWER_ACTION_RETRY_COUNT);
mtcInvApi_update_task ( node_ptr, buffer);
// The power off command can fail due to a connectivity
// issue or because the server is already powered off.
// The latter could occur if the previous power off
// command failed 'in response' but actually did end up
// powering off. In that case, simply retrying the power
// off while the power is already off would just fail
// again, since most redfish implementations fail rather
// than wave through a power off request while the power
// is already off. It is therefore better to query the
// power status again and allow that result to put this
// power off FSM into the correct state to continue/retry
// the quest for power off.
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
}
else
{
powerStageChange ( node_ptr , MTC_POWEROFF__FAIL );
}
}
else
{
ilog ("%s is Powering Off\n", node_ptr->hostname.c_str() );
ilog ("%s is Powering Off ; waiting for offline\n", node_ptr->hostname.c_str() );
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL )
{
mtcInvApi_update_task ( node_ptr, "Powering Off" );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
powerStageChange ( node_ptr , MTC_POWEROFF__DONE );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_BM_POWEROFF_TIMEOUT );
powerStageChange ( node_ptr , MTC_POWEROFF__OFFLINE_WAIT );
}
}
break ;
}
case MTC_POWEROFF__OFFLINE_WAIT:
{
if ( node_ptr->availStatus == MTC_AVAIL_STATUS__OFFLINE )
{
mtcTimer_reset ( node_ptr->mtcTimer );
plog ("%s is now offline\n", node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY );
}
else if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
elog ("%s Power-Off operation timeout - host did not go offline\n", node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
}
break ;
}
case MTC_POWEROFF__POWERQRY:
{
if ( node_ptr->bmc_thread_ctrl.done )
{
/* Query Host Power Status */
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
{
elog ("%s '%s' send failed\n",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
pingUtil_restart ( node_ptr->bm_ping_info );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
}
else
{
powerStageChange ( node_ptr , MTC_POWEROFF__POWERQRY_WAIT );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
}
else
{
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
}
break ;
}
case MTC_POWEROFF__POWERQRY_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
int rc = bmc_command_recv ( node_ptr ) ;
if ( rc == RETRY )
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ;
}
else if ( rc != PASS )
{
wlog ("%s '%s' failed receive (rc:%d)",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str(),
rc );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
}
else if ( node_ptr->bmc_thread_info.data.empty() )
{
wlog ("%s '%s' request yielded no response data",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
}
else
{
int rc =
bmcUtil_is_power_on ( node_ptr->hostname,
node_ptr->bmc_protocol,
node_ptr->bmc_thread_info.data,
node_ptr->power_on);
if ( rc == PASS )
{
if ( node_ptr->power_on == true )
{
ilog ("%s Power not Off ; retry power-off ",
node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
}
else
{
ilog ("%s Power-Off Verified",
node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__DONE );
mtcTimer_reset ( node_ptr->mtcTimer );
break ;
}
}
else
{
elog ("%s Power query failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
powerStageChange ( node_ptr , MTC_POWEROFF__QUEUE );
}
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
}
break ;
}
case MTC_POWEROFF__QUEUE:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
node_ptr->mtcTimer.ring = false ;
if ( node_ptr->power_action_retries > 0 )
if ( --node_ptr->power_action_retries > 0 )
{
char buffer[255] ;
int attempts = MTC_POWER_ACTION_RETRY_COUNT - node_ptr->power_action_retries ;