Modify Mtce Reinstall FSM to first power-off BMC provisioned hosts

This update only applies to servers that support and are provisioned
for a Board Management Controller (BMC).

The BMC of some servers silently rejects the 'set next boot device'
command while the server is executing BIOS.

When the BMC is provisioned, the current reinstall algorithm starts by
detecting the power state of the target server. If the power is off,
it first powers the server on and then proceeds to 'set next boot
device' to pxe, followed by a reset. In this initial power-off case, the
timing of these operations is such that the server is in BIOS when the
'set next boot device' command is issued.

This update modifies the host reinstall algorithm to first power off
the server, then set the next boot device while the server is confirmed
to be powered off, and then power it on. This ensures the server
receives and handles the 'set next boot device' command properly.

This update also fixes a race condition between the bmc_handler and
power_handler by moving the final power state update in the power
handler to the power done phase.

Test Plan:

Verify all new reinstall failure path handling via fault insertion testing
Verify reinstall of powered off host
Verify reinstall of powered on host
Verify reinstall of Wildcat server with ipmi
Verify reinstall of Supermicro server with ipmi and redfish
Verify reinstall of Ironpass server with ipmi
Verify reinstall of WolfPass server with redfish and ipmi
Verify reinstall of Dell server with ipmi

Over 30 reinstalls were performed across all server types, with initial
power on and off using both ipmi and redfish (where supported).

Change-Id: Iefb17e9aa76c45f2ceadf83f23b1231ae82f000f
Closes-Bug: 1862065
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-02-10 10:15:56 -05:00
parent 932d7df90a
commit da7b2e94f1
5 changed files with 192 additions and 61 deletions

View File

@ -326,19 +326,19 @@ void print_mtc_message ( string hostname,
}
/* Graceful recovery stages strings and string get'er */
static std::string recoveryStages_str [MTC_RECOVERY__STAGES +1] ;
static std::string disableStages_str [MTC_DISABLE__STAGES +1] ;
static std::string enableStages_str [MTC_ENABLE__STAGES +1] ;
static std::string sensorStages_str [MTC_SENSOR__STAGES +1] ;
static std::string powerStages_str [MTC_POWER__STAGES +1] ;
static std::string powercycleStages_str [MTC_POWERCYCLE__STAGES +1] ;
static std::string recoveryStages_str [MTC_RECOVERY__STAGES +1] ;
static std::string disableStages_str [MTC_DISABLE__STAGES +1] ;
static std::string enableStages_str [MTC_ENABLE__STAGES +1] ;
static std::string sensorStages_str [MTC_SENSOR__STAGES +1] ;
static std::string powerStages_str [MTC_POWER__STAGES +1] ;
static std::string powercycleStages_str [MTC_POWERCYCLE__STAGES +1] ;
static std::string resetStages_str [MTC_RESET__STAGES +1] ;
static std::string reinstallStages_str [MTC_REINSTALL__STAGES +1] ;
static std::string oosTestStages_str [MTC_OOS_TEST__STAGES +1] ;
static std::string insvTestStages_str [MTC_INSV_TEST__STAGES +1] ;
static std::string configStages_str [MTC_CONFIG__STAGES +1] ;
static std::string addStages_str [MTC_ADD__STAGES +1] ;
static std::string delStages_str [MTC_DEL__STAGES +1] ;
static std::string oosTestStages_str [MTC_OOS_TEST__STAGES +1] ;
static std::string insvTestStages_str [MTC_INSV_TEST__STAGES +1] ;
static std::string configStages_str [MTC_CONFIG__STAGES +1] ;
static std::string addStages_str [MTC_ADD__STAGES +1] ;
static std::string delStages_str [MTC_DEL__STAGES +1] ;
static std::string subStages_str [MTC_SUBSTAGE__STAGES +1] ;
void mtc_stages_init ( void )
@ -461,12 +461,16 @@ void mtc_stages_init ( void )
reinstallStages_str [MTC_REINSTALL__START ] = "Reinstall-Start";
reinstallStages_str [MTC_REINSTALL__START_WAIT ] = "Reinstall-Start-Wait";
reinstallStages_str [MTC_REINSTALL__POWERQRY ] = "Reinstall-Power-State-Query";
reinstallStages_str [MTC_REINSTALL__POWERQRY_WAIT ] = "Reinstall-Power-State-Query-Wait";
reinstallStages_str [MTC_REINSTALL__RESTART ] = "Reinstall-ReStart";
reinstallStages_str [MTC_REINSTALL__RESTART_WAIT ] = "Reinstall-ReStart-Wait";
reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn";
reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait";
reinstallStages_str [MTC_REINSTALL__POWEROFF ] = "Reinstall-PowerOff";
reinstallStages_str [MTC_REINSTALL__POWEROFF_WAIT ] = "Reinstall-PowerOff-Wait";
reinstallStages_str [MTC_REINSTALL__NETBOOT ] = "Reinstall-Netboot";
reinstallStages_str [MTC_REINSTALL__NETBOOT_WAIT ] = "Reinstall-Netboot-Wait";
reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn";
reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait";
reinstallStages_str [MTC_REINSTALL__RESET ] = "Reinstall-Reset";
reinstallStages_str [MTC_REINSTALL__RESET_WAIT ] = "Reinstall-Reset-Wait";
reinstallStages_str [MTC_REINSTALL__WIPEDISK ] = "Reinstall-Wipedisk";

View File

@ -245,9 +245,11 @@ typedef enum
#define MTC_TASK_REINSTALL_FAIL_OL "Reinstall Failed ; timeout waiting for offline"
#define MTC_TASK_REINSTALL_FAIL_TO "Reinstall Failed ; timeout waiting for online"
#define MTC_TASK_REINSTALL_FAIL_BA "Reinstall Failed ; timeout waiting BMC access"
#define MTC_TASK_REINSTALL_FAIL_PO "Reinstall Failed ; could not power on host"
#define MTC_TASK_REINSTALL_FAIL_PO "Reinstall Failed ; could not power off host"
#define MTC_TASK_REINSTALL_FAIL_PU "Reinstall Failed ; could not power on host"
#define MTC_TASK_REINSTALL_FAIL_NB "Reinstall Failed ; netboot request"
#define MTC_TASK_REINSTALL_FAIL_PR "Reinstall Failed ; power reset request"
#define MTC_TASK_REINSTALL_FAIL_PQ "Reinstall Failed ; could not query power state"
#define MTC_TASK_REINSTALL_FAIL "Reinstall Failed"
#define MTC_TASK_REINSTALL_SUCCESS "Reinstall Succeeded"
@ -1046,7 +1048,7 @@ typedef enum
MTC_RESETPROG__REBOOT,
MTC_RESETPROG__WAIT,
MTC_RESETPROG__FAIL,
MTC_RESETPROG__STAGES
MTC_RESETPROG__STAGES
} mtc_resetProgStages_enum ;
/** Return the string representing the specified 'reset' stage */
@ -1059,10 +1061,14 @@ typedef enum
MTC_REINSTALL__START_WAIT,
MTC_REINSTALL__RESTART,
MTC_REINSTALL__RESTART_WAIT,
MTC_REINSTALL__POWERON,
MTC_REINSTALL__POWERON_WAIT,
MTC_REINSTALL__POWERQRY,
MTC_REINSTALL__POWERQRY_WAIT,
MTC_REINSTALL__POWEROFF,
MTC_REINSTALL__POWEROFF_WAIT,
MTC_REINSTALL__NETBOOT,
MTC_REINSTALL__NETBOOT_WAIT,
MTC_REINSTALL__POWERON,
MTC_REINSTALL__POWERON_WAIT,
MTC_REINSTALL__RESET,
MTC_REINSTALL__RESET_WAIT,
MTC_REINSTALL__WIPEDISK,

View File

@ -1,3 +1,3 @@
SRC_DIR="src"
TIS_PATCH_VER=157
TIS_PATCH_VER=158
BUILD_IS_SLOW=5

View File

@ -193,7 +193,8 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
{
/* handle the redfishtool root query as a special case because
* it is likely to fail and we don't want un-necessary error logs */
if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_QUERY ) &&
if ((( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_QUERY ) ||
( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_INFO )) &&
(( rc == FAIL_SYSTEM_CALL ) || ( rc == FAIL_NOT_ACTIVE )))
{
blog ("%s bmc redfish %s failed",
@ -201,14 +202,6 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
}
else if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_INFO ) &&
(( rc == FAIL_SYSTEM_CALL ) || ( rc == FAIL_NOT_ACTIVE )))
{
wlog ("%s bmc redfish %s failed",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
}
else
{
elog ("%s bmc redfish %s command failed (%s) (data:%s) (rc:%d:%d:%s)\n",
@ -220,6 +213,7 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
node_ptr->bmc_thread_info.status,
node_ptr->bmc_thread_info.status_string.c_str());
}
goto bmc_command_recv_cleanup;
}
else
{
@ -382,6 +376,8 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
}
}
bmc_command_recv_cleanup:
if ( rc != RETRY )
{
node_ptr->bmc_thread_ctrl.done = true ;

View File

@ -4036,14 +4036,15 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
* Description: This FSM handles node (re)install with and without
* a provisioned Board Management Controller (BMC).
*
* BMC provisioned case: using IPMI commands to BMC ...
* BMC provisioned case: board management commands to BMC ...
*
* - ensure host power is on
* - power off host
* - force network boot on next reset
* - issue node reset
* - power on host
*
* BMC not provisioned case: using mtce messaging to node ...
* BMC not provisioned case: mtce messaging to node ...
*
* - host must be online
* - send mtcClient wipedisk command
* fail reinstall if no ACK
* - send mtcClient reboot command
@ -4120,17 +4121,9 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_TIMEOUT_BMC_ACC );
reinstallStageChange ( node_ptr, MTC_REINSTALL__START_WAIT );
}
else if ( node_ptr->power_on == false )
{
/* need to power on node */
wlog ("%s Reinstall power-on required", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON );
}
else
{
/* power is on so issue net boot command */
ilog ("%s Reinstall power is on", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERQRY );
}
}
else
@ -4211,18 +4204,107 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
}
break ;
}
case MTC_REINSTALL__POWERON:
case MTC_REINSTALL__POWERQRY:
{
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT );
if ( node_ptr->bmc_thread_ctrl.done )
{
/* Query Host Power Status */
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
{
elog ("%s '%s' send failed\n",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
pingUtil_restart ( node_ptr->bm_ping_info );
}
else
{
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERQRY_WAIT );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
}
}
else
{
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
}
break ;
}
case MTC_REINSTALL__POWERON_WAIT:
case MTC_REINSTALL__POWERQRY_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
int rc = bmc_command_recv ( node_ptr ) ;
if ( rc == RETRY )
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ;
}
else if ( rc != PASS )
{
wlog ("%s '%s' failed receive (rc:%d)",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str(),
rc );
}
else if ( node_ptr->bmc_thread_info.data.empty() )
{
wlog ("%s '%s' request yielded no response data",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
}
else
{
int rc =
bmcUtil_is_power_on ( node_ptr->hostname,
node_ptr->bmc_protocol,
node_ptr->bmc_thread_info.data,
node_ptr->power_on);
if ( rc == PASS )
{
if ( node_ptr->power_on == true )
{
ilog ("%s Reinstall power-off required",
node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF );
}
else
{
ilog ("%s Reinstall power-off already",
node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
}
break ;
}
else
{
elog ("%s Reinstall power query failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
}
}
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PQ );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
else
{
; /* wait longer */
}
break ;
}
case MTC_REINSTALL__POWEROFF:
{
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT );
break ;
}
case MTC_REINSTALL__POWEROFF_WAIT:
{
/* The power handler manages timeout */
if ( node_ptr->powerStage == MTC_POWER__DONE )
{
if ( node_ptr->power_on == true )
if ( node_ptr->power_on == false )
{
if ( node_ptr->task != MTC_TASK_REINSTALL )
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
@ -4276,7 +4358,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
if ( rc == PASS )
{
ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET);
reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON);
}
else if ( rc == RETRY )
{
@ -4293,6 +4375,41 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
}
break ;
}
case MTC_REINSTALL__POWERON:
{
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT );
break ;
}
case MTC_REINSTALL__POWERON_WAIT:
{
/* The power handler manages timeout */
if ( node_ptr->powerStage == MTC_POWER__DONE )
{
if ( node_ptr->power_on == true )
{
if ( node_ptr->task != MTC_TASK_REINSTALL )
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
reinstallStageChange ( node_ptr , MTC_REINSTALL__OFFLINE_WAIT );
}
else
{
elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_PU);
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PU );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
}
else
{
/* run the power handler till the host's power is on or
* the power-on handler times out */
power_handler ( node_ptr );
}
break ;
}
case MTC_REINSTALL__RESET:
{
int rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
@ -4736,7 +4853,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
else
{
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
if ( rc )
{
wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc );
@ -4744,7 +4861,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
blog ("%s Power-Off requested\n", node_ptr->hostname.c_str());
ilog ("%s Power-Off requested\n", node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
@ -4756,14 +4873,13 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
rc = bmc_command_recv ( node_ptr );
if ( rc == RETRY )
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ;
}
if ( rc )
rc = bmc_command_recv ( node_ptr );
if ( rc == RETRY )
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ;
}
else if ( rc )
{
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
@ -4772,10 +4888,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
else
{
ilog ("%s is Powering Off\n", node_ptr->hostname.c_str() );
mtcInvApi_update_task ( node_ptr, "Powering Off" );
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL )
{
mtcInvApi_update_task ( node_ptr, "Powering Off" );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
powerStageChange ( node_ptr , MTC_POWEROFF__DONE );
node_ptr->power_on = false ;
}
}
break ;
@ -4822,6 +4940,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
powerStageChange ( node_ptr , MTC_POWER__DONE );
node_ptr->power_on = false ;
}
break ;
}
@ -5021,10 +5140,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
else
{
ilog ("%s is Powering On\n", node_ptr->hostname.c_str() );
mtcInvApi_update_task ( node_ptr, "Powering On" );
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL )
{
mtcInvApi_update_task ( node_ptr, "Powering On" );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
powerStageChange ( node_ptr , MTC_POWERON__DONE );
node_ptr->power_on = true ;
}
}
break ;
@ -5067,6 +5188,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
powerStageChange ( node_ptr , MTC_POWER__DONE );
node_ptr->power_on = true ;
}
break ;
}
@ -5083,7 +5205,10 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
ar_enable ( node_ptr );
mtcInvApi_force_task ( node_ptr, "" );
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL )
{
mtcInvApi_force_task ( node_ptr, "" );
}
break ;
}
}