Merge "Modify Mtce Reinstall FSM to first power-off BMC provisioned hosts"

This commit is contained in:
Zuul 2020-02-14 17:11:53 +00:00 committed by Gerrit Code Review
commit b235d3c111
5 changed files with 192 additions and 61 deletions
mtce-common/src/common
mtce

@ -326,19 +326,19 @@ void print_mtc_message ( string hostname,
}
/* Graceful recovery stages strings and string getter */
static std::string recoveryStages_str [MTC_RECOVERY__STAGES +1] ;
static std::string disableStages_str [MTC_DISABLE__STAGES +1] ;
static std::string enableStages_str [MTC_ENABLE__STAGES +1] ;
static std::string sensorStages_str [MTC_SENSOR__STAGES +1] ;
static std::string powerStages_str [MTC_POWER__STAGES +1] ;
static std::string powercycleStages_str [MTC_POWERCYCLE__STAGES +1] ;
static std::string recoveryStages_str [MTC_RECOVERY__STAGES +1] ;
static std::string disableStages_str [MTC_DISABLE__STAGES +1] ;
static std::string enableStages_str [MTC_ENABLE__STAGES +1] ;
static std::string sensorStages_str [MTC_SENSOR__STAGES +1] ;
static std::string powerStages_str [MTC_POWER__STAGES +1] ;
static std::string powercycleStages_str [MTC_POWERCYCLE__STAGES +1] ;
static std::string resetStages_str [MTC_RESET__STAGES +1] ;
static std::string reinstallStages_str [MTC_REINSTALL__STAGES +1] ;
static std::string oosTestStages_str [MTC_OOS_TEST__STAGES +1] ;
static std::string insvTestStages_str [MTC_INSV_TEST__STAGES +1] ;
static std::string configStages_str [MTC_CONFIG__STAGES +1] ;
static std::string addStages_str [MTC_ADD__STAGES +1] ;
static std::string delStages_str [MTC_DEL__STAGES +1] ;
static std::string oosTestStages_str [MTC_OOS_TEST__STAGES +1] ;
static std::string insvTestStages_str [MTC_INSV_TEST__STAGES +1] ;
static std::string configStages_str [MTC_CONFIG__STAGES +1] ;
static std::string addStages_str [MTC_ADD__STAGES +1] ;
static std::string delStages_str [MTC_DEL__STAGES +1] ;
static std::string subStages_str [MTC_SUBSTAGE__STAGES +1] ;
void mtc_stages_init ( void )
@ -461,12 +461,16 @@ void mtc_stages_init ( void )
reinstallStages_str [MTC_REINSTALL__START ] = "Reinstall-Start";
reinstallStages_str [MTC_REINSTALL__START_WAIT ] = "Reinstall-Start-Wait";
reinstallStages_str [MTC_REINSTALL__POWERQRY ] = "Reinstall-Power-State-Query";
reinstallStages_str [MTC_REINSTALL__POWERQRY_WAIT ] = "Reinstall-Power-State-Query-Wait";
reinstallStages_str [MTC_REINSTALL__RESTART ] = "Reinstall-ReStart";
reinstallStages_str [MTC_REINSTALL__RESTART_WAIT ] = "Reinstall-ReStart-Wait";
reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn";
reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait";
reinstallStages_str [MTC_REINSTALL__POWEROFF ] = "Reinstall-PowerOff";
reinstallStages_str [MTC_REINSTALL__POWEROFF_WAIT ] = "Reinstall-PowerOff-Wait";
reinstallStages_str [MTC_REINSTALL__NETBOOT ] = "Reinstall-Netboot";
reinstallStages_str [MTC_REINSTALL__NETBOOT_WAIT ] = "Reinstall-Netboot-Wait";
reinstallStages_str [MTC_REINSTALL__POWERON ] = "Reinstall-PowerOn";
reinstallStages_str [MTC_REINSTALL__POWERON_WAIT ] = "Reinstall-PowerOn-Wait";
reinstallStages_str [MTC_REINSTALL__RESET ] = "Reinstall-Reset";
reinstallStages_str [MTC_REINSTALL__RESET_WAIT ] = "Reinstall-Reset-Wait";
reinstallStages_str [MTC_REINSTALL__WIPEDISK ] = "Reinstall-Wipedisk";

@ -245,9 +245,11 @@ typedef enum
#define MTC_TASK_REINSTALL_FAIL_OL "Reinstall Failed ; timeout waiting for offline"
#define MTC_TASK_REINSTALL_FAIL_TO "Reinstall Failed ; timeout waiting for online"
#define MTC_TASK_REINSTALL_FAIL_BA "Reinstall Failed ; timeout waiting BMC access"
#define MTC_TASK_REINSTALL_FAIL_PO "Reinstall Failed ; could not power on host"
#define MTC_TASK_REINSTALL_FAIL_PO "Reinstall Failed ; could not power off host"
#define MTC_TASK_REINSTALL_FAIL_PU "Reinstall Failed ; could not power on host"
#define MTC_TASK_REINSTALL_FAIL_NB "Reinstall Failed ; netboot request"
#define MTC_TASK_REINSTALL_FAIL_PR "Reinstall Failed ; power reset request"
#define MTC_TASK_REINSTALL_FAIL_PQ "Reinstall Failed ; could not query power state"
#define MTC_TASK_REINSTALL_FAIL "Reinstall Failed"
#define MTC_TASK_REINSTALL_SUCCESS "Reinstall Succeeded"
@ -1046,7 +1048,7 @@ typedef enum
MTC_RESETPROG__REBOOT,
MTC_RESETPROG__WAIT,
MTC_RESETPROG__FAIL,
MTC_RESETPROG__STAGES
MTC_RESETPROG__STAGES
} mtc_resetProgStages_enum ;
/** Return the string representing the specified 'reset' stage */
@ -1059,10 +1061,14 @@ typedef enum
MTC_REINSTALL__START_WAIT,
MTC_REINSTALL__RESTART,
MTC_REINSTALL__RESTART_WAIT,
MTC_REINSTALL__POWERON,
MTC_REINSTALL__POWERON_WAIT,
MTC_REINSTALL__POWERQRY,
MTC_REINSTALL__POWERQRY_WAIT,
MTC_REINSTALL__POWEROFF,
MTC_REINSTALL__POWEROFF_WAIT,
MTC_REINSTALL__NETBOOT,
MTC_REINSTALL__NETBOOT_WAIT,
MTC_REINSTALL__POWERON,
MTC_REINSTALL__POWERON_WAIT,
MTC_REINSTALL__RESET,
MTC_REINSTALL__RESET_WAIT,
MTC_REINSTALL__WIPEDISK,

@ -1,3 +1,3 @@
SRC_DIR="src"
TIS_PATCH_VER=157
TIS_PATCH_VER=158
BUILD_IS_SLOW=5

@ -193,7 +193,8 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
{
/* handle the redfishtool root query as a special case because
* it is likely to fail and we don't want un-necessary error logs */
if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_QUERY ) &&
if ((( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_QUERY ) ||
( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_INFO )) &&
(( rc == FAIL_SYSTEM_CALL ) || ( rc == FAIL_NOT_ACTIVE )))
{
blog ("%s bmc redfish %s failed",
@ -201,14 +202,6 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
}
else if (( node_ptr->bmc_thread_info.command == BMC_THREAD_CMD__BMC_INFO ) &&
(( rc == FAIL_SYSTEM_CALL ) || ( rc == FAIL_NOT_ACTIVE )))
{
wlog ("%s bmc redfish %s failed",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
}
else
{
elog ("%s bmc redfish %s command failed (%s) (data:%s) (rc:%d:%d:%s)\n",
@ -220,6 +213,7 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
node_ptr->bmc_thread_info.status,
node_ptr->bmc_thread_info.status_string.c_str());
}
goto bmc_command_recv_cleanup;
}
else
{
@ -382,6 +376,8 @@ int nodeLinkClass::bmc_command_recv ( struct nodeLinkClass::node * node_ptr )
}
}
bmc_command_recv_cleanup:
if ( rc != RETRY )
{
node_ptr->bmc_thread_ctrl.done = true ;

@ -4036,14 +4036,15 @@ int nodeLinkClass::reset_handler ( struct nodeLinkClass::node * node_ptr )
* Description: This FSM handles node (re)install with and without
* a provisioned Board Management Controller (BMC).
*
* BMC provisioned case: using IPMI commands to BMC ...
* BMC provisioned case: board management commands to BMC ...
*
* - ensure host power is on
* - power off host
* - force network boot on next reset
* - issue node reset
* - power on host
*
* BMC not provisioned case: using mtce messaging to node ...
* BMC not provisioned case: mtce messaging to node ...
*
* - host must be online
* - send mtcClient wipedisk command
* fail reinstall if no ACK
* - send mtcClient reboot command
@ -4120,17 +4121,9 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_REINSTALL_TIMEOUT_BMC_ACC );
reinstallStageChange ( node_ptr, MTC_REINSTALL__START_WAIT );
}
else if ( node_ptr->power_on == false )
{
/* need to power on node */
wlog ("%s Reinstall power-on required", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON );
}
else
{
/* power is on so issue net boot command */
ilog ("%s Reinstall power is on", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERQRY );
}
}
else
@ -4211,18 +4204,107 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
}
break ;
}
case MTC_REINSTALL__POWERON:
case MTC_REINSTALL__POWERQRY:
{
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT );
if ( node_ptr->bmc_thread_ctrl.done )
{
/* Query Host Power Status */
if ( bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_STATUS ) != PASS )
{
elog ("%s '%s' send failed\n",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
pingUtil_restart ( node_ptr->bm_ping_info );
}
else
{
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERQRY_WAIT );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
}
}
else
{
thread_kill ( node_ptr->bmc_thread_ctrl , node_ptr->bmc_thread_info ) ;
}
break ;
}
case MTC_REINSTALL__POWERON_WAIT:
case MTC_REINSTALL__POWERQRY_WAIT:
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
int rc = bmc_command_recv ( node_ptr ) ;
if ( rc == RETRY )
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ;
}
else if ( rc != PASS )
{
wlog ("%s '%s' failed receive (rc:%d)",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str(),
rc );
}
else if ( node_ptr->bmc_thread_info.data.empty() )
{
wlog ("%s '%s' request yielded no response data",
node_ptr->hostname.c_str(),
bmcUtil_getCmd_str(
node_ptr->bmc_thread_info.command).c_str());
}
else
{
int rc =
bmcUtil_is_power_on ( node_ptr->hostname,
node_ptr->bmc_protocol,
node_ptr->bmc_thread_info.data,
node_ptr->power_on);
if ( rc == PASS )
{
if ( node_ptr->power_on == true )
{
ilog ("%s Reinstall power-off required",
node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF );
}
else
{
ilog ("%s Reinstall power-off already",
node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr , MTC_REINSTALL__NETBOOT );
}
break ;
}
else
{
elog ("%s Reinstall power query failed (rc:%d)",
node_ptr->hostname.c_str(), rc );
}
}
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PQ );
reinstallStageChange ( node_ptr, MTC_REINSTALL__FAIL );
}
else
{
; /* wait longer */
}
break ;
}
case MTC_REINSTALL__POWEROFF:
{
powerStageChange ( node_ptr , MTC_POWEROFF__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWEROFF_WAIT );
break ;
}
case MTC_REINSTALL__POWEROFF_WAIT:
{
/* The power handler manages timeout */
if ( node_ptr->powerStage == MTC_POWER__DONE )
{
if ( node_ptr->power_on == true )
if ( node_ptr->power_on == false )
{
if ( node_ptr->task != MTC_TASK_REINSTALL )
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
@ -4276,7 +4358,7 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
if ( rc == PASS )
{
ilog ("%s Reinstall netboot request completed", node_ptr->hostname.c_str());
reinstallStageChange ( node_ptr, MTC_REINSTALL__RESET);
reinstallStageChange ( node_ptr, MTC_REINSTALL__POWERON);
}
else if ( rc == RETRY )
{
@ -4293,6 +4375,41 @@ int nodeLinkClass::reinstall_handler ( struct nodeLinkClass::node * node_ptr )
}
break ;
}
case MTC_REINSTALL__POWERON:
{
powerStageChange ( node_ptr , MTC_POWERON__REQ_SEND );
reinstallStageChange ( node_ptr , MTC_REINSTALL__POWERON_WAIT );
break ;
}
case MTC_REINSTALL__POWERON_WAIT:
{
/* The power handler manages timeout */
if ( node_ptr->powerStage == MTC_POWER__DONE )
{
if ( node_ptr->power_on == true )
{
if ( node_ptr->task != MTC_TASK_REINSTALL )
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL );
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_SECS_5 );
reinstallStageChange ( node_ptr , MTC_REINSTALL__OFFLINE_WAIT );
}
else
{
elog ("%s %s", node_ptr->hostname.c_str(), MTC_TASK_REINSTALL_FAIL_PU);
mtcInvApi_update_task ( node_ptr, MTC_TASK_REINSTALL_FAIL_PU );
reinstallStageChange ( node_ptr , MTC_REINSTALL__FAIL );
}
}
else
{
/* run the power handler till the host's power is on or
* the power-on handler times out */
power_handler ( node_ptr );
}
break ;
}
case MTC_REINSTALL__RESET:
{
int rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_RESET );
@ -4736,7 +4853,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
else
{
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
rc = bmc_command_send ( node_ptr, BMC_THREAD_CMD__POWER_OFF );
if ( rc )
{
wlog ("%s Power-Off request failed (%d)\n", node_ptr->hostname.c_str(), rc );
@ -4744,7 +4861,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
}
else
{
blog ("%s Power-Off requested\n", node_ptr->hostname.c_str());
ilog ("%s Power-Off requested\n", node_ptr->hostname.c_str());
powerStageChange ( node_ptr , MTC_POWEROFF__RESP_WAIT );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
@ -4756,14 +4873,13 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
{
if ( mtcTimer_expired ( node_ptr->mtcTimer ) )
{
rc = bmc_command_recv ( node_ptr );
if ( rc == RETRY )
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ;
}
if ( rc )
rc = bmc_command_recv ( node_ptr );
if ( rc == RETRY )
{
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_RETRY_WAIT );
break ;
}
else if ( rc )
{
elog ("%s Power-Off command failed\n", node_ptr->hostname.c_str());
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_POWER_ACTION_RETRY_DELAY );
@ -4772,10 +4888,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
else
{
ilog ("%s is Powering Off\n", node_ptr->hostname.c_str() );
mtcInvApi_update_task ( node_ptr, "Powering Off" );
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL )
{
mtcInvApi_update_task ( node_ptr, "Powering Off" );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
powerStageChange ( node_ptr , MTC_POWEROFF__DONE );
node_ptr->power_on = false ;
}
}
break ;
@ -4822,6 +4940,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__POWERED_OFF );
powerStageChange ( node_ptr , MTC_POWER__DONE );
node_ptr->power_on = false ;
}
break ;
}
@ -5021,10 +5140,12 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
else
{
ilog ("%s is Powering On\n", node_ptr->hostname.c_str() );
mtcInvApi_update_task ( node_ptr, "Powering On" );
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL )
{
mtcInvApi_update_task ( node_ptr, "Powering On" );
}
mtcTimer_start ( node_ptr->mtcTimer, mtcTimer_handler, MTC_TASK_UPDATE_DELAY );
powerStageChange ( node_ptr , MTC_POWERON__DONE );
node_ptr->power_on = true ;
}
}
break ;
@ -5067,6 +5188,7 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
availStatusChange ( node_ptr, MTC_AVAIL_STATUS__OFFLINE );
powerStageChange ( node_ptr , MTC_POWER__DONE );
node_ptr->power_on = true ;
}
break ;
}
@ -5083,7 +5205,10 @@ int nodeLinkClass::power_handler ( struct nodeLinkClass::node * node_ptr )
ar_enable ( node_ptr );
mtcInvApi_force_task ( node_ptr, "" );
if ( node_ptr->adminAction != MTC_ADMIN_ACTION__REINSTALL )
{
mtcInvApi_force_task ( node_ptr, "" );
}
break ;
}
}