Fix idrac-wsman set_power_state to wait on HW

set_power_state has returned to the caller immediately without
confirming the system has reached the requested state. This fixes that
by synchronously waiting until the target state has been read before
returning.

That bug can cause instance workload deployments to fail on Dell EMC
PowerEdge server models on which IPA ramdisk soft power off fails and
ironic employs its OOB fallback strategy. After an otherwise successful
deployment, the node is active, but is powered off. No error is reported
in last_error. If the subsequent instance workflow expects the system to
be powered on into the operating system, it fails.

Story: 2009204
Task: 43261
Change-Id: I3112a22149c07e5508f26c79f33d09aeb905c308
(cherry picked from commit 2a0fd1d13f)
This commit is contained in:
Aija Jauntēva 2021-09-08 11:20:20 -04:00 committed by Julia Kreger
parent 38f655f627
commit b25b6e6d5d
3 changed files with 44 additions and 25 deletions

View File

@ -24,6 +24,7 @@ from oslo_utils import importutils
from ironic.common import exception
from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as cond_utils
from ironic.drivers import base
from ironic.drivers.modules.drac import common as drac_common
from ironic.drivers.modules.drac import management as drac_management
@ -87,15 +88,17 @@ def _commit_boot_list_change(node):
node.save()
def _set_power_state(node, power_state):
def _set_power_state(task, power_state, timeout=None):
"""Turns the server power on/off or do a reboot.
:param node: an ironic node object.
:param task: a TaskManager instance containing the node to act on.
:param power_state: a power state from :mod:`ironic.common.states`.
:param timeout: Time to wait for the node to reach the requested state.
When requested state is reboot, not used as not waiting then.
:raises: InvalidParameterValue if required DRAC credentials are missing.
:raises: DracOperationError on an error from python-dracclient
"""
node = task.node
# NOTE(ifarkas): DRAC interface doesn't allow changing the boot device
# multiple times in a row without a reboot. This is
# because a change need to be committed via a
@ -131,6 +134,20 @@ def _set_power_state(node, power_state):
try:
client.set_power_state(target_power_state)
if calc_power_state == states.REBOOT:
# TODO(rloo): Support timeouts!
if timeout is not None:
LOG.warning("The 'idrac-wsman' Power Interface does not "
"support 'timeout' parameter when setting "
"power state to reboot. Ignoring "
"timeout=%(timeout)s",
{'timeout': timeout})
else:
# Skipped for reboot as can't match reboot with on/off.
# Reboot so far has been part of workflow that is not followed
# by another power state change that could break the flow.
cond_utils.node_wait_for_power_state(
task, calc_power_state, timeout)
break
except drac_exceptions.BaseClientException as exc:
if (power_state == states.REBOOT
@ -214,20 +231,13 @@ class DracWSManPower(base.PowerInterface):
:param task: a TaskManager instance containing the node to act on.
:param power_state: a power state from :mod:`ironic.common.states`.
:param timeout: timeout (in seconds). Unsupported by this interface.
:param timeout: Time to wait for the node to reach the requested state.
When requested state is reboot, not used as not waiting then.
:raises: InvalidParameterValue if required DRAC credentials are
missing.
:raises: DracOperationError on an error from python-dracclient.
"""
# TODO(rloo): Support timeouts!
if timeout is not None:
LOG.warning(
"The 'idrac' Power Interface's 'set_power_state' method "
"doesn't support the 'timeout' parameter. Ignoring "
"timeout=%(timeout)s",
{'timeout': timeout})
_set_power_state(task.node, power_state)
_set_power_state(task, power_state, timeout)
@METRICS.timer('DracPower.reboot')
@task_manager.require_exclusive_lock
@ -240,14 +250,7 @@ class DracWSManPower(base.PowerInterface):
missing.
:raises: DracOperationError on an error from python-dracclient.
"""
# TODO(rloo): Support timeouts!
if timeout is not None:
LOG.warning("The 'idrac' Power Interface's 'reboot' method "
"doesn't support the 'timeout' parameter. Ignoring "
"timeout=%(timeout)s",
{'timeout': timeout})
_set_power_state(task.node, states.REBOOT)
_set_power_state(task, states.REBOOT, timeout)
class DracPower(DracWSManPower):

View File

@ -71,7 +71,8 @@ class DracPowerTestCase(test_utils.BaseDracTest):
@mock.patch.object(drac_power.LOG, 'warning')
def test_set_power_state(self, mock_log, mock_get_drac_client):
mock_client = mock_get_drac_client.return_value
mock_client.get_power_state.side_effect = [drac_constants.POWER_ON,
drac_constants.POWER_OFF]
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.power.set_power_state(task, states.POWER_OFF)
@ -97,6 +98,8 @@ class DracPowerTestCase(test_utils.BaseDracTest):
@mock.patch.object(drac_power.LOG, 'warning')
def test_set_power_state_timeout(self, mock_log, mock_get_drac_client):
mock_client = mock_get_drac_client.return_value
mock_client.get_power_state.side_effect = [drac_constants.POWER_ON,
drac_constants.POWER_OFF]
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
@ -105,7 +108,7 @@ class DracPowerTestCase(test_utils.BaseDracTest):
drac_power_state = drac_power.REVERSE_POWER_STATES[states.POWER_OFF]
mock_client.set_power_state.assert_called_once_with(drac_power_state)
self.assertTrue(mock_log.called)
self.assertFalse(mock_log.called)
@mock.patch.object(drac_power.LOG, 'warning')
def test_reboot_while_powered_on(self, mock_log, mock_get_drac_client):
@ -136,7 +139,8 @@ class DracPowerTestCase(test_utils.BaseDracTest):
def test_reboot_while_powered_off(self, mock_get_drac_client):
mock_client = mock_get_drac_client.return_value
mock_client.get_power_state.return_value = drac_constants.POWER_OFF
mock_client.get_power_state.side_effect = [drac_constants.POWER_OFF,
drac_constants.POWER_ON]
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
@ -148,7 +152,9 @@ class DracPowerTestCase(test_utils.BaseDracTest):
@mock.patch('time.sleep')
def test_reboot_retries_success(self, mock_sleep, mock_get_drac_client):
mock_client = mock_get_drac_client.return_value
mock_client.get_power_state.return_value = drac_constants.POWER_OFF
mock_client.get_power_state.side_effect = [drac_constants.POWER_OFF,
drac_constants.POWER_OFF,
drac_constants.POWER_ON]
exc = drac_exceptions.DRACOperationFailed(
drac_messages=['The command failed to set RequestedState'])
mock_client.set_power_state.side_effect = [exc, None]

View File

@ -0,0 +1,10 @@
---
fixes:
- |
Fixes ``idrac-wsman`` power interface to wait for the hardware to reach the
target state before returning. For systems where soft power off at the end
of deployment to boot to instance failed and forced hard power off was
used, this left node successfully deployed in off state without any errors.
This broke other workflows expecting node to be on booted into
OS at the end of deployment. Additional information can be found in
`story 2009204 <https://storyboard.openstack.org/#!/story/2009204>`_.