From f8a6682575e926684e4ae68bb9f0ad7a1ae1138c Mon Sep 17 00:00:00 2001 From: Christopher Dearborn Date: Mon, 23 Mar 2020 12:37:38 -0400 Subject: [PATCH] Fix issue where server fails to reboot The iDRAC will return a "The command failed to set RequestedState" error if the server is powered off and a reboot is requested. In this situation, convert the requested reboot into a power on to avoid this error. To minimize the chance of a race condition, it is critical to do this check immediately before sending the power state change command. This keeps the window during which the server could change power states without us knowing about it as small as possible. Note that this error is returned in other scenarios as well. Conflicts: ironic/drivers/modules/drac/power.py Change-Id: I4c95959090423ff1929700f0fce596510447ac1c Story: 2007487 Task: 39206 (cherry picked from commit 95a3e1bdfd12b8b579c0dd6ff8e12d80d4214117) --- ironic/drivers/modules/drac/power.py | 78 ++++++++++++++----- .../unit/drivers/modules/drac/test_power.py | 55 +++++++++++++ .../unit/drivers/third_party_driver_mocks.py | 11 +++ ...c-fix-reboot-failure-c740e765ff41bcf0.yaml | 7 ++ 4 files changed, 133 insertions(+), 18 deletions(-) create mode 100644 releasenotes/notes/idrac-fix-reboot-failure-c740e765ff41bcf0.yaml diff --git a/ironic/drivers/modules/drac/power.py b/ironic/drivers/modules/drac/power.py index fe02fe5159..2be801e76d 100644 --- a/ironic/drivers/modules/drac/power.py +++ b/ironic/drivers/modules/drac/power.py @@ -15,6 +15,8 @@ DRAC power interface """ +import time + from ironic_lib import metrics_utils from oslo_log import log as logging from oslo_utils import importutils @@ -42,6 +44,10 @@ if drac_constants: REVERSE_POWER_STATES = dict((v, k) for (k, v) in POWER_STATES.items()) +POWER_STATE_TRIES = 15 +POWER_STATE_SLEEP = 2 +POWER_STATE_CHANGE_FAIL = 'The command failed to set RequestedState' + def _get_power_state(node): """Returns the current power state of the node. @@ -100,18 +106,60 @@ def _set_power_state(node, power_state): _commit_boot_list_change(node) client = drac_common.get_drac_client(node) - target_power_state = REVERSE_POWER_STATES[power_state] + tries = POWER_STATE_TRIES - try: - client.set_power_state(target_power_state) - except drac_exceptions.BaseClientException as exc: - LOG.error('DRAC driver failed to set power state for node ' - '%(node_uuid)s to %(power_state)s. ' - 'Reason: %(error)s.', - {'node_uuid': node.uuid, - 'power_state': power_state, - 'error': exc}) - raise exception.DracOperationError(error=exc) + # Cases have been seen where the iDRAC returns a SYS021 error even when + # the server is in the right power state and a valid power state change + # is attempted. Retry in this case. + while tries > 0: + # The iDRAC will return a SYS021 error if the server is powered off + # and a reboot is requested. In this situation, convert the requested + # reboot into a power on to avoid this error. To minimize the chance + # of a race condition, it is critical to do this check immediately + # before sending the power state change command. This keeps the + # window during which the server could change power states without us + # knowing about it as small as possible. + calc_power_state = power_state + if power_state == states.REBOOT: + current_power_state = _get_power_state(node) + # If the server is not on, then power it on instead of rebooting + if current_power_state != states.POWER_ON: + calc_power_state = states.POWER_ON + + target_power_state = REVERSE_POWER_STATES[calc_power_state] + + try: + client.set_power_state(target_power_state) + break + except drac_exceptions.BaseClientException as exc: + if (power_state == states.REBOOT + and POWER_STATE_CHANGE_FAIL in str(exc) + and tries > 0): + LOG.warning('DRAC driver failed to set power state for node ' + '%(node_uuid)s to %(calc_power_state)s. ' + 'Reason: %(error)s. Retrying...', + {'node_uuid': node.uuid, + 'calc_power_state': calc_power_state, + 'error': exc}) + tries -= 1 + time.sleep(POWER_STATE_SLEEP) + else: + LOG.error('DRAC driver failed to set power state for node ' + '%(node_uuid)s to %(calc_power_state)s. ' + 'Reason: %(error)s.', + {'node_uuid': node.uuid, + 'calc_power_state': calc_power_state, + 'error': exc}) + raise exception.DracOperationError(error=exc) + + if tries <= 0: + error_msg = (_('DRAC driver timed out while trying to set the power ' + 'state for node %(node_uuid)s to ' + '%(calc_power_state)s.') % + {'node_uuid': node.uuid, + 'calc_power_state': calc_power_state}) + LOG.error(error_msg) + raise exception.DracOperationError(error_msg) class DracPower(base.PowerInterface): @@ -187,10 +235,4 @@ class DracPower(base.PowerInterface): "timeout=%(timeout)s", {'timeout': timeout}) - current_power_state = _get_power_state(task.node) - if current_power_state == states.POWER_ON: - target_power_state = states.REBOOT - else: - target_power_state = states.POWER_ON - - _set_power_state(task.node, target_power_state) + _set_power_state(task.node, states.REBOOT) diff --git a/ironic/tests/unit/drivers/modules/drac/test_power.py b/ironic/tests/unit/drivers/modules/drac/test_power.py index d1edf36d81..14f6b2b268 100644 --- a/ironic/tests/unit/drivers/modules/drac/test_power.py +++ b/ironic/tests/unit/drivers/modules/drac/test_power.py @@ -147,3 +147,58 @@ class DracPowerTestCase(db_base.DbTestCase): drac_power_state = drac_power.REVERSE_POWER_STATES[states.POWER_ON] mock_client.set_power_state.assert_called_once_with(drac_power_state) + + @mock.patch('time.sleep') + def test_reboot_retries_success(self, mock_sleep, mock_get_drac_client): + mock_client = mock_get_drac_client.return_value + mock_client.get_power_state.return_value = drac_constants.POWER_OFF + exc = drac_exceptions.DRACOperationFailed( + drac_messages=['The command failed to set RequestedState']) + mock_client.set_power_state.side_effect = [exc, None] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.power.reboot(task) + + drac_power_state = drac_power.REVERSE_POWER_STATES[states.POWER_ON] + self.assertEqual(2, mock_client.set_power_state.call_count) + mock_client.set_power_state.assert_has_calls( + [mock.call(drac_power_state), + mock.call(drac_power_state)]) + + @mock.patch('time.sleep') + def test_reboot_retries_fail(self, mock_sleep, mock_get_drac_client): + mock_client = mock_get_drac_client.return_value + mock_client.get_power_state.return_value = drac_constants.POWER_OFF + exc = drac_exceptions.DRACOperationFailed( + drac_messages=['The command failed to set RequestedState']) + mock_client.set_power_state.side_effect = exc + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + self.assertRaises(exception.DracOperationError, + task.driver.power.reboot, task) + + self.assertEqual(drac_power.POWER_STATE_TRIES, + mock_client.set_power_state.call_count) + + @mock.patch('time.sleep') + def test_reboot_retries_power_change_success(self, mock_sleep, + mock_get_drac_client): + mock_client = mock_get_drac_client.return_value + mock_client.get_power_state.side_effect = [drac_constants.POWER_OFF, + drac_constants.POWER_ON] + exc = drac_exceptions.DRACOperationFailed( + drac_messages=['The command failed to set RequestedState']) + mock_client.set_power_state.side_effect = [exc, None] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.power.reboot(task) + + self.assertEqual(2, mock_client.set_power_state.call_count) + drac_power_state1 = drac_power.REVERSE_POWER_STATES[states.POWER_ON] + drac_power_state2 = drac_power.REVERSE_POWER_STATES[states.REBOOT] + mock_client.set_power_state.assert_has_calls( + [mock.call(drac_power_state1), + mock.call(drac_power_state2)]) diff --git a/ironic/tests/unit/drivers/third_party_driver_mocks.py b/ironic/tests/unit/drivers/third_party_driver_mocks.py index 055707fe3c..3733361523 100644 --- a/ironic/tests/unit/drivers/third_party_driver_mocks.py +++ b/ironic/tests/unit/drivers/third_party_driver_mocks.py @@ -112,6 +112,17 @@ if not dracclient: sys.modules['dracclient.exceptions'] = dracclient.exceptions dracclient.exceptions.BaseClientException = type('BaseClientException', (Exception,), {}) + + dracclient.exceptions.DRACRequestFailed = type( + 'DRACRequestFailed', (dracclient.exceptions.BaseClientException,), {}) + + class DRACOperationFailed(dracclient.exceptions.DRACRequestFailed): + def __init__(self, **kwargs): + super(DRACOperationFailed, self).__init__( + 'DRAC operation failed. Messages: %(drac_messages)s' % kwargs) + + dracclient.exceptions.DRACOperationFailed = DRACOperationFailed + # Now that the external library has been mocked, if anything had already # loaded any of the drivers, reload them. if 'ironic.drivers.modules.drac' in sys.modules: diff --git a/releasenotes/notes/idrac-fix-reboot-failure-c740e765ff41bcf0.yaml b/releasenotes/notes/idrac-fix-reboot-failure-c740e765ff41bcf0.yaml new file mode 100644 index 0000000000..d5b922c70e --- /dev/null +++ b/releasenotes/notes/idrac-fix-reboot-failure-c740e765ff41bcf0.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixed a bug where rebooting a node managed by the ``idrac`` hardware + type when using the WS-MAN power interface sometimes fails with a + ``The command failed to set RequestedState`` error. See bug `2007487 + `_ for details.