diff --git a/ironic/drivers/modules/drac/power.py b/ironic/drivers/modules/drac/power.py index fe02fe5159..2be801e76d 100644 --- a/ironic/drivers/modules/drac/power.py +++ b/ironic/drivers/modules/drac/power.py @@ -15,6 +15,8 @@ DRAC power interface """ +import time + from ironic_lib import metrics_utils from oslo_log import log as logging from oslo_utils import importutils @@ -42,6 +44,10 @@ if drac_constants: REVERSE_POWER_STATES = dict((v, k) for (k, v) in POWER_STATES.items()) +POWER_STATE_TRIES = 15 +POWER_STATE_SLEEP = 2 +POWER_STATE_CHANGE_FAIL = 'The command failed to set RequestedState' + def _get_power_state(node): """Returns the current power state of the node. @@ -100,18 +106,60 @@ def _set_power_state(node, power_state): _commit_boot_list_change(node) client = drac_common.get_drac_client(node) - target_power_state = REVERSE_POWER_STATES[power_state] + tries = POWER_STATE_TRIES - try: - client.set_power_state(target_power_state) - except drac_exceptions.BaseClientException as exc: - LOG.error('DRAC driver failed to set power state for node ' - '%(node_uuid)s to %(power_state)s. ' - 'Reason: %(error)s.', - {'node_uuid': node.uuid, - 'power_state': power_state, - 'error': exc}) - raise exception.DracOperationError(error=exc) + # Cases have been seen where the iDRAC returns a SYS021 error even when + # the server is in the right power state and a valid power state change + # is attempted. Retry in this case. + while tries > 0: + # The iDRAC will return a SYS021 error if the server is powered off + # and a reboot is requested. In this situation, convert the requested + # reboot into a power on to avoid this error. To minimize the chance + # of a race condition, it is critical to do this check immediately + # before sending the power state change command. This keeps the + # window during which the server could change power states without us + # knowing about it as small as possible. + calc_power_state = power_state + if power_state == states.REBOOT: + current_power_state = _get_power_state(node) + # If the server is not on, then power it on instead of rebooting + if current_power_state != states.POWER_ON: + calc_power_state = states.POWER_ON + + target_power_state = REVERSE_POWER_STATES[calc_power_state] + + try: + client.set_power_state(target_power_state) + break + except drac_exceptions.BaseClientException as exc: + if (power_state == states.REBOOT + and POWER_STATE_CHANGE_FAIL in str(exc) + and tries > 0): + LOG.warning('DRAC driver failed to set power state for node ' + '%(node_uuid)s to %(calc_power_state)s. ' + 'Reason: %(error)s. Retrying...', + {'node_uuid': node.uuid, + 'calc_power_state': calc_power_state, + 'error': exc}) + tries -= 1 + time.sleep(POWER_STATE_SLEEP) + else: + LOG.error('DRAC driver failed to set power state for node ' + '%(node_uuid)s to %(calc_power_state)s. ' + 'Reason: %(error)s.', + {'node_uuid': node.uuid, + 'calc_power_state': calc_power_state, + 'error': exc}) + raise exception.DracOperationError(error=exc) + + if tries <= 0: + error_msg = (_('DRAC driver timed out while trying to set the power ' + 'state for node %(node_uuid)s to ' + '%(calc_power_state)s.') % + {'node_uuid': node.uuid, + 'calc_power_state': calc_power_state}) + LOG.error(error_msg) + raise exception.DracOperationError(error_msg) class DracPower(base.PowerInterface): @@ -187,10 +235,4 @@ class DracPower(base.PowerInterface): "timeout=%(timeout)s", {'timeout': timeout}) - current_power_state = _get_power_state(task.node) - if current_power_state == states.POWER_ON: - target_power_state = states.REBOOT - else: - target_power_state = states.POWER_ON - - _set_power_state(task.node, target_power_state) + _set_power_state(task.node, states.REBOOT) diff --git a/ironic/tests/unit/drivers/modules/drac/test_power.py b/ironic/tests/unit/drivers/modules/drac/test_power.py index 27cc759818..4c442ba6f8 100644 --- a/ironic/tests/unit/drivers/modules/drac/test_power.py +++ b/ironic/tests/unit/drivers/modules/drac/test_power.py @@ -144,3 +144,58 @@ class DracPowerTestCase(test_utils.BaseDracTest): drac_power_state = drac_power.REVERSE_POWER_STATES[states.POWER_ON] mock_client.set_power_state.assert_called_once_with(drac_power_state) + + @mock.patch('time.sleep') + def test_reboot_retries_success(self, mock_sleep, mock_get_drac_client): + mock_client = mock_get_drac_client.return_value + mock_client.get_power_state.return_value = drac_constants.POWER_OFF + exc = drac_exceptions.DRACOperationFailed( + drac_messages=['The command failed to set RequestedState']) + mock_client.set_power_state.side_effect = [exc, None] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.power.reboot(task) + + drac_power_state = drac_power.REVERSE_POWER_STATES[states.POWER_ON] + self.assertEqual(2, mock_client.set_power_state.call_count) + mock_client.set_power_state.assert_has_calls( + [mock.call(drac_power_state), + mock.call(drac_power_state)]) + + @mock.patch('time.sleep') + def test_reboot_retries_fail(self, mock_sleep, mock_get_drac_client): + mock_client = mock_get_drac_client.return_value + mock_client.get_power_state.return_value = drac_constants.POWER_OFF + exc = drac_exceptions.DRACOperationFailed( + drac_messages=['The command failed to set RequestedState']) + mock_client.set_power_state.side_effect = exc + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + self.assertRaises(exception.DracOperationError, + task.driver.power.reboot, task) + + self.assertEqual(drac_power.POWER_STATE_TRIES, + mock_client.set_power_state.call_count) + + @mock.patch('time.sleep') + def test_reboot_retries_power_change_success(self, mock_sleep, + mock_get_drac_client): + mock_client = mock_get_drac_client.return_value + mock_client.get_power_state.side_effect = [drac_constants.POWER_OFF, + drac_constants.POWER_ON] + exc = drac_exceptions.DRACOperationFailed( + drac_messages=['The command failed to set RequestedState']) + mock_client.set_power_state.side_effect = [exc, None] + + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.power.reboot(task) + + self.assertEqual(2, mock_client.set_power_state.call_count) + drac_power_state1 = drac_power.REVERSE_POWER_STATES[states.POWER_ON] + drac_power_state2 = drac_power.REVERSE_POWER_STATES[states.REBOOT] + mock_client.set_power_state.assert_has_calls( + [mock.call(drac_power_state1), + mock.call(drac_power_state2)]) diff --git a/ironic/tests/unit/drivers/third_party_driver_mocks.py b/ironic/tests/unit/drivers/third_party_driver_mocks.py index 52fe300594..f031f46254 100644 --- a/ironic/tests/unit/drivers/third_party_driver_mocks.py +++ b/ironic/tests/unit/drivers/third_party_driver_mocks.py @@ -112,6 +112,17 @@ if not dracclient: sys.modules['dracclient.exceptions'] = dracclient.exceptions dracclient.exceptions.BaseClientException = type('BaseClientException', (Exception,), {}) + + dracclient.exceptions.DRACRequestFailed = type( + 'DRACRequestFailed', (dracclient.exceptions.BaseClientException,), {}) + + class DRACOperationFailed(dracclient.exceptions.DRACRequestFailed): + def __init__(self, **kwargs): + super(DRACOperationFailed, self).__init__( + 'DRAC operation failed. Messages: %(drac_messages)s' % kwargs) + + dracclient.exceptions.DRACOperationFailed = DRACOperationFailed + # Now that the external library has been mocked, if anything had already # loaded any of the drivers, reload them. if 'ironic.drivers.modules.drac' in sys.modules: diff --git a/releasenotes/notes/idrac-fix-reboot-failure-c740e765ff41bcf0.yaml b/releasenotes/notes/idrac-fix-reboot-failure-c740e765ff41bcf0.yaml new file mode 100644 index 0000000000..d5b922c70e --- /dev/null +++ b/releasenotes/notes/idrac-fix-reboot-failure-c740e765ff41bcf0.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixed a bug where rebooting a node managed by the ``idrac`` hardware + type when using the WS-MAN power interface sometimes fails with a + ``The command failed to set RequestedState`` error. See bug `2007487 + `_ for details.