From 18a8e2f6e9d80a827acba39a6ace366c13d40e31 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Mon, 4 May 2020 13:34:05 +0200 Subject: [PATCH] redfish: handle hardware that is unable to set persistent boot Some hardware has dropped support for continuous boot, see this thread: http://lists.openstack.org/pipermail/openstack-discuss/2020-April/014543.html Work around by falling back to one-time boot device, restoring it on every reboot or power on. Story: #2007527 Task: #39320 Change-Id: I762056dea4f7b8ccdb8f95b2f21e26b45763905d --- ironic/drivers/modules/redfish/management.py | 74 ++++++++++++++++- ironic/drivers/modules/redfish/power.py | 11 +++ .../modules/redfish/test_management.py | 80 +++++++++++++++++++ .../drivers/modules/redfish/test_power.py | 36 ++++++--- .../redfish-sadness-6e2a37b3f45ef1aa.yaml | 18 +++++ 5 files changed, 206 insertions(+), 13 deletions(-) create mode 100644 releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml diff --git a/ironic/drivers/modules/redfish/management.py b/ironic/drivers/modules/redfish/management.py index 4026da48cb..dff790ce56 100644 --- a/ironic/drivers/modules/redfish/management.py +++ b/ironic/drivers/modules/redfish/management.py @@ -24,6 +24,7 @@ from ironic.common import components from ironic.common import exception from ironic.common.i18n import _ from ironic.common import indicator_states +from ironic.common import utils from ironic.conductor import task_manager from ironic.drivers import base from ironic.drivers.modules.redfish import utils as redfish_utils @@ -68,6 +69,42 @@ if sushy: v: k for k, v in INDICATOR_MAP.items()} +def _set_boot_device(task, system, device, enabled=None): + """An internal routine to set the boot device. + + :param task: a task from TaskManager. + :param system: a Redfish System object. + :param device: the Redfish boot device. + :param enabled: Redfish boot device persistence value or None. + :raises: SushyError on an error from the Sushy library + """ + try: + system.set_system_boot_options(device, enabled=enabled) + except sushy.exceptions.SushyError as e: + if enabled == sushy.BOOT_SOURCE_ENABLED_CONTINUOUS: + # NOTE(dtantsur): continuous boot device settings have been + # removed from Redfish, and some vendors stopped supporting + # it before an alternative was provided. As a work around, + # use one-time boot and restore the boot device on every + # reboot via RedfishPower. + LOG.debug('Error %(error)s when trying to set a ' + 'persistent boot device on node %(node)s, ' + 'falling back to one-time boot settings', + {'error': e, 'node': task.node.uuid}) + system.set_system_boot_options( + device, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) + LOG.warning('Could not set persistent boot device to ' + '%(dev)s for node %(node)s, using one-time ' + 'boot device instead', + {'dev': device, 'node': task.node.uuid}) + utils.set_node_nested_field( + task.node, 'driver_internal_info', + 'redfish_boot_device', device) + task.node.save() + else: + raise + + class RedfishManagement(base.ManagementInterface): def __init__(self): @@ -107,6 +144,33 @@ class RedfishManagement(base.ManagementInterface): """ return list(BOOT_DEVICE_MAP_REV) + @task_manager.require_exclusive_lock + def restore_boot_device(self, task, system): + """Restore boot device if needed. + + Checks the redfish_boot_device internal flag and sets the one-time + boot device accordingly. A warning is issued if it fails. + + This method is supposed to be called from the Redfish power interface + and should be considered private to the Redfish hardware type. + + :param task: a task from TaskManager. + :param system: a Redfish System object. + """ + device = task.node.driver_internal_info.get('redfish_boot_device') + if not device: + return + + LOG.debug('Restoring boot device %(dev)s on node %(node)s', + {'dev': device, 'node': task.node.uuid}) + try: + _set_boot_device(task, system, device) + except sushy.exceptions.SushyError as e: + LOG.warning('Unable to recover boot device %(dev)s for node ' + '%(node)s, relying on the pre-configured boot order. ' + 'Error: %(error)s', + {'dev': device, 'node': task.node.uuid, 'error': e}) + @task_manager.require_exclusive_lock def set_boot_device(self, task, device, persistent=False): """Set the boot device for a node. @@ -124,6 +188,10 @@ class RedfishManagement(base.ManagementInterface): :raises: RedfishConnectionError when it fails to connect to Redfish :raises: RedfishError on an error from the Sushy library """ + utils.pop_node_nested_field( + task.node, 'driver_internal_info', 'redfish_boot_device') + task.node.save() + system = redfish_utils.get_system(task.node) desired_persistence = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent] @@ -132,11 +200,9 @@ class RedfishManagement(base.ManagementInterface): # NOTE(etingof): this can be racy, esp if BMC is not RESTful enabled = (desired_persistence if desired_persistence != current_persistence else None) - try: - system.set_system_boot_options( - BOOT_DEVICE_MAP_REV[device], enabled=enabled) - + _set_boot_device(task, system, BOOT_DEVICE_MAP_REV[device], + enabled=enabled) except sushy.exceptions.SushyError as e: error_msg = (_('Redfish set boot device failed for node ' '%(node)s. Error: %(error)s') % diff --git a/ironic/drivers/modules/redfish/power.py b/ironic/drivers/modules/redfish/power.py index cb944a4273..0a6f3f338a 100644 --- a/ironic/drivers/modules/redfish/power.py +++ b/ironic/drivers/modules/redfish/power.py @@ -22,6 +22,7 @@ from ironic.common import states from ironic.conductor import task_manager from ironic.conductor import utils as cond_utils from ironic.drivers import base +from ironic.drivers.modules.redfish import management as redfish_mgmt from ironic.drivers.modules.redfish import utils as redfish_utils LOG = log.getLogger(__name__) @@ -123,6 +124,12 @@ class RedfishPower(base.PowerInterface): :raises: RedfishError on an error from the Sushy library """ system = redfish_utils.get_system(task.node) + + if (power_state in (states.POWER_ON, states.SOFT_REBOOT, states.REBOOT) + and isinstance(task.driver.management, + redfish_mgmt.RedfishManagement)): + task.driver.management.restore_boot_device(task, system) + try: _set_power_state(task, system, power_state, timeout=timeout) except sushy.exceptions.SushyError as e: @@ -151,6 +158,10 @@ class RedfishPower(base.PowerInterface): next_state = states.POWER_OFF _set_power_state(task, system, next_state, timeout=timeout) + if isinstance(task.driver.management, + redfish_mgmt.RedfishManagement): + task.driver.management.restore_boot_device(task, system) + next_state = states.POWER_ON _set_power_state(task, system, next_state, timeout=timeout) except sushy.exceptions.SushyError as e: diff --git a/ironic/tests/unit/drivers/modules/redfish/test_management.py b/ironic/tests/unit/drivers/modules/redfish/test_management.py index 2336a64e35..7ee818cef6 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_management.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_management.py @@ -100,6 +100,8 @@ class RedfishManagementTestCase(db_base.DbTestCase): fake_system.set_system_boot_options.assert_called_once_with( expected, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) mock_get_system.assert_called_once_with(task.node) + self.assertNotIn('redfish_boot_device', + task.node.driver_internal_info) # Reset mocks fake_system.set_system_boot_options.reset_mock() @@ -123,6 +125,8 @@ class RedfishManagementTestCase(db_base.DbTestCase): fake_system.set_system_boot_options.assert_called_once_with( sushy.BOOT_SOURCE_TARGET_PXE, enabled=expected) mock_get_system.assert_called_once_with(task.node) + self.assertNotIn('redfish_boot_device', + task.node.driver_internal_info) # Reset mocks fake_system.set_system_boot_options.reset_mock() @@ -170,6 +174,82 @@ class RedfishManagementTestCase(db_base.DbTestCase): sushy.BOOT_SOURCE_TARGET_PXE, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE) mock_get_system.assert_called_once_with(task.node) + self.assertNotIn('redfish_boot_device', + task.node.driver_internal_info) + + @mock.patch.object(sushy, 'Sushy', autospec=True) + @mock.patch.object(redfish_utils, 'get_system', autospec=True) + def test_set_boot_device_persistence_fallback(self, mock_get_system, + mock_sushy): + fake_system = mock.Mock() + fake_system.set_system_boot_options.side_effect = [ + sushy.exceptions.SushyError(), + None, + ] + mock_get_system.return_value = fake_system + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.management.set_boot_device( + task, boot_devices.PXE, persistent=True) + fake_system.set_system_boot_options.assert_has_calls([ + mock.call(sushy.BOOT_SOURCE_TARGET_PXE, + enabled=sushy.BOOT_SOURCE_ENABLED_CONTINUOUS), + mock.call(sushy.BOOT_SOURCE_TARGET_PXE, + enabled=sushy.BOOT_SOURCE_ENABLED_ONCE), + ]) + mock_get_system.assert_called_once_with(task.node) + + task.node.refresh() + self.assertEqual( + sushy.BOOT_SOURCE_TARGET_PXE, + task.node.driver_internal_info['redfish_boot_device']) + + def test_restore_boot_device(self): + fake_system = mock.Mock() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.driver_internal_info['redfish_boot_device'] = ( + sushy.BOOT_SOURCE_TARGET_HDD + ) + + task.driver.management.restore_boot_device(task, fake_system) + + fake_system.set_system_boot_options.assert_called_once_with( + sushy.BOOT_SOURCE_TARGET_HDD, enabled=None) + # The stored boot device is kept intact + self.assertEqual( + sushy.BOOT_SOURCE_TARGET_HDD, + task.node.driver_internal_info['redfish_boot_device']) + + def test_restore_boot_device_noop(self): + fake_system = mock.Mock() + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.driver.management.restore_boot_device(task, fake_system) + + self.assertFalse(fake_system.set_system_boot_options.called) + + @mock.patch.object(redfish_mgmt.LOG, 'warning', autospec=True) + def test_restore_boot_device_failure(self, mock_log): + fake_system = mock.Mock() + fake_system.set_system_boot_options.side_effect = ( + sushy.exceptions.SushyError() + ) + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + task.node.driver_internal_info['redfish_boot_device'] = ( + sushy.BOOT_SOURCE_TARGET_HDD + ) + + task.driver.management.restore_boot_device(task, fake_system) + + fake_system.set_system_boot_options.assert_called_once_with( + sushy.BOOT_SOURCE_TARGET_HDD, enabled=None) + self.assertTrue(mock_log.called) + # The stored boot device is kept intact + self.assertEqual( + sushy.BOOT_SOURCE_TARGET_HDD, + task.node.driver_internal_info['redfish_boot_device']) @mock.patch.object(redfish_utils, 'get_system', autospec=True) def test_get_boot_device(self, mock_get_system): diff --git a/ironic/tests/unit/drivers/modules/redfish/test_power.py b/ironic/tests/unit/drivers/modules/redfish/test_power.py index 22470a7270..d976e15a63 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_power.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_power.py @@ -20,6 +20,7 @@ from oslo_utils import importutils from ironic.common import exception from ironic.common import states from ironic.conductor import task_manager +from ironic.drivers.modules.redfish import management as redfish_mgmt from ironic.drivers.modules.redfish import power as redfish_power from ironic.drivers.modules.redfish import utils as redfish_utils from ironic.tests.unit.db import base as db_base @@ -84,19 +85,21 @@ class RedfishPowerTestCase(db_base.DbTestCase): mock_get_system.assert_called_once_with(task.node) mock_get_system.reset_mock() + @mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device', + autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) - def test_set_power_state(self, mock_get_system): + def test_set_power_state(self, mock_get_system, mock_restore_bootdev): with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: expected_values = [ - (states.POWER_ON, sushy.RESET_ON), - (states.POWER_OFF, sushy.RESET_FORCE_OFF), - (states.REBOOT, sushy.RESET_FORCE_RESTART), - (states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART), - (states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN) + (states.POWER_ON, sushy.RESET_ON, True), + (states.POWER_OFF, sushy.RESET_FORCE_OFF, False), + (states.REBOOT, sushy.RESET_FORCE_RESTART, True), + (states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART, True), + (states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN, False) ] - for target, expected in expected_values: + for target, expected, restore_bootdev in expected_values: if target in (states.POWER_OFF, states.SOFT_POWER_OFF): final = sushy.SYSTEM_POWER_STATE_OFF transient = sushy.SYSTEM_POWER_STATE_ON @@ -115,9 +118,15 @@ class RedfishPowerTestCase(db_base.DbTestCase): system_result[0].reset_system.assert_called_once_with(expected) mock_get_system.assert_called_with(task.node) self.assertEqual(4, mock_get_system.call_count) + if restore_bootdev: + mock_restore_bootdev.assert_called_once_with( + task.driver.management, task, system_result[0]) + else: + self.assertFalse(mock_restore_bootdev.called) # Reset mocks mock_get_system.reset_mock() + mock_restore_bootdev.reset_mock() @mock.patch.object(redfish_utils, 'get_system', autospec=True) def test_set_power_state_not_reached(self, mock_get_system): @@ -166,8 +175,11 @@ class RedfishPowerTestCase(db_base.DbTestCase): sushy.RESET_ON) mock_get_system.assert_called_once_with(task.node) + @mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device', + autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) - def test_reboot_from_power_off(self, mock_get_system): + def test_reboot_from_power_off(self, mock_get_system, + mock_restore_bootdev): with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: system_result = [ @@ -187,9 +199,13 @@ class RedfishPowerTestCase(db_base.DbTestCase): sushy.RESET_ON) mock_get_system.assert_called_with(task.node) self.assertEqual(3, mock_get_system.call_count) + mock_restore_bootdev.assert_called_once_with( + task.driver.management, task, system_result[0]) + @mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device', + autospec=True) @mock.patch.object(redfish_utils, 'get_system', autospec=True) - def test_reboot_from_power_on(self, mock_get_system): + def test_reboot_from_power_on(self, mock_get_system, mock_restore_bootdev): with task_manager.acquire(self.context, self.node.uuid, shared=False) as task: system_result = [ @@ -211,6 +227,8 @@ class RedfishPowerTestCase(db_base.DbTestCase): ]) mock_get_system.assert_called_with(task.node) self.assertEqual(3, mock_get_system.call_count) + mock_restore_bootdev.assert_called_once_with( + task.driver.management, task, system_result[0]) @mock.patch.object(redfish_utils, 'get_system', autospec=True) def test_reboot_not_reached(self, mock_get_system): diff --git a/releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml b/releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml new file mode 100644 index 0000000000..a8cb499beb --- /dev/null +++ b/releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml @@ -0,0 +1,18 @@ +--- +fixes: + - | + Provides a workaround for hardware that does not support persistent boot + device setting with the ``redfish`` hardware type. When such situation is + detected, ironic will fall back to one-time boot device setting, restoring + it on every reboot. +issues: + - | + Some redfish-enabled hardware is known not to support persistent boot + device setting that is used by the Bare Metal service for deployed + instances. The ``redfish`` hardware type tries to work around this problem, + but rebooting such an instance in-band may cause it to boot incorrectly. + A predictable boot order should be configured in the node's boot firmware + to avoid issues and at least metadata cleaning must be enabled. + See `this mailing list thread + `_ + for technical details.