Redfish: wait for secure boot state change if it's not immediate

We have discovered hardware that only applies boot mode / secure boot
changes during a reboot. Furthermore, the same hardware cannot update
both at the same time. To err on the safe side, reboot and wait for
the value to change if it's not changed immediately.

Co-Authored-By: Jacob Anders <janders@redhat.com>
Change-Id: I318940a76be531f453f0f5cf31a59cba16febf57
This commit is contained in:
Dmitry Tantsur 2022-11-08 13:45:26 +01:00
parent 985c7fdf21
commit 6487b95813
6 changed files with 325 additions and 32 deletions

View File

@ -306,6 +306,10 @@ Change Node Boot Mode
Request a change to the Node's boot mode.
.. note::
Depending on the driver and the underlying hardware, changing boot mode may
result in an automatic reboot.
.. versionadded:: 1.76
A change in node's boot mode can be requested.
@ -341,6 +345,10 @@ Change Node Secure Boot
Request a change to the Node's secure boot state.
.. note::
Depending on the driver and the underlying hardware, changing the secure
boot state may result in an automatic reboot.
.. versionadded:: 1.76
A change in node's secure boot state can be requested.

View File

@ -140,6 +140,22 @@ Two clean and deploy steps are provided for key management:
``management.clear_secure_boot_keys``
removes all secure boot keys from the node.
Rebooting on boot mode changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
While some hardware is able to change the boot mode or the `UEFI secure boot`_
state immediately, other models may require a reboot for such a change to be
applied. Furthermore, some hardware models cannot change the boot mode and the
secure boot state simultaneously, requiring several reboots.
The Bare Metal service refreshes the System resource after issuing a PATCH
request against it. If the expected change is not observed, the node is
rebooted, and the Bare Metal service waits until the change is applied. In the
end, the previous power state is restored.
This logic makes changing boot configuration more robust at the expense of
several reboots in the worst case.
Out-Of-Band inspection
======================

View File

@ -115,6 +115,12 @@ opts = [
default=60,
help=_('Number of seconds to wait between checking for '
'failed raid config tasks')),
cfg.IntOpt('boot_mode_config_timeout',
min=0,
default=900,
help=_('Number of seconds to wait for boot mode or secure '
'boot status change to take effect after a reboot. '
'Set to 0 to disable waiting.')),
]

View File

@ -14,6 +14,7 @@
# under the License.
import collections
import time
from urllib.parse import urlparse
from ironic_lib import metrics_utils
@ -44,6 +45,8 @@ METRICS = metrics_utils.get_metrics_logger(__name__)
sushy = importutils.try_import('sushy')
BOOT_MODE_CONFIG_INTERVAL = 15
if sushy:
BOOT_DEVICE_MAP = {
sushy.BOOT_SOURCE_TARGET_PXE: boot_devices.PXE,
@ -327,9 +330,13 @@ class RedfishManagement(base.ManagementInterface):
"""
system = redfish_utils.get_system(task.node)
# NOTE(dtantsur): check the readability of the current mode before
# modifying anything. I suspect it can become None transiently after
# the update, while we need to know if it is supported *at all*.
get_mode_unsupported = (system.boot.get('mode') is None)
try:
system.set_system_boot_options(mode=BOOT_MODE_MAP_REV[mode])
except sushy.exceptions.SushyError as e:
error_msg = (_('Setting boot mode to %(mode)s '
'failed for node %(node)s. '
@ -342,7 +349,7 @@ class RedfishManagement(base.ManagementInterface):
# getting or setting the boot mode. When setting failed and the
# mode attribute is missing from the boot field, raising
# UnsupportedDriverExtension will allow the deploy to continue.
if system.boot.get('mode') is None:
if get_mode_unsupported:
LOG.info(_('Attempt to set boot mode on node %(node)s '
'failed to set boot mode as the node does not '
'appear to support overriding the boot mode. '
@ -352,6 +359,66 @@ class RedfishManagement(base.ManagementInterface):
driver=task.node.driver, extension='set_boot_mode')
raise exception.RedfishError(error=error_msg)
# NOTE(dtantsur): this case is rather hypothetical, but in our own
# emulator, it's possible that mode is constantly set to None, while
# the request to change the mode succeeds.
if get_mode_unsupported:
LOG.warning('The request to set boot mode for node %(node)s to '
'%(value)s has succeeded, but the current mode is '
'not known. Skipping reboot and assuming '
'the operation has succeeded.',
{'node': task.node.uuid, 'value': mode})
return
self._wait_for_boot_mode(task, system, mode)
LOG.info('Boot mode for node %(node)s has been set to '
'%(value)s', {'node': task.node.uuid, 'value': mode})
def _wait_for_boot_mode(self, task, system, mode):
system.refresh(force=True)
# NOTE(dtantsur/janders): at least Dell machines change boot mode via
# a BIOS configuration job. A reboot is needed to apply it.
if system.boot.get('mode') == BOOT_MODE_MAP_REV[mode]:
LOG.debug('Node %(node)s is already configured with requested '
'boot mode %(new_value)s.',
{'node': task.node.uuid,
'new_value': BOOT_MODE_MAP_REV[mode]})
return
LOG.info('Rebooting node %(node)s to change boot mode from '
'%(old_value)s to %(new_value)s',
{'node': task.node.uuid,
'old_value': system.boot.get('mode'),
'new_value': BOOT_MODE_MAP_REV[mode]})
old_power_state = task.driver.power.get_power_state(task)
manager_utils.node_power_action(task, states.REBOOT)
if CONF.redfish.boot_mode_config_timeout:
threshold = time.time() + CONF.redfish.boot_mode_config_timeout
while (time.time() <= threshold
and system.boot.get('mode') != BOOT_MODE_MAP_REV[mode]):
LOG.debug('Still waiting for boot mode of node %(node)s '
'to become %(value)s, current is %(current)s',
{'node': task.node.uuid,
'value': BOOT_MODE_MAP_REV[mode],
'current': system.boot.get('mode')})
time.sleep(BOOT_MODE_CONFIG_INTERVAL)
system.refresh(force=True)
if system.boot.get('mode') != BOOT_MODE_MAP_REV[mode]:
msg = (_('Timeout reached while waiting for boot mode of '
'node %(node)s to become %(value)s, '
'current is %(current)s')
% {'node': task.node.uuid,
'value': BOOT_MODE_MAP_REV[mode],
'current': system.boot.get('mode')})
LOG.error(msg)
raise exception.RedfishError(error=msg)
manager_utils.node_power_action(task, old_power_state)
def get_boot_mode(self, task):
"""Get the current boot mode for a node.
@ -1142,9 +1209,45 @@ class RedfishManagement(base.ManagementInterface):
% {'node': task.node.uuid, 'value': state, 'exc': exc})
LOG.error(msg)
raise exception.RedfishError(error=msg)
else:
LOG.info('Secure boot state for node %(node)s has been set to '
'%(value)s', {'node': task.node.uuid, 'value': state})
self._wait_for_secure_boot(task, sb, state)
LOG.info('Secure boot state for node %(node)s has been set to '
'%(value)s', {'node': task.node.uuid, 'value': state})
def _wait_for_secure_boot(self, task, sb, state):
# NOTE(dtantsur): at least Dell machines change secure boot status via
# a BIOS configuration job. A reboot is needed to apply it.
sb.refresh(force=True)
if sb.enabled == state:
return
LOG.info('Rebooting node %(node)s to change secure boot state to '
'%(value)s', {'node': task.node.uuid, 'value': state})
old_power_state = task.driver.power.get_power_state(task)
manager_utils.node_power_action(task, states.REBOOT)
if CONF.redfish.boot_mode_config_timeout:
threshold = time.time() + CONF.redfish.boot_mode_config_timeout
while time.time() <= threshold and sb.enabled != state:
LOG.debug(
'Still waiting for secure boot state of node %(node)s '
'to become %(value)s, current is %(current)s',
{'node': task.node.uuid, 'value': state,
'current': sb.enabled})
time.sleep(BOOT_MODE_CONFIG_INTERVAL)
sb.refresh(force=True)
if sb.enabled != state:
msg = (_('Timeout reached while waiting for secure boot state '
'of node %(node)s to become %(state)s, '
'current is %(current)s')
% {'node': task.node.uuid, 'state': state,
'current': sb.enabled})
LOG.error(msg)
raise exception.RedfishError(error=msg)
manager_utils.node_power_action(task, old_power_state)
def _reset_keys(self, task, reset_type):
system = redfish_utils.get_system(task.node)

View File

@ -28,10 +28,12 @@ from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as manager_utils
from ironic.conf import CONF
from ironic.drivers.modules import boot_mode_utils
from ironic.drivers.modules import deploy_utils
from ironic.drivers.modules.redfish import boot as redfish_boot
from ironic.drivers.modules.redfish import firmware_utils
from ironic.drivers.modules.redfish import management as redfish_mgmt
from ironic.drivers.modules.redfish import power as redfish_power
from ironic.drivers.modules.redfish import utils as redfish_utils
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.db import utils as db_utils
@ -268,8 +270,10 @@ class RedfishManagementTestCase(db_base.DbTestCase):
boot_devices.PXE,
task.node.driver_internal_info['redfish_boot_device'])
@mock.patch.object(boot_mode_utils, 'sync_boot_mode', autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_boot_device_persistency_vendor(self, mock_get_system):
def test_set_boot_device_persistency_vendor(self, mock_get_system,
mock_sync_boot_mode):
fake_system = mock_get_system.return_value
fake_system.boot.get.return_value = \
sushy.BOOT_SOURCE_ENABLED_CONTINUOUS
@ -288,18 +292,16 @@ class RedfishManagementTestCase(db_base.DbTestCase):
shared=False) as task:
task.driver.management.set_boot_device(
task, boot_devices.PXE, persistent=True)
fake_system.set_system_boot_options.assert_called_once_with(
sushy.BOOT_SOURCE_TARGET_PXE, enabled=expected)
if vendor == 'SuperMicro':
fake_system.set_system_boot_options.assert_has_calls(
[mock.call(sushy.BOOT_SOURCE_TARGET_PXE,
enabled=expected),
mock.call(mode=sushy.BOOT_SOURCE_MODE_UEFI)])
mock_sync_boot_mode.assert_called_once_with(task)
else:
fake_system.set_system_boot_options.assert_has_calls(
[mock.call(sushy.BOOT_SOURCE_TARGET_PXE,
enabled=expected)])
mock_sync_boot_mode.assert_not_called()
# Reset mocks
fake_system.set_system_boot_options.reset_mock()
mock_sync_boot_mode.reset_mock()
mock_get_system.reset_mock()
def test_restore_boot_device(self):
@ -391,34 +393,46 @@ class RedfishManagementTestCase(db_base.DbTestCase):
self.assertEqual(list(redfish_mgmt.BOOT_MODE_MAP_REV),
supported_boot_modes)
@mock.patch.object(redfish_mgmt.RedfishManagement, '_wait_for_boot_mode',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_boot_mode(self, mock_get_system):
def test_set_boot_mode(self, mock_get_system, mock_wait):
boot_attribute = {
'target': sushy.BOOT_SOURCE_TARGET_PXE,
'enabled': sushy.BOOT_SOURCE_ENABLED_CONTINUOUS,
'mode': sushy.BOOT_SOURCE_MODE_BIOS,
'mode': None,
}
fake_system = mock.Mock(boot=boot_attribute)
fake_system = mock.Mock()
mock_get_system.return_value = fake_system
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
expected_values = [
(boot_modes.LEGACY_BIOS, sushy.BOOT_SOURCE_MODE_BIOS),
(boot_modes.UEFI, sushy.BOOT_SOURCE_MODE_UEFI)
(boot_modes.LEGACY_BIOS, sushy.BOOT_SOURCE_MODE_BIOS,
sushy.BOOT_SOURCE_MODE_UEFI),
(boot_modes.UEFI, sushy.BOOT_SOURCE_MODE_UEFI,
sushy.BOOT_SOURCE_MODE_BIOS),
(boot_modes.LEGACY_BIOS, sushy.BOOT_SOURCE_MODE_BIOS, None),
(boot_modes.UEFI, sushy.BOOT_SOURCE_MODE_UEFI, None),
]
for mode, expected in expected_values:
for mode, expected, current in expected_values:
boot_attribute['mode'] = current
task.driver.management.set_boot_mode(task, mode=mode)
# Asserts
fake_system.set_system_boot_options.assert_called_once_with(
mode=expected)
mock_get_system.assert_called_once_with(task.node)
if current is not None:
mock_wait.assert_called_once_with(task.driver.management,
task, fake_system, mode)
else:
mock_wait.assert_not_called()
# Reset mocks
fake_system.set_system_boot_options.reset_mock()
mock_get_system.reset_mock()
mock_wait.reset_mock()
@mock.patch.object(sushy, 'Sushy', autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
@ -462,6 +476,44 @@ class RedfishManagementTestCase(db_base.DbTestCase):
mode=sushy.BOOT_SOURCE_MODE_UEFI)
mock_get_system.assert_called_once_with(task.node)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
def test_wait_for_boot_mode_immediate(self, mock_power):
fake_system = mock.Mock(spec=['boot', 'refresh'],
boot={'mode': sushy.BOOT_SOURCE_MODE_UEFI})
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.management._wait_for_boot_mode(
task, fake_system, boot_modes.UEFI)
fake_system.refresh.assert_called_once_with(force=True)
mock_power.assert_not_called()
@mock.patch('time.sleep', lambda _: None)
@mock.patch.object(redfish_power.RedfishPower, 'get_power_state',
autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
def test_wait_for_boot_mode(self, mock_power, mock_get_power):
attempts = 3
def side_effect(force):
nonlocal attempts
attempts -= 1
if attempts <= 0:
fake_system.boot['mode'] = sushy.BOOT_SOURCE_MODE_UEFI
fake_system = mock.Mock(spec=['boot', 'refresh'],
boot={'mode': sushy.BOOT_SOURCE_MODE_BIOS})
fake_system.refresh.side_effect = side_effect
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.management._wait_for_boot_mode(
task, fake_system, boot_modes.UEFI)
fake_system.refresh.assert_called_with(force=True)
self.assertEqual(3, fake_system.refresh.call_count)
mock_power.assert_has_calls([
mock.call(task, states.REBOOT),
mock.call(task, mock_get_power.return_value),
])
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_get_boot_mode(self, mock_get_system):
boot_attribute = {
@ -1462,61 +1514,84 @@ class RedfishManagementTestCase(db_base.DbTestCase):
task.driver.management.get_secure_boot_state,
task)
@mock.patch.object(redfish_mgmt.RedfishManagement, '_wait_for_secure_boot',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_secure_boot_state(self, mock_get_system):
def test_set_secure_boot_state(self, mock_get_system, mock_wait):
fake_system = mock_get_system.return_value
fake_system.secure_boot.enabled = False
fake_system.boot = {'mode': sushy.BOOT_SOURCE_MODE_UEFI}
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
shared=False) as task:
task.driver.management.set_secure_boot_state(task, True)
fake_system.secure_boot.set_enabled.assert_called_once_with(True)
mock_wait.assert_called_once_with(task.driver.management,
task, fake_system.secure_boot,
True)
@mock.patch.object(redfish_mgmt.RedfishManagement, '_wait_for_secure_boot',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_secure_boot_state_boot_mode_unknown(self, mock_get_system):
def test_set_secure_boot_state_boot_mode_unknown(self, mock_get_system,
mock_wait):
fake_system = mock_get_system.return_value
fake_system.secure_boot.enabled = False
fake_system.boot = {}
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
shared=False) as task:
task.driver.management.set_secure_boot_state(task, True)
fake_system.secure_boot.set_enabled.assert_called_once_with(True)
mock_wait.assert_called_once_with(task.driver.management,
task, fake_system.secure_boot,
True)
@mock.patch.object(redfish_mgmt.RedfishManagement, '_wait_for_secure_boot',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_secure_boot_state_boot_mode_no_change(self, mock_get_system):
def test_set_secure_boot_state_boot_mode_no_change(self, mock_get_system,
mock_wait):
fake_system = mock_get_system.return_value
fake_system.secure_boot.enabled = False
fake_system.boot = {'mode': sushy.BOOT_SOURCE_MODE_BIOS}
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
shared=False) as task:
task.driver.management.set_secure_boot_state(task, False)
self.assertFalse(fake_system.secure_boot.set_enabled.called)
fake_system.secure_boot.set_enabled.assert_not_called()
mock_wait.assert_not_called()
@mock.patch.object(redfish_mgmt.RedfishManagement, '_wait_for_secure_boot',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_secure_boot_state_boot_mode_incorrect(self, mock_get_system):
def test_set_secure_boot_state_boot_mode_incorrect(self, mock_get_system,
mock_wait):
fake_system = mock_get_system.return_value
fake_system.secure_boot.enabled = False
fake_system.boot = {'mode': sushy.BOOT_SOURCE_MODE_BIOS}
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
shared=False) as task:
self.assertRaisesRegex(
exception.RedfishError, 'requires UEFI',
task.driver.management.set_secure_boot_state, task, True)
self.assertFalse(fake_system.secure_boot.set_enabled.called)
fake_system.secure_boot.set_enabled.assert_not_called()
mock_wait.assert_not_called()
@mock.patch.object(redfish_mgmt.RedfishManagement, '_wait_for_secure_boot',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_secure_boot_state_boot_mode_fails(self, mock_get_system):
def test_set_secure_boot_state_boot_mode_fails(self, mock_get_system,
mock_wait):
fake_system = mock_get_system.return_value
fake_system.secure_boot.enabled = False
fake_system.secure_boot.set_enabled.side_effect = \
sushy.exceptions.SushyError
fake_system.boot = {'mode': sushy.BOOT_SOURCE_MODE_UEFI}
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
shared=False) as task:
self.assertRaisesRegex(
exception.RedfishError, 'Failed to set secure boot',
task.driver.management.set_secure_boot_state, task, True)
fake_system.secure_boot.set_enabled.assert_called_once_with(True)
mock_wait.assert_not_called()
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_secure_boot_state_not_implemented(self, mock_get_system):
@ -1528,11 +1603,76 @@ class RedfishManagementTestCase(db_base.DbTestCase):
mock_get_system.return_value = NoSecureBoot()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
shared=False) as task:
self.assertRaises(exception.UnsupportedDriverExtension,
task.driver.management.set_secure_boot_state,
task, True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
def test_wait_for_secure_boot_immediate(self, mock_power):
fake_sb = mock.Mock(spec=['enabled', 'refresh'], enabled=True)
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.management._wait_for_secure_boot(task, fake_sb, True)
fake_sb.refresh.assert_called_once_with(force=True)
mock_power.assert_not_called()
@mock.patch('time.sleep', lambda _: None)
@mock.patch.object(redfish_power.RedfishPower, 'get_power_state',
autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
def test_wait_for_secure_boot(self, mock_power, mock_get_power):
attempts = 3
def side_effect(force):
nonlocal attempts
attempts -= 1
if attempts <= 0:
fake_sb.enabled = True
fake_sb = mock.Mock(spec=['enabled', 'refresh'], enabled=False)
fake_sb.refresh.side_effect = side_effect
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.management._wait_for_secure_boot(task, fake_sb, True)
fake_sb.refresh.assert_called_with(force=True)
self.assertEqual(3, fake_sb.refresh.call_count)
mock_power.assert_has_calls([
mock.call(task, states.REBOOT),
mock.call(task, mock_get_power.return_value),
])
@mock.patch.object(redfish_mgmt, 'BOOT_MODE_CONFIG_INTERVAL', 0.1)
@mock.patch.object(redfish_power.RedfishPower, 'get_power_state',
autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
def test_wait_for_secure_boot_timeout(self, mock_power, mock_get_power):
CONF.set_override('boot_mode_config_timeout', 1, group='redfish')
fake_sb = mock.Mock(spec=['enabled', 'refresh'], enabled=False)
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
self.assertRaisesRegex(
exception.RedfishError, 'Timeout reached',
task.driver.management._wait_for_secure_boot,
task, fake_sb, True)
fake_sb.refresh.assert_called_with(force=True)
mock_power.assert_called_once_with(task, states.REBOOT)
@mock.patch.object(redfish_power.RedfishPower, 'get_power_state',
autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
def test_wait_for_secure_boot_no_wait(self, mock_power, mock_get_power):
CONF.set_override('boot_mode_config_timeout', 0, group='redfish')
fake_sb = mock.Mock(spec=['enabled', 'refresh'], enabled=False)
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.management._wait_for_secure_boot(task, fake_sb, True)
fake_sb.refresh.assert_called_once_with(force=True)
mock_power.assert_has_calls([
mock.call(task, states.REBOOT),
mock.call(task, mock_get_power.return_value),
])
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_reset_secure_boot_to_default(self, mock_get_system):
with task_manager.acquire(self.context, self.node.uuid,

View File

@ -0,0 +1,20 @@
---
fixes:
- |
While updating boot mode or secure boot state in the Redfish driver,
the node is now rebooted if the change is not detected on the System
resource refresh. Ironic then waits up to
``[redfish]boot_mode_config_timeout`` seconds until the change is applied.
upgrade:
- |
Changing the boot mode or the secure boot state via the direct API
(``/v1/nodes/{node_ident}/states/boot_mode`` and
``/v1/nodes/{node_ident}/states/secure_boot`` accordingly) may now
result in a reboot. This happens when the change cannot be applied
immediately. Previously, the change would be applied whenever the next
reboot happens for any unrelated reason, causing inconsistent behavior.
issues:
- |
When boot mode needs to be changed during provisioning, an additional
reboot may happen on certain hardware. This is to ensure consistent
behavior when any boot setting change results in a separate internal job.