redfish: handle hardware that is unable to set persistent boot
Some hardware has dropped support for continuous boot, see this thread: http://lists.openstack.org/pipermail/openstack-discuss/2020-April/014543.html Work around by falling back to one-time boot device, restoring it on every reboot or power on. Story: #2007527 Task: #39320 Change-Id: I762056dea4f7b8ccdb8f95b2f21e26b45763905d
This commit is contained in:
parent
8562df092f
commit
18a8e2f6e9
@ -24,6 +24,7 @@ from ironic.common import components
|
||||
from ironic.common import exception
|
||||
from ironic.common.i18n import _
|
||||
from ironic.common import indicator_states
|
||||
from ironic.common import utils
|
||||
from ironic.conductor import task_manager
|
||||
from ironic.drivers import base
|
||||
from ironic.drivers.modules.redfish import utils as redfish_utils
|
||||
@ -68,6 +69,42 @@ if sushy:
|
||||
v: k for k, v in INDICATOR_MAP.items()}
|
||||
|
||||
|
||||
def _set_boot_device(task, system, device, enabled=None):
|
||||
"""An internal routine to set the boot device.
|
||||
|
||||
:param task: a task from TaskManager.
|
||||
:param system: a Redfish System object.
|
||||
:param device: the Redfish boot device.
|
||||
:param enabled: Redfish boot device persistence value or None.
|
||||
:raises: SushyError on an error from the Sushy library
|
||||
"""
|
||||
try:
|
||||
system.set_system_boot_options(device, enabled=enabled)
|
||||
except sushy.exceptions.SushyError as e:
|
||||
if enabled == sushy.BOOT_SOURCE_ENABLED_CONTINUOUS:
|
||||
# NOTE(dtantsur): continuous boot device settings have been
|
||||
# removed from Redfish, and some vendors stopped supporting
|
||||
# it before an alternative was provided. As a work around,
|
||||
# use one-time boot and restore the boot device on every
|
||||
# reboot via RedfishPower.
|
||||
LOG.debug('Error %(error)s when trying to set a '
|
||||
'persistent boot device on node %(node)s, '
|
||||
'falling back to one-time boot settings',
|
||||
{'error': e, 'node': task.node.uuid})
|
||||
system.set_system_boot_options(
|
||||
device, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
|
||||
LOG.warning('Could not set persistent boot device to '
|
||||
'%(dev)s for node %(node)s, using one-time '
|
||||
'boot device instead',
|
||||
{'dev': device, 'node': task.node.uuid})
|
||||
utils.set_node_nested_field(
|
||||
task.node, 'driver_internal_info',
|
||||
'redfish_boot_device', device)
|
||||
task.node.save()
|
||||
else:
|
||||
raise
|
||||
|
||||
|
||||
class RedfishManagement(base.ManagementInterface):
|
||||
|
||||
def __init__(self):
|
||||
@ -107,6 +144,33 @@ class RedfishManagement(base.ManagementInterface):
|
||||
"""
|
||||
return list(BOOT_DEVICE_MAP_REV)
|
||||
|
||||
@task_manager.require_exclusive_lock
|
||||
def restore_boot_device(self, task, system):
|
||||
"""Restore boot device if needed.
|
||||
|
||||
Checks the redfish_boot_device internal flag and sets the one-time
|
||||
boot device accordingly. A warning is issued if it fails.
|
||||
|
||||
This method is supposed to be called from the Redfish power interface
|
||||
and should be considered private to the Redfish hardware type.
|
||||
|
||||
:param task: a task from TaskManager.
|
||||
:param system: a Redfish System object.
|
||||
"""
|
||||
device = task.node.driver_internal_info.get('redfish_boot_device')
|
||||
if not device:
|
||||
return
|
||||
|
||||
LOG.debug('Restoring boot device %(dev)s on node %(node)s',
|
||||
{'dev': device, 'node': task.node.uuid})
|
||||
try:
|
||||
_set_boot_device(task, system, device)
|
||||
except sushy.exceptions.SushyError as e:
|
||||
LOG.warning('Unable to recover boot device %(dev)s for node '
|
||||
'%(node)s, relying on the pre-configured boot order. '
|
||||
'Error: %(error)s',
|
||||
{'dev': device, 'node': task.node.uuid, 'error': e})
|
||||
|
||||
@task_manager.require_exclusive_lock
|
||||
def set_boot_device(self, task, device, persistent=False):
|
||||
"""Set the boot device for a node.
|
||||
@ -124,6 +188,10 @@ class RedfishManagement(base.ManagementInterface):
|
||||
:raises: RedfishConnectionError when it fails to connect to Redfish
|
||||
:raises: RedfishError on an error from the Sushy library
|
||||
"""
|
||||
utils.pop_node_nested_field(
|
||||
task.node, 'driver_internal_info', 'redfish_boot_device')
|
||||
task.node.save()
|
||||
|
||||
system = redfish_utils.get_system(task.node)
|
||||
|
||||
desired_persistence = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent]
|
||||
@ -132,11 +200,9 @@ class RedfishManagement(base.ManagementInterface):
|
||||
# NOTE(etingof): this can be racy, esp if BMC is not RESTful
|
||||
enabled = (desired_persistence
|
||||
if desired_persistence != current_persistence else None)
|
||||
|
||||
try:
|
||||
system.set_system_boot_options(
|
||||
BOOT_DEVICE_MAP_REV[device], enabled=enabled)
|
||||
|
||||
_set_boot_device(task, system, BOOT_DEVICE_MAP_REV[device],
|
||||
enabled=enabled)
|
||||
except sushy.exceptions.SushyError as e:
|
||||
error_msg = (_('Redfish set boot device failed for node '
|
||||
'%(node)s. Error: %(error)s') %
|
||||
|
@ -22,6 +22,7 @@ from ironic.common import states
|
||||
from ironic.conductor import task_manager
|
||||
from ironic.conductor import utils as cond_utils
|
||||
from ironic.drivers import base
|
||||
from ironic.drivers.modules.redfish import management as redfish_mgmt
|
||||
from ironic.drivers.modules.redfish import utils as redfish_utils
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
@ -123,6 +124,12 @@ class RedfishPower(base.PowerInterface):
|
||||
:raises: RedfishError on an error from the Sushy library
|
||||
"""
|
||||
system = redfish_utils.get_system(task.node)
|
||||
|
||||
if (power_state in (states.POWER_ON, states.SOFT_REBOOT, states.REBOOT)
|
||||
and isinstance(task.driver.management,
|
||||
redfish_mgmt.RedfishManagement)):
|
||||
task.driver.management.restore_boot_device(task, system)
|
||||
|
||||
try:
|
||||
_set_power_state(task, system, power_state, timeout=timeout)
|
||||
except sushy.exceptions.SushyError as e:
|
||||
@ -151,6 +158,10 @@ class RedfishPower(base.PowerInterface):
|
||||
next_state = states.POWER_OFF
|
||||
_set_power_state(task, system, next_state, timeout=timeout)
|
||||
|
||||
if isinstance(task.driver.management,
|
||||
redfish_mgmt.RedfishManagement):
|
||||
task.driver.management.restore_boot_device(task, system)
|
||||
|
||||
next_state = states.POWER_ON
|
||||
_set_power_state(task, system, next_state, timeout=timeout)
|
||||
except sushy.exceptions.SushyError as e:
|
||||
|
@ -100,6 +100,8 @@ class RedfishManagementTestCase(db_base.DbTestCase):
|
||||
fake_system.set_system_boot_options.assert_called_once_with(
|
||||
expected, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
|
||||
mock_get_system.assert_called_once_with(task.node)
|
||||
self.assertNotIn('redfish_boot_device',
|
||||
task.node.driver_internal_info)
|
||||
|
||||
# Reset mocks
|
||||
fake_system.set_system_boot_options.reset_mock()
|
||||
@ -123,6 +125,8 @@ class RedfishManagementTestCase(db_base.DbTestCase):
|
||||
fake_system.set_system_boot_options.assert_called_once_with(
|
||||
sushy.BOOT_SOURCE_TARGET_PXE, enabled=expected)
|
||||
mock_get_system.assert_called_once_with(task.node)
|
||||
self.assertNotIn('redfish_boot_device',
|
||||
task.node.driver_internal_info)
|
||||
|
||||
# Reset mocks
|
||||
fake_system.set_system_boot_options.reset_mock()
|
||||
@ -170,6 +174,82 @@ class RedfishManagementTestCase(db_base.DbTestCase):
|
||||
sushy.BOOT_SOURCE_TARGET_PXE,
|
||||
enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
|
||||
mock_get_system.assert_called_once_with(task.node)
|
||||
self.assertNotIn('redfish_boot_device',
|
||||
task.node.driver_internal_info)
|
||||
|
||||
@mock.patch.object(sushy, 'Sushy', autospec=True)
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
def test_set_boot_device_persistence_fallback(self, mock_get_system,
|
||||
mock_sushy):
|
||||
fake_system = mock.Mock()
|
||||
fake_system.set_system_boot_options.side_effect = [
|
||||
sushy.exceptions.SushyError(),
|
||||
None,
|
||||
]
|
||||
mock_get_system.return_value = fake_system
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=False) as task:
|
||||
task.driver.management.set_boot_device(
|
||||
task, boot_devices.PXE, persistent=True)
|
||||
fake_system.set_system_boot_options.assert_has_calls([
|
||||
mock.call(sushy.BOOT_SOURCE_TARGET_PXE,
|
||||
enabled=sushy.BOOT_SOURCE_ENABLED_CONTINUOUS),
|
||||
mock.call(sushy.BOOT_SOURCE_TARGET_PXE,
|
||||
enabled=sushy.BOOT_SOURCE_ENABLED_ONCE),
|
||||
])
|
||||
mock_get_system.assert_called_once_with(task.node)
|
||||
|
||||
task.node.refresh()
|
||||
self.assertEqual(
|
||||
sushy.BOOT_SOURCE_TARGET_PXE,
|
||||
task.node.driver_internal_info['redfish_boot_device'])
|
||||
|
||||
def test_restore_boot_device(self):
|
||||
fake_system = mock.Mock()
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=False) as task:
|
||||
task.node.driver_internal_info['redfish_boot_device'] = (
|
||||
sushy.BOOT_SOURCE_TARGET_HDD
|
||||
)
|
||||
|
||||
task.driver.management.restore_boot_device(task, fake_system)
|
||||
|
||||
fake_system.set_system_boot_options.assert_called_once_with(
|
||||
sushy.BOOT_SOURCE_TARGET_HDD, enabled=None)
|
||||
# The stored boot device is kept intact
|
||||
self.assertEqual(
|
||||
sushy.BOOT_SOURCE_TARGET_HDD,
|
||||
task.node.driver_internal_info['redfish_boot_device'])
|
||||
|
||||
def test_restore_boot_device_noop(self):
|
||||
fake_system = mock.Mock()
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=False) as task:
|
||||
task.driver.management.restore_boot_device(task, fake_system)
|
||||
|
||||
self.assertFalse(fake_system.set_system_boot_options.called)
|
||||
|
||||
@mock.patch.object(redfish_mgmt.LOG, 'warning', autospec=True)
|
||||
def test_restore_boot_device_failure(self, mock_log):
|
||||
fake_system = mock.Mock()
|
||||
fake_system.set_system_boot_options.side_effect = (
|
||||
sushy.exceptions.SushyError()
|
||||
)
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=False) as task:
|
||||
task.node.driver_internal_info['redfish_boot_device'] = (
|
||||
sushy.BOOT_SOURCE_TARGET_HDD
|
||||
)
|
||||
|
||||
task.driver.management.restore_boot_device(task, fake_system)
|
||||
|
||||
fake_system.set_system_boot_options.assert_called_once_with(
|
||||
sushy.BOOT_SOURCE_TARGET_HDD, enabled=None)
|
||||
self.assertTrue(mock_log.called)
|
||||
# The stored boot device is kept intact
|
||||
self.assertEqual(
|
||||
sushy.BOOT_SOURCE_TARGET_HDD,
|
||||
task.node.driver_internal_info['redfish_boot_device'])
|
||||
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
def test_get_boot_device(self, mock_get_system):
|
||||
|
@ -20,6 +20,7 @@ from oslo_utils import importutils
|
||||
from ironic.common import exception
|
||||
from ironic.common import states
|
||||
from ironic.conductor import task_manager
|
||||
from ironic.drivers.modules.redfish import management as redfish_mgmt
|
||||
from ironic.drivers.modules.redfish import power as redfish_power
|
||||
from ironic.drivers.modules.redfish import utils as redfish_utils
|
||||
from ironic.tests.unit.db import base as db_base
|
||||
@ -84,19 +85,21 @@ class RedfishPowerTestCase(db_base.DbTestCase):
|
||||
mock_get_system.assert_called_once_with(task.node)
|
||||
mock_get_system.reset_mock()
|
||||
|
||||
@mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device',
|
||||
autospec=True)
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
def test_set_power_state(self, mock_get_system):
|
||||
def test_set_power_state(self, mock_get_system, mock_restore_bootdev):
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=False) as task:
|
||||
expected_values = [
|
||||
(states.POWER_ON, sushy.RESET_ON),
|
||||
(states.POWER_OFF, sushy.RESET_FORCE_OFF),
|
||||
(states.REBOOT, sushy.RESET_FORCE_RESTART),
|
||||
(states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART),
|
||||
(states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN)
|
||||
(states.POWER_ON, sushy.RESET_ON, True),
|
||||
(states.POWER_OFF, sushy.RESET_FORCE_OFF, False),
|
||||
(states.REBOOT, sushy.RESET_FORCE_RESTART, True),
|
||||
(states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART, True),
|
||||
(states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN, False)
|
||||
]
|
||||
|
||||
for target, expected in expected_values:
|
||||
for target, expected, restore_bootdev in expected_values:
|
||||
if target in (states.POWER_OFF, states.SOFT_POWER_OFF):
|
||||
final = sushy.SYSTEM_POWER_STATE_OFF
|
||||
transient = sushy.SYSTEM_POWER_STATE_ON
|
||||
@ -115,9 +118,15 @@ class RedfishPowerTestCase(db_base.DbTestCase):
|
||||
system_result[0].reset_system.assert_called_once_with(expected)
|
||||
mock_get_system.assert_called_with(task.node)
|
||||
self.assertEqual(4, mock_get_system.call_count)
|
||||
if restore_bootdev:
|
||||
mock_restore_bootdev.assert_called_once_with(
|
||||
task.driver.management, task, system_result[0])
|
||||
else:
|
||||
self.assertFalse(mock_restore_bootdev.called)
|
||||
|
||||
# Reset mocks
|
||||
mock_get_system.reset_mock()
|
||||
mock_restore_bootdev.reset_mock()
|
||||
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
def test_set_power_state_not_reached(self, mock_get_system):
|
||||
@ -166,8 +175,11 @@ class RedfishPowerTestCase(db_base.DbTestCase):
|
||||
sushy.RESET_ON)
|
||||
mock_get_system.assert_called_once_with(task.node)
|
||||
|
||||
@mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device',
|
||||
autospec=True)
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
def test_reboot_from_power_off(self, mock_get_system):
|
||||
def test_reboot_from_power_off(self, mock_get_system,
|
||||
mock_restore_bootdev):
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=False) as task:
|
||||
system_result = [
|
||||
@ -187,9 +199,13 @@ class RedfishPowerTestCase(db_base.DbTestCase):
|
||||
sushy.RESET_ON)
|
||||
mock_get_system.assert_called_with(task.node)
|
||||
self.assertEqual(3, mock_get_system.call_count)
|
||||
mock_restore_bootdev.assert_called_once_with(
|
||||
task.driver.management, task, system_result[0])
|
||||
|
||||
@mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device',
|
||||
autospec=True)
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
def test_reboot_from_power_on(self, mock_get_system):
|
||||
def test_reboot_from_power_on(self, mock_get_system, mock_restore_bootdev):
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=False) as task:
|
||||
system_result = [
|
||||
@ -211,6 +227,8 @@ class RedfishPowerTestCase(db_base.DbTestCase):
|
||||
])
|
||||
mock_get_system.assert_called_with(task.node)
|
||||
self.assertEqual(3, mock_get_system.call_count)
|
||||
mock_restore_bootdev.assert_called_once_with(
|
||||
task.driver.management, task, system_result[0])
|
||||
|
||||
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
|
||||
def test_reboot_not_reached(self, mock_get_system):
|
||||
|
18
releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml
Normal file
18
releasenotes/notes/redfish-sadness-6e2a37b3f45ef1aa.yaml
Normal file
@ -0,0 +1,18 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Provides a workaround for hardware that does not support persistent boot
|
||||
device setting with the ``redfish`` hardware type. When such situation is
|
||||
detected, ironic will fall back to one-time boot device setting, restoring
|
||||
it on every reboot.
|
||||
issues:
|
||||
- |
|
||||
Some redfish-enabled hardware is known not to support persistent boot
|
||||
device setting that is used by the Bare Metal service for deployed
|
||||
instances. The ``redfish`` hardware type tries to work around this problem,
|
||||
but rebooting such an instance in-band may cause it to boot incorrectly.
|
||||
A predictable boot order should be configured in the node's boot firmware
|
||||
to avoid issues and at least metadata cleaning must be enabled.
|
||||
See `this mailing list thread
|
||||
<http://lists.openstack.org/pipermail/openstack-discuss/2020-April/014543.html>`_
|
||||
for technical details.
|
Loading…
Reference in New Issue
Block a user