Merge "redfish: handle hardware that is unable to set persistent boot" into stable/train

This commit is contained in:
Zuul 2020-05-29 10:53:15 +00:00 committed by Gerrit Code Review
commit 911bc51a10
6 changed files with 258 additions and 24 deletions

View File

@ -537,3 +537,31 @@ def validate_conductor_group(conductor_group):
raise exception.InvalidConductorGroup(group=conductor_group)
if not re.match(r'^[a-zA-Z0-9_\-\.]*$', conductor_group):
raise exception.InvalidConductorGroup(group=conductor_group)
def set_node_nested_field(node, collection, field, value):
"""Set a value in a dictionary field of a node.
:param node: Node object.
:param collection: Name of the field with the dictionary.
:param field: Nested field name.
:param value: New value.
"""
col = getattr(node, collection)
col[field] = value
setattr(node, collection, col)
def pop_node_nested_field(node, collection, field, default=None):
"""Pop a value from a dictionary field of a node.
:param node: Node object.
:param collection: Name of the field with the dictionary.
:param field: Nested field name.
:param default: The default value to return.
:return: The removed value or the default.
"""
col = getattr(node, collection)
result = col.pop(field, default)
setattr(node, collection, col)
return result

View File

@ -22,6 +22,7 @@ from ironic.common import boot_devices
from ironic.common import boot_modes
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import utils
from ironic.conductor import task_manager
from ironic.drivers import base
from ironic.drivers.modules.redfish import utils as redfish_utils
@ -56,6 +57,63 @@ if sushy:
BOOT_DEVICE_PERSISTENT_MAP.items()}
def _set_boot_device(task, system, device,
enabled=sushy.BOOT_SOURCE_ENABLED_ONCE):
"""An internal routine to set the boot device.
:param task: a task from TaskManager.
:param system: a Redfish System object.
:param device: the Redfish boot device.
:param enabled: Redfish boot device persistence value.
:raises: SushyError on an error from the Sushy library
"""
desired_enabled = enabled
current_enabled = system.boot.get('enabled')
# NOTE(etingof): this can be racy, esp if BMC is not RESTful
new_enabled = (desired_enabled
if desired_enabled != current_enabled else None)
use_new_set_system_boot = True
try:
try:
system.set_system_boot_options(device, enabled=new_enabled)
except AttributeError:
new_enabled = enabled
use_new_set_system_boot = False
system.set_system_boot_source(device, enabled=new_enabled)
except sushy.exceptions.SushyError as e:
if new_enabled == sushy.BOOT_SOURCE_ENABLED_CONTINUOUS:
# NOTE(dtantsur): continuous boot device settings have been
# removed from Redfish, and some vendors stopped supporting
# it before an alternative was provided. As a work around,
# use one-time boot and restore the boot device on every
# reboot via RedfishPower.
LOG.debug('Error %(error)s when trying to set a '
'persistent boot device on node %(node)s, '
'falling back to one-time boot settings',
{'error': e, 'node': task.node.uuid})
if use_new_set_system_boot:
system.set_system_boot_options(
device, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
else:
system.set_system_boot_source(
device, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
LOG.warning('Could not set persistent boot device to '
'%(dev)s for node %(node)s, using one-time '
'boot device instead',
{'dev': device, 'node': task.node.uuid})
utils.set_node_nested_field(
task.node, 'driver_internal_info',
'redfish_boot_device', device)
task.node.save()
else:
raise
class RedfishManagement(base.ManagementInterface):
def __init__(self):
@ -95,6 +153,33 @@ class RedfishManagement(base.ManagementInterface):
"""
return list(BOOT_DEVICE_MAP_REV)
@task_manager.require_exclusive_lock
def restore_boot_device(self, task, system):
"""Restore boot device if needed.
Checks the redfish_boot_device internal flag and sets the one-time
boot device accordingly. A warning is issued if it fails.
This method is supposed to be called from the Redfish power interface
and should be considered private to the Redfish hardware type.
:param task: a task from TaskManager.
:param system: a Redfish System object.
"""
device = task.node.driver_internal_info.get('redfish_boot_device')
if not device:
return
LOG.debug('Restoring boot device %(dev)s on node %(node)s',
{'dev': device, 'node': task.node.uuid})
try:
_set_boot_device(task, system, device)
except sushy.exceptions.SushyError as e:
LOG.warning('Unable to recover boot device %(dev)s for node '
'%(node)s, relying on the pre-configured boot order. '
'Error: %(error)s',
{'dev': device, 'node': task.node.uuid, 'error': e})
@task_manager.require_exclusive_lock
def set_boot_device(self, task, device, persistent=False):
"""Set the boot device for a node.
@ -112,24 +197,16 @@ class RedfishManagement(base.ManagementInterface):
:raises: RedfishConnectionError when it fails to connect to Redfish
:raises: RedfishError on an error from the Sushy library
"""
utils.pop_node_nested_field(
task.node, 'driver_internal_info', 'redfish_boot_device')
task.node.save()
system = redfish_utils.get_system(task.node)
desired_persistence = BOOT_DEVICE_PERSISTENT_MAP_REV[persistent]
current_persistence = system.boot.get('enabled')
# NOTE(etingof): this can be racy, esp if BMC is not RESTful
enabled = (desired_persistence
if desired_persistence != current_persistence else None)
try:
try:
system.set_system_boot_options(
BOOT_DEVICE_MAP_REV[device], enabled=enabled)
except AttributeError:
system.set_system_boot_source(
BOOT_DEVICE_MAP_REV[device],
enabled=BOOT_DEVICE_PERSISTENT_MAP_REV[persistent])
_set_boot_device(
task, system, BOOT_DEVICE_MAP_REV[device],
enabled=BOOT_DEVICE_PERSISTENT_MAP_REV[persistent])
except sushy.exceptions.SushyError as e:
error_msg = (_('Redfish set boot device failed for node '
'%(node)s. Error: %(error)s') %

View File

@ -22,6 +22,7 @@ from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as cond_utils
from ironic.drivers import base
from ironic.drivers.modules.redfish import management as redfish_mgmt
from ironic.drivers.modules.redfish import utils as redfish_utils
LOG = log.getLogger(__name__)
@ -123,6 +124,12 @@ class RedfishPower(base.PowerInterface):
:raises: RedfishError on an error from the Sushy library
"""
system = redfish_utils.get_system(task.node)
if (power_state in (states.POWER_ON, states.SOFT_REBOOT, states.REBOOT)
and isinstance(task.driver.management,
redfish_mgmt.RedfishManagement)):
task.driver.management.restore_boot_device(task, system)
try:
_set_power_state(task, system, power_state, timeout=timeout)
except sushy.exceptions.SushyError as e:
@ -151,6 +158,10 @@ class RedfishPower(base.PowerInterface):
next_state = states.POWER_OFF
_set_power_state(task, system, next_state, timeout=timeout)
if isinstance(task.driver.management,
redfish_mgmt.RedfishManagement):
task.driver.management.restore_boot_device(task, system)
next_state = states.POWER_ON
_set_power_state(task, system, next_state, timeout=timeout)
except sushy.exceptions.SushyError as e:

View File

@ -118,6 +118,8 @@ class RedfishManagementTestCase(db_base.DbTestCase):
fake_system.set_system_boot_source.assert_called_once_with(
expected, enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
mock_get_system.assert_called_once_with(task.node)
self.assertNotIn('redfish_boot_device',
task.node.driver_internal_info)
# Reset mocks
fake_system.set_system_boot_options.reset_mock()
@ -141,6 +143,8 @@ class RedfishManagementTestCase(db_base.DbTestCase):
fake_system.set_system_boot_options.assert_called_once_with(
sushy.BOOT_SOURCE_TARGET_PXE, enabled=expected)
mock_get_system.assert_called_once_with(task.node)
self.assertNotIn('redfish_boot_device',
task.node.driver_internal_info)
# Reset mocks
fake_system.set_system_boot_options.reset_mock()
@ -188,6 +192,84 @@ class RedfishManagementTestCase(db_base.DbTestCase):
sushy.BOOT_SOURCE_TARGET_PXE,
enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
mock_get_system.assert_called_once_with(task.node)
self.assertNotIn('redfish_boot_device',
task.node.driver_internal_info)
@mock.patch.object(sushy, 'Sushy', autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_boot_device_persistence_fallback(self, mock_get_system,
mock_sushy):
fake_system = mock.Mock()
fake_system.set_system_boot_options.side_effect = [
sushy.exceptions.SushyError(),
None,
]
mock_get_system.return_value = fake_system
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.management.set_boot_device(
task, boot_devices.PXE, persistent=True)
fake_system.set_system_boot_options.assert_has_calls([
mock.call(sushy.BOOT_SOURCE_TARGET_PXE,
enabled=sushy.BOOT_SOURCE_ENABLED_CONTINUOUS),
mock.call(sushy.BOOT_SOURCE_TARGET_PXE,
enabled=sushy.BOOT_SOURCE_ENABLED_ONCE),
])
mock_get_system.assert_called_once_with(task.node)
task.node.refresh()
self.assertEqual(
sushy.BOOT_SOURCE_TARGET_PXE,
task.node.driver_internal_info['redfish_boot_device'])
def test_restore_boot_device(self):
fake_system = mock.Mock()
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.node.driver_internal_info['redfish_boot_device'] = (
sushy.BOOT_SOURCE_TARGET_HDD
)
task.driver.management.restore_boot_device(task, fake_system)
fake_system.set_system_boot_options.assert_called_once_with(
sushy.BOOT_SOURCE_TARGET_HDD,
enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
# The stored boot device is kept intact
self.assertEqual(
sushy.BOOT_SOURCE_TARGET_HDD,
task.node.driver_internal_info['redfish_boot_device'])
def test_restore_boot_device_noop(self):
fake_system = mock.Mock()
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.management.restore_boot_device(task, fake_system)
self.assertFalse(fake_system.set_system_boot_options.called)
@mock.patch.object(redfish_mgmt.LOG, 'warning', autospec=True)
def test_restore_boot_device_failure(self, mock_log):
fake_system = mock.Mock()
fake_system.set_system_boot_options.side_effect = (
sushy.exceptions.SushyError()
)
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.node.driver_internal_info['redfish_boot_device'] = (
sushy.BOOT_SOURCE_TARGET_HDD
)
task.driver.management.restore_boot_device(task, fake_system)
fake_system.set_system_boot_options.assert_called_once_with(
sushy.BOOT_SOURCE_TARGET_HDD,
enabled=sushy.BOOT_SOURCE_ENABLED_ONCE)
self.assertTrue(mock_log.called)
# The stored boot device is kept intact
self.assertEqual(
sushy.BOOT_SOURCE_TARGET_HDD,
task.node.driver_internal_info['redfish_boot_device'])
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_get_boot_device(self, mock_get_system):

View File

@ -19,6 +19,7 @@ from oslo_utils import importutils
from ironic.common import exception
from ironic.common import states
from ironic.conductor import task_manager
from ironic.drivers.modules.redfish import management as redfish_mgmt
from ironic.drivers.modules.redfish import power as redfish_power
from ironic.drivers.modules.redfish import utils as redfish_utils
from ironic.tests.unit.db import base as db_base
@ -82,19 +83,21 @@ class RedfishPowerTestCase(db_base.DbTestCase):
mock_get_system.assert_called_once_with(task.node)
mock_get_system.reset_mock()
@mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_power_state(self, mock_get_system):
def test_set_power_state(self, mock_get_system, mock_restore_bootdev):
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
expected_values = [
(states.POWER_ON, sushy.RESET_ON),
(states.POWER_OFF, sushy.RESET_FORCE_OFF),
(states.REBOOT, sushy.RESET_FORCE_RESTART),
(states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART),
(states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN)
(states.POWER_ON, sushy.RESET_ON, True),
(states.POWER_OFF, sushy.RESET_FORCE_OFF, False),
(states.REBOOT, sushy.RESET_FORCE_RESTART, True),
(states.SOFT_REBOOT, sushy.RESET_GRACEFUL_RESTART, True),
(states.SOFT_POWER_OFF, sushy.RESET_GRACEFUL_SHUTDOWN, False)
]
for target, expected in expected_values:
for target, expected, restore_bootdev in expected_values:
if target in (states.POWER_OFF, states.SOFT_POWER_OFF):
final = sushy.SYSTEM_POWER_STATE_OFF
transient = sushy.SYSTEM_POWER_STATE_ON
@ -113,9 +116,15 @@ class RedfishPowerTestCase(db_base.DbTestCase):
system_result[0].reset_system.assert_called_once_with(expected)
mock_get_system.assert_called_with(task.node)
self.assertEqual(4, mock_get_system.call_count)
if restore_bootdev:
mock_restore_bootdev.assert_called_once_with(
task.driver.management, task, system_result[0])
else:
self.assertFalse(mock_restore_bootdev.called)
# Reset mocks
mock_get_system.reset_mock()
mock_restore_bootdev.reset_mock()
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_power_state_not_reached(self, mock_get_system):
@ -164,8 +173,11 @@ class RedfishPowerTestCase(db_base.DbTestCase):
sushy.RESET_ON)
mock_get_system.assert_called_once_with(task.node)
@mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_reboot_from_power_off(self, mock_get_system):
def test_reboot_from_power_off(self, mock_get_system,
mock_restore_bootdev):
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
system_result = [
@ -185,9 +197,13 @@ class RedfishPowerTestCase(db_base.DbTestCase):
sushy.RESET_ON)
mock_get_system.assert_called_with(task.node)
self.assertEqual(3, mock_get_system.call_count)
mock_restore_bootdev.assert_called_once_with(
task.driver.management, task, system_result[0])
@mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_reboot_from_power_on(self, mock_get_system):
def test_reboot_from_power_on(self, mock_get_system, mock_restore_bootdev):
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
system_result = [
@ -209,6 +225,8 @@ class RedfishPowerTestCase(db_base.DbTestCase):
])
mock_get_system.assert_called_with(task.node)
self.assertEqual(3, mock_get_system.call_count)
mock_restore_bootdev.assert_called_once_with(
task.driver.management, task, system_result[0])
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_reboot_not_reached(self, mock_get_system):

View File

@ -0,0 +1,18 @@
---
fixes:
- |
Provides a workaround for hardware that does not support persistent boot
device setting with the ``redfish`` hardware type. When such situation is
detected, ironic will fall back to one-time boot device setting, restoring
it on every reboot.
issues:
- |
Some redfish-enabled hardware is known not to support persistent boot
device setting that is used by the Bare Metal service for deployed
instances. The ``redfish`` hardware type tries to work around this problem,
but rebooting such an instance in-band may cause it to boot incorrectly.
A predictable boot order should be configured in the node's boot firmware
to avoid issues and at least metadata cleaning must be enabled.
See `this mailing list thread
<http://lists.openstack.org/pipermail/openstack-discuss/2020-April/014543.html>`_
for technical details.