Skip initial reboot to IPA when updating firmware out-of-band

This change enables Ironic to skip initial reboot to IPA when
performing out-of-band firmware.

Change-Id: Id055a4ddbde3dbe336717e5f06ca6eb024b90c9f
Signed-off-by: Jacob Anders <janders@redhat.com>
This commit is contained in:
Jacob Anders
2025-07-08 16:25:40 +10:00
parent 276937a571
commit 1b2c01185c
6 changed files with 132 additions and 47 deletions

View File

@@ -38,6 +38,7 @@ def do_node_service(task, service_steps=None, disable_ramdisk=False):
:param disable_ramdisk: Whether to skip booting ramdisk for servicing. :param disable_ramdisk: Whether to skip booting ramdisk for servicing.
""" """
node = task.node node = task.node
ramdisk_needed = None
try: try:
# NOTE(ghe): Valid power and network values are needed to perform # NOTE(ghe): Valid power and network values are needed to perform
# a service operation. # a service operation.
@@ -54,42 +55,53 @@ def do_node_service(task, service_steps=None, disable_ramdisk=False):
node.set_driver_internal_info('service_disable_ramdisk', node.set_driver_internal_info('service_disable_ramdisk',
disable_ramdisk) disable_ramdisk)
task.node.save() task.node.save()
utils.node_update_cache(task) utils.node_update_cache(task)
# Allow the deploy driver to set up the ramdisk again (necessary for IPA)
try:
if not disable_ramdisk:
prepare_result = task.driver.deploy.prepare_service(task)
else:
LOG.info('Skipping preparing for service in-band service since '
'out-of-band only service has been requested for node '
'%s', node.uuid)
prepare_result = None
except Exception as e:
msg = (_('Failed to prepare node %(node)s for service: %(e)s')
% {'node': node.uuid, 'e': e})
return utils.servicing_error_handler(task, msg, traceback=True)
if prepare_result == states.SERVICEWAIT:
# Prepare is asynchronous, the deploy driver will need to
# set node.driver_internal_info['service_steps'] and
# node.service_step and then make an RPC call to
# continue_node_service to start service operations.
task.process_event('wait')
return
try: try:
conductor_steps.set_node_service_steps( conductor_steps.set_node_service_steps(
task, disable_ramdisk=disable_ramdisk) task, disable_ramdisk=disable_ramdisk)
except exception.InvalidParameterValue:
if disable_ramdisk:
# NOTE(janders) raising log severity since this will result
# in servicing failure
LOG.error('Failed to compose list of service steps for node '
'%s', node.uuid)
raise
LOG.debug('Unable to compose list of service steps for node %(node)s '
'will retry once ramdisk is booted.', {'node': node.uuid})
ramdisk_needed = True
except Exception as e: except Exception as e:
# Catch all exceptions and follow the error handling
# path so things are cleaned up properly.
msg = (_('Cannot service node %(node)s: %(msg)s') msg = (_('Cannot service node %(node)s: %(msg)s')
% {'node': node.uuid, 'msg': e}) % {'node': node.uuid, 'msg': e})
return utils.servicing_error_handler(task, msg) return utils.servicing_error_handler(task, msg)
steps = node.driver_internal_info.get('service_steps', []) steps = node.driver_internal_info.get('service_steps', [])
step_index = 0 if steps else None step_index = 0 if steps else None
for step in steps:
step_requires_ramdisk = step.get('requires_ramdisk')
if step_requires_ramdisk:
LOG.debug('Found service step %s requiring ramdisk '
'for node %s', step, task.node.uuid)
ramdisk_needed = True
if ramdisk_needed and not disable_ramdisk:
try:
prepare_result = task.driver.deploy.prepare_service(task)
except Exception as e:
msg = (_('Failed to prepare node %(node)s for service: %(e)s')
% {'node': node.uuid, 'e': e})
return utils.servicing_error_handler(task, msg, traceback=True)
if prepare_result == states.SERVICEWAIT:
# Prepare is asynchronous, the deploy driver will need to
# set node.driver_internal_info['service_steps'] and
# node.service_step and then make an RPC call to
# continue_node_service to start service operations.
task.process_event('wait')
return
else:
LOG.debug('Will proceed with servicing node %(node)s '
'without booting the ramdisk.', {'node': node.uuid})
do_next_service_step(task, step_index, disable_ramdisk=disable_ramdisk) do_next_service_step(task, step_index, disable_ramdisk=disable_ramdisk)

View File

@@ -496,9 +496,11 @@ def set_node_service_steps(task, disable_ramdisk=False):
are accepted. are accepted.
:raises: InvalidParameterValue if there is a problem with the user's :raises: InvalidParameterValue if there is a problem with the user's
clean steps. clean steps.
:raises: NodeCleaningFailure if there was a problem getting the :raises: NodeServicingFailure if there was a problem getting the
service steps. service steps.
""" """
# FIXME(janders) it seems NodeServicingFailure is never actually raised,
# perhaps we should remove it?
node = task.node node = task.node
steps = _validate_user_service_steps( steps = _validate_user_service_steps(
task, node.driver_internal_info.get('service_steps', []), task, node.driver_internal_info.get('service_steps', []),

View File

@@ -1750,7 +1750,7 @@ def prepare_agent_boot(task):
task.driver.boot.prepare_ramdisk(task, deploy_opts) task.driver.boot.prepare_ramdisk(task, deploy_opts)
def reboot_to_finish_step(task, timeout=None): def reboot_to_finish_step(task, timeout=None, disable_ramdisk=None):
"""Reboot the node into IPA to finish a deploy/clean step. """Reboot the node into IPA to finish a deploy/clean step.
:param task: a TaskManager instance. :param task: a TaskManager instance.
@@ -1759,8 +1759,14 @@ def reboot_to_finish_step(task, timeout=None):
:returns: states.CLEANWAIT if cleaning operation in progress :returns: states.CLEANWAIT if cleaning operation in progress
or states.DEPLOYWAIT if deploy operation in progress. or states.DEPLOYWAIT if deploy operation in progress.
""" """
disable_ramdisk = task.node.driver_internal_info.get( if disable_ramdisk is None:
'cleaning_disable_ramdisk') if task.node.provision_state in [states.CLEANING, states.CLEANWAIT]:
disable_ramdisk = task.node.driver_internal_info.get(
'cleaning_disable_ramdisk')
elif task.node.provision_state in [states.SERVICING,
states.SERVICEWAIT]:
disable_ramdisk = task.node.driver_internal_info.get(
'service_disable_ramdisk')
if not disable_ramdisk: if not disable_ramdisk:
if (manager_utils.is_fast_track(task) if (manager_utils.is_fast_track(task)
and not task.node.disable_power_off): and not task.node.disable_power_off):

View File

@@ -146,7 +146,7 @@ class RedfishFirmware(base.FirmwareInterface):
requires_ramdisk=True) requires_ramdisk=True)
@base.service_step(priority=0, abortable=False, @base.service_step(priority=0, abortable=False,
argsinfo=_FW_SETTINGS_ARGSINFO, argsinfo=_FW_SETTINGS_ARGSINFO,
requires_ramdisk=True) requires_ramdisk=False)
@base.cache_firmware_components @base.cache_firmware_components
def update(self, task, settings): def update(self, task, settings):
"""Update the Firmware on the node using the settings for components. """Update the Firmware on the node using the settings for components.
@@ -181,7 +181,8 @@ class RedfishFirmware(base.FirmwareInterface):
polling=True polling=True
) )
return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval) return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval,
disable_ramdisk=True)
def _execute_firmware_update(self, node, update_service, settings): def _execute_firmware_update(self, node, update_service, settings):
"""Executes the next firmware update to the node """Executes the next firmware update to the node

View File

@@ -109,14 +109,21 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
mock_validate.side_effect = exception.NetworkError() mock_validate.side_effect = exception.NetworkError()
self.__do_node_service_validate_fail(mock_validate) self.__do_node_service_validate_fail(mock_validate)
@mock.patch.object(conductor_steps, 'set_node_service_steps',
autospec=True)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True) autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
autospec=True) autospec=True)
def test__do_node_service_prepare_service_fail(self, mock_prep, def test__do_node_service_prepare_service_fail(self, mock_prep,
mock_validate, mock_validate,
service_steps=None): mock_steps,
# Exception from task.driver.deploy.prepare_cleaning should cause node service_steps=[]):
# NOTE(janders) after removing unconditional initial reboot into
# ramdisk, set_node_service_steps needs to InvalidParameterValue
# to force boot into ramdisk
mock_steps.side_effect = exception.InvalidParameterValue('error')
# Exception from task.driver.deploy.prepare_service should cause node
# to go to SERVICEFAIL # to go to SERVICEFAIL
mock_prep.side_effect = exception.InvalidParameterValue('error') mock_prep.side_effect = exception.InvalidParameterValue('error')
tgt_prov_state = states.ACTIVE tgt_prov_state = states.ACTIVE
@@ -135,17 +142,76 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
self.assertFalse(node.maintenance) self.assertFalse(node.maintenance)
self.assertIsNone(node.fault) self.assertIsNone(node.fault)
@mock.patch.object(conductor_steps, 'set_node_service_steps',
autospec=True)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True) autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
autospec=True) autospec=True)
def test__do_node_service_prepare_service_wait(self, mock_prep, def test__do_node_service_prepare_service_wait(self, mock_prep,
mock_validate): mock_validate,
mock_steps):
service_steps = [ service_steps = [
{'step': 'trigger_servicewait', 'priority': 10, {'step': 'trigger_servicewait', 'priority': 10,
'interface': 'vendor'} 'interface': 'vendor'}
] ]
mock_steps.side_effect = exception.InvalidParameterValue('error')
mock_prep.return_value = states.SERVICEWAIT
tgt_prov_state = states.ACTIVE
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.SERVICING,
target_provision_state=tgt_prov_state,
vendor_interface='fake')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
servicing.do_node_service(task, service_steps=service_steps)
node.refresh()
self.assertEqual(states.SERVICEWAIT, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
mock_prep.assert_called_once_with(mock.ANY, mock.ANY)
mock_validate.assert_called_once_with(mock.ANY, mock.ANY)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
autospec=True)
def test__do_node_service_out_of_band(self, mock_prep,
mock_validate):
# NOTE(janders) this test ensures ramdisk isn't prepared if
# steps do not require it
service_steps = [
{'step': 'trigger_servicewait', 'priority': 10,
'interface': 'vendor'}
]
mock_prep.return_value = states.SERVICEWAIT
tgt_prov_state = states.ACTIVE
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.SERVICING,
target_provision_state=tgt_prov_state,
vendor_interface='fake')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
servicing.do_node_service(task, service_steps=service_steps)
node.refresh()
self.assertEqual(states.SERVICEWAIT, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
mock_prep.assert_not_called()
mock_validate.assert_called_once_with(mock.ANY, mock.ANY)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
autospec=True)
def test__do_node_service_requires_ramdisk_fallback(self, mock_prep,
mock_validate):
# NOTE(janders): here we set 'requires_ramdisk': 'True'
# to force ramdisk_needed to be set and trigger prepare ramdisk
service_steps = [
{'step': 'trigger_servicewait', 'priority': 10,
'interface': 'vendor', 'requires_ramdisk': 'True'}
]
mock_prep.return_value = states.SERVICEWAIT mock_prep.return_value = states.SERVICEWAIT
tgt_prov_state = states.ACTIVE tgt_prov_state = states.ACTIVE
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
@@ -194,11 +260,8 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
@mock.patch.object(conductor_steps, 'set_node_service_steps', @mock.patch.object(conductor_steps, 'set_node_service_steps',
autospec=True) autospec=True)
def __do_node_service_steps_fail(self, mock_steps, mock_validate, def __do_node_service_steps_fail(self, mock_steps, mock_validate,
service_steps=None, invalid_exc=True): service_steps=None):
if invalid_exc: mock_steps.side_effect = exception.NodeCleaningFailure('failure')
mock_steps.side_effect = exception.InvalidParameterValue('invalid')
else:
mock_steps.side_effect = exception.NodeCleaningFailure('failure')
tgt_prov_state = states.ACTIVE tgt_prov_state = states.ACTIVE
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake-hardware', self.context, driver='fake-hardware',
@@ -224,12 +287,8 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
def test_do_node_service_steps_fail_poweroff(self, mock_steps, def test_do_node_service_steps_fail_poweroff(self, mock_steps,
mock_validate, mock_validate,
mock_power, mock_power,
service_steps=None, service_steps=None):
invalid_exc=True): mock_steps.side_effect = exception.NodeCleaningFailure('failure')
if invalid_exc:
mock_steps.side_effect = exception.InvalidParameterValue('invalid')
else:
mock_steps.side_effect = exception.NodeCleaningFailure('failure')
tgt_prov_state = states.ACTIVE tgt_prov_state = states.ACTIVE
self.config(poweroff_in_cleanfail=True, group='conductor') self.config(poweroff_in_cleanfail=True, group='conductor')
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
@@ -249,9 +308,7 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
self.assertFalse(mock_power.called) self.assertFalse(mock_power.called)
def test__do_node_service_steps_fail(self): def test__do_node_service_steps_fail(self):
for invalid in (True, False): self.__do_node_service_steps_fail(service_steps=[self.deploy_raid])
self.__do_node_service_steps_fail(service_steps=[self.deploy_raid],
invalid_exc=invalid)
@mock.patch.object(conductor_steps, 'set_node_service_steps', @mock.patch.object(conductor_steps, 'set_node_service_steps',
autospec=True) autospec=True)

View File

@@ -0,0 +1,7 @@
---
fixes:
- |
Removes initial, unconditional reboot into ramdisk during servicing when
not required by specified service steps. This reduces the total number of
reboots needed when performing servicing, speeding up the process.