Skip initial reboot to IPA when updating firmware out-of-band

This change enables Ironic to skip initial reboot to IPA when
performing out-of-band firmware.

Change-Id: Id055a4ddbde3dbe336717e5f06ca6eb024b90c9f
Signed-off-by: Jacob Anders <janders@redhat.com>
This commit is contained in:
Jacob Anders
2025-07-08 16:25:40 +10:00
parent 276937a571
commit 1b2c01185c
6 changed files with 132 additions and 47 deletions

View File

@@ -38,6 +38,7 @@ def do_node_service(task, service_steps=None, disable_ramdisk=False):
:param disable_ramdisk: Whether to skip booting ramdisk for servicing.
"""
node = task.node
ramdisk_needed = None
try:
# NOTE(ghe): Valid power and network values are needed to perform
# a service operation.
@@ -54,42 +55,53 @@ def do_node_service(task, service_steps=None, disable_ramdisk=False):
node.set_driver_internal_info('service_disable_ramdisk',
disable_ramdisk)
task.node.save()
utils.node_update_cache(task)
# Allow the deploy driver to set up the ramdisk again (necessary for IPA)
try:
if not disable_ramdisk:
prepare_result = task.driver.deploy.prepare_service(task)
else:
LOG.info('Skipping preparing for service in-band service since '
'out-of-band only service has been requested for node '
'%s', node.uuid)
prepare_result = None
except Exception as e:
msg = (_('Failed to prepare node %(node)s for service: %(e)s')
% {'node': node.uuid, 'e': e})
return utils.servicing_error_handler(task, msg, traceback=True)
if prepare_result == states.SERVICEWAIT:
# Prepare is asynchronous, the deploy driver will need to
# set node.driver_internal_info['service_steps'] and
# node.service_step and then make an RPC call to
# continue_node_service to start service operations.
task.process_event('wait')
return
try:
conductor_steps.set_node_service_steps(
task, disable_ramdisk=disable_ramdisk)
except exception.InvalidParameterValue:
if disable_ramdisk:
# NOTE(janders) raising log severity since this will result
# in servicing failure
LOG.error('Failed to compose list of service steps for node '
'%s', node.uuid)
raise
LOG.debug('Unable to compose list of service steps for node %(node)s '
'will retry once ramdisk is booted.', {'node': node.uuid})
ramdisk_needed = True
except Exception as e:
# Catch all exceptions and follow the error handling
# path so things are cleaned up properly.
msg = (_('Cannot service node %(node)s: %(msg)s')
% {'node': node.uuid, 'msg': e})
return utils.servicing_error_handler(task, msg)
steps = node.driver_internal_info.get('service_steps', [])
step_index = 0 if steps else None
for step in steps:
step_requires_ramdisk = step.get('requires_ramdisk')
if step_requires_ramdisk:
LOG.debug('Found service step %s requiring ramdisk '
'for node %s', step, task.node.uuid)
ramdisk_needed = True
if ramdisk_needed and not disable_ramdisk:
try:
prepare_result = task.driver.deploy.prepare_service(task)
except Exception as e:
msg = (_('Failed to prepare node %(node)s for service: %(e)s')
% {'node': node.uuid, 'e': e})
return utils.servicing_error_handler(task, msg, traceback=True)
if prepare_result == states.SERVICEWAIT:
# Prepare is asynchronous, the deploy driver will need to
# set node.driver_internal_info['service_steps'] and
# node.service_step and then make an RPC call to
# continue_node_service to start service operations.
task.process_event('wait')
return
else:
LOG.debug('Will proceed with servicing node %(node)s '
'without booting the ramdisk.', {'node': node.uuid})
do_next_service_step(task, step_index, disable_ramdisk=disable_ramdisk)

View File

@@ -496,9 +496,11 @@ def set_node_service_steps(task, disable_ramdisk=False):
are accepted.
:raises: InvalidParameterValue if there is a problem with the user's
clean steps.
:raises: NodeCleaningFailure if there was a problem getting the
:raises: NodeServicingFailure if there was a problem getting the
service steps.
"""
# FIXME(janders) it seems NodeServicingFailure is never actually raised,
# perhaps we should remove it?
node = task.node
steps = _validate_user_service_steps(
task, node.driver_internal_info.get('service_steps', []),

View File

@@ -1750,7 +1750,7 @@ def prepare_agent_boot(task):
task.driver.boot.prepare_ramdisk(task, deploy_opts)
def reboot_to_finish_step(task, timeout=None):
def reboot_to_finish_step(task, timeout=None, disable_ramdisk=None):
"""Reboot the node into IPA to finish a deploy/clean step.
:param task: a TaskManager instance.
@@ -1759,8 +1759,14 @@ def reboot_to_finish_step(task, timeout=None):
:returns: states.CLEANWAIT if cleaning operation in progress
or states.DEPLOYWAIT if deploy operation in progress.
"""
disable_ramdisk = task.node.driver_internal_info.get(
'cleaning_disable_ramdisk')
if disable_ramdisk is None:
if task.node.provision_state in [states.CLEANING, states.CLEANWAIT]:
disable_ramdisk = task.node.driver_internal_info.get(
'cleaning_disable_ramdisk')
elif task.node.provision_state in [states.SERVICING,
states.SERVICEWAIT]:
disable_ramdisk = task.node.driver_internal_info.get(
'service_disable_ramdisk')
if not disable_ramdisk:
if (manager_utils.is_fast_track(task)
and not task.node.disable_power_off):

View File

@@ -146,7 +146,7 @@ class RedfishFirmware(base.FirmwareInterface):
requires_ramdisk=True)
@base.service_step(priority=0, abortable=False,
argsinfo=_FW_SETTINGS_ARGSINFO,
requires_ramdisk=True)
requires_ramdisk=False)
@base.cache_firmware_components
def update(self, task, settings):
"""Update the Firmware on the node using the settings for components.
@@ -181,7 +181,8 @@ class RedfishFirmware(base.FirmwareInterface):
polling=True
)
return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval)
return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval,
disable_ramdisk=True)
def _execute_firmware_update(self, node, update_service, settings):
"""Executes the next firmware update to the node

View File

@@ -109,14 +109,21 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
mock_validate.side_effect = exception.NetworkError()
self.__do_node_service_validate_fail(mock_validate)
@mock.patch.object(conductor_steps, 'set_node_service_steps',
autospec=True)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
autospec=True)
def test__do_node_service_prepare_service_fail(self, mock_prep,
mock_validate,
service_steps=None):
# Exception from task.driver.deploy.prepare_cleaning should cause node
mock_steps,
service_steps=[]):
# NOTE(janders) after removing unconditional initial reboot into
# ramdisk, set_node_service_steps needs to InvalidParameterValue
# to force boot into ramdisk
mock_steps.side_effect = exception.InvalidParameterValue('error')
# Exception from task.driver.deploy.prepare_service should cause node
# to go to SERVICEFAIL
mock_prep.side_effect = exception.InvalidParameterValue('error')
tgt_prov_state = states.ACTIVE
@@ -135,17 +142,76 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
self.assertFalse(node.maintenance)
self.assertIsNone(node.fault)
@mock.patch.object(conductor_steps, 'set_node_service_steps',
autospec=True)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
autospec=True)
def test__do_node_service_prepare_service_wait(self, mock_prep,
mock_validate):
mock_validate,
mock_steps):
service_steps = [
{'step': 'trigger_servicewait', 'priority': 10,
'interface': 'vendor'}
]
mock_steps.side_effect = exception.InvalidParameterValue('error')
mock_prep.return_value = states.SERVICEWAIT
tgt_prov_state = states.ACTIVE
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.SERVICING,
target_provision_state=tgt_prov_state,
vendor_interface='fake')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
servicing.do_node_service(task, service_steps=service_steps)
node.refresh()
self.assertEqual(states.SERVICEWAIT, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
mock_prep.assert_called_once_with(mock.ANY, mock.ANY)
mock_validate.assert_called_once_with(mock.ANY, mock.ANY)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
autospec=True)
def test__do_node_service_out_of_band(self, mock_prep,
mock_validate):
# NOTE(janders) this test ensures ramdisk isn't prepared if
# steps do not require it
service_steps = [
{'step': 'trigger_servicewait', 'priority': 10,
'interface': 'vendor'}
]
mock_prep.return_value = states.SERVICEWAIT
tgt_prov_state = states.ACTIVE
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.SERVICING,
target_provision_state=tgt_prov_state,
vendor_interface='fake')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
servicing.do_node_service(task, service_steps=service_steps)
node.refresh()
self.assertEqual(states.SERVICEWAIT, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
mock_prep.assert_not_called()
mock_validate.assert_called_once_with(mock.ANY, mock.ANY)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
autospec=True)
def test__do_node_service_requires_ramdisk_fallback(self, mock_prep,
mock_validate):
# NOTE(janders): here we set 'requires_ramdisk': 'True'
# to force ramdisk_needed to be set and trigger prepare ramdisk
service_steps = [
{'step': 'trigger_servicewait', 'priority': 10,
'interface': 'vendor', 'requires_ramdisk': 'True'}
]
mock_prep.return_value = states.SERVICEWAIT
tgt_prov_state = states.ACTIVE
node = obj_utils.create_test_node(
@@ -194,11 +260,8 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
@mock.patch.object(conductor_steps, 'set_node_service_steps',
autospec=True)
def __do_node_service_steps_fail(self, mock_steps, mock_validate,
service_steps=None, invalid_exc=True):
if invalid_exc:
mock_steps.side_effect = exception.InvalidParameterValue('invalid')
else:
mock_steps.side_effect = exception.NodeCleaningFailure('failure')
service_steps=None):
mock_steps.side_effect = exception.NodeCleaningFailure('failure')
tgt_prov_state = states.ACTIVE
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
@@ -224,12 +287,8 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
def test_do_node_service_steps_fail_poweroff(self, mock_steps,
mock_validate,
mock_power,
service_steps=None,
invalid_exc=True):
if invalid_exc:
mock_steps.side_effect = exception.InvalidParameterValue('invalid')
else:
mock_steps.side_effect = exception.NodeCleaningFailure('failure')
service_steps=None):
mock_steps.side_effect = exception.NodeCleaningFailure('failure')
tgt_prov_state = states.ACTIVE
self.config(poweroff_in_cleanfail=True, group='conductor')
node = obj_utils.create_test_node(
@@ -249,9 +308,7 @@ class DoNodeServiceTestCase(db_base.DbTestCase):
self.assertFalse(mock_power.called)
def test__do_node_service_steps_fail(self):
for invalid in (True, False):
self.__do_node_service_steps_fail(service_steps=[self.deploy_raid],
invalid_exc=invalid)
self.__do_node_service_steps_fail(service_steps=[self.deploy_raid])
@mock.patch.object(conductor_steps, 'set_node_service_steps',
autospec=True)

View File

@@ -0,0 +1,7 @@
---
fixes:
- |
Removes initial, unconditional reboot into ramdisk during servicing when
not required by specified service steps. This reduces the total number of
reboots needed when performing servicing, speeding up the process.