From 1eda80766c321c607e69af02bfbeee8943ab5df2 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Fri, 8 Aug 2025 14:37:06 -0700 Subject: [PATCH] Fix the ability to escape service fail Back when we developed service, we expected operators to iterate to fix their issues, but we also put in abort code. We just never wired in the abort code to the abort verb. It really seems like we really should have done that, and this change changes API and Conductor code path to make this happen. Closes-Bug: 2119989 Assisted-By: Claude Clode - Claude Sonnet 4 Change-Id: Ic02ba87485a676e77563057427ab94953bea2cc2 Signed-off-by: Julia Kreger --- ironic/common/states.py | 3 ++ ironic/conductor/manager.py | 19 ++++++++- ironic/conductor/servicing.py | 12 ++++++ ironic/conductor/utils.py | 8 ++++ ironic/drivers/modules/deploy_utils.py | 6 +++ .../unit/api/controllers/v1/test_node.py | 21 ++++++++++ ironic/tests/unit/conductor/test_manager.py | 31 ++++++++++++++ ironic/tests/unit/conductor/test_servicing.py | 41 +++++++++++++++++++ ironic/tests/unit/conductor/test_utils.py | 19 ++++++--- .../drivers/modules/drac/test_management.py | 4 +- .../unit/drivers/modules/redfish/test_raid.py | 10 +++++ .../unit/drivers/modules/test_deploy_utils.py | 16 ++++++++ .../servicefail-abort-8ca7a7498321b67c.yaml | 7 ++++ 13 files changed, 190 insertions(+), 7 deletions(-) create mode 100644 releasenotes/notes/servicefail-abort-8ca7a7498321b67c.yaml diff --git a/ironic/common/states.py b/ironic/common/states.py index a7cb822adf..7f5aff509f 100644 --- a/ironic/common/states.py +++ b/ironic/common/states.py @@ -676,5 +676,8 @@ machine.add_transition(SERVICEFAIL, SERVICEHOLD, 'hold') # A node in service fail may be deleted. machine.add_transition(SERVICEFAIL, DELETING, 'delete') +# A node in service fail may be aborted (returned to active) +machine.add_transition(SERVICEFAIL, ACTIVE, 'abort') + # A node in service wait may be deleted. machine.add_transition(SERVICEWAIT, DELETING, 'delete') diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index b4ca59f1da..8def4675a9 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -1440,7 +1440,8 @@ class ConductorManager(base_manager.BaseConductorManager): states.RESCUEWAIT, states.INSPECTWAIT, states.CLEANHOLD, - states.DEPLOYHOLD)): + states.DEPLOYHOLD, + states.SERVICEFAIL)): self._do_abort(task) return @@ -1532,6 +1533,22 @@ class ConductorManager(base_manager.BaseConductorManager): task.node.provision_state), err_handler=utils.provisioning_error_handler) + if node.provision_state == states.SERVICEFAIL: + if task.node.maintenance: + msg = (_('Can not abort service on node "%(node)s" while it ' + 'is in maintenance mode. Please remove the node ' + 'from maintenance prior to issuing the request to ' + 'abort the service operation.') % + {'node': node.uuid}) + raise exception.InvalidState(msg) + # When someone is in service fail, its okay to abort. + task.process_event( + 'abort', + callback=self._spawn_worker, + call_args=(servicing.do_node_service_abort, + task), + err_handler=utils.provisioning_error_handler) + @METRICS.timer('ConductorManager._sync_power_states') @periodics.periodic(spacing=CONF.conductor.sync_power_state_interval, enabled=CONF.conductor.sync_power_state_interval > 0) diff --git a/ironic/conductor/servicing.py b/ironic/conductor/servicing.py index ad2d2b326c..78350eed08 100644 --- a/ironic/conductor/servicing.py +++ b/ironic/conductor/servicing.py @@ -336,6 +336,11 @@ def get_last_error(node): return last_error +def _was_agent_attempted(task): + """Check if agent start was attempted (vs. successfully heartbeated).""" + return task.node.driver_internal_info.get('agent_start_attempted', False) + + @task_manager.require_exclusive_lock def do_node_service_abort(task): """Internal method to abort an ongoing operation. @@ -344,7 +349,14 @@ def do_node_service_abort(task): """ node = task.node try: + # Use attempt-based logic instead of heartbeat-based + agent_attempted = _was_agent_attempted(task) + if agent_attempted: + task.driver.power.set_power_state(task, states.POWER_OFF) task.driver.deploy.tear_down_service(task) + if agent_attempted: + task.driver.boot.prepare_instance(task) + task.driver.power.set_power_state(task, states.POWER_ON) except Exception as e: log_msg = (_('Failed to tear down service for node %(node)s ' 'after aborting the operation. Error: %(err)s') % diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index f8177f3a60..ccdb778ffc 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -645,6 +645,8 @@ def wipe_internal_info_on_power_off(node): node.del_driver_internal_info('agent_cached_clean_steps') # Remove TLS certificate since it's regenerated on each run. node.del_driver_internal_info('agent_verify_ca') + # Remove agent start attempt tracking since it doesn't survive power off. + node.del_driver_internal_info('agent_start_attempted') def wipe_token_and_url(task): @@ -671,6 +673,8 @@ def wipe_deploy_internal_info(task): node.del_driver_internal_info('deploy_step_index') node.del_driver_internal_info('steps_validated') node.del_driver_internal_info('image_source') + # Remove agent start attempt tracking since deployment is complete/aborted + node.del_driver_internal_info('agent_start_attempted') async_steps.remove_node_flags(node) @@ -685,6 +689,8 @@ def wipe_cleaning_internal_info(task): node.del_driver_internal_info('cleaning_disable_ramdisk') node.del_driver_internal_info('steps_validated') node.del_driver_internal_info('declarative_cleaning') + # Remove agent start attempt tracking since cleaning is complete/aborted + node.del_driver_internal_info('agent_start_attempted') async_steps.remove_node_flags(node) @@ -698,6 +704,8 @@ def wipe_service_internal_info(task): node.del_driver_internal_info('service_disable_ramdisk') node.del_driver_internal_info('steps_validated') node.del_driver_internal_info('image_source') + # Remove agent start attempt tracking since servicing is complete/aborted + node.del_driver_internal_info('agent_start_attempted') async_steps.remove_node_flags(node) diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index f501b332a4..f3fcb01024 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -1746,6 +1746,12 @@ def prepare_agent_boot(task): :param task: a TaskManager instance. """ + # Record that we're attempting to start the agent, because this + # is the *central* location where we trigger ramdisk launches + # for actions outside of inspection. + task.node.set_driver_internal_info('agent_start_attempted', True) + task.node.save() + deploy_opts = build_agent_options(task.node) task.driver.boot.prepare_ramdisk(task, deploy_opts) diff --git a/ironic/tests/unit/api/controllers/v1/test_node.py b/ironic/tests/unit/api/controllers/v1/test_node.py index 2854d69e70..5d7ed6f243 100644 --- a/ironic/tests/unit/api/controllers/v1/test_node.py +++ b/ironic/tests/unit/api/controllers/v1/test_node.py @@ -6229,6 +6229,27 @@ ORHMKeXMO8fcK0By7CiMKwHSXCoEQgfQhWwpMdSsO8LgHCjh87DQc= """ self.assertEqual(urlparse.urlparse(ret.location).path, expected_location) + @mock.patch.object(rpcapi.ConductorAPI, 'do_provisioning_action', + autospec=True) + def test_provision_with_abort_after_service_failed(self, + mock_dpa): + node = self.node + node.provision_state = states.SERVICEFAIL + node.target_provision_state = states.ACTIVE + node.save() + ret = self.put_json('/nodes/%s/states/provision' % node.uuid, + {'target': states.VERBS['abort']}, + headers={api_base.Version.string: "1.13"}) + self.assertEqual(http_client.ACCEPTED, ret.status_code) + self.assertEqual(b'', ret.body) + # Check location header + self.assertIsNotNone(ret.location) + expected_location = '/v1/nodes/%s/states' % node.uuid + self.assertEqual(urlparse.urlparse(ret.location).path, + expected_location) + mock_dpa.assert_called_once_with( + mock.ANY, mock.ANY, node.uuid, 'abort', 'test-topic') + def test_provision_with_unprovision_in_service_wait(self): node = self.node node.provision_state = states.SERVICEWAIT diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 283d3023a9..ca9f6654cc 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -2835,6 +2835,37 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin, states.DEPLOYHOLD) self.assertNotIn('agent_url', node.driver_internal_info) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def test_do_provision_action_abort_from_servicefail(self, mock_spawn): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEFAIL, + target_provision_state=states.ACTIVE) + + self._start_service() + self.service.do_provisioning_action(self.context, node.uuid, 'abort') + mock_spawn.assert_called_with( + self.service, + servicing.do_node_service_abort, + mock.ANY) + + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def test_do_provision_action_abort_from_servicefail_maintenance( + self, mock_spawn): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEFAIL, + target_provision_state=states.ACTIVE, + maintenance=True) + + self._start_service() + self.assertRaises( + exception.InvalidState, + self.service.do_provisioning_action, + self.context, node.uuid, 'abort') + @mgr_utils.mock_record_keepalive class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): diff --git a/ironic/tests/unit/conductor/test_servicing.py b/ironic/tests/unit/conductor/test_servicing.py index 9057f0e173..1ababbb1fd 100644 --- a/ironic/tests/unit/conductor/test_servicing.py +++ b/ironic/tests/unit/conductor/test_servicing.py @@ -1017,6 +1017,7 @@ class DoNodeServiceAbortTestCase(db_base.DbTestCase): driver_internal_info={ 'agent_url': 'some url', 'agent_secret_token': 'token', + 'agent_start_attempted': True, 'service_step_index': 2, 'servicing_reboot': True, 'servicing_polling': True, @@ -1087,6 +1088,46 @@ class DoNodeServiceAbortTestCase(db_base.DbTestCase): self.assertTrue(task.node.maintenance) self.assertEqual('service failure', task.node.fault) + @mock.patch.object(fake.FakeBoot, 'prepare_instance', autospec=True) + @mock.patch.object(fake.FakePower, 'set_power_state', autospec=True) + @mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True) + def test_do_node_service_abort_agent_attempted(self, tear_mock, power_mock, + prepare_mock): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEWAIT, + target_provision_state=states.AVAILABLE, + driver_internal_info={ + 'agent_start_attempted': True, + 'service_step_index': 2}) + + with task_manager.acquire(self.context, node.uuid) as task: + servicing.do_node_service_abort(task) + tear_mock.assert_called_once_with(mock.ANY, task) + prepare_mock.assert_called_once_with(mock.ANY, task) + power_mock.assert_has_calls([ + mock.call(mock.ANY, task, 'power off'), + mock.call(mock.ANY, task, 'power on')]) + + @mock.patch.object(fake.FakeBoot, 'prepare_instance', autospec=True) + @mock.patch.object(fake.FakePower, 'set_power_state', autospec=True) + @mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True) + def test_do_node_service_abort_agent_not_attempted(self, tear_mock, + power_mock, + prepare_mock): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEWAIT, + target_provision_state=states.AVAILABLE, + driver_internal_info={ + 'service_step_index': 2}) + + with task_manager.acquire(self.context, node.uuid) as task: + servicing.do_node_service_abort(task) + tear_mock.assert_called_once_with(mock.ANY, task) + prepare_mock.assert_not_called() + power_mock.assert_not_called() + class DoNodeCleanTestChildNodes(db_base.DbTestCase): def setUp(self): diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 344a2acff7..c168f6769c 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -2655,9 +2655,16 @@ class AgentTokenUtilsTestCase(tests_base.TestCase): def test_wipe_deploy_internal_info(self): conductor_utils.add_secret_token(self.node) - self.assertIn('agent_secret_token', self.node.driver_internal_info) + self.node.set_driver_internal_info('agent_start_attempted', True) + dii = self.node.driver_internal_info + self.assertIn('agent_secret_token', dii) + self.assertIn('agent_start_attempted', dii) + conductor_utils.wipe_deploy_internal_info(mock.Mock(node=self.node)) - self.assertNotIn('agent_secret_token', self.node.driver_internal_info) + + dii = self.node.driver_internal_info + self.assertNotIn('agent_secret_token', dii) + self.assertNotIn('agent_start_attempted', dii) def test_is_agent_token_present(self): # This should always be False as the token has not been added yet. @@ -3222,8 +3229,9 @@ class ServiceUtilsTestCase(db_base.DbTestCase): 'servicing_polling': 1, 'service_disable_ramdisk': False, 'skip_current_service_step': False, - 'steps_validated': 'meow' - 'agent_secret_token', + 'steps_validated': 'meow', + 'agent_secret_token': 'token', + 'agent_start_attempted': True, 'image_source': 'image_ref'} self.node.save() not_in_list = ['agent_cached_service_steps', @@ -3232,7 +3240,8 @@ class ServiceUtilsTestCase(db_base.DbTestCase): 'service_disable_ramdisk', 'skip_current_service_step', 'steps_validated', - 'agent_secret_token' + 'agent_secret_token', + 'agent_start_attempted', 'image_source'] with task_manager.acquire(self.context, self.node.id, shared=True) as task: diff --git a/ironic/tests/unit/drivers/modules/drac/test_management.py b/ironic/tests/unit/drivers/modules/drac/test_management.py index 9b4d22369d..087faa5f2f 100644 --- a/ironic/tests/unit/drivers/modules/drac/test_management.py +++ b/ironic/tests/unit/drivers/modules/drac/test_management.py @@ -162,7 +162,8 @@ class DracRedfishManagementTestCase(test_utils.BaseDracTest): mock_get_configuration.return_value = json.loads( '{"oem": {"interface": "idrac-redfish", ' '"data": {"prop1": "value1", "prop2": 2}}}') - + mock_sdii = mock.Mock() + task.node.set_driver_internal_info = mock_sdii result = self.management.import_configuration(task, 'edge') self.assertEqual(states.DEPLOYWAIT, result) @@ -175,6 +176,7 @@ class DracRedfishManagementTestCase(test_utils.BaseDracTest): mock_build_agent_options.assert_called_once_with(task.node) task.driver.boot.prepare_ramdisk.assert_called_once_with( task, deploy_opts) + self.assertEqual(2, mock_sdii.call_count) @mock.patch.object(drac_mgmt.DracRedfishManagement, 'import_configuration', autospec=True) diff --git a/ironic/tests/unit/drivers/modules/redfish/test_raid.py b/ironic/tests/unit/drivers/modules/redfish/test_raid.py index f43605a462..f116cf877a 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_raid.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_raid.py @@ -231,8 +231,10 @@ class RedfishRAIDTestCase(db_base.DbTestCase): self.mock_storage] self.node.target_raid_config = target_raid_config self.node.save() + mock_sdii = mock.Mock() with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: + task.node.set_driver_internal_info = mock_sdii task.driver.raid.create_configuration(task) pre = '/redfish/v1/Systems/1/Storage/1/Drives/' expected_payload = { @@ -251,6 +253,7 @@ class RedfishRAIDTestCase(db_base.DbTestCase): ) # Async operation, raid_config shouldn't be updated yet self.assertEqual({}, task.node.raid_config) + self.assertEqual(2, mock_sdii.call_count) @mock.patch.object(redfish_boot.RedfishVirtualMediaBoot, 'prepare_ramdisk', spec_set=True, autospec=True) @@ -585,8 +588,10 @@ class RedfishRAIDTestCase(db_base.DbTestCase): self.mock_storage] self.node.target_raid_config = target_raid_config self.node.save() + sdii_mock = mock.Mock() with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: + task.node.set_driver_internal_info = sdii_mock task.driver.raid.create_configuration(task) pre = '/redfish/v1/Systems/1/Storage/1/Drives/' expected_payload = { @@ -605,6 +610,11 @@ class RedfishRAIDTestCase(db_base.DbTestCase): ) # Async operation, raid_config shouldn't be updated yet self.assertEqual({}, task.node.raid_config) + sdii_mock.assert_has_calls([ + mock.call('raid_configs', {'operation': 'create', + 'pending': {}, + 'task_monitor_uri': mock.ANY}), + mock.call('agent_start_attempted', True)]) @mock.patch.object(redfish_boot.RedfishVirtualMediaBoot, 'prepare_ramdisk', spec_set=True, autospec=True) diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py index 47df855106..1100e50786 100644 --- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py +++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py @@ -1198,6 +1198,22 @@ class AgentMethodsTestCase(db_base.DbTestCase): options = utils.build_agent_options(self.node) self.assertEqual('https://api-url', options['ipa-api-url']) + @mock.patch.object(utils, 'build_agent_options', autospec=True) + def test_prepare_agent_boot(self, mock_build_opts): + mock_build_opts.return_value = {'ipa-api-url': 'https://api-url'} + + with task_manager.acquire(self.context, self.node.uuid) as task: + mock_pr = mock.Mock() + task.driver.boot.prepare_ramdisk = mock_pr + utils.prepare_agent_boot(task) + # Verify attempt tracking is set + self.assertTrue( + task.node.driver_internal_info.get('agent_start_attempted')) + + # Verify boot preparation was called + mock_pr.assert_called_once_with( + task, {'ipa-api-url': 'https://api-url'}) + def test_direct_deploy_should_convert_raw_image_true(self): cfg.CONF.set_override('force_raw_images', True) cfg.CONF.set_override('stream_raw_images', True, group='agent') diff --git a/releasenotes/notes/servicefail-abort-8ca7a7498321b67c.yaml b/releasenotes/notes/servicefail-abort-8ca7a7498321b67c.yaml new file mode 100644 index 0000000000..675b801754 --- /dev/null +++ b/releasenotes/notes/servicefail-abort-8ca7a7498321b67c.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes the ability to invoke the ``abort`` API verb when a node is in + ``service fail`` state, allowing the user to back out of a failure state. + For more information see + `bug 2119989 `_.