diff --git a/ironic/common/states.py b/ironic/common/states.py index a7cb822adf..7f5aff509f 100644 --- a/ironic/common/states.py +++ b/ironic/common/states.py @@ -676,5 +676,8 @@ machine.add_transition(SERVICEFAIL, SERVICEHOLD, 'hold') # A node in service fail may be deleted. machine.add_transition(SERVICEFAIL, DELETING, 'delete') +# A node in service fail may be aborted (returned to active) +machine.add_transition(SERVICEFAIL, ACTIVE, 'abort') + # A node in service wait may be deleted. machine.add_transition(SERVICEWAIT, DELETING, 'delete') diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index b4ca59f1da..8def4675a9 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -1440,7 +1440,8 @@ class ConductorManager(base_manager.BaseConductorManager): states.RESCUEWAIT, states.INSPECTWAIT, states.CLEANHOLD, - states.DEPLOYHOLD)): + states.DEPLOYHOLD, + states.SERVICEFAIL)): self._do_abort(task) return @@ -1532,6 +1533,22 @@ class ConductorManager(base_manager.BaseConductorManager): task.node.provision_state), err_handler=utils.provisioning_error_handler) + if node.provision_state == states.SERVICEFAIL: + if task.node.maintenance: + msg = (_('Can not abort service on node "%(node)s" while it ' + 'is in maintenance mode. Please remove the node ' + 'from maintenance prior to issuing the request to ' + 'abort the service operation.') % + {'node': node.uuid}) + raise exception.InvalidState(msg) + # When someone is in service fail, its okay to abort. + task.process_event( + 'abort', + callback=self._spawn_worker, + call_args=(servicing.do_node_service_abort, + task), + err_handler=utils.provisioning_error_handler) + @METRICS.timer('ConductorManager._sync_power_states') @periodics.periodic(spacing=CONF.conductor.sync_power_state_interval, enabled=CONF.conductor.sync_power_state_interval > 0) diff --git a/ironic/conductor/servicing.py b/ironic/conductor/servicing.py index ad2d2b326c..78350eed08 100644 --- a/ironic/conductor/servicing.py +++ b/ironic/conductor/servicing.py @@ -336,6 +336,11 @@ def get_last_error(node): return last_error +def _was_agent_attempted(task): + """Check if agent start was attempted (vs. successfully heartbeated).""" + return task.node.driver_internal_info.get('agent_start_attempted', False) + + @task_manager.require_exclusive_lock def do_node_service_abort(task): """Internal method to abort an ongoing operation. @@ -344,7 +349,14 @@ def do_node_service_abort(task): """ node = task.node try: + # Use attempt-based logic instead of heartbeat-based + agent_attempted = _was_agent_attempted(task) + if agent_attempted: + task.driver.power.set_power_state(task, states.POWER_OFF) task.driver.deploy.tear_down_service(task) + if agent_attempted: + task.driver.boot.prepare_instance(task) + task.driver.power.set_power_state(task, states.POWER_ON) except Exception as e: log_msg = (_('Failed to tear down service for node %(node)s ' 'after aborting the operation. Error: %(err)s') % diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index f8177f3a60..ccdb778ffc 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -645,6 +645,8 @@ def wipe_internal_info_on_power_off(node): node.del_driver_internal_info('agent_cached_clean_steps') # Remove TLS certificate since it's regenerated on each run. node.del_driver_internal_info('agent_verify_ca') + # Remove agent start attempt tracking since it doesn't survive power off. + node.del_driver_internal_info('agent_start_attempted') def wipe_token_and_url(task): @@ -671,6 +673,8 @@ def wipe_deploy_internal_info(task): node.del_driver_internal_info('deploy_step_index') node.del_driver_internal_info('steps_validated') node.del_driver_internal_info('image_source') + # Remove agent start attempt tracking since deployment is complete/aborted + node.del_driver_internal_info('agent_start_attempted') async_steps.remove_node_flags(node) @@ -685,6 +689,8 @@ def wipe_cleaning_internal_info(task): node.del_driver_internal_info('cleaning_disable_ramdisk') node.del_driver_internal_info('steps_validated') node.del_driver_internal_info('declarative_cleaning') + # Remove agent start attempt tracking since cleaning is complete/aborted + node.del_driver_internal_info('agent_start_attempted') async_steps.remove_node_flags(node) @@ -698,6 +704,8 @@ def wipe_service_internal_info(task): node.del_driver_internal_info('service_disable_ramdisk') node.del_driver_internal_info('steps_validated') node.del_driver_internal_info('image_source') + # Remove agent start attempt tracking since servicing is complete/aborted + node.del_driver_internal_info('agent_start_attempted') async_steps.remove_node_flags(node) diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index f501b332a4..f3fcb01024 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -1746,6 +1746,12 @@ def prepare_agent_boot(task): :param task: a TaskManager instance. """ + # Record that we're attempting to start the agent, because this + # is the *central* location where we trigger ramdisk launches + # for actions outside of inspection. + task.node.set_driver_internal_info('agent_start_attempted', True) + task.node.save() + deploy_opts = build_agent_options(task.node) task.driver.boot.prepare_ramdisk(task, deploy_opts) diff --git a/ironic/tests/unit/api/controllers/v1/test_node.py b/ironic/tests/unit/api/controllers/v1/test_node.py index 2854d69e70..5d7ed6f243 100644 --- a/ironic/tests/unit/api/controllers/v1/test_node.py +++ b/ironic/tests/unit/api/controllers/v1/test_node.py @@ -6229,6 +6229,27 @@ ORHMKeXMO8fcK0By7CiMKwHSXCoEQgfQhWwpMdSsO8LgHCjh87DQc= """ self.assertEqual(urlparse.urlparse(ret.location).path, expected_location) + @mock.patch.object(rpcapi.ConductorAPI, 'do_provisioning_action', + autospec=True) + def test_provision_with_abort_after_service_failed(self, + mock_dpa): + node = self.node + node.provision_state = states.SERVICEFAIL + node.target_provision_state = states.ACTIVE + node.save() + ret = self.put_json('/nodes/%s/states/provision' % node.uuid, + {'target': states.VERBS['abort']}, + headers={api_base.Version.string: "1.13"}) + self.assertEqual(http_client.ACCEPTED, ret.status_code) + self.assertEqual(b'', ret.body) + # Check location header + self.assertIsNotNone(ret.location) + expected_location = '/v1/nodes/%s/states' % node.uuid + self.assertEqual(urlparse.urlparse(ret.location).path, + expected_location) + mock_dpa.assert_called_once_with( + mock.ANY, mock.ANY, node.uuid, 'abort', 'test-topic') + def test_provision_with_unprovision_in_service_wait(self): node = self.node node.provision_state = states.SERVICEWAIT diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 283d3023a9..ca9f6654cc 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -2835,6 +2835,37 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin, states.DEPLOYHOLD) self.assertNotIn('agent_url', node.driver_internal_info) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def test_do_provision_action_abort_from_servicefail(self, mock_spawn): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEFAIL, + target_provision_state=states.ACTIVE) + + self._start_service() + self.service.do_provisioning_action(self.context, node.uuid, 'abort') + mock_spawn.assert_called_with( + self.service, + servicing.do_node_service_abort, + mock.ANY) + + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def test_do_provision_action_abort_from_servicefail_maintenance( + self, mock_spawn): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEFAIL, + target_provision_state=states.ACTIVE, + maintenance=True) + + self._start_service() + self.assertRaises( + exception.InvalidState, + self.service.do_provisioning_action, + self.context, node.uuid, 'abort') + @mgr_utils.mock_record_keepalive class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): diff --git a/ironic/tests/unit/conductor/test_servicing.py b/ironic/tests/unit/conductor/test_servicing.py index 9057f0e173..1ababbb1fd 100644 --- a/ironic/tests/unit/conductor/test_servicing.py +++ b/ironic/tests/unit/conductor/test_servicing.py @@ -1017,6 +1017,7 @@ class DoNodeServiceAbortTestCase(db_base.DbTestCase): driver_internal_info={ 'agent_url': 'some url', 'agent_secret_token': 'token', + 'agent_start_attempted': True, 'service_step_index': 2, 'servicing_reboot': True, 'servicing_polling': True, @@ -1087,6 +1088,46 @@ class DoNodeServiceAbortTestCase(db_base.DbTestCase): self.assertTrue(task.node.maintenance) self.assertEqual('service failure', task.node.fault) + @mock.patch.object(fake.FakeBoot, 'prepare_instance', autospec=True) + @mock.patch.object(fake.FakePower, 'set_power_state', autospec=True) + @mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True) + def test_do_node_service_abort_agent_attempted(self, tear_mock, power_mock, + prepare_mock): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEWAIT, + target_provision_state=states.AVAILABLE, + driver_internal_info={ + 'agent_start_attempted': True, + 'service_step_index': 2}) + + with task_manager.acquire(self.context, node.uuid) as task: + servicing.do_node_service_abort(task) + tear_mock.assert_called_once_with(mock.ANY, task) + prepare_mock.assert_called_once_with(mock.ANY, task) + power_mock.assert_has_calls([ + mock.call(mock.ANY, task, 'power off'), + mock.call(mock.ANY, task, 'power on')]) + + @mock.patch.object(fake.FakeBoot, 'prepare_instance', autospec=True) + @mock.patch.object(fake.FakePower, 'set_power_state', autospec=True) + @mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True) + def test_do_node_service_abort_agent_not_attempted(self, tear_mock, + power_mock, + prepare_mock): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEWAIT, + target_provision_state=states.AVAILABLE, + driver_internal_info={ + 'service_step_index': 2}) + + with task_manager.acquire(self.context, node.uuid) as task: + servicing.do_node_service_abort(task) + tear_mock.assert_called_once_with(mock.ANY, task) + prepare_mock.assert_not_called() + power_mock.assert_not_called() + class DoNodeCleanTestChildNodes(db_base.DbTestCase): def setUp(self): diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 344a2acff7..c168f6769c 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -2655,9 +2655,16 @@ class AgentTokenUtilsTestCase(tests_base.TestCase): def test_wipe_deploy_internal_info(self): conductor_utils.add_secret_token(self.node) - self.assertIn('agent_secret_token', self.node.driver_internal_info) + self.node.set_driver_internal_info('agent_start_attempted', True) + dii = self.node.driver_internal_info + self.assertIn('agent_secret_token', dii) + self.assertIn('agent_start_attempted', dii) + conductor_utils.wipe_deploy_internal_info(mock.Mock(node=self.node)) - self.assertNotIn('agent_secret_token', self.node.driver_internal_info) + + dii = self.node.driver_internal_info + self.assertNotIn('agent_secret_token', dii) + self.assertNotIn('agent_start_attempted', dii) def test_is_agent_token_present(self): # This should always be False as the token has not been added yet. @@ -3222,8 +3229,9 @@ class ServiceUtilsTestCase(db_base.DbTestCase): 'servicing_polling': 1, 'service_disable_ramdisk': False, 'skip_current_service_step': False, - 'steps_validated': 'meow' - 'agent_secret_token', + 'steps_validated': 'meow', + 'agent_secret_token': 'token', + 'agent_start_attempted': True, 'image_source': 'image_ref'} self.node.save() not_in_list = ['agent_cached_service_steps', @@ -3232,7 +3240,8 @@ class ServiceUtilsTestCase(db_base.DbTestCase): 'service_disable_ramdisk', 'skip_current_service_step', 'steps_validated', - 'agent_secret_token' + 'agent_secret_token', + 'agent_start_attempted', 'image_source'] with task_manager.acquire(self.context, self.node.id, shared=True) as task: diff --git a/ironic/tests/unit/drivers/modules/drac/test_management.py b/ironic/tests/unit/drivers/modules/drac/test_management.py index 9b4d22369d..087faa5f2f 100644 --- a/ironic/tests/unit/drivers/modules/drac/test_management.py +++ b/ironic/tests/unit/drivers/modules/drac/test_management.py @@ -162,7 +162,8 @@ class DracRedfishManagementTestCase(test_utils.BaseDracTest): mock_get_configuration.return_value = json.loads( '{"oem": {"interface": "idrac-redfish", ' '"data": {"prop1": "value1", "prop2": 2}}}') - + mock_sdii = mock.Mock() + task.node.set_driver_internal_info = mock_sdii result = self.management.import_configuration(task, 'edge') self.assertEqual(states.DEPLOYWAIT, result) @@ -175,6 +176,7 @@ class DracRedfishManagementTestCase(test_utils.BaseDracTest): mock_build_agent_options.assert_called_once_with(task.node) task.driver.boot.prepare_ramdisk.assert_called_once_with( task, deploy_opts) + self.assertEqual(2, mock_sdii.call_count) @mock.patch.object(drac_mgmt.DracRedfishManagement, 'import_configuration', autospec=True) diff --git a/ironic/tests/unit/drivers/modules/redfish/test_raid.py b/ironic/tests/unit/drivers/modules/redfish/test_raid.py index f43605a462..f116cf877a 100644 --- a/ironic/tests/unit/drivers/modules/redfish/test_raid.py +++ b/ironic/tests/unit/drivers/modules/redfish/test_raid.py @@ -231,8 +231,10 @@ class RedfishRAIDTestCase(db_base.DbTestCase): self.mock_storage] self.node.target_raid_config = target_raid_config self.node.save() + mock_sdii = mock.Mock() with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: + task.node.set_driver_internal_info = mock_sdii task.driver.raid.create_configuration(task) pre = '/redfish/v1/Systems/1/Storage/1/Drives/' expected_payload = { @@ -251,6 +253,7 @@ class RedfishRAIDTestCase(db_base.DbTestCase): ) # Async operation, raid_config shouldn't be updated yet self.assertEqual({}, task.node.raid_config) + self.assertEqual(2, mock_sdii.call_count) @mock.patch.object(redfish_boot.RedfishVirtualMediaBoot, 'prepare_ramdisk', spec_set=True, autospec=True) @@ -585,8 +588,10 @@ class RedfishRAIDTestCase(db_base.DbTestCase): self.mock_storage] self.node.target_raid_config = target_raid_config self.node.save() + sdii_mock = mock.Mock() with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: + task.node.set_driver_internal_info = sdii_mock task.driver.raid.create_configuration(task) pre = '/redfish/v1/Systems/1/Storage/1/Drives/' expected_payload = { @@ -605,6 +610,11 @@ class RedfishRAIDTestCase(db_base.DbTestCase): ) # Async operation, raid_config shouldn't be updated yet self.assertEqual({}, task.node.raid_config) + sdii_mock.assert_has_calls([ + mock.call('raid_configs', {'operation': 'create', + 'pending': {}, + 'task_monitor_uri': mock.ANY}), + mock.call('agent_start_attempted', True)]) @mock.patch.object(redfish_boot.RedfishVirtualMediaBoot, 'prepare_ramdisk', spec_set=True, autospec=True) diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py index 47df855106..1100e50786 100644 --- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py +++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py @@ -1198,6 +1198,22 @@ class AgentMethodsTestCase(db_base.DbTestCase): options = utils.build_agent_options(self.node) self.assertEqual('https://api-url', options['ipa-api-url']) + @mock.patch.object(utils, 'build_agent_options', autospec=True) + def test_prepare_agent_boot(self, mock_build_opts): + mock_build_opts.return_value = {'ipa-api-url': 'https://api-url'} + + with task_manager.acquire(self.context, self.node.uuid) as task: + mock_pr = mock.Mock() + task.driver.boot.prepare_ramdisk = mock_pr + utils.prepare_agent_boot(task) + # Verify attempt tracking is set + self.assertTrue( + task.node.driver_internal_info.get('agent_start_attempted')) + + # Verify boot preparation was called + mock_pr.assert_called_once_with( + task, {'ipa-api-url': 'https://api-url'}) + def test_direct_deploy_should_convert_raw_image_true(self): cfg.CONF.set_override('force_raw_images', True) cfg.CONF.set_override('stream_raw_images', True, group='agent') diff --git a/releasenotes/notes/servicefail-abort-8ca7a7498321b67c.yaml b/releasenotes/notes/servicefail-abort-8ca7a7498321b67c.yaml new file mode 100644 index 0000000000..675b801754 --- /dev/null +++ b/releasenotes/notes/servicefail-abort-8ca7a7498321b67c.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes the ability to invoke the ``abort`` API verb when a node is in + ``service fail`` state, allowing the user to back out of a failure state. + For more information see + `bug 2119989 `_.