Fix the ability to escape service fail

When we developed servicing, we expected operators to iterate on
fixing their issues, but we also put in abort code.

We just never wired that abort code up to the abort verb.

We really should have done that, so this change updates the API and
conductor code paths to make it happen.
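
For example, with this change applied, an operator should be able to
back a node out of service fail with the existing abort request
(sketch only; the node UUID is illustrative):

    PUT /v1/nodes/<node_uuid>/states/provision
    {"target": "abort"}

or, through the client, something like:

    baremetal node abort <node_uuid>

Either form returns the node to the active state.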

Closes-Bug: 2119989

Assisted-By: Claude Code - Claude Sonnet 4
Change-Id: Ic02ba87485a676e77563057427ab94953bea2cc2
Signed-off-by: Julia Kreger <juliaashleykreger@gmail.com>
Julia Kreger
2025-08-08 14:37:06 -07:00
parent ad29d2b00c
commit 1eda80766c
13 changed files with 190 additions and 7 deletions

View File

@@ -676,5 +676,8 @@ machine.add_transition(SERVICEFAIL, SERVICEHOLD, 'hold')
# A node in service fail may be deleted.
machine.add_transition(SERVICEFAIL, DELETING, 'delete')
# A node in service fail may be aborted (returned to active)
machine.add_transition(SERVICEFAIL, ACTIVE, 'abort')
# A node in service wait may be deleted.
machine.add_transition(SERVICEWAIT, DELETING, 'delete')

View File

@@ -1440,7 +1440,8 @@ class ConductorManager(base_manager.BaseConductorManager):
states.RESCUEWAIT,
states.INSPECTWAIT,
states.CLEANHOLD,
states.DEPLOYHOLD)):
states.DEPLOYHOLD,
states.SERVICEFAIL)):
self._do_abort(task)
return
@@ -1532,6 +1533,22 @@ class ConductorManager(base_manager.BaseConductorManager):
task.node.provision_state),
err_handler=utils.provisioning_error_handler)
if node.provision_state == states.SERVICEFAIL:
if task.node.maintenance:
msg = (_('Can not abort service on node "%(node)s" while it '
'is in maintenance mode. Please remove the node '
'from maintenance prior to issuing the request to '
'abort the service operation.') %
{'node': node.uuid})
raise exception.InvalidState(msg)
# When a node is in service fail, it's okay to abort.
task.process_event(
'abort',
callback=self._spawn_worker,
call_args=(servicing.do_node_service_abort,
task),
err_handler=utils.provisioning_error_handler)
@METRICS.timer('ConductorManager._sync_power_states')
@periodics.periodic(spacing=CONF.conductor.sync_power_state_interval,
enabled=CONF.conductor.sync_power_state_interval > 0)

View File

@@ -336,6 +336,11 @@ def get_last_error(node):
return last_error
def _was_agent_attempted(task):
"""Check if agent start was attempted (vs. successfully heartbeated)."""
return task.node.driver_internal_info.get('agent_start_attempted', False)
@task_manager.require_exclusive_lock
def do_node_service_abort(task):
"""Internal method to abort an ongoing operation.
@@ -344,7 +349,14 @@ def do_node_service_abort(task):
"""
node = task.node
try:
# Use attempt-based logic instead of heartbeat-based
agent_attempted = _was_agent_attempted(task)
if agent_attempted:
task.driver.power.set_power_state(task, states.POWER_OFF)
task.driver.deploy.tear_down_service(task)
if agent_attempted:
task.driver.boot.prepare_instance(task)
task.driver.power.set_power_state(task, states.POWER_ON)
except Exception as e:
log_msg = (_('Failed to tear down service for node %(node)s '
'after aborting the operation. Error: %(err)s') %

View File

@@ -645,6 +645,8 @@ def wipe_internal_info_on_power_off(node):
node.del_driver_internal_info('agent_cached_clean_steps')
# Remove TLS certificate since it's regenerated on each run.
node.del_driver_internal_info('agent_verify_ca')
# Remove agent start attempt tracking since it doesn't survive power off.
node.del_driver_internal_info('agent_start_attempted')
def wipe_token_and_url(task):
@@ -671,6 +673,8 @@ def wipe_deploy_internal_info(task):
node.del_driver_internal_info('deploy_step_index')
node.del_driver_internal_info('steps_validated')
node.del_driver_internal_info('image_source')
# Remove agent start attempt tracking since deployment is complete/aborted
node.del_driver_internal_info('agent_start_attempted')
async_steps.remove_node_flags(node)
@@ -685,6 +689,8 @@ def wipe_cleaning_internal_info(task):
node.del_driver_internal_info('cleaning_disable_ramdisk')
node.del_driver_internal_info('steps_validated')
node.del_driver_internal_info('declarative_cleaning')
# Remove agent start attempt tracking since cleaning is complete/aborted
node.del_driver_internal_info('agent_start_attempted')
async_steps.remove_node_flags(node)
@@ -698,6 +704,8 @@ def wipe_service_internal_info(task):
node.del_driver_internal_info('service_disable_ramdisk')
node.del_driver_internal_info('steps_validated')
node.del_driver_internal_info('image_source')
# Remove agent start attempt tracking since servicing is complete/aborted
node.del_driver_internal_info('agent_start_attempted')
async_steps.remove_node_flags(node)

View File

@@ -1746,6 +1746,12 @@ def prepare_agent_boot(task):
:param task: a TaskManager instance.
"""
# Record that we're attempting to start the agent, because this
# is the *central* location where we trigger ramdisk launches
# for actions outside of inspection.
task.node.set_driver_internal_info('agent_start_attempted', True)
task.node.save()
deploy_opts = build_agent_options(task.node)
task.driver.boot.prepare_ramdisk(task, deploy_opts)

View File

@@ -6229,6 +6229,27 @@ ORHMKeXMO8fcK0By7CiMKwHSXCoEQgfQhWwpMdSsO8LgHCjh87DQc= """
self.assertEqual(urlparse.urlparse(ret.location).path,
expected_location)
@mock.patch.object(rpcapi.ConductorAPI, 'do_provisioning_action',
autospec=True)
def test_provision_with_abort_after_service_failed(self,
mock_dpa):
node = self.node
node.provision_state = states.SERVICEFAIL
node.target_provision_state = states.ACTIVE
node.save()
ret = self.put_json('/nodes/%s/states/provision' % node.uuid,
{'target': states.VERBS['abort']},
headers={api_base.Version.string: "1.13"})
self.assertEqual(http_client.ACCEPTED, ret.status_code)
self.assertEqual(b'', ret.body)
# Check location header
self.assertIsNotNone(ret.location)
expected_location = '/v1/nodes/%s/states' % node.uuid
self.assertEqual(urlparse.urlparse(ret.location).path,
expected_location)
mock_dpa.assert_called_once_with(
mock.ANY, mock.ANY, node.uuid, 'abort', 'test-topic')
def test_provision_with_unprovision_in_service_wait(self):
node = self.node
node.provision_state = states.SERVICEWAIT

View File

@@ -2835,6 +2835,37 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin,
states.DEPLOYHOLD)
self.assertNotIn('agent_url', node.driver_internal_info)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
def test_do_provision_action_abort_from_servicefail(self, mock_spawn):
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.SERVICEFAIL,
target_provision_state=states.ACTIVE)
self._start_service()
self.service.do_provisioning_action(self.context, node.uuid, 'abort')
mock_spawn.assert_called_with(
self.service,
servicing.do_node_service_abort,
mock.ANY)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
def test_do_provision_action_abort_from_servicefail_maintenance(
self, mock_spawn):
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.SERVICEFAIL,
target_provision_state=states.ACTIVE,
maintenance=True)
self._start_service()
self.assertRaises(
exception.InvalidState,
self.service.do_provisioning_action,
self.context, node.uuid, 'abort')
@mgr_utils.mock_record_keepalive
class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):

View File

@@ -1017,6 +1017,7 @@ class DoNodeServiceAbortTestCase(db_base.DbTestCase):
driver_internal_info={
'agent_url': 'some url',
'agent_secret_token': 'token',
'agent_start_attempted': True,
'service_step_index': 2,
'servicing_reboot': True,
'servicing_polling': True,
@@ -1087,6 +1088,46 @@ class DoNodeServiceAbortTestCase(db_base.DbTestCase):
self.assertTrue(task.node.maintenance)
self.assertEqual('service failure', task.node.fault)
@mock.patch.object(fake.FakeBoot, 'prepare_instance', autospec=True)
@mock.patch.object(fake.FakePower, 'set_power_state', autospec=True)
@mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True)
def test_do_node_service_abort_agent_attempted(self, tear_mock, power_mock,
prepare_mock):
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.SERVICEWAIT,
target_provision_state=states.AVAILABLE,
driver_internal_info={
'agent_start_attempted': True,
'service_step_index': 2})
with task_manager.acquire(self.context, node.uuid) as task:
servicing.do_node_service_abort(task)
tear_mock.assert_called_once_with(mock.ANY, task)
prepare_mock.assert_called_once_with(mock.ANY, task)
power_mock.assert_has_calls([
mock.call(mock.ANY, task, 'power off'),
mock.call(mock.ANY, task, 'power on')])
@mock.patch.object(fake.FakeBoot, 'prepare_instance', autospec=True)
@mock.patch.object(fake.FakePower, 'set_power_state', autospec=True)
@mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True)
def test_do_node_service_abort_agent_not_attempted(self, tear_mock,
power_mock,
prepare_mock):
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.SERVICEWAIT,
target_provision_state=states.AVAILABLE,
driver_internal_info={
'service_step_index': 2})
with task_manager.acquire(self.context, node.uuid) as task:
servicing.do_node_service_abort(task)
tear_mock.assert_called_once_with(mock.ANY, task)
prepare_mock.assert_not_called()
power_mock.assert_not_called()
class DoNodeCleanTestChildNodes(db_base.DbTestCase):
def setUp(self):

View File

@@ -2655,9 +2655,16 @@ class AgentTokenUtilsTestCase(tests_base.TestCase):
def test_wipe_deploy_internal_info(self):
conductor_utils.add_secret_token(self.node)
self.assertIn('agent_secret_token', self.node.driver_internal_info)
self.node.set_driver_internal_info('agent_start_attempted', True)
dii = self.node.driver_internal_info
self.assertIn('agent_secret_token', dii)
self.assertIn('agent_start_attempted', dii)
conductor_utils.wipe_deploy_internal_info(mock.Mock(node=self.node))
self.assertNotIn('agent_secret_token', self.node.driver_internal_info)
dii = self.node.driver_internal_info
self.assertNotIn('agent_secret_token', dii)
self.assertNotIn('agent_start_attempted', dii)
def test_is_agent_token_present(self):
# This should always be False as the token has not been added yet.
@@ -3222,8 +3229,9 @@ class ServiceUtilsTestCase(db_base.DbTestCase):
'servicing_polling': 1,
'service_disable_ramdisk': False,
'skip_current_service_step': False,
'steps_validated': 'meow'
'agent_secret_token',
'steps_validated': 'meow',
'agent_secret_token': 'token',
'agent_start_attempted': True,
'image_source': 'image_ref'}
self.node.save()
not_in_list = ['agent_cached_service_steps',
@@ -3232,7 +3240,8 @@ class ServiceUtilsTestCase(db_base.DbTestCase):
'service_disable_ramdisk',
'skip_current_service_step',
'steps_validated',
'agent_secret_token'
'agent_secret_token',
'agent_start_attempted',
'image_source']
with task_manager.acquire(self.context, self.node.id,
shared=True) as task:

View File

@@ -162,7 +162,8 @@ class DracRedfishManagementTestCase(test_utils.BaseDracTest):
mock_get_configuration.return_value = json.loads(
'{"oem": {"interface": "idrac-redfish", '
'"data": {"prop1": "value1", "prop2": 2}}}')
mock_sdii = mock.Mock()
task.node.set_driver_internal_info = mock_sdii
result = self.management.import_configuration(task, 'edge')
self.assertEqual(states.DEPLOYWAIT, result)
@@ -175,6 +176,7 @@ class DracRedfishManagementTestCase(test_utils.BaseDracTest):
mock_build_agent_options.assert_called_once_with(task.node)
task.driver.boot.prepare_ramdisk.assert_called_once_with(
task, deploy_opts)
self.assertEqual(2, mock_sdii.call_count)
@mock.patch.object(drac_mgmt.DracRedfishManagement,
'import_configuration', autospec=True)

View File

@@ -231,8 +231,10 @@ class RedfishRAIDTestCase(db_base.DbTestCase):
self.mock_storage]
self.node.target_raid_config = target_raid_config
self.node.save()
mock_sdii = mock.Mock()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.node.set_driver_internal_info = mock_sdii
task.driver.raid.create_configuration(task)
pre = '/redfish/v1/Systems/1/Storage/1/Drives/'
expected_payload = {
@@ -251,6 +253,7 @@ class RedfishRAIDTestCase(db_base.DbTestCase):
)
# Async operation, raid_config shouldn't be updated yet
self.assertEqual({}, task.node.raid_config)
self.assertEqual(2, mock_sdii.call_count)
@mock.patch.object(redfish_boot.RedfishVirtualMediaBoot, 'prepare_ramdisk',
spec_set=True, autospec=True)
@@ -585,8 +588,10 @@ class RedfishRAIDTestCase(db_base.DbTestCase):
self.mock_storage]
self.node.target_raid_config = target_raid_config
self.node.save()
sdii_mock = mock.Mock()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
task.node.set_driver_internal_info = sdii_mock
task.driver.raid.create_configuration(task)
pre = '/redfish/v1/Systems/1/Storage/1/Drives/'
expected_payload = {
@@ -605,6 +610,11 @@ class RedfishRAIDTestCase(db_base.DbTestCase):
)
# Async operation, raid_config shouldn't be updated yet
self.assertEqual({}, task.node.raid_config)
sdii_mock.assert_has_calls([
mock.call('raid_configs', {'operation': 'create',
'pending': {},
'task_monitor_uri': mock.ANY}),
mock.call('agent_start_attempted', True)])
@mock.patch.object(redfish_boot.RedfishVirtualMediaBoot, 'prepare_ramdisk',
spec_set=True, autospec=True)

View File

@@ -1198,6 +1198,22 @@ class AgentMethodsTestCase(db_base.DbTestCase):
options = utils.build_agent_options(self.node)
self.assertEqual('https://api-url', options['ipa-api-url'])
@mock.patch.object(utils, 'build_agent_options', autospec=True)
def test_prepare_agent_boot(self, mock_build_opts):
mock_build_opts.return_value = {'ipa-api-url': 'https://api-url'}
with task_manager.acquire(self.context, self.node.uuid) as task:
mock_pr = mock.Mock()
task.driver.boot.prepare_ramdisk = mock_pr
utils.prepare_agent_boot(task)
# Verify attempt tracking is set
self.assertTrue(
task.node.driver_internal_info.get('agent_start_attempted'))
# Verify boot preparation was called
mock_pr.assert_called_once_with(
task, {'ipa-api-url': 'https://api-url'})
def test_direct_deploy_should_convert_raw_image_true(self):
cfg.CONF.set_override('force_raw_images', True)
cfg.CONF.set_override('stream_raw_images', True, group='agent')

View File

@@ -0,0 +1,7 @@
---
fixes:
- |
Fixes the ability to invoke the ``abort`` API verb when a node is in
``service fail`` state, allowing the user to back out of a failure state.
For more information see
`bug 2119989 <https://bugs.launchpad.net/ironic/+bug/2119989>`_.