Fix the ability to escape service fail
Back when we developed service, we expected operators to iterate to fix their issues, but we also put in abort code. We just never wired the abort code into the abort verb. We really should have done that, and this change updates the API and Conductor code paths to make it happen. Closes-Bug: 2119989 Assisted-By: Claude Code - Claude Sonnet 4 Change-Id: Ic02ba87485a676e77563057427ab94953bea2cc2 Signed-off-by: Julia Kreger <juliaashleykreger@gmail.com>
This commit is contained in:
@@ -676,5 +676,8 @@ machine.add_transition(SERVICEFAIL, SERVICEHOLD, 'hold')
|
||||
# A node in service fail may be deleted.
|
||||
machine.add_transition(SERVICEFAIL, DELETING, 'delete')
|
||||
|
||||
# A node in service fail may be aborted (returned to the ACTIVE state).
|
||||
machine.add_transition(SERVICEFAIL, ACTIVE, 'abort')
|
||||
|
||||
# A node in service wait may be deleted.
|
||||
machine.add_transition(SERVICEWAIT, DELETING, 'delete')
|
||||
|
||||
@@ -1440,7 +1440,8 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
states.RESCUEWAIT,
|
||||
states.INSPECTWAIT,
|
||||
states.CLEANHOLD,
|
||||
states.DEPLOYHOLD)):
|
||||
states.DEPLOYHOLD,
|
||||
states.SERVICEFAIL)):
|
||||
self._do_abort(task)
|
||||
return
|
||||
|
||||
@@ -1532,6 +1533,22 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
task.node.provision_state),
|
||||
err_handler=utils.provisioning_error_handler)
|
||||
|
||||
if node.provision_state == states.SERVICEFAIL:
|
||||
if task.node.maintenance:
|
||||
msg = (_('Can not abort service on node "%(node)s" while it '
|
||||
'is in maintenance mode. Please remove the node '
|
||||
'from maintenance prior to issuing the request to '
|
||||
'abort the service operation.') %
|
||||
{'node': node.uuid})
|
||||
raise exception.InvalidState(msg)
|
||||
# When someone is in service fail, its okay to abort.
|
||||
task.process_event(
|
||||
'abort',
|
||||
callback=self._spawn_worker,
|
||||
call_args=(servicing.do_node_service_abort,
|
||||
task),
|
||||
err_handler=utils.provisioning_error_handler)
|
||||
|
||||
@METRICS.timer('ConductorManager._sync_power_states')
|
||||
@periodics.periodic(spacing=CONF.conductor.sync_power_state_interval,
|
||||
enabled=CONF.conductor.sync_power_state_interval > 0)
|
||||
|
||||
@@ -336,6 +336,11 @@ def get_last_error(node):
|
||||
return last_error
|
||||
|
||||
|
||||
def _was_agent_attempted(task):
|
||||
"""Check if agent start was attempted (vs. successfully heartbeated)."""
|
||||
return task.node.driver_internal_info.get('agent_start_attempted', False)
|
||||
|
||||
|
||||
@task_manager.require_exclusive_lock
|
||||
def do_node_service_abort(task):
|
||||
"""Internal method to abort an ongoing operation.
|
||||
@@ -344,7 +349,14 @@ def do_node_service_abort(task):
|
||||
"""
|
||||
node = task.node
|
||||
try:
|
||||
# Use attempt-based logic instead of heartbeat-based
|
||||
agent_attempted = _was_agent_attempted(task)
|
||||
if agent_attempted:
|
||||
task.driver.power.set_power_state(task, states.POWER_OFF)
|
||||
task.driver.deploy.tear_down_service(task)
|
||||
if agent_attempted:
|
||||
task.driver.boot.prepare_instance(task)
|
||||
task.driver.power.set_power_state(task, states.POWER_ON)
|
||||
except Exception as e:
|
||||
log_msg = (_('Failed to tear down service for node %(node)s '
|
||||
'after aborting the operation. Error: %(err)s') %
|
||||
|
||||
@@ -645,6 +645,8 @@ def wipe_internal_info_on_power_off(node):
|
||||
node.del_driver_internal_info('agent_cached_clean_steps')
|
||||
# Remove TLS certificate since it's regenerated on each run.
|
||||
node.del_driver_internal_info('agent_verify_ca')
|
||||
# Remove agent start attempt tracking since it doesn't survive power off.
|
||||
node.del_driver_internal_info('agent_start_attempted')
|
||||
|
||||
|
||||
def wipe_token_and_url(task):
|
||||
@@ -671,6 +673,8 @@ def wipe_deploy_internal_info(task):
|
||||
node.del_driver_internal_info('deploy_step_index')
|
||||
node.del_driver_internal_info('steps_validated')
|
||||
node.del_driver_internal_info('image_source')
|
||||
# Remove agent start attempt tracking since deployment is complete/aborted
|
||||
node.del_driver_internal_info('agent_start_attempted')
|
||||
async_steps.remove_node_flags(node)
|
||||
|
||||
|
||||
@@ -685,6 +689,8 @@ def wipe_cleaning_internal_info(task):
|
||||
node.del_driver_internal_info('cleaning_disable_ramdisk')
|
||||
node.del_driver_internal_info('steps_validated')
|
||||
node.del_driver_internal_info('declarative_cleaning')
|
||||
# Remove agent start attempt tracking since cleaning is complete/aborted
|
||||
node.del_driver_internal_info('agent_start_attempted')
|
||||
async_steps.remove_node_flags(node)
|
||||
|
||||
|
||||
@@ -698,6 +704,8 @@ def wipe_service_internal_info(task):
|
||||
node.del_driver_internal_info('service_disable_ramdisk')
|
||||
node.del_driver_internal_info('steps_validated')
|
||||
node.del_driver_internal_info('image_source')
|
||||
# Remove agent start attempt tracking since servicing is complete/aborted
|
||||
node.del_driver_internal_info('agent_start_attempted')
|
||||
async_steps.remove_node_flags(node)
|
||||
|
||||
|
||||
|
||||
@@ -1746,6 +1746,12 @@ def prepare_agent_boot(task):
|
||||
|
||||
:param task: a TaskManager instance.
|
||||
"""
|
||||
# Record that we're attempting to start the agent, because this
|
||||
# is the *central* location where we trigger ramdisk launches
|
||||
# for actions outside of inspection.
|
||||
task.node.set_driver_internal_info('agent_start_attempted', True)
|
||||
task.node.save()
|
||||
|
||||
deploy_opts = build_agent_options(task.node)
|
||||
task.driver.boot.prepare_ramdisk(task, deploy_opts)
|
||||
|
||||
|
||||
@@ -6229,6 +6229,27 @@ ORHMKeXMO8fcK0By7CiMKwHSXCoEQgfQhWwpMdSsO8LgHCjh87DQc= """
|
||||
self.assertEqual(urlparse.urlparse(ret.location).path,
|
||||
expected_location)
|
||||
|
||||
@mock.patch.object(rpcapi.ConductorAPI, 'do_provisioning_action',
                   autospec=True)
def test_provision_with_abort_after_service_failed(self, mock_dpa):
    """The abort verb from SERVICEFAIL is accepted and routed via RPC."""
    node = self.node
    node.provision_state = states.SERVICEFAIL
    node.target_provision_state = states.ACTIVE
    node.save()
    ret = self.put_json('/nodes/%s/states/provision' % node.uuid,
                        {'target': states.VERBS['abort']},
                        headers={api_base.Version.string: "1.13"})
    self.assertEqual(http_client.ACCEPTED, ret.status_code)
    self.assertEqual(b'', ret.body)
    # The Location header must point back at the node's states resource.
    self.assertIsNotNone(ret.location)
    expected_location = '/v1/nodes/%s/states' % node.uuid
    self.assertEqual(urlparse.urlparse(ret.location).path,
                     expected_location)
    mock_dpa.assert_called_once_with(mock.ANY, mock.ANY, node.uuid,
                                     'abort', 'test-topic')
|
||||
|
||||
def test_provision_with_unprovision_in_service_wait(self):
|
||||
node = self.node
|
||||
node.provision_state = states.SERVICEWAIT
|
||||
|
||||
@@ -2835,6 +2835,37 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
states.DEPLOYHOLD)
|
||||
self.assertNotIn('agent_url', node.driver_internal_info)
|
||||
|
||||
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
            autospec=True)
def test_do_provision_action_abort_from_servicefail(self, mock_spawn):
    """Aborting from SERVICEFAIL spawns the service-abort worker."""
    node = obj_utils.create_test_node(
        self.context, driver='fake-hardware',
        provision_state=states.SERVICEFAIL,
        target_provision_state=states.ACTIVE)

    self._start_service()
    self.service.do_provisioning_action(self.context, node.uuid, 'abort')
    # The conductor must hand the abort off to do_node_service_abort.
    mock_spawn.assert_called_with(self.service,
                                  servicing.do_node_service_abort,
                                  mock.ANY)
|
||||
|
||||
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
            autospec=True)
def test_do_provision_action_abort_from_servicefail_maintenance(
        self, mock_spawn):
    """Aborting service on a node in maintenance raises InvalidState."""
    node = obj_utils.create_test_node(
        self.context, driver='fake-hardware',
        provision_state=states.SERVICEFAIL,
        target_provision_state=states.ACTIVE,
        maintenance=True)

    self._start_service()
    # Maintenance mode blocks the abort before any worker is spawned.
    self.assertRaises(exception.InvalidState,
                      self.service.do_provisioning_action,
                      self.context, node.uuid, 'abort')
|
||||
|
||||
|
||||
@mgr_utils.mock_record_keepalive
|
||||
class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
|
||||
@@ -1017,6 +1017,7 @@ class DoNodeServiceAbortTestCase(db_base.DbTestCase):
|
||||
driver_internal_info={
|
||||
'agent_url': 'some url',
|
||||
'agent_secret_token': 'token',
|
||||
'agent_start_attempted': True,
|
||||
'service_step_index': 2,
|
||||
'servicing_reboot': True,
|
||||
'servicing_polling': True,
|
||||
@@ -1087,6 +1088,46 @@ class DoNodeServiceAbortTestCase(db_base.DbTestCase):
|
||||
self.assertTrue(task.node.maintenance)
|
||||
self.assertEqual('service failure', task.node.fault)
|
||||
|
||||
@mock.patch.object(fake.FakeBoot, 'prepare_instance', autospec=True)
@mock.patch.object(fake.FakePower, 'set_power_state', autospec=True)
@mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True)
def test_do_node_service_abort_agent_attempted(self, tear_mock, power_mock,
                                               prepare_mock):
    """When an agent start was attempted, abort power-cycles the node."""
    node = obj_utils.create_test_node(
        self.context, driver='fake-hardware',
        provision_state=states.SERVICEWAIT,
        target_provision_state=states.AVAILABLE,
        driver_internal_info={'agent_start_attempted': True,
                              'service_step_index': 2})

    with task_manager.acquire(self.context, node.uuid) as task:
        servicing.do_node_service_abort(task)
        tear_mock.assert_called_once_with(mock.ANY, task)
        # Instance boot is re-prepared and the node is powered off/on
        # around the teardown because the ramdisk was launched.
        prepare_mock.assert_called_once_with(mock.ANY, task)
        power_mock.assert_has_calls(
            [mock.call(mock.ANY, task, 'power off'),
             mock.call(mock.ANY, task, 'power on')])
|
||||
|
||||
@mock.patch.object(fake.FakeBoot, 'prepare_instance', autospec=True)
@mock.patch.object(fake.FakePower, 'set_power_state', autospec=True)
@mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True)
def test_do_node_service_abort_agent_not_attempted(self, tear_mock,
                                                   power_mock,
                                                   prepare_mock):
    """Without an agent start attempt, abort skips the power cycle."""
    node = obj_utils.create_test_node(
        self.context, driver='fake-hardware',
        provision_state=states.SERVICEWAIT,
        target_provision_state=states.AVAILABLE,
        driver_internal_info={'service_step_index': 2})

    with task_manager.acquire(self.context, node.uuid) as task:
        servicing.do_node_service_abort(task)
        tear_mock.assert_called_once_with(mock.ANY, task)
        # No ramdisk was launched, so neither boot preparation nor a
        # power cycle is needed.
        prepare_mock.assert_not_called()
        power_mock.assert_not_called()
|
||||
|
||||
|
||||
class DoNodeCleanTestChildNodes(db_base.DbTestCase):
|
||||
def setUp(self):
|
||||
|
||||
@@ -2655,9 +2655,16 @@ class AgentTokenUtilsTestCase(tests_base.TestCase):
|
||||
|
||||
def test_wipe_deploy_internal_info(self):
    """wipe_deploy_internal_info drops the token and the attempt flag.

    NOTE(review): the scraped diff left the pre-change assertion lines
    duplicated next to the post-change ones; this is the reconstructed
    post-change version of the method.
    """
    conductor_utils.add_secret_token(self.node)
    self.node.set_driver_internal_info('agent_start_attempted', True)
    dii = self.node.driver_internal_info
    self.assertIn('agent_secret_token', dii)
    self.assertIn('agent_start_attempted', dii)

    conductor_utils.wipe_deploy_internal_info(mock.Mock(node=self.node))

    dii = self.node.driver_internal_info
    self.assertNotIn('agent_secret_token', dii)
    self.assertNotIn('agent_start_attempted', dii)
|
||||
|
||||
def test_is_agent_token_present(self):
|
||||
# This should always be False as the token has not been added yet.
|
||||
@@ -3222,8 +3229,9 @@ class ServiceUtilsTestCase(db_base.DbTestCase):
|
||||
'servicing_polling': 1,
|
||||
'service_disable_ramdisk': False,
|
||||
'skip_current_service_step': False,
|
||||
'steps_validated': 'meow'
|
||||
'agent_secret_token',
|
||||
'steps_validated': 'meow',
|
||||
'agent_secret_token': 'token',
|
||||
'agent_start_attempted': True,
|
||||
'image_source': 'image_ref'}
|
||||
self.node.save()
|
||||
not_in_list = ['agent_cached_service_steps',
|
||||
@@ -3232,7 +3240,8 @@ class ServiceUtilsTestCase(db_base.DbTestCase):
|
||||
'service_disable_ramdisk',
|
||||
'skip_current_service_step',
|
||||
'steps_validated',
|
||||
'agent_secret_token'
|
||||
'agent_secret_token',
|
||||
'agent_start_attempted',
|
||||
'image_source']
|
||||
with task_manager.acquire(self.context, self.node.id,
|
||||
shared=True) as task:
|
||||
|
||||
@@ -162,7 +162,8 @@ class DracRedfishManagementTestCase(test_utils.BaseDracTest):
|
||||
mock_get_configuration.return_value = json.loads(
|
||||
'{"oem": {"interface": "idrac-redfish", '
|
||||
'"data": {"prop1": "value1", "prop2": 2}}}')
|
||||
|
||||
mock_sdii = mock.Mock()
|
||||
task.node.set_driver_internal_info = mock_sdii
|
||||
result = self.management.import_configuration(task, 'edge')
|
||||
self.assertEqual(states.DEPLOYWAIT, result)
|
||||
|
||||
@@ -175,6 +176,7 @@ class DracRedfishManagementTestCase(test_utils.BaseDracTest):
|
||||
mock_build_agent_options.assert_called_once_with(task.node)
|
||||
task.driver.boot.prepare_ramdisk.assert_called_once_with(
|
||||
task, deploy_opts)
|
||||
self.assertEqual(2, mock_sdii.call_count)
|
||||
|
||||
@mock.patch.object(drac_mgmt.DracRedfishManagement,
|
||||
'import_configuration', autospec=True)
|
||||
|
||||
@@ -231,8 +231,10 @@ class RedfishRAIDTestCase(db_base.DbTestCase):
|
||||
self.mock_storage]
|
||||
self.node.target_raid_config = target_raid_config
|
||||
self.node.save()
|
||||
mock_sdii = mock.Mock()
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=True) as task:
|
||||
task.node.set_driver_internal_info = mock_sdii
|
||||
task.driver.raid.create_configuration(task)
|
||||
pre = '/redfish/v1/Systems/1/Storage/1/Drives/'
|
||||
expected_payload = {
|
||||
@@ -251,6 +253,7 @@ class RedfishRAIDTestCase(db_base.DbTestCase):
|
||||
)
|
||||
# Async operation, raid_config shouldn't be updated yet
|
||||
self.assertEqual({}, task.node.raid_config)
|
||||
self.assertEqual(2, mock_sdii.call_count)
|
||||
|
||||
@mock.patch.object(redfish_boot.RedfishVirtualMediaBoot, 'prepare_ramdisk',
|
||||
spec_set=True, autospec=True)
|
||||
@@ -585,8 +588,10 @@ class RedfishRAIDTestCase(db_base.DbTestCase):
|
||||
self.mock_storage]
|
||||
self.node.target_raid_config = target_raid_config
|
||||
self.node.save()
|
||||
sdii_mock = mock.Mock()
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=True) as task:
|
||||
task.node.set_driver_internal_info = sdii_mock
|
||||
task.driver.raid.create_configuration(task)
|
||||
pre = '/redfish/v1/Systems/1/Storage/1/Drives/'
|
||||
expected_payload = {
|
||||
@@ -605,6 +610,11 @@ class RedfishRAIDTestCase(db_base.DbTestCase):
|
||||
)
|
||||
# Async operation, raid_config shouldn't be updated yet
|
||||
self.assertEqual({}, task.node.raid_config)
|
||||
sdii_mock.assert_has_calls([
|
||||
mock.call('raid_configs', {'operation': 'create',
|
||||
'pending': {},
|
||||
'task_monitor_uri': mock.ANY}),
|
||||
mock.call('agent_start_attempted', True)])
|
||||
|
||||
@mock.patch.object(redfish_boot.RedfishVirtualMediaBoot, 'prepare_ramdisk',
|
||||
spec_set=True, autospec=True)
|
||||
|
||||
@@ -1198,6 +1198,22 @@ class AgentMethodsTestCase(db_base.DbTestCase):
|
||||
options = utils.build_agent_options(self.node)
|
||||
self.assertEqual('https://api-url', options['ipa-api-url'])
|
||||
|
||||
@mock.patch.object(utils, 'build_agent_options', autospec=True)
def test_prepare_agent_boot(self, mock_build_opts):
    """prepare_agent_boot records the attempt flag and preps the ramdisk."""
    mock_build_opts.return_value = {'ipa-api-url': 'https://api-url'}

    with task_manager.acquire(self.context, self.node.uuid) as task:
        mock_pr = mock.Mock()
        task.driver.boot.prepare_ramdisk = mock_pr
        utils.prepare_agent_boot(task)
        # The attempt-tracking flag must be persisted on the node.
        self.assertTrue(
            task.node.driver_internal_info.get('agent_start_attempted'))

        # The ramdisk is prepared with the built agent options.
        mock_pr.assert_called_once_with(task,
                                        {'ipa-api-url': 'https://api-url'})
|
||||
|
||||
def test_direct_deploy_should_convert_raw_image_true(self):
|
||||
cfg.CONF.set_override('force_raw_images', True)
|
||||
cfg.CONF.set_override('stream_raw_images', True, group='agent')
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes the ability to invoke the ``abort`` API verb when a node is in
|
||||
``service fail`` state, allowing the user to back out of a failure state.
|
||||
For more information see
|
||||
`bug 2119989 <https://bugs.launchpad.net/ironic/+bug/2119989>`_.
|
||||
Reference in New Issue
Block a user