diff --git a/ironic/common/states.py b/ironic/common/states.py index 89b710189e..f2238b41b5 100644 --- a/ironic/common/states.py +++ b/ironic/common/states.py @@ -269,6 +269,9 @@ _FASTTRACK_LOOKUP_ALLOWED_STATES = (ENROLL, MANAGEABLE, AVAILABLE, FASTTRACK_LOOKUP_ALLOWED_STATES = frozenset(_FASTTRACK_LOOKUP_ALLOWED_STATES) """States where API lookups are permitted with fast track enabled.""" +FAILURE_STATES = frozenset((DEPLOYFAIL, CLEANFAIL, INSPECTFAIL, + RESCUEFAIL, UNRESCUEFAIL, ADOPTFAIL)) + ############## # Power states diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py index cb801fc8c2..8a5b8be36a 100644 --- a/ironic/conductor/cleaning.py +++ b/ironic/conductor/cleaning.py @@ -245,12 +245,21 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None): task.process_event(event) +def get_last_error(node): + last_error = _('By request, the clean operation was aborted') + if node.clean_step: + last_error += ( + _(' during or after the completion of step "%s"') + % conductor_steps.step_id(node.clean_step) + ) + return last_error + + @task_manager.require_exclusive_lock -def do_node_clean_abort(task, step_name=None): +def do_node_clean_abort(task): """Internal method to abort an ongoing operation. :param task: a TaskManager instance with an exclusive lock - :param step_name: The name of the clean step. """ node = task.node try: @@ -268,12 +277,13 @@ def do_node_clean_abort(task, step_name=None): set_fail_state=False) return + last_error = get_last_error(node) info_message = _('Clean operation aborted for node %s') % node.uuid - last_error = _('By request, the clean operation was aborted') - if step_name: - msg = _(' after the completion of step "%s"') % step_name - last_error += msg - info_message += msg + if node.clean_step: + info_message += ( + _(' during or after the completion of step "%s"') + % node.clean_step + ) node.last_error = last_error node.clean_step = None @@ -315,7 +325,7 @@ def continue_node_clean(task): target_state = None task.process_event('fail', target_state=target_state) - do_node_clean_abort(task, step_name) + do_node_clean_abort(task) return LOG.debug('The cleaning operation for node %(node)s was ' diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 7f6470e387..a4fed6aff3 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -1331,7 +1331,8 @@ class ConductorManager(base_manager.BaseConductorManager): callback=self._spawn_worker, call_args=(cleaning.do_node_clean_abort, task), err_handler=utils.provisioning_error_handler, - target_state=target_state) + target_state=target_state, + last_error=cleaning.get_last_error(node)) return if node.provision_state == states.RESCUEWAIT: diff --git a/ironic/conductor/task_manager.py b/ironic/conductor/task_manager.py index 509c9ce92e..922e74cf63 100644 --- a/ironic/conductor/task_manager.py +++ b/ironic/conductor/task_manager.py @@ -527,7 +527,8 @@ class TaskManager(object): self.release_resources() def process_event(self, event, callback=None, call_args=None, - call_kwargs=None, err_handler=None, target_state=None): + call_kwargs=None, err_handler=None, target_state=None, + last_error=None): """Process the given event for the task's current state. :param event: the name of the event to process @@ -540,6 +541,8 @@ class TaskManager(object): prev_target_state) :param target_state: if specified, the target provision state for the node. Otherwise, use the target state from the fsm + :param last_error: last error to set on the node together with + the state transition. :raises: InvalidState if the event is not allowed by the associated state machine """ @@ -572,13 +575,15 @@ class TaskManager(object): # set up the async worker if callback: - # clear the error if we're going to start work in a callback - self.node.last_error = None + # update the error if we're going to start work in a callback + self.node.last_error = last_error if call_args is None: call_args = () if call_kwargs is None: call_kwargs = {} self.spawn_after(callback, *call_args, **call_kwargs) + elif last_error is not None: + self.node.last_error = last_error # publish the state transition by saving the Node self.node.save() diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 643bc4d10a..b43de8e553 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -302,9 +302,11 @@ def node_power_action(task, new_state, timeout=None): # Set the target_power_state and clear any last_error, if we're # starting a new operation. This will expose to other processes - # and clients that work is in progress. - node['target_power_state'] = target_state - node['last_error'] = None + # and clients that work is in progress. Keep the last_error intact + # if the power action happens as a result of a failure. + node.target_power_state = target_state + if node.provision_state not in states.FAILURE_STATES: + node.last_error = None node.timestamp_driver_internal_info('last_power_state_change') # NOTE(dtantsur): wipe token on shutting down, otherwise a reboot in # fast-track (or an accidentally booted agent) will cause subsequent diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 65261450af..8e961eb925 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -1124,12 +1124,12 @@ class DoNodeCleanTestCase(db_base.DbTestCase): class DoNodeCleanAbortTestCase(db_base.DbTestCase): @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True) - def _test__do_node_clean_abort(self, step_name, tear_mock): + def _test_do_node_clean_abort(self, clean_step, tear_mock): node = obj_utils.create_test_node( self.context, driver='fake-hardware', - provision_state=states.CLEANFAIL, + provision_state=states.CLEANWAIT, target_provision_state=states.AVAILABLE, - clean_step={'step': 'foo', 'abortable': True}, + clean_step=clean_step, driver_internal_info={ 'agent_url': 'some url', 'agent_secret_token': 'token', @@ -1139,11 +1139,11 @@ class DoNodeCleanAbortTestCase(db_base.DbTestCase): 'skip_current_clean_step': True}) with task_manager.acquire(self.context, node.uuid) as task: - cleaning.do_node_clean_abort(task, step_name=step_name) + cleaning.do_node_clean_abort(task) self.assertIsNotNone(task.node.last_error) tear_mock.assert_called_once_with(task.driver.deploy, task) - if step_name: - self.assertIn(step_name, task.node.last_error) + if clean_step: + self.assertIn(clean_step['step'], task.node.last_error) # assert node's clean_step and metadata was cleaned up self.assertEqual({}, task.node.clean_step) self.assertNotIn('clean_step_index', @@ -1159,11 +1159,12 @@ class DoNodeCleanAbortTestCase(db_base.DbTestCase): self.assertNotIn('agent_secret_token', task.node.driver_internal_info) - def test__do_node_clean_abort(self): - self._test__do_node_clean_abort(None) + def test_do_node_clean_abort_early(self): + self._test_do_node_clean_abort(None) - def test__do_node_clean_abort_with_step_name(self): - self._test__do_node_clean_abort('foo') + def test_do_node_clean_abort_with_step(self): + self._test_do_node_clean_abort({'step': 'foo', 'interface': 'deploy', + 'abortable': True}) @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True) def test__do_node_clean_abort_tear_down_fail(self, tear_mock): diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index f496333649..60c5a6201e 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -2716,7 +2716,8 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin, # Node will be moved to tgt_prov_state after cleaning, not tested here self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) - self.assertIsNone(node.last_error) + self.assertEqual('By request, the clean operation was aborted', + node.last_error) mock_spawn.assert_called_with( self.service, cleaning.do_node_clean_abort, mock.ANY) diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 3efc3052be..3b19f456a7 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -196,7 +196,8 @@ class NodePowerActionTestCase(db_base.DbTestCase): node = obj_utils.create_test_node(self.context, uuid=uuidutils.generate_uuid(), driver='fake-hardware', - power_state=states.POWER_OFF) + power_state=states.POWER_OFF, + last_error='failed before') task = task_manager.TaskManager(self.context, node.uuid) get_power_mock.return_value = states.POWER_OFF @@ -209,6 +210,27 @@ class NodePowerActionTestCase(db_base.DbTestCase): self.assertIsNone(node['target_power_state']) self.assertIsNone(node['last_error']) + @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) + def test_node_power_action_keep_last_error(self, get_power_mock): + """Test node_power_action to keep last_error for failed states.""" + node = obj_utils.create_test_node(self.context, + uuid=uuidutils.generate_uuid(), + driver='fake-hardware', + power_state=states.POWER_OFF, + provision_state=states.CLEANFAIL, + last_error='failed before') + task = task_manager.TaskManager(self.context, node.uuid) + + get_power_mock.return_value = states.POWER_OFF + + conductor_utils.node_power_action(task, states.POWER_ON) + + node.refresh() + get_power_mock.assert_called_once_with(mock.ANY, mock.ANY) + self.assertEqual(states.POWER_ON, node['power_state']) + self.assertIsNone(node['target_power_state']) + self.assertEqual('failed before', node['last_error']) + @mock.patch('ironic.objects.node.NodeSetPowerStateNotification', autospec=True) @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) diff --git a/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml b/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml new file mode 100644 index 0000000000..270278f1ba --- /dev/null +++ b/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + When aborting cleaning, the ``last_error`` field is no longer initially + empty. It is now populated on the state transition to ``clean failed``. + - | + When cleaning or deployment fails, the ``last_error`` field is no longer + temporary set to ``None`` while the power off action is running.