Merge "Do not move nodes to CLEAN FAILED with empty last_error"

2023-03-02 14:23:40 +00:00 · 2023-03-02 14:23:40 +00:00 · a3f854392a
commit a3f854392a
parent 63de82c3d3 9a0fa631ca
9 changed files with 80 additions and 27 deletions
--- a/ironic/common/states.py
+++ b/ironic/common/states.py
@ -269,6 +269,9 @@ _FASTTRACK_LOOKUP_ALLOWED_STATES = (ENROLL, MANAGEABLE, AVAILABLE,
 FASTTRACK_LOOKUP_ALLOWED_STATES = frozenset(_FASTTRACK_LOOKUP_ALLOWED_STATES)
 """States where API lookups are permitted with fast track enabled."""
 FAILURE_STATES = frozenset((DEPLOYFAIL, CLEANFAIL, INSPECTFAIL,
                            RESCUEFAIL, UNRESCUEFAIL, ADOPTFAIL))
 ##############
 # Power states
--- a/ironic/conductor/cleaning.py
+++ b/ironic/conductor/cleaning.py
@ -248,12 +248,21 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
    task.process_event(event)
 def get_last_error(node):
    last_error = _('By request, the clean operation was aborted')
    if node.clean_step:
        last_error += (
            _(' during or after the completion of step "%s"')
            % conductor_steps.step_id(node.clean_step)
        )
    return last_error
@task_manager.require_exclusive_lock
-def do_node_clean_abort(task, step_name=None):
+def do_node_clean_abort(task):
    """Internal method to abort an ongoing operation.
    :param task: a TaskManager instance with an exclusive lock
    :param step_name: The name of the clean step.
    """
    node = task.node
    try:
@ -271,12 +280,13 @@ def do_node_clean_abort(task, step_name=None):
                                     set_fail_state=False)
        return
    last_error = get_last_error(node)
    info_message = _('Clean operation aborted for node %s') % node.uuid
-    last_error = _('By request, the clean operation was aborted')
+    if node.clean_step:
-    if step_name:
+        info_message += (
-        msg = _(' after the completion of step "%s"') % step_name
+            _(' during or after the completion of step "%s"')
-        last_error += msg
+            % node.clean_step
-        info_message += msg
+        )
    node.last_error = last_error
    node.clean_step = None
@ -318,7 +328,7 @@ def continue_node_clean(task):
                target_state = None
            task.process_event('fail', target_state=target_state)
-            do_node_clean_abort(task, step_name)
+            do_node_clean_abort(task)
            return
        LOG.debug('The cleaning operation for node %(node)s was '
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@ -1351,7 +1351,8 @@ class ConductorManager(base_manager.BaseConductorManager):
                callback=self._spawn_worker,
                call_args=(cleaning.do_node_clean_abort, task),
                err_handler=utils.provisioning_error_handler,
-                target_state=target_state)
+                target_state=target_state,
                last_error=cleaning.get_last_error(node))
            return
        if node.provision_state == states.RESCUEWAIT:
--- a/ironic/conductor/task_manager.py
+++ b/ironic/conductor/task_manager.py
@ -527,7 +527,8 @@ class TaskManager(object):
            self.release_resources()
    def process_event(self, event, callback=None, call_args=None,
-                      call_kwargs=None, err_handler=None, target_state=None):
+                      call_kwargs=None, err_handler=None, target_state=None,
                      last_error=None):
        """Process the given event for the task's current state.
        :param event: the name of the event to process
@ -540,6 +541,8 @@ class TaskManager(object):
                prev_target_state)
        :param target_state: if specified, the target provision state for the
               node. Otherwise, use the target state from the fsm
        :param last_error: last error to set on the node together with
               the state transition.
        :raises: InvalidState if the event is not allowed by the associated
                 state machine
        """
@ -572,13 +575,15 @@ class TaskManager(object):
        # set up the async worker
        if callback:
-            # clear the error if we're going to start work in a callback
+            # update the error if we're going to start work in a callback
-            self.node.last_error = None
+            self.node.last_error = last_error
            if call_args is None:
                call_args = ()
            if call_kwargs is None:
                call_kwargs = {}
            self.spawn_after(callback, *call_args, **call_kwargs)
        elif last_error is not None:
            self.node.last_error = last_error
        # publish the state transition by saving the Node
        self.node.save()
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@ -302,9 +302,11 @@ def node_power_action(task, new_state, timeout=None):
    # Set the target_power_state and clear any last_error, if we're
    # starting a new operation. This will expose to other processes
-    # and clients that work is in progress.
+    # and clients that work is in progress. Keep the last_error intact
-    node['target_power_state'] = target_state
+    # if the power action happens as a result of a failure.
-    node['last_error'] = None
+    node.target_power_state = target_state
    if node.provision_state not in states.FAILURE_STATES:
        node.last_error = None
    node.timestamp_driver_internal_info('last_power_state_change')
    # NOTE(dtantsur): wipe token on shutting down, otherwise a reboot in
    # fast-track (or an accidentally booted agent) will cause subsequent
--- a/ironic/tests/unit/conductor/test_cleaning.py
+++ b/ironic/tests/unit/conductor/test_cleaning.py
@ -1138,12 +1138,12 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
 class DoNodeCleanAbortTestCase(db_base.DbTestCase):
    @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True)
-    def _test__do_node_clean_abort(self, step_name, tear_mock):
+    def _test_do_node_clean_abort(self, clean_step, tear_mock):
        node = obj_utils.create_test_node(
            self.context, driver='fake-hardware',
-            provision_state=states.CLEANFAIL,
+            provision_state=states.CLEANWAIT,
            target_provision_state=states.AVAILABLE,
-            clean_step={'step': 'foo', 'abortable': True},
+            clean_step=clean_step,
            driver_internal_info={
                'agent_url': 'some url',
                'agent_secret_token': 'token',
@ -1153,11 +1153,11 @@ class DoNodeCleanAbortTestCase(db_base.DbTestCase):
                'skip_current_clean_step': True})
        with task_manager.acquire(self.context, node.uuid) as task:
-            cleaning.do_node_clean_abort(task, step_name=step_name)
+            cleaning.do_node_clean_abort(task)
            self.assertIsNotNone(task.node.last_error)
            tear_mock.assert_called_once_with(task.driver.deploy, task)
-            if step_name:
+            if clean_step:
-                self.assertIn(step_name, task.node.last_error)
+                self.assertIn(clean_step['step'], task.node.last_error)
            # assert node's clean_step and metadata was cleaned up
            self.assertEqual({}, task.node.clean_step)
            self.assertNotIn('clean_step_index',
@ -1173,11 +1173,12 @@ class DoNodeCleanAbortTestCase(db_base.DbTestCase):
            self.assertNotIn('agent_secret_token',
                             task.node.driver_internal_info)
-    def test__do_node_clean_abort(self):
+    def test_do_node_clean_abort_early(self):
-        self._test__do_node_clean_abort(None)
+        self._test_do_node_clean_abort(None)
-    def test__do_node_clean_abort_with_step_name(self):
+    def test_do_node_clean_abort_with_step(self):
-        self._test__do_node_clean_abort('foo')
+        self._test_do_node_clean_abort({'step': 'foo', 'interface': 'deploy',
                                        'abortable': True})
    @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True)
    def test__do_node_clean_abort_tear_down_fail(self, tear_mock):
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@ -2735,7 +2735,8 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin,
        # Node will be moved to tgt_prov_state after cleaning, not tested here
        self.assertEqual(states.CLEANFAIL, node.provision_state)
        self.assertEqual(tgt_prov_state, node.target_provision_state)
-        self.assertIsNone(node.last_error)
+        self.assertEqual('By request, the clean operation was aborted',
                         node.last_error)
        mock_spawn.assert_called_with(
            self.service, cleaning.do_node_clean_abort, mock.ANY)
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@ -196,7 +196,8 @@ class NodePowerActionTestCase(db_base.DbTestCase):
        node = obj_utils.create_test_node(self.context,
                                          uuid=uuidutils.generate_uuid(),
                                          driver='fake-hardware',
-                                          power_state=states.POWER_OFF)
+                                          power_state=states.POWER_OFF,
                                          last_error='failed before')
        task = task_manager.TaskManager(self.context, node.uuid)
        get_power_mock.return_value = states.POWER_OFF
@ -209,6 +210,27 @@ class NodePowerActionTestCase(db_base.DbTestCase):
        self.assertIsNone(node['target_power_state'])
        self.assertIsNone(node['last_error'])
    @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True)
    def test_node_power_action_keep_last_error(self, get_power_mock):
        """Test node_power_action to keep last_error for failed states."""
        node = obj_utils.create_test_node(self.context,
                                          uuid=uuidutils.generate_uuid(),
                                          driver='fake-hardware',
                                          power_state=states.POWER_OFF,
                                          provision_state=states.CLEANFAIL,
                                          last_error='failed before')
        task = task_manager.TaskManager(self.context, node.uuid)
        get_power_mock.return_value = states.POWER_OFF
        conductor_utils.node_power_action(task, states.POWER_ON)
        node.refresh()
        get_power_mock.assert_called_once_with(mock.ANY, mock.ANY)
        self.assertEqual(states.POWER_ON, node['power_state'])
        self.assertIsNone(node['target_power_state'])
        self.assertEqual('failed before', node['last_error'])
    @mock.patch('ironic.objects.node.NodeSetPowerStateNotification',
                autospec=True)
    @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True)
--- a/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml
+++ b/releasenotes/notes/cleaning-error-5c13c33c58404b97.yaml
@ -0,0 +1,8 @@
 ---
 fixes:
  - |
    When aborting cleaning, the ``last_error`` field is no longer initially
    empty. It is now populated on the state transition to ``clean failed``.
  - |
    When cleaning or deployment fails, the ``last_error`` field is no longer
    temporary set to ``None`` while the power off action is running.