Agent vendor handles manual cleaning

Agent base vendor now supports manual cleaning. In the case of a clean version mismatch (agent rebooted to a new version after manual cleaning started), we continue from the current clean step instead of (in automated cleaning) restarting from the first clean step. Change-Id: I89726f3b32f96befba9a2e1bee67b2a7cc7eeb57 Partial-Bug: #1526290
2015-11-19 17:14:55 +00:00 · 2015-11-19 17:14:55 +00:00 · 1f82783a9e
parent cc3891573e
commit 1f82783a9e
4 changed files with 208 additions and 38 deletions
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@ -646,10 +646,18 @@ class ConductorManager(base_manager.BaseConductorManager):
                state=node.provision_state)
        self._do_node_clean(task)

-    def _get_node_next_clean_steps(self, task):
+    def _get_node_next_clean_steps(self, task, skip_current_step=True):
        """Get the task's node's next clean steps.

+        This returns the list of remaining (ordered) clean steps to be done
+        on the node. If no clean steps have been started yet, all the clean
+        steps are returned. Otherwise, the clean steps after the current
+        clean step are returned. The current clean step is also returned if
+        'skip_current_step' is False.
+
        :param task: A TaskManager object
+        :param skip_current_step: True to skip the current clean step; False to
+                                  include it.
        :raises: NodeCleaningFailure if an internal error occurred when
                 getting the next clean steps
        :returns: ordered list of clean step dictionaries
@ -662,9 +670,12 @@ class ConductorManager(base_manager.BaseConductorManager):
            return next_steps

        try:
-            # Trim off the last clean step (now finished) and
-            # all previous steps
-            next_steps = next_steps[next_steps.index(node.clean_step) + 1:]
+            # Trim off all previous steps up to (and maybe including) the
+            # current clean step.
+            ind = next_steps.index(node.clean_step)
+            if skip_current_step:
+                ind += 1
+            next_steps = next_steps[ind:]
        except ValueError:
            msg = (_('Node %(node)s got an invalid last step for '
                     '%(state)s: %(step)s.') %
@ -782,7 +793,17 @@ class ConductorManager(base_manager.BaseConductorManager):
                     'state': node.provision_state,
                     'clean_state': states.CLEANWAIT})

-            next_steps = self._get_node_next_clean_steps(task)
+            info = node.driver_internal_info
+            try:
+                skip_current_step = info.pop('skip_current_clean_step')
+            except KeyError:
+                skip_current_step = True
+            else:
+                node.driver_internal_info = info
+                node.save()
+
+            next_steps = self._get_node_next_clean_steps(
+                task, skip_current_step=skip_current_step)

            # If this isn't the final clean step in the cleaning operation
            # and it is flagged to abort after the clean step that just
--- a/ironic/drivers/modules/agent_base_vendor.py
+++ b/ironic/drivers/modules/agent_base_vendor.py
@ -218,14 +218,18 @@ class BaseAgentVendor(base.VendorInterface):
    def continue_cleaning(self, task, **kwargs):
        """Start the next cleaning step if the previous one is complete.

-        In order to avoid errors and make agent upgrades painless, cleaning
-        will check the version of all hardware managers during get_clean_steps
-        at the beginning of cleaning and before executing each step in the
-        agent. If the version has changed between steps, the agent is unable
-        to tell if an ordering change will cause a cleaning issue. Therefore,
-        we restart cleaning.
+        In order to avoid errors and make agent upgrades painless, the agent
+        compares the version of all hardware managers at the start of the
+        cleaning (the agent's get_clean_steps() call) and before executing
+        each clean step. If the version has changed between steps, the agent is
+        unable to tell if an ordering change will cause a cleaning issue so
+        it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we restart
+        the entire cleaning cycle. For manual cleaning, we don't.
        """
        node = task.node
+        # For manual clean, the target provision state is MANAGEABLE, whereas
+        # for automated cleaning, it is (the default) AVAILABLE.
+        manual_clean = node.target_provision_state == states.MANAGEABLE
        command = self._get_completed_cleaning_command(task)
        LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
                  ' %(command)s', {'node': node.uuid,
@ -245,20 +249,57 @@ class BaseAgentVendor(base.VendorInterface):
            LOG.error(msg)
            return manager_utils.cleaning_error_handler(task, msg)
        elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
-            # Restart cleaning, agent must have rebooted to new version
-            LOG.info(_LI('Node %s detected a clean version mismatch, '
-                         'resetting clean steps and rebooting the node.'),
-                     node.uuid)
-            try:
-                manager_utils.set_node_cleaning_steps(task)
-            except exception.NodeCleaningFailure:
-                msg = (_('Could not restart cleaning on node %(node)s: '
-                         '%(err)s.') %
-                       {'node': node.uuid,
-                        'err': command.get('command_error'),
-                        'step': node.clean_step})
-                LOG.exception(msg)
-                return manager_utils.cleaning_error_handler(task, msg)
+            if manual_clean:
+                # Don't restart manual cleaning if agent reboots to a new
+                # version. Both are operator actions, unlike automated
+                # cleaning. Manual clean steps are not necessarily idempotent
+                # like automated clean steps and can be even longer running.
+                LOG.info(_LI('During manual cleaning, node %(node)s detected '
+                             'a clean version mismatch. Re-executing and '
+                             'continuing from current step %(step)s.'),
+                         {'node': node.uuid, 'step': node.clean_step})
+
+                result = self._client.get_clean_steps(
+                    task.node, task.ports).get('command_result')
+
+                required_keys = ('clean_steps', 'hardware_manager_version')
+                missing_keys = [key for key in required_keys
+                                if key not in result]
+                if missing_keys:
+                    msg = (_('Could not continue manual cleaning from step '
+                             '%(step)s on node %(node)s. get_clean_steps '
+                             'returned invalid result. The keys %(keys)s are '
+                             'missing from result %(result)s.') %
+                           {'node': node.uuid,
+                            'step': node.clean_step,
+                            'keys': missing_keys,
+                            'result': result})
+                    LOG.error(msg)
+                    return manager_utils.cleaning_error_handler(task, msg)
+
+                driver_internal_info = node.driver_internal_info
+                driver_internal_info['hardware_manager_version'] = result[
+                    'hardware_manager_version']
+                driver_internal_info['skip_current_clean_step'] = False
+                node.driver_internal_info = driver_internal_info
+                node.save()
+            else:
+                # Restart cleaning, agent must have rebooted to new version
+                LOG.info(_LI('During automated cleaning, node %s detected a '
+                             'clean version mismatch. Resetting clean steps '
+                             'and rebooting the node.'),
+                         node.uuid)
+                try:
+                    manager_utils.set_node_cleaning_steps(task)
+                except exception.NodeCleaningFailure:
+                    msg = (_('Could not restart automated cleaning on node '
+                             '%(node)s: %(err)s.') %
+                           {'node': node.uuid,
+                            'err': command.get('command_error'),
+                            'step': node.clean_step})
+                    LOG.exception(msg)
+                    return manager_utils.cleaning_error_handler(task, msg)
+
            self.notify_conductor_resume_clean(task)

        elif command.get('command_status') == 'SUCCEEDED':
@ -354,7 +395,8 @@ class BaseAgentVendor(base.VendorInterface):
                    if not node.clean_step:
                        LOG.debug('Node %s just booted to start cleaning.',
                                  node.uuid)
-                        msg = _('Node failed to start the next cleaning step.')
+                        msg = _('Node failed to start the first cleaning '
+                                'step.')
                        manager_utils.set_node_cleaning_steps(task)
                        self.notify_conductor_resume_clean(task)
                    else:
@ -368,7 +410,7 @@ class BaseAgentVendor(base.VendorInterface):
        except Exception as e:
            err_info = {'node': node.uuid, 'msg': msg, 'e': e}
            last_error = _('Asynchronous exception for node %(node)s: '
-                           '%(msg)s exception: %(e)s') % err_info
+                           '%(msg)s Exception: %(e)s') % err_info
            LOG.exception(last_error)
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, last_error)
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@ -1543,6 +1543,35 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin,
    def test_continue_node_clean_backward_compat(self):
        self._continue_node_clean(states.CLEANING)

+    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker')
+    def _continue_node_clean_skip_step(self, mock_spawn, skip=True):
+        # test that skipping current step mechanism works
+        driver_info = {'clean_steps': self.clean_steps}
+        if not skip:
+            driver_info['skip_current_clean_step'] = skip
+        node = obj_utils.create_test_node(
+            self.context, driver='fake', provision_state=states.CLEANWAIT,
+            target_provision_state=states.MANAGEABLE,
+            driver_internal_info=driver_info, clean_step=self.clean_steps[0])
+        self._start_service()
+        self.service.continue_node_clean(self.context, node.uuid)
+        self.service._worker_pool.waitall()
+        node.refresh()
+        if skip:
+            expected_steps = self.next_clean_steps
+        else:
+            self.assertFalse(
+                'skip_current_clean_step' in node.driver_internal_info)
+            expected_steps = self.clean_steps
+        mock_spawn.assert_called_with(self.service._do_next_clean_step,
+                                      mock.ANY, expected_steps)
+
+    def test_continue_node_clean_skip_step(self):
+        self._continue_node_clean_skip_step()
+
+    def test_continue_node_clean_no_skip_step(self):
+        self._continue_node_clean_skip_step(skip=False)
+
    def _continue_node_clean_abort(self, manual=False):
        last_clean_step = self.clean_steps[0]
        last_clean_step['abortable'] = False
@ -2061,7 +2090,7 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin,
    def test__do_next_clean_step_manual_bad_step_return_value(self):
        self._do_next_clean_step_bad_step_return_value(manual=True)

-    def test__get_node_next_clean_steps(self):
+    def __get_node_next_clean_steps(self, skip=True):
        driver_internal_info = {'clean_steps': self.clean_steps}
        node = obj_utils.create_test_node(
            self.context, driver='fake',
@ -2072,8 +2101,16 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin,
            clean_step=self.clean_steps[0])

        with task_manager.acquire(self.context, node.uuid) as task:
-            steps = self.service._get_node_next_clean_steps(task)
-            self.assertEqual(self.next_clean_steps, steps)
+            steps = self.service._get_node_next_clean_steps(
+                task, skip_current_step=skip)
+            expected = self.next_clean_steps if skip else self.clean_steps
+            self.assertEqual(expected, steps)
+
+    def test__get_node_next_clean_steps(self):
+        self.__get_node_next_clean_steps()
+
+    def test__get_node_next_clean_steps_no_skip(self):
+        self.__get_node_next_clean_steps(skip=False)

    def test__get_node_next_clean_steps_unset_clean_step(self):
        driver_internal_info = {'clean_steps': self.clean_steps}
--- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py
@ -319,7 +319,7 @@ class TestBaseAgentVendor(db_base.DbTestCase):
        log_mock.assert_called_once_with(
            'Asynchronous exception for node '
            '1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy '
-            'is done. exception: LlamaException')
+            'is done. Exception: LlamaException')

    @mock.patch.object(agent_base_vendor.BaseAgentVendor, 'deploy_has_started',
                       autospec=True)
@ -354,7 +354,7 @@ class TestBaseAgentVendor(db_base.DbTestCase):
        log_mock.assert_called_once_with(
            'Asynchronous exception for node '
            '1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy '
-            'is done. exception: LlamaException')
+            'is done. Exception: LlamaException')

    @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
    @mock.patch.object(manager_utils, 'set_node_cleaning_steps', autospec=True)
@ -897,21 +897,91 @@ class TestBaseAgentVendor(db_base.DbTestCase):
    @mock.patch.object(manager_utils, 'set_node_cleaning_steps', autospec=True)
    @mock.patch.object(agent_base_vendor.BaseAgentVendor,
                       'notify_conductor_resume_clean', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_clean_steps',
+                       autospec=True)
    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
                       autospec=True)
-    def test_continue_cleaning_clean_version_mismatch(
-            self, status_mock, notify_mock, steps_mock):
-        # Test that cleaning is restarted if there is a version mismatch
+    def _test_continue_cleaning_clean_version_mismatch(
+            self, status_mock, get_steps_mock, notify_mock, steps_mock,
+            manual=False):
+        get_steps_mock.return_value = {
+            'command_status': 'CLEAN_VERSION_MISMATCH',
+            'command_name': 'get_clean_step',
+            'command_result': {
+                'hardware_manager_version': {'Generic': '1'},
+                'clean_steps': {
+                    'GenericHardwareManager': [
+                        {'interface': 'deploy',
+                         'step': 'erase_devices',
+                         'priority': 20}]}
+            }
+        }
        status_mock.return_value = [{
            'command_status': 'CLEAN_VERSION_MISMATCH',
            'command_name': 'execute_clean_step',
-            'command_result': {}
        }]
-        with task_manager.acquire(self.context, self.node['uuid'],
+        tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+        self.node.provision_state = states.CLEANWAIT
+        self.node.target_provision_state = tgt_prov_state
+        self.node.save()
+        with task_manager.acquire(self.context, self.node.uuid,
                                  shared=False) as task:
            self.passthru.continue_cleaning(task)
-            steps_mock.assert_called_once_with(task)
            notify_mock.assert_called_once_with(mock.ANY, task)
+            if manual:
+                get_steps_mock.assert_called_once_with(mock.ANY, task.node,
+                                                       task.ports)
+                self.assertFalse(
+                    task.node.driver_internal_info['skip_current_clean_step'])
+                self.assertEqual(
+                    {'Generic': '1'},
+                    task.node.driver_internal_info['hardware_manager_version'])
+                self.assertFalse(steps_mock.called)
+            else:
+                self.assertFalse(get_steps_mock.called)
+                steps_mock.assert_called_once_with(task)
+                self.assertFalse('skip_current_clean_step' in
+                                 task.node.driver_internal_info)
+
+    def test_continue_cleaning_automated_clean_version_mismatch(self):
+        self._test_continue_cleaning_clean_version_mismatch()
+
+    def test_continue_cleaning_manual_clean_version_mismatch(self):
+        self._test_continue_cleaning_clean_version_mismatch(manual=True)
+
+    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
+    @mock.patch.object(agent_base_vendor.BaseAgentVendor,
+                       'notify_conductor_resume_clean', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_clean_steps',
+                       autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_manual_version_mismatch_bad(
+            self, status_mock, get_steps_mock, notify_mock, error_mock):
+        get_steps_mock.return_value = {
+            'command_status': 'CLEAN_VERSION_MISMATCH',
+            'command_name': 'get_clean_step',
+            'command_result': {
+                'hardware_manager_version': {'Generic': '1'}}
+        }
+        status_mock.return_value = [{
+            'command_status': 'CLEAN_VERSION_MISMATCH',
+            'command_name': 'execute_clean_step',
+        }]
+        tgt_prov_state = states.MANAGEABLE
+        self.node.provision_state = states.CLEANWAIT
+        self.node.target_provision_state = tgt_prov_state
+        self.node.save()
+        with task_manager.acquire(self.context, self.node.uuid,
+                                  shared=False) as task:
+            self.passthru.continue_cleaning(task)
+
+            get_steps_mock.assert_called_once_with(mock.ANY, task.node,
+                                                   task.ports)
+            error_mock.assert_called_once_with(task, mock.ANY)
+            self.assertFalse(notify_mock.called)
+            self.assertFalse('skip_current_clean_step' in
+                             task.node.driver_internal_info)

    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',