Add cleanwait timeout cleanup process

Previously, if a node in a cleaning state timed out, the timeout process would not purge certain items from the node's configuration which resulted in a short circuiting of the logic cleaning being retried. This was a result of the node clean_step configuration not being purged upon a timeout occuring. This change adds a wrapper method around the cleaning failure error handler to allow the _fail_if_in_state method to call the error handler, since error handler syntax is not uniform and the _fail_if_in_state cannot pass arguments. It also changes the cleaning error handler to permit the error handler to delete the node clean_step, and cleaning related driver_internal_info configuration from a node in the event the node in in CLEANFAIL state. Change-Id: I9ee5c0b385648c9b7e1d330d5d1af9b2c486a436 Closes-Bug: #1590146
2016-06-08 18:39:09 -04:00 · 2016-06-08 18:39:09 -04:00 · 42e2685395
commit 42e2685395
parent a155f30c88
5 changed files with 69 additions and 7 deletions
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@ -1356,13 +1356,10 @@ class ConductorManager(base_manager.BaseConductorManager):
                   'provision_state': states.CLEANWAIT,
                   'maintenance': False,
                   'provisioned_before': callback_timeout}
-        last_error = _("Timeout reached while cleaning the node. Please "
-                       "check if the ramdisk responsible for the cleaning is "
-                       "running on the node.")
        self._fail_if_in_state(context, filters, states.CLEANWAIT,
                               'provision_updated_at',
-                               last_error=last_error,
-                               keep_target_state=True)
+                               keep_target_state=True,
+                               callback_method=utils.cleanup_cleanwait_timeout)

    @periodics.periodic(spacing=CONF.conductor.sync_local_state_interval)
    def _sync_local_state(self, context):
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@ -202,11 +202,27 @@ def provisioning_error_handler(e, node, provision_state,
                     'tgt_prov_state': target_provision_state})


+def cleanup_cleanwait_timeout(task):
+    """Cleanup a cleaning task after timeout.
+
+    :param task: a TaskManager instance.
+    """
+    last_error = (_("Timeout reached while cleaning the node. Please "
+                    "check if the ramdisk responsible for the cleaning is "
+                    "running on the node. Failed on step %(step)s.") %
+                  {'step': task.node.clean_step})
+    cleaning_error_handler(task, msg=last_error,
+                           set_fail_state=True)
+
+
 def cleaning_error_handler(task, msg, tear_down_cleaning=True,
                           set_fail_state=True):
    """Put a failed node in CLEANFAIL and maintenance."""
    node = task.node
-    if node.provision_state in (states.CLEANING, states.CLEANWAIT):
+    if node.provision_state in (
+            states.CLEANING,
+            states.CLEANWAIT,
+            states.CLEANFAIL):
        # Clear clean step, msg should already include current step
        node.clean_step = {}
        info = node.driver_internal_info
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@ -1100,7 +1100,13 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
            self.context, driver='fake',
            provision_state=states.CLEANWAIT,
            target_provision_state=tgt_prov_state,
-            provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0))
+            provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0),
+            clean_step={
+                'interface': 'deploy',
+                'step': 'erase_devices'},
+            driver_internal_info={
+                'cleaning_reboot': manual,
+                'clean_step_index': 0})

        self.service._check_cleanwait_timeouts(self.context)
        self._stop_service()
@ -1108,6 +1114,11 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
        self.assertEqual(states.CLEANFAIL, node.provision_state)
        self.assertEqual(tgt_prov_state, node.target_provision_state)
        self.assertIsNotNone(node.last_error)
+        # Test that cleaning parameters have been purged in order
+        # to prevent looping of the cleaning sequence
+        self.assertEqual({}, node.clean_step)
+        self.assertNotIn('clean_step_index', node.driver_internal_info)
+        self.assertNotIn('cleaning_reboot', node.driver_internal_info)

    def test__check_cleanwait_timeouts_automated_clean(self):
        self._check_cleanwait_timeouts()
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@ -572,6 +572,37 @@ class ErrorHandlersTestCase(tests_base.TestCase):
        self.assertFalse(self.node.save.called)
        self.assertFalse(log_mock.warning.called)

+    @mock.patch.object(conductor_utils, 'cleaning_error_handler')
+    def test_cleanup_cleanwait_timeout_handler_call(self, mock_error_handler):
+        self.node.clean_step = {}
+        conductor_utils.cleanup_cleanwait_timeout(self.task)
+
+        mock_error_handler.assert_called_once_with(
+            self.task,
+            msg="Timeout reached while cleaning the node. Please "
+                "check if the ramdisk responsible for the cleaning is "
+                "running on the node. Failed on step {}.",
+            set_fail_state=True)
+
+    def test_cleanup_cleanwait_timeout(self):
+        self.node.provision_state = states.CLEANFAIL
+        target = 'baz'
+        self.node.target_provision_state = target
+        self.node.driver_internal_info = {}
+        self.node.clean_step = {'key': 'val'}
+        clean_error = ("Timeout reached while cleaning the node. Please "
+                       "check if the ramdisk responsible for the cleaning is "
+                       "running on the node. Failed on step {'key': 'val'}.")
+        self.node.driver_internal_info = {
+            'cleaning_reboot': True,
+            'clean_step_index': 0}
+        conductor_utils.cleanup_cleanwait_timeout(self.task)
+        self.assertEqual({}, self.node.clean_step)
+        self.assertNotIn('clean_step_index', self.node.driver_internal_info)
+        self.task.process_event.assert_called_once()
+        self.assertTrue(self.node.maintenance)
+        self.assertEqual(clean_error, self.node.maintenance_reason)
+
    def test_cleaning_error_handler(self):
        self.node.provision_state = states.CLEANING
        target = 'baz'
--- a/releasenotes/notes/cleaning-retry-fix-89a5d0e65920a064.yaml
+++ b/releasenotes/notes/cleaning-retry-fix-89a5d0e65920a064.yaml
@ -0,0 +1,7 @@
+---
+fixes:
+  - A bug has been corrected where a node's current clean_step
+    was not purged upon that node timing out from a CLEANWAIT state.
+    Previously, this bug would prevent a user from retrying cleaning
+    operations. For more information,
+    see https://bugs.launchpad.net/ironic/+bug/1590146.