From f5a676a60812d77a98b1509e745b85b52e4eaabb Mon Sep 17 00:00:00 2001
From: Arne Wiebalck <Arne.Wiebalck@cern.ch>
Date: Fri, 19 Jul 2019 15:07:29 +0200
Subject: [PATCH] Do not tear down node upon cleaning failure

In case of a failure during cleaning, ironic currently shuts the
node off. This is dangerous, e.g. when the cleaning step is a
firmware upgrade. This patch proposes to corect this behaviour
and leave the node on in case cleaning raises an exception.

Task: #30357
Story: #2005375
Change-Id: I5fe8b380c890eb9b9dcee33868ceda2a9bab9929
---
 ironic/conductor/utils.py                      |  4 ++--
 ironic/drivers/modules/deploy_utils.py         | 18 ++++++++++++------
 .../unit/drivers/modules/test_deploy_utils.py  | 10 ++++++++--
 ...upon-cleaning-failure-a9cda6ae71ed2540.yaml | 11 +++++++++++
 4 files changed, 33 insertions(+), 10 deletions(-)
 create mode 100644 releasenotes/notes/fix-do-not-tear-down-nodes-upon-cleaning-failure-a9cda6ae71ed2540.yaml

diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index 88d9b3814c..9abbed5d3c 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -385,6 +385,8 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
                            set_fail_state=True):
     """Put a failed node in CLEANFAIL and maintenance."""
     node = task.node
+    node.fault = faults.CLEAN_FAILURE
+    node.maintenance = True
 
     if tear_down_cleaning:
         try:
@@ -410,9 +412,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
     # for automated cleaning, it is AVAILABLE.
     manual_clean = node.target_provision_state == states.MANAGEABLE
     node.last_error = msg
-    node.maintenance = True
     node.maintenance_reason = msg
-    node.fault = faults.CLEAN_FAILURE
     node.save()
 
     if set_fail_state and node.provision_state != states.CLEANFAIL:
diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py
index 9da823ba66..e987021c19 100644
--- a/ironic/drivers/modules/deploy_utils.py
+++ b/ironic/drivers/modules/deploy_utils.py
@@ -32,6 +32,7 @@ from oslo_utils import strutils
 import six
 
 from ironic.common import exception
+from ironic.common import faults
 from ironic.common.glance_service import service_utils
 from ironic.common.i18n import _
 from ironic.common import image_service
@@ -935,10 +936,11 @@ def tear_down_inband_cleaning(task, manage_boot=True):
     """Tears down the environment setup for in-band cleaning.
 
     This method does the following:
-    1. Powers off the bare metal node.
-    2. If 'manage_boot' parameter is set to true, it also
-    calls the 'clean_up_ramdisk' method of boot interface to clean up
-    the environment that was set for booting agent ramdisk.
+    1. Powers off the bare metal node (unless the node is fast
+    tracked or there was a cleaning failure).
+    2. If 'manage_boot' parameter is set to true, it also calls
+    the 'clean_up_ramdisk' method of boot interface to clean
+    up the environment that was set for booting agent ramdisk.
     3. Deletes the cleaning ports which were setup as part
     of cleaning.
 
@@ -950,7 +952,11 @@ def tear_down_inband_cleaning(task, manage_boot=True):
         removed.
     """
     fast_track = manager_utils.is_fast_track(task)
-    if not fast_track:
+
+    node = task.node
+    cleaning_failure = (node.fault == faults.CLEAN_FAILURE)
+
+    if not (fast_track or cleaning_failure):
         manager_utils.node_power_action(task, states.POWER_OFF)
 
     if manage_boot:
@@ -958,7 +964,7 @@ def tear_down_inband_cleaning(task, manage_boot=True):
 
     power_state_to_restore = manager_utils.power_on_node_if_needed(task)
     task.driver.network.remove_cleaning_network(task)
-    if not fast_track:
+    if not (fast_track or cleaning_failure):
         manager_utils.restore_power_state_if_needed(
             task, power_state_to_restore)
 
diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py
index 86cfd3f7f9..ef412a3364 100644
--- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py
+++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py
@@ -31,6 +31,7 @@ from testtools import matchers
 
 from ironic.common import boot_devices
 from ironic.common import exception
+from ironic.common import faults
 from ironic.common import image_service
 from ironic.common import states
 from ironic.common import utils as common_utils
@@ -1750,12 +1751,14 @@ class AgentMethodsTestCase(db_base.DbTestCase):
     def _test_tear_down_inband_cleaning(
             self, power_mock, remove_cleaning_network_mock,
             clean_up_ramdisk_mock, is_fast_track_mock,
-            manage_boot=True, fast_track=False):
+            manage_boot=True, fast_track=False, cleaning_error=False):
         is_fast_track_mock.return_value = fast_track
         with task_manager.acquire(
                 self.context, self.node.uuid, shared=False) as task:
+            if cleaning_error:
+                task.node.fault = faults.CLEAN_FAILURE
             utils.tear_down_inband_cleaning(task, manage_boot=manage_boot)
-            if not fast_track:
+            if not (fast_track or cleaning_error):
                 power_mock.assert_called_once_with(task, states.POWER_OFF)
             else:
                 self.assertFalse(power_mock.called)
@@ -1775,6 +1778,9 @@ class AgentMethodsTestCase(db_base.DbTestCase):
     def test_tear_down_inband_cleaning_fast_track(self):
         self._test_tear_down_inband_cleaning(fast_track=True)
 
+    def test_tear_down_inband_cleaning_cleaning_error(self):
+        self._test_tear_down_inband_cleaning(cleaning_error=True)
+
     def test_build_agent_options_conf(self):
         self.config(api_url='https://api-url', group='conductor')
         options = utils.build_agent_options(self.node)
diff --git a/releasenotes/notes/fix-do-not-tear-down-nodes-upon-cleaning-failure-a9cda6ae71ed2540.yaml b/releasenotes/notes/fix-do-not-tear-down-nodes-upon-cleaning-failure-a9cda6ae71ed2540.yaml
new file mode 100644
index 0000000000..b13057cbb8
--- /dev/null
+++ b/releasenotes/notes/fix-do-not-tear-down-nodes-upon-cleaning-failure-a9cda6ae71ed2540.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    Fixes a bug where ironic would shut a node down upon cleaning failure.
+    Now, the node stays powered on (as documented and intended).
+upgrade:
+  - |
+    When a failure occurs during cleaning, nodes will no longer be shut down. The
+    behaviour was changed to prevent harm and allow for an admin intervention
+    when sensitive operations, such as firmware upgrades, are performed and fail
+    during cleaning.