From f5a676a60812d77a98b1509e745b85b52e4eaabb Mon Sep 17 00:00:00 2001 From: Arne Wiebalck <Arne.Wiebalck@cern.ch> Date: Fri, 19 Jul 2019 15:07:29 +0200 Subject: [PATCH] Do not tear down node upon cleaning failure In case of a failure during cleaning, ironic currently shuts the node off. This is dangerous, e.g. when the cleaning step is a firmware upgrade. This patch proposes to corect this behaviour and leave the node on in case cleaning raises an exception. Task: #30357 Story: #2005375 Change-Id: I5fe8b380c890eb9b9dcee33868ceda2a9bab9929 --- ironic/conductor/utils.py | 4 ++-- ironic/drivers/modules/deploy_utils.py | 18 ++++++++++++------ .../unit/drivers/modules/test_deploy_utils.py | 10 ++++++++-- ...upon-cleaning-failure-a9cda6ae71ed2540.yaml | 11 +++++++++++ 4 files changed, 33 insertions(+), 10 deletions(-) create mode 100644 releasenotes/notes/fix-do-not-tear-down-nodes-upon-cleaning-failure-a9cda6ae71ed2540.yaml diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 88d9b3814c..9abbed5d3c 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -385,6 +385,8 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, set_fail_state=True): """Put a failed node in CLEANFAIL and maintenance.""" node = task.node + node.fault = faults.CLEAN_FAILURE + node.maintenance = True if tear_down_cleaning: try: @@ -410,9 +412,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, # for automated cleaning, it is AVAILABLE. manual_clean = node.target_provision_state == states.MANAGEABLE node.last_error = msg - node.maintenance = True node.maintenance_reason = msg - node.fault = faults.CLEAN_FAILURE node.save() if set_fail_state and node.provision_state != states.CLEANFAIL: diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index 9da823ba66..e987021c19 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -32,6 +32,7 @@ from oslo_utils import strutils import six from ironic.common import exception +from ironic.common import faults from ironic.common.glance_service import service_utils from ironic.common.i18n import _ from ironic.common import image_service @@ -935,10 +936,11 @@ def tear_down_inband_cleaning(task, manage_boot=True): """Tears down the environment setup for in-band cleaning. This method does the following: - 1. Powers off the bare metal node. - 2. If 'manage_boot' parameter is set to true, it also - calls the 'clean_up_ramdisk' method of boot interface to clean up - the environment that was set for booting agent ramdisk. + 1. Powers off the bare metal node (unless the node is fast + tracked or there was a cleaning failure). + 2. If 'manage_boot' parameter is set to true, it also calls + the 'clean_up_ramdisk' method of boot interface to clean + up the environment that was set for booting agent ramdisk. 3. Deletes the cleaning ports which were setup as part of cleaning. @@ -950,7 +952,11 @@ def tear_down_inband_cleaning(task, manage_boot=True): removed. """ fast_track = manager_utils.is_fast_track(task) - if not fast_track: + + node = task.node + cleaning_failure = (node.fault == faults.CLEAN_FAILURE) + + if not (fast_track or cleaning_failure): manager_utils.node_power_action(task, states.POWER_OFF) if manage_boot: @@ -958,7 +964,7 @@ def tear_down_inband_cleaning(task, manage_boot=True): power_state_to_restore = manager_utils.power_on_node_if_needed(task) task.driver.network.remove_cleaning_network(task) - if not fast_track: + if not (fast_track or cleaning_failure): manager_utils.restore_power_state_if_needed( task, power_state_to_restore) diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py index 86cfd3f7f9..ef412a3364 100644 --- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py +++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py @@ -31,6 +31,7 @@ from testtools import matchers from ironic.common import boot_devices from ironic.common import exception +from ironic.common import faults from ironic.common import image_service from ironic.common import states from ironic.common import utils as common_utils @@ -1750,12 +1751,14 @@ class AgentMethodsTestCase(db_base.DbTestCase): def _test_tear_down_inband_cleaning( self, power_mock, remove_cleaning_network_mock, clean_up_ramdisk_mock, is_fast_track_mock, - manage_boot=True, fast_track=False): + manage_boot=True, fast_track=False, cleaning_error=False): is_fast_track_mock.return_value = fast_track with task_manager.acquire( self.context, self.node.uuid, shared=False) as task: + if cleaning_error: + task.node.fault = faults.CLEAN_FAILURE utils.tear_down_inband_cleaning(task, manage_boot=manage_boot) - if not fast_track: + if not (fast_track or cleaning_error): power_mock.assert_called_once_with(task, states.POWER_OFF) else: self.assertFalse(power_mock.called) @@ -1775,6 +1778,9 @@ class AgentMethodsTestCase(db_base.DbTestCase): def test_tear_down_inband_cleaning_fast_track(self): self._test_tear_down_inband_cleaning(fast_track=True) + def test_tear_down_inband_cleaning_cleaning_error(self): + self._test_tear_down_inband_cleaning(cleaning_error=True) + def test_build_agent_options_conf(self): self.config(api_url='https://api-url', group='conductor') options = utils.build_agent_options(self.node) diff --git a/releasenotes/notes/fix-do-not-tear-down-nodes-upon-cleaning-failure-a9cda6ae71ed2540.yaml b/releasenotes/notes/fix-do-not-tear-down-nodes-upon-cleaning-failure-a9cda6ae71ed2540.yaml new file mode 100644 index 0000000000..b13057cbb8 --- /dev/null +++ b/releasenotes/notes/fix-do-not-tear-down-nodes-upon-cleaning-failure-a9cda6ae71ed2540.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Fixes a bug where ironic would shut a node down upon cleaning failure. + Now, the node stays powered on (as documented and intended). +upgrade: + - | + When a failure occurs during cleaning, nodes will no longer be shut down. The + behaviour was changed to prevent harm and allow for an admin intervention + when sensitive operations, such as firmware upgrades, are performed and fail + during cleaning.