From ad696c9bac379a690b1f674de4983bf1a06a243e Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Fri, 8 Jan 2021 14:48:54 +0100 Subject: [PATCH] Do not enter maintenance if cleaning fails before running the 1st step We use maintenance mode to signal that hardware needs additional intervention, because of potential damage or stuck long-running processes. This is not the case for PXE booting or invalid requested manual clean steps, so don't set maintenance if no clean step is running when the failure occurs. Change-Id: I8a7ce072359660fc6640e5f20ec2d3c452033557 --- doc/source/admin/cleaning.rst | 12 +++++++++--- ironic/conductor/utils.py | 19 +++++++++++++------ ironic/tests/unit/conductor/test_cleaning.py | 14 ++++++++++++-- ironic/tests/unit/conductor/test_manager.py | 11 +++++++++-- ...edundant-maintenance-09849674334f656a.yaml | 5 +++++ 5 files changed, 48 insertions(+), 13 deletions(-) create mode 100644 releasenotes/notes/redundant-maintenance-09849674334f656a.yaml diff --git a/doc/source/admin/cleaning.rst b/doc/source/admin/cleaning.rst index 8338a8b175..9cfd628c79 100644 --- a/doc/source/admin/cleaning.rst +++ b/doc/source/admin/cleaning.rst @@ -314,9 +314,15 @@ cleaning. Troubleshooting =============== -If cleaning fails on a node, the node will be put into ``clean failed`` state -and placed in maintenance mode, to prevent ironic from taking actions on the -node. +If cleaning fails on a node, the node will be put into ``clean failed`` state. +If the failure happens while running a clean step, the node is also placed in +maintenance mode to prevent ironic from taking actions on the node. The +operator should validate that no permanent damage has been done to the +node and no processes are still running on it before removing the maintenance +mode. + +.. note:: Older versions of ironic may put the node to maintenance even when + no clean step has been running. Nodes in ``clean failed`` will not be powered off, as the node might be in a state such that powering it off could damage the node or remove useful diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 081c6ce42c..fccb9261c5 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -407,9 +407,9 @@ def cleanup_cleanwait_timeout(task): def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, - tear_down_cleaning=True, - set_fail_state=True): - """Put a failed node in CLEANFAIL and maintenance. + tear_down_cleaning=True, set_fail_state=True, + set_maintenance=None): + """Put a failed node in CLEANFAIL and maintenance (if needed). :param task: a TaskManager instance. :param logmsg: Message to be logged. @@ -420,12 +420,19 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, cleaning. Default to True. :param set_fail_state: Whether to set node to failed state. Default to True. + :param set_maintenance: Whether to set maintenance mode. If None, + maintenance mode will be set if and only if a clean step is being + executed on a node. """ + if set_maintenance is None: + set_maintenance = bool(task.node.clean_step) + errmsg = errmsg or logmsg LOG.error(logmsg, exc_info=traceback) node = task.node - node.fault = faults.CLEAN_FAILURE - node.maintenance = True + if set_maintenance: + node.fault = faults.CLEAN_FAILURE + node.maintenance = True if tear_down_cleaning: try: @@ -457,7 +464,7 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, manual_clean = node.target_provision_state == states.MANAGEABLE node.last_error = errmsg # NOTE(dtantsur): avoid overwriting existing maintenance_reason - if not node.maintenance_reason: + if not node.maintenance_reason and set_maintenance: node.maintenance_reason = errmsg node.save() diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 2fba816879..e2e00cf0b3 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -18,6 +18,7 @@ from oslo_config import cfg from oslo_utils import uuidutils from ironic.common import exception +from ironic.common import faults from ironic.common import states from ironic.conductor import cleaning from ironic.conductor import steps as conductor_steps @@ -63,6 +64,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): node.refresh() self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertFalse(node.maintenance) + self.assertIsNone(node.fault) mock_validate.assert_called_once_with(mock.ANY, mock.ANY) @mock.patch('ironic.drivers.modules.fake.FakePower.validate', @@ -331,6 +334,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertIn('is not allowed', node.last_error) self.assertTrue(node.maintenance) self.assertEqual('Original reason', node.maintenance_reason) + self.assertIsNone(node.fault) # no clean step running self.assertFalse(mock_prep.called) self.assertFalse(mock_tear_down.called) @@ -356,6 +360,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertEqual(tgt_prov_state, node.target_provision_state) mock_prep.assert_called_once_with(mock.ANY, task) mock_validate.assert_called_once_with(mock.ANY, task) + self.assertFalse(node.maintenance) + self.assertIsNone(node.fault) def test__do_node_clean_automated_prepare_clean_fail(self): self.__do_node_clean_prepare_clean_fail() @@ -413,6 +419,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) mock_steps.assert_called_once_with(mock.ANY) + self.assertFalse(node.maintenance) + self.assertIsNone(node.fault) def test__do_node_clean_automated_steps_fail(self): for invalid in (True, False): @@ -460,6 +468,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): if clean_steps: self.assertEqual(clean_steps, node.driver_internal_info['clean_steps']) + self.assertFalse(node.maintenance) # Check that state didn't change self.assertEqual(states.CLEANING, node.provision_state) @@ -751,6 +760,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertNotIn('clean_step_index', node.driver_internal_info) self.assertIsNotNone(node.last_error) self.assertTrue(node.maintenance) + self.assertEqual(faults.CLEAN_FAILURE, node.fault) mock_execute.assert_called_once_with( mock.ANY, mock.ANY, self.clean_steps[0]) mock_collect_logs.assert_called_once_with(mock.ANY, label='cleaning') @@ -931,7 +941,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertNotIn('clean_step_index', node.driver_internal_info) self.assertIsNotNone(node.last_error) self.assertEqual(1, tear_mock.call_count) - self.assertTrue(node.maintenance) + self.assertFalse(node.maintenance) # no step is running deploy_exec_calls = [ mock.call(mock.ANY, mock.ANY, self.clean_steps[0]), mock.call(mock.ANY, mock.ANY, self.clean_steps[2]), @@ -1038,7 +1048,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertEqual({}, node.clean_step) self.assertNotIn('clean_step_index', node.driver_internal_info) self.assertIsNotNone(node.last_error) - self.assertTrue(node.maintenance) + self.assertTrue(node.maintenance) # the 1st clean step was running deploy_exec_mock.assert_called_once_with(mock.ANY, mock.ANY, self.clean_steps[0]) # Make sure we don't execute any other step and return diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index e954ec178e..b3d3f42518 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -36,6 +36,7 @@ from ironic.common import boot_devices from ironic.common import components from ironic.common import driver_factory from ironic.common import exception +from ironic.common import faults from ironic.common import images from ironic.common import indicator_states from ironic.common import nova @@ -1858,7 +1859,7 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertIsNotNone(node.last_error) mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY) - def _check_cleanwait_timeouts(self, manual=False): + def _check_cleanwait_timeouts(self, manual=False, with_step=True): self._start_service() CONF.set_override('clean_callback_timeout', 1, group='conductor') tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE @@ -1869,7 +1870,7 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0), clean_step={ 'interface': 'deploy', - 'step': 'erase_devices'}, + 'step': 'erase_devices'} if with_step else {}, driver_internal_info={ 'cleaning_reboot': manual, 'clean_step_index': 0}) @@ -1880,6 +1881,9 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) self.assertIsNotNone(node.last_error) + self.assertEqual(with_step, node.maintenance) + self.assertEqual(faults.CLEAN_FAILURE if with_step else None, + node.fault) # Test that cleaning parameters have been purged in order # to prevent looping of the cleaning sequence self.assertEqual({}, node.clean_step) @@ -1892,6 +1896,9 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): def test__check_cleanwait_timeouts_manual_clean(self): self._check_cleanwait_timeouts(manual=True) + def test__check_cleanwait_timeouts_boot_timeout(self): + self._check_cleanwait_timeouts(with_step=False) + @mock.patch('ironic.drivers.modules.fake.FakeRescue.clean_up', autospec=True) @mock.patch.object(conductor_utils, 'node_power_action', autospec=True) diff --git a/releasenotes/notes/redundant-maintenance-09849674334f656a.yaml b/releasenotes/notes/redundant-maintenance-09849674334f656a.yaml new file mode 100644 index 0000000000..2d82469743 --- /dev/null +++ b/releasenotes/notes/redundant-maintenance-09849674334f656a.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Failed cleaning no longer results in maintenance mode if no clean step is + running, e.g. on PXE timeout or failed clean steps validation.