diff --git a/doc/source/admin/cleaning.rst b/doc/source/admin/cleaning.rst index 8338a8b175..9cfd628c79 100644 --- a/doc/source/admin/cleaning.rst +++ b/doc/source/admin/cleaning.rst @@ -314,9 +314,15 @@ cleaning. Troubleshooting =============== -If cleaning fails on a node, the node will be put into ``clean failed`` state -and placed in maintenance mode, to prevent ironic from taking actions on the -node. +If cleaning fails on a node, the node will be put into ``clean failed`` state. +If the failure happens while running a clean step, the node is also placed in +maintenance mode to prevent ironic from taking actions on the node. The +operator should validate that no permanent damage has been done to the +node and no processes are still running on it before removing the maintenance +mode. + +.. note:: Older versions of ironic may put the node to maintenance even when + no clean step has been running. Nodes in ``clean failed`` will not be powered off, as the node might be in a state such that powering it off could damage the node or remove useful diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 081c6ce42c..fccb9261c5 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -407,9 +407,9 @@ def cleanup_cleanwait_timeout(task): def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, - tear_down_cleaning=True, - set_fail_state=True): - """Put a failed node in CLEANFAIL and maintenance. + tear_down_cleaning=True, set_fail_state=True, + set_maintenance=None): + """Put a failed node in CLEANFAIL and maintenance (if needed). :param task: a TaskManager instance. :param logmsg: Message to be logged. @@ -420,12 +420,19 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, cleaning. Default to True. :param set_fail_state: Whether to set node to failed state. Default to True. + :param set_maintenance: Whether to set maintenance mode. If None, + maintenance mode will be set if and only if a clean step is being + executed on a node. """ + if set_maintenance is None: + set_maintenance = bool(task.node.clean_step) + errmsg = errmsg or logmsg LOG.error(logmsg, exc_info=traceback) node = task.node - node.fault = faults.CLEAN_FAILURE - node.maintenance = True + if set_maintenance: + node.fault = faults.CLEAN_FAILURE + node.maintenance = True if tear_down_cleaning: try: @@ -457,7 +464,7 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, manual_clean = node.target_provision_state == states.MANAGEABLE node.last_error = errmsg # NOTE(dtantsur): avoid overwriting existing maintenance_reason - if not node.maintenance_reason: + if not node.maintenance_reason and set_maintenance: node.maintenance_reason = errmsg node.save() diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 2fba816879..e2e00cf0b3 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -18,6 +18,7 @@ from oslo_config import cfg from oslo_utils import uuidutils from ironic.common import exception +from ironic.common import faults from ironic.common import states from ironic.conductor import cleaning from ironic.conductor import steps as conductor_steps @@ -63,6 +64,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): node.refresh() self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertFalse(node.maintenance) + self.assertIsNone(node.fault) mock_validate.assert_called_once_with(mock.ANY, mock.ANY) @mock.patch('ironic.drivers.modules.fake.FakePower.validate', @@ -331,6 +334,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertIn('is not allowed', node.last_error) self.assertTrue(node.maintenance) self.assertEqual('Original reason', node.maintenance_reason) + self.assertIsNone(node.fault) # no clean step running self.assertFalse(mock_prep.called) self.assertFalse(mock_tear_down.called) @@ -356,6 +360,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertEqual(tgt_prov_state, node.target_provision_state) mock_prep.assert_called_once_with(mock.ANY, task) mock_validate.assert_called_once_with(mock.ANY, task) + self.assertFalse(node.maintenance) + self.assertIsNone(node.fault) def test__do_node_clean_automated_prepare_clean_fail(self): self.__do_node_clean_prepare_clean_fail() @@ -413,6 +419,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) mock_steps.assert_called_once_with(mock.ANY) + self.assertFalse(node.maintenance) + self.assertIsNone(node.fault) def test__do_node_clean_automated_steps_fail(self): for invalid in (True, False): @@ -460,6 +468,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): if clean_steps: self.assertEqual(clean_steps, node.driver_internal_info['clean_steps']) + self.assertFalse(node.maintenance) # Check that state didn't change self.assertEqual(states.CLEANING, node.provision_state) @@ -751,6 +760,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertNotIn('clean_step_index', node.driver_internal_info) self.assertIsNotNone(node.last_error) self.assertTrue(node.maintenance) + self.assertEqual(faults.CLEAN_FAILURE, node.fault) mock_execute.assert_called_once_with( mock.ANY, mock.ANY, self.clean_steps[0]) mock_collect_logs.assert_called_once_with(mock.ANY, label='cleaning') @@ -931,7 +941,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertNotIn('clean_step_index', node.driver_internal_info) self.assertIsNotNone(node.last_error) self.assertEqual(1, tear_mock.call_count) - self.assertTrue(node.maintenance) + self.assertFalse(node.maintenance) # no step is running deploy_exec_calls = [ mock.call(mock.ANY, mock.ANY, self.clean_steps[0]), mock.call(mock.ANY, mock.ANY, self.clean_steps[2]), @@ -1038,7 +1048,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertEqual({}, node.clean_step) self.assertNotIn('clean_step_index', node.driver_internal_info) self.assertIsNotNone(node.last_error) - self.assertTrue(node.maintenance) + self.assertTrue(node.maintenance) # the 1st clean step was running deploy_exec_mock.assert_called_once_with(mock.ANY, mock.ANY, self.clean_steps[0]) # Make sure we don't execute any other step and return diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index e954ec178e..b3d3f42518 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -36,6 +36,7 @@ from ironic.common import boot_devices from ironic.common import components from ironic.common import driver_factory from ironic.common import exception +from ironic.common import faults from ironic.common import images from ironic.common import indicator_states from ironic.common import nova @@ -1858,7 +1859,7 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertIsNotNone(node.last_error) mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY) - def _check_cleanwait_timeouts(self, manual=False): + def _check_cleanwait_timeouts(self, manual=False, with_step=True): self._start_service() CONF.set_override('clean_callback_timeout', 1, group='conductor') tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE @@ -1869,7 +1870,7 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0), clean_step={ 'interface': 'deploy', - 'step': 'erase_devices'}, + 'step': 'erase_devices'} if with_step else {}, driver_internal_info={ 'cleaning_reboot': manual, 'clean_step_index': 0}) @@ -1880,6 +1881,9 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state) self.assertIsNotNone(node.last_error) + self.assertEqual(with_step, node.maintenance) + self.assertEqual(faults.CLEAN_FAILURE if with_step else None, + node.fault) # Test that cleaning parameters have been purged in order # to prevent looping of the cleaning sequence self.assertEqual({}, node.clean_step) @@ -1892,6 +1896,9 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): def test__check_cleanwait_timeouts_manual_clean(self): self._check_cleanwait_timeouts(manual=True) + def test__check_cleanwait_timeouts_boot_timeout(self): + self._check_cleanwait_timeouts(with_step=False) + @mock.patch('ironic.drivers.modules.fake.FakeRescue.clean_up', autospec=True) @mock.patch.object(conductor_utils, 'node_power_action', autospec=True) diff --git a/releasenotes/notes/redundant-maintenance-09849674334f656a.yaml b/releasenotes/notes/redundant-maintenance-09849674334f656a.yaml new file mode 100644 index 0000000000..2d82469743 --- /dev/null +++ b/releasenotes/notes/redundant-maintenance-09849674334f656a.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Failed cleaning no longer results in maintenance mode if no clean step is + running, e.g. on PXE timeout or failed clean steps validation.