Merge "Do not enter maintenance if cleaning fails before running the 1st step"

This commit is contained in:
Zuul 2021-01-12 07:10:42 +00:00 committed by Gerrit Code Review
commit 07bdccea58
5 changed files with 48 additions and 13 deletions

View File

@ -314,9 +314,15 @@ cleaning.
Troubleshooting Troubleshooting
=============== ===============
If cleaning fails on a node, the node will be put into ``clean failed`` state If cleaning fails on a node, the node will be put into ``clean failed`` state.
and placed in maintenance mode, to prevent ironic from taking actions on the If the failure happens while running a clean step, the node is also placed in
node. maintenance mode to prevent ironic from taking actions on the node. The
operator should validate that no permanent damage has been done to the
node and no processes are still running on it before removing the maintenance
mode.
.. note:: Older versions of ironic may put the node to maintenance even when
no clean step has been running.
Nodes in ``clean failed`` will not be powered off, as the node might be in a Nodes in ``clean failed`` will not be powered off, as the node might be in a
state such that powering it off could damage the node or remove useful state such that powering it off could damage the node or remove useful

View File

@ -407,9 +407,9 @@ def cleanup_cleanwait_timeout(task):
def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False, def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
tear_down_cleaning=True, tear_down_cleaning=True, set_fail_state=True,
set_fail_state=True): set_maintenance=None):
"""Put a failed node in CLEANFAIL and maintenance. """Put a failed node in CLEANFAIL and maintenance (if needed).
:param task: a TaskManager instance. :param task: a TaskManager instance.
:param logmsg: Message to be logged. :param logmsg: Message to be logged.
@ -420,12 +420,19 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
cleaning. Default to True. cleaning. Default to True.
:param set_fail_state: Whether to set node to failed state. Default to :param set_fail_state: Whether to set node to failed state. Default to
True. True.
:param set_maintenance: Whether to set maintenance mode. If None,
maintenance mode will be set if and only if a clean step is being
executed on a node.
""" """
if set_maintenance is None:
set_maintenance = bool(task.node.clean_step)
errmsg = errmsg or logmsg errmsg = errmsg or logmsg
LOG.error(logmsg, exc_info=traceback) LOG.error(logmsg, exc_info=traceback)
node = task.node node = task.node
node.fault = faults.CLEAN_FAILURE if set_maintenance:
node.maintenance = True node.fault = faults.CLEAN_FAILURE
node.maintenance = True
if tear_down_cleaning: if tear_down_cleaning:
try: try:
@ -457,7 +464,7 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
manual_clean = node.target_provision_state == states.MANAGEABLE manual_clean = node.target_provision_state == states.MANAGEABLE
node.last_error = errmsg node.last_error = errmsg
# NOTE(dtantsur): avoid overwriting existing maintenance_reason # NOTE(dtantsur): avoid overwriting existing maintenance_reason
if not node.maintenance_reason: if not node.maintenance_reason and set_maintenance:
node.maintenance_reason = errmsg node.maintenance_reason = errmsg
node.save() node.save()

View File

@ -18,6 +18,7 @@ from oslo_config import cfg
from oslo_utils import uuidutils from oslo_utils import uuidutils
from ironic.common import exception from ironic.common import exception
from ironic.common import faults
from ironic.common import states from ironic.common import states
from ironic.conductor import cleaning from ironic.conductor import cleaning
from ironic.conductor import steps as conductor_steps from ironic.conductor import steps as conductor_steps
@ -63,6 +64,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
node.refresh() node.refresh()
self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertFalse(node.maintenance)
self.assertIsNone(node.fault)
mock_validate.assert_called_once_with(mock.ANY, mock.ANY) mock_validate.assert_called_once_with(mock.ANY, mock.ANY)
@mock.patch('ironic.drivers.modules.fake.FakePower.validate', @mock.patch('ironic.drivers.modules.fake.FakePower.validate',
@ -331,6 +334,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertIn('is not allowed', node.last_error) self.assertIn('is not allowed', node.last_error)
self.assertTrue(node.maintenance) self.assertTrue(node.maintenance)
self.assertEqual('Original reason', node.maintenance_reason) self.assertEqual('Original reason', node.maintenance_reason)
self.assertIsNone(node.fault) # no clean step running
self.assertFalse(mock_prep.called) self.assertFalse(mock_prep.called)
self.assertFalse(mock_tear_down.called) self.assertFalse(mock_tear_down.called)
@ -356,6 +360,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertEqual(tgt_prov_state, node.target_provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state)
mock_prep.assert_called_once_with(mock.ANY, task) mock_prep.assert_called_once_with(mock.ANY, task)
mock_validate.assert_called_once_with(mock.ANY, task) mock_validate.assert_called_once_with(mock.ANY, task)
self.assertFalse(node.maintenance)
self.assertIsNone(node.fault)
def test__do_node_clean_automated_prepare_clean_fail(self): def test__do_node_clean_automated_prepare_clean_fail(self):
self.__do_node_clean_prepare_clean_fail() self.__do_node_clean_prepare_clean_fail()
@ -413,6 +419,8 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state)
mock_steps.assert_called_once_with(mock.ANY) mock_steps.assert_called_once_with(mock.ANY)
self.assertFalse(node.maintenance)
self.assertIsNone(node.fault)
def test__do_node_clean_automated_steps_fail(self): def test__do_node_clean_automated_steps_fail(self):
for invalid in (True, False): for invalid in (True, False):
@ -460,6 +468,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
if clean_steps: if clean_steps:
self.assertEqual(clean_steps, self.assertEqual(clean_steps,
node.driver_internal_info['clean_steps']) node.driver_internal_info['clean_steps'])
self.assertFalse(node.maintenance)
# Check that state didn't change # Check that state didn't change
self.assertEqual(states.CLEANING, node.provision_state) self.assertEqual(states.CLEANING, node.provision_state)
@ -751,6 +760,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertNotIn('clean_step_index', node.driver_internal_info) self.assertNotIn('clean_step_index', node.driver_internal_info)
self.assertIsNotNone(node.last_error) self.assertIsNotNone(node.last_error)
self.assertTrue(node.maintenance) self.assertTrue(node.maintenance)
self.assertEqual(faults.CLEAN_FAILURE, node.fault)
mock_execute.assert_called_once_with( mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.clean_steps[0]) mock.ANY, mock.ANY, self.clean_steps[0])
mock_collect_logs.assert_called_once_with(mock.ANY, label='cleaning') mock_collect_logs.assert_called_once_with(mock.ANY, label='cleaning')
@ -931,7 +941,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertNotIn('clean_step_index', node.driver_internal_info) self.assertNotIn('clean_step_index', node.driver_internal_info)
self.assertIsNotNone(node.last_error) self.assertIsNotNone(node.last_error)
self.assertEqual(1, tear_mock.call_count) self.assertEqual(1, tear_mock.call_count)
self.assertTrue(node.maintenance) self.assertFalse(node.maintenance) # no step is running
deploy_exec_calls = [ deploy_exec_calls = [
mock.call(mock.ANY, mock.ANY, self.clean_steps[0]), mock.call(mock.ANY, mock.ANY, self.clean_steps[0]),
mock.call(mock.ANY, mock.ANY, self.clean_steps[2]), mock.call(mock.ANY, mock.ANY, self.clean_steps[2]),
@ -1038,7 +1048,7 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertEqual({}, node.clean_step) self.assertEqual({}, node.clean_step)
self.assertNotIn('clean_step_index', node.driver_internal_info) self.assertNotIn('clean_step_index', node.driver_internal_info)
self.assertIsNotNone(node.last_error) self.assertIsNotNone(node.last_error)
self.assertTrue(node.maintenance) self.assertTrue(node.maintenance) # the 1st clean step was running
deploy_exec_mock.assert_called_once_with(mock.ANY, mock.ANY, deploy_exec_mock.assert_called_once_with(mock.ANY, mock.ANY,
self.clean_steps[0]) self.clean_steps[0])
# Make sure we don't execute any other step and return # Make sure we don't execute any other step and return

View File

@ -36,6 +36,7 @@ from ironic.common import boot_devices
from ironic.common import components from ironic.common import components
from ironic.common import driver_factory from ironic.common import driver_factory
from ironic.common import exception from ironic.common import exception
from ironic.common import faults
from ironic.common import images from ironic.common import images
from ironic.common import indicator_states from ironic.common import indicator_states
from ironic.common import nova from ironic.common import nova
@ -1858,7 +1859,7 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertIsNotNone(node.last_error) self.assertIsNotNone(node.last_error)
mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY) mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY)
def _check_cleanwait_timeouts(self, manual=False): def _check_cleanwait_timeouts(self, manual=False, with_step=True):
self._start_service() self._start_service()
CONF.set_override('clean_callback_timeout', 1, group='conductor') CONF.set_override('clean_callback_timeout', 1, group='conductor')
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
@ -1869,7 +1870,7 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0), provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0),
clean_step={ clean_step={
'interface': 'deploy', 'interface': 'deploy',
'step': 'erase_devices'}, 'step': 'erase_devices'} if with_step else {},
driver_internal_info={ driver_internal_info={
'cleaning_reboot': manual, 'cleaning_reboot': manual,
'clean_step_index': 0}) 'clean_step_index': 0})
@ -1880,6 +1881,9 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertIsNotNone(node.last_error) self.assertIsNotNone(node.last_error)
self.assertEqual(with_step, node.maintenance)
self.assertEqual(faults.CLEAN_FAILURE if with_step else None,
node.fault)
# Test that cleaning parameters have been purged in order # Test that cleaning parameters have been purged in order
# to prevent looping of the cleaning sequence # to prevent looping of the cleaning sequence
self.assertEqual({}, node.clean_step) self.assertEqual({}, node.clean_step)
@ -1892,6 +1896,9 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
def test__check_cleanwait_timeouts_manual_clean(self): def test__check_cleanwait_timeouts_manual_clean(self):
self._check_cleanwait_timeouts(manual=True) self._check_cleanwait_timeouts(manual=True)
def test__check_cleanwait_timeouts_boot_timeout(self):
self._check_cleanwait_timeouts(with_step=False)
@mock.patch('ironic.drivers.modules.fake.FakeRescue.clean_up', @mock.patch('ironic.drivers.modules.fake.FakeRescue.clean_up',
autospec=True) autospec=True)
@mock.patch.object(conductor_utils, 'node_power_action', autospec=True) @mock.patch.object(conductor_utils, 'node_power_action', autospec=True)

View File

@ -0,0 +1,5 @@
---
fixes:
- |
Failed cleaning no longer results in maintenance mode if no clean step is
running, e.g. on PXE timeout or failed clean steps validation.