Do not move nodes to CLEAN FAILED with empty last_error

When cleaning fails, we power off the node, unless it has been running
a clean step already. This happens when aborting cleaning or on a boot
failure. This change makes sure that the power action does not wipe
the last_error field, resulting in a node with provision_state=CLEANFAIL
and last_error=None for several seconds. I've hit this in Metal3.

Also when aborting cleaning, make sure last_error is set during
the transition to CLEANFAIL, not when the clean up thread starts
running.

While here, make sure to log the current step in all cases, not only
when aborting a non-abortable step.

Change-Id: Id21dd7eb44dad149661ebe2d75a9b030aa70526f
Story: #2010603
Task: #47476
This commit is contained in:
Dmitry Tantsur 2023-02-22 16:08:09 +01:00
parent 8f25cf03a3
commit 9a0fa631ca
9 changed files with 80 additions and 27 deletions

View File

@ -269,6 +269,9 @@ _FASTTRACK_LOOKUP_ALLOWED_STATES = (ENROLL, MANAGEABLE, AVAILABLE,
FASTTRACK_LOOKUP_ALLOWED_STATES = frozenset(_FASTTRACK_LOOKUP_ALLOWED_STATES) FASTTRACK_LOOKUP_ALLOWED_STATES = frozenset(_FASTTRACK_LOOKUP_ALLOWED_STATES)
"""States where API lookups are permitted with fast track enabled.""" """States where API lookups are permitted with fast track enabled."""
FAILURE_STATES = frozenset((DEPLOYFAIL, CLEANFAIL, INSPECTFAIL,
RESCUEFAIL, UNRESCUEFAIL, ADOPTFAIL))
############## ##############
# Power states # Power states

View File

@ -248,12 +248,21 @@ def do_next_clean_step(task, step_index, disable_ramdisk=None):
task.process_event(event) task.process_event(event)
def get_last_error(node):
last_error = _('By request, the clean operation was aborted')
if node.clean_step:
last_error += (
_(' during or after the completion of step "%s"')
% conductor_steps.step_id(node.clean_step)
)
return last_error
@task_manager.require_exclusive_lock @task_manager.require_exclusive_lock
def do_node_clean_abort(task, step_name=None): def do_node_clean_abort(task):
"""Internal method to abort an ongoing operation. """Internal method to abort an ongoing operation.
:param task: a TaskManager instance with an exclusive lock :param task: a TaskManager instance with an exclusive lock
:param step_name: The name of the clean step.
""" """
node = task.node node = task.node
try: try:
@ -271,12 +280,13 @@ def do_node_clean_abort(task, step_name=None):
set_fail_state=False) set_fail_state=False)
return return
last_error = get_last_error(node)
info_message = _('Clean operation aborted for node %s') % node.uuid info_message = _('Clean operation aborted for node %s') % node.uuid
last_error = _('By request, the clean operation was aborted') if node.clean_step:
if step_name: info_message += (
msg = _(' after the completion of step "%s"') % step_name _(' during or after the completion of step "%s"')
last_error += msg % node.clean_step
info_message += msg )
node.last_error = last_error node.last_error = last_error
node.clean_step = None node.clean_step = None
@ -318,7 +328,7 @@ def continue_node_clean(task):
target_state = None target_state = None
task.process_event('fail', target_state=target_state) task.process_event('fail', target_state=target_state)
do_node_clean_abort(task, step_name) do_node_clean_abort(task)
return return
LOG.debug('The cleaning operation for node %(node)s was ' LOG.debug('The cleaning operation for node %(node)s was '

View File

@ -1349,7 +1349,8 @@ class ConductorManager(base_manager.BaseConductorManager):
callback=self._spawn_worker, callback=self._spawn_worker,
call_args=(cleaning.do_node_clean_abort, task), call_args=(cleaning.do_node_clean_abort, task),
err_handler=utils.provisioning_error_handler, err_handler=utils.provisioning_error_handler,
target_state=target_state) target_state=target_state,
last_error=cleaning.get_last_error(node))
return return
if node.provision_state == states.RESCUEWAIT: if node.provision_state == states.RESCUEWAIT:

View File

@ -527,7 +527,8 @@ class TaskManager(object):
self.release_resources() self.release_resources()
def process_event(self, event, callback=None, call_args=None, def process_event(self, event, callback=None, call_args=None,
call_kwargs=None, err_handler=None, target_state=None): call_kwargs=None, err_handler=None, target_state=None,
last_error=None):
"""Process the given event for the task's current state. """Process the given event for the task's current state.
:param event: the name of the event to process :param event: the name of the event to process
@ -540,6 +541,8 @@ class TaskManager(object):
prev_target_state) prev_target_state)
:param target_state: if specified, the target provision state for the :param target_state: if specified, the target provision state for the
node. Otherwise, use the target state from the fsm node. Otherwise, use the target state from the fsm
:param last_error: last error to set on the node together with
the state transition.
:raises: InvalidState if the event is not allowed by the associated :raises: InvalidState if the event is not allowed by the associated
state machine state machine
""" """
@ -572,13 +575,15 @@ class TaskManager(object):
# set up the async worker # set up the async worker
if callback: if callback:
# clear the error if we're going to start work in a callback # update the error if we're going to start work in a callback
self.node.last_error = None self.node.last_error = last_error
if call_args is None: if call_args is None:
call_args = () call_args = ()
if call_kwargs is None: if call_kwargs is None:
call_kwargs = {} call_kwargs = {}
self.spawn_after(callback, *call_args, **call_kwargs) self.spawn_after(callback, *call_args, **call_kwargs)
elif last_error is not None:
self.node.last_error = last_error
# publish the state transition by saving the Node # publish the state transition by saving the Node
self.node.save() self.node.save()

View File

@ -302,9 +302,11 @@ def node_power_action(task, new_state, timeout=None):
# Set the target_power_state and clear any last_error, if we're # Set the target_power_state and clear any last_error, if we're
# starting a new operation. This will expose to other processes # starting a new operation. This will expose to other processes
# and clients that work is in progress. # and clients that work is in progress. Keep the last_error intact
node['target_power_state'] = target_state # if the power action happens as a result of a failure.
node['last_error'] = None node.target_power_state = target_state
if node.provision_state not in states.FAILURE_STATES:
node.last_error = None
node.timestamp_driver_internal_info('last_power_state_change') node.timestamp_driver_internal_info('last_power_state_change')
# NOTE(dtantsur): wipe token on shutting down, otherwise a reboot in # NOTE(dtantsur): wipe token on shutting down, otherwise a reboot in
# fast-track (or an accidentally booted agent) will cause subsequent # fast-track (or an accidentally booted agent) will cause subsequent

View File

@ -1138,12 +1138,12 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
class DoNodeCleanAbortTestCase(db_base.DbTestCase): class DoNodeCleanAbortTestCase(db_base.DbTestCase):
@mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True) @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True)
def _test__do_node_clean_abort(self, step_name, tear_mock): def _test_do_node_clean_abort(self, clean_step, tear_mock):
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake-hardware', self.context, driver='fake-hardware',
provision_state=states.CLEANFAIL, provision_state=states.CLEANWAIT,
target_provision_state=states.AVAILABLE, target_provision_state=states.AVAILABLE,
clean_step={'step': 'foo', 'abortable': True}, clean_step=clean_step,
driver_internal_info={ driver_internal_info={
'agent_url': 'some url', 'agent_url': 'some url',
'agent_secret_token': 'token', 'agent_secret_token': 'token',
@ -1153,11 +1153,11 @@ class DoNodeCleanAbortTestCase(db_base.DbTestCase):
'skip_current_clean_step': True}) 'skip_current_clean_step': True})
with task_manager.acquire(self.context, node.uuid) as task: with task_manager.acquire(self.context, node.uuid) as task:
cleaning.do_node_clean_abort(task, step_name=step_name) cleaning.do_node_clean_abort(task)
self.assertIsNotNone(task.node.last_error) self.assertIsNotNone(task.node.last_error)
tear_mock.assert_called_once_with(task.driver.deploy, task) tear_mock.assert_called_once_with(task.driver.deploy, task)
if step_name: if clean_step:
self.assertIn(step_name, task.node.last_error) self.assertIn(clean_step['step'], task.node.last_error)
# assert node's clean_step and metadata was cleaned up # assert node's clean_step and metadata was cleaned up
self.assertEqual({}, task.node.clean_step) self.assertEqual({}, task.node.clean_step)
self.assertNotIn('clean_step_index', self.assertNotIn('clean_step_index',
@ -1173,11 +1173,12 @@ class DoNodeCleanAbortTestCase(db_base.DbTestCase):
self.assertNotIn('agent_secret_token', self.assertNotIn('agent_secret_token',
task.node.driver_internal_info) task.node.driver_internal_info)
def test__do_node_clean_abort(self): def test_do_node_clean_abort_early(self):
self._test__do_node_clean_abort(None) self._test_do_node_clean_abort(None)
def test__do_node_clean_abort_with_step_name(self): def test_do_node_clean_abort_with_step(self):
self._test__do_node_clean_abort('foo') self._test_do_node_clean_abort({'step': 'foo', 'interface': 'deploy',
'abortable': True})
@mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True) @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True)
def test__do_node_clean_abort_tear_down_fail(self, tear_mock): def test__do_node_clean_abort_tear_down_fail(self, tear_mock):

View File

@ -2734,7 +2734,8 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin,
# Node will be moved to tgt_prov_state after cleaning, not tested here # Node will be moved to tgt_prov_state after cleaning, not tested here
self.assertEqual(states.CLEANFAIL, node.provision_state) self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state) self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertIsNone(node.last_error) self.assertEqual('By request, the clean operation was aborted',
node.last_error)
mock_spawn.assert_called_with( mock_spawn.assert_called_with(
self.service, cleaning.do_node_clean_abort, mock.ANY) self.service, cleaning.do_node_clean_abort, mock.ANY)

View File

@ -196,7 +196,8 @@ class NodePowerActionTestCase(db_base.DbTestCase):
node = obj_utils.create_test_node(self.context, node = obj_utils.create_test_node(self.context,
uuid=uuidutils.generate_uuid(), uuid=uuidutils.generate_uuid(),
driver='fake-hardware', driver='fake-hardware',
power_state=states.POWER_OFF) power_state=states.POWER_OFF,
last_error='failed before')
task = task_manager.TaskManager(self.context, node.uuid) task = task_manager.TaskManager(self.context, node.uuid)
get_power_mock.return_value = states.POWER_OFF get_power_mock.return_value = states.POWER_OFF
@ -209,6 +210,27 @@ class NodePowerActionTestCase(db_base.DbTestCase):
self.assertIsNone(node['target_power_state']) self.assertIsNone(node['target_power_state'])
self.assertIsNone(node['last_error']) self.assertIsNone(node['last_error'])
@mock.patch.object(fake.FakePower, 'get_power_state', autospec=True)
def test_node_power_action_keep_last_error(self, get_power_mock):
"""Test node_power_action to keep last_error for failed states."""
node = obj_utils.create_test_node(self.context,
uuid=uuidutils.generate_uuid(),
driver='fake-hardware',
power_state=states.POWER_OFF,
provision_state=states.CLEANFAIL,
last_error='failed before')
task = task_manager.TaskManager(self.context, node.uuid)
get_power_mock.return_value = states.POWER_OFF
conductor_utils.node_power_action(task, states.POWER_ON)
node.refresh()
get_power_mock.assert_called_once_with(mock.ANY, mock.ANY)
self.assertEqual(states.POWER_ON, node['power_state'])
self.assertIsNone(node['target_power_state'])
self.assertEqual('failed before', node['last_error'])
@mock.patch('ironic.objects.node.NodeSetPowerStateNotification', @mock.patch('ironic.objects.node.NodeSetPowerStateNotification',
autospec=True) autospec=True)
@mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True)

View File

@ -0,0 +1,8 @@
---
fixes:
- |
When aborting cleaning, the ``last_error`` field is no longer initially
empty. It is now populated on the state transition to ``clean failed``.
- |
When cleaning or deployment fails, the ``last_error`` field is no longer
temporary set to ``None`` while the power off action is running.