Agent vendor handles manual cleaning

Agent base vendor now supports manual cleaning. In the case of a
clean version mismatch (agent rebooted to a new version after manual
cleaning started), we continue from the current clean step instead
of (in automated cleaning) restarting from the first clean step.

Change-Id: I89726f3b32f96befba9a2e1bee67b2a7cc7eeb57
Partial-Bug: #1526290
This commit is contained in:
Ruby Loo 2015-11-19 17:14:55 +00:00
parent cc3891573e
commit 1f82783a9e
4 changed files with 208 additions and 38 deletions

View File

@ -646,10 +646,18 @@ class ConductorManager(base_manager.BaseConductorManager):
state=node.provision_state)
self._do_node_clean(task)
def _get_node_next_clean_steps(self, task):
def _get_node_next_clean_steps(self, task, skip_current_step=True):
"""Get the task's node's next clean steps.
This returns the list of remaining (ordered) clean steps to be done
on the node. If no clean steps have been started yet, all the clean
steps are returned. Otherwise, the clean steps after the current
clean step are returned. The current clean step is also returned if
'skip_current_step' is False.
:param task: A TaskManager object
:param skip_current_step: True to skip the current clean step; False to
include it.
:raises: NodeCleaningFailure if an internal error occurred when
getting the next clean steps
:returns: ordered list of clean step dictionaries
@ -662,9 +670,12 @@ class ConductorManager(base_manager.BaseConductorManager):
return next_steps
try:
# Trim off the last clean step (now finished) and
# all previous steps
next_steps = next_steps[next_steps.index(node.clean_step) + 1:]
# Trim off all previous steps up to (and maybe including) the
# current clean step.
ind = next_steps.index(node.clean_step)
if skip_current_step:
ind += 1
next_steps = next_steps[ind:]
except ValueError:
msg = (_('Node %(node)s got an invalid last step for '
'%(state)s: %(step)s.') %
@ -782,7 +793,17 @@ class ConductorManager(base_manager.BaseConductorManager):
'state': node.provision_state,
'clean_state': states.CLEANWAIT})
next_steps = self._get_node_next_clean_steps(task)
info = node.driver_internal_info
try:
skip_current_step = info.pop('skip_current_clean_step')
except KeyError:
skip_current_step = True
else:
node.driver_internal_info = info
node.save()
next_steps = self._get_node_next_clean_steps(
task, skip_current_step=skip_current_step)
# If this isn't the final clean step in the cleaning operation
# and it is flagged to abort after the clean step that just

View File

@ -218,14 +218,18 @@ class BaseAgentVendor(base.VendorInterface):
def continue_cleaning(self, task, **kwargs):
"""Start the next cleaning step if the previous one is complete.
In order to avoid errors and make agent upgrades painless, cleaning
will check the version of all hardware managers during get_clean_steps
at the beginning of cleaning and before executing each step in the
agent. If the version has changed between steps, the agent is unable
to tell if an ordering change will cause a cleaning issue. Therefore,
we restart cleaning.
In order to avoid errors and make agent upgrades painless, the agent
compares the version of all hardware managers at the start of the
cleaning (the agent's get_clean_steps() call) and before executing
each clean step. If the version has changed between steps, the agent is
unable to tell if an ordering change will cause a cleaning issue so
it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we restart
the entire cleaning cycle. For manual cleaning, we don't.
"""
node = task.node
# For manual clean, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is (the default) AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
command = self._get_completed_cleaning_command(task)
LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
' %(command)s', {'node': node.uuid,
@ -245,20 +249,57 @@ class BaseAgentVendor(base.VendorInterface):
LOG.error(msg)
return manager_utils.cleaning_error_handler(task, msg)
elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
# Restart cleaning, agent must have rebooted to new version
LOG.info(_LI('Node %s detected a clean version mismatch, '
'resetting clean steps and rebooting the node.'),
node.uuid)
try:
manager_utils.set_node_cleaning_steps(task)
except exception.NodeCleaningFailure:
msg = (_('Could not restart cleaning on node %(node)s: '
'%(err)s.') %
{'node': node.uuid,
'err': command.get('command_error'),
'step': node.clean_step})
LOG.exception(msg)
return manager_utils.cleaning_error_handler(task, msg)
if manual_clean:
# Don't restart manual cleaning if agent reboots to a new
# version. Both are operator actions, unlike automated
# cleaning. Manual clean steps are not necessarily idempotent
# like automated clean steps and can be even longer running.
LOG.info(_LI('During manual cleaning, node %(node)s detected '
'a clean version mismatch. Re-executing and '
'continuing from current step %(step)s.'),
{'node': node.uuid, 'step': node.clean_step})
result = self._client.get_clean_steps(
task.node, task.ports).get('command_result')
required_keys = ('clean_steps', 'hardware_manager_version')
missing_keys = [key for key in required_keys
if key not in result]
if missing_keys:
msg = (_('Could not continue manual cleaning from step '
'%(step)s on node %(node)s. get_clean_steps '
'returned invalid result. The keys %(keys)s are '
'missing from result %(result)s.') %
{'node': node.uuid,
'step': node.clean_step,
'keys': missing_keys,
'result': result})
LOG.error(msg)
return manager_utils.cleaning_error_handler(task, msg)
driver_internal_info = node.driver_internal_info
driver_internal_info['hardware_manager_version'] = result[
'hardware_manager_version']
driver_internal_info['skip_current_clean_step'] = False
node.driver_internal_info = driver_internal_info
node.save()
else:
# Restart cleaning, agent must have rebooted to new version
LOG.info(_LI('During automated cleaning, node %s detected a '
'clean version mismatch. Resetting clean steps '
'and rebooting the node.'),
node.uuid)
try:
manager_utils.set_node_cleaning_steps(task)
except exception.NodeCleaningFailure:
msg = (_('Could not restart automated cleaning on node '
'%(node)s: %(err)s.') %
{'node': node.uuid,
'err': command.get('command_error'),
'step': node.clean_step})
LOG.exception(msg)
return manager_utils.cleaning_error_handler(task, msg)
self.notify_conductor_resume_clean(task)
elif command.get('command_status') == 'SUCCEEDED':
@ -354,7 +395,8 @@ class BaseAgentVendor(base.VendorInterface):
if not node.clean_step:
LOG.debug('Node %s just booted to start cleaning.',
node.uuid)
msg = _('Node failed to start the next cleaning step.')
msg = _('Node failed to start the first cleaning '
'step.')
manager_utils.set_node_cleaning_steps(task)
self.notify_conductor_resume_clean(task)
else:
@ -368,7 +410,7 @@ class BaseAgentVendor(base.VendorInterface):
except Exception as e:
err_info = {'node': node.uuid, 'msg': msg, 'e': e}
last_error = _('Asynchronous exception for node %(node)s: '
'%(msg)s exception: %(e)s') % err_info
'%(msg)s Exception: %(e)s') % err_info
LOG.exception(last_error)
if node.provision_state in (states.CLEANING, states.CLEANWAIT):
manager_utils.cleaning_error_handler(task, last_error)

View File

@ -1543,6 +1543,35 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin,
def test_continue_node_clean_backward_compat(self):
self._continue_node_clean(states.CLEANING)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker')
def _continue_node_clean_skip_step(self, mock_spawn, skip=True):
# test that skipping current step mechanism works
driver_info = {'clean_steps': self.clean_steps}
if not skip:
driver_info['skip_current_clean_step'] = skip
node = obj_utils.create_test_node(
self.context, driver='fake', provision_state=states.CLEANWAIT,
target_provision_state=states.MANAGEABLE,
driver_internal_info=driver_info, clean_step=self.clean_steps[0])
self._start_service()
self.service.continue_node_clean(self.context, node.uuid)
self.service._worker_pool.waitall()
node.refresh()
if skip:
expected_steps = self.next_clean_steps
else:
self.assertFalse(
'skip_current_clean_step' in node.driver_internal_info)
expected_steps = self.clean_steps
mock_spawn.assert_called_with(self.service._do_next_clean_step,
mock.ANY, expected_steps)
def test_continue_node_clean_skip_step(self):
self._continue_node_clean_skip_step()
def test_continue_node_clean_no_skip_step(self):
self._continue_node_clean_skip_step(skip=False)
def _continue_node_clean_abort(self, manual=False):
last_clean_step = self.clean_steps[0]
last_clean_step['abortable'] = False
@ -2061,7 +2090,7 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin,
def test__do_next_clean_step_manual_bad_step_return_value(self):
self._do_next_clean_step_bad_step_return_value(manual=True)
def test__get_node_next_clean_steps(self):
def __get_node_next_clean_steps(self, skip=True):
driver_internal_info = {'clean_steps': self.clean_steps}
node = obj_utils.create_test_node(
self.context, driver='fake',
@ -2072,8 +2101,16 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin,
clean_step=self.clean_steps[0])
with task_manager.acquire(self.context, node.uuid) as task:
steps = self.service._get_node_next_clean_steps(task)
self.assertEqual(self.next_clean_steps, steps)
steps = self.service._get_node_next_clean_steps(
task, skip_current_step=skip)
expected = self.next_clean_steps if skip else self.clean_steps
self.assertEqual(expected, steps)
def test__get_node_next_clean_steps(self):
self.__get_node_next_clean_steps()
def test__get_node_next_clean_steps_no_skip(self):
self.__get_node_next_clean_steps(skip=False)
def test__get_node_next_clean_steps_unset_clean_step(self):
driver_internal_info = {'clean_steps': self.clean_steps}

View File

@ -319,7 +319,7 @@ class TestBaseAgentVendor(db_base.DbTestCase):
log_mock.assert_called_once_with(
'Asynchronous exception for node '
'1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy '
'is done. exception: LlamaException')
'is done. Exception: LlamaException')
@mock.patch.object(agent_base_vendor.BaseAgentVendor, 'deploy_has_started',
autospec=True)
@ -354,7 +354,7 @@ class TestBaseAgentVendor(db_base.DbTestCase):
log_mock.assert_called_once_with(
'Asynchronous exception for node '
'1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy '
'is done. exception: LlamaException')
'is done. Exception: LlamaException')
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(manager_utils, 'set_node_cleaning_steps', autospec=True)
@ -897,21 +897,91 @@ class TestBaseAgentVendor(db_base.DbTestCase):
@mock.patch.object(manager_utils, 'set_node_cleaning_steps', autospec=True)
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
'notify_conductor_resume_clean', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'get_clean_steps',
autospec=True)
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
autospec=True)
def test_continue_cleaning_clean_version_mismatch(
self, status_mock, notify_mock, steps_mock):
# Test that cleaning is restarted if there is a version mismatch
def _test_continue_cleaning_clean_version_mismatch(
self, status_mock, get_steps_mock, notify_mock, steps_mock,
manual=False):
get_steps_mock.return_value = {
'command_status': 'CLEAN_VERSION_MISMATCH',
'command_name': 'get_clean_step',
'command_result': {
'hardware_manager_version': {'Generic': '1'},
'clean_steps': {
'GenericHardwareManager': [
{'interface': 'deploy',
'step': 'erase_devices',
'priority': 20}]}
}
}
status_mock.return_value = [{
'command_status': 'CLEAN_VERSION_MISMATCH',
'command_name': 'execute_clean_step',
'command_result': {}
}]
with task_manager.acquire(self.context, self.node['uuid'],
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self.node.provision_state = states.CLEANWAIT
self.node.target_provision_state = tgt_prov_state
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
self.passthru.continue_cleaning(task)
steps_mock.assert_called_once_with(task)
notify_mock.assert_called_once_with(mock.ANY, task)
if manual:
get_steps_mock.assert_called_once_with(mock.ANY, task.node,
task.ports)
self.assertFalse(
task.node.driver_internal_info['skip_current_clean_step'])
self.assertEqual(
{'Generic': '1'},
task.node.driver_internal_info['hardware_manager_version'])
self.assertFalse(steps_mock.called)
else:
self.assertFalse(get_steps_mock.called)
steps_mock.assert_called_once_with(task)
self.assertFalse('skip_current_clean_step' in
task.node.driver_internal_info)
def test_continue_cleaning_automated_clean_version_mismatch(self):
self._test_continue_cleaning_clean_version_mismatch()
def test_continue_cleaning_manual_clean_version_mismatch(self):
self._test_continue_cleaning_clean_version_mismatch(manual=True)
@mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
'notify_conductor_resume_clean', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'get_clean_steps',
autospec=True)
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
autospec=True)
def test_continue_cleaning_manual_version_mismatch_bad(
self, status_mock, get_steps_mock, notify_mock, error_mock):
get_steps_mock.return_value = {
'command_status': 'CLEAN_VERSION_MISMATCH',
'command_name': 'get_clean_step',
'command_result': {
'hardware_manager_version': {'Generic': '1'}}
}
status_mock.return_value = [{
'command_status': 'CLEAN_VERSION_MISMATCH',
'command_name': 'execute_clean_step',
}]
tgt_prov_state = states.MANAGEABLE
self.node.provision_state = states.CLEANWAIT
self.node.target_provision_state = tgt_prov_state
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
self.passthru.continue_cleaning(task)
get_steps_mock.assert_called_once_with(mock.ANY, task.node,
task.ports)
error_mock.assert_called_once_with(task, mock.ANY)
self.assertFalse(notify_mock.called)
self.assertFalse('skip_current_clean_step' in
task.node.driver_internal_info)
@mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',