Support executing in-band deploy steps

This patch introduces changes necessary to execute in-band
deploy steps among the already supported out-of-band ones:
* Load in-band deploy steps on the first heartbeat.
* Duplicates some clean step functions and changes
  others to make them usable for deploy steps.
* Modifies the conductor to delay validation of deploy steps
  and templates that may be run in-band.
* Modifies the agent's execute_deploy_step to gracefully
  support both in-band and out-of-band steps.

Co-Authored-By: Dmitry Tantsur <dtantsur@redhat.com>
Change-Id: I6744029cac9b13ae1b973b19983c5605d35b7397
Depends-On: https://review.opendev.org/698770
Story: 2006963
Task: 37789
This commit is contained in:
Mark Goddard 2019-12-12 17:04:13 +00:00 committed by Dmitry Tantsur
parent ff2030ce98
commit 4795c4a8b8
6 changed files with 260 additions and 24 deletions

View File

@ -88,6 +88,7 @@ def start_deploy(task, manager, configdrive=None, event='deploy'):
node.instance_info = instance_info
driver_internal_info = node.driver_internal_info
driver_internal_info.pop('steps_validated', None)
# Infer the image type to make sure the deploy driver
# validates only the necessary variables for different
# image types.
@ -185,8 +186,9 @@ def do_node_deploy(task, conductor_id=None, configdrive=None):
traceback=True, clean_up=False)
try:
# This gets the deploy steps and puts them in the node's
# driver_internal_info['deploy_steps'].
# This gets the deploy steps (if any) and puts them in the node's
# driver_internal_info['deploy_steps']. In-band steps are skipped since
# we know that an agent is not running yet.
conductor_steps.set_node_deployment_steps(task)
except exception.InstanceDeployFailure as e:
with excutils.save_and_reraise_exception():
@ -313,6 +315,7 @@ def do_next_deploy_step(task, step_index, conductor_id):
driver_internal_info.pop('deploy_step_index', None)
driver_internal_info.pop('deployment_reboot', None)
driver_internal_info.pop('deployment_polling', None)
driver_internal_info.pop('steps_validated', None)
# Remove the agent_url cached from the deployment.
driver_internal_info.pop('agent_url', None)
node.driver_internal_info = driver_internal_info

View File

@ -872,6 +872,15 @@ class ConductorManager(base_manager.BaseConductorManager):
save_required = False
info = node.driver_internal_info
# Agent is running, we're ready to validate the remaining steps
if not info.get('steps_validated'):
conductor_steps.validate_deploy_templates(task)
conductor_steps.set_node_deployment_steps(
task, reset_current=False)
info['steps_validated'] = True
save_required = True
try:
skip_current_step = info.pop('skip_current_deploy_step')
except KeyError:

View File

@ -87,6 +87,11 @@ def is_equivalent(step1, step2):
and step1.get('step') == step2.get('step'))
def find_step(steps, step):
"""Find an identical step in the list of steps."""
return next((x for x in steps if is_equivalent(x, step)), None)
def _get_steps(task, interfaces, get_method, enabled=False,
sort_step_key=None):
"""Get steps for task.node.
@ -299,19 +304,21 @@ def _get_all_deployment_steps(task):
return _sorted_steps(steps, _deploy_step_key)
def set_node_deployment_steps(task):
def set_node_deployment_steps(task, reset_current=True):
"""Set up the node with deployment step information for deploying.
Get the deploy steps from the driver.
:param reset_current: Whether to reset the current step to the first one.
:raises: InstanceDeployFailure if there was a problem getting the
deployment steps.
"""
node = task.node
driver_internal_info = node.driver_internal_info
driver_internal_info['deploy_steps'] = _get_all_deployment_steps(task)
node.deploy_step = {}
driver_internal_info['deploy_step_index'] = None
if reset_current:
node.deploy_step = {}
driver_internal_info['deploy_step_index'] = None
node.driver_internal_info = driver_internal_info
node.save()

View File

@ -477,14 +477,22 @@ class HeartbeatMixin(object):
{'node': node.uuid})
def _heartbeat_deploy_wait(self, task):
msg = _('Failed checking if deploy is done')
msg = _('Unexpected exception')
node = task.node
try:
# NOTE(dtantsur): on first heartbeat, load in-band steps.
if not node.driver_internal_info.get('agent_cached_deploy_steps'):
msg = _('Failed to load in-band deploy steps')
# Refresh steps since this is the first time IPA has
# booted and we need to collect in-band steps.
self.refresh_steps(task, 'deploy')
# NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
# are currently in the core deploy.deploy step. Other deploy steps
# may cause the agent to boot, but we should not trigger deployment
# at that point if the driver is polling for completion of a step.
if self.in_core_deploy_step(task):
msg = _('Failed checking if deploy is done')
if not self.deploy_has_started(task):
msg = _('Node failed to deploy')
self.continue_deploy(task)
@ -494,15 +502,14 @@ class HeartbeatMixin(object):
else:
node.touch_provisioning()
else:
# The exceptions from RPC are not possible as we using cast
# here
# Check if the driver is polling for completion of a step,
# via the 'deployment_polling' flag.
node.touch_provisioning()
# Check if the driver is polling for completion of
# a step, via the 'deployment_polling' flag.
polling = node.driver_internal_info.get(
'deployment_polling', False)
if not polling:
manager_utils.notify_conductor_resume_deploy(task)
node.touch_provisioning()
msg = _('Failed to process the next deploy step')
self.process_next_step(task, 'deploy')
except Exception as e:
last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e}
LOG.exception('Asynchronous exception for node %(node)s: %(err)s',
@ -663,6 +670,25 @@ class AgentDeployMixin(HeartbeatMixin):
task, 'clean', interface='deploy',
override_priorities=new_priorities)
@METRICS.timer('AgentDeployMixin.get_deploy_steps')
def get_deploy_steps(self, task):
"""Get the list of deploy steps from the agent.
:param task: a TaskManager object containing the node
:raises InstanceDeployFailure: if the deploy steps are not yet
available (cached), for example, when a node has just been
enrolled and has not been deployed yet.
:returns: A list of deploy step dictionaries
"""
steps = super(AgentDeployMixin, self).get_deploy_steps(task)[:]
ib_steps = get_steps(task, 'deploy', interface='deploy')
# NOTE(dtantsur): we allow in-band steps to be shadowed by out-of-band
# ones, see the docstring of execute_deploy_step for details.
steps += [step for step in ib_steps
# FIXME(dtantsur): nested loops are not too efficient
if not conductor_steps.find_step(steps, step)]
return steps
@METRICS.timer('AgentDeployMixin.refresh_steps')
def refresh_steps(self, task, step_type):
"""Refresh the node's cached clean/deploy steps from the booted agent.
@ -687,7 +713,18 @@ class AgentDeployMixin(HeartbeatMixin):
'steps': previous_steps})
call = getattr(self._client, 'get_%s_steps' % step_type)
agent_result = call(node, task.ports).get('command_result', {})
try:
agent_result = call(node, task.ports).get('command_result', {})
except exception.AgentAPIError as exc:
if step_type == 'clean':
raise
else:
LOG.warning('Agent running on node %(node)s does not support '
'in-band deploy steps: %(err)s. Support for old '
'agents will be removed in the V release.',
{'node': node.uuid, 'err': exc})
return
missing = set(['%s_steps' % step_type,
'hardware_manager_version']).difference(agent_result)
if missing:
@ -741,6 +778,37 @@ class AgentDeployMixin(HeartbeatMixin):
"""
return execute_step(task, step, 'clean')
@METRICS.timer('AgentDeployMixin.execute_deploy_step')
def execute_deploy_step(self, task, step):
"""Execute a deploy step.
We're trying to find a step among both out-of-band and in-band steps.
In case of duplicates, out-of-band steps take priority. This property
allows having an out-of-band deploy step that calls into
a corresponding in-band step after some preparation (e.g. with
additional input).
:param task: a TaskManager object containing the node
:param step: a deploy step dictionary to execute
:raises: NodeCleaningFailure if the agent does not return a command
status
:returns: states.DEPLOYWAIT to signify the step will be completed async
"""
agent_running = task.node.driver_internal_info.get(
'agent_cached_deploy_steps')
oob_steps = self.deploy_steps
if conductor_steps.find_step(oob_steps, step):
return super(AgentDeployMixin, self).execute_deploy_step(
task, step)
elif not agent_running:
raise exception.InstanceDeployFailure(
_('Deploy step %(step)s has not been found. Available '
'out-of-band steps: %(oob)s. Agent is not running.') %
{'step': step, 'oob': oob_steps})
else:
return execute_step(task, step, 'deploy')
def _process_version_mismatch(self, task, step_type):
node = task.node
# For manual clean, the target provision state is MANAGEABLE, whereas
@ -811,7 +879,7 @@ class AgentDeployMixin(HeartbeatMixin):
process (the agent's get_clean|deploy_steps() call) and before
executing each step. If the version has changed between steps,
the agent is unable to tell if an ordering change will cause an issue
so it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we
so it returns VERSION_MISMATCH. For automated cleaning, we
restart the entire cleaning cycle. For manual cleaning or deploy,
we don't.
@ -854,8 +922,10 @@ class AgentDeployMixin(HeartbeatMixin):
'type': step_type})
LOG.error(msg)
return manager_utils.cleaning_error_handler(task, msg)
# NOTE(dtantsur): VERSION_MISMATCH is a new alias for
# CLEAN_VERSION_MISMATCH, remove the old one after IPA removes it.
elif command.get('command_status') in ('CLEAN_VERSION_MISMATCH',
'DEPLOY_VERSION_MISMATCH'):
'VERSION_MISMATCH'):
self._process_version_mismatch(task, step_type)
elif command.get('command_status') == 'SUCCEEDED':
step_hook = _get_post_step_hook(node, step_type)

View File

@ -1663,6 +1663,8 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
'step': 'deploy_start', 'priority': 50, 'interface': 'deploy'}
self.deploy_end = {
'step': 'deploy_end', 'priority': 20, 'interface': 'deploy'}
self.in_band_step = {
'step': 'deploy_middle', 'priority': 30, 'interface': 'deploy'}
self.deploy_steps = [self.deploy_start, self.deploy_end]
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
@ -1715,7 +1717,8 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
prv_state = states.DEPLOYWAIT
tgt_prv_state = states.ACTIVE
driver_info = {'deploy_steps': self.deploy_steps,
'deploy_step_index': 0}
'deploy_step_index': 0,
'steps_validated': True}
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=prv_state,
target_provision_state=tgt_prv_state,
@ -1732,6 +1735,36 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
deployments.do_next_deploy_step,
mock.ANY, 1, mock.ANY)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.get_deploy_steps',
autospec=True)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
def test_continue_node_deploy_first_agent_boot(self, mock_spawn,
mock_get_steps):
new_steps = [self.deploy_start, self.in_band_step, self.deploy_end]
mock_get_steps.return_value = new_steps
prv_state = states.DEPLOYWAIT
tgt_prv_state = states.ACTIVE
driver_info = {'deploy_steps': self.deploy_steps,
'deploy_step_index': 0}
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=prv_state,
target_provision_state=tgt_prv_state,
last_error=None,
driver_internal_info=driver_info,
deploy_step=self.deploy_steps[0])
self._start_service()
self.service.continue_node_deploy(self.context, node.uuid)
self._stop_service()
node.refresh()
self.assertEqual(states.DEPLOYING, node.provision_state)
self.assertEqual(tgt_prv_state, node.target_provision_state)
self.assertTrue(node.driver_internal_info['steps_validated'])
self.assertEqual(new_steps, node.driver_internal_info['deploy_steps'])
mock_spawn.assert_called_with(mock.ANY,
deployments.do_next_deploy_step,
mock.ANY, 1, mock.ANY)
@mock.patch.object(task_manager.TaskManager, 'process_event',
autospec=True)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
@ -1744,7 +1777,8 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
prv_state = states.DEPLOYING
tgt_prv_state = states.ACTIVE
driver_info = {'deploy_steps': self.deploy_steps,
'deploy_step_index': 0}
'deploy_step_index': 0,
'steps_validated': True}
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake-hardware',
provision_state=prv_state,
@ -1767,7 +1801,8 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
def _continue_node_deploy_skip_step(self, mock_spawn, skip=True):
# test that skipping current step mechanism works
driver_info = {'deploy_steps': self.deploy_steps,
'deploy_step_index': 0}
'deploy_step_index': 0,
'steps_validated': True}
if not skip:
driver_info['skip_current_deploy_step'] = skip
node = obj_utils.create_test_node(
@ -1801,7 +1836,8 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
# test that deployment_polling flag is cleared
driver_info = {'deploy_steps': self.deploy_steps,
'deploy_step_index': 0,
'deployment_polling': True}
'deployment_polling': True,
'steps_validated': True}
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.DEPLOYWAIT,

View File

@ -83,6 +83,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
super(HeartbeatMixinTest, self).setUp()
self.deploy = agent_base.HeartbeatMixin()
@mock.patch.object(agent_base.HeartbeatMixin,
'refresh_steps', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'in_core_deploy_step', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
@ -93,7 +95,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
'reboot_to_instance', autospec=True)
def test_heartbeat_continue_deploy(self, rti_mock, cd_mock,
deploy_started_mock,
in_deploy_mock):
in_deploy_mock,
refresh_steps_mock):
in_deploy_mock.return_value = True
deploy_started_mock.return_value = False
self.node.provision_state = states.DEPLOYWAIT
@ -109,6 +112,42 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
task.node.driver_internal_info['agent_version'])
cd_mock.assert_called_once_with(self.deploy, task)
self.assertFalse(rti_mock.called)
refresh_steps_mock.assert_called_once_with(self.deploy,
task, 'deploy')
@mock.patch.object(agent_base.HeartbeatMixin,
'refresh_steps', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'in_core_deploy_step', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'deploy_has_started', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin, 'continue_deploy',
autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'reboot_to_instance', autospec=True)
def test_heartbeat_continue_deploy_second_run(self, rti_mock, cd_mock,
deploy_started_mock,
in_deploy_mock,
refresh_steps_mock):
in_deploy_mock.return_value = True
deploy_started_mock.return_value = False
dii = self.node.driver_internal_info
dii['agent_cached_deploy_steps'] = ['step']
self.node.driver_internal_info = dii
self.node.provision_state = states.DEPLOYWAIT
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
self.deploy.heartbeat(task, 'url', '3.2.0')
self.assertFalse(task.shared)
self.assertEqual(
'url', task.node.driver_internal_info['agent_url'])
self.assertEqual(
'3.2.0',
task.node.driver_internal_info['agent_version'])
cd_mock.assert_called_once_with(self.deploy, task)
self.assertFalse(rti_mock.called)
self.assertFalse(refresh_steps_mock.called)
@mock.patch.object(agent_base.HeartbeatMixin,
'in_core_deploy_step', autospec=True)
@ -141,8 +180,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
self.assertFalse(cd_mock.called)
rti_mock.assert_called_once_with(self.deploy, task)
@mock.patch.object(manager_utils,
'notify_conductor_resume_deploy', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'process_next_step', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'in_core_deploy_step', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
@ -157,7 +196,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
deploy_is_done_mock,
deploy_started_mock,
in_deploy_mock,
in_resume_deploy_mock):
process_next_mock):
# Check that heartbeats do not trigger deployment actions when not in
# the deploy.deploy step.
in_deploy_mock.return_value = False
@ -176,7 +215,53 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
self.assertFalse(deploy_is_done_mock.called)
self.assertFalse(cd_mock.called)
self.assertFalse(rti_mock.called)
self.assertTrue(in_resume_deploy_mock.called)
process_next_mock.assert_called_once_with(self.deploy,
task, 'deploy')
@mock.patch.object(agent_base.HeartbeatMixin,
'refresh_steps', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'process_next_step', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'in_core_deploy_step', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'deploy_has_started', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'deploy_is_done', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin, 'continue_deploy',
autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
'reboot_to_instance', autospec=True)
def test_heartbeat_not_in_core_deploy_step_refresh(self, rti_mock, cd_mock,
deploy_is_done_mock,
deploy_started_mock,
in_deploy_mock,
process_next_mock,
refresh_steps_mock):
# Check loading in-band deploy steps.
in_deploy_mock.return_value = False
self.node.provision_state = states.DEPLOYWAIT
info = self.node.driver_internal_info
info.pop('agent_cached_deploy_steps', None)
self.node.driver_internal_info = info
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
self.deploy.heartbeat(task, 'url', '3.2.0')
self.assertFalse(task.shared)
self.assertEqual(
'url', task.node.driver_internal_info['agent_url'])
self.assertEqual(
'3.2.0',
task.node.driver_internal_info['agent_version'])
self.assertFalse(deploy_started_mock.called)
self.assertFalse(deploy_is_done_mock.called)
self.assertFalse(cd_mock.called)
self.assertFalse(rti_mock.called)
refresh_steps_mock.assert_called_once_with(self.deploy,
task, 'deploy')
process_next_mock.assert_called_once_with(self.deploy,
task, 'deploy')
@mock.patch.object(manager_utils,
'notify_conductor_resume_deploy', autospec=True)
@ -200,6 +285,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
in_deploy_mock.return_value = False
self.node.provision_state = states.DEPLOYWAIT
info = self.node.driver_internal_info
info['agent_cached_deploy_steps'] = ['step1']
info['deployment_polling'] = True
self.node.driver_internal_info = info
self.node.save()
@ -2132,6 +2218,10 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest):
task.ports)
class FakeAgentDeploy(agent_base.AgentDeployMixin, fake.FakeDeploy):
pass
class StepMethodsTestCase(db_base.DbTestCase):
def setUp(self):
@ -2159,6 +2249,7 @@ class StepMethodsTestCase(db_base.DbTestCase):
self.node = object_utils.create_test_node(self.context, **n)
self.ports = [object_utils.create_test_port(self.context,
node_id=self.node.id)]
self.deploy = FakeAgentDeploy()
def test_agent_get_steps(self):
with task_manager.acquire(
@ -2222,6 +2313,26 @@ class StepMethodsTestCase(db_base.DbTestCase):
self.context, self.node.uuid, shared=False) as task:
self.assertEqual([], agent_base.get_steps(task, 'clean'))
def test_get_deploy_steps(self):
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:
task.node.driver_internal_info = {
'agent_cached_deploy_steps': self.clean_steps
}
steps = self.deploy.get_deploy_steps(task)
# 2 in-band steps + one out-of-band
self.assertEqual(3, len(steps))
self.assertIn(self.clean_steps['deploy'][0], steps)
self.assertIn(self.clean_steps['deploy'][1], steps)
self.assertNotIn(self.clean_steps['raid'][0], steps)
def test_get_deploy_steps_only_oob(self):
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:
steps = self.deploy.get_deploy_steps(task)
# one out-of-band step
self.assertEqual(1, len(steps))
@mock.patch('ironic.objects.Port.list_by_node_id',
spec_set=types.FunctionType)
@mock.patch.object(agent_client.AgentClient, 'execute_clean_step',