diff --git a/ironic/conductor/deployments.py b/ironic/conductor/deployments.py index 152bbce43e..3c7eadb2bf 100644 --- a/ironic/conductor/deployments.py +++ b/ironic/conductor/deployments.py @@ -88,6 +88,7 @@ def start_deploy(task, manager, configdrive=None, event='deploy'): node.instance_info = instance_info driver_internal_info = node.driver_internal_info + driver_internal_info.pop('steps_validated', None) # Infer the image type to make sure the deploy driver # validates only the necessary variables for different # image types. @@ -185,8 +186,9 @@ def do_node_deploy(task, conductor_id=None, configdrive=None): traceback=True, clean_up=False) try: - # This gets the deploy steps and puts them in the node's - # driver_internal_info['deploy_steps']. + # This gets the deploy steps (if any) and puts them in the node's + # driver_internal_info['deploy_steps']. In-band steps are skipped since + # we know that an agent is not running yet. conductor_steps.set_node_deployment_steps(task) except exception.InstanceDeployFailure as e: with excutils.save_and_reraise_exception(): @@ -313,6 +315,7 @@ def do_next_deploy_step(task, step_index, conductor_id): driver_internal_info.pop('deploy_step_index', None) driver_internal_info.pop('deployment_reboot', None) driver_internal_info.pop('deployment_polling', None) + driver_internal_info.pop('steps_validated', None) # Remove the agent_url cached from the deployment. driver_internal_info.pop('agent_url', None) node.driver_internal_info = driver_internal_info diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 4a588c8746..7a44ca12b3 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -872,6 +872,15 @@ class ConductorManager(base_manager.BaseConductorManager): save_required = False info = node.driver_internal_info + + # Agent is running, we're ready to validate the remaining steps + if not info.get('steps_validated'): + conductor_steps.validate_deploy_templates(task) + conductor_steps.set_node_deployment_steps( + task, reset_current=False) + info['steps_validated'] = True + save_required = True + try: skip_current_step = info.pop('skip_current_deploy_step') except KeyError: diff --git a/ironic/conductor/steps.py b/ironic/conductor/steps.py index 3fb539e34a..09ecf5fa24 100644 --- a/ironic/conductor/steps.py +++ b/ironic/conductor/steps.py @@ -87,6 +87,11 @@ def is_equivalent(step1, step2): and step1.get('step') == step2.get('step')) +def find_step(steps, step): + """Find an identical step in the list of steps.""" + return next((x for x in steps if is_equivalent(x, step)), None) + + def _get_steps(task, interfaces, get_method, enabled=False, sort_step_key=None): """Get steps for task.node. @@ -299,19 +304,21 @@ def _get_all_deployment_steps(task): return _sorted_steps(steps, _deploy_step_key) -def set_node_deployment_steps(task): +def set_node_deployment_steps(task, reset_current=True): """Set up the node with deployment step information for deploying. Get the deploy steps from the driver. + :param reset_current: Whether to reset the current step to the first one. :raises: InstanceDeployFailure if there was a problem getting the deployment steps. """ node = task.node driver_internal_info = node.driver_internal_info driver_internal_info['deploy_steps'] = _get_all_deployment_steps(task) - node.deploy_step = {} - driver_internal_info['deploy_step_index'] = None + if reset_current: + node.deploy_step = {} + driver_internal_info['deploy_step_index'] = None node.driver_internal_info = driver_internal_info node.save() diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py index 6eb2312b40..1b7eeb6252 100644 --- a/ironic/drivers/modules/agent_base.py +++ b/ironic/drivers/modules/agent_base.py @@ -477,14 +477,22 @@ class HeartbeatMixin(object): {'node': node.uuid}) def _heartbeat_deploy_wait(self, task): - msg = _('Failed checking if deploy is done') + msg = _('Unexpected exception') node = task.node try: + # NOTE(dtantsur): on first heartbeat, load in-band steps. + if not node.driver_internal_info.get('agent_cached_deploy_steps'): + msg = _('Failed to load in-band deploy steps') + # Refresh steps since this is the first time IPA has + # booted and we need to collect in-band steps. + self.refresh_steps(task, 'deploy') + # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we # are currently in the core deploy.deploy step. Other deploy steps # may cause the agent to boot, but we should not trigger deployment # at that point if the driver is polling for completion of a step. if self.in_core_deploy_step(task): + msg = _('Failed checking if deploy is done') if not self.deploy_has_started(task): msg = _('Node failed to deploy') self.continue_deploy(task) @@ -494,15 +502,14 @@ class HeartbeatMixin(object): else: node.touch_provisioning() else: - # The exceptions from RPC are not possible as we using cast - # here - # Check if the driver is polling for completion of a step, - # via the 'deployment_polling' flag. + node.touch_provisioning() + # Check if the driver is polling for completion of + # a step, via the 'deployment_polling' flag. polling = node.driver_internal_info.get( 'deployment_polling', False) if not polling: - manager_utils.notify_conductor_resume_deploy(task) - node.touch_provisioning() + msg = _('Failed to process the next deploy step') + self.process_next_step(task, 'deploy') except Exception as e: last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e} LOG.exception('Asynchronous exception for node %(node)s: %(err)s', @@ -663,6 +670,25 @@ class AgentDeployMixin(HeartbeatMixin): task, 'clean', interface='deploy', override_priorities=new_priorities) + @METRICS.timer('AgentDeployMixin.get_deploy_steps') + def get_deploy_steps(self, task): + """Get the list of deploy steps from the agent. + + :param task: a TaskManager object containing the node + :raises InstanceDeployFailure: if the deploy steps are not yet + available (cached), for example, when a node has just been + enrolled and has not been deployed yet. + :returns: A list of deploy step dictionaries + """ + steps = super(AgentDeployMixin, self).get_deploy_steps(task)[:] + ib_steps = get_steps(task, 'deploy', interface='deploy') + # NOTE(dtantsur): we allow in-band steps to be shadowed by out-of-band + # ones, see the docstring of execute_deploy_step for details. + steps += [step for step in ib_steps + # FIXME(dtantsur): nested loops are not too efficient + if not conductor_steps.find_step(steps, step)] + return steps + @METRICS.timer('AgentDeployMixin.refresh_steps') def refresh_steps(self, task, step_type): """Refresh the node's cached clean/deploy steps from the booted agent. @@ -687,7 +713,18 @@ class AgentDeployMixin(HeartbeatMixin): 'steps': previous_steps}) call = getattr(self._client, 'get_%s_steps' % step_type) - agent_result = call(node, task.ports).get('command_result', {}) + try: + agent_result = call(node, task.ports).get('command_result', {}) + except exception.AgentAPIError as exc: + if step_type == 'clean': + raise + else: + LOG.warning('Agent running on node %(node)s does not support ' + 'in-band deploy steps: %(err)s. Support for old ' + 'agents will be removed in the V release.', + {'node': node.uuid, 'err': exc}) + return + missing = set(['%s_steps' % step_type, 'hardware_manager_version']).difference(agent_result) if missing: @@ -741,6 +778,37 @@ class AgentDeployMixin(HeartbeatMixin): """ return execute_step(task, step, 'clean') + @METRICS.timer('AgentDeployMixin.execute_deploy_step') + def execute_deploy_step(self, task, step): + """Execute a deploy step. + + We're trying to find a step among both out-of-band and in-band steps. + In case of duplicates, out-of-band steps take priority. This property + allows having an out-of-band deploy step that calls into + a corresponding in-band step after some preparation (e.g. with + additional input). + + :param task: a TaskManager object containing the node + :param step: a deploy step dictionary to execute + :raises: NodeCleaningFailure if the agent does not return a command + status + :returns: states.DEPLOYWAIT to signify the step will be completed async + """ + agent_running = task.node.driver_internal_info.get( + 'agent_cached_deploy_steps') + oob_steps = self.deploy_steps + + if conductor_steps.find_step(oob_steps, step): + return super(AgentDeployMixin, self).execute_deploy_step( + task, step) + elif not agent_running: + raise exception.InstanceDeployFailure( + _('Deploy step %(step)s has not been found. Available ' + 'out-of-band steps: %(oob)s. Agent is not running.') % + {'step': step, 'oob': oob_steps}) + else: + return execute_step(task, step, 'deploy') + def _process_version_mismatch(self, task, step_type): node = task.node # For manual clean, the target provision state is MANAGEABLE, whereas @@ -811,7 +879,7 @@ class AgentDeployMixin(HeartbeatMixin): process (the agent's get_clean|deploy_steps() call) and before executing each step. If the version has changed between steps, the agent is unable to tell if an ordering change will cause an issue - so it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we + so it returns VERSION_MISMATCH. For automated cleaning, we restart the entire cleaning cycle. For manual cleaning or deploy, we don't. @@ -854,8 +922,10 @@ class AgentDeployMixin(HeartbeatMixin): 'type': step_type}) LOG.error(msg) return manager_utils.cleaning_error_handler(task, msg) + # NOTE(dtantsur): VERSION_MISMATCH is a new alias for + # CLEAN_VERSION_MISMATCH, remove the old one after IPA removes it. elif command.get('command_status') in ('CLEAN_VERSION_MISMATCH', - 'DEPLOY_VERSION_MISMATCH'): + 'VERSION_MISMATCH'): self._process_version_mismatch(task, step_type) elif command.get('command_status') == 'SUCCEEDED': step_hook = _get_post_step_hook(node, step_type) diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 37580706b1..d95aa83d3f 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -1663,6 +1663,8 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, 'step': 'deploy_start', 'priority': 50, 'interface': 'deploy'} self.deploy_end = { 'step': 'deploy_end', 'priority': 20, 'interface': 'deploy'} + self.in_band_step = { + 'step': 'deploy_middle', 'priority': 30, 'interface': 'deploy'} self.deploy_steps = [self.deploy_start, self.deploy_end] @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', @@ -1715,7 +1717,8 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, prv_state = states.DEPLOYWAIT tgt_prv_state = states.ACTIVE driver_info = {'deploy_steps': self.deploy_steps, - 'deploy_step_index': 0} + 'deploy_step_index': 0, + 'steps_validated': True} node = obj_utils.create_test_node(self.context, driver='fake-hardware', provision_state=prv_state, target_provision_state=tgt_prv_state, @@ -1732,6 +1735,36 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, deployments.do_next_deploy_step, mock.ANY, 1, mock.ANY) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.get_deploy_steps', + autospec=True) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def test_continue_node_deploy_first_agent_boot(self, mock_spawn, + mock_get_steps): + new_steps = [self.deploy_start, self.in_band_step, self.deploy_end] + mock_get_steps.return_value = new_steps + prv_state = states.DEPLOYWAIT + tgt_prv_state = states.ACTIVE + driver_info = {'deploy_steps': self.deploy_steps, + 'deploy_step_index': 0} + node = obj_utils.create_test_node(self.context, driver='fake-hardware', + provision_state=prv_state, + target_provision_state=tgt_prv_state, + last_error=None, + driver_internal_info=driver_info, + deploy_step=self.deploy_steps[0]) + self._start_service() + self.service.continue_node_deploy(self.context, node.uuid) + self._stop_service() + node.refresh() + self.assertEqual(states.DEPLOYING, node.provision_state) + self.assertEqual(tgt_prv_state, node.target_provision_state) + self.assertTrue(node.driver_internal_info['steps_validated']) + self.assertEqual(new_steps, node.driver_internal_info['deploy_steps']) + mock_spawn.assert_called_with(mock.ANY, + deployments.do_next_deploy_step, + mock.ANY, 1, mock.ANY) + @mock.patch.object(task_manager.TaskManager, 'process_event', autospec=True) @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', @@ -1744,7 +1777,8 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, prv_state = states.DEPLOYING tgt_prv_state = states.ACTIVE driver_info = {'deploy_steps': self.deploy_steps, - 'deploy_step_index': 0} + 'deploy_step_index': 0, + 'steps_validated': True} self._start_service() node = obj_utils.create_test_node(self.context, driver='fake-hardware', provision_state=prv_state, @@ -1767,7 +1801,8 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, def _continue_node_deploy_skip_step(self, mock_spawn, skip=True): # test that skipping current step mechanism works driver_info = {'deploy_steps': self.deploy_steps, - 'deploy_step_index': 0} + 'deploy_step_index': 0, + 'steps_validated': True} if not skip: driver_info['skip_current_deploy_step'] = skip node = obj_utils.create_test_node( @@ -1801,7 +1836,8 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, # test that deployment_polling flag is cleared driver_info = {'deploy_steps': self.deploy_steps, 'deploy_step_index': 0, - 'deployment_polling': True} + 'deployment_polling': True, + 'steps_validated': True} node = obj_utils.create_test_node( self.context, driver='fake-hardware', provision_state=states.DEPLOYWAIT, diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py index c4f4f722c8..6dea267857 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base.py @@ -83,6 +83,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): super(HeartbeatMixinTest, self).setUp() self.deploy = agent_base.HeartbeatMixin() + @mock.patch.object(agent_base.HeartbeatMixin, + 'refresh_steps', autospec=True) @mock.patch.object(agent_base.HeartbeatMixin, 'in_core_deploy_step', autospec=True) @mock.patch.object(agent_base.HeartbeatMixin, @@ -93,7 +95,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): 'reboot_to_instance', autospec=True) def test_heartbeat_continue_deploy(self, rti_mock, cd_mock, deploy_started_mock, - in_deploy_mock): + in_deploy_mock, + refresh_steps_mock): in_deploy_mock.return_value = True deploy_started_mock.return_value = False self.node.provision_state = states.DEPLOYWAIT @@ -109,6 +112,42 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): task.node.driver_internal_info['agent_version']) cd_mock.assert_called_once_with(self.deploy, task) self.assertFalse(rti_mock.called) + refresh_steps_mock.assert_called_once_with(self.deploy, + task, 'deploy') + + @mock.patch.object(agent_base.HeartbeatMixin, + 'refresh_steps', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'in_core_deploy_step', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'deploy_has_started', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, 'continue_deploy', + autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'reboot_to_instance', autospec=True) + def test_heartbeat_continue_deploy_second_run(self, rti_mock, cd_mock, + deploy_started_mock, + in_deploy_mock, + refresh_steps_mock): + in_deploy_mock.return_value = True + deploy_started_mock.return_value = False + dii = self.node.driver_internal_info + dii['agent_cached_deploy_steps'] = ['step'] + self.node.driver_internal_info = dii + self.node.provision_state = states.DEPLOYWAIT + self.node.save() + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + self.deploy.heartbeat(task, 'url', '3.2.0') + self.assertFalse(task.shared) + self.assertEqual( + 'url', task.node.driver_internal_info['agent_url']) + self.assertEqual( + '3.2.0', + task.node.driver_internal_info['agent_version']) + cd_mock.assert_called_once_with(self.deploy, task) + self.assertFalse(rti_mock.called) + self.assertFalse(refresh_steps_mock.called) @mock.patch.object(agent_base.HeartbeatMixin, 'in_core_deploy_step', autospec=True) @@ -141,8 +180,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): self.assertFalse(cd_mock.called) rti_mock.assert_called_once_with(self.deploy, task) - @mock.patch.object(manager_utils, - 'notify_conductor_resume_deploy', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'process_next_step', autospec=True) @mock.patch.object(agent_base.HeartbeatMixin, 'in_core_deploy_step', autospec=True) @mock.patch.object(agent_base.HeartbeatMixin, @@ -157,7 +196,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): deploy_is_done_mock, deploy_started_mock, in_deploy_mock, - in_resume_deploy_mock): + process_next_mock): # Check that heartbeats do not trigger deployment actions when not in # the deploy.deploy step. in_deploy_mock.return_value = False @@ -176,7 +215,53 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): self.assertFalse(deploy_is_done_mock.called) self.assertFalse(cd_mock.called) self.assertFalse(rti_mock.called) - self.assertTrue(in_resume_deploy_mock.called) + process_next_mock.assert_called_once_with(self.deploy, + task, 'deploy') + + @mock.patch.object(agent_base.HeartbeatMixin, + 'refresh_steps', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'process_next_step', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'in_core_deploy_step', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'deploy_has_started', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'deploy_is_done', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, 'continue_deploy', + autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'reboot_to_instance', autospec=True) + def test_heartbeat_not_in_core_deploy_step_refresh(self, rti_mock, cd_mock, + deploy_is_done_mock, + deploy_started_mock, + in_deploy_mock, + process_next_mock, + refresh_steps_mock): + # Check loading in-band deploy steps. + in_deploy_mock.return_value = False + self.node.provision_state = states.DEPLOYWAIT + info = self.node.driver_internal_info + info.pop('agent_cached_deploy_steps', None) + self.node.driver_internal_info = info + self.node.save() + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + self.deploy.heartbeat(task, 'url', '3.2.0') + self.assertFalse(task.shared) + self.assertEqual( + 'url', task.node.driver_internal_info['agent_url']) + self.assertEqual( + '3.2.0', + task.node.driver_internal_info['agent_version']) + self.assertFalse(deploy_started_mock.called) + self.assertFalse(deploy_is_done_mock.called) + self.assertFalse(cd_mock.called) + self.assertFalse(rti_mock.called) + refresh_steps_mock.assert_called_once_with(self.deploy, + task, 'deploy') + process_next_mock.assert_called_once_with(self.deploy, + task, 'deploy') @mock.patch.object(manager_utils, 'notify_conductor_resume_deploy', autospec=True) @@ -200,6 +285,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): in_deploy_mock.return_value = False self.node.provision_state = states.DEPLOYWAIT info = self.node.driver_internal_info + info['agent_cached_deploy_steps'] = ['step1'] info['deployment_polling'] = True self.node.driver_internal_info = info self.node.save() @@ -2132,6 +2218,10 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest): task.ports) +class FakeAgentDeploy(agent_base.AgentDeployMixin, fake.FakeDeploy): + pass + + class StepMethodsTestCase(db_base.DbTestCase): def setUp(self): @@ -2159,6 +2249,7 @@ class StepMethodsTestCase(db_base.DbTestCase): self.node = object_utils.create_test_node(self.context, **n) self.ports = [object_utils.create_test_port(self.context, node_id=self.node.id)] + self.deploy = FakeAgentDeploy() def test_agent_get_steps(self): with task_manager.acquire( @@ -2222,6 +2313,26 @@ class StepMethodsTestCase(db_base.DbTestCase): self.context, self.node.uuid, shared=False) as task: self.assertEqual([], agent_base.get_steps(task, 'clean')) + def test_get_deploy_steps(self): + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + task.node.driver_internal_info = { + 'agent_cached_deploy_steps': self.clean_steps + } + steps = self.deploy.get_deploy_steps(task) + # 2 in-band steps + one out-of-band + self.assertEqual(3, len(steps)) + self.assertIn(self.clean_steps['deploy'][0], steps) + self.assertIn(self.clean_steps['deploy'][1], steps) + self.assertNotIn(self.clean_steps['raid'][0], steps) + + def test_get_deploy_steps_only_oob(self): + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + steps = self.deploy.get_deploy_steps(task) + # one out-of-band step + self.assertEqual(1, len(steps)) + @mock.patch('ironic.objects.Port.list_by_node_id', spec_set=types.FunctionType) @mock.patch.object(agent_client.AgentClient, 'execute_clean_step',