diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 8c28307ce2..ce71cf2cc3 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -875,8 +875,9 @@ class ConductorManager(base_manager.BaseConductorManager): action=event, node=task.node.uuid, state=task.node.provision_state) - def _get_node_next_deploy_steps(self, task): - return self._get_node_next_steps(task, 'deploy') + def _get_node_next_deploy_steps(self, task, skip_current_step=True): + return self._get_node_next_steps(task, 'deploy', + skip_current_step=skip_current_step) @METRICS.timer('ConductorManager.continue_node_deploy') def continue_node_deploy(self, context, node_id): @@ -914,7 +915,17 @@ class ConductorManager(base_manager.BaseConductorManager): 'state': node.provision_state, 'deploy_state': ', '.join(expected_states)}) - next_step_index = self._get_node_next_deploy_steps(task) + info = node.driver_internal_info + try: + skip_current_step = info.pop('skip_current_deploy_step') + except KeyError: + skip_current_step = True + else: + node.driver_internal_info = info + node.save() + + next_step_index = self._get_node_next_deploy_steps( + task, skip_current_step=skip_current_step) # TODO(rloo): When deprecation period is over and node is in # states.DEPLOYWAIT only, delete the 'if' and always 'resume'. @@ -3861,6 +3872,16 @@ def _do_next_deploy_step(task, step_index, conductor_id): try: result = interface.execute_deploy_step(task, step) except exception.IronicException as e: + if isinstance(e, exception.AgentConnectionFailed): + if task.node.driver_internal_info.get('deployment_reboot'): + LOG.info('Agent is not yet running on node %(node)s after ' + 'deployment reboot, waiting for agent to come up ' + 'to run next deploy step %(step)s.', + {'node': node.uuid, 'step': step}) + driver_internal_info['skip_current_deploy_step'] = False + node.driver_internal_info = driver_internal_info + task.process_event('wait') + return log_msg = ('Node %(node)s failed deploy step %(step)s. Error: ' '%(err)s' % {'node': node.uuid, 'step': node.deploy_step, 'err': e}) @@ -3885,7 +3906,7 @@ def _do_next_deploy_step(task, step_index, conductor_id): node.save() # Check if the step is done or not. The step should return - # states.CLEANWAIT if the step is still being executed, or + # states.DEPLOYWAIT if the step is still being executed, or # None if the step is done. # NOTE(deva): Some drivers may return states.DEPLOYWAIT # eg. if they are waiting for a callback @@ -3915,6 +3936,7 @@ def _do_next_deploy_step(task, step_index, conductor_id): driver_internal_info = node.driver_internal_info driver_internal_info['deploy_steps'] = None driver_internal_info.pop('deploy_step_index', None) + driver_internal_info.pop('deployment_reboot', None) node.driver_internal_info = driver_internal_info node.save() diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index c555701c34..f32b9291b0 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -461,6 +461,8 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False, node.deploy_step = {} info = node.driver_internal_info info.pop('deploy_step_index', None) + # Clear any leftover metadata about deployment reboots + info.pop('deployment_reboot', None) node.driver_internal_info = info if cleanup_err: diff --git a/ironic/drivers/modules/agent.py b/ironic/drivers/modules/agent.py index b4f640e30f..9e1a067a06 100644 --- a/ironic/drivers/modules/agent.py +++ b/ironic/drivers/modules/agent.py @@ -463,7 +463,14 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface): task.process_event('wait') self.continue_deploy(task) elif task.driver.storage.should_write_image(task): - manager_utils.node_power_action(task, states.REBOOT) + # Check if the driver has already performed a reboot in a previous + # deploy step. + if not task.node.driver_internal_info.get('deployment_reboot'): + manager_utils.node_power_action(task, states.REBOOT) + info = task.node.driver_internal_info + info.pop('deployment_reboot', None) + task.node.driver_internal_info = info + task.node.save() return states.DEPLOYWAIT else: # TODO(TheJulia): At some point, we should de-dupe this code diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index 0f2196759a..0fa2d45487 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -366,6 +366,9 @@ class HeartbeatMixin(object): else: node.touch_provisioning() else: + # The exceptions from RPC are not possible as we using cast + # here + manager_utils.notify_conductor_resume_deploy(task) node.touch_provisioning() elif node.provision_state == states.CLEANWAIT: node.touch_provisioning() diff --git a/ironic/drivers/modules/iscsi_deploy.py b/ironic/drivers/modules/iscsi_deploy.py index b663db6978..e6932a19ab 100644 --- a/ironic/drivers/modules/iscsi_deploy.py +++ b/ironic/drivers/modules/iscsi_deploy.py @@ -421,7 +421,16 @@ class ISCSIDeploy(AgentDeployMixin, base.DeployInterface): # Standard deploy process deploy_utils.cache_instance_image(task.context, node) check_image_size(task) - manager_utils.node_power_action(task, states.REBOOT) + # Check if the driver has already performed a reboot in a previous + # deploy step. + if not task.node.driver_internal_info.get('deployment_reboot', + False): + manager_utils.node_power_action(task, states.REBOOT) + info = task.node.driver_internal_info + info.pop('deployment_reboot', None) + task.node.driver_internal_info = info + task.node.save() + return states.DEPLOYWAIT else: # Boot to an Storage Volume diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 1a6063c127..3a101b5d7a 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -1845,6 +1845,109 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, mock.ANY, 1, mock.ANY) self.assertFalse(mock_event.called) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def _continue_node_deploy_skip_step(self, mock_spawn, skip=True): + # test that skipping current step mechanism works + driver_info = {'deploy_steps': self.deploy_steps, + 'deploy_step_index': 0} + if not skip: + driver_info['skip_current_deploy_step'] = skip + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.DEPLOYWAIT, + target_provision_state=states.MANAGEABLE, + driver_internal_info=driver_info, deploy_step=self.deploy_steps[0]) + self._start_service() + self.service.continue_node_deploy(self.context, node.uuid) + self._stop_service() + node.refresh() + if skip: + expected_step_index = 1 + else: + self.assertNotIn( + 'skip_current_deploy_step', node.driver_internal_info) + expected_step_index = 0 + mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step, + mock.ANY, expected_step_index, mock.ANY) + + def test_continue_node_deploy_skip_step(self): + self._continue_node_deploy_skip_step() + + def test_continue_node_deploy_no_skip_step(self): + self._continue_node_deploy_skip_step(skip=False) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', + autospec=True) + def test_do_next_deploy_step_oob_reboot(self, mock_execute): + # When a deploy step fails, go to DEPLOYWAIT + tgt_prov_state = states.ACTIVE + + self._start_service() + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.DEPLOYING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'deploy_steps': self.deploy_steps, + 'deploy_step_index': None, + 'deployment_reboot': True}, + clean_step={}) + mock_execute.side_effect = exception.AgentConnectionFailed( + reason='failed') + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + manager._do_next_deploy_step(task, 0, mock.ANY) + + self._stop_service() + node.refresh() + + # Make sure we go to CLEANWAIT + self.assertEqual(states.DEPLOYWAIT, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual(self.deploy_steps[0], node.deploy_step) + self.assertEqual(0, node.driver_internal_info['deploy_step_index']) + self.assertFalse(node.driver_internal_info['skip_current_deploy_step']) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.deploy_steps[0]) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', + autospec=True) + def test_do_next_clean_step_oob_reboot_fail(self, + mock_execute): + # When a deploy step fails with no reboot requested go to DEPLOYFAIL + tgt_prov_state = states.ACTIVE + + self._start_service() + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.DEPLOYING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'deploy_steps': self.deploy_steps, + 'deploy_step_index': None}, + deploy_step={}) + mock_execute.side_effect = exception.AgentConnectionFailed( + reason='failed') + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + manager._do_next_deploy_step(task, 0, mock.ANY) + + self._stop_service() + node.refresh() + + # Make sure we go to DEPLOYFAIL, clear deploy_steps + self.assertEqual(states.DEPLOYFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual({}, node.deploy_step) + self.assertNotIn('deploy_step_index', node.driver_internal_info) + self.assertNotIn('skip_current_deploy_step', node.driver_internal_info) + self.assertIsNotNone(node.last_error) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.deploy_steps[0]) + @mgr_utils.mock_record_keepalive class DoNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): @@ -2641,7 +2744,7 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin, mock_execute.assert_called_once_with(mock.ANY, mock.ANY, self.deploy_steps[0]) - def test__get_node_next_deploy_steps(self): + def _test__get_node_next_deploy_steps(self, skip=True): driver_internal_info = {'deploy_steps': self.deploy_steps, 'deploy_step_index': 0} node = obj_utils.create_test_node( @@ -2653,8 +2756,16 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin, deploy_step=self.deploy_steps[0]) with task_manager.acquire(self.context, node.uuid) as task: - step_index = self.service._get_node_next_deploy_steps(task) - self.assertEqual(1, step_index) + step_index = self.service._get_node_next_deploy_steps( + task, skip_current_step=skip) + expected_index = 1 if skip else 0 + self.assertEqual(expected_index, step_index) + + def test__get_node_next_deploy_steps(self): + self._test__get_node_next_deploy_steps() + + def test__get_node_next_deploy_steps_no_skip(self): + self._test__get_node_next_deploy_steps(skip=False) def test__get_node_next_deploy_steps_unset_deploy_step(self): driver_internal_info = {'deploy_steps': self.deploy_steps, diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index dc2e4b3d0a..5c5fb68b31 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -917,6 +917,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase): self.assertEqual(self.errmsg, self.node.last_error) self.assertEqual({}, self.node.deploy_step) self.assertNotIn('deploy_step_index', self.node.driver_internal_info) + self.assertNotIn('deployment_reboot', self.node.driver_internal_info) self.task.process_event.assert_called_once_with('fail') def _test_deploying_error_handler_cleanup(self, exc, expected_str): diff --git a/ironic/tests/unit/drivers/modules/test_agent.py b/ironic/tests/unit/drivers/modules/test_agent.py index e804f3d176..15d5b675a2 100644 --- a/ironic/tests/unit/drivers/modules/test_agent.py +++ b/ironic/tests/unit/drivers/modules/test_agent.py @@ -321,6 +321,23 @@ class TestAgentDeploy(db_base.DbTestCase): power_mock.assert_called_once_with(task, states.REBOOT) self.assertFalse(mock_pxe_instance.called) + @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True) + @mock.patch('ironic.conductor.utils.node_power_action', autospec=True) + def test_deploy_with_deployment_reboot(self, power_mock, + mock_pxe_instance): + driver_internal_info = self.node.driver_internal_info + driver_internal_info['deployment_reboot'] = True + self.node.driver_internal_info = driver_internal_info + self.node.save() + with task_manager.acquire( + self.context, self.node['uuid'], shared=False) as task: + driver_return = self.driver.deploy(task) + self.assertEqual(driver_return, states.DEPLOYWAIT) + self.assertFalse(power_mock.called) + self.assertFalse(mock_pxe_instance.called) + self.assertNotIn( + 'deployment_reboot', task.node.driver_internal_info) + @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True) @mock.patch.object(noop_storage.NoopStorage, 'should_write_image', autospec=True) diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index 223c3815c8..185e063600 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -138,6 +138,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): self.assertFalse(cd_mock.called) rti_mock.assert_called_once_with(self.deploy, task) + @mock.patch.object(manager_utils, + 'notify_conductor_resume_deploy', autospec=True) @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'in_core_deploy_step', autospec=True) @mock.patch.object(agent_base_vendor.HeartbeatMixin, @@ -151,7 +153,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): def test_heartbeat_not_in_core_deploy_step(self, rti_mock, cd_mock, deploy_is_done_mock, deploy_started_mock, - in_deploy_mock): + in_deploy_mock, + in_resume_deploy_mock): # Check that heartbeats do not trigger deployment actions when not in # the deploy.deploy step. in_deploy_mock.return_value = False @@ -170,6 +173,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): self.assertFalse(deploy_is_done_mock.called) self.assertFalse(cd_mock.called) self.assertFalse(rti_mock.called) + self.assertTrue(in_resume_deploy_mock.called) @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', autospec=True) diff --git a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py index 054a81e50d..e36431b543 100644 --- a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py +++ b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py @@ -795,6 +795,27 @@ class ISCSIDeployTestCase(db_base.DbTestCase): mock_check_image_size.assert_called_once_with(task) mock_node_power_action.assert_called_once_with(task, states.REBOOT) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(iscsi_deploy, 'check_image_size', autospec=True) + @mock.patch.object(deploy_utils, 'cache_instance_image', autospec=True) + def test_deploy_with_deployment_reboot(self, mock_cache_instance_image, + mock_check_image_size, + mock_node_power_action): + driver_internal_info = self.node.driver_internal_info + driver_internal_info['deployment_reboot'] = True + self.node.driver_internal_info = driver_internal_info + self.node.save() + with task_manager.acquire(self.context, + self.node.uuid, shared=False) as task: + state = task.driver.deploy.deploy(task) + self.assertEqual(state, states.DEPLOYWAIT) + mock_cache_instance_image.assert_called_once_with( + self.context, task.node) + mock_check_image_size.assert_called_once_with(task) + self.assertFalse(mock_node_power_action.called) + self.assertNotIn( + 'deployment_reboot', task.node.driver_internal_info) + @mock.patch.object(noop_storage.NoopStorage, 'should_write_image', autospec=True) @mock.patch.object(flat_network.FlatNetwork, diff --git a/releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml b/releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml new file mode 100644 index 0000000000..68a1686ab6 --- /dev/null +++ b/releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixes an issue wherein asynchronous out-of-band deploy steps in + deployment template fails to execute. See `story 2006342 + `__ for details.