diff --git a/ironic/conductor/deployments.py b/ironic/conductor/deployments.py index 75308482d7..d0d3acc39e 100644 --- a/ironic/conductor/deployments.py +++ b/ironic/conductor/deployments.py @@ -286,7 +286,8 @@ def do_next_deploy_step(task, step_index, conductor_id): LOG.info('Deploy step %(step)s on node %(node)s being ' 'executed asynchronously, waiting for driver.', {'node': node.uuid, 'step': step}) - task.process_event('wait') + if task.node.provision_state != states.DEPLOYWAIT: + task.process_event('wait') return elif result is not None: # NOTE(rloo): This is an internal/dev error; shouldn't happen. diff --git a/ironic/drivers/modules/agent.py b/ironic/drivers/modules/agent.py index 605860d8a8..a1aa4857c7 100644 --- a/ironic/drivers/modules/agent.py +++ b/ironic/drivers/modules/agent.py @@ -186,17 +186,13 @@ class AgentDeployMixin(agent_base.AgentDeployMixin): if not commands: return False - last_command = commands[-1] - - if last_command['command_name'] != 'prepare_image': - # catches race condition where prepare_image is still processing - # so deploy hasn't started yet + try: + last_command = next(cmd for cmd in reversed(commands) + if cmd['command_name'] == 'prepare_image') + except StopIteration: return False - - if last_command['command_status'] != 'RUNNING': - return True - - return False + else: + return last_command['command_status'] != 'RUNNING' @METRICS.timer('AgentDeployMixin.continue_deploy') @task_manager.require_exclusive_lock @@ -487,6 +483,7 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface): # the state machine state going from DEPLOYWAIT -> DEPLOYING task.process_event('wait') self.continue_deploy(task) + return states.DEPLOYWAIT elif task.driver.storage.should_write_image(task): # Check if the driver has already performed a reboot in a previous # deploy step. diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py index 61ef16451c..473d82a659 100644 --- a/ironic/drivers/modules/agent_base.py +++ b/ironic/drivers/modules/agent_base.py @@ -722,10 +722,15 @@ class AgentDeployMixin(HeartbeatMixin): 'steps': previous_steps}) call = getattr(self._client, 'get_%s_steps' % step_type) - # TODO(dtantsur): remove the error handling in the V release. try: agent_result = call(node, task.ports).get('command_result', {}) except exception.AgentAPIError as exc: + if 'agent is busy' in str(exc): + LOG.debug('Agent is busy with a command, will refresh steps ' + 'on the next heartbeat') + return + + # TODO(dtantsur): change to just 'raise' if step_type == 'clean': raise else: diff --git a/ironic/tests/unit/conductor/test_deployments.py b/ironic/tests/unit/conductor/test_deployments.py index 8e8ccdea65..df078032d2 100644 --- a/ironic/tests/unit/conductor/test_deployments.py +++ b/ironic/tests/unit/conductor/test_deployments.py @@ -424,6 +424,39 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin, mock_execute.assert_called_once_with(mock.ANY, task, self.deploy_steps[0]) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', + autospec=True) + def test__do_next_deploy_step_in_deploywait(self, mock_execute): + driver_internal_info = {'deploy_step_index': None, + 'deploy_steps': self.deploy_steps} + self._start_service() + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + driver_internal_info=driver_internal_info, + deploy_step={}) + + def fake_execute(interface, task, step): + # A deploy step leaves the node in DEPLOYWAIT + task.process_event('wait') + return states.DEPLOYWAIT + + mock_execute.side_effect = fake_execute + expected_first_step = node.driver_internal_info['deploy_steps'][0] + task = task_manager.TaskManager(self.context, node.uuid) + task.process_event('deploy') + + deployments.do_next_deploy_step(task, 0, self.service.conductor.id) + + node.refresh() + self.assertIsNone(node.last_error) + self.assertEqual(states.DEPLOYWAIT, node.provision_state) + self.assertEqual(states.ACTIVE, node.target_provision_state) + self.assertEqual(expected_first_step, node.deploy_step) + self.assertEqual(0, node.driver_internal_info['deploy_step_index']) + self.assertEqual(self.service.conductor.id, node.conductor_affinity) + mock_execute.assert_called_once_with(mock.ANY, task, + self.deploy_steps[0]) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', autospec=True) def test__do_next_deploy_step_continue_from_last_step(self, mock_execute): diff --git a/ironic/tests/unit/drivers/modules/test_agent.py b/ironic/tests/unit/drivers/modules/test_agent.py index 7f04e698e6..57f7c49900 100644 --- a/ironic/tests/unit/drivers/modules/test_agent.py +++ b/ironic/tests/unit/drivers/modules/test_agent.py @@ -492,7 +492,7 @@ class TestAgentDeploy(db_base.DbTestCase): self.node.save() with task_manager.acquire( self.context, self.node['uuid'], shared=False) as task: - self.driver.deploy(task) + self.assertEqual(states.DEPLOYWAIT, self.driver.deploy(task)) self.assertFalse(power_mock.called) self.assertFalse(mock_pxe_instance.called) task.node.refresh() @@ -1739,6 +1739,27 @@ class TestAgentDeploy(db_base.DbTestCase): 'command_status': 'RUNNING'}] self.assertFalse(task.driver.deploy.deploy_is_done(task)) + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + def test_deploy_is_done_several_results(self, mock_get_cmd): + with task_manager.acquire(self.context, self.node.uuid) as task: + mock_get_cmd.return_value = [ + {'command_name': 'prepare_image', 'command_status': 'SUCCESS'}, + {'command_name': 'other_command', 'command_status': 'SUCCESS'}, + {'command_name': 'prepare_image', 'command_status': 'RUNNING'}, + ] + self.assertFalse(task.driver.deploy.deploy_is_done(task)) + + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + def test_deploy_is_done_not_the_last(self, mock_get_cmd): + with task_manager.acquire(self.context, self.node.uuid) as task: + mock_get_cmd.return_value = [ + {'command_name': 'prepare_image', 'command_status': 'SUCCESS'}, + {'command_name': 'other_command', 'command_status': 'SUCCESS'}, + ] + self.assertTrue(task.driver.deploy.deploy_is_done(task)) + @mock.patch.object(manager_utils, 'restore_power_state_if_needed', autospec=True) @mock.patch.object(manager_utils, 'power_on_node_if_needed', diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py index 21dd388816..160fef0128 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base.py @@ -2235,6 +2235,25 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest): self.assertEqual([self.clean_steps['clean_steps'][ 'SpecificHardwareManager'][1]], steps['raid']) + @mock.patch.object(agent_base.LOG, 'warning', autospec=True) + @mock.patch.object(agent_client.AgentClient, 'get_deploy_steps', + autospec=True) + def test_refresh_steps_busy(self, client_mock, log_mock): + client_mock.side_effect = exception.AgentAPIError( + node="node", status="500", error='agent is busy') + + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.deploy.refresh_steps(task, 'deploy') + + client_mock.assert_called_once_with(mock.ANY, task.node, + task.ports) + self.assertNotIn('agent_cached_deploy_steps_refreshed', + task.node.driver_internal_info) + self.assertIsNone(task.node.driver_internal_info.get( + 'agent_cached_deploy_steps')) + self.assertFalse(log_mock.called) + @mock.patch.object(agent_client.AgentClient, 'get_clean_steps', autospec=True) def test_refresh_steps_missing_steps(self, client_mock): diff --git a/releasenotes/notes/direct-fast-track-d0f43850b6e80751.yaml b/releasenotes/notes/direct-fast-track-d0f43850b6e80751.yaml new file mode 100644 index 0000000000..fee9738a9d --- /dev/null +++ b/releasenotes/notes/direct-fast-track-d0f43850b6e80751.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixes fast-track deployments with the ``direct`` deploy interface that + used to hang previously.