Fix fast-track with the direct deploy interface

Several small fixes:

1) Make sure the deploy.deploy step returns DEPLOYWAIT after running
   prepare_image otherwise the conductor wrongly assumes that the
   deploy is done.
2) Handle the case when provision_state==DEPLOYWAIT when returning
   from an asynchronous deploy step.
3) Do not assume that prepare_image is always the last command to run,
   sometimes get_deploy_steps sneaks in.
4) Do not issue a deprecation warning when receiving "agent is busy"
   on get_deploy_steps, this is normal for fast-track.

Change-Id: I19274c48bd36fca19961a7d78467ec8c29f85905
(cherry picked from commit f0803493de)
This commit is contained in:
Dmitry Tantsur 2020-06-17 13:24:10 +02:00
parent ee48b6c38f
commit 316a868da8
7 changed files with 94 additions and 13 deletions

View File

@ -286,7 +286,8 @@ def do_next_deploy_step(task, step_index, conductor_id):
LOG.info('Deploy step %(step)s on node %(node)s being ' LOG.info('Deploy step %(step)s on node %(node)s being '
'executed asynchronously, waiting for driver.', 'executed asynchronously, waiting for driver.',
{'node': node.uuid, 'step': step}) {'node': node.uuid, 'step': step})
task.process_event('wait') if task.node.provision_state != states.DEPLOYWAIT:
task.process_event('wait')
return return
elif result is not None: elif result is not None:
# NOTE(rloo): This is an internal/dev error; shouldn't happen. # NOTE(rloo): This is an internal/dev error; shouldn't happen.

View File

@ -186,17 +186,13 @@ class AgentDeployMixin(agent_base.AgentDeployMixin):
if not commands: if not commands:
return False return False
last_command = commands[-1] try:
last_command = next(cmd for cmd in reversed(commands)
if last_command['command_name'] != 'prepare_image': if cmd['command_name'] == 'prepare_image')
# catches race condition where prepare_image is still processing except StopIteration:
# so deploy hasn't started yet
return False return False
else:
if last_command['command_status'] != 'RUNNING': return last_command['command_status'] != 'RUNNING'
return True
return False
@METRICS.timer('AgentDeployMixin.continue_deploy') @METRICS.timer('AgentDeployMixin.continue_deploy')
@task_manager.require_exclusive_lock @task_manager.require_exclusive_lock
@ -478,6 +474,7 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface):
# the state machine state going from DEPLOYWAIT -> DEPLOYING # the state machine state going from DEPLOYWAIT -> DEPLOYING
task.process_event('wait') task.process_event('wait')
self.continue_deploy(task) self.continue_deploy(task)
return states.DEPLOYWAIT
elif task.driver.storage.should_write_image(task): elif task.driver.storage.should_write_image(task):
# Check if the driver has already performed a reboot in a previous # Check if the driver has already performed a reboot in a previous
# deploy step. # deploy step.

View File

@ -722,10 +722,15 @@ class AgentDeployMixin(HeartbeatMixin):
'steps': previous_steps}) 'steps': previous_steps})
call = getattr(self._client, 'get_%s_steps' % step_type) call = getattr(self._client, 'get_%s_steps' % step_type)
# TODO(dtantsur): remove the error handling in the V release.
try: try:
agent_result = call(node, task.ports).get('command_result', {}) agent_result = call(node, task.ports).get('command_result', {})
except exception.AgentAPIError as exc: except exception.AgentAPIError as exc:
if 'agent is busy' in str(exc):
LOG.debug('Agent is busy with a command, will refresh steps '
'on the next heartbeat')
return
# TODO(dtantsur): change to just 'raise'
if step_type == 'clean': if step_type == 'clean':
raise raise
else: else:

View File

@ -423,6 +423,39 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
mock_execute.assert_called_once_with(mock.ANY, task, mock_execute.assert_called_once_with(mock.ANY, task,
self.deploy_steps[0]) self.deploy_steps[0])
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
autospec=True)
def test__do_next_deploy_step_in_deploywait(self, mock_execute):
driver_internal_info = {'deploy_step_index': None,
'deploy_steps': self.deploy_steps}
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
driver_internal_info=driver_internal_info,
deploy_step={})
def fake_execute(interface, task, step):
# A deploy step leaves the node in DEPLOYWAIT
task.process_event('wait')
return states.DEPLOYWAIT
mock_execute.side_effect = fake_execute
expected_first_step = node.driver_internal_info['deploy_steps'][0]
task = task_manager.TaskManager(self.context, node.uuid)
task.process_event('deploy')
deployments.do_next_deploy_step(task, 0, self.service.conductor.id)
node.refresh()
self.assertIsNone(node.last_error)
self.assertEqual(states.DEPLOYWAIT, node.provision_state)
self.assertEqual(states.ACTIVE, node.target_provision_state)
self.assertEqual(expected_first_step, node.deploy_step)
self.assertEqual(0, node.driver_internal_info['deploy_step_index'])
self.assertEqual(self.service.conductor.id, node.conductor_affinity)
mock_execute.assert_called_once_with(mock.ANY, task,
self.deploy_steps[0])
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
autospec=True) autospec=True)
def test__do_next_deploy_step_continue_from_last_step(self, mock_execute): def test__do_next_deploy_step_continue_from_last_step(self, mock_execute):

View File

@ -492,7 +492,7 @@ class TestAgentDeploy(db_base.DbTestCase):
self.node.save() self.node.save()
with task_manager.acquire( with task_manager.acquire(
self.context, self.node['uuid'], shared=False) as task: self.context, self.node['uuid'], shared=False) as task:
self.driver.deploy(task) self.assertEqual(states.DEPLOYWAIT, self.driver.deploy(task))
self.assertFalse(power_mock.called) self.assertFalse(power_mock.called)
self.assertFalse(mock_pxe_instance.called) self.assertFalse(mock_pxe_instance.called)
task.node.refresh() task.node.refresh()
@ -1661,6 +1661,27 @@ class TestAgentDeploy(db_base.DbTestCase):
'command_status': 'RUNNING'}] 'command_status': 'RUNNING'}]
self.assertFalse(task.driver.deploy.deploy_is_done(task)) self.assertFalse(task.driver.deploy.deploy_is_done(task))
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
autospec=True)
def test_deploy_is_done_several_results(self, mock_get_cmd):
with task_manager.acquire(self.context, self.node.uuid) as task:
mock_get_cmd.return_value = [
{'command_name': 'prepare_image', 'command_status': 'SUCCESS'},
{'command_name': 'other_command', 'command_status': 'SUCCESS'},
{'command_name': 'prepare_image', 'command_status': 'RUNNING'},
]
self.assertFalse(task.driver.deploy.deploy_is_done(task))
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
autospec=True)
def test_deploy_is_done_not_the_last(self, mock_get_cmd):
with task_manager.acquire(self.context, self.node.uuid) as task:
mock_get_cmd.return_value = [
{'command_name': 'prepare_image', 'command_status': 'SUCCESS'},
{'command_name': 'other_command', 'command_status': 'SUCCESS'},
]
self.assertTrue(task.driver.deploy.deploy_is_done(task))
@mock.patch.object(manager_utils, 'restore_power_state_if_needed', @mock.patch.object(manager_utils, 'restore_power_state_if_needed',
autospec=True) autospec=True)
@mock.patch.object(manager_utils, 'power_on_node_if_needed', @mock.patch.object(manager_utils, 'power_on_node_if_needed',

View File

@ -2192,6 +2192,25 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest):
self.assertEqual([self.clean_steps['clean_steps'][ self.assertEqual([self.clean_steps['clean_steps'][
'SpecificHardwareManager'][1]], steps['raid']) 'SpecificHardwareManager'][1]], steps['raid'])
@mock.patch.object(agent_base.LOG, 'warning', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'get_deploy_steps',
autospec=True)
def test_refresh_steps_busy(self, client_mock, log_mock):
client_mock.side_effect = exception.AgentAPIError(
node="node", status="500", error='agent is busy')
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:
self.deploy.refresh_steps(task, 'deploy')
client_mock.assert_called_once_with(mock.ANY, task.node,
task.ports)
self.assertNotIn('agent_cached_deploy_steps_refreshed',
task.node.driver_internal_info)
self.assertIsNone(task.node.driver_internal_info.get(
'agent_cached_deploy_steps'))
self.assertFalse(log_mock.called)
@mock.patch.object(agent_client.AgentClient, 'get_clean_steps', @mock.patch.object(agent_client.AgentClient, 'get_clean_steps',
autospec=True) autospec=True)
def test_refresh_steps_missing_steps(self, client_mock): def test_refresh_steps_missing_steps(self, client_mock):

View File

@ -0,0 +1,5 @@
---
fixes:
- |
Fixes fast-track deployments with the ``direct`` deploy interface that
used to hang previously.