Don't resume deployment or cleaning on heartbeat when polling
Some drivers use a periodic task to poll for completion of a deploy or clean step. The iDRAC RAID driver is one example of this. In https://review.opendev.org/#/c/676152, the agent heartbeat handler was modified to resume deployment if not currently in the core deploy step. This makes sense for the ilo driver, which does not poll for completion of RAID configuration, but the iDRAC driver polls the lifecycle controller's job queue, and expects to be able to resume deployment once the job is complete. However, there is now a race between the agent heartbeat as the node boots up, and the job queue poller. This change adds new flags, cleaning_polling and deployment_polling, which can be used by a driver to signal that they are polling for completion of a deploy step, and that the agent heartbeat should not be used for this purpose. We also add here some more cleanup of the cleaning and deployment step metadata in driver_internal_info, since if these fields are left in place they may affect subsequent cleaning or deployment steps. Change-Id: I34591440ab993a80a0cc88be6e10e33f1ae4a660 Story: 2003817 Task: 36563
This commit is contained in:
parent
f7e1739c0c
commit
3e51982252
@ -916,12 +916,17 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
'state': node.provision_state,
|
||||
'deploy_state': ', '.join(expected_states)})
|
||||
|
||||
save_required = False
|
||||
info = node.driver_internal_info
|
||||
try:
|
||||
skip_current_step = info.pop('skip_current_deploy_step')
|
||||
except KeyError:
|
||||
skip_current_step = True
|
||||
else:
|
||||
save_required = True
|
||||
if info.pop('deployment_polling', None) is not None:
|
||||
save_required = True
|
||||
if save_required:
|
||||
node.driver_internal_info = info
|
||||
node.save()
|
||||
|
||||
@ -1227,12 +1232,17 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
'state': node.provision_state,
|
||||
'clean_state': states.CLEANWAIT})
|
||||
|
||||
save_required = False
|
||||
info = node.driver_internal_info
|
||||
try:
|
||||
skip_current_step = info.pop('skip_current_clean_step')
|
||||
except KeyError:
|
||||
skip_current_step = True
|
||||
else:
|
||||
save_required = True
|
||||
if info.pop('cleaning_polling', None) is not None:
|
||||
save_required = True
|
||||
if save_required:
|
||||
node.driver_internal_info = info
|
||||
node.save()
|
||||
|
||||
@ -1460,6 +1470,7 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
driver_internal_info['clean_steps'] = None
|
||||
driver_internal_info.pop('clean_step_index', None)
|
||||
driver_internal_info.pop('cleaning_reboot', None)
|
||||
driver_internal_info.pop('cleaning_polling', None)
|
||||
node.driver_internal_info = driver_internal_info
|
||||
node.save()
|
||||
try:
|
||||
@ -1543,6 +1554,13 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
|
||||
node.last_error = last_error
|
||||
node.clean_step = None
|
||||
info = node.driver_internal_info
|
||||
# Clear any leftover metadata about cleaning
|
||||
info.pop('clean_step_index', None)
|
||||
info.pop('cleaning_reboot', None)
|
||||
info.pop('cleaning_polling', None)
|
||||
info.pop('skip_current_clean_step', None)
|
||||
node.driver_internal_info = info
|
||||
node.save()
|
||||
LOG.info(info_message)
|
||||
|
||||
@ -3958,6 +3976,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
|
||||
driver_internal_info['deploy_steps'] = None
|
||||
driver_internal_info.pop('deploy_step_index', None)
|
||||
driver_internal_info.pop('deployment_reboot', None)
|
||||
driver_internal_info.pop('deployment_polling', None)
|
||||
node.driver_internal_info = driver_internal_info
|
||||
node.save()
|
||||
|
||||
|
@ -408,9 +408,11 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
|
||||
# Clear clean step, msg should already include current step
|
||||
node.clean_step = {}
|
||||
info = node.driver_internal_info
|
||||
# Clear any leftover metadata about cleaning
|
||||
info.pop('clean_step_index', None)
|
||||
# Clear any leftover metadata about cleaning reboots
|
||||
info.pop('cleaning_reboot', None)
|
||||
info.pop('cleaning_polling', None)
|
||||
info.pop('skip_current_clean_step', None)
|
||||
node.driver_internal_info = info
|
||||
# For manual cleaning, the target provision state is MANAGEABLE, whereas
|
||||
# for automated cleaning, it is AVAILABLE.
|
||||
@ -466,9 +468,11 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False,
|
||||
# in node.driver_internal_info for debugging purposes.
|
||||
node.deploy_step = {}
|
||||
info = node.driver_internal_info
|
||||
# Clear any leftover metadata about deployment.
|
||||
info.pop('deploy_step_index', None)
|
||||
# Clear any leftover metadata about deployment reboots
|
||||
info.pop('deployment_reboot', None)
|
||||
info.pop('deployment_polling', None)
|
||||
info.pop('skip_current_deploy_step', None)
|
||||
node.driver_internal_info = info
|
||||
|
||||
if cleanup_err:
|
||||
|
@ -377,7 +377,7 @@ class HeartbeatMixin(object):
|
||||
# NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
|
||||
# are currently in the core deploy.deploy step. Other deploy steps
|
||||
# may cause the agent to boot, but we should not trigger deployment
|
||||
# at that point.
|
||||
# at that point if the driver is polling for completion of a step.
|
||||
if node.provision_state == states.DEPLOYWAIT:
|
||||
if self.in_core_deploy_step(task):
|
||||
if not self.deploy_has_started(task):
|
||||
@ -391,6 +391,11 @@ class HeartbeatMixin(object):
|
||||
else:
|
||||
# The exceptions from RPC are not possible as we using cast
|
||||
# here
|
||||
# Check if the driver is polling for completion of a step,
|
||||
# via the 'deployment_polling' flag.
|
||||
polling = node.driver_internal_info.get(
|
||||
'deployment_polling', False)
|
||||
if not polling:
|
||||
manager_utils.notify_conductor_resume_deploy(task)
|
||||
node.touch_provisioning()
|
||||
elif node.provision_state == states.CLEANWAIT:
|
||||
@ -408,6 +413,11 @@ class HeartbeatMixin(object):
|
||||
manager_utils.notify_conductor_resume_clean(task)
|
||||
else:
|
||||
msg = _('Node failed to check cleaning progress.')
|
||||
# Check if the driver is polling for completion of a step,
|
||||
# via the 'cleaning_polling' flag.
|
||||
polling = node.driver_internal_info.get(
|
||||
'cleaning_polling', False)
|
||||
if not polling:
|
||||
self.continue_cleaning(task)
|
||||
elif (node.provision_state == states.RESCUEWAIT):
|
||||
msg = _('Node failed to perform rescue operation.')
|
||||
|
@ -1475,7 +1475,8 @@ def get_async_step_return_state(node):
|
||||
return states.CLEANWAIT if node.clean_step else states.DEPLOYWAIT
|
||||
|
||||
|
||||
def set_async_step_flags(node, reboot=None, skip_current_step=None):
|
||||
def set_async_step_flags(node, reboot=None, skip_current_step=None,
|
||||
polling=None):
|
||||
"""Sets appropriate reboot flags in driver_internal_info based on operation
|
||||
|
||||
:param node: an ironic node object.
|
||||
@ -1488,16 +1489,24 @@ def set_async_step_flags(node, reboot=None, skip_current_step=None):
|
||||
skip_current_deploy_step based on cleaning or deployment operation
|
||||
in progress. If it is None, corresponding skip step flag is not set
|
||||
in node's driver_internal_info.
|
||||
:param polling: Boolean value to set for node's driver_internal_info flag
|
||||
deployment_polling or cleaning_polling. If it is None, the
|
||||
corresponding polling flag is not set in the node's
|
||||
driver_internal_info.
|
||||
"""
|
||||
info = node.driver_internal_info
|
||||
cleaning = {'reboot': 'cleaning_reboot',
|
||||
'skip': 'skip_current_clean_step'}
|
||||
'skip': 'skip_current_clean_step',
|
||||
'polling': 'cleaning_polling'}
|
||||
deployment = {'reboot': 'deployment_reboot',
|
||||
'skip': 'skip_current_deploy_step'}
|
||||
'skip': 'skip_current_deploy_step',
|
||||
'polling': 'deployment_polling'}
|
||||
fields = cleaning if node.clean_step else deployment
|
||||
if reboot is not None:
|
||||
info[fields['reboot']] = reboot
|
||||
if skip_current_step is not None:
|
||||
info[fields['skip']] = skip_current_step
|
||||
if polling is not None:
|
||||
info[fields['polling']] = polling
|
||||
node.driver_internal_info = info
|
||||
node.save()
|
||||
|
@ -1878,6 +1878,26 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
def test_continue_node_deploy_no_skip_step(self):
|
||||
self._continue_node_deploy_skip_step(skip=False)
|
||||
|
||||
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
|
||||
autospec=True)
|
||||
def test_continue_node_deploy_polling(self, mock_spawn):
|
||||
# test that deployment_polling flag is cleared
|
||||
driver_info = {'deploy_steps': self.deploy_steps,
|
||||
'deploy_step_index': 0,
|
||||
'deployment_polling': True}
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake-hardware',
|
||||
provision_state=states.DEPLOYWAIT,
|
||||
target_provision_state=states.MANAGEABLE,
|
||||
driver_internal_info=driver_info, deploy_step=self.deploy_steps[0])
|
||||
self._start_service()
|
||||
self.service.continue_node_deploy(self.context, node.uuid)
|
||||
self._stop_service()
|
||||
node.refresh()
|
||||
self.assertNotIn('deployment_polling', node.driver_internal_info)
|
||||
mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step,
|
||||
mock.ANY, 1, mock.ANY)
|
||||
|
||||
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
|
||||
autospec=True)
|
||||
def test_do_next_deploy_step_oob_reboot(self, mock_execute):
|
||||
@ -3257,7 +3277,12 @@ class DoNodeCleanAbortTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
self.context, driver='fake-hardware',
|
||||
provision_state=states.CLEANFAIL,
|
||||
target_provision_state=states.AVAILABLE,
|
||||
clean_step={'step': 'foo', 'abortable': True})
|
||||
clean_step={'step': 'foo', 'abortable': True},
|
||||
driver_internal_info={
|
||||
'clean_step_index': 2,
|
||||
'cleaning_reboot': True,
|
||||
'cleaning_polling': True,
|
||||
'skip_current_clean_step': True})
|
||||
|
||||
with task_manager.acquire(self.context, node.uuid) as task:
|
||||
self.service._do_node_clean_abort(task, step_name=step_name)
|
||||
@ -3265,8 +3290,16 @@ class DoNodeCleanAbortTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
tear_mock.assert_called_once_with(task.driver.deploy, task)
|
||||
if step_name:
|
||||
self.assertIn(step_name, task.node.last_error)
|
||||
# assert node's clean_step was cleaned up
|
||||
# assert node's clean_step and metadata was cleaned up
|
||||
self.assertEqual({}, task.node.clean_step)
|
||||
self.assertNotIn('clean_step_index',
|
||||
task.node.driver_internal_info)
|
||||
self.assertNotIn('cleaning_reboot',
|
||||
task.node.driver_internal_info)
|
||||
self.assertNotIn('cleaning_polling',
|
||||
task.node.driver_internal_info)
|
||||
self.assertNotIn('skip_current_clean_step',
|
||||
task.node.driver_internal_info)
|
||||
|
||||
def test__do_node_clean_abort(self):
|
||||
self._test__do_node_clean_abort(None)
|
||||
@ -3543,6 +3576,27 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
def test_continue_node_clean_no_skip_step(self):
|
||||
self._continue_node_clean_skip_step(skip=False)
|
||||
|
||||
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
|
||||
autospec=True)
|
||||
def test_continue_node_clean_polling(self, mock_spawn):
|
||||
# test that cleaning_polling flag is cleared
|
||||
driver_info = {'clean_steps': self.clean_steps,
|
||||
'clean_step_index': 0,
|
||||
'cleaning_polling': True}
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake-hardware',
|
||||
provision_state=states.CLEANWAIT,
|
||||
target_provision_state=states.MANAGEABLE,
|
||||
driver_internal_info=driver_info, clean_step=self.clean_steps[0])
|
||||
self._start_service()
|
||||
self.service.continue_node_clean(self.context, node.uuid)
|
||||
self._stop_service()
|
||||
node.refresh()
|
||||
self.assertNotIn('cleaning_polling', node.driver_internal_info)
|
||||
mock_spawn.assert_called_with(self.service,
|
||||
self.service._do_next_clean_step,
|
||||
mock.ANY, 1)
|
||||
|
||||
def _continue_node_clean_abort(self, manual=False):
|
||||
last_clean_step = self.clean_steps[0]
|
||||
last_clean_step['abortable'] = False
|
||||
|
@ -915,6 +915,11 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
|
||||
self.task)
|
||||
|
||||
def test_deploying_error_handler(self):
|
||||
info = self.node.driver_internal_info
|
||||
info['deploy_step_index'] = 2
|
||||
info['deployment_reboot'] = True
|
||||
info['deployment_polling'] = True
|
||||
info['skip_current_deploy_step'] = True
|
||||
conductor_utils.deploying_error_handler(self.task, self.logmsg,
|
||||
self.errmsg)
|
||||
|
||||
@ -924,6 +929,9 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
|
||||
self.assertEqual({}, self.node.deploy_step)
|
||||
self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
|
||||
self.assertNotIn('deployment_reboot', self.node.driver_internal_info)
|
||||
self.assertNotIn('deployment_polling', self.node.driver_internal_info)
|
||||
self.assertNotIn('skip_current_deploy_step',
|
||||
self.node.driver_internal_info)
|
||||
self.task.process_event.assert_called_once_with('fail')
|
||||
|
||||
def _test_deploying_error_handler_cleanup(self, exc, expected_str):
|
||||
@ -1049,12 +1057,18 @@ class ErrorHandlersTestCase(tests_base.TestCase):
|
||||
self.node.clean_step = {'key': 'val'}
|
||||
self.node.driver_internal_info = {
|
||||
'cleaning_reboot': True,
|
||||
'cleaning_polling': True,
|
||||
'skip_current_clean_step': True,
|
||||
'clean_step_index': 0}
|
||||
msg = 'error bar'
|
||||
conductor_utils.cleaning_error_handler(self.task, msg)
|
||||
self.node.save.assert_called_once_with()
|
||||
self.assertEqual({}, self.node.clean_step)
|
||||
self.assertNotIn('clean_step_index', self.node.driver_internal_info)
|
||||
self.assertNotIn('cleaning_reboot', self.node.driver_internal_info)
|
||||
self.assertNotIn('cleaning_polling', self.node.driver_internal_info)
|
||||
self.assertNotIn('skip_current_clean_step',
|
||||
self.node.driver_internal_info)
|
||||
self.assertEqual(msg, self.node.last_error)
|
||||
self.assertTrue(self.node.maintenance)
|
||||
self.assertEqual(msg, self.node.maintenance_reason)
|
||||
|
@ -175,6 +175,46 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
|
||||
self.assertFalse(rti_mock.called)
|
||||
self.assertTrue(in_resume_deploy_mock.called)
|
||||
|
||||
@mock.patch.object(manager_utils,
|
||||
'notify_conductor_resume_deploy', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
|
||||
'in_core_deploy_step', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
|
||||
'deploy_has_started', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
|
||||
'deploy_is_done', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
|
||||
autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
|
||||
'reboot_to_instance', autospec=True)
|
||||
def test_heartbeat_not_in_core_deploy_step_polling(self, rti_mock, cd_mock,
|
||||
deploy_is_done_mock,
|
||||
deploy_started_mock,
|
||||
in_deploy_mock,
|
||||
in_resume_deploy_mock):
|
||||
# Check that heartbeats do not trigger deployment actions when not in
|
||||
# the deploy.deploy step.
|
||||
in_deploy_mock.return_value = False
|
||||
self.node.provision_state = states.DEPLOYWAIT
|
||||
info = self.node.driver_internal_info
|
||||
info['deployment_polling'] = True
|
||||
self.node.driver_internal_info = info
|
||||
self.node.save()
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=True) as task:
|
||||
self.deploy.heartbeat(task, 'url', '3.2.0')
|
||||
self.assertFalse(task.shared)
|
||||
self.assertEqual(
|
||||
'url', task.node.driver_internal_info['agent_url'])
|
||||
self.assertEqual(
|
||||
'3.2.0',
|
||||
task.node.driver_internal_info['agent_version'])
|
||||
self.assertFalse(deploy_started_mock.called)
|
||||
self.assertFalse(deploy_is_done_mock.called)
|
||||
self.assertFalse(cd_mock.called)
|
||||
self.assertFalse(rti_mock.called)
|
||||
self.assertFalse(in_resume_deploy_mock.called)
|
||||
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
|
||||
autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
|
||||
@ -451,6 +491,29 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
|
||||
mock_touch.assert_called_once_with(mock.ANY)
|
||||
mock_continue.assert_called_once_with(mock.ANY, task)
|
||||
|
||||
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
|
||||
'continue_cleaning', autospec=True)
|
||||
def test_heartbeat_continue_cleaning_polling(self, mock_continue,
|
||||
mock_touch):
|
||||
info = self.node.driver_internal_info
|
||||
info['cleaning_polling'] = True
|
||||
self.node.driver_internal_info = info
|
||||
self.node.clean_step = {
|
||||
'priority': 10,
|
||||
'interface': 'deploy',
|
||||
'step': 'foo',
|
||||
'reboot_requested': False
|
||||
}
|
||||
self.node.provision_state = states.CLEANWAIT
|
||||
self.node.save()
|
||||
with task_manager.acquire(
|
||||
self.context, self.node.uuid, shared=False) as task:
|
||||
self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0')
|
||||
|
||||
mock_touch.assert_called_once_with(mock.ANY)
|
||||
self.assertFalse(mock_continue.called)
|
||||
|
||||
@mock.patch.object(manager_utils, 'cleaning_error_handler')
|
||||
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
|
||||
'continue_cleaning', autospec=True)
|
||||
|
@ -2889,15 +2889,17 @@ class AsyncStepTestCase(db_base.DbTestCase):
|
||||
self.node.save()
|
||||
self._test_get_async_step_return_state()
|
||||
|
||||
def test_set_async_step_flags_cleaning_set_both(self):
|
||||
def test_set_async_step_flags_cleaning_set_all(self):
|
||||
self.node.clean_step = {'step': 'create_configuration',
|
||||
'interface': 'raid'}
|
||||
self.node.driver_internal_info = {}
|
||||
expected = {'cleaning_reboot': True,
|
||||
'cleaning_polling': True,
|
||||
'skip_current_clean_step': True}
|
||||
self.node.save()
|
||||
utils.set_async_step_flags(self.node, reboot=True,
|
||||
skip_current_step=True)
|
||||
skip_current_step=True,
|
||||
polling=True)
|
||||
self.assertEqual(expected, self.node.driver_internal_info)
|
||||
|
||||
def test_set_async_step_flags_cleaning_set_one(self):
|
||||
@ -2909,15 +2911,17 @@ class AsyncStepTestCase(db_base.DbTestCase):
|
||||
self.assertEqual({'cleaning_reboot': True},
|
||||
self.node.driver_internal_info)
|
||||
|
||||
def test_set_async_step_flags_deploying_set_both(self):
|
||||
def test_set_async_step_flags_deploying_set_all(self):
|
||||
self.node.deploy_step = {'step': 'create_configuration',
|
||||
'interface': 'raid'}
|
||||
self.node.driver_internal_info = {}
|
||||
expected = {'deployment_reboot': True,
|
||||
'deployment_polling': True,
|
||||
'skip_current_deploy_step': True}
|
||||
self.node.save()
|
||||
utils.set_async_step_flags(self.node, reboot=True,
|
||||
skip_current_step=True)
|
||||
skip_current_step=True,
|
||||
polling=True)
|
||||
self.assertEqual(expected, self.node.driver_internal_info)
|
||||
|
||||
def test_set_async_step_flags_deploying_set_one(self):
|
||||
|
@ -0,0 +1,8 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes an issue with asynchronous deploy steps that poll for completion
|
||||
where the step could fail to execute. The ``deployment_polling`` and
|
||||
``cleaning_polling`` flags may be used by driver implementations to signal
|
||||
that the driver is polling for completion. See `story 2003817
|
||||
<https://storyboard.openstack.org/#!/story/2003817>`__ for details.
|
Loading…
Reference in New Issue
Block a user