Don't resume deployment or cleaning on heartbeat when polling

Some drivers use a periodic task to poll for completion of a deploy
or clean step. The iDRAC RAID driver is one example of this. In
https://review.opendev.org/#/c/676152, the agent heartbeat handler was
modified to resume deployment if not currently in the core deploy step.
This makes sense for the ilo driver, which does not poll for completion
of RAID configuration, but the iDRAC driver polls the lifecycle
controller's job queue, and expects to be able to resume deployment once
the job is complete. However, there is now a race between the agent
heartbeat as the node boots up, and the job queue poller.

This change adds new flags, cleaning_polling and deployment_polling,
which can be used by a driver to signal that they are polling for
completion of a deploy step, and that the agent heartbeat should not be
used for this purpose.

We also add here some more cleanup of the cleaning and deployment step
metadata in driver_internal_info, since if these fields are left in
place they may affect subsequent cleaning or deployment steps.

Change-Id: I34591440ab993a80a0cc88be6e10e33f1ae4a660
Story: 2003817
Task: 36563
This commit is contained in:
Mark Goddard 2019-09-09 17:26:21 +01:00
parent f7e1739c0c
commit 3e51982252
9 changed files with 199 additions and 14 deletions

View File

@ -916,12 +916,17 @@ class ConductorManager(base_manager.BaseConductorManager):
'state': node.provision_state, 'state': node.provision_state,
'deploy_state': ', '.join(expected_states)}) 'deploy_state': ', '.join(expected_states)})
save_required = False
info = node.driver_internal_info info = node.driver_internal_info
try: try:
skip_current_step = info.pop('skip_current_deploy_step') skip_current_step = info.pop('skip_current_deploy_step')
except KeyError: except KeyError:
skip_current_step = True skip_current_step = True
else: else:
save_required = True
if info.pop('deployment_polling', None) is not None:
save_required = True
if save_required:
node.driver_internal_info = info node.driver_internal_info = info
node.save() node.save()
@ -1227,12 +1232,17 @@ class ConductorManager(base_manager.BaseConductorManager):
'state': node.provision_state, 'state': node.provision_state,
'clean_state': states.CLEANWAIT}) 'clean_state': states.CLEANWAIT})
save_required = False
info = node.driver_internal_info info = node.driver_internal_info
try: try:
skip_current_step = info.pop('skip_current_clean_step') skip_current_step = info.pop('skip_current_clean_step')
except KeyError: except KeyError:
skip_current_step = True skip_current_step = True
else: else:
save_required = True
if info.pop('cleaning_polling', None) is not None:
save_required = True
if save_required:
node.driver_internal_info = info node.driver_internal_info = info
node.save() node.save()
@ -1460,6 +1470,7 @@ class ConductorManager(base_manager.BaseConductorManager):
driver_internal_info['clean_steps'] = None driver_internal_info['clean_steps'] = None
driver_internal_info.pop('clean_step_index', None) driver_internal_info.pop('clean_step_index', None)
driver_internal_info.pop('cleaning_reboot', None) driver_internal_info.pop('cleaning_reboot', None)
driver_internal_info.pop('cleaning_polling', None)
node.driver_internal_info = driver_internal_info node.driver_internal_info = driver_internal_info
node.save() node.save()
try: try:
@ -1543,6 +1554,13 @@ class ConductorManager(base_manager.BaseConductorManager):
node.last_error = last_error node.last_error = last_error
node.clean_step = None node.clean_step = None
info = node.driver_internal_info
# Clear any leftover metadata about cleaning
info.pop('clean_step_index', None)
info.pop('cleaning_reboot', None)
info.pop('cleaning_polling', None)
info.pop('skip_current_clean_step', None)
node.driver_internal_info = info
node.save() node.save()
LOG.info(info_message) LOG.info(info_message)
@ -3958,6 +3976,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
driver_internal_info['deploy_steps'] = None driver_internal_info['deploy_steps'] = None
driver_internal_info.pop('deploy_step_index', None) driver_internal_info.pop('deploy_step_index', None)
driver_internal_info.pop('deployment_reboot', None) driver_internal_info.pop('deployment_reboot', None)
driver_internal_info.pop('deployment_polling', None)
node.driver_internal_info = driver_internal_info node.driver_internal_info = driver_internal_info
node.save() node.save()

View File

@ -408,9 +408,11 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
# Clear clean step, msg should already include current step # Clear clean step, msg should already include current step
node.clean_step = {} node.clean_step = {}
info = node.driver_internal_info info = node.driver_internal_info
# Clear any leftover metadata about cleaning
info.pop('clean_step_index', None) info.pop('clean_step_index', None)
# Clear any leftover metadata about cleaning reboots
info.pop('cleaning_reboot', None) info.pop('cleaning_reboot', None)
info.pop('cleaning_polling', None)
info.pop('skip_current_clean_step', None)
node.driver_internal_info = info node.driver_internal_info = info
# For manual cleaning, the target provision state is MANAGEABLE, whereas # For manual cleaning, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is AVAILABLE. # for automated cleaning, it is AVAILABLE.
@ -466,9 +468,11 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False,
# in node.driver_internal_info for debugging purposes. # in node.driver_internal_info for debugging purposes.
node.deploy_step = {} node.deploy_step = {}
info = node.driver_internal_info info = node.driver_internal_info
# Clear any leftover metadata about deployment.
info.pop('deploy_step_index', None) info.pop('deploy_step_index', None)
# Clear any leftover metadata about deployment reboots
info.pop('deployment_reboot', None) info.pop('deployment_reboot', None)
info.pop('deployment_polling', None)
info.pop('skip_current_deploy_step', None)
node.driver_internal_info = info node.driver_internal_info = info
if cleanup_err: if cleanup_err:

View File

@ -377,7 +377,7 @@ class HeartbeatMixin(object):
# NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
# are currently in the core deploy.deploy step. Other deploy steps # are currently in the core deploy.deploy step. Other deploy steps
# may cause the agent to boot, but we should not trigger deployment # may cause the agent to boot, but we should not trigger deployment
# at that point. # at that point if the driver is polling for completion of a step.
if node.provision_state == states.DEPLOYWAIT: if node.provision_state == states.DEPLOYWAIT:
if self.in_core_deploy_step(task): if self.in_core_deploy_step(task):
if not self.deploy_has_started(task): if not self.deploy_has_started(task):
@ -391,7 +391,12 @@ class HeartbeatMixin(object):
else: else:
# The exceptions from RPC are not possible as we using cast # The exceptions from RPC are not possible as we using cast
# here # here
manager_utils.notify_conductor_resume_deploy(task) # Check if the driver is polling for completion of a step,
# via the 'deployment_polling' flag.
polling = node.driver_internal_info.get(
'deployment_polling', False)
if not polling:
manager_utils.notify_conductor_resume_deploy(task)
node.touch_provisioning() node.touch_provisioning()
elif node.provision_state == states.CLEANWAIT: elif node.provision_state == states.CLEANWAIT:
node.touch_provisioning() node.touch_provisioning()
@ -408,7 +413,12 @@ class HeartbeatMixin(object):
manager_utils.notify_conductor_resume_clean(task) manager_utils.notify_conductor_resume_clean(task)
else: else:
msg = _('Node failed to check cleaning progress.') msg = _('Node failed to check cleaning progress.')
self.continue_cleaning(task) # Check if the driver is polling for completion of a step,
# via the 'cleaning_polling' flag.
polling = node.driver_internal_info.get(
'cleaning_polling', False)
if not polling:
self.continue_cleaning(task)
elif (node.provision_state == states.RESCUEWAIT): elif (node.provision_state == states.RESCUEWAIT):
msg = _('Node failed to perform rescue operation.') msg = _('Node failed to perform rescue operation.')
self._finalize_rescue(task) self._finalize_rescue(task)

View File

@ -1475,7 +1475,8 @@ def get_async_step_return_state(node):
return states.CLEANWAIT if node.clean_step else states.DEPLOYWAIT return states.CLEANWAIT if node.clean_step else states.DEPLOYWAIT
def set_async_step_flags(node, reboot=None, skip_current_step=None): def set_async_step_flags(node, reboot=None, skip_current_step=None,
polling=None):
"""Sets appropriate reboot flags in driver_internal_info based on operation """Sets appropriate reboot flags in driver_internal_info based on operation
:param node: an ironic node object. :param node: an ironic node object.
@ -1488,16 +1489,24 @@ def set_async_step_flags(node, reboot=None, skip_current_step=None):
skip_current_deploy_step based on cleaning or deployment operation skip_current_deploy_step based on cleaning or deployment operation
in progress. If it is None, corresponding skip step flag is not set in progress. If it is None, corresponding skip step flag is not set
in node's driver_internal_info. in node's driver_internal_info.
:param polling: Boolean value to set for node's driver_internal_info flag
deployment_polling or cleaning_polling. If it is None, the
corresponding polling flag is not set in the node's
driver_internal_info.
""" """
info = node.driver_internal_info info = node.driver_internal_info
cleaning = {'reboot': 'cleaning_reboot', cleaning = {'reboot': 'cleaning_reboot',
'skip': 'skip_current_clean_step'} 'skip': 'skip_current_clean_step',
'polling': 'cleaning_polling'}
deployment = {'reboot': 'deployment_reboot', deployment = {'reboot': 'deployment_reboot',
'skip': 'skip_current_deploy_step'} 'skip': 'skip_current_deploy_step',
'polling': 'deployment_polling'}
fields = cleaning if node.clean_step else deployment fields = cleaning if node.clean_step else deployment
if reboot is not None: if reboot is not None:
info[fields['reboot']] = reboot info[fields['reboot']] = reboot
if skip_current_step is not None: if skip_current_step is not None:
info[fields['skip']] = skip_current_step info[fields['skip']] = skip_current_step
if polling is not None:
info[fields['polling']] = polling
node.driver_internal_info = info node.driver_internal_info = info
node.save() node.save()

View File

@ -1878,6 +1878,26 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
def test_continue_node_deploy_no_skip_step(self): def test_continue_node_deploy_no_skip_step(self):
self._continue_node_deploy_skip_step(skip=False) self._continue_node_deploy_skip_step(skip=False)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
def test_continue_node_deploy_polling(self, mock_spawn):
# test that deployment_polling flag is cleared
driver_info = {'deploy_steps': self.deploy_steps,
'deploy_step_index': 0,
'deployment_polling': True}
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.DEPLOYWAIT,
target_provision_state=states.MANAGEABLE,
driver_internal_info=driver_info, deploy_step=self.deploy_steps[0])
self._start_service()
self.service.continue_node_deploy(self.context, node.uuid)
self._stop_service()
node.refresh()
self.assertNotIn('deployment_polling', node.driver_internal_info)
mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step,
mock.ANY, 1, mock.ANY)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
autospec=True) autospec=True)
def test_do_next_deploy_step_oob_reboot(self, mock_execute): def test_do_next_deploy_step_oob_reboot(self, mock_execute):
@ -3257,7 +3277,12 @@ class DoNodeCleanAbortTestCase(mgr_utils.ServiceSetUpMixin,
self.context, driver='fake-hardware', self.context, driver='fake-hardware',
provision_state=states.CLEANFAIL, provision_state=states.CLEANFAIL,
target_provision_state=states.AVAILABLE, target_provision_state=states.AVAILABLE,
clean_step={'step': 'foo', 'abortable': True}) clean_step={'step': 'foo', 'abortable': True},
driver_internal_info={
'clean_step_index': 2,
'cleaning_reboot': True,
'cleaning_polling': True,
'skip_current_clean_step': True})
with task_manager.acquire(self.context, node.uuid) as task: with task_manager.acquire(self.context, node.uuid) as task:
self.service._do_node_clean_abort(task, step_name=step_name) self.service._do_node_clean_abort(task, step_name=step_name)
@ -3265,8 +3290,16 @@ class DoNodeCleanAbortTestCase(mgr_utils.ServiceSetUpMixin,
tear_mock.assert_called_once_with(task.driver.deploy, task) tear_mock.assert_called_once_with(task.driver.deploy, task)
if step_name: if step_name:
self.assertIn(step_name, task.node.last_error) self.assertIn(step_name, task.node.last_error)
# assert node's clean_step was cleaned up # assert node's clean_step and metadata was cleaned up
self.assertEqual({}, task.node.clean_step) self.assertEqual({}, task.node.clean_step)
self.assertNotIn('clean_step_index',
task.node.driver_internal_info)
self.assertNotIn('cleaning_reboot',
task.node.driver_internal_info)
self.assertNotIn('cleaning_polling',
task.node.driver_internal_info)
self.assertNotIn('skip_current_clean_step',
task.node.driver_internal_info)
def test__do_node_clean_abort(self): def test__do_node_clean_abort(self):
self._test__do_node_clean_abort(None) self._test__do_node_clean_abort(None)
@ -3543,6 +3576,27 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
def test_continue_node_clean_no_skip_step(self): def test_continue_node_clean_no_skip_step(self):
self._continue_node_clean_skip_step(skip=False) self._continue_node_clean_skip_step(skip=False)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
def test_continue_node_clean_polling(self, mock_spawn):
# test that cleaning_polling flag is cleared
driver_info = {'clean_steps': self.clean_steps,
'clean_step_index': 0,
'cleaning_polling': True}
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.CLEANWAIT,
target_provision_state=states.MANAGEABLE,
driver_internal_info=driver_info, clean_step=self.clean_steps[0])
self._start_service()
self.service.continue_node_clean(self.context, node.uuid)
self._stop_service()
node.refresh()
self.assertNotIn('cleaning_polling', node.driver_internal_info)
mock_spawn.assert_called_with(self.service,
self.service._do_next_clean_step,
mock.ANY, 1)
def _continue_node_clean_abort(self, manual=False): def _continue_node_clean_abort(self, manual=False):
last_clean_step = self.clean_steps[0] last_clean_step = self.clean_steps[0]
last_clean_step['abortable'] = False last_clean_step['abortable'] = False

View File

@ -915,6 +915,11 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.task) self.task)
def test_deploying_error_handler(self): def test_deploying_error_handler(self):
info = self.node.driver_internal_info
info['deploy_step_index'] = 2
info['deployment_reboot'] = True
info['deployment_polling'] = True
info['skip_current_deploy_step'] = True
conductor_utils.deploying_error_handler(self.task, self.logmsg, conductor_utils.deploying_error_handler(self.task, self.logmsg,
self.errmsg) self.errmsg)
@ -924,6 +929,9 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.assertEqual({}, self.node.deploy_step) self.assertEqual({}, self.node.deploy_step)
self.assertNotIn('deploy_step_index', self.node.driver_internal_info) self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
self.assertNotIn('deployment_reboot', self.node.driver_internal_info) self.assertNotIn('deployment_reboot', self.node.driver_internal_info)
self.assertNotIn('deployment_polling', self.node.driver_internal_info)
self.assertNotIn('skip_current_deploy_step',
self.node.driver_internal_info)
self.task.process_event.assert_called_once_with('fail') self.task.process_event.assert_called_once_with('fail')
def _test_deploying_error_handler_cleanup(self, exc, expected_str): def _test_deploying_error_handler_cleanup(self, exc, expected_str):
@ -1049,12 +1057,18 @@ class ErrorHandlersTestCase(tests_base.TestCase):
self.node.clean_step = {'key': 'val'} self.node.clean_step = {'key': 'val'}
self.node.driver_internal_info = { self.node.driver_internal_info = {
'cleaning_reboot': True, 'cleaning_reboot': True,
'cleaning_polling': True,
'skip_current_clean_step': True,
'clean_step_index': 0} 'clean_step_index': 0}
msg = 'error bar' msg = 'error bar'
conductor_utils.cleaning_error_handler(self.task, msg) conductor_utils.cleaning_error_handler(self.task, msg)
self.node.save.assert_called_once_with() self.node.save.assert_called_once_with()
self.assertEqual({}, self.node.clean_step) self.assertEqual({}, self.node.clean_step)
self.assertNotIn('clean_step_index', self.node.driver_internal_info) self.assertNotIn('clean_step_index', self.node.driver_internal_info)
self.assertNotIn('cleaning_reboot', self.node.driver_internal_info)
self.assertNotIn('cleaning_polling', self.node.driver_internal_info)
self.assertNotIn('skip_current_clean_step',
self.node.driver_internal_info)
self.assertEqual(msg, self.node.last_error) self.assertEqual(msg, self.node.last_error)
self.assertTrue(self.node.maintenance) self.assertTrue(self.node.maintenance)
self.assertEqual(msg, self.node.maintenance_reason) self.assertEqual(msg, self.node.maintenance_reason)

View File

@ -175,6 +175,46 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
self.assertFalse(rti_mock.called) self.assertFalse(rti_mock.called)
self.assertTrue(in_resume_deploy_mock.called) self.assertTrue(in_resume_deploy_mock.called)
@mock.patch.object(manager_utils,
'notify_conductor_resume_deploy', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'in_core_deploy_step', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'deploy_has_started', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'deploy_is_done', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'reboot_to_instance', autospec=True)
def test_heartbeat_not_in_core_deploy_step_polling(self, rti_mock, cd_mock,
deploy_is_done_mock,
deploy_started_mock,
in_deploy_mock,
in_resume_deploy_mock):
# Check that heartbeats do not trigger deployment actions when not in
# the deploy.deploy step.
in_deploy_mock.return_value = False
self.node.provision_state = states.DEPLOYWAIT
info = self.node.driver_internal_info
info['deployment_polling'] = True
self.node.driver_internal_info = info
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
self.deploy.heartbeat(task, 'url', '3.2.0')
self.assertFalse(task.shared)
self.assertEqual(
'url', task.node.driver_internal_info['agent_url'])
self.assertEqual(
'3.2.0',
task.node.driver_internal_info['agent_version'])
self.assertFalse(deploy_started_mock.called)
self.assertFalse(deploy_is_done_mock.called)
self.assertFalse(cd_mock.called)
self.assertFalse(rti_mock.called)
self.assertFalse(in_resume_deploy_mock.called)
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
autospec=True) autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin, @mock.patch.object(agent_base_vendor.HeartbeatMixin,
@ -451,6 +491,29 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
mock_touch.assert_called_once_with(mock.ANY) mock_touch.assert_called_once_with(mock.ANY)
mock_continue.assert_called_once_with(mock.ANY, task) mock_continue.assert_called_once_with(mock.ANY, task)
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'continue_cleaning', autospec=True)
def test_heartbeat_continue_cleaning_polling(self, mock_continue,
mock_touch):
info = self.node.driver_internal_info
info['cleaning_polling'] = True
self.node.driver_internal_info = info
self.node.clean_step = {
'priority': 10,
'interface': 'deploy',
'step': 'foo',
'reboot_requested': False
}
self.node.provision_state = states.CLEANWAIT
self.node.save()
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:
self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0')
mock_touch.assert_called_once_with(mock.ANY)
self.assertFalse(mock_continue.called)
@mock.patch.object(manager_utils, 'cleaning_error_handler') @mock.patch.object(manager_utils, 'cleaning_error_handler')
@mock.patch.object(agent_base_vendor.HeartbeatMixin, @mock.patch.object(agent_base_vendor.HeartbeatMixin,
'continue_cleaning', autospec=True) 'continue_cleaning', autospec=True)

View File

@ -2889,15 +2889,17 @@ class AsyncStepTestCase(db_base.DbTestCase):
self.node.save() self.node.save()
self._test_get_async_step_return_state() self._test_get_async_step_return_state()
def test_set_async_step_flags_cleaning_set_both(self): def test_set_async_step_flags_cleaning_set_all(self):
self.node.clean_step = {'step': 'create_configuration', self.node.clean_step = {'step': 'create_configuration',
'interface': 'raid'} 'interface': 'raid'}
self.node.driver_internal_info = {} self.node.driver_internal_info = {}
expected = {'cleaning_reboot': True, expected = {'cleaning_reboot': True,
'cleaning_polling': True,
'skip_current_clean_step': True} 'skip_current_clean_step': True}
self.node.save() self.node.save()
utils.set_async_step_flags(self.node, reboot=True, utils.set_async_step_flags(self.node, reboot=True,
skip_current_step=True) skip_current_step=True,
polling=True)
self.assertEqual(expected, self.node.driver_internal_info) self.assertEqual(expected, self.node.driver_internal_info)
def test_set_async_step_flags_cleaning_set_one(self): def test_set_async_step_flags_cleaning_set_one(self):
@ -2909,15 +2911,17 @@ class AsyncStepTestCase(db_base.DbTestCase):
self.assertEqual({'cleaning_reboot': True}, self.assertEqual({'cleaning_reboot': True},
self.node.driver_internal_info) self.node.driver_internal_info)
def test_set_async_step_flags_deploying_set_both(self): def test_set_async_step_flags_deploying_set_all(self):
self.node.deploy_step = {'step': 'create_configuration', self.node.deploy_step = {'step': 'create_configuration',
'interface': 'raid'} 'interface': 'raid'}
self.node.driver_internal_info = {} self.node.driver_internal_info = {}
expected = {'deployment_reboot': True, expected = {'deployment_reboot': True,
'deployment_polling': True,
'skip_current_deploy_step': True} 'skip_current_deploy_step': True}
self.node.save() self.node.save()
utils.set_async_step_flags(self.node, reboot=True, utils.set_async_step_flags(self.node, reboot=True,
skip_current_step=True) skip_current_step=True,
polling=True)
self.assertEqual(expected, self.node.driver_internal_info) self.assertEqual(expected, self.node.driver_internal_info)
def test_set_async_step_flags_deploying_set_one(self): def test_set_async_step_flags_deploying_set_one(self):

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an issue with asynchronous deploy steps that poll for completion
where the step could fail to execute. The ``deployment_polling`` and
``cleaning_polling`` flags may be used by driver implementations to signal
that the driver is polling for completion. See `story 2003817
<https://storyboard.openstack.org/#!/story/2003817>`__ for details.