Merge "Don't resume deployment or cleaning on heartbeat when polling"

This commit is contained in:
Zuul 2019-09-23 11:09:33 +00:00 committed by Gerrit Code Review
commit add03f5fe7
9 changed files with 199 additions and 14 deletions

View File

@ -918,12 +918,17 @@ class ConductorManager(base_manager.BaseConductorManager):
'state': node.provision_state,
'deploy_state': ', '.join(expected_states)})
save_required = False
info = node.driver_internal_info
try:
skip_current_step = info.pop('skip_current_deploy_step')
except KeyError:
skip_current_step = True
else:
save_required = True
if info.pop('deployment_polling', None) is not None:
save_required = True
if save_required:
node.driver_internal_info = info
node.save()
@ -1229,12 +1234,17 @@ class ConductorManager(base_manager.BaseConductorManager):
'state': node.provision_state,
'clean_state': states.CLEANWAIT})
save_required = False
info = node.driver_internal_info
try:
skip_current_step = info.pop('skip_current_clean_step')
except KeyError:
skip_current_step = True
else:
save_required = True
if info.pop('cleaning_polling', None) is not None:
save_required = True
if save_required:
node.driver_internal_info = info
node.save()
@ -1462,6 +1472,7 @@ class ConductorManager(base_manager.BaseConductorManager):
driver_internal_info['clean_steps'] = None
driver_internal_info.pop('clean_step_index', None)
driver_internal_info.pop('cleaning_reboot', None)
driver_internal_info.pop('cleaning_polling', None)
node.driver_internal_info = driver_internal_info
node.save()
try:
@ -1545,6 +1556,13 @@ class ConductorManager(base_manager.BaseConductorManager):
node.last_error = last_error
node.clean_step = None
info = node.driver_internal_info
# Clear any leftover metadata about cleaning
info.pop('clean_step_index', None)
info.pop('cleaning_reboot', None)
info.pop('cleaning_polling', None)
info.pop('skip_current_clean_step', None)
node.driver_internal_info = info
node.save()
LOG.info(info_message)
@ -3960,6 +3978,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
driver_internal_info['deploy_steps'] = None
driver_internal_info.pop('deploy_step_index', None)
driver_internal_info.pop('deployment_reboot', None)
driver_internal_info.pop('deployment_polling', None)
node.driver_internal_info = driver_internal_info
node.save()

View File

@ -408,9 +408,11 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
# Clear clean step, msg should already include current step
node.clean_step = {}
info = node.driver_internal_info
# Clear any leftover metadata about cleaning
info.pop('clean_step_index', None)
# Clear any leftover metadata about cleaning reboots
info.pop('cleaning_reboot', None)
info.pop('cleaning_polling', None)
info.pop('skip_current_clean_step', None)
node.driver_internal_info = info
# For manual cleaning, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is AVAILABLE.
@ -466,9 +468,11 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False,
# in node.driver_internal_info for debugging purposes.
node.deploy_step = {}
info = node.driver_internal_info
# Clear any leftover metadata about deployment.
info.pop('deploy_step_index', None)
# Clear any leftover metadata about deployment reboots
info.pop('deployment_reboot', None)
info.pop('deployment_polling', None)
info.pop('skip_current_deploy_step', None)
node.driver_internal_info = info
if cleanup_err:

View File

@ -380,7 +380,7 @@ class HeartbeatMixin(object):
# NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
# are currently in the core deploy.deploy step. Other deploy steps
# may cause the agent to boot, but we should not trigger deployment
# at that point.
# at that point if the driver is polling for completion of a step.
if node.provision_state == states.DEPLOYWAIT:
if self.in_core_deploy_step(task):
if not self.deploy_has_started(task):
@ -394,7 +394,12 @@ class HeartbeatMixin(object):
else:
# The exceptions from RPC are not possible as we using cast
# here
manager_utils.notify_conductor_resume_deploy(task)
# Check if the driver is polling for completion of a step,
# via the 'deployment_polling' flag.
polling = node.driver_internal_info.get(
'deployment_polling', False)
if not polling:
manager_utils.notify_conductor_resume_deploy(task)
node.touch_provisioning()
elif node.provision_state == states.CLEANWAIT:
node.touch_provisioning()
@ -411,7 +416,12 @@ class HeartbeatMixin(object):
manager_utils.notify_conductor_resume_clean(task)
else:
msg = _('Node failed to check cleaning progress.')
self.continue_cleaning(task)
# Check if the driver is polling for completion of a step,
# via the 'cleaning_polling' flag.
polling = node.driver_internal_info.get(
'cleaning_polling', False)
if not polling:
self.continue_cleaning(task)
elif (node.provision_state == states.RESCUEWAIT):
msg = _('Node failed to perform rescue operation.')
self._finalize_rescue(task)

View File

@ -1475,7 +1475,8 @@ def get_async_step_return_state(node):
return states.CLEANWAIT if node.clean_step else states.DEPLOYWAIT
def set_async_step_flags(node, reboot=None, skip_current_step=None):
def set_async_step_flags(node, reboot=None, skip_current_step=None,
polling=None):
"""Sets appropriate reboot flags in driver_internal_info based on operation
:param node: an ironic node object.
@ -1488,16 +1489,24 @@ def set_async_step_flags(node, reboot=None, skip_current_step=None):
skip_current_deploy_step based on cleaning or deployment operation
in progress. If it is None, corresponding skip step flag is not set
in node's driver_internal_info.
:param polling: Boolean value to set for node's driver_internal_info flag
deployment_polling or cleaning_polling. If it is None, the
corresponding polling flag is not set in the node's
driver_internal_info.
"""
info = node.driver_internal_info
cleaning = {'reboot': 'cleaning_reboot',
'skip': 'skip_current_clean_step'}
'skip': 'skip_current_clean_step',
'polling': 'cleaning_polling'}
deployment = {'reboot': 'deployment_reboot',
'skip': 'skip_current_deploy_step'}
'skip': 'skip_current_deploy_step',
'polling': 'deployment_polling'}
fields = cleaning if node.clean_step else deployment
if reboot is not None:
info[fields['reboot']] = reboot
if skip_current_step is not None:
info[fields['skip']] = skip_current_step
if polling is not None:
info[fields['polling']] = polling
node.driver_internal_info = info
node.save()

View File

@ -1890,6 +1890,26 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
def test_continue_node_deploy_no_skip_step(self):
self._continue_node_deploy_skip_step(skip=False)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
def test_continue_node_deploy_polling(self, mock_spawn):
# test that deployment_polling flag is cleared
driver_info = {'deploy_steps': self.deploy_steps,
'deploy_step_index': 0,
'deployment_polling': True}
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.DEPLOYWAIT,
target_provision_state=states.MANAGEABLE,
driver_internal_info=driver_info, deploy_step=self.deploy_steps[0])
self._start_service()
self.service.continue_node_deploy(self.context, node.uuid)
self._stop_service()
node.refresh()
self.assertNotIn('deployment_polling', node.driver_internal_info)
mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step,
mock.ANY, 1, mock.ANY)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
autospec=True)
def test_do_next_deploy_step_oob_reboot(self, mock_execute):
@ -3269,7 +3289,12 @@ class DoNodeCleanAbortTestCase(mgr_utils.ServiceSetUpMixin,
self.context, driver='fake-hardware',
provision_state=states.CLEANFAIL,
target_provision_state=states.AVAILABLE,
clean_step={'step': 'foo', 'abortable': True})
clean_step={'step': 'foo', 'abortable': True},
driver_internal_info={
'clean_step_index': 2,
'cleaning_reboot': True,
'cleaning_polling': True,
'skip_current_clean_step': True})
with task_manager.acquire(self.context, node.uuid) as task:
self.service._do_node_clean_abort(task, step_name=step_name)
@ -3277,8 +3302,16 @@ class DoNodeCleanAbortTestCase(mgr_utils.ServiceSetUpMixin,
tear_mock.assert_called_once_with(task.driver.deploy, task)
if step_name:
self.assertIn(step_name, task.node.last_error)
# assert node's clean_step was cleaned up
# assert node's clean_step and metadata was cleaned up
self.assertEqual({}, task.node.clean_step)
self.assertNotIn('clean_step_index',
task.node.driver_internal_info)
self.assertNotIn('cleaning_reboot',
task.node.driver_internal_info)
self.assertNotIn('cleaning_polling',
task.node.driver_internal_info)
self.assertNotIn('skip_current_clean_step',
task.node.driver_internal_info)
def test__do_node_clean_abort(self):
self._test__do_node_clean_abort(None)
@ -3555,6 +3588,27 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
def test_continue_node_clean_no_skip_step(self):
self._continue_node_clean_skip_step(skip=False)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
def test_continue_node_clean_polling(self, mock_spawn):
# test that cleaning_polling flag is cleared
driver_info = {'clean_steps': self.clean_steps,
'clean_step_index': 0,
'cleaning_polling': True}
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.CLEANWAIT,
target_provision_state=states.MANAGEABLE,
driver_internal_info=driver_info, clean_step=self.clean_steps[0])
self._start_service()
self.service.continue_node_clean(self.context, node.uuid)
self._stop_service()
node.refresh()
self.assertNotIn('cleaning_polling', node.driver_internal_info)
mock_spawn.assert_called_with(self.service,
self.service._do_next_clean_step,
mock.ANY, 1)
def _continue_node_clean_abort(self, manual=False):
last_clean_step = self.clean_steps[0]
last_clean_step['abortable'] = False

View File

@ -915,6 +915,11 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.task)
def test_deploying_error_handler(self):
info = self.node.driver_internal_info
info['deploy_step_index'] = 2
info['deployment_reboot'] = True
info['deployment_polling'] = True
info['skip_current_deploy_step'] = True
conductor_utils.deploying_error_handler(self.task, self.logmsg,
self.errmsg)
@ -924,6 +929,9 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.assertEqual({}, self.node.deploy_step)
self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
self.assertNotIn('deployment_reboot', self.node.driver_internal_info)
self.assertNotIn('deployment_polling', self.node.driver_internal_info)
self.assertNotIn('skip_current_deploy_step',
self.node.driver_internal_info)
self.task.process_event.assert_called_once_with('fail')
def _test_deploying_error_handler_cleanup(self, exc, expected_str):
@ -1049,12 +1057,18 @@ class ErrorHandlersTestCase(tests_base.TestCase):
self.node.clean_step = {'key': 'val'}
self.node.driver_internal_info = {
'cleaning_reboot': True,
'cleaning_polling': True,
'skip_current_clean_step': True,
'clean_step_index': 0}
msg = 'error bar'
conductor_utils.cleaning_error_handler(self.task, msg)
self.node.save.assert_called_once_with()
self.assertEqual({}, self.node.clean_step)
self.assertNotIn('clean_step_index', self.node.driver_internal_info)
self.assertNotIn('cleaning_reboot', self.node.driver_internal_info)
self.assertNotIn('cleaning_polling', self.node.driver_internal_info)
self.assertNotIn('skip_current_clean_step',
self.node.driver_internal_info)
self.assertEqual(msg, self.node.last_error)
self.assertTrue(self.node.maintenance)
self.assertEqual(msg, self.node.maintenance_reason)

View File

@ -175,6 +175,46 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
self.assertFalse(rti_mock.called)
self.assertTrue(in_resume_deploy_mock.called)
@mock.patch.object(manager_utils,
'notify_conductor_resume_deploy', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'in_core_deploy_step', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'deploy_has_started', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'deploy_is_done', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'reboot_to_instance', autospec=True)
def test_heartbeat_not_in_core_deploy_step_polling(self, rti_mock, cd_mock,
deploy_is_done_mock,
deploy_started_mock,
in_deploy_mock,
in_resume_deploy_mock):
# Check that heartbeats do not trigger deployment actions when not in
# the deploy.deploy step.
in_deploy_mock.return_value = False
self.node.provision_state = states.DEPLOYWAIT
info = self.node.driver_internal_info
info['deployment_polling'] = True
self.node.driver_internal_info = info
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
self.deploy.heartbeat(task, 'url', '3.2.0')
self.assertFalse(task.shared)
self.assertEqual(
'url', task.node.driver_internal_info['agent_url'])
self.assertEqual(
'3.2.0',
task.node.driver_internal_info['agent_version'])
self.assertFalse(deploy_started_mock.called)
self.assertFalse(deploy_is_done_mock.called)
self.assertFalse(cd_mock.called)
self.assertFalse(rti_mock.called)
self.assertFalse(in_resume_deploy_mock.called)
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
@ -458,6 +498,29 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
mock_touch.assert_called_once_with(mock.ANY)
mock_continue.assert_called_once_with(mock.ANY, task)
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'continue_cleaning', autospec=True)
def test_heartbeat_continue_cleaning_polling(self, mock_continue,
mock_touch):
info = self.node.driver_internal_info
info['cleaning_polling'] = True
self.node.driver_internal_info = info
self.node.clean_step = {
'priority': 10,
'interface': 'deploy',
'step': 'foo',
'reboot_requested': False
}
self.node.provision_state = states.CLEANWAIT
self.node.save()
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:
self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0')
mock_touch.assert_called_once_with(mock.ANY)
self.assertFalse(mock_continue.called)
@mock.patch.object(manager_utils, 'cleaning_error_handler')
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'continue_cleaning', autospec=True)

View File

@ -2889,15 +2889,17 @@ class AsyncStepTestCase(db_base.DbTestCase):
self.node.save()
self._test_get_async_step_return_state()
def test_set_async_step_flags_cleaning_set_both(self):
def test_set_async_step_flags_cleaning_set_all(self):
self.node.clean_step = {'step': 'create_configuration',
'interface': 'raid'}
self.node.driver_internal_info = {}
expected = {'cleaning_reboot': True,
'cleaning_polling': True,
'skip_current_clean_step': True}
self.node.save()
utils.set_async_step_flags(self.node, reboot=True,
skip_current_step=True)
skip_current_step=True,
polling=True)
self.assertEqual(expected, self.node.driver_internal_info)
def test_set_async_step_flags_cleaning_set_one(self):
@ -2909,15 +2911,17 @@ class AsyncStepTestCase(db_base.DbTestCase):
self.assertEqual({'cleaning_reboot': True},
self.node.driver_internal_info)
def test_set_async_step_flags_deploying_set_both(self):
def test_set_async_step_flags_deploying_set_all(self):
self.node.deploy_step = {'step': 'create_configuration',
'interface': 'raid'}
self.node.driver_internal_info = {}
expected = {'deployment_reboot': True,
'deployment_polling': True,
'skip_current_deploy_step': True}
self.node.save()
utils.set_async_step_flags(self.node, reboot=True,
skip_current_step=True)
skip_current_step=True,
polling=True)
self.assertEqual(expected, self.node.driver_internal_info)
def test_set_async_step_flags_deploying_set_one(self):

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an issue with asynchronous deploy steps that poll for completion
where the step could fail to execute. The ``deployment_polling`` and
``cleaning_polling`` flags may be used by driver implementations to signal
that the driver is polling for completion. See `story 2003817
<https://storyboard.openstack.org/#!/story/2003817>`__ for details.