diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index b83d5e5296..15e79dd78b 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -918,12 +918,17 @@ class ConductorManager(base_manager.BaseConductorManager): 'state': node.provision_state, 'deploy_state': ', '.join(expected_states)}) + save_required = False info = node.driver_internal_info try: skip_current_step = info.pop('skip_current_deploy_step') except KeyError: skip_current_step = True else: + save_required = True + if info.pop('deployment_polling', None) is not None: + save_required = True + if save_required: node.driver_internal_info = info node.save() @@ -1229,12 +1234,17 @@ class ConductorManager(base_manager.BaseConductorManager): 'state': node.provision_state, 'clean_state': states.CLEANWAIT}) + save_required = False info = node.driver_internal_info try: skip_current_step = info.pop('skip_current_clean_step') except KeyError: skip_current_step = True else: + save_required = True + if info.pop('cleaning_polling', None) is not None: + save_required = True + if save_required: node.driver_internal_info = info node.save() @@ -1462,6 +1472,7 @@ class ConductorManager(base_manager.BaseConductorManager): driver_internal_info['clean_steps'] = None driver_internal_info.pop('clean_step_index', None) driver_internal_info.pop('cleaning_reboot', None) + driver_internal_info.pop('cleaning_polling', None) node.driver_internal_info = driver_internal_info node.save() try: @@ -1545,6 +1556,13 @@ class ConductorManager(base_manager.BaseConductorManager): node.last_error = last_error node.clean_step = None + info = node.driver_internal_info + # Clear any leftover metadata about cleaning + info.pop('clean_step_index', None) + info.pop('cleaning_reboot', None) + info.pop('cleaning_polling', None) + info.pop('skip_current_clean_step', None) + node.driver_internal_info = info node.save() LOG.info(info_message) @@ -3960,6 +3978,7 @@ def _do_next_deploy_step(task, step_index, conductor_id): driver_internal_info['deploy_steps'] = None driver_internal_info.pop('deploy_step_index', None) driver_internal_info.pop('deployment_reboot', None) + driver_internal_info.pop('deployment_polling', None) node.driver_internal_info = driver_internal_info node.save() diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 51e1bf6ea3..428c07002d 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -408,9 +408,11 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, # Clear clean step, msg should already include current step node.clean_step = {} info = node.driver_internal_info + # Clear any leftover metadata about cleaning info.pop('clean_step_index', None) - # Clear any leftover metadata about cleaning reboots info.pop('cleaning_reboot', None) + info.pop('cleaning_polling', None) + info.pop('skip_current_clean_step', None) node.driver_internal_info = info # For manual cleaning, the target provision state is MANAGEABLE, whereas # for automated cleaning, it is AVAILABLE. @@ -466,9 +468,11 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False, # in node.driver_internal_info for debugging purposes. node.deploy_step = {} info = node.driver_internal_info + # Clear any leftover metadata about deployment. info.pop('deploy_step_index', None) - # Clear any leftover metadata about deployment reboots info.pop('deployment_reboot', None) + info.pop('deployment_polling', None) + info.pop('skip_current_deploy_step', None) node.driver_internal_info = info if cleanup_err: diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index a158daddd0..1f0710023d 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -380,7 +380,7 @@ class HeartbeatMixin(object): # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we # are currently in the core deploy.deploy step. Other deploy steps # may cause the agent to boot, but we should not trigger deployment - # at that point. + # at that point if the driver is polling for completion of a step. if node.provision_state == states.DEPLOYWAIT: if self.in_core_deploy_step(task): if not self.deploy_has_started(task): @@ -394,7 +394,12 @@ class HeartbeatMixin(object): else: # The exceptions from RPC are not possible as we using cast # here - manager_utils.notify_conductor_resume_deploy(task) + # Check if the driver is polling for completion of a step, + # via the 'deployment_polling' flag. + polling = node.driver_internal_info.get( + 'deployment_polling', False) + if not polling: + manager_utils.notify_conductor_resume_deploy(task) node.touch_provisioning() elif node.provision_state == states.CLEANWAIT: node.touch_provisioning() @@ -411,7 +416,12 @@ class HeartbeatMixin(object): manager_utils.notify_conductor_resume_clean(task) else: msg = _('Node failed to check cleaning progress.') - self.continue_cleaning(task) + # Check if the driver is polling for completion of a step, + # via the 'cleaning_polling' flag. + polling = node.driver_internal_info.get( + 'cleaning_polling', False) + if not polling: + self.continue_cleaning(task) elif (node.provision_state == states.RESCUEWAIT): msg = _('Node failed to perform rescue operation.') self._finalize_rescue(task) diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index c96c3eb378..e29b3668cb 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -1475,7 +1475,8 @@ def get_async_step_return_state(node): return states.CLEANWAIT if node.clean_step else states.DEPLOYWAIT -def set_async_step_flags(node, reboot=None, skip_current_step=None): +def set_async_step_flags(node, reboot=None, skip_current_step=None, + polling=None): """Sets appropriate reboot flags in driver_internal_info based on operation :param node: an ironic node object. @@ -1488,16 +1489,24 @@ def set_async_step_flags(node, reboot=None, skip_current_step=None): skip_current_deploy_step based on cleaning or deployment operation in progress. If it is None, corresponding skip step flag is not set in node's driver_internal_info. + :param polling: Boolean value to set for node's driver_internal_info flag + deployment_polling or cleaning_polling. If it is None, the + corresponding polling flag is not set in the node's + driver_internal_info. """ info = node.driver_internal_info cleaning = {'reboot': 'cleaning_reboot', - 'skip': 'skip_current_clean_step'} + 'skip': 'skip_current_clean_step', + 'polling': 'cleaning_polling'} deployment = {'reboot': 'deployment_reboot', - 'skip': 'skip_current_deploy_step'} + 'skip': 'skip_current_deploy_step', + 'polling': 'deployment_polling'} fields = cleaning if node.clean_step else deployment if reboot is not None: info[fields['reboot']] = reboot if skip_current_step is not None: info[fields['skip']] = skip_current_step + if polling is not None: + info[fields['polling']] = polling node.driver_internal_info = info node.save() diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 01b5fcc3d9..ac0b60716e 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -1890,6 +1890,26 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, def test_continue_node_deploy_no_skip_step(self): self._continue_node_deploy_skip_step(skip=False) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def test_continue_node_deploy_polling(self, mock_spawn): + # test that deployment_polling flag is cleared + driver_info = {'deploy_steps': self.deploy_steps, + 'deploy_step_index': 0, + 'deployment_polling': True} + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.DEPLOYWAIT, + target_provision_state=states.MANAGEABLE, + driver_internal_info=driver_info, deploy_step=self.deploy_steps[0]) + self._start_service() + self.service.continue_node_deploy(self.context, node.uuid) + self._stop_service() + node.refresh() + self.assertNotIn('deployment_polling', node.driver_internal_info) + mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step, + mock.ANY, 1, mock.ANY) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', autospec=True) def test_do_next_deploy_step_oob_reboot(self, mock_execute): @@ -3269,7 +3289,12 @@ class DoNodeCleanAbortTestCase(mgr_utils.ServiceSetUpMixin, self.context, driver='fake-hardware', provision_state=states.CLEANFAIL, target_provision_state=states.AVAILABLE, - clean_step={'step': 'foo', 'abortable': True}) + clean_step={'step': 'foo', 'abortable': True}, + driver_internal_info={ + 'clean_step_index': 2, + 'cleaning_reboot': True, + 'cleaning_polling': True, + 'skip_current_clean_step': True}) with task_manager.acquire(self.context, node.uuid) as task: self.service._do_node_clean_abort(task, step_name=step_name) @@ -3277,8 +3302,16 @@ class DoNodeCleanAbortTestCase(mgr_utils.ServiceSetUpMixin, tear_mock.assert_called_once_with(task.driver.deploy, task) if step_name: self.assertIn(step_name, task.node.last_error) - # assert node's clean_step was cleaned up + # assert node's clean_step and metadata was cleaned up self.assertEqual({}, task.node.clean_step) + self.assertNotIn('clean_step_index', + task.node.driver_internal_info) + self.assertNotIn('cleaning_reboot', + task.node.driver_internal_info) + self.assertNotIn('cleaning_polling', + task.node.driver_internal_info) + self.assertNotIn('skip_current_clean_step', + task.node.driver_internal_info) def test__do_node_clean_abort(self): self._test__do_node_clean_abort(None) @@ -3555,6 +3588,27 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): def test_continue_node_clean_no_skip_step(self): self._continue_node_clean_skip_step(skip=False) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def test_continue_node_clean_polling(self, mock_spawn): + # test that cleaning_polling flag is cleared + driver_info = {'clean_steps': self.clean_steps, + 'clean_step_index': 0, + 'cleaning_polling': True} + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.CLEANWAIT, + target_provision_state=states.MANAGEABLE, + driver_internal_info=driver_info, clean_step=self.clean_steps[0]) + self._start_service() + self.service.continue_node_clean(self.context, node.uuid) + self._stop_service() + node.refresh() + self.assertNotIn('cleaning_polling', node.driver_internal_info) + mock_spawn.assert_called_with(self.service, + self.service._do_next_clean_step, + mock.ANY, 1) + def _continue_node_clean_abort(self, manual=False): last_clean_step = self.clean_steps[0] last_clean_step['abortable'] = False diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 7653ed391e..1526b2ec98 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -915,6 +915,11 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase): self.task) def test_deploying_error_handler(self): + info = self.node.driver_internal_info + info['deploy_step_index'] = 2 + info['deployment_reboot'] = True + info['deployment_polling'] = True + info['skip_current_deploy_step'] = True conductor_utils.deploying_error_handler(self.task, self.logmsg, self.errmsg) @@ -924,6 +929,9 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase): self.assertEqual({}, self.node.deploy_step) self.assertNotIn('deploy_step_index', self.node.driver_internal_info) self.assertNotIn('deployment_reboot', self.node.driver_internal_info) + self.assertNotIn('deployment_polling', self.node.driver_internal_info) + self.assertNotIn('skip_current_deploy_step', + self.node.driver_internal_info) self.task.process_event.assert_called_once_with('fail') def _test_deploying_error_handler_cleanup(self, exc, expected_str): @@ -1049,12 +1057,18 @@ class ErrorHandlersTestCase(tests_base.TestCase): self.node.clean_step = {'key': 'val'} self.node.driver_internal_info = { 'cleaning_reboot': True, + 'cleaning_polling': True, + 'skip_current_clean_step': True, 'clean_step_index': 0} msg = 'error bar' conductor_utils.cleaning_error_handler(self.task, msg) self.node.save.assert_called_once_with() self.assertEqual({}, self.node.clean_step) self.assertNotIn('clean_step_index', self.node.driver_internal_info) + self.assertNotIn('cleaning_reboot', self.node.driver_internal_info) + self.assertNotIn('cleaning_polling', self.node.driver_internal_info) + self.assertNotIn('skip_current_clean_step', + self.node.driver_internal_info) self.assertEqual(msg, self.node.last_error) self.assertTrue(self.node.maintenance) self.assertEqual(msg, self.node.maintenance_reason) diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index 2fcf4d2e05..afb14c02f6 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -175,6 +175,46 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): self.assertFalse(rti_mock.called) self.assertTrue(in_resume_deploy_mock.called) + @mock.patch.object(manager_utils, + 'notify_conductor_resume_deploy', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'in_core_deploy_step', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'deploy_has_started', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'deploy_is_done', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', + autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'reboot_to_instance', autospec=True) + def test_heartbeat_not_in_core_deploy_step_polling(self, rti_mock, cd_mock, + deploy_is_done_mock, + deploy_started_mock, + in_deploy_mock, + in_resume_deploy_mock): + # Check that heartbeats do not trigger deployment actions when not in + # the deploy.deploy step. + in_deploy_mock.return_value = False + self.node.provision_state = states.DEPLOYWAIT + info = self.node.driver_internal_info + info['deployment_polling'] = True + self.node.driver_internal_info = info + self.node.save() + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + self.deploy.heartbeat(task, 'url', '3.2.0') + self.assertFalse(task.shared) + self.assertEqual( + 'url', task.node.driver_internal_info['agent_url']) + self.assertEqual( + '3.2.0', + task.node.driver_internal_info['agent_version']) + self.assertFalse(deploy_started_mock.called) + self.assertFalse(deploy_is_done_mock.called) + self.assertFalse(cd_mock.called) + self.assertFalse(rti_mock.called) + self.assertFalse(in_resume_deploy_mock.called) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', autospec=True) @mock.patch.object(agent_base_vendor.HeartbeatMixin, @@ -458,6 +498,29 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): mock_touch.assert_called_once_with(mock.ANY) mock_continue.assert_called_once_with(mock.ANY, task) + @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'continue_cleaning', autospec=True) + def test_heartbeat_continue_cleaning_polling(self, mock_continue, + mock_touch): + info = self.node.driver_internal_info + info['cleaning_polling'] = True + self.node.driver_internal_info = info + self.node.clean_step = { + 'priority': 10, + 'interface': 'deploy', + 'step': 'foo', + 'reboot_requested': False + } + self.node.provision_state = states.CLEANWAIT + self.node.save() + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0') + + mock_touch.assert_called_once_with(mock.ANY) + self.assertFalse(mock_continue.called) + @mock.patch.object(manager_utils, 'cleaning_error_handler') @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_cleaning', autospec=True) diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py index 72e903207f..7f448df304 100644 --- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py +++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py @@ -2889,15 +2889,17 @@ class AsyncStepTestCase(db_base.DbTestCase): self.node.save() self._test_get_async_step_return_state() - def test_set_async_step_flags_cleaning_set_both(self): + def test_set_async_step_flags_cleaning_set_all(self): self.node.clean_step = {'step': 'create_configuration', 'interface': 'raid'} self.node.driver_internal_info = {} expected = {'cleaning_reboot': True, + 'cleaning_polling': True, 'skip_current_clean_step': True} self.node.save() utils.set_async_step_flags(self.node, reboot=True, - skip_current_step=True) + skip_current_step=True, + polling=True) self.assertEqual(expected, self.node.driver_internal_info) def test_set_async_step_flags_cleaning_set_one(self): @@ -2909,15 +2911,17 @@ class AsyncStepTestCase(db_base.DbTestCase): self.assertEqual({'cleaning_reboot': True}, self.node.driver_internal_info) - def test_set_async_step_flags_deploying_set_both(self): + def test_set_async_step_flags_deploying_set_all(self): self.node.deploy_step = {'step': 'create_configuration', 'interface': 'raid'} self.node.driver_internal_info = {} expected = {'deployment_reboot': True, + 'deployment_polling': True, 'skip_current_deploy_step': True} self.node.save() utils.set_async_step_flags(self.node, reboot=True, - skip_current_step=True) + skip_current_step=True, + polling=True) self.assertEqual(expected, self.node.driver_internal_info) def test_set_async_step_flags_deploying_set_one(self): diff --git a/releasenotes/notes/deployment-cleaning-polling-flag-be13a866a7c302d7.yaml b/releasenotes/notes/deployment-cleaning-polling-flag-be13a866a7c302d7.yaml new file mode 100644 index 0000000000..6e30ead3bc --- /dev/null +++ b/releasenotes/notes/deployment-cleaning-polling-flag-be13a866a7c302d7.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes an issue with asynchronous deploy steps that poll for completion + where the step could fail to execute. The ``deployment_polling`` and + ``cleaning_polling`` flags may be used by driver implementations to signal + that the driver is polling for completion. See `story 2003817 + `__ for details.