From 2921fe685d8f096717f8795494c1032025407fe4 Mon Sep 17 00:00:00 2001
From: Zhenguo Niu
Date: Tue, 2 Aug 2016 20:24:54 +0800
Subject: [PATCH] Clean nodes stuck in CLEANING state when ir-cond restarts

When a conductor managing a node dies abruptly mid cleaning, the node
will get stuck in the CLEANING state.

This also moves _start_service() before creating CLEANING nodes in
tests. Finally, it adds autospec to a few places where the tests fail
in a mysterious way otherwise.

Change-Id: Ia7bce4dff57569707de4fcf3002eac241a5aa85b
Co-Authored-By: Dmitry Tantsur
Partial-Bug: #1651092
---
 ironic/conductor/base_manager.py                 | 37 +++++++-----
 .../tests/unit/conductor/test_base_manager.py    |  6 +-
 ironic/tests/unit/conductor/test_manager.py      | 57 +++++++++++--------
 ...-cleaning-on-startup-443823ea4f937965.yaml    |  5 ++
 4 files changed, 64 insertions(+), 41 deletions(-)
 create mode 100644 releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml

diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py
index 34319babf3..a75a5161f0 100644
--- a/ironic/conductor/base_manager.py
+++ b/ironic/conductor/base_manager.py
@@ -223,21 +223,14 @@ class BaseConductorManager(object):
         self._periodic_tasks_worker.add_done_callback(
             self._on_periodic_tasks_stop)
 
-        # NOTE(lucasagomes): If the conductor server dies abruptly
-        # mid deployment (OMM Killer, power outage, etc...) we
-        # can not resume the deployment even if the conductor
-        # comes back online. Cleaning the reservation of the nodes
-        # (dbapi.clear_node_reservations_for_conductor) is not enough to
-        # unstick it, so let's gracefully fail the deployment so the node
-        # can go through the steps (deleting & cleaning) to make itself
-        # available again.
-        filters = {'reserved': False,
-                   'provision_state': states.DEPLOYING}
-        last_error = (_("The deployment can't be resumed by conductor "
-                        "%s. Moving to fail state.") % self.host)
-        self._fail_if_in_state(ironic_context.get_admin_context(), filters,
-                               states.DEPLOYING, 'provision_updated_at',
-                               last_error=last_error)
+        self._fail_transient_state(
+            states.DEPLOYING,
+            _("The deployment can't be resumed by conductor "
+              "%s. Moving to fail state.") % self.host)
+        self._fail_transient_state(
+            states.CLEANING,
+            _("The cleaning can't be resumed by conductor "
+              "%s. Moving to fail state.") % self.host)
 
         # Start consoles if it set enabled in a greenthread.
         try:
@@ -259,6 +252,20 @@ class BaseConductorManager(object):
 
         self._started = True
 
+    def _fail_transient_state(self, state, last_error):
+        """Apply "fail" transition to nodes in a transient state.
+
+        If the conductor server dies abruptly mid deployment or cleaning
+        (OMM Killer, power outage, etc...) we can not resume the process even
+        if the conductor comes back online. Cleaning the reservation of
+        the nodes (dbapi.clear_node_reservations_for_conductor) is not enough
+        to unstick it, so let's gracefully fail the process.
+        """
+        filters = {'reserved': False, 'provision_state': state}
+        self._fail_if_in_state(ironic_context.get_admin_context(), filters,
+                               state, 'provision_updated_at',
+                               last_error=last_error)
+
     def del_host(self, deregister=True):
         # Conductor deregistration fails if called on non-initialized
         # conductor (e.g. when rpc server is unreachable).
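
The following standalone sketch illustrates the recovery pattern that the new
_fail_transient_state() helper applies on startup: unreserved nodes left in a
transient provision state by a dead conductor are moved to a failure state with
an explanatory last_error. The Node class, FAIL_STATE mapping and
fail_stuck_nodes() function below are illustrative stand-ins only, not Ironic
code; the real implementation delegates to self._fail_if_in_state() with a
filter on 'reserved' and 'provision_state', as shown in the hunk above.

    # Simplified, self-contained model of the startup recovery; not Ironic code.
    import dataclasses

    DEPLOYING, CLEANING = 'deploying', 'cleaning'
    DEPLOYFAIL, CLEANFAIL = 'deploy failed', 'clean failed'
    FAIL_STATE = {DEPLOYING: DEPLOYFAIL, CLEANING: CLEANFAIL}  # assumed mapping

    @dataclasses.dataclass
    class Node:  # hypothetical stand-in for an Ironic node record
        uuid: str
        provision_state: str
        reserved: bool = False
        last_error: str = ''

    def fail_stuck_nodes(nodes, state, conductor_host):
        """Fail unreserved nodes left in a transient state by a dead conductor."""
        last_error = ("The %s can't be resumed by conductor %s. "
                      "Moving to fail state." % (state, conductor_host))
        for node in nodes:
            # Reserved nodes are still owned by a live conductor; skip them.
            if node.reserved or node.provision_state != state:
                continue
            node.provision_state = FAIL_STATE[state]
            node.last_error = last_error

    nodes = [Node('a', CLEANING), Node('b', CLEANING, reserved=True)]
    fail_stuck_nodes(nodes, CLEANING, 'conductor-1')
    assert nodes[0].provision_state == CLEANFAIL
    assert nodes[1].provision_state == CLEANING
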
diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py
index ea2ff86aa3..ff62fdb89e 100644
--- a/ironic/tests/unit/conductor/test_base_manager.py
+++ b/ironic/tests/unit/conductor/test_base_manager.py
@@ -184,7 +184,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         ht_mock.assert_called_once_with()
 
     @mock.patch.object(base_manager, 'LOG')
-    @mock.patch.object(base_manager.BaseConductorManager, 'del_host')
+    @mock.patch.object(base_manager.BaseConductorManager, 'del_host',
+                       autospec=True)
     @mock.patch.object(driver_factory, 'DriverFactory')
     def test_starts_with_only_dynamic_drivers(self, df_mock, del_mock,
                                               log_mock):
@@ -197,7 +198,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         self.assertFalse(del_mock.called)
 
     @mock.patch.object(base_manager, 'LOG')
-    @mock.patch.object(base_manager.BaseConductorManager, 'del_host')
+    @mock.patch.object(base_manager.BaseConductorManager, 'del_host',
+                       autospec=True)
    @mock.patch.object(driver_factory, 'HardwareTypesFactory')
     def test_starts_with_only_classic_drivers(self, ht_mock, del_mock,
                                               log_mock):
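
The autospec=True added to the del_host patches above makes the replacement
mock enforce the real method's signature, which is the usual cure when mocked
tests "fail in a mysterious way". A minimal, standalone illustration follows;
the Conductor class is a stand-in with the same del_host signature as above,
not the real BaseConductorManager.

    from unittest import mock

    class Conductor:  # stand-in, not ironic.conductor.base_manager
        def del_host(self, deregister=True):
            pass

    with mock.patch.object(Conductor, 'del_host', autospec=True) as del_mock:
        c = Conductor()
        c.del_host(deregister=False)
        # With autospec the patched attribute still behaves like a method,
        # so the instance is recorded as the first call argument.
        del_mock.assert_called_once_with(c, deregister=False)
        # Calls that do not match the real signature raise TypeError
        # instead of silently succeeding.
        try:
            c.del_host('extra', 'args', 'here')
        except TypeError:
            pass

Without autospec, a bare MagicMock accepts any call signature and is not bound
to the instance, so signature drift or a wrong assertion can surface far from
its cause; with autospec the mismatch is reported at the call site.
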
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 3cfe6a56aa..dcd9f65b02 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -2298,13 +2298,13 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
     @mock.patch('ironic.drivers.modules.fake.FakePower.validate')
     def test__do_node_clean_automated_disabled(self, mock_validate):
         self.config(automated_clean=False, group='conductor')
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
             target_provision_state=states.AVAILABLE,
             last_error=None)
-
-        self._start_service()
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_node_clean(task)
@@ -2416,6 +2416,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         else:
             tgt_prov_state = states.AVAILABLE
         driver_info = {'clean_steps': self.clean_steps}
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2424,7 +2426,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             power_state=states.POWER_OFF,
             driver_internal_info=driver_info)
 
-        self._start_service()
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_node_clean(task, clean_steps=clean_steps)
@@ -2462,6 +2463,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             tgt_prov_state = states.AVAILABLE
 
         driver_internal_info['clean_steps'] = self.clean_steps
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2472,8 +2475,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             power_state=states.POWER_OFF,
             driver_internal_info=driver_internal_info)
         mock_execute.return_value = return_state
         expected_first_step = node.driver_internal_info['clean_steps'][0]
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, 0)
@@ -2500,6 +2501,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
                                               manual=False):
         # Resume an in-progress cleaning after the first async step
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2510,8 +2513,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             clean_step=self.clean_steps[0])
         mock_execute.return_value = return_state
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, self.next_clean_step_index)
@@ -2538,6 +2539,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
         info = {'clean_steps': self.clean_steps,
                 'clean_step_index': len(self.clean_steps) - 1}
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2546,8 +2549,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             driver_internal_info=info,
             clean_step=self.clean_steps[-1])
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, None)
@@ -2575,6 +2576,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
                                            mock_power_execute, manual=False):
         # Run all steps from start to finish (all synchronous)
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2586,8 +2589,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         mock_deploy_execute.return_value = None
         mock_power_execute.return_value = None
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, 0)
@@ -2620,6 +2621,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
                                               manual=False):
         # When a clean step fails, go to CLEANFAIL
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2630,8 +2633,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             clean_step={})
         mock_execute.side_effect = Exception()
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, 0)
@@ -2665,6 +2666,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock,
             manual=True):
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2678,8 +2681,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         power_exec_mock.return_value = None
         tear_mock.side_effect = Exception('boom')
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, 0)
@@ -2722,6 +2723,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
                            {'clean_steps': None}):
         # Resume where there are no steps, should be a noop
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             uuid=uuidutils.generate_uuid(),
@@ -2731,8 +2734,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             driver_internal_info=info,
             clean_step={})
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, None)
@@ -2760,6 +2761,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             self, deploy_exec_mock, power_exec_mock, manual=False):
         # When a clean step fails, go to CLEANFAIL
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2770,8 +2773,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             clean_step={})
         deploy_exec_mock.return_value = "foo"
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, 0)
@@ -3372,11 +3373,19 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, mgr_utils.CommonMixIn,
         self.assertEqual([(nodes[0].uuid, 'fake', 0)], result)
         mock_nodeinfo_list.assert_called_once_with(
             columns=self.columns, filters=mock.sentinel.filters)
-        mock_fail_if_state.assert_called_once_with(
-            mock.ANY, mock.ANY,
-            {'provision_state': 'deploying', 'reserved': False},
-            'deploying', 'provision_updated_at',
-            last_error=mock.ANY)
+        expected_calls = [mock.call(mock.ANY, mock.ANY,
+                                    {'provision_state': 'deploying',
+                                     'reserved': False},
+                                    'deploying',
+                                    'provision_updated_at',
+                                    last_error=mock.ANY),
+                          mock.call(mock.ANY, mock.ANY,
+                                    {'provision_state': 'cleaning',
+                                     'reserved': False},
+                                    'cleaning',
+                                    'provision_updated_at',
+                                    last_error=mock.ANY)]
+        mock_fail_if_state.assert_has_calls(expected_calls)
 
     @mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
     def test_iter_nodes_shutdown(self, mock_nodeinfo_list):
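
The last hunk above switches from assert_called_once_with to assert_has_calls
because startup is now expected to invoke _fail_if_in_state twice, once for
'deploying' and once for 'cleaning'. As a rough standalone reminder of the
semantics (the fail_mock name and its arguments below are illustrative, not
the real call signature):

    from unittest import mock

    fail_mock = mock.Mock()
    fail_mock('deploying', last_error=mock.ANY)
    fail_mock('cleaning', last_error=mock.ANY)

    # assert_has_calls checks that these calls occurred consecutively in this
    # order while tolerating other calls around them, whereas
    # assert_called_once_with requires exactly one call in total.
    fail_mock.assert_has_calls([mock.call('deploying', last_error=mock.ANY),
                                mock.call('cleaning', last_error=mock.ANY)])
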
diff --git a/releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml b/releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml
new file mode 100644
index 0000000000..09d2a016cc
--- /dev/null
+++ b/releasenotes/notes/clean-nodes-stuck-in-cleaning-on-startup-443823ea4f937965.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - When a conductor managing a node dies mid-cleaning, the node would get
+    stuck in the CLEANING state. Now, upon conductor startup, nodes in the
+    CLEANING state are moved to the CLEANFAIL state.