From d70db292aeadc99434c002a042da0993150c32fc Mon Sep 17 00:00:00 2001 From: Dao Cong Tien Date: Mon, 30 Jul 2018 17:02:09 +0700 Subject: [PATCH] Node gets stuck in ING state when conductor goes down If a node is in ING state such as INSPECTING, RESCUING, and the conductor goes down, when the conductor backs, the node gets stuck with that ING state. The cases for (DEPLOYING, CLEANING) are already processed as expected, but (INSPECTING, RESCUING, UNRESCUING, VERIFYING, ADOPTING). DELETING cannot be transitioned to 'fail' state. Change-Id: Ie6886ea78fac8bae81675dabf467939deb1c4460 Story: #2003147 Task: #23282 --- ironic/common/states.py | 9 ++++++++ ironic/conductor/base_manager.py | 14 +++++------ .../tests/unit/conductor/test_base_manager.py | 23 +++++++++++++++++++ ironic/tests/unit/conductor/test_manager.py | 22 +++++++++--------- ...-when-conductor-down-3aa41a3abed9daf5.yaml | 7 ++++++ 5 files changed, 56 insertions(+), 19 deletions(-) create mode 100644 releasenotes/notes/node-stuck-when-conductor-down-3aa41a3abed9daf5.yaml diff --git a/ironic/common/states.py b/ironic/common/states.py index cef2459f54..6a4d5013d0 100644 --- a/ironic/common/states.py +++ b/ironic/common/states.py @@ -234,6 +234,15 @@ UNSTABLE_STATES = (DEPLOYING, DEPLOYWAIT, CLEANING, CLEANWAIT, VERIFYING, RESCUEWAIT, UNRESCUING) """States that can be changed without external request.""" +STUCK_STATES_TREATED_AS_FAIL = (DEPLOYING, CLEANING, VERIFYING, INSPECTING, + ADOPTING, RESCUING, UNRESCUING) +"""States that cannot be resumed once a conductor dies. + +If a node gets stuck with one of these states for some reason +(eg. conductor goes down when executing task), node will be moved +to fail state. +""" + ############## # Power states ############## diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index 1be8274b33..5156cbc941 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -179,14 +179,12 @@ class BaseConductorManager(object): self._periodic_tasks_worker.add_done_callback( self._on_periodic_tasks_stop) - self._fail_transient_state( - states.DEPLOYING, - _("The deployment can't be resumed by conductor " - "%s. Moving to fail state.") % self.host) - self._fail_transient_state( - states.CLEANING, - _("The cleaning can't be resumed by conductor " - "%s. Moving to fail state.") % self.host) + for state in states.STUCK_STATES_TREATED_AS_FAIL: + self._fail_transient_state( + state, + _("The %(state)s state can't be resumed by conductor " + "%(host)s. Moving to fail state.") % + {'state': state, 'host': self.host}) # Start consoles if it set enabled in a greenthread. try: diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py index 5de426afcd..0c619d912b 100644 --- a/ironic/tests/unit/conductor/test_base_manager.py +++ b/ironic/tests/unit/conductor/test_base_manager.py @@ -13,6 +13,7 @@ """Test class for Ironic BaseConductorManager.""" import collections +import uuid import eventlet import futurist @@ -24,6 +25,7 @@ from oslo_utils import uuidutils from ironic.common import driver_factory from ironic.common import exception +from ironic.common import states from ironic.conductor import base_manager from ironic.conductor import manager from ironic.conductor import notification_utils @@ -206,6 +208,27 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertRaisesRegex(RuntimeError, 'already running', self.service.init_host) + def test_start_recover_nodes_stuck(self): + state_trans = [ + (states.DEPLOYING, states.DEPLOYFAIL), + (states.CLEANING, states.CLEANFAIL), + (states.VERIFYING, states.ENROLL), + (states.INSPECTING, states.INSPECTFAIL), + (states.ADOPTING, states.ADOPTFAIL), + (states.RESCUING, states.RESCUEFAIL), + (states.UNRESCUING, states.UNRESCUEFAIL), + ] + nodes = [obj_utils.create_test_node(self.context, uuid=uuid.uuid4(), + driver='fake-hardware', + provision_state=state[0]) + for state in state_trans] + + self._start_service() + for node, state in zip(nodes, state_trans): + node.refresh() + self.assertEqual(state[1], node.provision_state, + 'Test failed when recovering from %s' % state[0]) + @mock.patch.object(base_manager, 'LOG') def test_warning_on_low_workers_pool(self, log_mock): CONF.set_override('workers_pool_size', 3, 'conductor') diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 80bd7a4664..03c9982bd6 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -4035,11 +4035,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, @mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue') def test__do_node_rescue_returns_rescuewait(self, mock_rescue): + self._start_service() node = obj_utils.create_test_node(self.context, driver='fake-hardware', provision_state=states.RESCUING, instance_info={'rescue_password': 'password'}) - self._start_service() with task_manager.TaskManager(self.context, node.uuid) as task: mock_rescue.return_value = states.RESCUEWAIT self.service._do_node_rescue(task) @@ -4050,11 +4050,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, @mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue') def test__do_node_rescue_returns_rescue(self, mock_rescue): + self._start_service() node = obj_utils.create_test_node(self.context, driver='fake-hardware', provision_state=states.RESCUING, instance_info={'rescue_password': 'password'}) - self._start_service() with task_manager.TaskManager(self.context, node.uuid) as task: mock_rescue.return_value = states.RESCUE self.service._do_node_rescue(task) @@ -4066,11 +4066,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, @mock.patch.object(manager, 'LOG') @mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue') def test__do_node_rescue_errors(self, mock_rescue, mock_log): + self._start_service() node = obj_utils.create_test_node(self.context, driver='fake-hardware', provision_state=states.RESCUING, instance_info={'rescue_password': 'password'}) - self._start_service() mock_rescue.side_effect = exception.InstanceRescueFailure( 'failed to rescue') with task_manager.TaskManager(self.context, node.uuid) as task: @@ -4086,11 +4086,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, @mock.patch.object(manager, 'LOG') @mock.patch('ironic.drivers.modules.fake.FakeRescue.rescue') def test__do_node_rescue_bad_state(self, mock_rescue, mock_log): + self._start_service() node = obj_utils.create_test_node(self.context, driver='fake-hardware', provision_state=states.RESCUING, instance_info={'rescue_password': 'password'}) - self._start_service() mock_rescue.return_value = states.ACTIVE with task_manager.TaskManager(self.context, node.uuid) as task: self.service._do_node_rescue(task) @@ -4156,11 +4156,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, @mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue') def test__do_node_unrescue(self, mock_unrescue): + self._start_service() node = obj_utils.create_test_node(self.context, driver='fake-hardware', provision_state=states.UNRESCUING, target_provision_state=states.ACTIVE, instance_info={}) - self._start_service() with task_manager.TaskManager(self.context, node.uuid) as task: mock_unrescue.return_value = states.ACTIVE self.service._do_node_unrescue(task) @@ -4171,11 +4171,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, @mock.patch.object(manager, 'LOG') @mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue') def test__do_node_unrescue_ironic_error(self, mock_unrescue, mock_log): + self._start_service() node = obj_utils.create_test_node(self.context, driver='fake-hardware', provision_state=states.UNRESCUING, target_provision_state=states.ACTIVE, instance_info={}) - self._start_service() mock_unrescue.side_effect = exception.InstanceUnrescueFailure( 'Unable to unrescue') with task_manager.TaskManager(self.context, node.uuid) as task: @@ -4190,11 +4190,11 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, @mock.patch.object(manager, 'LOG') @mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue') def test__do_node_unrescue_other_error(self, mock_unrescue, mock_log): + self._start_service() node = obj_utils.create_test_node(self.context, driver='fake-hardware', provision_state=states.UNRESCUING, target_provision_state=states.ACTIVE, instance_info={}) - self._start_service() mock_unrescue.side_effect = RuntimeError('Some failure') with task_manager.TaskManager(self.context, node.uuid) as task: self.assertRaises(RuntimeError, @@ -4207,10 +4207,10 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, @mock.patch('ironic.drivers.modules.fake.FakeRescue.unrescue') def test__do_node_unrescue_bad_state(self, mock_unrescue): + self._start_service() node = obj_utils.create_test_node(self.context, driver='fake-hardware', provision_state=states.UNRESCUING, instance_info={}) - self._start_service() mock_unrescue.return_value = states.RESCUEWAIT with task_manager.TaskManager(self.context, node.uuid) as task: self.service._do_node_unrescue(task) @@ -4272,6 +4272,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): @mock.patch('ironic.drivers.modules.fake.FakePower.validate') def test__do_node_verify(self, mock_validate, mock_get_power_state, mock_notif): + self._start_service() mock_get_power_state.return_value = states.POWER_OFF # Required for exception handling mock_notif.__name__ = 'NodeCorrectedPowerStateNotification' @@ -4282,7 +4283,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): last_error=None, power_state=states.NOSTATE) - self._start_service() with task_manager.acquire( self.context, node['id'], shared=False) as task: self.service._do_node_verify(task) @@ -4311,6 +4311,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): @mock.patch('ironic.drivers.modules.fake.FakePower.validate') def test__do_node_verify_validation_fails(self, mock_validate, mock_get_power_state): + self._start_service() node = obj_utils.create_test_node( self.context, driver='fake-hardware', provision_state=states.VERIFYING, @@ -4320,7 +4321,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_validate.side_effect = RuntimeError("boom") - self._start_service() with task_manager.acquire( self.context, node['id'], shared=False) as task: self.service._do_node_verify(task) @@ -4339,6 +4339,7 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): @mock.patch('ironic.drivers.modules.fake.FakePower.validate') def test__do_node_verify_get_state_fails(self, mock_validate, mock_get_power_state): + self._start_service() node = obj_utils.create_test_node( self.context, driver='fake-hardware', provision_state=states.VERIFYING, @@ -4348,7 +4349,6 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_get_power_state.side_effect = RuntimeError("boom") - self._start_service() with task_manager.acquire( self.context, node['id'], shared=False) as task: self.service._do_node_verify(task) diff --git a/releasenotes/notes/node-stuck-when-conductor-down-3aa41a3abed9daf5.yaml b/releasenotes/notes/node-stuck-when-conductor-down-3aa41a3abed9daf5.yaml new file mode 100644 index 0000000000..6a8eb5d392 --- /dev/null +++ b/releasenotes/notes/node-stuck-when-conductor-down-3aa41a3abed9daf5.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - If a node gets stuck in one of the states ``deploying``, ``cleaning``, + ``verifying``, ``inspecting``, ``adopting``, ``rescuing``, ``unrescuing`` + for some reason (eg. conductor goes down when executing a task), it will + be moved to an appropriate failure state in the next time the conductor + starts.