Clean nodes stuck in CLEANING state when ir-cond restarts
When a conductor managing a node dies abruptly mid-cleaning, the node gets
stuck in the CLEANING state. On start-up, the conductor now moves such nodes
to the CLEANFAIL state, the same way it already fails nodes stuck in
DEPLOYING.

This change also moves _start_service() before the creation of CLEANING nodes
in the tests, and adds autospec in a few places where the tests otherwise fail
in mysterious ways.

Change-Id: Ia7bce4dff57569707de4fcf3002eac241a5aa85b
Co-Authored-By: Dmitry Tantsur <dtantsur@redhat.com>
Partial-Bug: #1651092
parent 5b75b6e4f2
commit 2921fe685d
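In short, the conductor start-up path now fails any unreserved node left in a
transient provision state (CLEANING as well as DEPLOYING) instead of only
handling DEPLOYING. The sketch below condenses that idea into standalone
Python; find_nodes and set_fail_state are hypothetical stand-ins for Ironic's
dbapi / _fail_if_in_state machinery, not real Ironic APIs.

    # Hypothetical sketch of the start-up recovery; the real change lives in
    # BaseConductorManager._fail_transient_state (see the diff below).

    TRANSIENT_STATES = {
        'deploying': "The deployment can't be resumed by conductor %s. "
                     "Moving to fail state.",
        'cleaning': "The cleaning can't be resumed by conductor %s. "
                    "Moving to fail state.",
    }

    def fail_transient_nodes(host, find_nodes, set_fail_state):
        """Fail nodes stuck in a transient state after a conductor restart.

        find_nodes(filters) and set_fail_state(node, last_error) are assumed
        callables; the real code uses dbapi filters plus _fail_if_in_state.
        """
        for state, message in TRANSIENT_STATES.items():
            # Only unreserved nodes: a node still reserved by a live conductor
            # is being worked on and must not be touched.
            for node in find_nodes({'reserved': False,
                                    'provision_state': state}):
                set_fail_state(node, message % host)

The real implementation passes 'provision_updated_at' as the sort key and
records the message as the node's last_error, as shown in the
_fail_transient_state hunk below.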
@@ -223,21 +223,14 @@ class BaseConductorManager(object):
         self._periodic_tasks_worker.add_done_callback(
             self._on_periodic_tasks_stop)
 
-        # NOTE(lucasagomes): If the conductor server dies abruptly
-        # mid deployment (OMM Killer, power outage, etc...) we
-        # can not resume the deployment even if the conductor
-        # comes back online. Cleaning the reservation of the nodes
-        # (dbapi.clear_node_reservations_for_conductor) is not enough to
-        # unstick it, so let's gracefully fail the deployment so the node
-        # can go through the steps (deleting & cleaning) to make itself
-        # available again.
-        filters = {'reserved': False,
-                   'provision_state': states.DEPLOYING}
-        last_error = (_("The deployment can't be resumed by conductor "
-                        "%s. Moving to fail state.") % self.host)
-        self._fail_if_in_state(ironic_context.get_admin_context(), filters,
-                               states.DEPLOYING, 'provision_updated_at',
-                               last_error=last_error)
+        self._fail_transient_state(
+            states.DEPLOYING,
+            _("The deployment can't be resumed by conductor "
+              "%s. Moving to fail state.") % self.host)
+        self._fail_transient_state(
+            states.CLEANING,
+            _("The cleaning can't be resumed by conductor "
+              "%s. Moving to fail state.") % self.host)
 
         # Start consoles if it set enabled in a greenthread.
         try:
@@ -259,6 +252,20 @@ class BaseConductorManager(object):
 
         self._started = True
 
+    def _fail_transient_state(self, state, last_error):
+        """Apply "fail" transition to nodes in a transient state.
+
+        If the conductor server dies abruptly mid deployment or cleaning
+        (OMM Killer, power outage, etc...) we can not resume the process even
+        if the conductor comes back online. Cleaning the reservation of
+        the nodes (dbapi.clear_node_reservations_for_conductor) is not enough
+        to unstick it, so let's gracefully fail the process.
+        """
+        filters = {'reserved': False, 'provision_state': state}
+        self._fail_if_in_state(ironic_context.get_admin_context(), filters,
+                               state, 'provision_updated_at',
+                               last_error=last_error)
+
     def del_host(self, deregister=True):
         # Conductor deregistration fails if called on non-initialized
         # conductor (e.g. when rpc server is unreachable).
@@ -184,7 +184,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         ht_mock.assert_called_once_with()
 
     @mock.patch.object(base_manager, 'LOG')
-    @mock.patch.object(base_manager.BaseConductorManager, 'del_host')
+    @mock.patch.object(base_manager.BaseConductorManager, 'del_host',
+                       autospec=True)
     @mock.patch.object(driver_factory, 'DriverFactory')
     def test_starts_with_only_dynamic_drivers(self, df_mock, del_mock,
                                               log_mock):
@@ -197,7 +198,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         self.assertFalse(del_mock.called)
 
     @mock.patch.object(base_manager, 'LOG')
-    @mock.patch.object(base_manager.BaseConductorManager, 'del_host')
+    @mock.patch.object(base_manager.BaseConductorManager, 'del_host',
+                       autospec=True)
     @mock.patch.object(driver_factory, 'HardwareTypesFactory')
     def test_starts_with_only_classic_drivers(self, ht_mock, del_mock,
                                               log_mock):
@@ -2298,13 +2298,13 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
     @mock.patch('ironic.drivers.modules.fake.FakePower.validate')
     def test__do_node_clean_automated_disabled(self, mock_validate):
         self.config(automated_clean=False, group='conductor')
 
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
             target_provision_state=states.AVAILABLE,
             last_error=None)
-        self._start_service()
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_node_clean(task)
@@ -2416,6 +2416,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         else:
             tgt_prov_state = states.AVAILABLE
         driver_info = {'clean_steps': self.clean_steps}
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2424,7 +2426,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             power_state=states.POWER_OFF,
             driver_internal_info=driver_info)
 
-        self._start_service()
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_node_clean(task, clean_steps=clean_steps)
@@ -2462,6 +2463,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             tgt_prov_state = states.AVAILABLE
         driver_internal_info['clean_steps'] = self.clean_steps
 
+        self._start_service()
+
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2472,8 +2475,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         mock_execute.return_value = return_state
         expected_first_step = node.driver_internal_info['clean_steps'][0]
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, 0)
@@ -2500,6 +2501,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
                                                   manual=False):
         # Resume an in-progress cleaning after the first async step
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2510,8 +2513,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             clean_step=self.clean_steps[0])
         mock_execute.return_value = return_state
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, self.next_clean_step_index)
@@ -2538,6 +2539,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
         info = {'clean_steps': self.clean_steps,
                 'clean_step_index': len(self.clean_steps) - 1}
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2546,8 +2549,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             driver_internal_info=info,
             clean_step=self.clean_steps[-1])
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, None)
@@ -2575,6 +2576,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
                                       mock_power_execute, manual=False):
         # Run all steps from start to finish (all synchronous)
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2586,8 +2589,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         mock_deploy_execute.return_value = None
         mock_power_execute.return_value = None
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, 0)
|
||||
manual=False):
|
||||
# When a clean step fails, go to CLEANFAIL
|
||||
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
|
||||
|
||||
self._start_service()
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
provision_state=states.CLEANING,
|
||||
@@ -2630,8 +2633,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             clean_step={})
         mock_execute.side_effect = Exception()
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, 0)
@@ -2665,6 +2666,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock,
             manual=True):
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2678,8 +2681,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         power_exec_mock.return_value = None
         tear_mock.side_effect = Exception('boom')
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, 0)
@@ -2722,6 +2723,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
                      {'clean_steps': None}):
             # Resume where there are no steps, should be a noop
             tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+            self._start_service()
             node = obj_utils.create_test_node(
                 self.context, driver='fake',
                 uuid=uuidutils.generate_uuid(),
@@ -2731,8 +2734,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
                 driver_internal_info=info,
                 clean_step={})
 
-            self._start_service()
-
             with task_manager.acquire(
                     self.context, node.uuid, shared=False) as task:
                 self.service._do_next_clean_step(task, None)
@@ -2760,6 +2761,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             self, deploy_exec_mock, power_exec_mock, manual=False):
         # When a clean step fails, go to CLEANFAIL
         tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
+
+        self._start_service()
         node = obj_utils.create_test_node(
             self.context, driver='fake',
             provision_state=states.CLEANING,
@@ -2770,8 +2773,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
             clean_step={})
         deploy_exec_mock.return_value = "foo"
 
-        self._start_service()
-
         with task_manager.acquire(
                 self.context, node.uuid, shared=False) as task:
             self.service._do_next_clean_step(task, 0)
@@ -3372,11 +3373,19 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, mgr_utils.CommonMixIn,
         self.assertEqual([(nodes[0].uuid, 'fake', 0)], result)
         mock_nodeinfo_list.assert_called_once_with(
             columns=self.columns, filters=mock.sentinel.filters)
-        mock_fail_if_state.assert_called_once_with(
-            mock.ANY, mock.ANY,
-            {'provision_state': 'deploying', 'reserved': False},
-            'deploying', 'provision_updated_at',
-            last_error=mock.ANY)
+        expected_calls = [mock.call(mock.ANY, mock.ANY,
+                                    {'provision_state': 'deploying',
+                                     'reserved': False},
+                                    'deploying',
+                                    'provision_updated_at',
+                                    last_error=mock.ANY),
+                          mock.call(mock.ANY, mock.ANY,
+                                    {'provision_state': 'cleaning',
+                                     'reserved': False},
+                                    'cleaning',
+                                    'provision_updated_at',
+                                    last_error=mock.ANY)]
+        mock_fail_if_state.assert_has_calls(expected_calls)
 
     @mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
     def test_iter_nodes_shutdown(self, mock_nodeinfo_list):
@@ -0,0 +1,5 @@
+---
+fixes:
+  - When a conductor managing a node dies mid-cleaning, the node would get
+    stuck in the CLEANING state. Now, upon conductor start-up, nodes in the
+    CLEANING state will be moved to the CLEANFAIL state.