Clean nodes stuck in CLEANING state when ir-cond restarts

When a conductor managing a node dies abruptly mid cleaing, the
node will get stuck in the CLEANING state.

This also moves _start_service() before creating CLEANING nodes
in tests. Finally, it adds autospec to a few places where the tests
fail in a mysterious way otherwise.

Change-Id: Ia7bce4dff57569707de4fcf3002eac241a5aa85b
Co-Authored-By: Dmitry Tantsur <dtantsur@redhat.com>
Partial-Bug: #1651092
This commit is contained in:
Zhenguo Niu 2016-08-02 20:24:54 +08:00 committed by Dmitry Tantsur
parent 5b75b6e4f2
commit 2921fe685d
4 changed files with 64 additions and 41 deletions

View File

@ -223,21 +223,14 @@ class BaseConductorManager(object):
self._periodic_tasks_worker.add_done_callback( self._periodic_tasks_worker.add_done_callback(
self._on_periodic_tasks_stop) self._on_periodic_tasks_stop)
# NOTE(lucasagomes): If the conductor server dies abruptly self._fail_transient_state(
# mid deployment (OMM Killer, power outage, etc...) we states.DEPLOYING,
# can not resume the deployment even if the conductor _("The deployment can't be resumed by conductor "
# comes back online. Cleaning the reservation of the nodes "%s. Moving to fail state.") % self.host)
# (dbapi.clear_node_reservations_for_conductor) is not enough to self._fail_transient_state(
# unstick it, so let's gracefully fail the deployment so the node states.CLEANING,
# can go through the steps (deleting & cleaning) to make itself _("The cleaning can't be resumed by conductor "
# available again.
filters = {'reserved': False,
'provision_state': states.DEPLOYING}
last_error = (_("The deployment can't be resumed by conductor "
"%s. Moving to fail state.") % self.host) "%s. Moving to fail state.") % self.host)
self._fail_if_in_state(ironic_context.get_admin_context(), filters,
states.DEPLOYING, 'provision_updated_at',
last_error=last_error)
# Start consoles if it set enabled in a greenthread. # Start consoles if it set enabled in a greenthread.
try: try:
@ -259,6 +252,20 @@ class BaseConductorManager(object):
self._started = True self._started = True
def _fail_transient_state(self, state, last_error):
"""Apply "fail" transition to nodes in a transient state.
If the conductor server dies abruptly mid deployment or cleaning
(OMM Killer, power outage, etc...) we can not resume the process even
if the conductor comes back online. Cleaning the reservation of
the nodes (dbapi.clear_node_reservations_for_conductor) is not enough
to unstick it, so let's gracefully fail the process.
"""
filters = {'reserved': False, 'provision_state': state}
self._fail_if_in_state(ironic_context.get_admin_context(), filters,
state, 'provision_updated_at',
last_error=last_error)
def del_host(self, deregister=True): def del_host(self, deregister=True):
# Conductor deregistration fails if called on non-initialized # Conductor deregistration fails if called on non-initialized
# conductor (e.g. when rpc server is unreachable). # conductor (e.g. when rpc server is unreachable).

View File

@ -184,7 +184,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
ht_mock.assert_called_once_with() ht_mock.assert_called_once_with()
@mock.patch.object(base_manager, 'LOG') @mock.patch.object(base_manager, 'LOG')
@mock.patch.object(base_manager.BaseConductorManager, 'del_host') @mock.patch.object(base_manager.BaseConductorManager, 'del_host',
autospec=True)
@mock.patch.object(driver_factory, 'DriverFactory') @mock.patch.object(driver_factory, 'DriverFactory')
def test_starts_with_only_dynamic_drivers(self, df_mock, del_mock, def test_starts_with_only_dynamic_drivers(self, df_mock, del_mock,
log_mock): log_mock):
@ -197,7 +198,8 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertFalse(del_mock.called) self.assertFalse(del_mock.called)
@mock.patch.object(base_manager, 'LOG') @mock.patch.object(base_manager, 'LOG')
@mock.patch.object(base_manager.BaseConductorManager, 'del_host') @mock.patch.object(base_manager.BaseConductorManager, 'del_host',
autospec=True)
@mock.patch.object(driver_factory, 'HardwareTypesFactory') @mock.patch.object(driver_factory, 'HardwareTypesFactory')
def test_starts_with_only_classic_drivers(self, ht_mock, del_mock, def test_starts_with_only_classic_drivers(self, ht_mock, del_mock,
log_mock): log_mock):

View File

@ -2298,13 +2298,13 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@mock.patch('ironic.drivers.modules.fake.FakePower.validate') @mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_clean_automated_disabled(self, mock_validate): def test__do_node_clean_automated_disabled(self, mock_validate):
self.config(automated_clean=False, group='conductor') self.config(automated_clean=False, group='conductor')
self._start_service()
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake', self.context, driver='fake',
provision_state=states.CLEANING, provision_state=states.CLEANING,
target_provision_state=states.AVAILABLE, target_provision_state=states.AVAILABLE,
last_error=None) last_error=None)
self._start_service()
with task_manager.acquire( with task_manager.acquire(
self.context, node.uuid, shared=False) as task: self.context, node.uuid, shared=False) as task:
self.service._do_node_clean(task) self.service._do_node_clean(task)
@ -2416,6 +2416,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
else: else:
tgt_prov_state = states.AVAILABLE tgt_prov_state = states.AVAILABLE
driver_info = {'clean_steps': self.clean_steps} driver_info = {'clean_steps': self.clean_steps}
self._start_service()
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake', self.context, driver='fake',
provision_state=states.CLEANING, provision_state=states.CLEANING,
@ -2424,7 +2426,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
power_state=states.POWER_OFF, power_state=states.POWER_OFF,
driver_internal_info=driver_info) driver_internal_info=driver_info)
self._start_service()
with task_manager.acquire( with task_manager.acquire(
self.context, node.uuid, shared=False) as task: self.context, node.uuid, shared=False) as task:
self.service._do_node_clean(task, clean_steps=clean_steps) self.service._do_node_clean(task, clean_steps=clean_steps)
@ -2462,6 +2463,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
tgt_prov_state = states.AVAILABLE tgt_prov_state = states.AVAILABLE
driver_internal_info['clean_steps'] = self.clean_steps driver_internal_info['clean_steps'] = self.clean_steps
self._start_service()
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake', self.context, driver='fake',
provision_state=states.CLEANING, provision_state=states.CLEANING,
@ -2472,8 +2475,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_execute.return_value = return_state mock_execute.return_value = return_state
expected_first_step = node.driver_internal_info['clean_steps'][0] expected_first_step = node.driver_internal_info['clean_steps'][0]
self._start_service()
with task_manager.acquire( with task_manager.acquire(
self.context, node.uuid, shared=False) as task: self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0) self.service._do_next_clean_step(task, 0)
@ -2500,6 +2501,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
manual=False): manual=False):
# Resume an in-progress cleaning after the first async step # Resume an in-progress cleaning after the first async step
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake', self.context, driver='fake',
provision_state=states.CLEANING, provision_state=states.CLEANING,
@ -2510,8 +2513,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step=self.clean_steps[0]) clean_step=self.clean_steps[0])
mock_execute.return_value = return_state mock_execute.return_value = return_state
self._start_service()
with task_manager.acquire( with task_manager.acquire(
self.context, node.uuid, shared=False) as task: self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, self.next_clean_step_index) self.service._do_next_clean_step(task, self.next_clean_step_index)
@ -2538,6 +2539,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
info = {'clean_steps': self.clean_steps, info = {'clean_steps': self.clean_steps,
'clean_step_index': len(self.clean_steps) - 1} 'clean_step_index': len(self.clean_steps) - 1}
self._start_service()
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake', self.context, driver='fake',
provision_state=states.CLEANING, provision_state=states.CLEANING,
@ -2546,8 +2549,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
driver_internal_info=info, driver_internal_info=info,
clean_step=self.clean_steps[-1]) clean_step=self.clean_steps[-1])
self._start_service()
with task_manager.acquire( with task_manager.acquire(
self.context, node.uuid, shared=False) as task: self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, None) self.service._do_next_clean_step(task, None)
@ -2575,6 +2576,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_power_execute, manual=False): mock_power_execute, manual=False):
# Run all steps from start to finish (all synchronous) # Run all steps from start to finish (all synchronous)
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake', self.context, driver='fake',
provision_state=states.CLEANING, provision_state=states.CLEANING,
@ -2586,8 +2589,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_deploy_execute.return_value = None mock_deploy_execute.return_value = None
mock_power_execute.return_value = None mock_power_execute.return_value = None
self._start_service()
with task_manager.acquire( with task_manager.acquire(
self.context, node.uuid, shared=False) as task: self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0) self.service._do_next_clean_step(task, 0)
@ -2620,6 +2621,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
manual=False): manual=False):
# When a clean step fails, go to CLEANFAIL # When a clean step fails, go to CLEANFAIL
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake', self.context, driver='fake',
provision_state=states.CLEANING, provision_state=states.CLEANING,
@ -2630,8 +2633,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step={}) clean_step={})
mock_execute.side_effect = Exception() mock_execute.side_effect = Exception()
self._start_service()
with task_manager.acquire( with task_manager.acquire(
self.context, node.uuid, shared=False) as task: self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0) self.service._do_next_clean_step(task, 0)
@ -2665,6 +2666,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock, self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock,
manual=True): manual=True):
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake', self.context, driver='fake',
provision_state=states.CLEANING, provision_state=states.CLEANING,
@ -2678,8 +2681,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
power_exec_mock.return_value = None power_exec_mock.return_value = None
tear_mock.side_effect = Exception('boom') tear_mock.side_effect = Exception('boom')
self._start_service()
with task_manager.acquire( with task_manager.acquire(
self.context, node.uuid, shared=False) as task: self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0) self.service._do_next_clean_step(task, 0)
@ -2722,6 +2723,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
{'clean_steps': None}): {'clean_steps': None}):
# Resume where there are no steps, should be a noop # Resume where there are no steps, should be a noop
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake', self.context, driver='fake',
uuid=uuidutils.generate_uuid(), uuid=uuidutils.generate_uuid(),
@ -2731,8 +2734,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
driver_internal_info=info, driver_internal_info=info,
clean_step={}) clean_step={})
self._start_service()
with task_manager.acquire( with task_manager.acquire(
self.context, node.uuid, shared=False) as task: self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, None) self.service._do_next_clean_step(task, None)
@ -2760,6 +2761,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self, deploy_exec_mock, power_exec_mock, manual=False): self, deploy_exec_mock, power_exec_mock, manual=False):
# When a clean step fails, go to CLEANFAIL # When a clean step fails, go to CLEANFAIL
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
self._start_service()
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake', self.context, driver='fake',
provision_state=states.CLEANING, provision_state=states.CLEANING,
@ -2770,8 +2773,6 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
clean_step={}) clean_step={})
deploy_exec_mock.return_value = "foo" deploy_exec_mock.return_value = "foo"
self._start_service()
with task_manager.acquire( with task_manager.acquire(
self.context, node.uuid, shared=False) as task: self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0) self.service._do_next_clean_step(task, 0)
@ -3372,11 +3373,19 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, mgr_utils.CommonMixIn,
self.assertEqual([(nodes[0].uuid, 'fake', 0)], result) self.assertEqual([(nodes[0].uuid, 'fake', 0)], result)
mock_nodeinfo_list.assert_called_once_with( mock_nodeinfo_list.assert_called_once_with(
columns=self.columns, filters=mock.sentinel.filters) columns=self.columns, filters=mock.sentinel.filters)
mock_fail_if_state.assert_called_once_with( expected_calls = [mock.call(mock.ANY, mock.ANY,
mock.ANY, mock.ANY, {'provision_state': 'deploying',
{'provision_state': 'deploying', 'reserved': False}, 'reserved': False},
'deploying', 'provision_updated_at', 'deploying',
last_error=mock.ANY) 'provision_updated_at',
last_error=mock.ANY),
mock.call(mock.ANY, mock.ANY,
{'provision_state': 'cleaning',
'reserved': False},
'cleaning',
'provision_updated_at',
last_error=mock.ANY)]
mock_fail_if_state.assert_has_calls(expected_calls)
@mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list') @mock.patch.object(dbapi.IMPL, 'get_nodeinfo_list')
def test_iter_nodes_shutdown(self, mock_nodeinfo_list): def test_iter_nodes_shutdown(self, mock_nodeinfo_list):

View File

@ -0,0 +1,5 @@
---
fixes:
- When a conductor managing a node dies mid-cleaning the node would get stuck
in the CLEANING state. Now upon conductor startup nodes in the CLEANING state
will be moved to the CLEANFAIL state.