Merge "Rework logic handling reserved orphaned nodes in the conductor"
commit a8584477e4
@@ -22,6 +22,7 @@ from futurist import rejection
from oslo_db import exception as db_exception
from oslo_log import log
from oslo_utils import excutils
import six

from ironic.common import context as ironic_context
from ironic.common import driver_factory
@@ -440,7 +441,8 @@ class BaseConductorManager(object):
this would process nodes whose provision_updated_at
field value was 60 or more seconds before 'now'.
:param: provision_state: provision_state that the node is in,
for the provisioning activity to have failed.
for the provisioning activity to have failed,
either one string or a set.
:param: sort_key: the nodes are sorted based on this key.
:param: callback_method: the callback method to be invoked in a
spawned thread, for a failed node. This
@@ -457,6 +459,9 @@ class BaseConductorManager(object):
fsm.

"""
if isinstance(provision_state, six.string_types):
provision_state = {provision_state}

node_iter = self.iter_nodes(filters=filters,
sort_key=sort_key,
sort_dir='asc')
@@ -467,7 +472,7 @@
with task_manager.acquire(context, node_uuid,
purpose='node state check') as task:
if (task.node.maintenance or
task.node.provision_state != provision_state):
task.node.provision_state not in provision_state):
continue

target_state = (None if not keep_target_state else
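With the change above, _fail_if_in_state accepts either a single provision state or a set of states: a bare string is normalized to a one-element set, and the node's state is then tested with membership. A minimal standalone sketch of that pattern (illustrative only, not the Ironic code; it needs the six library, which on Python 3 alone could be replaced by isinstance(x, str)):

    import six

    def normalize_states(provision_state):
        """Accept one state name or any collection of names, return a set."""
        if isinstance(provision_state, six.string_types):
            provision_state = {provision_state}
        return set(provision_state)

    # Both calls below match a node whose provision_state is 'deploying'.
    print('deploying' in normalize_states('deploying'))                # True
    print('deploying' in normalize_states({'deploying', 'cleaning'}))  # True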
@@ -1561,15 +1561,24 @@ class ConductorManager(base_manager.BaseConductorManager):
self._fail_if_in_state(context, filters, states.DEPLOYWAIT,
sort_key, callback_method, err_handler)

@METRICS.timer('ConductorManager._check_deploying_status')
@METRICS.timer('ConductorManager._check_orphan_nodes')
@periodics.periodic(spacing=CONF.conductor.check_provision_state_interval)
def _check_deploying_status(self, context):
"""Periodically checks the status of nodes in DEPLOYING state.
def _check_orphan_nodes(self, context):
"""Periodically checks the status of nodes that were taken over.

Periodically checks the nodes in DEPLOYING and the state of the
conductor deploying them. If we find out that a conductor that
was provisioning the node has died we then break release the
node and gracefully mark the deployment as failed.
Periodically checks the nodes that are managed by this conductor but
have a reservation from a conductor that went offline.

1. Nodes in DEPLOYING state move to DEPLOY FAIL.

2. Nodes in CLEANING state move to CLEAN FAIL with maintenance set.

3. Nodes in a transient power state get the power operation aborted.

4. Reservation is removed.

The latter operation happens even for nodes in maintenance mode,
otherwise it's not possible to move them out of maintenance.

:param context: request context.
"""
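The four cases listed in the new docstring reduce to one decision per orphaned node: the stale reservation is always dropped, and provision/power-state cleanup is additionally needed unless the node is in maintenance. A hedged, self-contained restatement of that condition (the state names mirror ironic.common.states; the helper name is illustrative, not an Ironic API):

    # Illustrative constants mirroring ironic.common.states.
    DEPLOYING = 'deploying'
    CLEANING = 'cleaning'

    def needs_state_cleanup(maintenance, provision_state, target_power_state):
        """True if an orphaned node needs more than releasing its lock."""
        if maintenance:
            return False  # maintenance nodes only get the reservation cleared
        return (provision_state in (DEPLOYING, CLEANING)
                or target_power_state is not None)

    assert needs_state_cleanup(False, DEPLOYING, None)          # case 1
    assert needs_state_cleanup(False, 'active', 'power on')     # case 3
    assert not needs_state_cleanup(True, CLEANING, 'power on')  # case 4 only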
@@ -1578,12 +1587,14 @@ class ConductorManager(base_manager.BaseConductorManager):
return

node_iter = self.iter_nodes(
fields=['id', 'reservation'],
filters={'provision_state': states.DEPLOYING,
'maintenance': False,
'reserved_by_any_of': offline_conductors})
fields=['id', 'reservation', 'maintenance', 'provision_state',
'target_power_state'],
filters={'reserved_by_any_of': offline_conductors})

for node_uuid, driver, node_id, conductor_hostname in node_iter:
state_cleanup_required = []

for (node_uuid, driver, node_id, conductor_hostname,
maintenance, provision_state, target_power_state) in node_iter:
# NOTE(lucasagomes): Although very rare, this may lead to a
# race condition. By the time we release the lock the conductor
# that was previously managing the node could be back online.
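As the old and new loop headers show, iter_nodes(fields=...) yields a tuple of (uuid, driver) followed by the requested fields, so widening the field list also widens the tuple the loop unpacks. A toy illustration of the new unpacking (the row data here is made up):

    rows = [('1be26c0b-03f2-4d2e-ae87-c02d7f33c123', 'fake', 42,
             'dead-conductor', False, 'deploying', 'power on')]

    for (node_uuid, driver, node_id, conductor_hostname,
         maintenance, provision_state, target_power_state) in rows:
        print(node_uuid, provision_state, target_power_state)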
@@ -1604,11 +1615,43 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.warning("During checking for deploying state, when "
"releasing the lock of the node %s, it was "
"already unlocked.", node_uuid)
else:
LOG.warning('Forcibly removed reservation of conductor %(old)s'
' on node %(node)s as that conductor went offline',
{'old': conductor_hostname, 'node': node_uuid})

# TODO(dtantsur): clean up all states that are not stable and
# are not one of WAIT states.
if not maintenance and (provision_state in (states.DEPLOYING,
states.CLEANING) or
target_power_state is not None):
LOG.debug('Node %(node)s taken over from conductor %(old)s '
'requires state clean up: provision state is '
'%(state)s, target power state is %(pstate)s',
{'node': node_uuid, 'old': conductor_hostname,
'state': provision_state,
'pstate': target_power_state})
state_cleanup_required.append(node_uuid)

for node_uuid in state_cleanup_required:
with task_manager.acquire(context, node_uuid,
purpose='power state clean up') as task:
if not task.node.maintenance and task.node.target_power_state:
old_state = task.node.target_power_state
task.node.target_power_state = None
task.node.last_error = _('Pending power operation was '
'aborted due to conductor take '
'over')
task.node.save()
LOG.warning('Aborted pending power operation %(op)s '
'on node %(node)s due to conductor take over',
{'op': old_state, 'node': node_uuid})

self._fail_if_in_state(
context, {'id': node_id}, states.DEPLOYING,
context, {'uuid': node_uuid},
{states.DEPLOYING, states.CLEANING},
'provision_updated_at',
callback_method=utils.cleanup_after_timeout,
callback_method=utils.abort_on_conductor_take_over,
err_handler=utils.provisioning_error_handler)

@METRICS.timer('ConductorManager._do_adoption')
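The reworked periodic now runs in two passes: the first pass only releases reservations held by dead conductors and records which nodes need follow-up, and the second pass re-locks each of those nodes, aborts any pending power operation, and drives DEPLOYING/CLEANING nodes to their failure states via _fail_if_in_state. The sketch below restates that two-pass shape with plain dictionaries and a dummy lock; it is a schematic only, not the Ironic implementation:

    from contextlib import contextmanager

    DEPLOYING, CLEANING = 'deploying', 'cleaning'

    @contextmanager
    def fake_lock(node):
        # Stand-in for task_manager.acquire(): yield the node "under a lock".
        yield node

    def check_orphans(nodes, offline_conductors):
        cleanup = []
        # Pass 1: drop stale reservations; never touch node state here.
        for node in nodes:
            if node['reservation'] not in offline_conductors:
                continue
            node['reservation'] = None
            if not node['maintenance'] and (
                    node['provision_state'] in (DEPLOYING, CLEANING)
                    or node['target_power_state'] is not None):
                cleanup.append(node)
        # Pass 2: re-lock each affected node, abort pending power operations,
        # then move deploying/cleaning nodes to a failed state.
        for node in cleanup:
            with fake_lock(node):
                if node['target_power_state'] is not None:
                    node['target_power_state'] = None
                    node['last_error'] = 'Pending power operation was aborted'
                if node['provision_state'] in (DEPLOYING, CLEANING):
                    node['provision_state'] += ' failed'

    nodes = [{'reservation': 'dead-host', 'maintenance': False,
              'provision_state': DEPLOYING, 'target_power_state': 'power on'}]
    check_orphans(nodes, {'dead-host'})
    print(nodes)  # reservation cleared, power op aborted, state now failed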
@@ -377,6 +377,27 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
task.process_event('fail', target_state=target_state)


@task_manager.require_exclusive_lock
def abort_on_conductor_take_over(task):
"""Set node's state when a task was aborted due to conductor take over.

:param task: a TaskManager instance.
"""
msg = _('Operation was aborted due to conductor take over')
# By this time the "fail" even was processed, so we cannot end up in
# CLEANING or CLEAN WAIT, only in CLEAN FAIL.
if task.node.provision_state == states.CLEANFAIL:
cleaning_error_handler(task, msg, set_fail_state=False)
else:
# For aborted deployment (and potentially other operations), just set
# the last_error accordingly.
task.node.last_error = msg
task.node.save()

LOG.warning('Aborted the current operation on node %s due to '
'conductor take over', task.node.uuid)


def rescuing_error_handler(task, msg, set_fail_state=True):
"""Cleanup rescue task after timeout or failure.
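abort_on_conductor_take_over special-cases cleaning: a node already moved to CLEAN FAIL goes through cleaning_error_handler (which also tears down cleaning and sets maintenance), while any other state simply gets last_error recorded. A rough illustration of that branch using dictionary stand-ins for the task/node objects (the names here are illustrative, not the real TaskManager API):

    CLEANFAIL = 'clean failed'

    def abort_on_take_over(node, cleaning_error_handler):
        msg = 'Operation was aborted due to conductor take over'
        if node['provision_state'] == CLEANFAIL:
            # The real handler also tears down cleaning and sets maintenance.
            cleaning_error_handler(node, msg)
        else:
            node['last_error'] = msg

    node = {'provision_state': CLEANFAIL}
    abort_on_take_over(node, lambda n, m: n.update(maintenance=True, last_error=m))
    print(node['maintenance'], node['last_error'])  # True, '... conductor take over'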
@@ -6277,16 +6277,17 @@ class DestroyPortgroupTestCase(mgr_utils.ServiceSetUpMixin,
@mock.patch.object(manager.ConductorManager, '_fail_if_in_state')
@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor')
@mock.patch.object(dbapi.IMPL, 'get_offline_conductors')
class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
db_base.DbTestCase):
class ManagerCheckOrphanNodesTestCase(mgr_utils.ServiceSetUpMixin,
db_base.DbTestCase):
def setUp(self):
super(ManagerCheckDeployingStatusTestCase, self).setUp()
super(ManagerCheckOrphanNodesTestCase, self).setUp()
self._start_service()

self.node = obj_utils.create_test_node(
self.context, id=1, uuid=uuidutils.generate_uuid(),
driver='fake', provision_state=states.DEPLOYING,
target_provision_state=states.DEPLOYDONE,
target_provision_state=states.ACTIVE,
target_power_state=states.POWER_ON,
reservation='fake-conductor')

# create a second node in a different state to test the
@@ -6296,28 +6297,53 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
driver='fake', provision_state=states.AVAILABLE,
target_provision_state=states.NOSTATE)

def test__check_deploying_status(self, mock_off_cond, mock_mapped,
mock_fail_if):
def test__check_orphan_nodes(self, mock_off_cond, mock_mapped,
mock_fail_if):
mock_off_cond.return_value = ['fake-conductor']

self.service._check_deploying_status(self.context)
self.service._check_orphan_nodes(self.context)

self.node.refresh()
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
mock.ANY, {'id': self.node.id}, states.DEPLOYING,
mock.ANY, {'uuid': self.node.uuid},
{states.DEPLOYING, states.CLEANING},
'provision_updated_at',
callback_method=conductor_utils.cleanup_after_timeout,
callback_method=conductor_utils.abort_on_conductor_take_over,
err_handler=conductor_utils.provisioning_error_handler)
# assert node was released
self.assertIsNone(self.node.reservation)
self.assertIsNone(self.node.target_power_state)
self.assertIsNotNone(self.node.last_error)

def test__check_deploying_status_alive(self, mock_off_cond,
mock_mapped, mock_fail_if):
def test__check_orphan_nodes_cleaning(self, mock_off_cond, mock_mapped,
mock_fail_if):
self.node.provision_state = states.CLEANING
self.node.save()
mock_off_cond.return_value = ['fake-conductor']

self.service._check_orphan_nodes(self.context)

self.node.refresh()
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
mock.ANY, {'uuid': self.node.uuid},
{states.DEPLOYING, states.CLEANING},
'provision_updated_at',
callback_method=conductor_utils.abort_on_conductor_take_over,
err_handler=conductor_utils.provisioning_error_handler)
# assert node was released
self.assertIsNone(self.node.reservation)
self.assertIsNone(self.node.target_power_state)
self.assertIsNotNone(self.node.last_error)

def test__check_orphan_nodes_alive(self, mock_off_cond,
mock_mapped, mock_fail_if):
mock_off_cond.return_value = []

self.service._check_deploying_status(self.context)
self.service._check_orphan_nodes(self.context)

self.node.refresh()
mock_off_cond.assert_called_once_with()
@@ -6327,7 +6353,7 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
self.assertIsNotNone(self.node.reservation)

@mock.patch.object(objects.Node, 'release')
def test__check_deploying_status_release_exceptions_skipping(
def test__check_orphan_nodes_release_exceptions_skipping(
self, mock_release, mock_off_cond, mock_mapped, mock_fail_if):
mock_off_cond.return_value = ['fake-conductor']
# Add another node so we can check both exceptions
@@ -6340,7 +6366,7 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
mock_mapped.return_value = True
mock_release.side_effect = [exception.NodeNotFound('not found'),
exception.NodeLocked('locked')]
self.service._check_deploying_status(self.context)
self.service._check_orphan_nodes(self.context)

self.node.refresh()
mock_off_cond.assert_called_once_with()
@@ -6350,22 +6376,52 @@ class ManagerCheckDeployingStatusTestCase(mgr_utils.ServiceSetUpMixin,
# Assert we skipped and didn't try to call _fail_if_in_state
self.assertFalse(mock_fail_if.called)

@mock.patch.object(objects.Node, 'release')
def test__check_deploying_status_release_node_not_locked(
self, mock_release, mock_off_cond, mock_mapped, mock_fail_if):
def test__check_orphan_nodes_release_node_not_locked(
self, mock_off_cond, mock_mapped, mock_fail_if):
# this simulates releasing the node elsewhere
count = [0]

def _fake_release(*args, **kwargs):
self.node.reservation = None
self.node.save()
# raise an exception only the first time release is called
count[0] += 1
if count[0] == 1:
raise exception.NodeNotLocked('not locked')

mock_off_cond.return_value = ['fake-conductor']
mock_mapped.return_value = True
mock_release.side_effect = exception.NodeNotLocked('not locked')
self.service._check_deploying_status(self.context)
with mock.patch.object(objects.Node, 'release',
side_effect=_fake_release) as mock_release:
self.service._check_orphan_nodes(self.context)
mock_release.assert_called_with(self.context, mock.ANY,
self.node.id)

mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
mock.ANY, {'uuid': self.node.uuid},
{states.DEPLOYING, states.CLEANING},
'provision_updated_at',
callback_method=conductor_utils.abort_on_conductor_take_over,
err_handler=conductor_utils.provisioning_error_handler)

def test__check_orphan_nodes_maintenance(self, mock_off_cond, mock_mapped,
mock_fail_if):
self.node.maintenance = True
self.node.save()
mock_off_cond.return_value = ['fake-conductor']

self.service._check_orphan_nodes(self.context)

self.node.refresh()
mock_off_cond.assert_called_once_with()
mock_mapped.assert_called_once_with(self.node.uuid, 'fake')
mock_fail_if.assert_called_once_with(
mock.ANY, {'id': self.node.id}, states.DEPLOYING,
'provision_updated_at',
callback_method=conductor_utils.cleanup_after_timeout,
err_handler=conductor_utils.provisioning_error_handler)
# assert node was released
self.assertIsNone(self.node.reservation)
# not changing states in maintenance
self.assertFalse(mock_fail_if.called)
self.assertIsNotNone(self.node.target_power_state)


class TestIndirectionApiConductor(db_base.DbTestCase):
@@ -1234,6 +1234,25 @@ class ErrorHandlersTestCase(tests_base.TestCase):
conductor_utils.cleaning_error_handler(self.task, 'foo')
self.assertTrue(log_mock.exception.called)

def test_abort_on_conductor_take_over_cleaning(self):
self.node.maintenance = False
self.node.provision_state = states.CLEANFAIL
conductor_utils.abort_on_conductor_take_over(self.task)
self.assertTrue(self.node.maintenance)
self.assertIn('take over', self.node.maintenance_reason)
self.assertIn('take over', self.node.last_error)
self.task.driver.deploy.tear_down_cleaning.assert_called_once_with(
self.task)
self.node.save.assert_called_once_with()

def test_abort_on_conductor_take_over_deploying(self):
self.node.maintenance = False
self.node.provision_state = states.DEPLOYFAIL
conductor_utils.abort_on_conductor_take_over(self.task)
self.assertFalse(self.node.maintenance)
self.assertIn('take over', self.node.last_error)
self.node.save.assert_called_once_with()

@mock.patch.object(conductor_utils, 'LOG')
def test_spawn_cleaning_error_handler_no_worker(self, log_mock):
exc = exception.NoFreeConductorWorker()
releasenotes/notes/orphan-nodes-389cb6d90c2917ec.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
---
fixes:
  - |
    On node take over, any locks that are left from the old conductor are
    cleared by the new one. Previously it only happened for nodes in
    ``DEPLOYING`` state.
  - |
    On taking over nodes in ``CLEANING`` state, the new conductor moves them
    to ``CLEAN FAIL`` and set maintenance.