Introduce conductor touch while offline
This adds an `online` argument to the conductor touch methods so that touch can be called with `online=False`. When called periodically this allows the conductor `updated_at` to be within the threshold to avoid locked nodes being failed as orphans by another conductor. This will be used by drain shutdown (and graceful shutdown) so that tasks can complete on existing locked nodes within the shutdown timeout, while the conductor is also removed from the hash ring so new tasks are not started on that conductor. This change introduces the api but the existing behaviour won't change until BaseConductorManager.del_host() no longer calls keepalive_halt(). Change-Id: Iedd62193fac1009137b9ee47a6ef5a9a8576f261
This commit is contained in:
parent
9118440577
commit
3f9151163e
@ -321,13 +321,16 @@ class BaseConductorManager(object):
|
||||
# This is only used in tests currently. Delete it?
|
||||
self._periodic_task_callables = periodic_task_callables
|
||||
|
||||
def keepalive_halt(self):
|
||||
self._keepalive_evt.set()
|
||||
|
||||
def del_host(self, deregister=True, clear_node_reservations=True):
|
||||
# Conductor deregistration fails if called on non-initialized
|
||||
# conductor (e.g. when rpc server is unreachable).
|
||||
if not hasattr(self, 'conductor'):
|
||||
return
|
||||
self._shutdown = True
|
||||
self._keepalive_evt.set()
|
||||
self.keepalive_halt()
|
||||
|
||||
if clear_node_reservations:
|
||||
# clear all locks held by this conductor before deregistering
|
||||
@ -469,7 +472,7 @@ class BaseConductorManager(object):
|
||||
return
|
||||
while not self._keepalive_evt.is_set():
|
||||
try:
|
||||
self.conductor.touch()
|
||||
self.conductor.touch(online=not self._shutdown)
|
||||
except db_exception.DBConnectionError:
|
||||
LOG.warning('Conductor could not connect to database '
|
||||
'while heartbeating.')
|
||||
|
@ -585,10 +585,16 @@ class Connection(object, metaclass=abc.ABCMeta):
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
def touch_conductor(self, hostname):
|
||||
def touch_conductor(self, hostname, online=True):
|
||||
"""Mark a conductor as active by updating its 'updated_at' property.
|
||||
|
||||
Calling periodically with ``online=False`` will result in the conductor
|
||||
appearing unregistered, but recently enough to prevent other conductors
|
||||
failing orphan nodes. This improves the behaviour of graceful and drain
|
||||
shutdown.
|
||||
|
||||
:param hostname: The hostname of this conductor service.
|
||||
:param online: Whether the conductor is online.
|
||||
:raises: ConductorNotFound
|
||||
"""
|
||||
|
||||
|
@ -1392,13 +1392,13 @@ class Connection(api.Connection):
|
||||
raise exception.ConductorNotFound(conductor=hostname)
|
||||
|
||||
@oslo_db_api.retry_on_deadlock
|
||||
def touch_conductor(self, hostname):
|
||||
def touch_conductor(self, hostname, online=True):
|
||||
with _session_for_write() as session:
|
||||
query = sa.update(models.Conductor).where(
|
||||
models.Conductor.hostname == hostname
|
||||
).values({
|
||||
'updated_at': timeutils.utcnow(),
|
||||
'online': True}
|
||||
'online': online}
|
||||
).execution_options(synchronize_session=False)
|
||||
res = session.execute(query)
|
||||
count = res.rowcount
|
||||
|
@ -111,9 +111,9 @@ class Conductor(base.IronicObject, object_base.VersionedObjectDictCompat):
|
||||
# methods can be used in the future to replace current explicit RPC calls.
|
||||
# Implications of calling new remote procedures should be thought through.
|
||||
# @object_base.remotable
|
||||
def touch(self, context=None):
|
||||
def touch(self, context=None, online=True):
|
||||
"""Touch this conductor's DB record, marking it as up-to-date."""
|
||||
self.dbapi.touch_conductor(self.hostname)
|
||||
self.dbapi.touch_conductor(self.hostname, online=online)
|
||||
|
||||
# NOTE(xek): We don't want to enable RPC on this call just yet. Remotable
|
||||
# methods can be used in the future to replace current explicit RPC calls.
|
||||
|
@ -359,7 +359,7 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||
mock_is_sqlite.return_value = False
|
||||
self.service._conductor_service_record_keepalive()
|
||||
self.assertEqual(1, mock_is_sqlite.call_count)
|
||||
mock_touch.assert_called_once_with(self.hostname)
|
||||
mock_touch.assert_called_once_with(self.hostname, online=True)
|
||||
|
||||
@mock.patch.object(common_utils, 'is_ironic_using_sqlite', autospec=True)
|
||||
def test__conductor_service_record_keepalive_failed_db_conn(
|
||||
|
@ -156,6 +156,21 @@ class DbConductorTestCase(base.DbTestCase):
|
||||
c = self.dbapi.get_conductor(c.hostname)
|
||||
self.assertEqual(test_time, timeutils.normalize_time(c.updated_at))
|
||||
|
||||
@mock.patch.object(timeutils, 'utcnow', autospec=True)
|
||||
def test_touch_conductor_offline(self, mock_utcnow):
|
||||
test_time = datetime.datetime(2000, 1, 1, 0, 0)
|
||||
mock_utcnow.return_value = test_time
|
||||
c = self._create_test_cdr()
|
||||
self.assertEqual(test_time, timeutils.normalize_time(c.updated_at))
|
||||
|
||||
test_time = datetime.datetime(2000, 1, 1, 0, 1)
|
||||
mock_utcnow.return_value = test_time
|
||||
self.dbapi.touch_conductor(c.hostname, online=False)
|
||||
self.assertRaises(
|
||||
exception.ConductorNotFound,
|
||||
self.dbapi.get_conductor,
|
||||
c.hostname)
|
||||
|
||||
def test_touch_conductor_not_found(self):
|
||||
# A conductor's heartbeat will not create a new record,
|
||||
# it will only update existing ones
|
||||
|
@ -77,7 +77,7 @@ class TestConductorObject(db_base.DbTestCase):
|
||||
c = objects.Conductor.get_by_hostname(self.context, host)
|
||||
c.touch(self.context)
|
||||
mock_get_cdr.assert_called_once_with(host, online=True)
|
||||
mock_touch_cdr.assert_called_once_with(host)
|
||||
mock_touch_cdr.assert_called_once_with(host, online=True)
|
||||
|
||||
def test_refresh(self):
|
||||
host = self.fake_conductor['hostname']
|
||||
|
Loading…
Reference in New Issue
Block a user