Remove locks before RPC bus is started
A partner performing some testing identified a case where, if a request
is sent to the Ironic Conductor while it is in the process of starting
and the request is accepted for processing, the operation can later
fail with errors such as a NodeNotLocked exception. Notably, they were
able to reproduce this by requesting the attachment or detachment of
a VIF at the same time as restarting the conductor.
In part, this condition arises because, while the conductor is being
restarted, the conductors table still includes the restarting conductor,
and the webserver may not yet have had a chance to observe that the
conductor is in the process of restarting, since the hash ring is
still valid.
In short: incoming RPC requests can arrive during the initialization
window, and as such we should not remove locks while the conductor may
already be receiving work.
As such, we've added a ``prepare_host`` method which initializes
the conductor database connection and removes the stale locks.
Under normal operating conditions, the database client is reused.
rhbz# 1847305
Change-Id: I8e759168f1dc81cdcf430f3e33be990731595ec3
(cherry picked from commit b8e4aba1ec)
This commit is contained in:
parent
a53bba694b
commit
062872bce0
|
@ -48,6 +48,8 @@ class RPCService(service.Service):
|
||||||
admin_context = context.get_admin_context()
|
admin_context = context.get_admin_context()
|
||||||
|
|
||||||
serializer = objects_base.IronicObjectSerializer(is_server=True)
|
serializer = objects_base.IronicObjectSerializer(is_server=True)
|
||||||
|
# Perform preparatory actions before starting the RPC listener
|
||||||
|
self.manager.prepare_host()
|
||||||
if CONF.rpc_transport == 'json-rpc':
|
if CONF.rpc_transport == 'json-rpc':
|
||||||
self.rpcserver = json_rpc.WSGIService(self.manager,
|
self.rpcserver = json_rpc.WSGIService(self.manager,
|
||||||
serializer)
|
serializer)
|
||||||
|
|
|
@ -80,6 +80,33 @@ class BaseConductorManager(object):
|
||||||
self._started = False
|
self._started = False
|
||||||
self._shutdown = None
|
self._shutdown = None
|
||||||
self._zeroconf = None
|
self._zeroconf = None
|
||||||
|
self.dbapi = None
|
||||||
|
|
||||||
|
def prepare_host(self):
|
||||||
|
"""Prepares host for initialization
|
||||||
|
|
||||||
|
Removes existing database entries involved with node locking for nodes
|
||||||
|
in a transitory power state and nodes that are presently locked by
|
||||||
|
the hostname of this conductor.
|
||||||
|
|
||||||
|
Under normal operation, this is also when the initial database
|
||||||
|
connectivity is established for the conductor's normal operation.
|
||||||
|
"""
|
||||||
|
# NOTE(TheJulia) We need to clear locks early on in the process
|
||||||
|
# of starting where the database shows we still hold them.
|
||||||
|
# This must be done before we re-register our existence in the
|
||||||
|
# conductors table and begin accepting new requests via RPC as
|
||||||
|
# if we do not then we may squash our *new* locks from new work.
|
||||||
|
|
||||||
|
if not self.dbapi:
|
||||||
|
LOG.debug('Initializing database client for %s.', self.host)
|
||||||
|
self.dbapi = dbapi.get_instance()
|
||||||
|
LOG.debug('Removing stale locks from the database matching '
|
||||||
|
'this conductor\'s hostname: %s', self.host)
|
||||||
|
# clear all target_power_state with locks by this conductor
|
||||||
|
self.dbapi.clear_node_target_power_state(self.host)
|
||||||
|
# clear all locks held by this conductor before registering
|
||||||
|
self.dbapi.clear_node_reservations_for_conductor(self.host)
|
||||||
|
|
||||||
def init_host(self, admin_context=None):
|
def init_host(self, admin_context=None):
|
||||||
"""Initialize the conductor host.
|
"""Initialize the conductor host.
|
||||||
|
@ -97,7 +124,8 @@ class BaseConductorManager(object):
|
||||||
'conductor manager'))
|
'conductor manager'))
|
||||||
self._shutdown = False
|
self._shutdown = False
|
||||||
|
|
||||||
self.dbapi = dbapi.get_instance()
|
if not self.dbapi:
|
||||||
|
self.dbapi = dbapi.get_instance()
|
||||||
|
|
||||||
self._keepalive_evt = threading.Event()
|
self._keepalive_evt = threading.Event()
|
||||||
"""Event for the keepalive thread."""
|
"""Event for the keepalive thread."""
|
||||||
|
@ -141,10 +169,6 @@ class BaseConductorManager(object):
|
||||||
|
|
||||||
self._collect_periodic_tasks(admin_context)
|
self._collect_periodic_tasks(admin_context)
|
||||||
|
|
||||||
# clear all target_power_state with locks by this conductor
|
|
||||||
self.dbapi.clear_node_target_power_state(self.host)
|
|
||||||
# clear all locks held by this conductor before registering
|
|
||||||
self.dbapi.clear_node_reservations_for_conductor(self.host)
|
|
||||||
try:
|
try:
|
||||||
# Register this conductor with the cluster
|
# Register this conductor with the cluster
|
||||||
self.conductor = objects.Conductor.register(
|
self.conductor = objects.Conductor.register(
|
||||||
|
|
|
@ -35,13 +35,14 @@ class TestRPCService(base.TestCase):
|
||||||
mgr_class = "ConductorManager"
|
mgr_class = "ConductorManager"
|
||||||
self.rpc_svc = rpc_service.RPCService(host, mgr_module, mgr_class)
|
self.rpc_svc = rpc_service.RPCService(host, mgr_module, mgr_class)
|
||||||
|
|
||||||
|
@mock.patch.object(manager.ConductorManager, 'prepare_host', autospec=True)
|
||||||
@mock.patch.object(oslo_messaging, 'Target', autospec=True)
|
@mock.patch.object(oslo_messaging, 'Target', autospec=True)
|
||||||
@mock.patch.object(objects_base, 'IronicObjectSerializer', autospec=True)
|
@mock.patch.object(objects_base, 'IronicObjectSerializer', autospec=True)
|
||||||
@mock.patch.object(rpc, 'get_server', autospec=True)
|
@mock.patch.object(rpc, 'get_server', autospec=True)
|
||||||
@mock.patch.object(manager.ConductorManager, 'init_host', autospec=True)
|
@mock.patch.object(manager.ConductorManager, 'init_host', autospec=True)
|
||||||
@mock.patch.object(context, 'get_admin_context', autospec=True)
|
@mock.patch.object(context, 'get_admin_context', autospec=True)
|
||||||
def test_start(self, mock_ctx, mock_init_method,
|
def test_start(self, mock_ctx, mock_init_method,
|
||||||
mock_rpc, mock_ios, mock_target):
|
mock_rpc, mock_ios, mock_target, mock_prepare_method):
|
||||||
mock_rpc.return_value.start = mock.MagicMock()
|
mock_rpc.return_value.start = mock.MagicMock()
|
||||||
self.rpc_svc.handle_signal = mock.MagicMock()
|
self.rpc_svc.handle_signal = mock.MagicMock()
|
||||||
self.rpc_svc.start()
|
self.rpc_svc.start()
|
||||||
|
@ -49,5 +50,6 @@ class TestRPCService(base.TestCase):
|
||||||
mock_target.assert_called_once_with(topic=self.rpc_svc.topic,
|
mock_target.assert_called_once_with(topic=self.rpc_svc.topic,
|
||||||
server="fake_host")
|
server="fake_host")
|
||||||
mock_ios.assert_called_once_with(is_server=True)
|
mock_ios.assert_called_once_with(is_server=True)
|
||||||
|
mock_prepare_method.assert_called_once_with(self.rpc_svc.manager)
|
||||||
mock_init_method.assert_called_once_with(self.rpc_svc.manager,
|
mock_init_method.assert_called_once_with(self.rpc_svc.manager,
|
||||||
mock_ctx.return_value)
|
mock_ctx.return_value)
|
||||||
|
|
|
@ -142,6 +142,7 @@ class ServiceSetUpMixin(object):
|
||||||
self.service.init_host()
|
self.service.init_host()
|
||||||
else:
|
else:
|
||||||
with mock.patch.object(periodics, 'PeriodicWorker', autospec=True):
|
with mock.patch.object(periodics, 'PeriodicWorker', autospec=True):
|
||||||
|
self.service.prepare_host()
|
||||||
self.service.init_host()
|
self.service.init_host()
|
||||||
self.addCleanup(self._stop_service)
|
self.addCleanup(self._stop_service)
|
||||||
|
|
||||||
|
|
|
@ -31,6 +31,7 @@ from ironic.conductor import base_manager
|
||||||
from ironic.conductor import manager
|
from ironic.conductor import manager
|
||||||
from ironic.conductor import notification_utils
|
from ironic.conductor import notification_utils
|
||||||
from ironic.conductor import task_manager
|
from ironic.conductor import task_manager
|
||||||
|
from ironic.db import api as dbapi
|
||||||
from ironic.drivers import fake_hardware
|
from ironic.drivers import fake_hardware
|
||||||
from ironic.drivers import generic
|
from ironic.drivers import generic
|
||||||
from ironic.drivers.modules import deploy_utils
|
from ironic.drivers.modules import deploy_utils
|
||||||
|
@ -295,6 +296,15 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||||
mock_zc.close.assert_called_once_with()
|
mock_zc.close.assert_called_once_with()
|
||||||
self.assertIsNone(self.service._zeroconf)
|
self.assertIsNone(self.service._zeroconf)
|
||||||
|
|
||||||
|
@mock.patch.object(dbapi, 'get_instance', autospec=True)
|
||||||
|
def test_start_dbapi_single_call(self, mock_dbapi):
|
||||||
|
self._start_service()
|
||||||
|
# NOTE(TheJulia): This seems like it should only be 1, but
|
||||||
|
# the hash ring initialization pulls its own database connection
|
||||||
|
# instance, which is likely a good thing, thus this is 2 instead of
|
||||||
|
# 3 without reuse of the database connection.
|
||||||
|
self.assertEqual(2, mock_dbapi.call_count)
|
||||||
|
|
||||||
|
|
||||||
class CheckInterfacesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
class CheckInterfacesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
|
||||||
def test__check_enabled_interfaces_success(self):
|
def test__check_enabled_interfaces_success(self):
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Fixes an issue where ``ironic-conductor`` initialization could return a
|
||||||
|
``NodeNotLocked`` error for requests requiring locks when the conductor
|
||||||
|
was starting. This was due to the conductor removing locks after
|
||||||
|
beginning to accept new work. The lock removal has been moved to after
|
||||||
|
the Database connectivity has been established but before the RPC bus
|
||||||
|
is initialized.
|
Loading…
Reference in New Issue