diff --git a/ironic/common/rpc_service.py b/ironic/common/rpc_service.py index 9df03b9179..a385822506 100644 --- a/ironic/common/rpc_service.py +++ b/ironic/common/rpc_service.py @@ -48,6 +48,8 @@ class RPCService(service.Service): admin_context = context.get_admin_context() serializer = objects_base.IronicObjectSerializer(is_server=True) + # Perform preparatory actions before starting the RPC listener + self.manager.prepare_host() if CONF.rpc_transport == 'json-rpc': self.rpcserver = json_rpc.WSGIService(self.manager, serializer) diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index cd63d875c4..d4361dfa97 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -80,6 +80,33 @@ class BaseConductorManager(object): self._started = False self._shutdown = None self._zeroconf = None + self.dbapi = None + + def prepare_host(self): + """Prepares host for initialization + + Removes existing database entries involved with node locking for nodes + in a transitory power state and nodes that are presently locked by + the hostname of this conductor. + + Under normal operation, this is also when the initial database + connectivity is established for the conductor's normal operation. + """ + # NOTE(TheJulia) We need to clear locks early on in the process + # of starting where the database shows we still hold them. + # This must be done before we re-register our existence in the + # conductors table and begin accepting new requests via RPC as + # if we do not then we may squash our *new* locks from new work. + + if not self.dbapi: + LOG.debug('Initializing database client for %s.', self.host) + self.dbapi = dbapi.get_instance() + LOG.debug('Removing stale locks from the database matching ' + 'this conductor\'s hostname: %s', self.host) + # clear all target_power_state with locks by this conductor + self.dbapi.clear_node_target_power_state(self.host) + # clear all locks held by this conductor before registering + self.dbapi.clear_node_reservations_for_conductor(self.host) def init_host(self, admin_context=None): """Initialize the conductor host. @@ -97,7 +124,8 @@ class BaseConductorManager(object): 'conductor manager')) self._shutdown = False - self.dbapi = dbapi.get_instance() + if not self.dbapi: + self.dbapi = dbapi.get_instance() self._keepalive_evt = threading.Event() """Event for the keepalive thread.""" @@ -141,10 +169,6 @@ class BaseConductorManager(object): self._collect_periodic_tasks(admin_context) - # clear all target_power_state with locks by this conductor - self.dbapi.clear_node_target_power_state(self.host) - # clear all locks held by this conductor before registering - self.dbapi.clear_node_reservations_for_conductor(self.host) try: # Register this conductor with the cluster self.conductor = objects.Conductor.register( diff --git a/ironic/tests/unit/common/test_rpc_service.py b/ironic/tests/unit/common/test_rpc_service.py index ec440324c7..f187ff7e07 100644 --- a/ironic/tests/unit/common/test_rpc_service.py +++ b/ironic/tests/unit/common/test_rpc_service.py @@ -36,13 +36,14 @@ class TestRPCService(base.TestCase): mgr_class = "ConductorManager" self.rpc_svc = rpc_service.RPCService(host, mgr_module, mgr_class) + @mock.patch.object(manager.ConductorManager, 'prepare_host', autospec=True) @mock.patch.object(oslo_messaging, 'Target', autospec=True) @mock.patch.object(objects_base, 'IronicObjectSerializer', autospec=True) @mock.patch.object(rpc, 'get_server', autospec=True) @mock.patch.object(manager.ConductorManager, 'init_host', autospec=True) @mock.patch.object(context, 'get_admin_context', autospec=True) def test_start(self, mock_ctx, mock_init_method, - mock_rpc, mock_ios, mock_target): + mock_rpc, mock_ios, mock_target, mock_prepare_method): mock_rpc.return_value.start = mock.MagicMock() self.rpc_svc.handle_signal = mock.MagicMock() self.rpc_svc.start() @@ -50,5 +51,6 @@ class TestRPCService(base.TestCase): mock_target.assert_called_once_with(topic=self.rpc_svc.topic, server="fake_host") mock_ios.assert_called_once_with(is_server=True) + mock_prepare_method.assert_called_once_with(self.rpc_svc.manager) mock_init_method.assert_called_once_with(self.rpc_svc.manager, mock_ctx.return_value) diff --git a/ironic/tests/unit/conductor/mgr_utils.py b/ironic/tests/unit/conductor/mgr_utils.py index e5bc43bcbb..9baadaf42d 100644 --- a/ironic/tests/unit/conductor/mgr_utils.py +++ b/ironic/tests/unit/conductor/mgr_utils.py @@ -143,6 +143,7 @@ class ServiceSetUpMixin(object): self.service.init_host() else: with mock.patch.object(periodics, 'PeriodicWorker', autospec=True): + self.service.prepare_host() self.service.init_host() self.addCleanup(self._stop_service) diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py index 35d8146ed1..7680af153e 100644 --- a/ironic/tests/unit/conductor/test_base_manager.py +++ b/ironic/tests/unit/conductor/test_base_manager.py @@ -31,6 +31,7 @@ from ironic.conductor import base_manager from ironic.conductor import manager from ironic.conductor import notification_utils from ironic.conductor import task_manager +from ironic.db import api as dbapi from ironic.drivers import fake_hardware from ironic.drivers import generic from ironic.drivers.modules import deploy_utils @@ -300,6 +301,15 @@ class StartStopTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_zc.close.assert_called_once_with() self.assertIsNone(self.service._zeroconf) + @mock.patch.object(dbapi, 'get_instance', autospec=True) + def test_start_dbapi_single_call(self, mock_dbapi): + self._start_service() + # NOTE(TheJulia): This seems like it should only be 1, but + # the hash ring initailization pulls it's own database connection + # instance, which is likely a good thing, thus this is 2 instead of + # 3 without reuse of the database connection. + self.assertEqual(2, mock_dbapi.call_count) + class CheckInterfacesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): def test__check_enabled_interfaces_success(self): diff --git a/releasenotes/notes/remove-locks-first-d12ac27106f800f8.yaml b/releasenotes/notes/remove-locks-first-d12ac27106f800f8.yaml new file mode 100644 index 0000000000..a7e0cb958c --- /dev/null +++ b/releasenotes/notes/remove-locks-first-d12ac27106f800f8.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fixes an issue where ``ironic-conductor`` initialization could return a + ``NodeNotLocked`` error for requests requiring locks when the conductor + was starting. This was due to the conductor removing locks after + beginning accepting new work. The lock removal has been moved to after + the Database connectivity has been established but before the RPC bus + is initialized.