diff --git a/neutron/cmd/ovn/neutron_ovn_db_sync_util.py b/neutron/cmd/ovn/neutron_ovn_db_sync_util.py
index 43900efac7b..81120c57fe8 100644
--- a/neutron/cmd/ovn/neutron_ovn_db_sync_util.py
+++ b/neutron/cmd/ovn/neutron_ovn_db_sync_util.py
@@ -57,12 +57,12 @@ class OVNMechanismDriver(mech_driver.OVNMechanismDriver):
     def ovn_client(self):
         return self._ovn_client
 
-    def _set_hash_ring_nodes_offline(self):
-        """Don't set hash ring nodes as offline.
+    def _remove_node_from_hash_ring(self):
+        """Don't remove the node from the Hash Ring.
 
         If this method was not overridden, cleanup would be performed when
-        calling the db sync and running neutron server would mark all the
-        nodes from the ring as offline.
+        calling the db sync and running neutron server would remove the
+        nodes from the Hash Ring.
         """
 
     # Since we are not using the ovn mechanism driver while syncing,
diff --git a/neutron/db/ovn_hash_ring_db.py b/neutron/db/ovn_hash_ring_db.py
index 59f528f7730..69019aa0ffe 100644
--- a/neutron/db/ovn_hash_ring_db.py
+++ b/neutron/db/ovn_hash_ring_db.py
@@ -50,6 +50,13 @@ def remove_nodes_from_host(context, group_name):
              CONF.host, group_name)
 
 
+def remove_node_by_uuid(context, node_uuid):
+    with db_api.CONTEXT_WRITER.using(context):
+        context.session.query(ovn_models.OVNHashRing).filter(
+            ovn_models.OVNHashRing.node_uuid == node_uuid).delete()
+    LOG.info('Node "%s" removed from the Hash Ring', node_uuid)
+
+
 def _touch(context, updated_at=None, **filter_args):
     if updated_at is None:
         updated_at = timeutils.utcnow()
@@ -96,7 +103,9 @@ def count_offline_nodes(context, interval, group_name):
     return query.count()
 
 
-def set_nodes_from_host_as_offline(context, group_name):
-    timestamp = datetime.datetime(day=26, month=10, year=1985, hour=9)
-    _touch(context, updated_at=timestamp, hostname=CONF.host,
-           group_name=group_name)
+@db_api.CONTEXT_READER
+def count_nodes_from_host(context, group_name):
+    query = context.session.query(ovn_models.OVNHashRing).filter(
+        ovn_models.OVNHashRing.group_name == group_name,
+        ovn_models.OVNHashRing.hostname == CONF.host)
+    return query.count()
diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py
index 1ad9d1b782b..9eef4129ff9 100644
--- a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py
+++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py
@@ -273,17 +273,17 @@ class OVNMechanismDriver(api.MechanismDriver):
                            resources.SECURITY_GROUP_RULE,
                            events.BEFORE_DELETE)
 
-    def _set_hash_ring_nodes_offline(self, *args, **kwargs):
+    def _remove_node_from_hash_ring(self, *args, **kwargs):
+        # The node_uuid attribute will be empty for worker types
+        # that are not added to the Hash Ring and can be skipped
+        if self.node_uuid is None:
+            return
         admin_context = n_context.get_admin_context()
-        ovn_hash_ring_db.set_nodes_from_host_as_offline(
-            admin_context, self.hash_ring_group)
-        LOG.info('Hash Ring nodes from host "%s" marked as offline',
-                 cfg.CONF.host)
+        ovn_hash_ring_db.remove_node_by_uuid(
+            admin_context, self.node_uuid)
 
     def pre_fork_initialize(self, resource, event, trigger, payload=None):
         """Pre-initialize the ML2/OVN driver."""
-        atexit.register(self._set_hash_ring_nodes_offline)
-        signal.signal(signal.SIGTERM, self._set_hash_ring_nodes_offline)
         ovn_utils.create_neutron_pg_drop()
 
     @staticmethod
@@ -301,6 +301,10 @@ class OVNMechanismDriver(api.MechanismDriver):
         thread for this host. Subsequently workers just need to register
         themselves to the hash ring.
         """
+        # Attempt to remove the node from the ring when the worker stops
+        atexit.register(self._remove_node_from_hash_ring)
+        signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring)
+
         admin_context = n_context.get_admin_context()
         if not self._hash_ring_probe_event.is_set():
             # Clear existing entries
diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py
index a54cb9018c5..86a9a67ae6e 100644
--- a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py
+++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py
@@ -42,6 +42,7 @@ from neutron.db import ovn_revision_numbers_db as revision_numbers_db
 from neutron.db import segments_db
 from neutron.objects import router as router_obj
 from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync
+from neutron import service
 from neutron.services.logapi.drivers.ovn import driver as log_driver
 
 
@@ -1055,3 +1056,20 @@ class HashRingHealthCheckPeriodics(object):
         # here because we want the maintenance tasks from each instance to
         # execute this task.
         hash_ring_db.touch_nodes_from_host(self.ctx, self._group)
+
+        # Check the number of the nodes in the ring and log a message in
+        # case they are out of sync. See LP #2024205 for more information
+        # on this issue.
+        api_workers = service._get_api_workers()
+        num_nodes = hash_ring_db.count_nodes_from_host(self.ctx, self._group)
+
+        if num_nodes > api_workers:
+            LOG.critical(
+                'The number of nodes in the Hash Ring (%d) is higher than '
+                'the number of API workers (%d) for host "%s". Something is '
+                'not right and OVSDB events could be missed because of this. '
+                'Please check the status of the Neutron processes, this can '
+                'happen when the API workers are killed and restarted. '
+                'Restarting the service should fix the issue, see LP '
+                '#2024205 for more information.',
+                num_nodes, api_workers, cfg.CONF.host)
diff --git a/neutron/tests/functional/base.py b/neutron/tests/functional/base.py
index bb1149070b4..5a0fc506fd5 100644
--- a/neutron/tests/functional/base.py
+++ b/neutron/tests/functional/base.py
@@ -351,7 +351,7 @@ class TestOVNFunctionalBase(test_plugin.Ml2PluginV2TestCase,
         # NOTE(ralonsoh): do not access to the DB at exit when the SQL
         # connection is already closed, to avoid useless exception messages.
         mock.patch.object(
-            self.mech_driver, '_set_hash_ring_nodes_offline').start()
+            self.mech_driver, '_remove_node_from_hash_ring').start()
         self.mech_driver.pre_fork_initialize(
             mock.ANY, mock.ANY, trigger_cls.trigger)
 
diff --git a/neutron/tests/unit/db/test_ovn_hash_ring_db.py b/neutron/tests/unit/db/test_ovn_hash_ring_db.py
index db03cf7ad6b..b7d49a948be 100644
--- a/neutron/tests/unit/db/test_ovn_hash_ring_db.py
+++ b/neutron/tests/unit/db/test_ovn_hash_ring_db.py
@@ -270,16 +270,18 @@ class TestHashRing(testlib_api.SqlTestCaseLight):
         self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
 
-    def test_set_nodes_from_host_as_offline(self):
+    def test_remove_node_by_uuid(self):
         self._add_nodes_and_assert_exists(count=3)
 
         active_nodes = ovn_hash_ring_db.get_active_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
         self.assertEqual(3, len(active_nodes))
 
-        ovn_hash_ring_db.set_nodes_from_host_as_offline(
-            self.admin_ctx, HASH_RING_TEST_GROUP)
+        node_to_remove = active_nodes[0].node_uuid
+        ovn_hash_ring_db.remove_node_by_uuid(
+            self.admin_ctx, node_to_remove)
 
         active_nodes = ovn_hash_ring_db.get_active_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
-        self.assertEqual(0, len(active_nodes))
+        self.assertEqual(2, len(active_nodes))
+        self.assertNotIn(node_to_remove, [n.node_uuid for n in active_nodes])
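
Not part of the patch above: a minimal, standalone Python sketch of the cleanup pattern the mech_driver change relies on, where each worker registers both an atexit hook and a SIGTERM handler so that it removes only its own hash-ring entry, keyed by node_uuid, when it stops. The Worker class, FAKE_RING dict and add_node() helper are illustrative stand-ins for the Neutron worker and the ovn_hash_ring table, not real Neutron APIs.

# Illustrative sketch only (assumed names, not Neutron code).
import atexit
import signal
import uuid

FAKE_RING = {}  # stand-in for the ovn_hash_ring database table


def add_node():
    node_uuid = str(uuid.uuid4())
    FAKE_RING[node_uuid] = 'alive'
    return node_uuid


def remove_node_by_uuid(node_uuid):
    FAKE_RING.pop(node_uuid, None)
    print('Node "%s" removed from the Hash Ring' % node_uuid)


class Worker(object):

    def __init__(self):
        self.node_uuid = None

    def start(self):
        self.node_uuid = add_node()
        # Mirrors what post_fork_initialize() does in the patch: clean up
        # on normal interpreter exit and on SIGTERM.
        atexit.register(self._remove_node_from_hash_ring)
        signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring)

    def _remove_node_from_hash_ring(self, *args, **kwargs):
        # Workers that never joined the ring have no node_uuid; skip them.
        if self.node_uuid is None:
            return
        remove_node_by_uuid(self.node_uuid)


if __name__ == '__main__':
    Worker().start()
    # When the process exits here, the atexit hook fires and the node is
    # removed from the fake ring.

Under this scheme a stale entry should only survive if the worker is killed outright (e.g. SIGKILL), which is the situation the new count_nodes_from_host() check in maintenance.py is meant to flag per LP #2024205.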