From 9e8e3a7867b689ca1bd462ddff294db030032350 Mon Sep 17 00:00:00 2001
From: Lucas Alvares Gomes
Date: Fri, 16 Jun 2023 13:44:07 +0100
Subject: [PATCH] [OVN] Hash Ring: Better handle Neutron worker failures

This patch implements a more resilient approach to handle the case
where Neutron API workers are killed and restarted. Instead of marking
all nodes for that host as offline, this patch tries to remove the
worker that was killed from the Hash Ring, leaving all other nodes for
that host online.

In case we fail to remove the node and another entry is added upon the
restart of the worker, this patch also logs a clear critical message to
alert the operator that there are more Hash Ring nodes than API workers
(they are expected to be the same) and that OVSDB events could go
missing if they are routed to the previous node that we failed to
remove from the ring.

Closes-Bug: #2024205
Change-Id: I4b7376cf7df45fcc6e487970b068d06b4e74e319
Signed-off-by: Lucas Alvares Gomes
---
 neutron/cmd/ovn/neutron_ovn_db_sync_util.py    |  8 ++++----
 neutron/db/ovn_hash_ring_db.py                 | 17 +++++++++++++----
 .../ml2/drivers/ovn/mech_driver/mech_driver.py | 18 +++++++++++-------
 .../ovn/mech_driver/ovsdb/maintenance.py       | 18 ++++++++++++++++++
 neutron/tests/functional/base.py               |  2 +-
 neutron/tests/unit/db/test_ovn_hash_ring_db.py | 10 ++++++----
 6 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/neutron/cmd/ovn/neutron_ovn_db_sync_util.py b/neutron/cmd/ovn/neutron_ovn_db_sync_util.py
index a4013c8d129..25d49acc22a 100644
--- a/neutron/cmd/ovn/neutron_ovn_db_sync_util.py
+++ b/neutron/cmd/ovn/neutron_ovn_db_sync_util.py
@@ -58,12 +58,12 @@ class OVNMechanismDriver(mech_driver.OVNMechanismDriver):
     def ovn_client(self):
         return self._ovn_client
 
-    def _set_hash_ring_nodes_offline(self):
-        """Don't set hash ring nodes as offline.
+    def _remove_node_from_hash_ring(self):
+        """Don't remove the node from the Hash Ring.
 
         If this method was not overridden, cleanup would be performed when
-        calling the db sync and running neutron server would mark all the
-        nodes from the ring as offline.
+        calling the db sync, removing the nodes of the running neutron
+        server from the Hash Ring.
""" # Since we are not using the ovn mechanism driver while syncing, diff --git a/neutron/db/ovn_hash_ring_db.py b/neutron/db/ovn_hash_ring_db.py index 59f528f7730..69019aa0ffe 100644 --- a/neutron/db/ovn_hash_ring_db.py +++ b/neutron/db/ovn_hash_ring_db.py @@ -50,6 +50,13 @@ def remove_nodes_from_host(context, group_name): CONF.host, group_name) +def remove_node_by_uuid(context, node_uuid): + with db_api.CONTEXT_WRITER.using(context): + context.session.query(ovn_models.OVNHashRing).filter( + ovn_models.OVNHashRing.node_uuid == node_uuid).delete() + LOG.info('Node "%s" removed from the Hash Ring', node_uuid) + + def _touch(context, updated_at=None, **filter_args): if updated_at is None: updated_at = timeutils.utcnow() @@ -96,7 +103,9 @@ def count_offline_nodes(context, interval, group_name): return query.count() -def set_nodes_from_host_as_offline(context, group_name): - timestamp = datetime.datetime(day=26, month=10, year=1985, hour=9) - _touch(context, updated_at=timestamp, hostname=CONF.host, - group_name=group_name) +@db_api.CONTEXT_READER +def count_nodes_from_host(context, group_name): + query = context.session.query(ovn_models.OVNHashRing).filter( + ovn_models.OVNHashRing.group_name == group_name, + ovn_models.OVNHashRing.hostname == CONF.host) + return query.count() diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py index 30e0bd833a3..6bf95b35dad 100644 --- a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py +++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py @@ -286,17 +286,17 @@ class OVNMechanismDriver(api.MechanismDriver): resources.SECURITY_GROUP_RULE, events.BEFORE_DELETE) - def _set_hash_ring_nodes_offline(self, *args, **kwargs): + def _remove_node_from_hash_ring(self, *args, **kwargs): + # The node_uuid attribute will be empty for worker types + # that are not added to the Hash Ring and can be skipped + if self.node_uuid is None: + return admin_context = n_context.get_admin_context() - ovn_hash_ring_db.set_nodes_from_host_as_offline( - admin_context, self.hash_ring_group) - LOG.info('Hash Ring nodes from host "%s" marked as offline', - cfg.CONF.host) + ovn_hash_ring_db.remove_node_by_uuid( + admin_context, self.node_uuid) def pre_fork_initialize(self, resource, event, trigger, payload=None): """Pre-initialize the ML2/OVN driver.""" - atexit.register(self._set_hash_ring_nodes_offline) - signal.signal(signal.SIGTERM, self._set_hash_ring_nodes_offline) ovn_utils.create_neutron_pg_drop() @staticmethod @@ -314,6 +314,10 @@ class OVNMechanismDriver(api.MechanismDriver): thread for this host. Subsequently workers just need to register themselves to the hash ring. 
""" + # Attempt to remove the node from the ring when the worker stops + atexit.register(self._remove_node_from_hash_ring) + signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring) + admin_context = n_context.get_admin_context() if not self._hash_ring_probe_event.is_set(): # Clear existing entries diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py index f8abd4cbca5..75b814dad6d 100644 --- a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py +++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/maintenance.py @@ -41,6 +41,7 @@ from neutron.db import ovn_revision_numbers_db as revision_numbers_db from neutron.objects import ports as ports_obj from neutron.objects import router as router_obj from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync +from neutron import service from neutron.services.logapi.drivers.ovn import driver as log_driver @@ -1100,3 +1101,20 @@ class HashRingHealthCheckPeriodics(object): # here because we want the maintenance tasks from each instance to # execute this task. hash_ring_db.touch_nodes_from_host(self.ctx, self._group) + + # Check the number of the nodes in the ring and log a message in + # case they are out of sync. See LP #2024205 for more information + # on this issue. + api_workers = service._get_api_workers() + num_nodes = hash_ring_db.count_nodes_from_host(self.ctx, self._group) + + if num_nodes > api_workers: + LOG.critical( + 'The number of nodes in the Hash Ring (%d) is higher than ' + 'the number of API workers (%d) for host "%s". Something is ' + 'not right and OVSDB events could be missed because of this. ' + 'Please check the status of the Neutron processes, this can ' + 'happen when the API workers are killed and restarted. ' + 'Restarting the service should fix the issue, see LP ' + '#2024205 for more information.', + num_nodes, api_workers, cfg.CONF.host) diff --git a/neutron/tests/functional/base.py b/neutron/tests/functional/base.py index a8f179d1bad..ec237829cad 100644 --- a/neutron/tests/functional/base.py +++ b/neutron/tests/functional/base.py @@ -327,7 +327,7 @@ class TestOVNFunctionalBase(test_plugin.Ml2PluginV2TestCase, # NOTE(ralonsoh): do not access to the DB at exit when the SQL # connection is already closed, to avoid useless exception messages. 
         mock.patch.object(
-            self.mech_driver, '_set_hash_ring_nodes_offline').start()
+            self.mech_driver, '_remove_node_from_hash_ring').start()
         self.mech_driver.pre_fork_initialize(
             mock.ANY, mock.ANY, trigger_cls.trigger)
 
diff --git a/neutron/tests/unit/db/test_ovn_hash_ring_db.py b/neutron/tests/unit/db/test_ovn_hash_ring_db.py
index 6d8e874acdb..ba7010bc3e7 100644
--- a/neutron/tests/unit/db/test_ovn_hash_ring_db.py
+++ b/neutron/tests/unit/db/test_ovn_hash_ring_db.py
@@ -270,16 +270,18 @@ class TestHashRing(testlib_api.SqlTestCaseLight):
         self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
 
-    def test_set_nodes_from_host_as_offline(self):
+    def test_remove_node_by_uuid(self):
         self._add_nodes_and_assert_exists(count=3)
 
         active_nodes = ovn_hash_ring_db.get_active_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
         self.assertEqual(3, len(active_nodes))
 
-        ovn_hash_ring_db.set_nodes_from_host_as_offline(
-            self.admin_ctx, HASH_RING_TEST_GROUP)
+        node_to_remove = active_nodes[0].node_uuid
+        ovn_hash_ring_db.remove_node_by_uuid(
+            self.admin_ctx, node_to_remove)
 
         active_nodes = ovn_hash_ring_db.get_active_nodes(
             self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
-        self.assertEqual(0, len(active_nodes))
+        self.assertEqual(2, len(active_nodes))
+        self.assertNotIn(node_to_remove, [n.node_uuid for n in active_nodes])
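
For illustration, here is a minimal, self-contained sketch of the cleanup
pattern this patch adopts: each API worker owns exactly one Hash Ring entry
(its node_uuid) and removes only that entry when it stops, instead of
marking every node on the host as offline. The in-memory "ring" dict and
the Worker class below are hypothetical stand-ins, not Neutron code; the
real implementation uses the ovn_hash_ring_db helpers and mechanism driver
hooks shown in the diff above.

    # Illustrative stand-in for the OVNHashRing table: node_uuid -> hostname
    import atexit
    import signal
    import uuid

    ring = {}

    def add_node(hostname):
        node_uuid = str(uuid.uuid4())
        ring[node_uuid] = hostname
        return node_uuid

    def remove_node_by_uuid(node_uuid):
        # Delete a single node, leaving sibling workers' entries online
        ring.pop(node_uuid, None)

    class Worker:
        def __init__(self, hostname):
            self.hostname = hostname
            self.node_uuid = None  # stays None for workers outside the ring

        def post_fork_initialize(self):
            self.node_uuid = add_node(self.hostname)
            # Remove only this worker's node on normal exit or SIGTERM
            atexit.register(self._remove_node_from_hash_ring)
            signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring)

        def _remove_node_from_hash_ring(self, *args, **kwargs):
            if self.node_uuid is None:
                return
            remove_node_by_uuid(self.node_uuid)

If the removal never runs (e.g. the worker is killed with SIGKILL), the
restarted worker registers a new entry and the ring ends up with more nodes
than API workers, which is exactly the out-of-sync condition the new
LOG.critical() message in the maintenance task reports.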