Merge "[OVN] Hash Ring: Better handle Neutron worker failures"
This commit is contained in:
commit
cbdde881a1
|
@ -58,12 +58,12 @@ class OVNMechanismDriver(mech_driver.OVNMechanismDriver):
|
|||
def ovn_client(self):
|
||||
return self._ovn_client
|
||||
|
||||
def _set_hash_ring_nodes_offline(self):
|
||||
"""Don't set hash ring nodes as offline.
|
||||
def _remove_node_from_hash_ring(self):
|
||||
"""Don't remove the node from the Hash Ring.
|
||||
|
||||
If this method was not overridden, cleanup would be performed when
|
||||
calling the db sync and running neutron server would mark all the
|
||||
nodes from the ring as offline.
|
||||
calling the db sync and running neutron server would remove the
|
||||
nodes from the Hash Ring.
|
||||
"""
|
||||
|
||||
# Since we are not using the ovn mechanism driver while syncing,
|
||||
|
|
|
@ -50,6 +50,13 @@ def remove_nodes_from_host(context, group_name):
|
|||
CONF.host, group_name)
|
||||
|
||||
|
||||
def remove_node_by_uuid(context, node_uuid):
|
||||
with db_api.CONTEXT_WRITER.using(context):
|
||||
context.session.query(ovn_models.OVNHashRing).filter(
|
||||
ovn_models.OVNHashRing.node_uuid == node_uuid).delete()
|
||||
LOG.info('Node "%s" removed from the Hash Ring', node_uuid)
|
||||
|
||||
|
||||
def _touch(context, updated_at=None, **filter_args):
|
||||
if updated_at is None:
|
||||
updated_at = timeutils.utcnow()
|
||||
|
@ -96,7 +103,9 @@ def count_offline_nodes(context, interval, group_name):
|
|||
return query.count()
|
||||
|
||||
|
||||
def set_nodes_from_host_as_offline(context, group_name):
|
||||
timestamp = datetime.datetime(day=26, month=10, year=1985, hour=9)
|
||||
_touch(context, updated_at=timestamp, hostname=CONF.host,
|
||||
group_name=group_name)
|
||||
@db_api.CONTEXT_READER
|
||||
def count_nodes_from_host(context, group_name):
|
||||
query = context.session.query(ovn_models.OVNHashRing).filter(
|
||||
ovn_models.OVNHashRing.group_name == group_name,
|
||||
ovn_models.OVNHashRing.hostname == CONF.host)
|
||||
return query.count()
|
||||
|
|
|
@ -286,17 +286,17 @@ class OVNMechanismDriver(api.MechanismDriver):
|
|||
resources.SECURITY_GROUP_RULE,
|
||||
events.BEFORE_DELETE)
|
||||
|
||||
def _set_hash_ring_nodes_offline(self, *args, **kwargs):
|
||||
def _remove_node_from_hash_ring(self, *args, **kwargs):
|
||||
# The node_uuid attribute will be empty for worker types
|
||||
# that are not added to the Hash Ring and can be skipped
|
||||
if self.node_uuid is None:
|
||||
return
|
||||
admin_context = n_context.get_admin_context()
|
||||
ovn_hash_ring_db.set_nodes_from_host_as_offline(
|
||||
admin_context, self.hash_ring_group)
|
||||
LOG.info('Hash Ring nodes from host "%s" marked as offline',
|
||||
cfg.CONF.host)
|
||||
ovn_hash_ring_db.remove_node_by_uuid(
|
||||
admin_context, self.node_uuid)
|
||||
|
||||
def pre_fork_initialize(self, resource, event, trigger, payload=None):
|
||||
"""Pre-initialize the ML2/OVN driver."""
|
||||
atexit.register(self._set_hash_ring_nodes_offline)
|
||||
signal.signal(signal.SIGTERM, self._set_hash_ring_nodes_offline)
|
||||
ovn_utils.create_neutron_pg_drop()
|
||||
|
||||
@staticmethod
|
||||
|
@ -314,6 +314,10 @@ class OVNMechanismDriver(api.MechanismDriver):
|
|||
thread for this host. Subsequently workers just need to register
|
||||
themselves to the hash ring.
|
||||
"""
|
||||
# Attempt to remove the node from the ring when the worker stops
|
||||
atexit.register(self._remove_node_from_hash_ring)
|
||||
signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring)
|
||||
|
||||
admin_context = n_context.get_admin_context()
|
||||
if not self._hash_ring_probe_event.is_set():
|
||||
# Clear existing entries
|
||||
|
|
|
@ -41,6 +41,7 @@ from neutron.db import ovn_revision_numbers_db as revision_numbers_db
|
|||
from neutron.objects import ports as ports_obj
|
||||
from neutron.objects import router as router_obj
|
||||
from neutron.plugins.ml2.drivers.ovn.mech_driver.ovsdb import ovn_db_sync
|
||||
from neutron import service
|
||||
from neutron.services.logapi.drivers.ovn import driver as log_driver
|
||||
|
||||
|
||||
|
@ -1100,3 +1101,20 @@ class HashRingHealthCheckPeriodics(object):
|
|||
# here because we want the maintenance tasks from each instance to
|
||||
# execute this task.
|
||||
hash_ring_db.touch_nodes_from_host(self.ctx, self._group)
|
||||
|
||||
# Check the number of the nodes in the ring and log a message in
|
||||
# case they are out of sync. See LP #2024205 for more information
|
||||
# on this issue.
|
||||
api_workers = service._get_api_workers()
|
||||
num_nodes = hash_ring_db.count_nodes_from_host(self.ctx, self._group)
|
||||
|
||||
if num_nodes > api_workers:
|
||||
LOG.critical(
|
||||
'The number of nodes in the Hash Ring (%d) is higher than '
|
||||
'the number of API workers (%d) for host "%s". Something is '
|
||||
'not right and OVSDB events could be missed because of this. '
|
||||
'Please check the status of the Neutron processes, this can '
|
||||
'happen when the API workers are killed and restarted. '
|
||||
'Restarting the service should fix the issue, see LP '
|
||||
'#2024205 for more information.',
|
||||
num_nodes, api_workers, cfg.CONF.host)
|
||||
|
|
|
@ -327,7 +327,7 @@ class TestOVNFunctionalBase(test_plugin.Ml2PluginV2TestCase,
|
|||
# NOTE(ralonsoh): do not access to the DB at exit when the SQL
|
||||
# connection is already closed, to avoid useless exception messages.
|
||||
mock.patch.object(
|
||||
self.mech_driver, '_set_hash_ring_nodes_offline').start()
|
||||
self.mech_driver, '_remove_node_from_hash_ring').start()
|
||||
self.mech_driver.pre_fork_initialize(
|
||||
mock.ANY, mock.ANY, trigger_cls.trigger)
|
||||
|
||||
|
|
|
@ -270,16 +270,18 @@ class TestHashRing(testlib_api.SqlTestCaseLight):
|
|||
self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
|
||||
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
|
||||
|
||||
def test_set_nodes_from_host_as_offline(self):
|
||||
def test_remove_node_by_uuid(self):
|
||||
self._add_nodes_and_assert_exists(count=3)
|
||||
|
||||
active_nodes = ovn_hash_ring_db.get_active_nodes(
|
||||
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
|
||||
self.assertEqual(3, len(active_nodes))
|
||||
|
||||
ovn_hash_ring_db.set_nodes_from_host_as_offline(
|
||||
self.admin_ctx, HASH_RING_TEST_GROUP)
|
||||
node_to_remove = active_nodes[0].node_uuid
|
||||
ovn_hash_ring_db.remove_node_by_uuid(
|
||||
self.admin_ctx, node_to_remove)
|
||||
|
||||
active_nodes = ovn_hash_ring_db.get_active_nodes(
|
||||
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP)
|
||||
self.assertEqual(0, len(active_nodes))
|
||||
self.assertEqual(2, len(active_nodes))
|
||||
self.assertNotIn(node_to_remove, [n.node_uuid for n in active_nodes])
|
||||
|
|
Loading…
Reference in New Issue