[OVN] Improve Hash Ring logs

Debugging Hash Ring problems can be a difficult challenge given that, prior
to this patch, the logs were very limited.

This patch improves the logging for this feature as follows:

1. Log when a node is added to the ring
2. Log when nodes are removed from the ring
3. Keep track of the number of offline nodes and log it upon loading the
   ring
4. Improve the "Hash Ring is empty" exception with the number of offline
   nodes found (if 0, it means the ovn_hash_ring table has no entries)

Closes-Bug: #2023670
Change-Id: Ic90432b5ddea8cf176de159ec7eaafd5fd7bdd6e
Signed-off-by: Lucas Alvares Gomes <lucasagomes@gmail.com>
(cherry picked from commit afa20faec3c37bd06346360cadbad0d69e9925f0)

[OVN] The all() and count() methods should be inside a DB txn

The ``ovn_hash_ring_db`` methods ``get_active_nodes`` and
``count_offline_nodes`` are sending SQL requests that should be issued
from inside a READER context.

Closes-Bug: #2024447
Change-Id: If06c372a9d5cb1dc1ec1af768abb61f52c2c5abd
(cherry picked from commit 0c66dfaed8e1ec00726c3e484e69174779678abd)
This commit is contained in:
Lucas Alvares Gomes 2023-06-13 15:53:54 +01:00
parent f5ed8b91b5
commit b4f7c9dff4
4 changed files with 72 additions and 12 deletions

View File

@ -33,6 +33,7 @@ class StandardAttributeIDNotFound(n_exc.NeutronException):
class HashRingIsEmpty(n_exc.NeutronException):
message = _('Hash Ring returned empty when hashing "%(key)s". '
'This should never happen in a normal situation, please '
'check the status of your cluster')
message = _('Hash Ring returned empty when hashing "%(key)s". All '
'%(node_count)d nodes were found offline. This should never '
'happen in a normal situation, please check the status '
'of your cluster')

View File

@ -38,6 +38,7 @@ class HashRingManager(object):
# Flag to rate limit the caching log
self._prev_num_nodes = -1
self.admin_ctx = context.get_admin_context()
self._offline_node_count = 0
@property
def _wait_startup_before_caching(self):
@ -92,6 +93,11 @@ class HashRingManager(object):
self._hash_ring = hashring.HashRing({node.node_uuid
for node in nodes})
self._last_time_loaded = timeutils.utcnow()
self._offline_node_count = db_hash_ring.count_offline_nodes(
self.admin_ctx, constants.HASH_RING_NODES_TIMEOUT,
self._group)
LOG.debug("Hash Ring loaded. %d active nodes. %d offline nodes",
len(nodes), self._offline_node_count)
def refresh(self):
self._load_hash_ring(refresh=True)
@ -108,4 +114,5 @@ class HashRingManager(object):
# KeyError is raised
return self._hash_ring[key].pop()
except KeyError:
raise exceptions.HashRingIsEmpty(key=key)
raise exceptions.HashRingIsEmpty(
key=key, node_count=self._offline_node_count)

View File

@ -17,12 +17,14 @@ import datetime
from neutron_lib.db import api as db_api
from oslo_config import cfg
from oslo_log import log
from oslo_utils import timeutils
from oslo_utils import uuidutils
from neutron.db.models import ovn as ovn_models
CONF = cfg.CONF
LOG = log.getLogger(__name__)
# NOTE(ralonsoh): this was migrated from networking-ovn to neutron and should
@ -34,6 +36,8 @@ def add_node(context, group_name, node_uuid=None):
with db_api.CONTEXT_WRITER.using(context):
context.session.add(ovn_models.OVNHashRing(
node_uuid=node_uuid, hostname=CONF.host, group_name=group_name))
LOG.info('Node %s from host "%s" and group "%s" added to the Hash Ring',
node_uuid, CONF.host, group_name)
return node_uuid
@ -42,6 +46,8 @@ def remove_nodes_from_host(context, group_name):
context.session.query(ovn_models.OVNHashRing).filter(
ovn_models.OVNHashRing.hostname == CONF.host,
ovn_models.OVNHashRing.group_name == group_name).delete()
LOG.info('Nodes from host "%s" and group "%s" removed from the Hash Ring',
CONF.host, group_name)
def _touch(context, **filter_args):
@ -58,12 +64,31 @@ def touch_node(context, node_uuid):
_touch(context, node_uuid=node_uuid)
def get_active_nodes(context, interval, group_name, from_host=False):
def _get_nodes_query(context, interval, group_name, offline=False,
from_host=False):
limit = timeutils.utcnow() - datetime.timedelta(seconds=interval)
with db_api.CONTEXT_READER.using(context):
query = context.session.query(ovn_models.OVNHashRing).filter(
ovn_models.OVNHashRing.updated_at >= limit,
ovn_models.OVNHashRing.group_name == group_name)
if from_host:
query = query.filter_by(hostname=CONF.host)
return query.all()
query = context.session.query(ovn_models.OVNHashRing).filter(
ovn_models.OVNHashRing.group_name == group_name)
if offline:
query = query.filter(ovn_models.OVNHashRing.updated_at < limit)
else:
query = query.filter(ovn_models.OVNHashRing.updated_at >= limit)
if from_host:
query = query.filter_by(hostname=CONF.host)
return query
@db_api.CONTEXT_READER
def get_active_nodes(context, interval, group_name, from_host=False):
    """Return the active Hash Ring nodes of a group.

    The query is built by ``_get_nodes_query`` with its default
    ``offline=False`` and executed under the ``db_api.CONTEXT_READER``
    decorator.

    :param context: context forwarded to ``_get_nodes_query``.
    :param interval: age window, in seconds, forwarded to
        ``_get_nodes_query``.
    :param group_name: Hash Ring group name forwarded to
        ``_get_nodes_query``.
    :param from_host: forwarded to ``_get_nodes_query`` to optionally
        restrict the result to this host.
    :returns: the result of ``Query.all()`` on the built query.
    """
    return _get_nodes_query(
        context, interval, group_name, from_host=from_host).all()
@db_api.CONTEXT_READER
def count_offline_nodes(context, interval, group_name):
    """Return how many Hash Ring nodes of a group are offline.

    The query is built by ``_get_nodes_query`` with ``offline=True`` and
    executed under the ``db_api.CONTEXT_READER`` decorator.

    :param context: context forwarded to ``_get_nodes_query``.
    :param interval: age threshold, in seconds, forwarded to
        ``_get_nodes_query``.
    :param group_name: Hash Ring group name forwarded to
        ``_get_nodes_query``.
    :returns: the result of ``Query.count()`` on the built query.
    """
    return _get_nodes_query(
        context, interval, group_name, offline=True).count()

View File

@ -242,3 +242,30 @@ class TestHashRing(testlib_api.SqlTestCaseLight):
for node in group2:
node_db = self._get_node_row(node)
self.assertEqual(node_db.created_at, node_db.updated_at)
def test_count_offline_nodes(self):
# Exercise count_offline_nodes() across three states of the same nodes:
# freshly added (expects 0), touched while utcnow() is patched to 60
# seconds in the past (expects 3), and touched again normally (expects 0).
self._add_nodes_and_assert_exists(count=3)
# Assert no nodes are considered offline
self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
# Subtract 60 seconds from utcnow() and touch the nodes to make
# them appear offline
fake_utcnow = timeutils.utcnow() - datetime.timedelta(seconds=60)
with mock.patch.object(timeutils, 'utcnow') as mock_utcnow:
mock_utcnow.return_value = fake_utcnow
ovn_hash_ring_db.touch_nodes_from_host(self.admin_ctx,
HASH_RING_TEST_GROUP)
# Now assert that all nodes from our host are seen as offline
self.assertEqual(3, ovn_hash_ring_db.count_offline_nodes(
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
# Touch the nodes again without faking utcnow()
ovn_hash_ring_db.touch_nodes_from_host(self.admin_ctx,
HASH_RING_TEST_GROUP)
# Assert no nodes are considered offline
self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))