[OVN] Improve Hash Ring logs
Debugging Hash Ring problems can be a difficult challenge given that prior to this patch the logs were very limited. This patch improves the logging for this feature as follows: 1. Log when a node is added to the ring 2. Log when nodes are removed from the ring 3. Keep track of the number of offline nodes and log it upon loading the ring 4. Improve the "Hash Ring is empty" exception with the number of offline nodes found (if 0, it means the ovn_hash_ring table has no entries) Closes-Bug: #2023670 Change-Id: Ic90432b5ddea8cf176de159ec7eaafd5fd7bdd6e Signed-off-by: Lucas Alvares Gomes <lucasagomes@gmail.com> (cherry picked from commit afa20faec3c37bd06346360cadbad0d69e9925f0) [OVN] The all() and count() methods should be inside a DB txn The ``ovn_hash_ring_db`` methods ``get_active_nodes`` and ``count_offline_nodes`` are sending SQL requests that should be issued from inside a READER context. Closes-Bug: #2024447 Change-Id: If06c372a9d5cb1dc1ec1af768abb61f52c2c5abd (cherry picked from commit 0c66dfaed8e1ec00726c3e484e69174779678abd)
This commit is contained in:
parent
f5ed8b91b5
commit
b4f7c9dff4
@ -33,6 +33,7 @@ class StandardAttributeIDNotFound(n_exc.NeutronException):
|
||||
|
||||
|
||||
class HashRingIsEmpty(n_exc.NeutronException):
|
||||
message = _('Hash Ring returned empty when hashing "%(key)s". '
|
||||
'This should never happen in a normal situation, please '
|
||||
'check the status of your cluster')
|
||||
message = _('Hash Ring returned empty when hashing "%(key)s". All '
|
||||
'%(node_count)d nodes were found offline. This should never '
|
||||
'happen in a normal situation, please check the status '
|
||||
'of your cluster')
|
||||
|
@ -38,6 +38,7 @@ class HashRingManager(object):
|
||||
# Flag to rate limit the caching log
|
||||
self._prev_num_nodes = -1
|
||||
self.admin_ctx = context.get_admin_context()
|
||||
self._offline_node_count = 0
|
||||
|
||||
@property
|
||||
def _wait_startup_before_caching(self):
|
||||
@ -92,6 +93,11 @@ class HashRingManager(object):
|
||||
self._hash_ring = hashring.HashRing({node.node_uuid
|
||||
for node in nodes})
|
||||
self._last_time_loaded = timeutils.utcnow()
|
||||
self._offline_node_count = db_hash_ring.count_offline_nodes(
|
||||
self.admin_ctx, constants.HASH_RING_NODES_TIMEOUT,
|
||||
self._group)
|
||||
LOG.debug("Hash Ring loaded. %d active nodes. %d offline nodes",
|
||||
len(nodes), self._offline_node_count)
|
||||
|
||||
def refresh(self):
|
||||
self._load_hash_ring(refresh=True)
|
||||
@ -108,4 +114,5 @@ class HashRingManager(object):
|
||||
# KeyError is raised
|
||||
return self._hash_ring[key].pop()
|
||||
except KeyError:
|
||||
raise exceptions.HashRingIsEmpty(key=key)
|
||||
raise exceptions.HashRingIsEmpty(
|
||||
key=key, node_count=self._offline_node_count)
|
||||
|
@ -17,12 +17,14 @@ import datetime
|
||||
|
||||
from neutron_lib.db import api as db_api
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
from oslo_utils import timeutils
|
||||
from oslo_utils import uuidutils
|
||||
|
||||
from neutron.db.models import ovn as ovn_models
|
||||
|
||||
CONF = cfg.CONF
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
# NOTE(ralonsoh): this was migrated from networking-ovn to neutron and should
|
||||
@ -34,6 +36,8 @@ def add_node(context, group_name, node_uuid=None):
|
||||
with db_api.CONTEXT_WRITER.using(context):
|
||||
context.session.add(ovn_models.OVNHashRing(
|
||||
node_uuid=node_uuid, hostname=CONF.host, group_name=group_name))
|
||||
LOG.info('Node %s from host "%s" and group "%s" added to the Hash Ring',
|
||||
node_uuid, CONF.host, group_name)
|
||||
return node_uuid
|
||||
|
||||
|
||||
@ -42,6 +46,8 @@ def remove_nodes_from_host(context, group_name):
|
||||
context.session.query(ovn_models.OVNHashRing).filter(
|
||||
ovn_models.OVNHashRing.hostname == CONF.host,
|
||||
ovn_models.OVNHashRing.group_name == group_name).delete()
|
||||
LOG.info('Nodes from host "%s" and group "%s" removed from the Hash Ring',
|
||||
CONF.host, group_name)
|
||||
|
||||
|
||||
def _touch(context, **filter_args):
|
||||
@ -58,12 +64,31 @@ def touch_node(context, node_uuid):
|
||||
_touch(context, node_uuid=node_uuid)
|
||||
|
||||
|
||||
def get_active_nodes(context, interval, group_name, from_host=False):
|
||||
def _get_nodes_query(context, interval, group_name, offline=False,
|
||||
from_host=False):
|
||||
limit = timeutils.utcnow() - datetime.timedelta(seconds=interval)
|
||||
with db_api.CONTEXT_READER.using(context):
|
||||
query = context.session.query(ovn_models.OVNHashRing).filter(
|
||||
ovn_models.OVNHashRing.updated_at >= limit,
|
||||
ovn_models.OVNHashRing.group_name == group_name)
|
||||
if from_host:
|
||||
query = query.filter_by(hostname=CONF.host)
|
||||
return query.all()
|
||||
query = context.session.query(ovn_models.OVNHashRing).filter(
|
||||
ovn_models.OVNHashRing.group_name == group_name)
|
||||
|
||||
if offline:
|
||||
query = query.filter(ovn_models.OVNHashRing.updated_at < limit)
|
||||
else:
|
||||
query = query.filter(ovn_models.OVNHashRing.updated_at >= limit)
|
||||
|
||||
if from_host:
|
||||
query = query.filter_by(hostname=CONF.host)
|
||||
|
||||
return query
|
||||
|
||||
|
||||
@db_api.CONTEXT_READER
|
||||
def get_active_nodes(context, interval, group_name, from_host=False):
|
||||
query = _get_nodes_query(context, interval, group_name,
|
||||
from_host=from_host)
|
||||
return query.all()
|
||||
|
||||
|
||||
@db_api.CONTEXT_READER
|
||||
def count_offline_nodes(context, interval, group_name):
|
||||
query = _get_nodes_query(context, interval, group_name, offline=True)
|
||||
return query.count()
|
||||
|
@ -242,3 +242,30 @@ class TestHashRing(testlib_api.SqlTestCaseLight):
|
||||
for node in group2:
|
||||
node_db = self._get_node_row(node)
|
||||
self.assertEqual(node_db.created_at, node_db.updated_at)
|
||||
|
||||
def test_count_offline_nodes(self):
|
||||
self._add_nodes_and_assert_exists(count=3)
|
||||
|
||||
# Assert no nodes are considered offline
|
||||
self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
|
||||
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
|
||||
|
||||
# Subtract 60 seconds from utcnow() and touch the nodes to make
|
||||
# them appear offline
|
||||
fake_utcnow = timeutils.utcnow() - datetime.timedelta(seconds=60)
|
||||
with mock.patch.object(timeutils, 'utcnow') as mock_utcnow:
|
||||
mock_utcnow.return_value = fake_utcnow
|
||||
ovn_hash_ring_db.touch_nodes_from_host(self.admin_ctx,
|
||||
HASH_RING_TEST_GROUP)
|
||||
|
||||
# Now assert that all nodes from our host are seen as offline
|
||||
self.assertEqual(3, ovn_hash_ring_db.count_offline_nodes(
|
||||
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
|
||||
|
||||
# Touch the nodes again without faking utcnow()
|
||||
ovn_hash_ring_db.touch_nodes_from_host(self.admin_ctx,
|
||||
HASH_RING_TEST_GROUP)
|
||||
|
||||
# Assert no nodes are considered offline
|
||||
self.assertEqual(0, ovn_hash_ring_db.count_offline_nodes(
|
||||
self.admin_ctx, interval=60, group_name=HASH_RING_TEST_GROUP))
|
||||
|
Loading…
x
Reference in New Issue
Block a user