Browse Source

Hash Ring: Make workers health check more reliable

When under pressure the maintenance thread may not honor the time interval
that it should run its tasks. For something more sensitive like the task
that health checks the nodes from the Hash Ring this can be problematic
because it may cause it to appear that the workers have died and the
ring will get rebalanced; we do not want to have a long interval for it
because in case of a real failure we do want the ring to rebalance fast.

This patch implements a new approach to enhance the health check of the
Hash Ring workers by allowing the worker to "touch" the ring to say that
its alive.

The patch also includes unittests for the OvnIdlDistributedLock class
that was missing before.

Closes-Bug: #1834498
Change-Id: If06d8580e8e2637b19ff5d76e16f635a5c1d328e
Signed-off-by: Lucas Alvares Gomes <lucasagomes@gmail.com>
changes/53/667953/11
Lucas Alvares Gomes 3 years ago
parent
commit
f93638b042
  1. 1
      networking_ovn/common/constants.py
  2. 5
      networking_ovn/common/maintenance.py
  3. 18
      networking_ovn/db/hash_ring.py
  4. 20
      networking_ovn/ovsdb/ovsdb_monitor.py
  5. 20
      networking_ovn/tests/unit/db/test_hash_ring.py
  6. 82
      networking_ovn/tests/unit/ovsdb/test_ovsdb_monitor.py

1
networking_ovn/common/constants.py

@ -155,4 +155,5 @@ DEFAULT_ADDR_FOR_LSP_WITH_PEER = 'router'
# Hash Ring constants
HASH_RING_NODES_TIMEOUT = 60
HASH_RING_TOUCH_INTERVAL = 30
HASH_RING_CACHE_TIMEOUT = 30

5
networking_ovn/common/maintenance.py

@ -386,10 +386,7 @@ class DBInconsistenciesPeriodics(object):
raise periodics.NeverAgain()
# The static spacing value here is half of the
# HASH_RING_NODES_TIMEOUT, we want to be able to try to touch the nodes
# at least twice before they are considered dead.
@periodics.periodic(spacing=ovn_const.HASH_RING_NODES_TIMEOUT / 2)
@periodics.periodic(spacing=ovn_const.HASH_RING_TOUCH_INTERVAL)
def touch_hash_ring_nodes(self):
# NOTE(lucasagomes): Note that we do not rely on the OVSDB lock
# here because we want the maintenance tasks from each instance to

18
networking_ovn/db/hash_ring.py

@ -43,11 +43,25 @@ def remove_nodes_from_host():
hostname=CONF.host).delete()
def touch_nodes_from_host():
def _touch(hostname=None, node_uuid=None):
filter_args = {}
if hostname:
filter_args['hostname'] = hostname
if node_uuid:
filter_args['node_uuid'] = node_uuid
session = db_api.get_writer_session()
with session.begin():
session.query(models.OVNHashRing).filter_by(
hostname=CONF.host).update({'updated_at': timeutils.utcnow()})
**filter_args).update({'updated_at': timeutils.utcnow()})
def touch_nodes_from_host():
_touch(hostname=CONF.host)
def touch_node(node_uuid):
_touch(node_uuid=node_uuid)
def get_active_nodes(interval, from_host=False):

20
networking_ovn/ovsdb/ovsdb_monitor.py

@ -13,12 +13,14 @@
# under the License.
import abc
import datetime
from neutron_lib.plugins import constants
from neutron_lib.plugins import directory
from neutron_lib.utils import helpers
from oslo_config import cfg
from oslo_log import log
from oslo_utils import timeutils
from ovs.stream import Stream
from ovsdbapp.backend.ovs_idl import connection
from ovsdbapp.backend.ovs_idl import event as row_event
@ -31,6 +33,7 @@ from networking_ovn.common import constants as ovn_const
from networking_ovn.common import exceptions
from networking_ovn.common import hash_ring_manager
from networking_ovn.common import utils
from networking_ovn.db import hash_ring as db_hash_ring
CONF = cfg.CONF
LOG = log.getLogger(__name__)
@ -350,6 +353,7 @@ class OvnIdlDistributedLock(BaseOvnIdl):
self.notify_handler = OvnDbNotifyHandler(driver)
self._hash_ring = hash_ring_manager.HashRingManager()
self._node_uuid = self.driver.node_uuid
self._last_touch = None
def notify(self, event, row, updates=None):
try:
@ -361,6 +365,22 @@ class OvnIdlDistributedLock(BaseOvnIdl):
if target_node != self._node_uuid:
return
# If the worker hasn't been health checked by the maintenance
# thread (see bug #1834498), indicate that it's alive here
time_now = timeutils.utcnow()
touch_timeout = time_now - datetime.timedelta(
seconds=ovn_const.HASH_RING_TOUCH_INTERVAL)
if not self._last_touch or touch_timeout >= self._last_touch:
# NOTE(lucasagomes): Guard the db operation with an exception
# handler. If heartbeating fails for whatever reason, log
# the error and continue with processing the event
try:
db_hash_ring.touch_node(self._node_uuid)
self._last_touch = time_now
except Exception as e:
LOG.exception('Hash Ring node %s failed to heartbeat',
self._node_uuid)
LOG.debug('Hash Ring: Node %(node)s (host: %(hostname)s) '
'handling event "%(event)s" for row %(row)s '
'(table: %(table)s)',

20
networking_ovn/tests/unit/db/test_hash_ring.py

@ -131,3 +131,23 @@ class TestHashRing(db_base.DBTestCase):
from_host=True)
self.assertEqual(3, len(active_nodes))
self.assertNotIn(another_host_id, active_nodes)
def test_touch_node(self):
nodes = self._add_nodes_and_assert_exists(count=3)
# Assert no nodes were updated yet
for node in nodes:
node_db = self._get_node_row(node)
self.assertEqual(node_db.created_at, node_db.updated_at)
# Touch one of the nodes
db_hash_ring.touch_node(nodes[0])
# Assert it has been updated
node_db = self._get_node_row(nodes[0])
self.assertGreater(node_db.updated_at, node_db.created_at)
# Assert the other two nodes hasn't been updated
for node in nodes[1:]:
node_db = self._get_node_row(node)
self.assertEqual(node_db.created_at, node_db.updated_at)

82
networking_ovn/tests/unit/ovsdb/test_ovsdb_monitor.py

@ -13,11 +13,13 @@
# under the License.
import copy
import datetime
import os
import mock
from neutron_lib.plugins import constants
from neutron_lib.plugins import directory
from oslo_utils import timeutils
from oslo_utils import uuidutils
from ovs.db import idl as ovs_idl
from ovs import poller
@ -27,7 +29,9 @@ from ovsdbapp.backend.ovs_idl import idlutils
from networking_ovn.common import config as ovn_config
from networking_ovn.common import constants as ovn_const
from networking_ovn.common import hash_ring_manager
from networking_ovn.common import utils
from networking_ovn.db import hash_ring as db_hash_ring
from networking_ovn.ovsdb import ovsdb_monitor
from networking_ovn.tests import base
from networking_ovn.tests.unit import fakes
@ -424,3 +428,81 @@ class TestChassisMetadataAgentEvent(base.TestCase):
ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY: '1'}})
self.assertTrue(self.event.match_fn(ROW_UPDATE, row, old=old))
class TestOvnIdlDistributedLock(base.TestCase):
def setUp(self):
super(TestOvnIdlDistributedLock, self).setUp()
self.node_uuid = uuidutils.generate_uuid()
self.fake_driver = mock.Mock()
self.fake_driver.node_uuid = self.node_uuid
self.fake_event = 'fake-event'
self.fake_row = fakes.FakeOvsdbRow.create_one_ovsdb_row(
attrs={'_table': mock.Mock(name='FakeTable')})
helper = ovs_idl.SchemaHelper(schema_json=OVN_NB_SCHEMA)
helper.register_all()
with mock.patch.object(ovsdb_monitor, 'OvnDbNotifyHandler'):
self.idl = ovsdb_monitor.OvnIdlDistributedLock(
self.fake_driver, 'punix:/tmp/fake', helper)
self.mock_get_node = mock.patch.object(
hash_ring_manager.HashRingManager,
'get_node', return_value=self.node_uuid).start()
@mock.patch.object(db_hash_ring, 'touch_node')
def test_notify(self, mock_touch_node):
self.idl.notify(self.fake_event, self.fake_row)
mock_touch_node.assert_called_once_with(self.node_uuid)
self.idl.notify_handler.notify.assert_called_once_with(
self.fake_event, self.fake_row, None)
@mock.patch.object(db_hash_ring, 'touch_node')
def test_notify_skip_touch_node(self, mock_touch_node):
# Set a time for last touch
self.idl._last_touch = timeutils.utcnow()
self.idl.notify(self.fake_event, self.fake_row)
# Assert that touch_node() wasn't called
self.assertFalse(mock_touch_node.called)
self.idl.notify_handler.notify.assert_called_once_with(
self.fake_event, self.fake_row, None)
@mock.patch.object(db_hash_ring, 'touch_node')
def test_notify_last_touch_expired(self, mock_touch_node):
# Set a time for last touch
self.idl._last_touch = timeutils.utcnow()
# Let's expire the touch node interval for the next utcnow()
with mock.patch.object(timeutils, 'utcnow') as mock_utcnow:
mock_utcnow.return_value = (
self.idl._last_touch + datetime.timedelta(
seconds=ovn_const.HASH_RING_TOUCH_INTERVAL + 1))
self.idl.notify(self.fake_event, self.fake_row)
# Assert that touch_node() was invoked
mock_touch_node.assert_called_once_with(self.node_uuid)
self.idl.notify_handler.notify.assert_called_once_with(
self.fake_event, self.fake_row, None)
@mock.patch.object(ovsdb_monitor.LOG, 'exception')
@mock.patch.object(db_hash_ring, 'touch_node')
def test_notify_touch_node_exception(self, mock_touch_node, mock_log):
mock_touch_node.side_effect = Exception('BoOooOmmMmmMm')
self.idl.notify(self.fake_event, self.fake_row)
# Assert that in an eventual failure on touch_node() the event
# will continue to be processed by notify_handler.notify()
mock_touch_node.assert_called_once_with(self.node_uuid)
# Assert we are logging the exception
self.assertTrue(mock_log.called)
self.idl.notify_handler.notify.assert_called_once_with(
self.fake_event, self.fake_row, None)
def test_notify_different_node(self):
self.mock_get_node.return_value = 'different-node-uuid'
self.idl.notify('fake-event', self.fake_row)
# Assert that notify() wasn't called for a different node uuid
self.assertFalse(self.idl.notify_handler.notify.called)

Loading…
Cancel
Save