Fix a race condition in the hash ring code

We're handling hash rings and updated_at differently: one
is stored on the class level, the other - on instance. Apparently,
there is a race there, resulting in updated_at never updated.
Store hash rings and updated_at in one tuple, so that they're
always loaded and stored together.

Also remove double loading of the hash ring in _get_ring that
could contribute to the problem.

Change-Id: Ib659014e07549ae3d5ec7e69da318301f5994ca8
This commit is contained in:
Dmitry Tantsur 2020-09-25 14:09:43 +02:00
parent ac19e6050d
commit d3872cfcdd
3 changed files with 24 additions and 16 deletions

View File

@ -29,12 +29,11 @@ LOG = log.getLogger(__name__)
class HashRingManager(object): class HashRingManager(object):
_hash_rings = None _hash_rings = (None, 0)
_lock = threading.Lock() _lock = threading.Lock()
def __init__(self, use_groups=True, cache=True): def __init__(self, use_groups=True, cache=True):
self.dbapi = dbapi.get_instance() self.dbapi = dbapi.get_instance()
self.updated_at = time.time()
self.use_groups = use_groups self.use_groups = use_groups
self.cache = cache self.cache = cache
@ -48,19 +47,19 @@ class HashRingManager(object):
# Hot path, no lock. Using a local variable to avoid races with code # Hot path, no lock. Using a local variable to avoid races with code
# changing the class variable. # changing the class variable.
hash_rings = self.__class__._hash_rings hash_rings, updated_at = self.__class__._hash_rings
if hash_rings is not None and self.updated_at >= limit: if hash_rings is not None and updated_at >= limit:
return hash_rings return hash_rings
with self._lock: with self._lock:
if self.__class__._hash_rings is None or self.updated_at < limit: hash_rings, updated_at = self.__class__._hash_rings
if hash_rings is None or updated_at < limit:
LOG.debug('Rebuilding cached hash rings') LOG.debug('Rebuilding cached hash rings')
rings = self._load_hash_rings() hash_rings = self._load_hash_rings()
self.__class__._hash_rings = rings self.__class__._hash_rings = hash_rings, time.time()
self.updated_at = time.time()
LOG.debug('Finished rebuilding hash rings, available drivers ' LOG.debug('Finished rebuilding hash rings, available drivers '
'are %s', ', '.join(rings)) 'are %s', ', '.join(hash_rings))
return self.__class__._hash_rings return hash_rings
def _load_hash_rings(self): def _load_hash_rings(self):
rings = {} rings = {}
@ -78,7 +77,7 @@ class HashRingManager(object):
def reset(cls): def reset(cls):
with cls._lock: with cls._lock:
LOG.debug('Resetting cached hash rings') LOG.debug('Resetting cached hash rings')
cls._hash_rings = None cls._hash_rings = (None, 0)
def get_ring(self, driver_name, conductor_group): def get_ring(self, driver_name, conductor_group):
try: try:
@ -96,13 +95,14 @@ class HashRingManager(object):
def _get_ring(self, driver_name, conductor_group): def _get_ring(self, driver_name, conductor_group):
# There are no conductors, temporary failure - 503 Service Unavailable # There are no conductors, temporary failure - 503 Service Unavailable
if not self.ring: ring = self.ring # a property, don't load twice
if not ring:
raise exception.TemporaryFailure() raise exception.TemporaryFailure()
try: try:
if self.use_groups: if self.use_groups:
return self.ring['%s:%s' % (conductor_group, driver_name)] return ring['%s:%s' % (conductor_group, driver_name)]
return self.ring[driver_name] return ring[driver_name]
except KeyError: except KeyError:
raise exception.DriverNotFound( raise exception.DriverNotFound(
_("The driver '%s' is unknown.") % driver_name) _("The driver '%s' is unknown.") % driver_name)

View File

@ -112,7 +112,10 @@ class HashRingManagerTestCase(db_base.DbTestCase):
# since there is an active conductor for the requested hardware type. # since there is an active conductor for the requested hardware type.
self.assertEqual(1, len(ring)) self.assertEqual(1, len(ring))
self.ring_manager.updated_at = time.time() - 31 self.ring_manager.__class__._hash_rings = (
self.ring_manager.__class__._hash_rings[0],
time.time() - 31
)
ring = self.ring_manager.get_ring('hardware-type', '') ring = self.ring_manager.get_ring('hardware-type', '')
self.assertEqual(2, len(ring)) self.assertEqual(2, len(ring))
@ -121,7 +124,7 @@ class HashRingManagerTestCase(db_base.DbTestCase):
use_groups=self.use_groups) use_groups=self.use_groups)
ring = ring_mgr.ring ring = ring_mgr.ring
self.assertIsNotNone(ring) self.assertIsNotNone(ring)
self.assertIsNone(hash_ring.HashRingManager._hash_rings) self.assertEqual((None, 0), hash_ring.HashRingManager._hash_rings)
class HashRingManagerWithGroupsTestCase(HashRingManagerTestCase): class HashRingManagerWithGroupsTestCase(HashRingManagerTestCase):

View File

@ -0,0 +1,5 @@
---
fixes:
- |
Fixes a potential race in the hash ring code that could result in the
hash rings never updated after their initial load.