diff --git a/ironic/common/hash_ring.py b/ironic/common/hash_ring.py index 61e1b385f0..81050e7a80 100644 --- a/ironic/common/hash_ring.py +++ b/ironic/common/hash_ring.py @@ -16,6 +16,7 @@ import threading import time +from oslo_log import log from tooz import hashring from ironic.common import exception @@ -24,6 +25,9 @@ from ironic.conf import CONF from ironic.db import api as dbapi +LOG = log.getLogger(__name__) + + class HashRingManager(object): _hash_rings = None _lock = threading.Lock() @@ -36,15 +40,20 @@ class HashRingManager(object): def ring(self): interval = CONF.hash_ring_reset_interval limit = time.time() - interval - # Hot path, no lock - if self.__class__._hash_rings is not None and self.updated_at >= limit: - return self.__class__._hash_rings + # Hot path, no lock. Using a local variable to avoid races with code + # changing the class variable. + hash_rings = self.__class__._hash_rings + if hash_rings is not None and self.updated_at >= limit: + return hash_rings with self._lock: if self.__class__._hash_rings is None or self.updated_at < limit: + LOG.debug('Rebuilding cached hash rings') rings = self._load_hash_rings() self.__class__._hash_rings = rings self.updated_at = time.time() + LOG.debug('Finished rebuilding hash rings, available drivers ' + 'are %s', ', '.join(rings)) return self.__class__._hash_rings def _load_hash_rings(self): @@ -60,9 +69,27 @@ class HashRingManager(object): @classmethod def reset(cls): with cls._lock: + LOG.debug('Resetting cached hash rings') cls._hash_rings = None def __getitem__(self, driver_name): + try: + return self._get_ring(driver_name) + except (exception.DriverNotFound, exception.TemporaryFailure): + # NOTE(dtantsur): we assume that this case is more often caused by + # conductors coming and leaving, so we try to rebuild the rings. + LOG.debug('No conductor found for driver %(driver)s, trying ' + 'to rebuild the hash rings', + {'driver': driver_name}) + + self.__class__.reset() + return self._get_ring(driver_name) + + def _get_ring(self, driver_name): + # There are no conductors, temporary failure - 503 Service Unavailable + if not self.ring: + raise exception.TemporaryFailure() + try: return self.ring[driver_name] except KeyError: diff --git a/ironic/conductor/rpcapi.py b/ironic/conductor/rpcapi.py index 1894690307..02afe5be85 100644 --- a/ironic/conductor/rpcapi.py +++ b/ironic/conductor/rpcapi.py @@ -126,12 +126,6 @@ class ConductorAPI(object): :raises: NoValidHost """ - self.ring_manager.reset() - - # There are no conductors, temporary failure - 503 Service Unavailable - if not self.ring_manager.ring: - raise exception.TemporaryFailure() - try: ring = self.ring_manager[node.driver] dest = ring.get_nodes(node.uuid.encode('utf-8'), @@ -154,9 +148,13 @@ class ConductorAPI(object): :raises: DriverNotFound """ - self.ring_manager.reset() + try: + ring = self.ring_manager[driver_name] + except exception.TemporaryFailure: + # NOTE(dtantsur): even if no conductors are registered, it makes + # sense to report 404 on any driver request. + raise exception.DriverNotFound(_("No conductors registered.")) - ring = self.ring_manager[driver_name] host = random.choice(list(ring.nodes)) return self.topic + "." + host diff --git a/ironic/conf/default.py b/ironic/conf/default.py index 6611b96889..27472c6833 100644 --- a/ironic/conf/default.py +++ b/ironic/conf/default.py @@ -191,8 +191,10 @@ hash_opts = [ 'and potentially allow the Ironic cluster to recover ' 'more quickly if a conductor instance is terminated.')), cfg.IntOpt('hash_ring_reset_interval', - default=180, - help=_('Interval (in seconds) between hash ring resets.')), + default=15, + help=_('Time (in seconds) after which the hash ring is ' + 'considered outdated and is refreshed on the next ' + 'access.')), ] image_opts = [ diff --git a/ironic/tests/unit/common/test_hash_ring.py b/ironic/tests/unit/common/test_hash_ring.py index 94cfbb56a6..88df94af1a 100644 --- a/ironic/tests/unit/common/test_hash_ring.py +++ b/ironic/tests/unit/common/test_hash_ring.py @@ -59,25 +59,34 @@ class HashRingManagerTestCase(db_base.DbTestCase): self.ring_manager.__getitem__, 'driver3') - def test_hash_ring_manager_no_refresh(self): - # If a new conductor is registered after the ring manager is - # initialized, it won't be seen. Long term this is probably - # undesirable, but today is the intended behavior. - self.assertRaises(exception.DriverNotFound, + def test_hash_ring_manager_automatic_retry(self): + self.assertRaises(exception.TemporaryFailure, self.ring_manager.__getitem__, 'driver1') self.register_conductors() - self.assertRaises(exception.DriverNotFound, - self.ring_manager.__getitem__, - 'driver1') + self.assertIsNotNone(self.ring_manager['driver1']) - def test_hash_ring_manager_refresh(self): + def test_hash_ring_manager_reset_interval(self): CONF.set_override('hash_ring_reset_interval', 30) - # Initialize the ring manager to make _hash_rings not None, then - # hash ring will refresh only when time interval exceeded. - self.assertRaises(exception.DriverNotFound, - self.ring_manager.__getitem__, - 'driver1') - self.register_conductors() + # Just to simplify calculations + CONF.set_override('hash_partition_exponent', 0) + self.dbapi.register_conductor({ + 'hostname': 'host1', + 'drivers': ['driver1', 'driver2'], + }) + + ring = self.ring_manager['driver1'] + self.assertEqual(1, len(ring)) + + self.dbapi.register_conductor({ + 'hostname': 'host2', + 'drivers': ['driver1'], + }) + ring = self.ring_manager['driver1'] + # The new conductor is not known yet. Automatic retry does not kick in, + # since there is an active conductor for the requested driver. + self.assertEqual(1, len(ring)) + self.ring_manager.updated_at = time.time() - 31 - self.ring_manager.__getitem__('driver1') + ring = self.ring_manager['driver1'] + self.assertEqual(2, len(ring)) diff --git a/releasenotes/notes/hash-ring-race-da0d584de1f46788.yaml b/releasenotes/notes/hash-ring-race-da0d584de1f46788.yaml new file mode 100644 index 0000000000..b47b1959e2 --- /dev/null +++ b/releasenotes/notes/hash-ring-race-da0d584de1f46788.yaml @@ -0,0 +1,13 @@ +--- +fixes: + - | + Fixes a race condition in the hash ring implementation that could cause + an internal server error on any request. See `story 2003966 + `_ for details. +upgrade: + - | + The ``hash_ring_reset_interval`` configuration option was changed from 180 + to 15 seconds. Previously, this option was essentially ignored on the API + side, becase the hash ring was reset on each API access. The lower value + minimizes the probability of a request routed to a wrong conductor when the + ring needs rebalancing.