From 173b59abbb22de9fadb9c52c4429301a65095f64 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 6 Jun 2023 16:14:42 -0700 Subject: [PATCH] Correct node cache lock handling A logic error in the TreeCache prevented us from correctly re-establishing the state of node locks after a zk connection loss. In such a case, we need to fetch data for every item in the tree to determine whether it exists, but we were only fetching data for cache objects, not ancillary znodes like lock nodes. Change-Id: I4e9faa4981b90a6b1f46d2f0347866e2166d5b7f --- nodepool/tests/unit/test_zk.py | 27 +++++++++++++++++++++++++++ nodepool/zk/zookeeper.py | 10 ++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/nodepool/tests/unit/test_zk.py b/nodepool/tests/unit/test_zk.py index 47d25cc53..fabc9050f 100644 --- a/nodepool/tests/unit/test_zk.py +++ b/nodepool/tests/unit/test_zk.py @@ -1401,3 +1401,30 @@ class TestTreeCache(tests.DBTestCase): if not checked and test['kind']: raise Exception("Unhandled kind %s" % (test['kind'])) + + def test_node_cache_lock_handling(self): + my_zk = zk.ZooKeeper(self.zk.client, enable_cache=True) + node = zk.Node() + node.state = zk.BUILDING + node.provider = 'rax' + my_zk.storeNode(node) + my_zk.lockNode(node) + + for _ in iterate_timeout(10, Exception, 'cache to sync', interval=0.1): + cached_node = my_zk.getNode(node.id, cached=True, only_cached=True) + if len(cached_node.lock_contenders): + break + self.assertEqual(len(cached_node.lock_contenders), 1) + + my_zk._node_cache._ready.clear() + my_zk._node_cache._start() + my_zk._node_cache._event_queue.join() + my_zk._node_cache._playback_queue.join() + + self.assertEqual(len(cached_node.lock_contenders), 1) + my_zk.unlockNode(node) + + for _ in iterate_timeout(10, Exception, 'cache to sync', interval=0.1): + if not len(cached_node.lock_contenders): + break + self.assertEqual(len(cached_node.lock_contenders), 0) diff --git a/nodepool/zk/zookeeper.py b/nodepool/zk/zookeeper.py index 36a707612..2530d4c4b 100644 --- a/nodepool/zk/zookeeper.py +++ b/nodepool/zk/zookeeper.py @@ -835,6 +835,7 @@ class NodepoolTreeCache(abc.ABC): while not self._stopped: event = self._event_queue.get() if event is None: + self._event_queue.task_done() continue qsize = self._event_queue.qsize() @@ -849,6 +850,7 @@ class NodepoolTreeCache(abc.ABC): self._handleCacheEvent(event) except Exception: self.log.exception("Error handling event %s:", event) + self._event_queue.task_done() def _handleCacheEvent(self, event): # Ignore root node since we don't maintain a cached object for @@ -869,9 +871,11 @@ class NodepoolTreeCache(abc.ABC): fetch = False key = self.parsePath(event.path) - if key is None: + if key is None and event.type != EventType.NONE: # The cache doesn't care about this path, so we don't need - # to fetch. + # to fetch (unless the type is none (re-initialization) in + # which case we always need to fetch in order to determine + # existence). fetch = False if fetch: @@ -884,6 +888,7 @@ class NodepoolTreeCache(abc.ABC): while not self._stopped: item = self._playback_queue.get() if item is None: + self._playback_queue.task_done() continue qsize = self._playback_queue.qsize() @@ -900,6 +905,7 @@ class NodepoolTreeCache(abc.ABC): self._handlePlayback(event, future, key) except Exception: self.log.exception("Error playing back event %s:", event) + self._playback_queue.task_done() def _handlePlayback(self, event, future, key): self.event_log.debug("Cache playback event %s", event)