From e4f1b9b65e916ca726b53db8eb8f1f2bb24d4291 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 13 Oct 2021 10:35:02 -0700 Subject: [PATCH] Fix infinite recursion in getRelatedChanges It's possible to have a loop in change relationships (whether accidentally or intentionally). The current method of getting related changes in order to prune the change cache does not handle that. This reworks that method to be safe for recursion. During development, it was observed that the set of relevant changes was actually a mixture of cache keys (which are objects) and cache references (which are string representations of the same). This change also refactors the collection of relevant cache keys into a method which can be tested on its own to verify we get keys of a consistent type. Change-Id: Ie76f9b19f10b053f5c72bdaf7be3efc445b53639 --- .../change-cache-bugfix-bcf9cb5da3ec2392.yaml | 5 ++++ tests/unit/test_scheduler.py | 10 +++++++ zuul/model.py | 28 +++++++++++-------- zuul/scheduler.py | 12 ++++---- zuul/zk/change_cache.py | 7 +++++ 5 files changed, 45 insertions(+), 17 deletions(-) create mode 100644 releasenotes/notes/change-cache-bugfix-bcf9cb5da3ec2392.yaml diff --git a/releasenotes/notes/change-cache-bugfix-bcf9cb5da3ec2392.yaml b/releasenotes/notes/change-cache-bugfix-bcf9cb5da3ec2392.yaml new file mode 100644 index 0000000000..91be05fb47 --- /dev/null +++ b/releasenotes/notes/change-cache-bugfix-bcf9cb5da3ec2392.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + A bug which prevented change cache cleanup (and therefore caused + ZooKeeper usage to grow without limits) was fixed. diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index dec7d6f829..fb822f50e2 100644 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -48,6 +48,7 @@ from tests.base import ( TestConnectionRegistry, FIXTURE_DIR, ) +from zuul.zk.change_cache import ChangeKey from zuul.zk.layout import LayoutState EMPTY_LAYOUT_STATE = LayoutState("", "", 0) @@ -1409,6 +1410,15 @@ class TestScheduler(ZuulTestCase): sched.maintainConnectionCache() self.assertEqual(len(_getCachedChanges()), 2) + # Test this method separately to make sure we are getting + # cache keys of the correct type, since we don't do run-time + # validation. + relevant = sched._gatherConnectionCacheKeys() + self.assertEqual(len(relevant), 2) + for k in relevant: + if not isinstance(k, ChangeKey): + raise RuntimeError("Cache key %s is not a ChangeKey" % repr(k)) + self.hold_jobs_in_queue = False self.executor_api.release() self.waitUntilSettled() diff --git a/zuul/model.py b/zuul/model.py index b2449ebe02..700cf66817 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -3672,8 +3672,8 @@ class Ref(object): def isUpdateOf(self, other): return False - def getRelatedChanges(self, sched): - return set() + def getRelatedChanges(self, sched, related): + pass def updatesConfig(self, tenant): tpc = tenant.project_configs.get(self.project.canonical_name) @@ -3879,17 +3879,21 @@ class Change(Branch): return True return False - def getRelatedChanges(self, sched): - related = set() - for reference in self.needs_changes: - related.add(reference) - for reference in self.needed_by_changes: - related.add(reference) + def getRelatedChanges(self, sched, related): + """Recursively update a set of related changes + + :arg Scheduler sched: The scheduler instance + :arg set related: The cache keys of changes which have been inspected + so far. Will be updated with additional changes by this method. + """ + related.add(self.cache_stat.key) + for reference in itertools.chain(self.needs_changes, + self.needed_by_changes): key = ChangeKey.fromReference(reference) - source = sched.connections.getSource(key.connection_name) - change = source.getChangeByKey(key) - related.update(change.getRelatedChanges(sched)) - return related + if key not in related: + source = sched.connections.getSource(key.connection_name) + change = source.getChangeByKey(key) + change.getRelatedChanges(sched, related) def getSafeAttributes(self): return Attributes(project=self.project, diff --git a/zuul/scheduler.py b/zuul/scheduler.py index f0b8a93af5..e0121b3812 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -1616,18 +1616,20 @@ class Scheduler(threading.Thread): else: pipeline.state = pipeline.STATE_NORMAL - def maintainConnectionCache(self): + def _gatherConnectionCacheKeys(self): relevant = set() - self.log.debug("Starting connection cache maintenance") with self.layout_lock: for tenant in self.abide.tenants.values(): for pipeline in tenant.layout.pipelines.values(): self.log.debug("Gather relevant cache items for: %s %s", tenant.name, pipeline.name) for item in pipeline.getAllItems(): - relevant.add(item.change.cache_stat.key) - relevant.update( - item.change.getRelatedChanges(self)) + item.change.getRelatedChanges(self, relevant) + return relevant + + def maintainConnectionCache(self): + self.log.debug("Starting connection cache maintenance") + relevant = self._gatherConnectionCacheKeys() # We'll only remove changes older than `max_age` from the cache, as # it may take a while for an event that was processed by a connection diff --git a/zuul/zk/change_cache.py b/zuul/zk/change_cache.py index 1818ac9ff8..69f806ebce 100644 --- a/zuul/zk/change_cache.py +++ b/zuul/zk/change_cache.py @@ -85,6 +85,13 @@ class ChangeKey: msg = self.reference.encode('utf8') self._hash = hashlib.sha256(msg).hexdigest() + def __hash__(self): + return hash(self.reference) + + def __eq__(self, other): + return (isinstance(other, ChangeKey) and + self.reference == other.reference) + @classmethod def fromReference(cls, data): data = json.loads(data)