Fix infinite recursion in getRelatedChanges

It's possible to have a loop in change relationships (whether accidentally
or intentionally).  The current method of getting related changes in order
to prune the change cache does not handle that.

This reworks that method to be safe for recursion.

During development, it was observed that the set of relevant changes was
actually a mixture of cache keys (which are objects) and cache references
(which are string representations of the same).  This change also refactors
the collection of relevant cache keys into a method which can be tested on
its own to verify we get keys of a consistent type.

Change-Id: Ie76f9b19f10b053f5c72bdaf7be3efc445b53639
This commit is contained in:
James E. Blair 2021-10-13 10:35:02 -07:00
parent 51d4fe5587
commit e4f1b9b65e
5 changed files with 45 additions and 17 deletions

View File

@ -0,0 +1,5 @@
---
fixes:
- |
A bug which prevented change cache cleanup (and therefore caused
ZooKeeper usage to grow without limits) was fixed.

View File

@ -48,6 +48,7 @@ from tests.base import (
TestConnectionRegistry, TestConnectionRegistry,
FIXTURE_DIR, FIXTURE_DIR,
) )
from zuul.zk.change_cache import ChangeKey
from zuul.zk.layout import LayoutState from zuul.zk.layout import LayoutState
EMPTY_LAYOUT_STATE = LayoutState("", "", 0) EMPTY_LAYOUT_STATE = LayoutState("", "", 0)
@ -1409,6 +1410,15 @@ class TestScheduler(ZuulTestCase):
sched.maintainConnectionCache() sched.maintainConnectionCache()
self.assertEqual(len(_getCachedChanges()), 2) self.assertEqual(len(_getCachedChanges()), 2)
# Test this method separately to make sure we are getting
# cache keys of the correct type, since we don't do run-time
# validation.
relevant = sched._gatherConnectionCacheKeys()
self.assertEqual(len(relevant), 2)
for k in relevant:
if not isinstance(k, ChangeKey):
raise RuntimeError("Cache key %s is not a ChangeKey" % repr(k))
self.hold_jobs_in_queue = False self.hold_jobs_in_queue = False
self.executor_api.release() self.executor_api.release()
self.waitUntilSettled() self.waitUntilSettled()

View File

@ -3672,8 +3672,8 @@ class Ref(object):
def isUpdateOf(self, other): def isUpdateOf(self, other):
return False return False
def getRelatedChanges(self, sched): def getRelatedChanges(self, sched, related):
return set() pass
def updatesConfig(self, tenant): def updatesConfig(self, tenant):
tpc = tenant.project_configs.get(self.project.canonical_name) tpc = tenant.project_configs.get(self.project.canonical_name)
@ -3879,17 +3879,21 @@ class Change(Branch):
return True return True
return False return False
def getRelatedChanges(self, sched): def getRelatedChanges(self, sched, related):
related = set() """Recursively update a set of related changes
for reference in self.needs_changes:
related.add(reference) :arg Scheduler sched: The scheduler instance
for reference in self.needed_by_changes: :arg set related: The cache keys of changes which have been inspected
related.add(reference) so far. Will be updated with additional changes by this method.
"""
related.add(self.cache_stat.key)
for reference in itertools.chain(self.needs_changes,
self.needed_by_changes):
key = ChangeKey.fromReference(reference) key = ChangeKey.fromReference(reference)
source = sched.connections.getSource(key.connection_name) if key not in related:
change = source.getChangeByKey(key) source = sched.connections.getSource(key.connection_name)
related.update(change.getRelatedChanges(sched)) change = source.getChangeByKey(key)
return related change.getRelatedChanges(sched, related)
def getSafeAttributes(self): def getSafeAttributes(self):
return Attributes(project=self.project, return Attributes(project=self.project,

View File

@ -1616,18 +1616,20 @@ class Scheduler(threading.Thread):
else: else:
pipeline.state = pipeline.STATE_NORMAL pipeline.state = pipeline.STATE_NORMAL
def maintainConnectionCache(self): def _gatherConnectionCacheKeys(self):
relevant = set() relevant = set()
self.log.debug("Starting connection cache maintenance")
with self.layout_lock: with self.layout_lock:
for tenant in self.abide.tenants.values(): for tenant in self.abide.tenants.values():
for pipeline in tenant.layout.pipelines.values(): for pipeline in tenant.layout.pipelines.values():
self.log.debug("Gather relevant cache items for: %s %s", self.log.debug("Gather relevant cache items for: %s %s",
tenant.name, pipeline.name) tenant.name, pipeline.name)
for item in pipeline.getAllItems(): for item in pipeline.getAllItems():
relevant.add(item.change.cache_stat.key) item.change.getRelatedChanges(self, relevant)
relevant.update( return relevant
item.change.getRelatedChanges(self))
def maintainConnectionCache(self):
self.log.debug("Starting connection cache maintenance")
relevant = self._gatherConnectionCacheKeys()
# We'll only remove changes older than `max_age` from the cache, as # We'll only remove changes older than `max_age` from the cache, as
# it may take a while for an event that was processed by a connection # it may take a while for an event that was processed by a connection

View File

@ -85,6 +85,13 @@ class ChangeKey:
msg = self.reference.encode('utf8') msg = self.reference.encode('utf8')
self._hash = hashlib.sha256(msg).hexdigest() self._hash = hashlib.sha256(msg).hexdigest()
def __hash__(self):
return hash(self.reference)
def __eq__(self, other):
return (isinstance(other, ChangeKey) and
self.reference == other.reference)
@classmethod @classmethod
def fromReference(cls, data): def fromReference(cls, data):
data = json.loads(data) data = json.loads(data)