Browse Source

Fix infinite recursion in getRelatedChanges

It's possible to have a loop in change relationships (whether accidentally
or intentionally).  The current method of getting related changes in order
to prune the change cache does not handle that.

This reworks that method to be safe for recursion.

During development, it was observed that the set of relevant changes was
actually a mixture of cache keys (which are objects) and cache references
(which are string representations of the same).  This change also refactors
the collection of relevant cache keys into a method which can be tested on
its own to verify we get keys of a consistent type.

Change-Id: Ie76f9b19f10b053f5c72bdaf7be3efc445b53639
changes/78/813878/2
James E. Blair 2 weeks ago
parent
commit
e4f1b9b65e
  1. 5
      releasenotes/notes/change-cache-bugfix-bcf9cb5da3ec2392.yaml
  2. 10
      tests/unit/test_scheduler.py
  3. 28
      zuul/model.py
  4. 12
      zuul/scheduler.py
  5. 7
      zuul/zk/change_cache.py

5
releasenotes/notes/change-cache-bugfix-bcf9cb5da3ec2392.yaml

@ -0,0 +1,5 @@
---
fixes:
- |
A bug which prevented change cache cleanup (and therefore caused
ZooKeeper usage to grow without limits) was fixed.

10
tests/unit/test_scheduler.py

@ -48,6 +48,7 @@ from tests.base import (
TestConnectionRegistry,
FIXTURE_DIR,
)
from zuul.zk.change_cache import ChangeKey
from zuul.zk.layout import LayoutState
EMPTY_LAYOUT_STATE = LayoutState("", "", 0)
@ -1409,6 +1410,15 @@ class TestScheduler(ZuulTestCase):
sched.maintainConnectionCache()
self.assertEqual(len(_getCachedChanges()), 2)
# Test this method separately to make sure we are getting
# cache keys of the correct type, since we don't do run-time
# validation.
relevant = sched._gatherConnectionCacheKeys()
self.assertEqual(len(relevant), 2)
for k in relevant:
if not isinstance(k, ChangeKey):
raise RuntimeError("Cache key %s is not a ChangeKey" % repr(k))
self.hold_jobs_in_queue = False
self.executor_api.release()
self.waitUntilSettled()

28
zuul/model.py

@ -3672,8 +3672,8 @@ class Ref(object):
def isUpdateOf(self, other):
return False
def getRelatedChanges(self, sched):
return set()
def getRelatedChanges(self, sched, related):
pass
def updatesConfig(self, tenant):
tpc = tenant.project_configs.get(self.project.canonical_name)
@ -3879,17 +3879,21 @@ class Change(Branch):
return True
return False
def getRelatedChanges(self, sched):
related = set()
for reference in self.needs_changes:
related.add(reference)
for reference in self.needed_by_changes:
related.add(reference)
def getRelatedChanges(self, sched, related):
"""Recursively update a set of related changes
:arg Scheduler sched: The scheduler instance
:arg set related: The cache keys of changes which have been inspected
so far. Will be updated with additional changes by this method.
"""
related.add(self.cache_stat.key)
for reference in itertools.chain(self.needs_changes,
self.needed_by_changes):
key = ChangeKey.fromReference(reference)
source = sched.connections.getSource(key.connection_name)
change = source.getChangeByKey(key)
related.update(change.getRelatedChanges(sched))
return related
if key not in related:
source = sched.connections.getSource(key.connection_name)
change = source.getChangeByKey(key)
change.getRelatedChanges(sched, related)
def getSafeAttributes(self):
return Attributes(project=self.project,

12
zuul/scheduler.py

@ -1616,18 +1616,20 @@ class Scheduler(threading.Thread):
else:
pipeline.state = pipeline.STATE_NORMAL
def maintainConnectionCache(self):
def _gatherConnectionCacheKeys(self):
relevant = set()
self.log.debug("Starting connection cache maintenance")
with self.layout_lock:
for tenant in self.abide.tenants.values():
for pipeline in tenant.layout.pipelines.values():
self.log.debug("Gather relevant cache items for: %s %s",
tenant.name, pipeline.name)
for item in pipeline.getAllItems():
relevant.add(item.change.cache_stat.key)
relevant.update(
item.change.getRelatedChanges(self))
item.change.getRelatedChanges(self, relevant)
return relevant
def maintainConnectionCache(self):
self.log.debug("Starting connection cache maintenance")
relevant = self._gatherConnectionCacheKeys()
# We'll only remove changes older than `max_age` from the cache, as
# it may take a while for an event that was processed by a connection

7
zuul/zk/change_cache.py

@ -85,6 +85,13 @@ class ChangeKey:
msg = self.reference.encode('utf8')
self._hash = hashlib.sha256(msg).hexdigest()
def __hash__(self):
return hash(self.reference)
def __eq__(self, other):
return (isinstance(other, ChangeKey) and
self.reference == other.reference)
@classmethod
def fromReference(cls, data):
data = json.loads(data)

Loading…
Cancel
Save