Handle dependency limit errors more gracefully

When the dependency graph exceeds the configured size, we raise an
exception. Currently we don't handle those exceptions and instead let
them bubble up to the pipeline processing loop in the scheduler.

When this happens during trigger event processing, it only aborts the
current pipeline handling run, and the next scheduler run will continue
processing the pipeline as usual.

However, in cases where the item is already enqueued, this exception can
block the pipeline processor and lead to a hanging pipeline:

ERROR zuul.Scheduler: Exception in pipeline processing:
Traceback (most recent call last):
  File "/opt/zuul/lib/python3.11/site-packages/zuul/scheduler.py", line 2370, in _process_pipeline
    while not self._stopped and pipeline.manager.processQueue():
                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/zuul/lib/python3.11/site-packages/zuul/manager/__init__.py", line 1800, in processQueue
    item_changed, nnfi = self._processOneItem(
                         ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/zuul/lib/python3.11/site-packages/zuul/manager/__init__.py", line 1624, in _processOneItem
    self.getDependencyGraph(item.changes[0], dependency_graph, item.event,
  File "/opt/zuul/lib/python3.11/site-packages/zuul/manager/__init__.py", line 822, in getDependencyGraph
    self.getDependencyGraph(needed_change, dependency_graph,
  File "/opt/zuul/lib/python3.11/site-packages/zuul/manager/__init__.py", line 822, in getDependencyGraph
    self.getDependencyGraph(needed_change, dependency_graph,
  File "/opt/zuul/lib/python3.11/site-packages/zuul/manager/__init__.py", line 822, in getDependencyGraph
    self.getDependencyGraph(needed_change, dependency_graph,
  [Previous line repeated 8 more times]
  File "/opt/zuul/lib/python3.11/site-packages/zuul/manager/__init__.py", line 813, in getDependencyGraph
    raise Exception("Dependency graph is too large")
Exception: Dependency graph is too large

To fix this, we'll handle the exception and remove the affected item.
We'll also handle the exception during enqueue and ignore the trigger
event in this case.

Change-Id: I210c5fa4c568f2bf03eedc18b3e9c9a022628dc3
This commit is contained in:
Simon Westphahl 2024-03-19 12:17:07 +01:00
parent 4d06f081bd
commit 305d4dbab9
No known key found for this signature in database
6 changed files with 73 additions and 7 deletions

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,26 @@
- queue:
name: integrated-topic
allow-circular-dependencies: True
dependencies-by-topic: True
- project:
queue: integrated-topic
check:
jobs:
- project-merge
- project-test1:
dependencies: project-merge
- project-test2:
dependencies: project-merge
gate:
jobs:
- project-merge
- project-test1:
dependencies: project-merge
- project-test2:
dependencies: project-merge
- project-testfile:
dependencies: project-merge
post:
jobs:
- project-post

View File

@ -9,3 +9,4 @@
- org/project
- org/project1
- org/project2
- org/project4

View File

@ -9869,3 +9869,29 @@ class TestMaxDeps(ZuulTestCase):
dict(name='project-test1', result='SUCCESS', changes='1,1 2,1'),
dict(name='project-test2', result='SUCCESS', changes='1,1 2,1'),
], ordered=False)
def test_max_deps_extended(self):
self.executor_server.hold_jobs_in_build = True
# max_dependencies for the connection is 1, so this is okay
A = self.fake_gerrit.addFakeChange('org/project4', 'master', 'A')
B = self.fake_gerrit.addFakeChange('org/project4', 'master', 'B',
topic='test-topic')
B.setDependsOn(A, 1)
self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
# Increase the number of dependencies for B by adding a
# change with the same topic (dependencies-by-topic is enabled).
# With this C should not be enqueued and A is removed.
C = self.fake_gerrit.addFakeChange('org/project4', 'master', 'C',
topic='test-topic')
self.fake_gerrit.addEvent(C.getPatchsetCreatedEvent(1))
self.executor_server.hold_jobs_in_build = True
self.executor_server.release()
self.waitUntilSettled()
self.assertHistory([
dict(name='project-merge', result='SUCCESS', changes='1,1 2,1'),
], ordered=False)

View File

@ -50,6 +50,10 @@ class StreamingError(Exception):
pass
class DependencyLimitExceededError(Exception):
pass
# Authentication Exceptions
class AuthTokenException(Exception):

View File

@ -596,8 +596,11 @@ class PipelineManager(metaclass=ABCMeta):
change, self.pipeline)
return False
self.getDependencyGraph(change, dependency_graph, event,
update_deps=True)
try:
self.getDependencyGraph(change, dependency_graph, event,
update_deps=True)
except exceptions.DependencyLimitExceededError:
return False
with self.getChangeQueue(change, event, change_queue) as change_queue:
if not change_queue:
@ -808,9 +811,10 @@ class PipelineManager(metaclass=ABCMeta):
if (self.pipeline.tenant.max_dependencies is not None and
(len(dependency_graph) >
self.pipeline.tenant.max_dependencies)):
log.debug("%sDependency graph for change %s is too large",
indent, change)
raise Exception("Dependency graph is too large")
log.info("%sDependency graph for change %s is too large",
indent, change)
raise exceptions.DependencyLimitExceededError(
"Dependency graph is too large")
node = dependency_graph.setdefault(change, [])
if needed_change not in node:
@ -1619,8 +1623,12 @@ class PipelineManager(metaclass=ABCMeta):
meets_reqs = self.areChangesReadyToBeEnqueued(item.changes, item.event)
dependency_graph = collections.OrderedDict()
self.getDependencyGraph(item.changes[0], dependency_graph, item.event,
quiet=True)
try:
self.getDependencyGraph(item.changes[0], dependency_graph,
item.event, quiet=True)
except exceptions.DependencyLimitExceededError:
self.removeItem(item)
return True, nnfi
# Verify that the cycle dependency graph is correct
cycle = self.cycleForChange(