Fix exceptions on reconfig with changing window

When the window size changes, items formerly in the window might be no
longer active. But those items can already have build results. During
reconfiguration the job graph is set to None, but the item is not
prepared if no longer active. This becomes problematic when
build.setResult() tries to access the job graph when resetting builds.

This can be solved by also preparing the item if there was already a job
graph. The other option to fix this, would be to never change the
item.active state back to False after it became active. However, this
would be a change in the current behavior of Zuul, as only as builds
finish the active window would actually decrease.

Exceptions:

2019-12-03 08:11:06,298 zuul.Scheduler ERROR Exception while re-enqueing item <QueueItem 0x7f54285a49b0 for <Change 0x7f5442939da0 org/project 4,1> in gate>
Traceback (most recent call last):
  File "/tmp/zuul/zuul/scheduler.py", line 931, in _reenqueueTenant
    item_ahead_valid=item_ahead_valid)
  File "/tmp/zuul/zuul/manager/__init__.py", line 276, in reEnqueueItem
    item.setResult(build)
  File "/tmp/zuul/zuul/model.py", line 2642, in setResult
    to_skip = self.job_graph.getDependentJobsRecursively(
AttributeError: 'NoneType' object has no attribute 'getDependentJobsRecursively'

2019-12-03 08:11:11,401 zuul.Scheduler ERROR Exception in run handler:
Traceback (most recent call last):
  File "/tmp/zuul/zuul/scheduler.py", line 1133, in run
    self.process_result_queue()
  File "/tmp/zuul/zuul/scheduler.py", line 1268, in process_result_queue
    self._doBuildCompletedEvent(event)
  File "/tmp/zuul/zuul/scheduler.py", line 1473, in _doBuildCompletedEvent
    pipeline.manager.onBuildCompleted(event.build)
  File "/tmp/zuul/zuul/manager/__init__.py", line 995, in onBuildCompleted
    item.setResult(build)
  File "/tmp/zuul/zuul/model.py", line 2642, in setResult
    to_skip = self.job_graph.getDependentJobsRecursively(
AttributeError: 'NoneType' object has no attribute 'getDependentJobsRecursively'

Change-Id: I96d4b28a017a1371029ff7b8d516beb09b6b94ad
This commit is contained in:
Simon Westphahl 2019-12-03 08:18:47 +01:00
parent 8eec9e3bcb
commit a49545c1f8
4 changed files with 223 additions and 9 deletions

View File

@ -0,0 +1,54 @@
- pipeline:
name: gate
manager: dependent
failure-message: Build failed. For information on how to proceed, see http://wiki.example.org/Test_Failures
trigger:
gerrit:
- event: comment-added
approval:
- Approved: 1
start:
gerrit:
Verified: 0
success:
gerrit:
Verified: 2
submit: true
failure:
gerrit:
Verified: -2
window: 4
window-floor: 1
window-increase-type: linear
window-increase-factor: 1
window-decrease-type: exponential
window-decrease-factor: 3
- job:
name: base
parent: null
run: playbooks/base.yaml
- job:
name: project-merge
run: playbooks/project-merge.yaml
- job:
name: project-test1
run: playbooks/project-test1.yaml
- job:
name: project-test2
run: playbooks/project-test2.yaml
- project:
name: org/project
gate:
jobs:
- project-merge
- project-test1:
dependencies:
- project-merge
- project-test2:
dependencies:
- project-merge

View File

@ -0,0 +1,52 @@
- pipeline:
name: gate
manager: dependent
failure-message: Build failed. For information on how to proceed, see http://wiki.example.org/Test_Failures
trigger:
gerrit:
- event: comment-added
approval:
- Approved: 1
start:
gerrit:
Verified: 0
success:
gerrit:
Verified: 2
submit: true
failure:
gerrit:
Verified: -2
window: 4
window-floor: 1
window-decrease-type: exponential
window-decrease-factor: 2
- job:
name: base
parent: null
run: playbooks/base.yaml
- job:
name: project-merge
run: playbooks/project-merge.yaml
- job:
name: project-test1
run: playbooks/project-test1.yaml
- job:
name: project-test2
run: playbooks/project-test2.yaml
- project:
name: org/project
gate:
jobs:
- project-merge
- project-test1:
dependencies:
- project-merge
- project-test2:
dependencies:
- project-merge

View File

@ -4830,6 +4830,116 @@ class TestScheduler(ZuulTestCase):
self.assertEqual(queue.window_floor, 1)
self.assertEqual(C.data['status'], 'MERGED')
@simple_layout('layouts/rate-limit-reconfigure.yaml')
def test_queue_rate_limiting_reconfigure(self):
"""Test that changes survive a reconfigure when no longer in window.
This is a regression tests for a case that lead to an exception during
re-enqueue. The exception happened when former active items had already
build results but then dropped out of the active window. During
re-enqueue the job graph was not re-initialized because the items were
no longer active.
"""
self.executor_server.hold_jobs_in_build = True
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
C = self.fake_gerrit.addFakeChange('org/project', 'master', 'C')
D = self.fake_gerrit.addFakeChange('org/project', 'master', 'D')
A.addApproval('Code-Review', 2)
B.addApproval('Code-Review', 2)
C.addApproval('Code-Review', 2)
D.addApproval('Code-Review', 2)
self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
self.fake_gerrit.addEvent(C.addApproval('Approved', 1))
self.fake_gerrit.addEvent(D.addApproval('Approved', 1))
self.waitUntilSettled()
self.assertEqual(len(self.builds), 4)
self.assertEqual(self.builds[0].name, 'project-merge')
self.assertEqual(self.builds[1].name, 'project-merge')
self.assertEqual(self.builds[2].name, 'project-merge')
self.assertEqual(self.builds[3].name, 'project-merge')
self.orderedRelease(4)
self.assertEqual(len(self.builds), 8)
self.assertEqual(self.builds[0].name, 'project-test1')
self.assertEqual(self.builds[1].name, 'project-test2')
self.assertEqual(self.builds[2].name, 'project-test1')
self.assertEqual(self.builds[3].name, 'project-test2')
self.assertEqual(self.builds[4].name, 'project-test1')
self.assertEqual(self.builds[5].name, 'project-test2')
self.assertEqual(self.builds[6].name, 'project-test1')
self.assertEqual(self.builds[7].name, 'project-test2')
self.executor_server.failJob('project-test1', B)
self.builds[2].release()
self.builds[3].release()
self.waitUntilSettled()
self.assertEqual(len(self.builds), 4)
# A's jobs
self.assertEqual(self.builds[0].name, 'project-test1')
self.assertEqual(self.builds[1].name, 'project-test2')
# C's and D's merge jobs
self.assertEqual(self.builds[2].name, 'project-merge')
self.assertEqual(self.builds[3].name, 'project-merge')
# Release merge jobs of C, D after speculative gate reset
self.executor_server.release('project-merge')
self.waitUntilSettled()
self.assertEqual(len(self.builds), 6)
# A's jobs
self.assertEqual(self.builds[0].name, 'project-test1')
self.assertEqual(self.builds[1].name, 'project-test2')
# C's + D's jobs
self.assertEqual(self.builds[2].name, 'project-test1')
self.assertEqual(self.builds[3].name, 'project-test2')
self.assertEqual(self.builds[4].name, 'project-test1')
self.assertEqual(self.builds[5].name, 'project-test2')
# Fail D's job so we have a build results for an item that
# is not in the active window after B is reported
# (condition that previously lead to an exception)
self.executor_server.failJob('project-test1', D)
self.builds[4].release()
self.waitUntilSettled()
# Release A's jobs
self.builds[0].release()
self.builds[1].release()
self.waitUntilSettled()
self.assertEqual(len(self.builds), 3)
# C's jobs
self.assertEqual(self.builds[0].name, 'project-test1')
self.assertEqual(self.builds[1].name, 'project-test2')
# D's remaining job
self.assertEqual(self.builds[2].name, 'project-test2')
tenant = self.sched.abide.tenants.get('tenant-one')
queue = tenant.layout.pipelines['gate'].queues[0]
self.assertEqual(queue.window, 1)
# D dropped out of the window
self.assertFalse(queue.queue[-1].active)
self.commitConfigUpdate('org/common-config',
'layouts/rate-limit-reconfigure2.yaml')
self.sched.reconfigure(self.config)
self.waitUntilSettled()
# D's remaining job should still be queued
self.assertEqual(len(self.builds), 3)
self.executor_server.release('project-.*')
self.waitUntilSettled()
@simple_layout('layouts/reconfigure-window.yaml')
def test_reconfigure_window_shrink(self):
# Test the active window shrinking during reconfiguration
@ -4908,8 +5018,7 @@ class TestScheduler(ZuulTestCase):
# be allowed to shrink on reconfiguration.
self.assertEqual(queue.window, 1)
# B is outside the window, but still marked active until the
# next pass through the queue processor, so its builds haven't
# been canceled.
# next pass through the queue processor.
self.assertEqual(len(self.builds), 4)
self.sched.reconfigure(self.config)
@ -4917,20 +5026,16 @@ class TestScheduler(ZuulTestCase):
queue = tenant.layout.pipelines['gate'].queues[0]
self.assertEqual(queue.window, 1)
self.waitUntilSettled()
# B's builds have been canceled now
self.assertEqual(len(self.builds), 2)
# B's builds should not be canceled
self.assertEqual(len(self.builds), 4)
self.executor_server.hold_jobs_in_build = False
self.executor_server.release()
# B's builds will be restarted and will show up in our history
# twice.
self.waitUntilSettled()
self.assertHistory([
dict(name='job1', result='SUCCESS', changes='1,1'),
dict(name='job1', result='ABORTED', changes='1,1 2,1'),
dict(name='job2', result='SUCCESS', changes='1,1'),
dict(name='job2', result='ABORTED', changes='1,1 2,1'),
dict(name='job1', result='SUCCESS', changes='1,1 2,1'),
dict(name='job2', result='SUCCESS', changes='1,1 2,1'),
], ordered=False)

View File

@ -259,9 +259,12 @@ class PipelineManager(object):
# will go ahead and refresh the job graph if needed;
# or it will send a new merge job if necessary, or it
# will do nothing if we're waiting on a merge job.
has_job_graph = bool(item.job_graph)
item.job_graph = None
item.layout = None
if item.active:
# If the item is no longer active, but has a job graph we
# will make sure to update it.
if item.active or has_job_graph:
self.prepareItem(item)
# Re-set build results in case any new jobs have been