From a49545c1f8c9d89befe83e7a02e7195651e5e3e4 Mon Sep 17 00:00:00 2001 From: Simon Westphahl Date: Tue, 3 Dec 2019 08:18:47 +0100 Subject: [PATCH] Fix exceptions on reconfig with changing window When the window size changes, items formerly in the window might be no longer active. But those items can already have build results. During reconfiguration the job graph is set to None, but the item is not prepared if no longer active. This becomes problematic when build.setResult() tries to access the job graph when resetting builds. This can be solved by also preparing the item if there was already a job graph. The other option to fix this, would be to never change the item.active state back to False after it became active. However, this would be a change in the current behavior of Zuul, as only as builds finish the active window would actually decrease. Exceptions: 2019-12-03 08:11:06,298 zuul.Scheduler ERROR Exception while re-enqueing item in gate> Traceback (most recent call last): File "/tmp/zuul/zuul/scheduler.py", line 931, in _reenqueueTenant item_ahead_valid=item_ahead_valid) File "/tmp/zuul/zuul/manager/__init__.py", line 276, in reEnqueueItem item.setResult(build) File "/tmp/zuul/zuul/model.py", line 2642, in setResult to_skip = self.job_graph.getDependentJobsRecursively( AttributeError: 'NoneType' object has no attribute 'getDependentJobsRecursively' 2019-12-03 08:11:11,401 zuul.Scheduler ERROR Exception in run handler: Traceback (most recent call last): File "/tmp/zuul/zuul/scheduler.py", line 1133, in run self.process_result_queue() File "/tmp/zuul/zuul/scheduler.py", line 1268, in process_result_queue self._doBuildCompletedEvent(event) File "/tmp/zuul/zuul/scheduler.py", line 1473, in _doBuildCompletedEvent pipeline.manager.onBuildCompleted(event.build) File "/tmp/zuul/zuul/manager/__init__.py", line 995, in onBuildCompleted item.setResult(build) File "/tmp/zuul/zuul/model.py", line 2642, in setResult to_skip = self.job_graph.getDependentJobsRecursively( AttributeError: 'NoneType' object has no attribute 'getDependentJobsRecursively' Change-Id: I96d4b28a017a1371029ff7b8d516beb09b6b94ad --- .../layouts/rate-limit-reconfigure.yaml | 54 ++++++++ .../layouts/rate-limit-reconfigure2.yaml | 52 ++++++++ tests/unit/test_scheduler.py | 121 ++++++++++++++++-- zuul/manager/__init__.py | 5 +- 4 files changed, 223 insertions(+), 9 deletions(-) create mode 100644 tests/fixtures/layouts/rate-limit-reconfigure.yaml create mode 100644 tests/fixtures/layouts/rate-limit-reconfigure2.yaml diff --git a/tests/fixtures/layouts/rate-limit-reconfigure.yaml b/tests/fixtures/layouts/rate-limit-reconfigure.yaml new file mode 100644 index 0000000000..df6e25b0c0 --- /dev/null +++ b/tests/fixtures/layouts/rate-limit-reconfigure.yaml @@ -0,0 +1,54 @@ +- pipeline: + name: gate + manager: dependent + failure-message: Build failed. For information on how to proceed, see http://wiki.example.org/Test_Failures + trigger: + gerrit: + - event: comment-added + approval: + - Approved: 1 + start: + gerrit: + Verified: 0 + success: + gerrit: + Verified: 2 + submit: true + failure: + gerrit: + Verified: -2 + window: 4 + window-floor: 1 + window-increase-type: linear + window-increase-factor: 1 + window-decrease-type: exponential + window-decrease-factor: 3 + +- job: + name: base + parent: null + run: playbooks/base.yaml + +- job: + name: project-merge + run: playbooks/project-merge.yaml + +- job: + name: project-test1 + run: playbooks/project-test1.yaml + +- job: + name: project-test2 + run: playbooks/project-test2.yaml + +- project: + name: org/project + gate: + jobs: + - project-merge + - project-test1: + dependencies: + - project-merge + - project-test2: + dependencies: + - project-merge diff --git a/tests/fixtures/layouts/rate-limit-reconfigure2.yaml b/tests/fixtures/layouts/rate-limit-reconfigure2.yaml new file mode 100644 index 0000000000..cd074995ee --- /dev/null +++ b/tests/fixtures/layouts/rate-limit-reconfigure2.yaml @@ -0,0 +1,52 @@ +- pipeline: + name: gate + manager: dependent + failure-message: Build failed. For information on how to proceed, see http://wiki.example.org/Test_Failures + trigger: + gerrit: + - event: comment-added + approval: + - Approved: 1 + start: + gerrit: + Verified: 0 + success: + gerrit: + Verified: 2 + submit: true + failure: + gerrit: + Verified: -2 + window: 4 + window-floor: 1 + window-decrease-type: exponential + window-decrease-factor: 2 + +- job: + name: base + parent: null + run: playbooks/base.yaml + +- job: + name: project-merge + run: playbooks/project-merge.yaml + +- job: + name: project-test1 + run: playbooks/project-test1.yaml + +- job: + name: project-test2 + run: playbooks/project-test2.yaml + +- project: + name: org/project + gate: + jobs: + - project-merge + - project-test1: + dependencies: + - project-merge + - project-test2: + dependencies: + - project-merge diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index 7cffca0450..b41212e8f2 100644 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -4830,6 +4830,116 @@ class TestScheduler(ZuulTestCase): self.assertEqual(queue.window_floor, 1) self.assertEqual(C.data['status'], 'MERGED') + @simple_layout('layouts/rate-limit-reconfigure.yaml') + def test_queue_rate_limiting_reconfigure(self): + """Test that changes survive a reconfigure when no longer in window. + + This is a regression tests for a case that lead to an exception during + re-enqueue. The exception happened when former active items had already + build results but then dropped out of the active window. During + re-enqueue the job graph was not re-initialized because the items were + no longer active. + """ + self.executor_server.hold_jobs_in_build = True + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B') + C = self.fake_gerrit.addFakeChange('org/project', 'master', 'C') + D = self.fake_gerrit.addFakeChange('org/project', 'master', 'D') + + A.addApproval('Code-Review', 2) + B.addApproval('Code-Review', 2) + C.addApproval('Code-Review', 2) + D.addApproval('Code-Review', 2) + + self.fake_gerrit.addEvent(A.addApproval('Approved', 1)) + self.fake_gerrit.addEvent(B.addApproval('Approved', 1)) + self.fake_gerrit.addEvent(C.addApproval('Approved', 1)) + self.fake_gerrit.addEvent(D.addApproval('Approved', 1)) + self.waitUntilSettled() + + self.assertEqual(len(self.builds), 4) + self.assertEqual(self.builds[0].name, 'project-merge') + self.assertEqual(self.builds[1].name, 'project-merge') + self.assertEqual(self.builds[2].name, 'project-merge') + self.assertEqual(self.builds[3].name, 'project-merge') + + self.orderedRelease(4) + + self.assertEqual(len(self.builds), 8) + self.assertEqual(self.builds[0].name, 'project-test1') + self.assertEqual(self.builds[1].name, 'project-test2') + self.assertEqual(self.builds[2].name, 'project-test1') + self.assertEqual(self.builds[3].name, 'project-test2') + self.assertEqual(self.builds[4].name, 'project-test1') + self.assertEqual(self.builds[5].name, 'project-test2') + self.assertEqual(self.builds[6].name, 'project-test1') + self.assertEqual(self.builds[7].name, 'project-test2') + + self.executor_server.failJob('project-test1', B) + self.builds[2].release() + self.builds[3].release() + self.waitUntilSettled() + + self.assertEqual(len(self.builds), 4) + # A's jobs + self.assertEqual(self.builds[0].name, 'project-test1') + self.assertEqual(self.builds[1].name, 'project-test2') + # C's and D's merge jobs + self.assertEqual(self.builds[2].name, 'project-merge') + self.assertEqual(self.builds[3].name, 'project-merge') + + # Release merge jobs of C, D after speculative gate reset + self.executor_server.release('project-merge') + self.waitUntilSettled() + + self.assertEqual(len(self.builds), 6) + + # A's jobs + self.assertEqual(self.builds[0].name, 'project-test1') + self.assertEqual(self.builds[1].name, 'project-test2') + # C's + D's jobs + self.assertEqual(self.builds[2].name, 'project-test1') + self.assertEqual(self.builds[3].name, 'project-test2') + self.assertEqual(self.builds[4].name, 'project-test1') + self.assertEqual(self.builds[5].name, 'project-test2') + + # Fail D's job so we have a build results for an item that + # is not in the active window after B is reported + # (condition that previously lead to an exception) + self.executor_server.failJob('project-test1', D) + self.builds[4].release() + self.waitUntilSettled() + + # Release A's jobs + self.builds[0].release() + self.builds[1].release() + self.waitUntilSettled() + + self.assertEqual(len(self.builds), 3) + + # C's jobs + self.assertEqual(self.builds[0].name, 'project-test1') + self.assertEqual(self.builds[1].name, 'project-test2') + # D's remaining job + self.assertEqual(self.builds[2].name, 'project-test2') + + tenant = self.sched.abide.tenants.get('tenant-one') + queue = tenant.layout.pipelines['gate'].queues[0] + self.assertEqual(queue.window, 1) + + # D dropped out of the window + self.assertFalse(queue.queue[-1].active) + + self.commitConfigUpdate('org/common-config', + 'layouts/rate-limit-reconfigure2.yaml') + self.sched.reconfigure(self.config) + self.waitUntilSettled() + + # D's remaining job should still be queued + self.assertEqual(len(self.builds), 3) + self.executor_server.release('project-.*') + self.waitUntilSettled() + @simple_layout('layouts/reconfigure-window.yaml') def test_reconfigure_window_shrink(self): # Test the active window shrinking during reconfiguration @@ -4908,8 +5018,7 @@ class TestScheduler(ZuulTestCase): # be allowed to shrink on reconfiguration. self.assertEqual(queue.window, 1) # B is outside the window, but still marked active until the - # next pass through the queue processor, so its builds haven't - # been canceled. + # next pass through the queue processor. self.assertEqual(len(self.builds), 4) self.sched.reconfigure(self.config) @@ -4917,20 +5026,16 @@ class TestScheduler(ZuulTestCase): queue = tenant.layout.pipelines['gate'].queues[0] self.assertEqual(queue.window, 1) self.waitUntilSettled() - # B's builds have been canceled now - self.assertEqual(len(self.builds), 2) + # B's builds should not be canceled + self.assertEqual(len(self.builds), 4) self.executor_server.hold_jobs_in_build = False self.executor_server.release() - # B's builds will be restarted and will show up in our history - # twice. self.waitUntilSettled() self.assertHistory([ dict(name='job1', result='SUCCESS', changes='1,1'), - dict(name='job1', result='ABORTED', changes='1,1 2,1'), dict(name='job2', result='SUCCESS', changes='1,1'), - dict(name='job2', result='ABORTED', changes='1,1 2,1'), dict(name='job1', result='SUCCESS', changes='1,1 2,1'), dict(name='job2', result='SUCCESS', changes='1,1 2,1'), ], ordered=False) diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py index 0e46dcac27..a448194601 100644 --- a/zuul/manager/__init__.py +++ b/zuul/manager/__init__.py @@ -259,9 +259,12 @@ class PipelineManager(object): # will go ahead and refresh the job graph if needed; # or it will send a new merge job if necessary, or it # will do nothing if we're waiting on a merge job. + has_job_graph = bool(item.job_graph) item.job_graph = None item.layout = None - if item.active: + # If the item is no longer active, but has a job graph we + # will make sure to update it. + if item.active or has_job_graph: self.prepareItem(item) # Re-set build results in case any new jobs have been