Fix node request failures in dependency cycles

Node request failures cause a queue item to fail (naturally). In a normal queue without cycles, that just means that we would cancel jobs behind and wait for the current item to finish the remaining jobs. But with cycles, the items in the bundle detect that items ahead (which are part of the bundle) are failing and so they cancel their own jobs more agressively. If they do this before all the jobs have started (ie, because we are waiting on an unfulfilled node request), they can end up in a situation where they never run builds, but yet they don't report because they are still expecting those builds. This likely points to a larger problem in that we should probably not be canceling those jobs so aggressively. However, the more serious and immediate problem is the race condition that can cause items not to report. To correct this immediate problem, tell the scheduler to create fake build objects with a result of "CANCELED" when the pipeline manager cancels builds and there is no existing build already. This will at least mean that all expected builds are present regardless of whether the node request has been fulfilled. A later change can be made to avoid canceling jobs in the first place without needing to change this behavior. Change-Id: I1e1150ef67c03452b9a98f9366434c53a5ad26fb
2022-04-30 15:58:32 -07:00 · 2022-04-30 15:58:32 -07:00 · 282182f7c2
parent a89ce345c0
commit 282182f7c2
3 changed files with 102 additions and 1 deletions
--- a/tests/fixtures/layouts/circular-deps-node-failure.yaml
+++ b/tests/fixtures/layouts/circular-deps-node-failure.yaml
@ -0,0 +1,62 @@
+- queue:
+    name: integrated
+    allow-circular-dependencies: true
+
+- pipeline:
+    name: gate
+    manager: dependent
+    success-message: Build succeeded (gate).
+    require:
+      gerrit:
+        approval:
+          - Approved: 1
+    trigger:
+      gerrit:
+        - event: comment-added
+          approval:
+            - Approved: 1
+    success:
+      gerrit:
+        Verified: 2
+        submit: true
+    failure:
+      gerrit:
+        Verified: -2
+    start:
+      gerrit:
+        Verified: 0
+    precedence: high
+
+- job:
+    name: base
+    parent: null
+    run: playbooks/run.yaml
+    nodeset:
+      nodes:
+        - label: debian
+          name: controller
+
+- job:
+    name: common-job
+
+- job:
+    name: project1-job
+
+- job:
+    name: project2-job
+
+- project:
+    name: org/project1
+    queue: integrated
+    gate:
+      jobs:
+        - common-job
+        - project1-job
+
+- project:
+    name: org/project2
+    queue: integrated
+    gate:
+      jobs:
+        - common-job
+        - project2-job
--- a/tests/unit/test_circular_dependencies.py
+++ b/tests/unit/test_circular_dependencies.py
@ -464,6 +464,45 @@ class TestGerritCircularDependencies(ZuulTestCase):
        self.assertEqual(A.data["status"], "NEW")
        self.assertEqual(B.data["status"], "NEW")

+    @simple_layout('layouts/circular-deps-node-failure.yaml')
+    def test_cycle_failed_node_request(self):
+        # Test a node request failure as part of a dependency cycle
+
+        # Pause nodepool so we can fail the node request later
+        self.fake_nodepool.pause()
+
+        A = self.fake_gerrit.addFakeChange("org/project1", "master", "A")
+        B = self.fake_gerrit.addFakeChange("org/project2", "master", "B")
+
+        # A <-> B (via commit-depends)
+        A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
+            A.subject, B.data["url"]
+        )
+        B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
+            B.subject, A.data["url"]
+        )
+
+        A.addApproval("Code-Review", 2)
+        B.addApproval("Code-Review", 2)
+        B.addApproval("Approved", 1)
+
+        self.fake_gerrit.addEvent(A.addApproval("Approved", 1))
+        self.waitUntilSettled()
+
+        # Fail the node request and unpause
+        req = self.fake_nodepool.getNodeRequests()
+        self.fake_nodepool.addFailRequest(req[0])
+
+        self.fake_nodepool.unpause()
+        self.waitUntilSettled()
+
+        self.assertEqual(A.reported, 2)
+        self.assertEqual(B.reported, 2)
+        self.assertIn("bundle", A.messages[-1])
+        self.assertIn("bundle", B.messages[-1])
+        self.assertEqual(A.data["status"], "NEW")
+        self.assertEqual(B.data["status"], "NEW")
+
    def test_failing_cycle_behind_failing_change(self):
        self.executor_server.hold_jobs_in_build = True
        A = self.fake_gerrit.addFakeChange("org/project", "master", "A")
--- a/zuul/manager/init.py
+++ b/zuul/manager/init.py
@ -951,7 +951,7 @@ class PipelineManager(metaclass=ABCMeta):
        jobs_to_cancel = item.getJobs()

        for job in jobs_to_cancel:
-            self.sched.cancelJob(old_build_set, job)
+            self.sched.cancelJob(old_build_set, job, final=True)

        # Don't reset builds for a failing bundle when it has already started
        # reporting, to keep available build results. Those items will be