From 9c2985a7e734479dc80532692e1ef21926e9e9d6 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 10 Mar 2014 10:59:05 -0700 Subject: [PATCH] Cancel a build even if not found Canceling a build is naturally subject to race conditions as the build is started (since canceling a build is different depending on whether or not it has started). This has been handled by checking whether the build has started, and if so canceling the build, if not, removing it from the queue, and if that fails, checking to see if the build just started and if so, canceling that. But even that is still racy because it's possible for the build to have started but for zuul to not have received the gearman packet indicating that it had. To handle that, simply don't check whether the build has started for the third attempt. (The reason we even check at all before the first attempt is that canceling a running build in jenkins is somewhat expensive (it involves iterating over all the builds) so it's better to avoid that if we think it won't work.) Also, add an extra check in the unit test suite when deciding whether the system has settled. This should deal with the case that a trigger_event -> job transition is happening during the haveAllBuildsReported check (which only checks jobs). Change-Id: I60018a5215e7d8230bdf6ef67ec7bc9c719fc286 --- tests/test_scheduler.py | 1 + zuul/launcher/gearman.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 4a4d6de00c..5a265bea93 100755 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -1112,6 +1112,7 @@ class TestScheduler(testtools.TestCase): self.sched.result_event_queue.empty() and self.fake_gerrit.event_queue.empty() and not self.merge_client.build_sets and + self.haveAllBuildsReported() and self.areAllBuildsWaiting()): self.sched.run_handler_lock.release() self.worker.lock.release() diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py index 0e248af493..3638add3e8 100644 --- a/zuul/launcher/gearman.py +++ b/zuul/launcher/gearman.py @@ -352,11 +352,11 @@ class Gearman(object): self.log.debug("Still unable to find build %s to cancel" % build) if build.number: self.log.debug("Build %s has just started" % build) - self.cancelRunningBuild(build) - self.log.debug("Canceled just running build %s" % build) else: - self.log.error("Build %s has not started but " - "was not found in queue" % build) + self.log.error("Build %s has not started but was not" + "found in queue; canceling anyway" % build) + self.cancelRunningBuild(build) + self.log.debug("Canceled possibly running build %s" % build) def onBuildCompleted(self, job, result=None): if job.unique in self.meta_jobs: