From 571e2e46e628ec1a65faa7b863d1ec7174880f82 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 6 Jun 2018 10:07:43 -0700 Subject: [PATCH] Fix race in slow start test There was a race in the slow start test where we could release a build prior to it entering a waiting state which is a noop. Then the build enters a waiting state and we never release it. Fix this by waiting until the build is in a waiting state prior to releasing the build which ensures we are in a state that release() can transition out of. Change-Id: I04aa0b82e59373f23d86a64dc74e3201d4fd48f1 Co-Authored-By: Clark Boylan --- tests/unit/test_executor.py | 36 +++++++++++++++++++++++++++++++++++- zuul/executor/server.py | 5 ++++- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_executor.py b/tests/unit/test_executor.py index a0599150f7..c2d7947848 100755 --- a/tests/unit/test_executor.py +++ b/tests/unit/test_executor.py @@ -22,7 +22,7 @@ from unittest import mock import zuul.executor.server import zuul.model -from tests.base import ZuulTestCase, simple_layout +from tests.base import ZuulTestCase, simple_layout, iterate_timeout class TestExecutorRepos(ZuulTestCase): @@ -498,6 +498,19 @@ class TestGovernor(ZuulTestCase): self.log.debug("Build %s complete", build) def test_slow_start(self): + # Note: This test relies on the fact that manageLoad is only + # run at specific points. Several times in the test we check + # that manageLoad has disabled or enabled job acceptance based + # on the number of "starting" jobs. Some of those jobs may + # have actually moved past the "starting" phase and are + # actually "running". But because manageLoad hasn't run + # again, it still uses the old values. Keep this in mind when + # double checking its calculations. + # + # Disable the periodic governor runs to make sure they don't + # interfere (only possible if the test runs longer than 10 + # seconds). 
+ self.executor_server.governor_stop_event.set() self.executor_server.hold_jobs_in_build = True self.executor_server.max_starting_builds = 1 self.executor_server.min_starting_builds = 1 @@ -513,12 +526,33 @@ class TestGovernor(ZuulTestCase): self.assertEqual(len(self.executor_server.job_workers), 1) # Allow enough starting builds for the test to complete. self.executor_server.max_starting_builds = 3 + # We must wait for build1 to enter a waiting state otherwise + # the subsequent release() is a noop and the build is never + # released. We don't use waitUntilSettled as that requires + # the other two builds to start which can't happen while we + # don't accept jobs. + for x in iterate_timeout(30, "build1 is waiting"): + if build1.waiting: + break build1.release() self.waitForWorkerCompletion(build1) self.executor_server.manageLoad() + # This manageLoad call has determined that there are 0 workers + # running, so our full complement of 3 starting builds is + # available. It will re-register for work and pick up the + # next two jobs. self.waitForExecutorBuild('test2') self.waitForExecutorBuild('test3') + # When each of these jobs started, they caused manageLoad to + # be called, the second invocation calculated that there were + # 2 workers running, so our starting build limit was reduced + # to 1. Usually it will calculate that there are 2 starting + # builds, but theoretically it could count only 1 if the first + # build manages to leave the starting phase before the second + # build starts. It should always count the second build as + # starting. As long as at least one build is still in the + # starting phase, this will exceed the limit and unregister. 
self.assertFalse(self.executor_server.accepting_work) self.executor_server.hold_jobs_in_build = False diff --git a/zuul/executor/server.py b/zuul/executor/server.py index dd8151ee7f..bd6e47815a 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -2133,8 +2133,11 @@ class ExecutorServer(object): base_key = 'zuul.executor.{hostname}' self.statsd.incr(base_key + '.builds') self.job_workers[job.unique] = self._job_class(self, job) - self.job_workers[job.unique].run() + # Run manageLoad before starting the thread mostly for the + # benefit of the unit tests to make the calculation of the + # number of starting jobs more deterministic. self.manageLoad() + self.job_workers[job.unique].run() def run_governor(self): while not self.governor_stop_event.wait(10):