Fix race in slow start test

There was a race in the slow start test where we could release a build prior to it entering a waiting state which is a noop. Then the build enters a waiting state and we never release it. Fix this by waiting until the build is in a waiting state prior to releasing the build which ensures we are in a state that release() can transition out of. Change-Id: I04aa0b82e59373f23d86a64dc74e3201d4fd48f1 Co-Authored-By: Clark Boylan <clark.boylan@gmail.com>
2018-06-06 10:07:43 -07:00 · 2018-06-06 10:07:43 -07:00 · 571e2e46e6
parent ab07e1e9bc
commit 571e2e46e6
2 changed files with 39 additions and 2 deletions
--- a/tests/unit/test_executor.py
+++ b/tests/unit/test_executor.py
@ -22,7 +22,7 @@ from unittest import mock
 import zuul.executor.server
 import zuul.model

-from tests.base import ZuulTestCase, simple_layout
+from tests.base import ZuulTestCase, simple_layout, iterate_timeout


 class TestExecutorRepos(ZuulTestCase):
@ -498,6 +498,19 @@ class TestGovernor(ZuulTestCase):
            self.log.debug("Build %s complete", build)

    def test_slow_start(self):
+        # Note: This test relies on the fact that manageLoad is only
+        # run at specific points.  Several times in the test we check
+        # that manageLoad has disabled or enabled job acceptance based
+        # on the number of "starting" jobs.  Some of those jobs may
+        # have actually moved past the "starting" phase and are
+        # actually "running".  But because manageLoad hasn't run
+        # again, it still uses the old values.  Keep this in mind when
+        # double checking its calculations.
+        #
+        # Disable the periodic governor runs to make sure they don't
+        # interefere (only possible if the test runs longer than 10
+        # seconds).
+        self.executor_server.governor_stop_event.set()
        self.executor_server.hold_jobs_in_build = True
        self.executor_server.max_starting_builds = 1
        self.executor_server.min_starting_builds = 1
@ -513,12 +526,33 @@ class TestGovernor(ZuulTestCase):
        self.assertEqual(len(self.executor_server.job_workers), 1)
        # Allow enough starting builds for the test to complete.
        self.executor_server.max_starting_builds = 3
+        # We must wait for build1 to enter a waiting state otherwise
+        # the subsequent release() is a noop and the build is never
+        # released.  We don't use waitUntilSettled as that requires
+        # the other two builds to start which can't happen while we
+        # don't accept jobs.
+        for x in iterate_timeout(30, "build1 is waiting"):
+            if build1.waiting:
+                break
        build1.release()
        self.waitForWorkerCompletion(build1)
        self.executor_server.manageLoad()
+        # This manageLoad call has determined that there are 0 workers
+        # running, so our full complement of 3 starting builds is
+        # available.  It will re-register for work and pick up the
+        # next two jobs.

        self.waitForExecutorBuild('test2')
        self.waitForExecutorBuild('test3')
+        # When each of these jobs started, they caused manageLoad to
+        # be called, the second invocation calculated that there were
+        # 2 workers running, so our starting build limit was reduced
+        # to 1.  Usually it will calculate that there are 2 starting
+        # builds, but theoretically it could count only 1 if the first
+        # build manages to leave the starting phase before the second
+        # build starts.  It should always count the second build as
+        # starting.  As long as at least one build is still in the
+        # starting phase, this will exceed the limit and unregister.
        self.assertFalse(self.executor_server.accepting_work)

        self.executor_server.hold_jobs_in_build = False
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@ -2133,8 +2133,11 @@ class ExecutorServer(object):
            base_key = 'zuul.executor.{hostname}'
            self.statsd.incr(base_key + '.builds')
        self.job_workers[job.unique] = self._job_class(self, job)
-        self.job_workers[job.unique].run()
+        # Run manageLoad before starting the thread mostly for the
+        # benefit of the unit tests to make the calculation of the
+        # number of starting jobs more deterministic.
        self.manageLoad()
+        self.job_workers[job.unique].run()

    def run_governor(self):
        while not self.governor_stop_event.wait(10):