From 571e2e46e628ec1a65faa7b863d1ec7174880f82 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 6 Jun 2018 10:07:43 -0700 Subject: [PATCH] Fix race in slow start test There was a race in the slow start test where we could release a build prior to it entering a waiting state which is a noop. Then the build enters a waiting state and we never release it. Fix this by waiting until the build is in a waiting state prior to releasing the build which ensures we are in a state that release() can transition out of. Change-Id: I04aa0b82e59373f23d86a64dc74e3201d4fd48f1 Co-Authored-By: Clark Boylan --- tests/unit/test_executor.py | 36 +++++++++++++++++++++++++++++++++++- zuul/executor/server.py | 5 ++++- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_executor.py b/tests/unit/test_executor.py index a0599150f7..c2d7947848 100755 --- a/tests/unit/test_executor.py +++ b/tests/unit/test_executor.py @@ -22,7 +22,7 @@ from unittest import mock import zuul.executor.server import zuul.model -from tests.base import ZuulTestCase, simple_layout +from tests.base import ZuulTestCase, simple_layout, iterate_timeout class TestExecutorRepos(ZuulTestCase): @@ -498,6 +498,19 @@ class TestGovernor(ZuulTestCase): self.log.debug("Build %s complete", build) def test_slow_start(self): + # Note: This test relies on the fact that manageLoad is only + # run at specific points. Several times in the test we check + # that manageLoad has disabled or enabled job acceptance based + # on the number of "starting" jobs. Some of those jobs may + # have actually moved past the "starting" phase and are + # actually "running". But because manageLoad hasn't run + # again, it still uses the old values. Keep this in mind when + # double checking its calculations. + # + # Disable the periodic governor runs to make sure they don't + # interfere (only possible if the test runs longer than 10 + # seconds). 
+ self.executor_server.governor_stop_event.set() self.executor_server.hold_jobs_in_build = True self.executor_server.max_starting_builds = 1 self.executor_server.min_starting_builds = 1 @@ -513,12 +526,33 @@ class TestGovernor(ZuulTestCase): self.assertEqual(len(self.executor_server.job_workers), 1) # Allow enough starting builds for the test to complete. self.executor_server.max_starting_builds = 3 + # We must wait for build1 to enter a waiting state otherwise + # the subsequent release() is a noop and the build is never + # released. We don't use waitUntilSettled as that requires + # the other two builds to start which can't happen while we + # don't accept jobs. + for x in iterate_timeout(30, "build1 is waiting"): + if build1.waiting: + break build1.release() self.waitForWorkerCompletion(build1) self.executor_server.manageLoad() + # This manageLoad call has determined that there are 0 workers + # running, so our full complement of 3 starting builds is + # available. It will re-register for work and pick up the + # next two jobs. self.waitForExecutorBuild('test2') self.waitForExecutorBuild('test3') + # When each of these jobs started, they caused manageLoad to + # be called, the second invocation calculated that there were + # 2 workers running, so our starting build limit was reduced + # to 1. Usually it will calculate that there are 2 starting + # builds, but theoretically it could count only 1 if the first + # build manages to leave the starting phase before the second + # build starts. It should always count the second build as + # starting. As long as at least one build is still in the + # starting phase, this will exceed the limit and unregister. 
self.assertFalse(self.executor_server.accepting_work) self.executor_server.hold_jobs_in_build = False diff --git a/zuul/executor/server.py b/zuul/executor/server.py index dd8151ee7f..bd6e47815a 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -2133,8 +2133,11 @@ class ExecutorServer(object): base_key = 'zuul.executor.{hostname}' self.statsd.incr(base_key + '.builds') self.job_workers[job.unique] = self._job_class(self, job) - self.job_workers[job.unique].run() + # Run manageLoad before starting the thread mostly for the + # benefit of the unit tests to make the calculation of the + # number of starting jobs more deterministic. self.manageLoad() + self.job_workers[job.unique].run() def run_governor(self): while not self.governor_stop_event.wait(10):