Executor: stop jobs in parallel on shutdown

When shutting down, tell all of the jobs to stop, then wait for
all of them to complete.
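
This is the usual two-phase fan-out: signal every job first, then join them,
so total shutdown time is bounded by the slowest job rather than the sum of
all of them.  A minimal standalone sketch of the pattern (not the code from
the diff below; JobWorker is a stand-in exposing the same stop()/wait()
shape as AnsibleJob):

    import threading


    class JobWorker:
        """Stand-in for AnsibleJob: stop() signals the job, wait() joins it."""

        def __init__(self, name):
            self.name = name
            self._stop_event = threading.Event()
            self.thread = threading.Thread(target=self._run)
            self.thread.start()

        def _run(self):
            # A real job would be running ansible here; we just wait
            # until we are told to stop.
            self._stop_event.wait()

        def stop(self):
            self._stop_event.set()

        def wait(self):
            self.thread.join()


    def stop_all(workers):
        # Phase 1: tell every job to stop without blocking, so they all
        # begin shutting down at the same time.
        for worker in workers:
            worker.stop()
        # Phase 2: wait for each one to finish; total time is bounded by
        # the slowest job, not the sum of all jobs.
        for worker in workers:
            worker.wait()


    stop_all([JobWorker("job-%d" % i) for i in range(3)])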

Also, add a lock around starting jobs so that the list of running
jobs is guaranteed to be consistent.
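
A minimal sketch of that locking pattern, assuming the lock only needs to
cover "check the running flag and record the job" on one side and "clear the
flag and snapshot the jobs" on the other; dispatch() and begin_shutdown()
are illustrative names, not functions from the diff:

    import threading

    run_lock = threading.Lock()
    running = True
    job_workers = {}        # unique id -> worker, as in the diff


    def dispatch(job_id, worker):
        # Checking the running flag and recording the new job happen
        # under run_lock, so a concurrent shutdown either sees this job
        # in job_workers or rejects it -- never a half-started state.
        with run_lock:
            if not running:
                return False        # shutting down: refuse the job
            job_workers[job_id] = worker
            return True


    def begin_shutdown():
        global running
        # Flip the flag and snapshot the workers atomically with respect
        # to dispatch(); every job that was accepted is in the snapshot.
        with run_lock:
            running = False
            return list(job_workers.values())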

Also fix a shutdown sequencing bug.  It was possible for us to shut down
the internal repo update thread while jobs were still waiting on it.  This
would cause those jobs never to stop.  Fix this by ensuring that we get
no new jobs, then stop all the jobs, then stop the update thread.

Because we want to wait for the update thread to complete, we join it in
the stop() method.  That seems to make the join() method redundant, so
move all of the joins to stop().
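
Put together, the intended ordering in stop() looks roughly like this toy
model; job_workers, update_queue and update_thread mirror names from the
diff below, everything else is illustrative:

    import queue
    import threading


    class MiniExecutor:
        """Toy model of the shutdown ordering only."""

        def __init__(self):
            self.job_workers = {}
            self.update_queue = queue.Queue()
            self.update_thread = threading.Thread(target=self._update_loop)
            self.update_thread.start()

        def _update_loop(self):
            while True:
                task = self.update_queue.get()
                if task is None:        # sentinel: we were asked to stop
                    return
                # ... update the repo and wake any jobs waiting on it ...

        def stop(self):
            # 1. Stop accepting new jobs (in Zuul: clear the gearman
            #    function registrations; elided here).
            # 2. Stop the running jobs and wait for them.  They may still
            #    need repo updates, so the update thread keeps running.
            workers = list(self.job_workers.values())
            for worker in workers:
                worker.stop()
            for worker in workers:
                worker.wait()
            # 3. Only now tell the update thread to exit, and join it here
            #    so that stop() does not return until it is really done.
            self.update_queue.put(None)
            self.update_thread.join()


    MiniExecutor().stop()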

Change-Id: I12422d3b445892be3e50963c9edafe3904698bff
James E. Blair 2017-10-16 11:44:43 -07:00
parent a86aaf1158
commit 0c45d8899f
1 changed file with 86 additions and 39 deletions

@@ -47,6 +47,11 @@ COMMANDS = ['stop', 'pause', 'unpause', 'graceful', 'verbose',
DEFAULT_FINGER_PORT = 79
class StopException(Exception):
"""An exception raised when an inner loop is asked to stop."""
pass
class ExecutorError(Exception):
"""A non-transient run-time executor error
@@ -558,6 +563,8 @@ class AnsibleJob(object):
self.aborted = True
self.aborted_reason = reason
self.abortRunningProc()
def wait(self):
if self.thread:
self.thread.join()
@@ -1530,6 +1537,7 @@ class ExecutorServer(object):
self.hostname = socket.gethostname()
self.log_streaming_port = log_streaming_port
self.merger_lock = threading.Lock()
self.run_lock = threading.Lock()
self.verbose = False
self.command_map = dict(
stop=self.stop,
@@ -1665,8 +1673,9 @@ class ExecutorServer(object):
self.merger_worker.registerFunction("merger:refstate")
def register_work(self):
self.accepting_work = True
self.executor_worker.registerFunction("executor:execute")
if self._running:
self.accepting_work = True
self.executor_worker.registerFunction("executor:execute")
def unregister_work(self):
self.accepting_work = False
@@ -1675,26 +1684,57 @@ class ExecutorServer(object):
def stop(self):
self.log.debug("Stopping")
self.disk_accountant.stop()
# The governor can change function registration, so make sure
# it has stopped.
self.governor_stop_event.set()
self._running = False
self._command_running = False
self.governor_thread.join()
# Stop accepting new jobs
self.merger_worker.setFunctions([])
self.executor_worker.setFunctions([])
# Tell the executor worker to abort any jobs it just accepted,
# and grab the list of currently running job workers.
with self.run_lock:
self._running = False
self._command_running = False
workers = list(self.job_workers.values())
self.command_socket.stop()
self.update_queue.put(None)
for job_worker in list(self.job_workers.values()):
for job_worker in workers:
try:
job_worker.stop()
except Exception:
self.log.exception("Exception sending stop command "
"to worker:")
for job_worker in workers:
try:
job_worker.wait()
except Exception:
self.log.exception("Exception waiting for worker "
"to stop:")
# Now that we aren't accepting any new jobs, and all of the
# running jobs have stopped, tell the update processor to
# stop.
self.update_queue.put(None)
# All job results should have been sent by now, shutdown the
# gearman workers.
self.merger_worker.shutdown()
self.executor_worker.shutdown()
if self.statsd:
base_key = 'zuul.executor.%s' % self.hostname
self.statsd.gauge(base_key + '.load_average', 0)
self.statsd.gauge(base_key + '.running_builds', 0)
self.log.debug("Stopped")
def join(self):
self.governor_thread.join()
self.update_thread.join()
self.merger_thread.join()
self.executor_thread.join()
def pause(self):
# TODOv3: implement
pass
@@ -1719,12 +1759,6 @@ class ExecutorServer(object):
def nokeep(self):
self.keep_jobdir = False
def join(self):
self.update_thread.join()
self.merger_thread.join()
self.executor_thread.join()
self.governor_thread.join()
def runCommand(self):
while self._command_running:
try:
@@ -1735,10 +1769,12 @@ class ExecutorServer(object):
self.log.exception("Exception while processing command")
def _updateLoop(self):
while self._running:
while True:
try:
self._innerUpdateLoop()
except:
except StopException:
return
except Exception:
self.log.exception("Exception in update thread:")
def _innerUpdateLoop(self):
@@ -1746,7 +1782,7 @@ class ExecutorServer(object):
task = self.update_queue.get()
if task is None:
# We are asked to stop
return
raise StopException()
with self.merger_lock:
self.log.info("Updating repo %s/%s" % (
task.connection_name, task.project_name))
@@ -1767,18 +1803,7 @@ class ExecutorServer(object):
try:
job = self.merger_worker.getJob()
try:
if job.name == 'merger:cat':
self.log.debug("Got cat job: %s" % job.unique)
self.cat(job)
elif job.name == 'merger:merge':
self.log.debug("Got merge job: %s" % job.unique)
self.merge(job)
elif job.name == 'merger:refstate':
self.log.debug("Got refstate job: %s" % job.unique)
self.refstate(job)
else:
self.log.error("Unable to handle job %s" % job.name)
job.sendWorkFail()
self.mergerJobDispatch(job)
except Exception:
self.log.exception("Exception while running job")
job.sendWorkException(
@@ -1788,21 +1813,28 @@ class ExecutorServer(object):
except Exception:
self.log.exception("Exception while getting job")
def mergerJobDispatch(self, job):
with self.run_lock:
if job.name == 'merger:cat':
self.log.debug("Got cat job: %s" % job.unique)
self.cat(job)
elif job.name == 'merger:merge':
self.log.debug("Got merge job: %s" % job.unique)
self.merge(job)
elif job.name == 'merger:refstate':
self.log.debug("Got refstate job: %s" % job.unique)
self.refstate(job)
else:
self.log.error("Unable to handle job %s" % job.name)
job.sendWorkFail()
def run_executor(self):
self.log.debug("Starting executor listener")
while self._running:
try:
job = self.executor_worker.getJob()
try:
if job.name == 'executor:execute':
self.log.debug("Got execute job: %s" % job.unique)
self.executeJob(job)
elif job.name.startswith('executor:stop'):
self.log.debug("Got stop job: %s" % job.unique)
self.stopJob(job)
else:
self.log.error("Unable to handle job %s" % job.name)
job.sendWorkFail()
self.executorJobDispatch(job)
except Exception:
self.log.exception("Exception while running job")
job.sendWorkException(
@@ -1812,9 +1844,20 @@ class ExecutorServer(object):
except Exception:
self.log.exception("Exception while getting job")
def run_governor(self):
while not self.governor_stop_event.wait(30):
self.manageLoad()
def executorJobDispatch(self, job):
with self.run_lock:
if not self._running:
job.sendWorkFail()
return
if job.name == 'executor:execute':
self.log.debug("Got execute job: %s" % job.unique)
self.executeJob(job)
elif job.name.startswith('executor:stop'):
self.log.debug("Got stop job: %s" % job.unique)
self.stopJob(job)
else:
self.log.error("Unable to handle job %s" % job.name)
job.sendWorkFail()
def executeJob(self, job):
if self.statsd:
@@ -1823,6 +1866,10 @@ class ExecutorServer(object):
self.job_workers[job.unique] = self._job_class(self, job)
self.job_workers[job.unique].run()
def run_governor(self):
while not self.governor_stop_event.wait(30):
self.manageLoad()
def manageLoad(self):
''' Apply some heuristics to decide whether or not we should
be asking for more jobs '''