Ansible launcher: add pause/unpause support

Change-Id: I55c68153a80477c657d7bc5d22e463c37a494eb6
This commit is contained in:
James E. Blair 2016-06-01 15:33:54 -07:00
parent 8fc762bc5e
commit a6a5004029
2 changed files with 71 additions and 20 deletions

View File

@ -29,8 +29,9 @@ import sys
import signal import signal
import zuul.cmd import zuul.cmd
import zuul.launcher.ansiblelaunchserver
# No zuul imports here because they pull in paramiko which must not be # No zuul imports that pull in paramiko here; it must not be
# imported until after the daemonization. # imported until after the daemonization.
# https://github.com/paramiko/paramiko/issues/59 # https://github.com/paramiko/paramiko/issues/59
# Similar situation with gear and statsd. # Similar situation with gear and statsd.
@ -50,7 +51,8 @@ class Launcher(zuul.cmd.ZuulApp):
parser.add_argument('--keep-jobdir', dest='keep_jobdir', parser.add_argument('--keep-jobdir', dest='keep_jobdir',
action='store_true', action='store_true',
help='keep local jobdirs after run completes') help='keep local jobdirs after run completes')
parser.add_argument('command', choices=['reconfigure', 'stop'], parser.add_argument('command',
choices=zuul.launcher.ansiblelaunchserver.COMMANDS,
nargs='?') nargs='?')
self.args = parser.parse_args() self.args = parser.parse_args()
@ -66,21 +68,12 @@ class Launcher(zuul.cmd.ZuulApp):
s.connect(path) s.connect(path)
s.sendall('%s\n' % cmd) s.sendall('%s\n' % cmd)
def send_reconfigure(self):
self.send_command('reconfigure')
sys.exit(0)
def send_stop(self):
self.send_command('stop')
sys.exit(0)
def exit_handler(self): def exit_handler(self):
self.launcher.stop() self.launcher.stop()
self.launcher.join() self.launcher.join()
def main(self, daemon=True): def main(self, daemon=True):
# See comment at top of file about zuul imports # See comment at top of file about zuul imports
import zuul.launcher.ansiblelaunchserver
self.setup_logging('launcher', 'log_config') self.setup_logging('launcher', 'log_config')
@ -109,11 +102,8 @@ def main():
server.parse_arguments() server.parse_arguments()
server.read_config() server.read_config()
if server.args.command == 'reconfigure': if server.args.command in zuul.launcher.ansiblelaunchserver.COMMANDS:
server.send_reconfigure() server.send_command(server.args.command)
sys.exit(0)
elif server.args.command == 'stop':
server.send_stop()
sys.exit(0) sys.exit(0)
server.configure_connections() server.configure_connections()

View File

@ -37,6 +37,9 @@ import zuul.ansible.plugins.callback_plugins
from zuul.lib import commandsocket from zuul.lib import commandsocket
COMMANDS = ['reconfigure', 'stop', 'pause', 'unpause']
def boolify(x): def boolify(x):
if isinstance(x, str): if isinstance(x, str):
return bool(int(x)) return bool(int(x))
@ -77,6 +80,7 @@ class LaunchServer(object):
self.config = config self.config = config
self.keep_jobdir = keep_jobdir self.keep_jobdir = keep_jobdir
self.hostname = socket.gethostname() self.hostname = socket.gethostname()
self.registered_functions = set()
self.node_workers = {} self.node_workers = {}
self.jobs = {} self.jobs = {}
self.builds = {} self.builds = {}
@ -203,9 +207,16 @@ class LaunchServer(object):
del self.jobs[name] del self.jobs[name]
def register(self): def register(self):
new_functions = set()
if self.accept_nodes: if self.accept_nodes:
self.worker.registerFunction("node-assign:zuul") new_functions.add("node-assign:zuul")
self.worker.registerFunction("stop:%s" % self.hostname) new_functions.add("stop:%s" % self.hostname)
for function in new_functions - self.registered_functions:
self.worker.registerFunction(function)
for function in self.registered_functions - new_functions:
self.worker.unRegisterFunction(function)
self.registered_functions = new_functions
def reconfigure(self): def reconfigure(self):
self.log.debug("Reconfiguring") self.log.debug("Reconfiguring")
@ -219,6 +230,32 @@ class LaunchServer(object):
"to worker:") "to worker:")
self.log.debug("Reconfiguration complete") self.log.debug("Reconfiguration complete")
def pause(self):
self.log.debug("Pausing")
self.accept_nodes = False
self.register()
for node in self.node_workers.values():
try:
if node.isAlive():
node.queue.put(dict(action='pause'))
except Exception:
self.log.exception("Exception sending pause command "
"to worker:")
self.log.debug("Paused")
def unpause(self):
self.log.debug("Unpausing")
self.accept_nodes = True
self.register()
for node in self.node_workers.values():
try:
if node.isAlive():
node.queue.put(dict(action='unpause'))
except Exception:
self.log.exception("Exception sending unpause command "
"to worker:")
self.log.debug("Unpaused")
def stop(self): def stop(self):
self.log.debug("Stopping") self.log.debug("Stopping")
# First, stop accepting new jobs # First, stop accepting new jobs
@ -254,8 +291,12 @@ class LaunchServer(object):
command = self.command_socket.get() command = self.command_socket.get()
if command == 'reconfigure': if command == 'reconfigure':
self.reconfigure() self.reconfigure()
if command == 'stop': elif command == 'stop':
self.stop() self.stop()
elif command == 'pause':
self.pause()
elif command == 'unpause':
self.unpause()
except Exception: except Exception:
self.log.exception("Exception while processing command") self.log.exception("Exception while processing command")
@ -376,6 +417,10 @@ class NodeWorker(object):
self.labels = labels self.labels = labels
self.thread = None self.thread = None
self.registered_functions = set() self.registered_functions = set()
# If the unpaused Event is set, that means we should run jobs.
# If it is clear, then we are paused and should not run jobs.
self.unpaused = threading.Event()
self.unpaused.set()
self._running = True self._running = True
self.queue = Queue.Queue() self.queue = Queue.Queue()
self.manager_name = manager_name self.manager_name = manager_name
@ -434,9 +479,17 @@ class NodeWorker(object):
# will be set by the queue thread. # will be set by the queue thread.
self.log.debug("Submitting stop request") self.log.debug("Submitting stop request")
self._running = False self._running = False
self.unpaused.set()
self.queue.put(dict(action='stop')) self.queue.put(dict(action='stop'))
self.queue.join() self.queue.join()
def pause(self):
self.unpaused.clear()
self.worker.stopWaitingForJobs()
def unpause(self):
self.unpaused.set()
def _runQueue(self): def _runQueue(self):
item = self.queue.get() item = self.queue.get()
try: try:
@ -449,6 +502,12 @@ class NodeWorker(object):
else: else:
self._job_complete_event.wait() self._job_complete_event.wait()
self.worker.shutdown() self.worker.shutdown()
if item['action'] == 'pause':
self.log.debug("Received pause request")
self.pause()
if item['action'] == 'unpause':
self.log.debug("Received unpause request")
self.unpause()
elif item['action'] == 'reconfigure': elif item['action'] == 'reconfigure':
self.log.debug("Received reconfigure request") self.log.debug("Received reconfigure request")
self.register() self.register()
@ -461,7 +520,9 @@ class NodeWorker(object):
def runGearman(self): def runGearman(self):
while self._running: while self._running:
try: try:
self._runGearman() self.unpaused.wait()
if self._running:
self._runGearman()
except Exception: except Exception:
self.log.exception("Exception in gearman manager:") self.log.exception("Exception in gearman manager:")