From b4f9c61e840f25d82ab9387dbaa83f73582ec9e8 Mon Sep 17 00:00:00 2001 From: Jan Hruban Date: Mon, 4 Jan 2016 18:41:04 +0100 Subject: [PATCH 01/29] Lower the log level in tests The subunit output size is already nearing 50MB, which is the maximum allowed by jenkins jobs. Lower the log level to INFO, which should be enough for normal test runs and lower the output size significantly. Co-Authored-By: Joshua Hesketh Change-Id: Ia6adc28a7bda482595df4b5f3b144f150e3a441e --- tests/base.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/base.py b/tests/base.py index 01097c1865..ab608cbe90 100755 --- a/tests/base.py +++ b/tests/base.py @@ -857,8 +857,17 @@ class BaseTestCase(testtools.TestCase): self.useFixture(fixtures.MonkeyPatch('sys.stderr', stderr)) if (os.environ.get('OS_LOG_CAPTURE') == 'True' or os.environ.get('OS_LOG_CAPTURE') == '1'): + log_level = logging.INFO + if os.environ.get('OS_LOG_LEVEL') == 'DEBUG': + log_level = logging.DEBUG + elif os.environ.get('OS_LOG_LEVEL') == 'WARNING': + log_level = logging.WARNING + elif os.environ.get('OS_LOG_LEVEL') == 'ERROR': + log_level = logging.ERROR + elif os.environ.get('OS_LOG_LEVEL') == 'CRITICAL': + log_level = logging.CRITICAL self.useFixture(fixtures.FakeLogger( - level=logging.DEBUG, + level=log_level, format='%(asctime)s %(name)-32s ' '%(levelname)-8s %(message)s')) From dab2fafe8ab6c71393264f3b2a4ccb5f20a1b699 Mon Sep 17 00:00:00 2001 From: "Swapnil Kulkarni (coolsvap)" Date: Fri, 22 Jul 2016 05:06:51 +0000 Subject: [PATCH 02/29] Remove discover from test-requirements It's only needed for python < 2.7 which is not supported Change-Id: I370b3fad3cc926ffb600ca669745d11cb1a5a621 --- test-requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/test-requirements.txt b/test-requirements.txt index 88223b0a8c..aed999865a 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -3,7 +3,6 @@ hacking>=0.9.2,<0.10 coverage>=3.6 sphinx>=1.1.2,!=1.2.0,!=1.3b1,<1.3 sphinxcontrib-blockdiag>=1.1.0 -discover fixtures>=0.3.14 python-keystoneclient>=0.4.2 python-subunit From 5b9b2bdf026c5eab6382a636a7c33a7b62942bfc Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 29 Sep 2016 10:08:23 -0700 Subject: [PATCH 03/29] Ansible launcher: fix afs publisher This contains several fixes: * Support remove-prefix. This is used by the FTP publisher we are replacing. * Fix sed expressions. They were missing a '/'. * Make the target directory before rsync. Rsync requires the target root directory exist before running. Elsewhere we solved that by encoding the mkdir into the remote rsync command. Since we are running locally here, just run 'mkdir -p' before running rsync. However, it must be done with the keytab, so include it in the k5start command (so that we do not need to run k5start twice). * Include the 'user' in the site definition as the principal for k5start. Change-Id: I69c263a35e732b9a21d411bd30215945783d1023 --- zuul/launcher/ansiblelaunchserver.py | 42 ++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index a8008715b8..288bc47b96 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1109,6 +1109,11 @@ class NodeWorker(object): # that. 
afsroot = tempfile.mkdtemp(dir=jobdir.staging_root) afscontent = os.path.join(afsroot, 'content') + afssource = afscontent + if afs.get('remove-prefix'): + afssource = os.path.join(afscontent, afs['remove-prefix']) + while afssource[-1] == '/': + afssource = afssource[:-1] src = parameters['WORKSPACE'] if not src.endswith('/'): @@ -1148,7 +1153,7 @@ class NodeWorker(object): # Find the list of root markers in the just-completed build # (usually there will only be one, but some builds produce # content at the root *and* at a tag location). - task = dict(shell=find_pipe.format(path=afscontent, + task = dict(shell=find_pipe.format(path=afssource, file=src_markers_file), when='success', delegate_to='127.0.0.1') @@ -1192,7 +1197,7 @@ class NodeWorker(object): # all, since "/" will be excluded later. command = ("/bin/grep -v '^/$' {src} | " - "/bin/sed -e 's/^+ /' > {filter}".format( + "/bin/sed -e 's/^/+ /' > {filter}".format( src=src_markers_file, filter=filter_file)) task = dict(shell=command, @@ -1208,7 +1213,7 @@ class NodeWorker(object): # underneath the root. command = ("/bin/grep -v '^/$' {exclude} | " - "/bin/sed -e 's/^- /' >> {filter}".format( + "/bin/sed -e 's/^/- /' >> {filter}".format( exclude=exclude_file, filter=filter_file)) task = dict(shell=command, @@ -1224,25 +1229,40 @@ class NodeWorker(object): # then we should omit the '/*' exclusion so that it is # implicitly included. - command = "grep '^/$' {exclude} && echo '- /*' >> {filter}".format( - exclude=exclude_file, - filter=filter_file) + command = ("grep '^/$' {exclude} && " + "echo '- /*' >> {filter} || " + "/bin/true".format( + exclude=exclude_file, + filter=filter_file)) task = dict(shell=command, when='success', delegate_to='127.0.0.1') tasks.append(task) # Perform the rsync with the filter list. - rsync_cmd = [ - '/usr/bin/k5start', '-t', '-k', '{keytab}', '--', + rsync_cmd = ' '.join([ '/usr/bin/rsync', '-rtp', '--safe-links', '--delete-after', "--filter='merge {filter}'", '{src}/', '{dst}/', - ] - shellargs = ' '.join(rsync_cmd).format( - src=afscontent, + ]) + mkdir_cmd = ' '.join(['mkdir', '-p', '{dst}/']) + bash_cmd = ' '.join([ + '/bin/bash', '-c', '"{mkdir_cmd} && {rsync_cmd}"' + ]).format( + mkdir_cmd=mkdir_cmd, + rsync_cmd=rsync_cmd) + + k5start_cmd = ' '.join([ + '/usr/bin/k5start', '-t', '-f', '{keytab}', '{user}', '--', + bash_cmd, + ]) + + shellargs = k5start_cmd.format( + src=afssource, dst=afstarget, filter=filter_file, + user=site['user'], keytab=site['keytab']) + task = dict(shell=shellargs, when='success', delegate_to='127.0.0.1') From 331c3de4a70461b1794127bda8fff158099bd33c Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Tue, 13 Sep 2016 11:32:32 -0500 Subject: [PATCH 04/29] Rename zuul_runner to command In the next patch, we're going to change the body of zuul_runner. But, in order to render that diff well, do the rename in this patch. 
Change-Id: I3727f506cae5da561948869bd8f8daaf42e4dc0d --- zuul/ansible/library/{zuul_runner.py => command.py} | 0 zuul/launcher/ansiblelaunchserver.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename zuul/ansible/library/{zuul_runner.py => command.py} (100%) diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/command.py similarity index 100% rename from zuul/ansible/library/zuul_runner.py rename to zuul/ansible/library/command.py diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 288bc47b96..b956e3b4b0 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1290,8 +1290,8 @@ class NodeWorker(object): runner = dict(command=remote_path, cwd=parameters['WORKSPACE'], parameters=parameters) - task = dict(zuul_runner=runner) - task['name'] = ('zuul_runner with {{ timeout | int - elapsed_time }} ' + task = dict(command=runner) + task['name'] = ('command with {{ timeout | int - elapsed_time }} ' 'second timeout') task['when'] = '{{ elapsed_time < timeout | int }}' task['async'] = '{{ timeout | int - elapsed_time }}' From d1ddd284b85b60a9109ead78d29c93d1fd7c4300 Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Tue, 13 Sep 2016 11:14:58 -0500 Subject: [PATCH 05/29] Use command module instead of zuul_runner Having a modified command module with the zuul_runner logic allows us to use normal command and shell entries in the playbooks. (shell is just a wrapper around command) At this moment in time it's an invasive fork of the run_command method on AnsibleModule. That's not optimal for long term, but should get us closer to being able to discuss appropriate hook points with upstream ansible. Use environment task parameter instead of parameters ansible has a structure for passing in environment variables which we can use. We did not use it before due to a behavior in ansible from pre-2.2 that set LANG settings in the environment in a way that caused us to need to clean things in zuul_runner. The module_set_locale variable defaults to False in 2.2, but to True in 2.1 (which was the regression) Set the config value explcitly just to be sure. Change-Id: Iae4769f923ecf74462e1fe43168ea93ff1c61d6e --- zuul/ansible/library/command.py | 450 +++++++++++++++++++++++---- zuul/launcher/ansiblelaunchserver.py | 9 +- 2 files changed, 397 insertions(+), 62 deletions(-) diff --git a/zuul/ansible/library/command.py b/zuul/ansible/library/command.py index 7689fb3c8e..97bdb278b9 100644 --- a/zuul/ansible/library/command.py +++ b/zuul/ansible/library/command.py @@ -1,6 +1,10 @@ #!/usr/bin/python +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Red Hat, Inc. # Copyright (c) 2016 IBM Corp. +# (c) 2012, Michael DeHaan , and others +# (c) 2016, Toshio Kuratomi # # This module is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -15,12 +19,109 @@ # You should have received a copy of the GNU General Public License # along with this software. If not, see . +# flake8: noqa +# This file shares a significant chunk of code with an upstream ansible +# function, run_command. The goal is to not have to fork quite so much +# of that function, and discussing that design with upstream means we +# should keep the changes to substantive ones only. 
For that reason, this +# file is purposely not enforcing pep8, as making the function pep8 clean +# would remove our ability to easily have a discussion with our friends +# upstream + +DOCUMENTATION = ''' +--- +module: command +short_description: Executes a command on a remote node +version_added: historical +description: + - The M(command) module takes the command name followed by a list of space-delimited arguments. + - The given command will be executed on all selected nodes. It will not be + processed through the shell, so variables like C($HOME) and operations + like C("<"), C(">"), C("|"), C(";") and C("&") will not work (use the M(shell) + module if you need these features). +options: + free_form: + description: + - the command module takes a free form command to run. There is no parameter actually named 'free form'. + See the examples! + required: true + default: null + creates: + description: + - a filename or (since 2.0) glob pattern, when it already exists, this step will B(not) be run. + required: no + default: null + removes: + description: + - a filename or (since 2.0) glob pattern, when it does not exist, this step will B(not) be run. + version_added: "0.8" + required: no + default: null + chdir: + description: + - cd into this directory before running the command + version_added: "0.6" + required: false + default: null + executable: + description: + - change the shell used to execute the command. Should be an absolute path to the executable. + required: false + default: null + version_added: "0.9" + warn: + version_added: "1.8" + default: yes + description: + - if command warnings are on in ansible.cfg, do not warn about this particular line if set to no/false. + required: false +notes: + - If you want to run a command through the shell (say you are using C(<), + C(>), C(|), etc), you actually want the M(shell) module instead. The + M(command) module is much more secure as it's not affected by the user's + environment. + - " C(creates), C(removes), and C(chdir) can be specified after the command. For instance, if you only want to run a command if a certain file does not exist, use this." +author: + - Ansible Core Team + - Michael DeHaan +''' + +EXAMPLES = ''' +# Example from Ansible Playbooks. +- command: /sbin/shutdown -t now + +# Run the command if the specified file does not exist. +- command: /usr/bin/make_database.sh arg1 arg2 creates=/path/to/database + +# You can also use the 'args' form to provide the options. This command +# will change the working directory to somedir/ and will only run when +# /path/to/database doesn't exist. 
+- command: /usr/bin/make_database.sh arg1 arg2 + args: + chdir: somedir/ + creates: /path/to/database +''' + import datetime -import getpass +import glob +import pipes +import re +import shlex import os + +import getpass +import select import subprocess +import traceback import threading +from ansible.module_utils.basic import AnsibleModule, heuristic_log_sanitize +# ZUUL: Hardcode python2 until we're on ansible 2.2 +from ast import literal_eval + + +PASSWD_ARG_RE = re.compile(r'^[-]{0,2}pass[-]?(word|wd)?') + class Console(object): def __enter__(self): @@ -40,30 +141,6 @@ class Console(object): self.logfile.write(outln) -def get_env(): - env = {} - env['HOME'] = os.path.expanduser('~') - env['USER'] = getpass.getuser() - - # Known locations for PAM mod_env sources - for fn in ['/etc/environment', '/etc/default/locale']: - if os.path.exists(fn): - with open(fn) as f: - for line in f: - if not line: - continue - if line[0] == '#': - continue - if '=' not in line: - continue - k, v = line.strip().split('=') - for q in ["'", '"']: - if v[0] == q: - v = v.strip(q) - env[k] = v - return env - - def follow(fd): newline_warning = False with Console() as console: @@ -79,53 +156,310 @@ def follow(fd): console.addLine('[Zuul] No trailing newline\n') -def run(cwd, cmd, args): - env = get_env() - env.update(args) - proc = subprocess.Popen( - ['/bin/bash', '-l', '-c', cmd], - cwd=cwd, +# Taken from ansible/module_utils/basic.py ... forking the method for now +# so that we can dive in and figure out how to make appropriate hook points +def zuul_run_command(self, args, check_rc=False, close_fds=True, executable=None, data=None, binary_data=False, path_prefix=None, cwd=None, use_unsafe_shell=False, prompt_regex=None, environ_update=None): + ''' + Execute a command, returns rc, stdout, and stderr. + + :arg args: is the command to run + * If args is a list, the command will be run with shell=False. + * If args is a string and use_unsafe_shell=False it will split args to a list and run with shell=False + * If args is a string and use_unsafe_shell=True it runs with shell=True. + :kw check_rc: Whether to call fail_json in case of non zero RC. + Default False + :kw close_fds: See documentation for subprocess.Popen(). Default True + :kw executable: See documentation for subprocess.Popen(). Default None + :kw data: If given, information to write to the stdin of the command + :kw binary_data: If False, append a newline to the data. Default False + :kw path_prefix: If given, additional path to find the command in. + This adds to the PATH environment vairable so helper commands in + the same directory can also be found + :kw cwd: If given, working directory to run the command inside + :kw use_unsafe_shell: See `args` parameter. 
Default False + :kw prompt_regex: Regex string (not a compiled regex) which can be + used to detect prompts in the stdout which would otherwise cause + the execution to hang (especially if no input data is specified) + :kwarg environ_update: dictionary to *update* os.environ with + ''' + + shell = False + if isinstance(args, list): + if use_unsafe_shell: + args = " ".join([pipes.quote(x) for x in args]) + shell = True + elif isinstance(args, (str, unicode)) and use_unsafe_shell: + shell = True + elif isinstance(args, (str, unicode)): + # On python2.6 and below, shlex has problems with text type + # ZUUL: Hardcode python2 until we're on ansible 2.2 + if isinstance(args, unicode): + args = args.encode('utf-8') + args = shlex.split(args) + else: + msg = "Argument 'args' to run_command must be list or string" + self.fail_json(rc=257, cmd=args, msg=msg) + + prompt_re = None + if prompt_regex: + try: + prompt_re = re.compile(prompt_regex, re.MULTILINE) + except re.error: + self.fail_json(msg="invalid prompt regular expression given to run_command") + + # expand things like $HOME and ~ + if not shell: + args = [ os.path.expanduser(os.path.expandvars(x)) for x in args if x is not None ] + + rc = 0 + msg = None + st_in = None + + # Manipulate the environ we'll send to the new process + old_env_vals = {} + # We can set this from both an attribute and per call + for key, val in self.run_command_environ_update.items(): + old_env_vals[key] = os.environ.get(key, None) + os.environ[key] = val + if environ_update: + for key, val in environ_update.items(): + old_env_vals[key] = os.environ.get(key, None) + os.environ[key] = val + if path_prefix: + old_env_vals['PATH'] = os.environ['PATH'] + os.environ['PATH'] = "%s:%s" % (path_prefix, os.environ['PATH']) + + # If using test-module and explode, the remote lib path will resemble ... + # /tmp/test_module_scratch/debug_dir/ansible/module_utils/basic.py + # If using ansible or ansible-playbook with a remote system ... 
+ # /tmp/ansible_vmweLQ/ansible_modlib.zip/ansible/module_utils/basic.py + + # Clean out python paths set by ansiballz + if 'PYTHONPATH' in os.environ: + pypaths = os.environ['PYTHONPATH'].split(':') + pypaths = [x for x in pypaths \ + if not x.endswith('/ansible_modlib.zip') \ + and not x.endswith('/debug_dir')] + os.environ['PYTHONPATH'] = ':'.join(pypaths) + if not os.environ['PYTHONPATH']: + del os.environ['PYTHONPATH'] + + # create a printable version of the command for use + # in reporting later, which strips out things like + # passwords from the args list + to_clean_args = args + # ZUUL: Hardcode python2 until we're on ansible 2.2 + if isinstance(args, (unicode, str)): + to_clean_args = shlex.split(to_clean_args) + + clean_args = [] + is_passwd = False + for arg in to_clean_args: + if is_passwd: + is_passwd = False + clean_args.append('********') + continue + if PASSWD_ARG_RE.match(arg): + sep_idx = arg.find('=') + if sep_idx > -1: + clean_args.append('%s=********' % arg[:sep_idx]) + continue + else: + is_passwd = True + arg = heuristic_log_sanitize(arg, self.no_log_values) + clean_args.append(arg) + clean_args = ' '.join(pipes.quote(arg) for arg in clean_args) + + if data: + st_in = subprocess.PIPE + + # ZUUL: changed stderr to follow stdout + kwargs = dict( + executable=executable, + shell=shell, + close_fds=close_fds, + stdin=st_in, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - env=env, ) - t = threading.Thread(target=follow, args=(proc.stdout,)) - t.daemon = True - t.start() + if cwd and os.path.isdir(cwd): + kwargs['cwd'] = cwd - ret = proc.wait() - # Give the thread that is writing the console log up to 10 seconds - # to catch up and exit. If it hasn't done so by then, it is very - # likely stuck in readline() because it spawed a child that is - # holding stdout or stderr open. - t.join(10) - with Console() as console: - if t.isAlive(): - console.addLine("[Zuul] standard output/error still open " - "after child exited") - console.addLine("[Zuul] Task exit code: %s\n" % ret) - return ret + # store the pwd + prev_dir = os.getcwd() + + # make sure we're in the right working directory + if cwd and os.path.isdir(cwd): + try: + os.chdir(cwd) + except (OSError, IOError): + e = get_exception() + self.fail_json(rc=e.errno, msg="Could not open %s, %s" % (cwd, str(e))) + + try: + + if self._debug: + if isinstance(args, list): + running = ' '.join(args) + else: + running = args + self.log('Executing: ' + running) + # ZUUL: Replaced the excution loop with the zuul_runner run function + cmd = subprocess.Popen(args, **kwargs) + t = threading.Thread(target=follow, args=(cmd.stdout,)) + t.daemon = True + t.start() + ret = cmd.wait() + # Give the thread that is writing the console log up to 10 seconds + # to catch up and exit. If it hasn't done so by then, it is very + # likely stuck in readline() because it spawed a child that is + # holding stdout or stderr open. 
+ t.join(10) + with Console() as console: + if t.isAlive(): + console.addLine("[Zuul] standard output/error still open " + "after child exited") + console.addLine("[Zuul] Task exit code: %s\n" % ret) + + cmd.stdout.close() + + # ZUUL: stdout and stderr are in the console log file + stdout = '' + stderr = '' + + rc = cmd.returncode + except (OSError, IOError): + e = get_exception() + self.fail_json(rc=e.errno, msg=str(e), cmd=clean_args) + except Exception: + e = get_exception() + self.fail_json(rc=257, msg=str(e), exception=traceback.format_exc(), cmd=clean_args) + + # Restore env settings + for key, val in old_env_vals.items(): + if val is None: + del os.environ[key] + else: + os.environ[key] = val + + if rc != 0 and check_rc: + msg = heuristic_log_sanitize(stderr.rstrip(), self.no_log_values) + self.fail_json(cmd=clean_args, rc=rc, stdout=stdout, stderr=stderr, msg=msg) + + # reset the pwd + os.chdir(prev_dir) + + return (rc, stdout, stderr) + + +def check_command(commandline): + arguments = { 'chown': 'owner', 'chmod': 'mode', 'chgrp': 'group', + 'ln': 'state=link', 'mkdir': 'state=directory', + 'rmdir': 'state=absent', 'rm': 'state=absent', 'touch': 'state=touch' } + commands = { 'hg': 'hg', 'curl': 'get_url or uri', 'wget': 'get_url or uri', + 'svn': 'subversion', 'service': 'service', + 'mount': 'mount', 'rpm': 'yum, dnf or zypper', 'yum': 'yum', 'apt-get': 'apt', + 'tar': 'unarchive', 'unzip': 'unarchive', 'sed': 'template or lineinfile', + 'dnf': 'dnf', 'zypper': 'zypper' } + become = [ 'sudo', 'su', 'pbrun', 'pfexec', 'runas' ] + warnings = list() + command = os.path.basename(commandline.split()[0]) + if command in arguments: + warnings.append("Consider using file module with %s rather than running %s" % (arguments[command], command)) + if command in commands: + warnings.append("Consider using %s module rather than running %s" % (commands[command], command)) + if command in become: + warnings.append("Consider using 'become', 'become_method', and 'become_user' rather than running %s" % (command,)) + return warnings def main(): + + # the command module is the one ansible module that does not take key=value args + # hence don't copy this one if you are looking to build others! module = AnsibleModule( argument_spec=dict( - command=dict(required=True, default=None), - cwd=dict(required=True, default=None), - parameters=dict(default={}, type='dict') + _raw_params = dict(), + _uses_shell = dict(type='bool', default=False), + chdir = dict(type='path'), + executable = dict(), + creates = dict(type='path'), + removes = dict(type='path'), + warn = dict(type='bool', default=True), + environ = dict(type='dict', default=None), ) ) - p = module.params - env = p['parameters'].copy() - ret = run(p['cwd'], p['command'], env) - if ret == 0: - module.exit_json(changed=True, rc=ret) - else: - module.fail_json(msg="Exit code %s" % ret, rc=ret) + shell = module.params['_uses_shell'] + chdir = module.params['chdir'] + executable = module.params['executable'] + args = module.params['_raw_params'] + creates = module.params['creates'] + removes = module.params['removes'] + warn = module.params['warn'] + environ = module.params['environ'] -from ansible.module_utils.basic import * # noqa + if args.strip() == '': + module.fail_json(rc=256, msg="no command given") + + if chdir: + chdir = os.path.abspath(chdir) + os.chdir(chdir) + + if creates: + # do not run the command if the line contains creates=filename + # and the filename already exists. This allows idempotence + # of command executions. 
+ if glob.glob(creates): + module.exit_json( + cmd=args, + stdout="skipped, since %s exists" % creates, + changed=False, + rc=0 + ) + + if removes: + # do not run the command if the line contains removes=filename + # and the filename does not exist. This allows idempotence + # of command executions. + if not glob.glob(removes): + module.exit_json( + cmd=args, + stdout="skipped, since %s does not exist" % removes, + changed=False, + rc=0 + ) + + warnings = list() + if warn: + warnings = check_command(args) + + if not shell: + args = shlex.split(args) + startd = datetime.datetime.now() + + rc, out, err = zuul_run_command(module, args, executable=executable, use_unsafe_shell=shell, environ_update=environ) + + endd = datetime.datetime.now() + delta = endd - startd + + if out is None: + out = '' + if err is None: + err = '' + + module.exit_json( + cmd = args, + stdout = out.rstrip("\r\n"), + stderr = err.rstrip("\r\n"), + rc = rc, + start = str(startd), + end = str(endd), + delta = str(delta), + changed = True, + warnings = warnings + ) if __name__ == '__main__': main() diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index b956e3b4b0..5c6b06cdab 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1287,15 +1287,14 @@ class NodeWorker(object): task = dict(copy=copy) tasks.append(task) - runner = dict(command=remote_path, - cwd=parameters['WORKSPACE'], - parameters=parameters) - task = dict(command=runner) + task = dict(command=remote_path) task['name'] = ('command with {{ timeout | int - elapsed_time }} ' 'second timeout') task['when'] = '{{ elapsed_time < timeout | int }}' task['async'] = '{{ timeout | int - elapsed_time }}' task['poll'] = 5 + task['environment'] = parameters + task['args'] = dict(chdir=parameters['WORKSPACE']) tasks.append(task) filetask = dict(path=remote_path, @@ -1453,6 +1452,8 @@ class NodeWorker(object): config.write('gathering = explicit\n') config.write('callback_plugins = %s\n' % self.callback_dir) config.write('library = %s\n' % self.library_dir) + # TODO(mordred) This can be removed once we're using ansible 2.2 + config.write('module_set_locale = False\n') # bump the timeout because busy nodes may take more than # 10s to respond config.write('timeout = 30\n') From a192814194dd26978f78a38eb52eefc60bf1254d Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Tue, 13 Sep 2016 11:56:05 -0500 Subject: [PATCH 06/29] Put script string in directly instead of in files Now that we're using the command module, just do inline script content to make debugging/reading easier. Change-Id: Ia63f77fd41a03b4662c26f9d0f3b70d1e6a8b5d3 --- zuul/launcher/ansiblelaunchserver.py | 65 +++++++++++++++++++--------- 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 5c6b06cdab..4f02938e68 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -12,6 +12,7 @@ # License for the specific language governing permissions and limitations # under the License. +import getopt import json import logging import os @@ -25,7 +26,6 @@ import threading import time import traceback import Queue -import uuid import gear import yaml @@ -52,6 +52,45 @@ def boolify(x): return bool(x) +def deal_with_shebang(data): + # Ansible shell blocks do not honor shebang lines. That's fine - but + # we do have a bunch of scripts that have either nothing, -x, -xe, + # -ex or -eux. 
Transform those into leading set commands + if not data.startswith('#!'): + return (None, data) + data_lines = data.split('\n') + data_lines.reverse() + shebang = data_lines.pop() + split_line = shebang.split() + # Strip the # and the ! + executable = split_line[0][2:] + if executable == '/bin/sh': + # Ansible default + executable = None + if len(split_line) > 1: + flag_x = False + flag_e = False + flag_u = False + optlist, args = getopt.getopt(split_line[1:], 'uex') + for opt, _ in optlist: + if opt == '-x': + flag_x = True + elif opt == '-e': + flag_e = True + elif opt == '-u': + flag_u = True + + if flag_x: + data_lines.append('set -x') + if flag_e: + data_lines.append('set -e') + if flag_u: + data_lines.append('set -u') + data_lines.reverse() + data = '\n'.join(data_lines) + return (executable, data) + + class LaunchGearWorker(gear.Worker): def __init__(self, *args, **kw): self.__launch_server = kw.pop('launch_server') @@ -116,9 +155,7 @@ class JobDir(object): self.playbook = os.path.join(self.ansible_root, 'playbook') self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') self.config = os.path.join(self.ansible_root, 'ansible.cfg') - self.script_root = os.path.join(self.ansible_root, 'scripts') self.ansible_log = os.path.join(self.ansible_root, 'ansible_log.txt') - os.makedirs(self.script_root) self.staging_root = os.path.join(self.root, 'staging') os.makedirs(self.staging_root) @@ -1272,22 +1309,10 @@ class NodeWorker(object): def _makeBuilderTask(self, jobdir, builder, parameters): tasks = [] - script_fn = '%s.sh' % str(uuid.uuid4().hex) - script_path = os.path.join(jobdir.script_root, script_fn) - with open(script_path, 'w') as script: - data = builder['shell'] - if not data.startswith('#!'): - data = '#!/bin/bash -x\n %s' % (data,) - script.write(data) - remote_path = os.path.join('/tmp', script_fn) - copy = dict(src=script_path, - dest=remote_path, - mode=0o555) - task = dict(copy=copy) - tasks.append(task) + (executable, shell) = deal_with_shebang(builder['shell']) - task = dict(command=remote_path) + task = dict(shell=shell) task['name'] = ('command with {{ timeout | int - elapsed_time }} ' 'second timeout') task['when'] = '{{ elapsed_time < timeout | int }}' @@ -1295,11 +1320,9 @@ class NodeWorker(object): task['poll'] = 5 task['environment'] = parameters task['args'] = dict(chdir=parameters['WORKSPACE']) - tasks.append(task) + if executable: + task['args']['executable'] = executable - filetask = dict(path=remote_path, - state='absent') - task = dict(file=filetask) tasks.append(task) return tasks From f166784a286643c3afda0057e0b787be869b2c14 Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Wed, 14 Sep 2016 11:39:12 -0500 Subject: [PATCH 07/29] Stop running commands with async The async module is complex, and we're only using it to handle the running cumulative timeout. However, we still fallback on the watchdog timeout from time to time. Make things simpler by just having that be how we time things out. 
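For context, the watchdog referred to here is essentially a timer thread wrapped around the ansible-playbook subprocess: it fires an abort callback unless it is stopped before the time budget runs out. The launcher's actual Watchdog class is not part of this excerpt, so the following Python sketch of that pattern is illustrative only (the names, timeout value and kill callback are assumptions, not the real implementation):

    import subprocess
    import threading
    import time

    class Watchdog(object):
        # Call function(*args) unless stop() is invoked within timeout seconds.
        def __init__(self, timeout, function, args):
            self.timeout = timeout
            self.function = function
            self.args = args
            self._running = False
            self.thread = threading.Thread(target=self._run)
            self.thread.daemon = True

        def _run(self):
            while self._running and self.timeout > 0:
                self.timeout -= 1
                time.sleep(1)
            if self._running:
                self.function(*self.args)

        def start(self):
            self._running = True
            self.thread.start()

        def stop(self):
            self._running = False

    # Hypothetical usage; 'sleep 600' stands in for the ansible-playbook run.
    proc = subprocess.Popen(['sleep', '600'])
    watchdog = Watchdog(120, lambda p: p.kill(), (proc,))
    watchdog.start()
    try:
        ret = proc.wait()
    finally:
        watchdog.stop()

With a single process-level timeout like that, the per-task async/poll bookkeeping and the elapsed_time callback plugin are no longer needed, which is what this patch removes.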
Change-Id: Ie51de4a135d953c4ad9dcb773d27b3c54ca8829b --- zuul/ansible/plugins/__init__.py | 0 .../plugins/callback_plugins/__init__.py | 0 .../plugins/callback_plugins/timeout.py | 52 ------------------- zuul/launcher/ansiblelaunchserver.py | 29 ++--------- 4 files changed, 3 insertions(+), 78 deletions(-) delete mode 100644 zuul/ansible/plugins/__init__.py delete mode 100644 zuul/ansible/plugins/callback_plugins/__init__.py delete mode 100644 zuul/ansible/plugins/callback_plugins/timeout.py diff --git a/zuul/ansible/plugins/__init__.py b/zuul/ansible/plugins/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/zuul/ansible/plugins/callback_plugins/__init__.py b/zuul/ansible/plugins/callback_plugins/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/zuul/ansible/plugins/callback_plugins/timeout.py b/zuul/ansible/plugins/callback_plugins/timeout.py deleted file mode 100644 index 1cfd10df09..0000000000 --- a/zuul/ansible/plugins/callback_plugins/timeout.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2016 IBM Corp. -# -# This file is part of Zuul -# -# This file is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This file is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this file. If not, see . - -import time - -from ansible.executor.task_result import TaskResult -from ansible.plugins.callback import CallbackBase - - -class CallbackModule(CallbackBase): - def __init__(self, *args, **kw): - super(CallbackModule, self).__init__(*args, **kw) - self._elapsed_time = 0.0 - self._task_start_time = None - self._play = None - - def v2_playbook_on_play_start(self, play): - self._play = play - - def playbook_on_task_start(self, name, is_conditional): - self._task_start_time = time.time() - - def v2_on_any(self, *args, **kw): - result = None - if args and isinstance(args[0], TaskResult): - result = args[0] - if not result: - return - - if self._task_start_time is not None: - task_time = time.time() - self._task_start_time - self._elapsed_time += task_time - if self._play and result._host: - manager = self._play.get_variable_manager() - facts = dict(elapsed_time=int(self._elapsed_time)) - - manager.set_nonpersistent_facts(result._host, facts) - self._task_start_time = None diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 4f02938e68..c6fe0a4fe2 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -34,7 +34,6 @@ import jenkins_jobs.formatter import zmq import zuul.ansible.library -import zuul.ansible.plugins.callback_plugins from zuul.lib import commandsocket ANSIBLE_WATCHDOG_GRACE = 5 * 60 @@ -213,19 +212,10 @@ class LaunchServer(object): path = os.path.join(state_dir, 'launcher.socket') self.command_socket = commandsocket.CommandSocket(path) ansible_dir = os.path.join(state_dir, 'ansible') - plugins_dir = os.path.join(ansible_dir, 'plugins') - self.callback_dir = os.path.join(plugins_dir, 'callback_plugins') - if not os.path.exists(self.callback_dir): - os.makedirs(self.callback_dir) self.library_dir = 
os.path.join(ansible_dir, 'library') if not os.path.exists(self.library_dir): os.makedirs(self.library_dir) - callback_path = os.path.dirname(os.path.abspath( - zuul.ansible.plugins.callback_plugins.__file__)) - for fn in os.listdir(callback_path): - shutil.copy(os.path.join(callback_path, fn), self.callback_dir) - library_path = os.path.dirname(os.path.abspath( zuul.ansible.library.__file__)) for fn in os.listdir(library_path): @@ -513,8 +503,7 @@ class LaunchServer(object): args['description'], args['labels'], self.hostname, self.zmq_send_queue, self.termination_queue, self.keep_jobdir, - self.callback_dir, self.library_dir, - self.options) + self.library_dir, self.options) self.node_workers[worker.name] = worker worker.thread = threading.Thread(target=worker.run) @@ -594,8 +583,7 @@ class NodeWorker(object): def __init__(self, config, jobs, builds, sites, name, host, description, labels, manager_name, zmq_send_queue, - termination_queue, keep_jobdir, callback_dir, - library_dir, options): + termination_queue, keep_jobdir, library_dir, options): self.log = logging.getLogger("zuul.NodeWorker.%s" % (name,)) self.log.debug("Creating node worker %s" % (name,)) self.config = config @@ -641,7 +629,6 @@ class NodeWorker(object): self.username = config.get('launcher', 'username') else: self.username = 'zuul' - self.callback_dir = callback_dir self.library_dir = library_dir self.options = options @@ -1313,11 +1300,7 @@ class NodeWorker(object): (executable, shell) = deal_with_shebang(builder['shell']) task = dict(shell=shell) - task['name'] = ('command with {{ timeout | int - elapsed_time }} ' - 'second timeout') - task['when'] = '{{ elapsed_time < timeout | int }}' - task['async'] = '{{ timeout | int - elapsed_time }}' - task['poll'] = 5 + task['name'] = 'command generated from JJB' task['environment'] = parameters task['args'] = dict(chdir=parameters['WORKSPACE']) if executable: @@ -1370,19 +1353,15 @@ class NodeWorker(object): inventory.write('\n') timeout = None - timeout_var = None for wrapper in jjb_job.get('wrappers', []): if isinstance(wrapper, dict): build_timeout = wrapper.get('timeout') if isinstance(build_timeout, dict): - timeout_var = build_timeout.get('timeout-var') timeout = build_timeout.get('timeout') if timeout is not None: timeout = int(timeout) * 60 if not timeout: timeout = ANSIBLE_DEFAULT_TIMEOUT - if timeout_var: - parameters[timeout_var] = str(timeout * 1000) with open(jobdir.playbook, 'w') as playbook: pre_tasks = [] @@ -1428,7 +1407,6 @@ class NodeWorker(object): error_block.append(task) error_block.append(dict(fail=dict(msg='FAILURE'))) - variables.append(dict(timeout=timeout)) play = dict(hosts='node', name='Job body', vars=variables, pre_tasks=pre_tasks, tasks=tasks) playbook.write(yaml.safe_dump([play], default_flow_style=False)) @@ -1473,7 +1451,6 @@ class NodeWorker(object): config.write('retry_files_enabled = False\n') config.write('log_path = %s\n' % jobdir.ansible_log) config.write('gathering = explicit\n') - config.write('callback_plugins = %s\n' % self.callback_dir) config.write('library = %s\n' % self.library_dir) # TODO(mordred) This can be removed once we're using ansible 2.2 config.write('module_set_locale = False\n') From 6767fceba64b5e23f7489856b6b0ba5570c45990 Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Fri, 16 Sep 2016 13:15:28 -0400 Subject: [PATCH 08/29] Split playbook into vars, pre-playbook and playbook In order to get to the point where playbooks that people write for tests are playbooks that they could conceivably also use outside of the 
zuul context, we need to remove the need for zuul-specific things in the main playbook. Add a pre-playbook that runs before the playbook and runs the things that are not tied to current JJB content - namely setting up the logger and prepping directories. Move the SUCCESS/FAILURE message to the post-playbook. Extract the injected variables into a variables file and add a -e@vars.yaml option to the playbook invocation. This provides variables in a known namespace. Obviously there is still an exercise in how a user might write a playbook that wants to consume those variables in some way. Change-Id: Ie5ec6ec65a03ceea9afc3ac59df73cb28f5ca4dd --- zuul/launcher/ansiblelaunchserver.py | 141 +++++++++++++++++++-------- 1 file changed, 102 insertions(+), 39 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index c6fe0a4fe2..6a96f0a3e0 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -38,6 +38,7 @@ from zuul.lib import commandsocket ANSIBLE_WATCHDOG_GRACE = 5 * 60 ANSIBLE_DEFAULT_TIMEOUT = 2 * 60 * 60 +ANSIBLE_DEFAULT_PRE_TIMEOUT = 10 * 60 ANSIBLE_DEFAULT_POST_TIMEOUT = 10 * 60 @@ -151,6 +152,8 @@ class JobDir(object): os.makedirs(self.ansible_root) self.known_hosts = os.path.join(self.ansible_root, 'known_hosts') self.inventory = os.path.join(self.ansible_root, 'inventory') + self.vars = os.path.join(self.ansible_root, 'vars.yaml') + self.pre_playbook = os.path.join(self.ansible_root, 'pre_playbook') self.playbook = os.path.join(self.ansible_root, 'playbook') self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') self.config = os.path.join(self.ansible_root, 'ansible.cfg') @@ -618,6 +621,7 @@ class NodeWorker(object): self._aborted_job = False self._watchdog_timeout = False self._sent_complete_event = False + self.ansible_pre_proc = None self.ansible_job_proc = None self.ansible_post_proc = None self.workspace_root = config.get('launcher', 'workspace_root') @@ -896,6 +900,7 @@ class NodeWorker(object): 'SUCCESS', {}) def runJob(self, job, args): + self.ansible_pre_proc = None self.ansible_job_proc = None self.ansible_post_proc = None result = None @@ -924,6 +929,12 @@ class NodeWorker(object): job.sendWorkData(json.dumps(data)) job.sendWorkStatus(0, 100) + pre_status = self.runAnsiblePrePlaybook(jobdir) + if pre_status is None: + # These should really never fail, so return None and have + # zuul try again + return result + job_status = self.runAnsiblePlaybook(jobdir, timeout) if job_status is None: # The result of the job is indeterminate. 
Zuul will @@ -1013,7 +1024,7 @@ class NodeWorker(object): syncargs['rsync_opts'] = rsync_opts task = dict(synchronize=syncargs) if not scpfile.get('copy-after-failure'): - task['when'] = 'success' + task['when'] = 'success|bool' task.update(self.retry_args) tasks.append(task) @@ -1057,7 +1068,7 @@ class NodeWorker(object): task = dict(shell=shellargs, delegate_to='127.0.0.1') if not scpfile.get('copy-after-failure'): - task['when'] = 'success' + task['when'] = 'success|bool' return task @@ -1086,11 +1097,11 @@ class NodeWorker(object): if rsync_opts: syncargs['rsync_opts'] = rsync_opts task = dict(synchronize=syncargs, - when='success') + when='success|bool') task.update(self.retry_args) tasks.append(task) task = dict(shell='lftp -f %s' % ftpscript, - when='success', + when='success|bool', delegate_to='127.0.0.1') ftpsource = ftpcontent if ftp.get('remove-prefix'): @@ -1151,7 +1162,7 @@ class NodeWorker(object): if rsync_opts: syncargs['rsync_opts'] = rsync_opts task = dict(synchronize=syncargs, - when='success') + when='success|bool') task.update(self.retry_args) tasks.append(task) @@ -1179,7 +1190,7 @@ class NodeWorker(object): # content at the root *and* at a tag location). task = dict(shell=find_pipe.format(path=afssource, file=src_markers_file), - when='success', + when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1187,7 +1198,7 @@ class NodeWorker(object): # published site. task = dict(shell=find_pipe.format(path=afstarget, file=dst_markers_file), - when='success', + when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1199,7 +1210,7 @@ class NodeWorker(object): dst=dst_markers_file, exclude=exclude_file) task = dict(shell=exclude_command, - when='success', + when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1225,7 +1236,7 @@ class NodeWorker(object): src=src_markers_file, filter=filter_file)) task = dict(shell=command, - when='success', + when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1241,7 +1252,7 @@ class NodeWorker(object): exclude=exclude_file, filter=filter_file)) task = dict(shell=command, - when='success', + when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1259,7 +1270,7 @@ class NodeWorker(object): exclude=exclude_file, filter=filter_file)) task = dict(shell=command, - when='success', + when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1288,7 +1299,7 @@ class NodeWorker(object): keytab=site['keytab']) task = dict(shell=shellargs, - when='success', + when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1301,7 +1312,7 @@ class NodeWorker(object): task = dict(shell=shell) task['name'] = 'command generated from JJB' - task['environment'] = parameters + task['environment'] = "{{ zuul.environment }}" task['args'] = dict(chdir=parameters['WORKSPACE']) if executable: task['args']['executable'] = executable @@ -1363,52 +1374,53 @@ class NodeWorker(object): if not timeout: timeout = ANSIBLE_DEFAULT_TIMEOUT - with open(jobdir.playbook, 'w') as playbook: - pre_tasks = [] - tasks = [] - main_block = [] - error_block = [] - variables = [] + with open(jobdir.vars, 'w') as vars_yaml: + variables = dict( + timeout=timeout, + environment=parameters, + ) + zuul_vars = dict(zuul=variables) + vars_yaml.write( + yaml.safe_dump(zuul_vars, default_flow_style=False)) + + with open(jobdir.pre_playbook, 'w') as pre_playbook: shellargs = "ssh-keyscan {{ ansible_host }} > %s" % ( jobdir.known_hosts) - pre_tasks.append(dict(shell=shellargs, - delegate_to='127.0.0.1')) - - 
tasks.append(dict(block=main_block, - rescue=error_block)) + tasks = [] + tasks.append(dict(shell=shellargs, delegate_to='127.0.0.1')) task = dict(file=dict(path='/tmp/console.html', state='absent')) - main_block.append(task) + tasks.append(task) task = dict(zuul_console=dict(path='/tmp/console.html', port=19885)) - main_block.append(task) + tasks.append(task) task = dict(file=dict(path=parameters['WORKSPACE'], state='directory')) - main_block.append(task) + tasks.append(task) msg = [ "Launched by %s" % self.manager_name, "Building remotely on %s in workspace %s" % ( self.name, parameters['WORKSPACE'])] task = dict(zuul_log=dict(msg=msg)) - main_block.append(task) + tasks.append(task) + + play = dict(hosts='node', name='Job setup', tasks=tasks) + pre_playbook.write( + yaml.safe_dump([play], default_flow_style=False)) + + with open(jobdir.playbook, 'w') as playbook: + tasks = [] for builder in jjb_job.get('builders', []): if 'shell' in builder: - main_block.extend( + tasks.extend( self._makeBuilderTask(jobdir, builder, parameters)) - task = dict(zuul_log=dict(msg="Job complete, result: SUCCESS")) - main_block.append(task) - task = dict(zuul_log=dict(msg="Job complete, result: FAILURE")) - error_block.append(task) - error_block.append(dict(fail=dict(msg='FAILURE'))) - - play = dict(hosts='node', name='Job body', vars=variables, - pre_tasks=pre_tasks, tasks=tasks) + play = dict(hosts='node', name='Job body', tasks=tasks) playbook.write(yaml.safe_dump([play], default_flow_style=False)) early_publishers, late_publishers = self._transformPublishers(jjb_job) @@ -1434,6 +1446,14 @@ class NodeWorker(object): # we run the log publisher regardless of whether the rest # of the publishers succeed. tasks = [] + + task = dict(zuul_log=dict(msg="Job complete, result: SUCCESS"), + when='success|bool') + blocks[0].insert(0, task) + task = dict(zuul_log=dict(msg="Job complete, result: FAILURE"), + when='not success|bool') + blocks[0].insert(0, task) + tasks.append(dict(block=blocks[0], always=blocks[1])) @@ -1470,6 +1490,46 @@ class NodeWorker(object): self.log.warning(msg) self.abortRunningProc(proc) + def runAnsiblePrePlaybook(self, jobdir): + # Set LOGNAME env variable so Ansible log_path log reports + # the correct user. + env_copy = os.environ.copy() + env_copy['LOGNAME'] = 'zuul' + + if self.options['verbose']: + verbose = '-vvv' + else: + verbose = '-v' + + cmd = ['ansible-playbook', jobdir.pre_playbook, + '-e@%s' % jobdir.vars, verbose] + self.log.debug("Ansible pre command: %s" % (cmd,)) + + self.ansible_pre_proc = subprocess.Popen( + cmd, + cwd=jobdir.ansible_root, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + preexec_fn=os.setsid, + env=env_copy, + ) + ret = None + watchdog = Watchdog(ANSIBLE_DEFAULT_PRE_TIMEOUT, + self._ansibleTimeout, + (self.ansible_pre_proc, + "Ansible pre timeout exceeded")) + watchdog.start() + try: + for line in iter(self.ansible_pre_proc.stdout.readline, b''): + line = line[:1024].rstrip() + self.log.debug("Ansible pre output: %s" % (line,)) + ret = self.ansible_pre_proc.wait() + finally: + watchdog.stop() + self.log.debug("Ansible pre exit code: %s" % (ret,)) + self.ansible_pre_proc = None + return ret == 0 + def runAnsiblePlaybook(self, jobdir, timeout): # Set LOGNAME env variable so Ansible log_path log reports # the correct user. 
@@ -1481,7 +1541,8 @@ class NodeWorker(object): else: verbose = '-v' - cmd = ['ansible-playbook', jobdir.playbook, verbose] + cmd = ['ansible-playbook', jobdir.playbook, verbose, + '-e@%s' % jobdir.vars] self.log.debug("Ansible command: %s" % (cmd,)) self.ansible_job_proc = subprocess.Popen( @@ -1530,7 +1591,9 @@ class NodeWorker(object): verbose = '-v' cmd = ['ansible-playbook', jobdir.post_playbook, - '-e', 'success=%s' % success, verbose] + '-e', 'success=%s' % success, + '-e@%s' % jobdir.vars, + verbose] self.log.debug("Ansible post command: %s" % (cmd,)) self.ansible_post_proc = subprocess.Popen( From 843b4df30d1067d55c60045a01476601b307fee4 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sun, 19 Jun 2016 20:29:50 -0400 Subject: [PATCH 09/29] Enable pipelining for ansible-playbook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From the manual: Enabling pipelining reduces the number of SSH operations required to execute a module on the remote server, by executing many ansible modules without actual file transfer. This can result in a very significant performance improvement when enabled, however when using “sudo:” operations you must first disable ‘requiretty’ in /etc/sudoers on all managed hosts. Basically on local testing, there is a speed improvement. However, I believe the better reason to enable this is to reduce the number of SSH transactions we preform on our workers. In doing this we reduce our potential chances for SSH connection issue. However, it also appears async operations do not use this setting simply because of async works. Change-Id: Ib224fbf1fed19be3ce7db4da0c466e3d11acc365 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 6a96f0a3e0..ae9e9624cf 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1479,6 +1479,7 @@ class NodeWorker(object): config.write('timeout = 30\n') config.write('[ssh_connection]\n') + config.write('pipelining = True\n') ssh_args = "-o ControlMaster=auto -o ControlPersist=60s " \ "-o UserKnownHostsFile=%s" % jobdir.known_hosts config.write('ssh_args = %s\n' % ssh_args) From fd97be44d9984f97af1299e8359cf0b2e40d7b24 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 30 Sep 2016 19:58:24 -0400 Subject: [PATCH 10/29] Strip leading / from afs targets It seems that Jenkins does this. At least with FTP. We don't have any leading / on AFS targets, but do the same there for symmetry. Change-Id: Icb7451c0f3f5fa62c8a15fc621fd30f2df166c96 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 288bc47b96..46e5821b94 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1131,7 +1131,7 @@ class NodeWorker(object): task.update(self.retry_args) tasks.append(task) - afstarget = afs['target'] + afstarget = afs['target'].lstrip('/') afstarget = self._substituteVariables(afstarget, parameters) afstarget = os.path.join(site['root'], afstarget) afstarget = os.path.normpath(afstarget) From 226cdd470679937f767ee0fe02f522d7308a037e Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Mon, 3 Oct 2016 10:50:31 -0700 Subject: [PATCH 11/29] Ansible launcher: Fix afs publisher root detection The find command that collected the marker files is expected to print paths with a leading '/' (see later commands which grep for '^/') but this was omitted. This would cause all jobs which published to the root (whether they had any content in the root directory or were simply only intended to publish to a subdir of the root) to conflict with each other. Also, correct a missing fully-qualified path. Change-Id: I6030c2b101026ff8e72cf4043e1d1b4fbffc5dcb --- zuul/launcher/ansiblelaunchserver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 46e5821b94..38a83bd218 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1145,7 +1145,7 @@ class NodeWorker(object): filter_file = os.path.join(afsroot, 'filter') find_pipe = [ - "/usr/bin/find {path} -name .root-marker -printf '%P\n'", + "/usr/bin/find {path} -name .root-marker -printf '/%P\n'", "/usr/bin/xargs -I{{}} dirname {{}}", "/usr/bin/sort > {file}"] find_pipe = ' | '.join(find_pipe) @@ -1229,7 +1229,7 @@ class NodeWorker(object): # then we should omit the '/*' exclusion so that it is # implicitly included. - command = ("grep '^/$' {exclude} && " + command = ("/bin/grep '^/$' {exclude} && " "echo '- /*' >> {filter} || " "/bin/true".format( exclude=exclude_file, From 4c91765f94b494867e1099c437fafb509d10a163 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 12 Oct 2016 14:15:20 -0700 Subject: [PATCH 12/29] Tidy up README.rst Some minor changes to clean up the existing version in preparation for expanding the content. Change-Id: I1cb97286ad91e8491b98c825ff26c37aa756c0b0 --- README.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index ff4d93867e..2c98b688d2 100644 --- a/README.rst +++ b/README.rst @@ -1,7 +1,7 @@ Zuul ==== -Zuul is a trunk gating system developed for the OpenStack Project. +Zuul is a project gating system developed for the OpenStack Project. Contributing ------------ @@ -11,10 +11,11 @@ To clone the latest code, use `git clone git://git.openstack.org/openstack-infra Bugs are handled at: https://storyboard.openstack.org/#!/project/679 -Code reviews are, as you might expect, handled by gerrit. The gerrit they -use is http://review.openstack.org +Code reviews are, as you might expect, handled by gerrit at +https://review.openstack.org -Use `git review` to submit patches (after creating a gerrit account that links to your launchpad account). Example:: +Use `git review` to submit patches (after creating a Gerrit account +that links to your launchpad account). Example:: # Do your commits $ git review From 1a426409f570c8c9b46dbfcc719e456ae9efd846 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 12 Oct 2016 15:17:07 -0700 Subject: [PATCH 13/29] Add a note to README.rst about contributing to Zuul v3 Change-Id: Ie81a393900bb8fe057836ba6e13099c0f0e39e57 --- README.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.rst b/README.rst index 2c98b688d2..90e00a56f3 100644 --- a/README.rst +++ b/README.rst @@ -6,6 +6,11 @@ Zuul is a project gating system developed for the OpenStack Project. Contributing ------------ +We are currently engaged in a significant development effort in +preparation for the third major version of Zuul. 
We call this effort +`Zuul v3`_ and it is described in this file in the `feature/zuulv3` +branch of this repo. + To browse the latest code, see: https://git.openstack.org/cgit/openstack-infra/zuul/tree/ To clone the latest code, use `git clone git://git.openstack.org/openstack-infra/zuul` From 79e94b6bddcc0ca23687d85c68c20e2799ed376a Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 18 Oct 2016 08:20:22 -0700 Subject: [PATCH 14/29] Default test log level to DEBUG except for testr When running under testr, default the test log level to INFO, but otherwise, default it to DEBUG. This way when a developer runs a test in the foreground it logs at DEBUG without any further configuration needed. Change-Id: Ie7388ebf25669807a8c430b6908f9d18115b5dc6 --- .testr.conf | 2 +- tests/base.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.testr.conf b/.testr.conf index 222ce97160..8ef6689074 100644 --- a/.testr.conf +++ b/.testr.conf @@ -1,4 +1,4 @@ [DEFAULT] -test_command=OS_STDOUT_CAPTURE=${OS_STDOUT_CAPTURE:-1} OS_STDERR_CAPTURE=${OS_STDERR_CAPTURE:-1} OS_LOG_CAPTURE=${OS_LOG_CAPTURE:-1} OS_LOG_DEFAULTS=${OS_LOG_DEFAULTS:-""} ${PYTHON:-python} -m subunit.run discover -t ./ tests $LISTOPT $IDOPTION +test_command=OS_LOG_LEVEL=${OS_LOG_LEVEL:-INFO} OS_STDOUT_CAPTURE=${OS_STDOUT_CAPTURE:-1} OS_STDERR_CAPTURE=${OS_STDERR_CAPTURE:-1} OS_LOG_CAPTURE=${OS_LOG_CAPTURE:-1} OS_LOG_DEFAULTS=${OS_LOG_DEFAULTS:-""} ${PYTHON:-python} -m subunit.run discover -t ./ tests $LISTOPT $IDOPTION test_id_option=--load-list $IDFILE test_list_option=--list diff --git a/tests/base.py b/tests/base.py index ab608cbe90..c1851cf979 100755 --- a/tests/base.py +++ b/tests/base.py @@ -857,9 +857,11 @@ class BaseTestCase(testtools.TestCase): self.useFixture(fixtures.MonkeyPatch('sys.stderr', stderr)) if (os.environ.get('OS_LOG_CAPTURE') == 'True' or os.environ.get('OS_LOG_CAPTURE') == '1'): - log_level = logging.INFO + log_level = logging.DEBUG if os.environ.get('OS_LOG_LEVEL') == 'DEBUG': log_level = logging.DEBUG + elif os.environ.get('OS_LOG_LEVEL') == 'INFO': + log_level = logging.INFO elif os.environ.get('OS_LOG_LEVEL') == 'WARNING': log_level = logging.WARNING elif os.environ.get('OS_LOG_LEVEL') == 'ERROR': From 6444d7b9777e5fdbdacb4c0e34da7f9904d60bda Mon Sep 17 00:00:00 2001 From: stephane Date: Tue, 18 Oct 2016 17:57:45 -0700 Subject: [PATCH 15/29] Add pillow dependency libjpeg-dev to bindep This should fix the error we're seeing in gate jobs: ValueError: jpeg is required unless explicitly disabled using --disable-jpeg, aborting Change-Id: I686b21e452aad2dfb9358360137070f58dde3882 --- bindep.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/bindep.txt b/bindep.txt index 1ade6557cd..1f08a58e13 100644 --- a/bindep.txt +++ b/bindep.txt @@ -2,3 +2,4 @@ mysql-client [test] mysql-server [test] postgresql [test] postgresql-client [test] +libjpeg-dev [test] From a4c892d6b43b1e66304757b2de4d93d67c5dc3de Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Wed, 19 Oct 2016 15:32:42 +0000 Subject: [PATCH 16/29] Revert "Put script string in directly instead of in files" This reverts commit a192814194dd26978f78a38eb52eefc60bf1254d. 
Change-Id: Idd17e474d3ac8842855cb47f74d5ba7c331a074e --- zuul/launcher/ansiblelaunchserver.py | 65 +++++++++------------------- 1 file changed, 21 insertions(+), 44 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index c57a84cd4a..0173426e94 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -12,7 +12,6 @@ # License for the specific language governing permissions and limitations # under the License. -import getopt import json import logging import os @@ -26,6 +25,7 @@ import threading import time import traceback import Queue +import uuid import gear import yaml @@ -52,45 +52,6 @@ def boolify(x): return bool(x) -def deal_with_shebang(data): - # Ansible shell blocks do not honor shebang lines. That's fine - but - # we do have a bunch of scripts that have either nothing, -x, -xe, - # -ex or -eux. Transform those into leading set commands - if not data.startswith('#!'): - return (None, data) - data_lines = data.split('\n') - data_lines.reverse() - shebang = data_lines.pop() - split_line = shebang.split() - # Strip the # and the ! - executable = split_line[0][2:] - if executable == '/bin/sh': - # Ansible default - executable = None - if len(split_line) > 1: - flag_x = False - flag_e = False - flag_u = False - optlist, args = getopt.getopt(split_line[1:], 'uex') - for opt, _ in optlist: - if opt == '-x': - flag_x = True - elif opt == '-e': - flag_e = True - elif opt == '-u': - flag_u = True - - if flag_x: - data_lines.append('set -x') - if flag_e: - data_lines.append('set -e') - if flag_u: - data_lines.append('set -u') - data_lines.reverse() - data = '\n'.join(data_lines) - return (executable, data) - - class LaunchGearWorker(gear.Worker): def __init__(self, *args, **kw): self.__launch_server = kw.pop('launch_server') @@ -157,7 +118,9 @@ class JobDir(object): self.playbook = os.path.join(self.ansible_root, 'playbook') self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') self.config = os.path.join(self.ansible_root, 'ansible.cfg') + self.script_root = os.path.join(self.ansible_root, 'scripts') self.ansible_log = os.path.join(self.ansible_root, 'ansible_log.txt') + os.makedirs(self.script_root) self.staging_root = os.path.join(self.root, 'staging') os.makedirs(self.staging_root) @@ -1307,16 +1270,30 @@ class NodeWorker(object): def _makeBuilderTask(self, jobdir, builder, parameters): tasks = [] + script_fn = '%s.sh' % str(uuid.uuid4().hex) + script_path = os.path.join(jobdir.script_root, script_fn) + with open(script_path, 'w') as script: + data = builder['shell'] + if not data.startswith('#!'): + data = '#!/bin/bash -x\n %s' % (data,) + script.write(data) - (executable, shell) = deal_with_shebang(builder['shell']) + remote_path = os.path.join('/tmp', script_fn) + copy = dict(src=script_path, + dest=remote_path, + mode=0o555) + task = dict(copy=copy) + tasks.append(task) - task = dict(shell=shell) + task = dict(command=remote_path) task['name'] = 'command generated from JJB' task['environment'] = "{{ zuul.environment }}" task['args'] = dict(chdir=parameters['WORKSPACE']) - if executable: - task['args']['executable'] = executable + tasks.append(task) + filetask = dict(path=remote_path, + state='absent') + task = dict(file=filetask) tasks.append(task) return tasks From 3f21d4061d2348adbcad38a90eb4747f82c68e6e Mon Sep 17 00:00:00 2001 From: "John L. 
Villalovos" Date: Wed, 12 Oct 2016 11:46:44 -0700 Subject: [PATCH 17/29] Generate shell scripts as a sequence For the generated shell scripts in ansiblelaunchserver.py, have them be generated in numerical order. For example 01.sh, 02.sh, etc. This will allow us to tell the ordering of the scripts when looking in '_zuul_ansible/scripts/' Change-Id: Iba6231242a58a23549c92aa32620d498e05886f8 --- zuul/launcher/ansiblelaunchserver.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 0173426e94..be4348ad1a 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -25,7 +25,6 @@ import threading import time import traceback import Queue -import uuid import gear import yaml @@ -1268,9 +1267,9 @@ class NodeWorker(object): return tasks - def _makeBuilderTask(self, jobdir, builder, parameters): + def _makeBuilderTask(self, jobdir, builder, parameters, sequence): tasks = [] - script_fn = '%s.sh' % str(uuid.uuid4().hex) + script_fn = '%02d.sh' % sequence script_path = os.path.join(jobdir.script_root, script_fn) with open(script_path, 'w') as script: data = builder['shell'] @@ -1392,10 +1391,13 @@ class NodeWorker(object): with open(jobdir.playbook, 'w') as playbook: tasks = [] + sequence = 0 for builder in jjb_job.get('builders', []): if 'shell' in builder: + sequence += 1 tasks.extend( - self._makeBuilderTask(jobdir, builder, parameters)) + self._makeBuilderTask(jobdir, builder, parameters, + sequence)) play = dict(hosts='node', name='Job body', tasks=tasks) playbook.write(yaml.safe_dump([play], default_flow_style=False)) From 5a84a7647e75348f2d30567f3bbeda824be214d6 Mon Sep 17 00:00:00 2001 From: "John L. Villalovos" Date: Wed, 12 Oct 2016 11:46:44 -0700 Subject: [PATCH 18/29] Ansible launcher: use sequence-uuid in shell scripts For the generated shell scripts which are named using UUID4, prepend a sequence count to them to easily be able to tell the ordering of the scripts when looking in '_zuul_ansible/scripts/'. Keep the uuid to avoid potential collisions in /tmp. Change-Id: Id80bf5139ba1ce12c62945421d49c5e3cd8e2f48 --- zuul/launcher/ansiblelaunchserver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index be4348ad1a..11bcf7955e 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -24,6 +24,7 @@ import tempfile import threading import time import traceback +import uuid import Queue import gear @@ -1269,7 +1270,7 @@ class NodeWorker(object): def _makeBuilderTask(self, jobdir, builder, parameters, sequence): tasks = [] - script_fn = '%02d.sh' % sequence + script_fn = '%02d-%s.sh' % (sequence, str(uuid.uuid4().hex)) script_path = os.path.join(jobdir.script_root, script_fn) with open(script_path, 'w') as script: data = builder['shell'] From fa3df0bf1f6772577250f729abf90e53fe30403f Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 19 Oct 2016 16:37:11 -0700 Subject: [PATCH 19/29] Ansible launcher: import get_exception in ansible command The ansible command module was missing this import statement. 
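For context, get_exception was the module_utils compatibility helper of that Ansible era for retrieving the active exception without Python 2/3 "except ... as e" differences; the fix simply adds the missing import so the copied command module can call it. A rough sketch of the usage idiom, with a hypothetical report_failure helper for illustration:

    from ansible.module_utils.basic import get_exception


    def report_failure(module, cmd_func):
        # Hypothetical wrapper showing the idiom: fetch the current
        # exception via get_exception() (roughly sys.exc_info()[1])
        # rather than binding it in the except clause.
        try:
            return cmd_func()
        except OSError:
            e = get_exception()
            module.fail_json(msg=str(e))
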
Change-Id: I456512a2a7f2138ac78eb8594c8808f664a9506f --- zuul/ansible/library/command.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zuul/ansible/library/command.py b/zuul/ansible/library/command.py index 97bdb278b9..84c3715359 100644 --- a/zuul/ansible/library/command.py +++ b/zuul/ansible/library/command.py @@ -116,6 +116,7 @@ import traceback import threading from ansible.module_utils.basic import AnsibleModule, heuristic_log_sanitize +from ansible.module_utils.basic import get_exception # ZUUL: Hardcode python2 until we're on ansible 2.2 from ast import literal_eval From 280cd8d0625fcd9820a9289ef1aa4d2076e7b495 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 19 Oct 2016 22:25:21 -0700 Subject: [PATCH 20/29] Ansible launcher: don't close stdout in command module If the job fails to close stdout/stderr, then this will cause an exception. Change-Id: I994c0a0c09bf2ec316ee7c7c51a18fa22372f152 --- zuul/ansible/library/command.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/zuul/ansible/library/command.py b/zuul/ansible/library/command.py index 84c3715359..639032257c 100644 --- a/zuul/ansible/library/command.py +++ b/zuul/ansible/library/command.py @@ -323,7 +323,10 @@ def zuul_run_command(self, args, check_rc=False, close_fds=True, executable=None "after child exited") console.addLine("[Zuul] Task exit code: %s\n" % ret) - cmd.stdout.close() + # ZUUL: If the console log follow thread *is* stuck in readline, + # we can't close stdout (attempting to do so raises an + # exception) , so this is disabled. + # cmd.stdout.close() # ZUUL: stdout and stderr are in the console log file stdout = '' From 7aaf5d2f7665d89673c79d3a093169558eb604cd Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Thu, 20 Oct 2016 06:12:33 -0400 Subject: [PATCH 21/29] Add back timeout_var logic We still need to setup our timeout-var environmental variable, otherwise devstack gate will fail to read BUILD_TIMEOUT and default jobs to 120min timeouts. Change-Id: Ieccba55eaab83074a409efdbb928b4a4fdfdecf7 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 11bcf7955e..f735e6bf1f 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1341,15 +1341,19 @@ class NodeWorker(object): inventory.write('\n') timeout = None + timeout_var = None for wrapper in jjb_job.get('wrappers', []): if isinstance(wrapper, dict): build_timeout = wrapper.get('timeout') if isinstance(build_timeout, dict): + timeout_var = build_timeout.get('timeout-var') timeout = build_timeout.get('timeout') if timeout is not None: timeout = int(timeout) * 60 if not timeout: timeout = ANSIBLE_DEFAULT_TIMEOUT + if timeout_var: + parameters[timeout_var] = str(timeout * 1000) with open(jobdir.vars, 'w') as vars_yaml: variables = dict( From 90b13ca09690516cf0a355eeaec7c0e69dfe704d Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 20 Oct 2016 10:05:31 -0700 Subject: [PATCH 22/29] Ansible launcher: remove keep_remote_files This option was overriding pipelining=True. 
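With keep_remote_files removed, the generated ansible.cfg relies on pipelining alone. A minimal sketch of the relevant [ssh_connection] writes, using a hypothetical helper name and simplified arguments; keep_remote_files is deliberately left at its False default so pipelining actually takes effect:

    def write_ssh_connection_section(config_path, known_hosts):
        # keep_remote_files is not written at all: enabling it would
        # override pipelining. Pipelining runs commands without a tty,
        # so anything expecting interactive input (e.g. sudo) fails
        # fast instead of hanging the job.
        with open(config_path, 'a') as config:
            config.write('[ssh_connection]\n')
            config.write('pipelining = True\n')
            config.write('ssh_args = -o ControlMaster=auto '
                         '-o ControlPersist=60s '
                         '-o UserKnownHostsFile=%s\n' % known_hosts)
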
Change-Id: Icfb281513e33d2390414a5dffc8c9f433d7e24d7 --- zuul/launcher/ansiblelaunchserver.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f735e6bf1f..4b6b223820 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1448,7 +1448,6 @@ class NodeWorker(object): with open(jobdir.config, 'w') as config: config.write('[defaults]\n') config.write('hostfile = %s\n' % jobdir.inventory) - config.write('keep_remote_files = True\n') config.write('local_tmp = %s/.ansible/local_tmp\n' % jobdir.root) config.write('remote_tmp = %s/.ansible/remote_tmp\n' % jobdir.root) config.write('private_key_file = %s\n' % self.private_key_file) @@ -1463,6 +1462,14 @@ class NodeWorker(object): config.write('timeout = 30\n') config.write('[ssh_connection]\n') + # NB: when setting pipelining = True, keep_remote_files + # must be False (the default). Otherwise it apparently + # will override the pipelining option and effectively + # disable it. Pipelining has a side effect of running the + # command without a tty (ie, without the -tt argument to + # ssh). We require this behavior so that if a job runs a + # command which expects interactive input on a tty (such + # as sudo) it does not hang. config.write('pipelining = True\n') ssh_args = "-o ControlMaster=auto -o ControlPersist=60s " \ "-o UserKnownHostsFile=%s" % jobdir.known_hosts From 1b697667d1378d1fee15092859a51059fdd545d5 Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Tue, 18 Oct 2016 08:25:48 -0500 Subject: [PATCH 23/29] Stop installing mysql and postgres Zuul doesn't have a database, so there's no need to install one, let alone two, in its bindep.txt file. Remove them. Change-Id: Ib1ac968d06d059d6184a8fec031cf198d0439596 --- bindep.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bindep.txt b/bindep.txt index 1f08a58e13..a2cc02e43f 100644 --- a/bindep.txt +++ b/bindep.txt @@ -1,5 +1 @@ -mysql-client [test] -mysql-server [test] -postgresql [test] -postgresql-client [test] libjpeg-dev [test] From a126e32d302e96a816ef4090ae3cf77f0597c651 Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Tue, 1 Nov 2016 07:35:31 -0500 Subject: [PATCH 24/29] Add names to post-playbook tasks for debugging While trying to follow a failed post-playbook in the gate, it became harder than desirable to follow which task was failing. Add names to the tasks so that we can track which thing is going on. Change-Id: I35fd7ad75c82f6a82fc8d12b7fd48860c1ab10f1 --- zuul/launcher/ansiblelaunchserver.py | 39 ++++++++++++++++++---------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 4b6b223820..7487167544 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -963,7 +963,8 @@ class NodeWorker(object): # upload. This uploads the playbook and ansible logs. copyargs = dict(src=jobdir.ansible_root + '/', dest=os.path.join(scproot, '_zuul_ansible')) - task = dict(copy=copyargs, + task = dict(name='copy console log', + copy=copyargs, delegate_to='127.0.0.1') # This is a local copy and should not fail, so does # not need a retry stanza. 
@@ -985,7 +986,8 @@ class NodeWorker(object): mode='pull') if rsync_opts: syncargs['rsync_opts'] = rsync_opts - task = dict(synchronize=syncargs) + task = dict(name='copy files from node', + synchronize=syncargs) if not scpfile.get('copy-after-failure'): task['when'] = 'success|bool' task.update(self.retry_args) @@ -1028,7 +1030,8 @@ class NodeWorker(object): private_key_file=self.private_key_file, host=site['host'], user=site['user']) - task = dict(shell=shellargs, + task = dict(name='rsync logs to server', + shell=shellargs, delegate_to='127.0.0.1') if not scpfile.get('copy-after-failure'): task['when'] = 'success|bool' @@ -1059,11 +1062,13 @@ class NodeWorker(object): mode='pull') if rsync_opts: syncargs['rsync_opts'] = rsync_opts - task = dict(synchronize=syncargs, + task = dict(name='copy files from node', + synchronize=syncargs, when='success|bool') task.update(self.retry_args) tasks.append(task) - task = dict(shell='lftp -f %s' % ftpscript, + task = dict(name='FTP files to server', + shell='lftp -f %s' % ftpscript, when='success|bool', delegate_to='127.0.0.1') ftpsource = ftpcontent @@ -1124,7 +1129,8 @@ class NodeWorker(object): mode='pull') if rsync_opts: syncargs['rsync_opts'] = rsync_opts - task = dict(synchronize=syncargs, + task = dict(name='copy files from node', + synchronize=syncargs, when='success|bool') task.update(self.retry_args) tasks.append(task) @@ -1151,7 +1157,8 @@ class NodeWorker(object): # Find the list of root markers in the just-completed build # (usually there will only be one, but some builds produce # content at the root *and* at a tag location). - task = dict(shell=find_pipe.format(path=afssource, + task = dict(name='find root markers in build', + shell=find_pipe.format(path=afssource, file=src_markers_file), when='success|bool', delegate_to='127.0.0.1') @@ -1159,7 +1166,8 @@ class NodeWorker(object): # Find the list of root markers that already exist in the # published site. 
- task = dict(shell=find_pipe.format(path=afstarget, + task = dict(name='find root markers in site', + shell=find_pipe.format(path=afstarget, file=dst_markers_file), when='success|bool', delegate_to='127.0.0.1') @@ -1172,7 +1180,8 @@ class NodeWorker(object): src=src_markers_file, dst=dst_markers_file, exclude=exclude_file) - task = dict(shell=exclude_command, + task = dict(name='produce list of root maker differences', + shell=exclude_command, when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1198,7 +1207,8 @@ class NodeWorker(object): "/bin/sed -e 's/^/+ /' > {filter}".format( src=src_markers_file, filter=filter_file)) - task = dict(shell=command, + task = dict(name='produce first filter list', + shell=command, when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1214,7 +1224,8 @@ class NodeWorker(object): "/bin/sed -e 's/^/- /' >> {filter}".format( exclude=exclude_file, filter=filter_file)) - task = dict(shell=command, + task = dict(name='produce second filter list', + shell=command, when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1232,7 +1243,8 @@ class NodeWorker(object): "/bin/true".format( exclude=exclude_file, filter=filter_file)) - task = dict(shell=command, + task = dict(name='produce third filter list', + shell=command, when='success|bool', delegate_to='127.0.0.1') tasks.append(task) @@ -1261,7 +1273,8 @@ class NodeWorker(object): user=site['user'], keytab=site['keytab']) - task = dict(shell=shellargs, + task = dict(name='k5start write files to AFS', + shell=shellargs, when='success|bool', delegate_to='127.0.0.1') tasks.append(task) From b2d99edf679aaff94be72b07a0f78f0e1e2d82a8 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 1 Nov 2016 07:52:42 -0700 Subject: [PATCH 25/29] Add extra debugging for AFS rsync Add the Ansible-standard rsync output format option to rsync, and also output the filter file to the logs to aid in debugging. Change-Id: I68daf93ee7f5d501e51ec90d201830a18c6e5a47 --- zuul/launcher/ansiblelaunchserver.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 7487167544..ba4eb1cddd 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1249,9 +1249,16 @@ class NodeWorker(object): delegate_to='127.0.0.1') tasks.append(task) + task = dict(name='cat filter list', + shell='cat {filter}'.format(filter=filter_file), + when='success|bool', + delegate_to='127.0.0.1') + tasks.append(task) + # Perform the rsync with the filter list. rsync_cmd = ' '.join([ '/usr/bin/rsync', '-rtp', '--safe-links', '--delete-after', + "--out-format='<>%i %n%L'", "--filter='merge {filter}'", '{src}/', '{dst}/', ]) mkdir_cmd = ' '.join(['mkdir', '-p', '{dst}/']) From 38ce39fe5808810a5b3ea6fb4ce0d951757977f1 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 1 Nov 2016 08:41:15 -0700 Subject: [PATCH 26/29] Use separate library directories for pre and post The custom command module used in order to collect job output was also being used by the pre and post playbooks. This meant that instead of going to the ansible log file, the rsync output would end up in /tmp/console.html on the zuul launcher. To correct this, create separate library directories for use by the pre and post playbooks which will contain all of the modules except the custom command module. Write separate ansible.cfg files for them, and instruct ansible-playbook to use those config files. 
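A condensed sketch of the idea, assuming simplified paths and hypothetical helper names: the module library is split in two (all playbooks get the logging/console modules, only the job playbook gets the custom command module), and each ansible-playbook invocation selects its matching config file, and therefore its library path, through ANSIBLE_CONFIG:

    import os
    import shutil
    import subprocess

    # Modules available to every playbook vs. the console-capturing
    # command module reserved for the job playbook.
    ALL_LIBS = ['zuul_log.py', 'zuul_console.py']
    JOB_LIBS = ['command.py']


    def setup_libraries(src, library_dir, pre_post_library_dir):
        for d in (library_dir, pre_post_library_dir):
            if not os.path.exists(d):
                os.makedirs(d)
        for fn in ALL_LIBS:
            shutil.copy(os.path.join(src, fn), library_dir)
            shutil.copy(os.path.join(src, fn), pre_post_library_dir)
        for fn in JOB_LIBS:
            shutil.copy(os.path.join(src, fn), library_dir)


    def run_playbook(playbook, config_file):
        # Each phase (pre / job / post) points at its own ansible.cfg,
        # and therefore its own library directory, via ANSIBLE_CONFIG.
        env = os.environ.copy()
        env['ANSIBLE_CONFIG'] = config_file
        return subprocess.call(['ansible-playbook', playbook], env=env)
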
Change-Id: I5eb6bcc48bcaa6b056af1af7da93f29408f9db41 --- zuul/launcher/ansiblelaunchserver.py | 42 +++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index ba4eb1cddd..8c39ad72cf 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -118,6 +118,8 @@ class JobDir(object): self.playbook = os.path.join(self.ansible_root, 'playbook') self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') self.config = os.path.join(self.ansible_root, 'ansible.cfg') + self.pre_post_config = os.path.join(self.ansible_root, + 'ansible_pre_post.cfg') self.script_root = os.path.join(self.ansible_root, 'scripts') self.ansible_log = os.path.join(self.ansible_root, 'ansible_log.txt') os.makedirs(self.script_root) @@ -181,10 +183,24 @@ class LaunchServer(object): self.library_dir = os.path.join(ansible_dir, 'library') if not os.path.exists(self.library_dir): os.makedirs(self.library_dir) + self.pre_post_library_dir = os.path.join(ansible_dir, + 'pre_post_library') + if not os.path.exists(self.pre_post_library_dir): + os.makedirs(self.pre_post_library_dir) library_path = os.path.dirname(os.path.abspath( zuul.ansible.library.__file__)) - for fn in os.listdir(library_path): + # Ansible library modules that should be available to all + # playbooks: + all_libs = ['zuul_log.py', 'zuul_console.py'] + # Modules that should only be used by job playbooks: + job_libs = ['command.py'] + + for fn in all_libs: + shutil.copy(os.path.join(library_path, fn), self.library_dir) + shutil.copy(os.path.join(library_path, fn), + self.pre_post_library_dir) + for fn in job_libs: shutil.copy(os.path.join(library_path, fn), self.library_dir) def get_config_default(section, option, default): @@ -469,7 +485,8 @@ class LaunchServer(object): args['description'], args['labels'], self.hostname, self.zmq_send_queue, self.termination_queue, self.keep_jobdir, - self.library_dir, self.options) + self.library_dir, self.pre_post_library_dir, + self.options) self.node_workers[worker.name] = worker worker.thread = threading.Thread(target=worker.run) @@ -549,7 +566,8 @@ class NodeWorker(object): def __init__(self, config, jobs, builds, sites, name, host, description, labels, manager_name, zmq_send_queue, - termination_queue, keep_jobdir, library_dir, options): + termination_queue, keep_jobdir, library_dir, + pre_post_library_dir, options): self.log = logging.getLogger("zuul.NodeWorker.%s" % (name,)) self.log.debug("Creating node worker %s" % (name,)) self.config = config @@ -597,6 +615,7 @@ class NodeWorker(object): else: self.username = 'zuul' self.library_dir = library_dir + self.pre_post_library_dir = pre_post_library_dir self.options = options def isAlive(self): @@ -1465,7 +1484,15 @@ class NodeWorker(object): tasks=tasks) playbook.write(yaml.safe_dump([play], default_flow_style=False)) - with open(jobdir.config, 'w') as config: + self._writeAnsibleConfig(jobdir, jobdir.config, + library=self.library_dir) + self._writeAnsibleConfig(jobdir, jobdir.pre_post_config, + library=self.pre_post_library_dir) + + return timeout + + def _writeAnsibleConfig(self, jobdir, fn, library): + with open(fn, 'w') as config: config.write('[defaults]\n') config.write('hostfile = %s\n' % jobdir.inventory) config.write('local_tmp = %s/.ansible/local_tmp\n' % jobdir.root) @@ -1474,7 +1501,7 @@ class NodeWorker(object): config.write('retry_files_enabled = False\n') config.write('log_path = %s\n' % 
jobdir.ansible_log) config.write('gathering = explicit\n') - config.write('library = %s\n' % self.library_dir) + config.write('library = %s\n' % library) # TODO(mordred) This can be removed once we're using ansible 2.2 config.write('module_set_locale = False\n') # bump the timeout because busy nodes may take more than @@ -1495,8 +1522,6 @@ class NodeWorker(object): "-o UserKnownHostsFile=%s" % jobdir.known_hosts config.write('ssh_args = %s\n' % ssh_args) - return timeout - def _ansibleTimeout(self, proc, msg): self._watchdog_timeout = True self.log.warning(msg) @@ -1507,6 +1532,7 @@ class NodeWorker(object): # the correct user. env_copy = os.environ.copy() env_copy['LOGNAME'] = 'zuul' + env_copy['ANSIBLE_CONFIG'] = jobdir.pre_post_config if self.options['verbose']: verbose = '-vvv' @@ -1547,6 +1573,7 @@ class NodeWorker(object): # the correct user. env_copy = os.environ.copy() env_copy['LOGNAME'] = 'zuul' + env_copy['ANSIBLE_CONFIG'] = jobdir.config if self.options['verbose']: verbose = '-vvv' @@ -1596,6 +1623,7 @@ class NodeWorker(object): # the correct user. env_copy = os.environ.copy() env_copy['LOGNAME'] = 'zuul' + env_copy['ANSIBLE_CONFIG'] = jobdir.pre_post_config if self.options['verbose']: verbose = '-vvv' From bafbc5b328a3584dc0d8f2c84e0306e84f083475 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 7 Nov 2016 14:21:49 -0800 Subject: [PATCH 27/29] Ansible launcher: move AFS publisher into a module The logic to rsync files into AFS is very complex, requiring an rsync command for each of the pseudo-build-roots that are produced by our docs jobs. Rather than try to do this in ansible YAML, move it into an ansible module where it is much simpler. Change-Id: I4cab8003442734ed48c67e09ea8407ec69303d87 --- zuul/ansible/library/zuul_afs.py | 121 +++++++++++++++++++++ zuul/launcher/ansiblelaunchserver.py | 156 ++------------------------- 2 files changed, 129 insertions(+), 148 deletions(-) create mode 100644 zuul/ansible/library/zuul_afs.py diff --git a/zuul/ansible/library/zuul_afs.py b/zuul/ansible/library/zuul_afs.py new file mode 100644 index 0000000000..3ba426b8fa --- /dev/null +++ b/zuul/ansible/library/zuul_afs.py @@ -0,0 +1,121 @@ +#!/usr/bin/python + +# Copyright (c) 2016 Red Hat +# +# This module is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this software. If not, see . + +import os +import subprocess + + +def afs_sync(afsuser, afskeytab, afsroot, afssource, afstarget): + # Find the list of root markers in the just-completed build + # (usually there will only be one, but some builds produce content + # at the root *and* at a tag location, or possibly at multiple + # translation roots). + src_root_markers = [] + for root, dirnames, filenames in os.walk(afssource): + if '.root-marker' in filenames: + src_root_markers.append(root) + + output_blocks = [] + # Synchronize the content at each root marker. + for root_count, src_root in enumerate(src_root_markers): + # The component of the path between the source root and the + # current source root marker. 
May be '.' if there is a marker + # at the root. + subpath = os.path.relpath(src_root, afssource) + + # Add to our debugging output + output = dict(subpath=subpath) + output_blocks.append(output) + + # The absolute path to the source (in staging) and destination + # (in afs) of the build root for the current root marker. + subsource = os.path.abspath(os.path.join(afssource, subpath)) + subtarget = os.path.abspath(os.path.join(afstarget, subpath)) + + # Create a filter list for rsync so that we copy exactly the + # directories we want to without deleting any existing + # directories in the published site that were placed there by + # previous builds. + + # Exclude any directories under this subpath which have root + # markers. + excludes = [] + for root, dirnames, filenames in os.walk(subtarget): + if '.root-marker' in filenames: + exclude_subpath = os.path.relpath(root, subtarget) + if exclude_subpath == '.': + continue + excludes.append(os.path.join('/', exclude_subpath)) + output['excludes'] = excludes + + filter_file = os.path.join(afsroot, 'filter_%i' % root_count) + + with open(filter_file, 'w') as f: + for exclude in excludes: + f.write('- %s\n' % exclude) + + # Perform the rsync with the filter list. + rsync_cmd = ' '.join([ + '/usr/bin/rsync', '-rtp', '--safe-links', '--delete-after', + "--out-format='<>%i %n%L'", + "--filter='merge {filter}'", '{src}/', '{dst}/', + ]) + mkdir_cmd = ' '.join(['mkdir', '-p', '{dst}/']) + bash_cmd = ' '.join([ + '/bin/bash', '-c', '"{mkdir_cmd} && {rsync_cmd}"' + ]).format( + mkdir_cmd=mkdir_cmd, + rsync_cmd=rsync_cmd) + + k5start_cmd = ' '.join([ + '/usr/bin/k5start', '-t', '-f', '{keytab}', '{user}', '--', + bash_cmd, + ]) + + shell_cmd = k5start_cmd.format( + src=subsource, + dst=subtarget, + filter=filter_file, + user=afsuser, + keytab=afskeytab), + output['source'] = subsource + output['destination'] = subtarget + output['output'] = subprocess.check_output(shell_cmd, shell=True) + + return output_blocks + + +def main(): + module = AnsibleModule( + argument_spec=dict( + user=dict(required=True, type='raw'), + keytab=dict(required=True, type='raw'), + root=dict(required=True, type='raw'), + source=dict(required=True, type='raw'), + target=dict(required=True, type='raw'), + ) + ) + + p = module.params + output = afs_sync(p['user'], p['keytab'], p['root'], + p['source'], p['target']) + module.exit_json(changed=True, build_roots=output) + +from ansible.module_utils.basic import * # noqa + +if __name__ == '__main__': + main() diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 8c39ad72cf..49d9bcf931 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -192,7 +192,7 @@ class LaunchServer(object): zuul.ansible.library.__file__)) # Ansible library modules that should be available to all # playbooks: - all_libs = ['zuul_log.py', 'zuul_console.py'] + all_libs = ['zuul_log.py', 'zuul_console.py', 'zuul_afs.py'] # Modules that should only be used by job playbooks: job_libs = ['command.py'] @@ -1120,15 +1120,6 @@ class NodeWorker(object): raise Exception("Undefined AFS site: %s" % site) site = self.sites[site] - # It is possible that this could be done in one rsync step, - # however, the current rysnc from the host is complicated (so - # that we can match the behavior of ant), and then rsync to - # afs is complicated and involves a pre-processing step in - # both locations (so that we can exclude directories). 
Each - # is well understood individually so it is easier to compose - # them in series than combine them together. A better, - # longer-lived solution (with better testing) would do just - # that. afsroot = tempfile.mkdtemp(dir=jobdir.staging_root) afscontent = os.path.join(afsroot, 'content') afssource = afscontent @@ -1162,145 +1153,14 @@ class NodeWorker(object): raise Exception("Target path %s is not below site root" % (afstarget,)) - src_markers_file = os.path.join(afsroot, 'src-markers') - dst_markers_file = os.path.join(afsroot, 'dst-markers') - exclude_file = os.path.join(afsroot, 'exclude') - filter_file = os.path.join(afsroot, 'filter') + afsargs = dict(user=site['user'], + keytab=site['keytab'], + root=afsroot, + source=afssource, + target=afstarget) - find_pipe = [ - "/usr/bin/find {path} -name .root-marker -printf '/%P\n'", - "/usr/bin/xargs -I{{}} dirname {{}}", - "/usr/bin/sort > {file}"] - find_pipe = ' | '.join(find_pipe) - - # Find the list of root markers in the just-completed build - # (usually there will only be one, but some builds produce - # content at the root *and* at a tag location). - task = dict(name='find root markers in build', - shell=find_pipe.format(path=afssource, - file=src_markers_file), - when='success|bool', - delegate_to='127.0.0.1') - tasks.append(task) - - # Find the list of root markers that already exist in the - # published site. - task = dict(name='find root markers in site', - shell=find_pipe.format(path=afstarget, - file=dst_markers_file), - when='success|bool', - delegate_to='127.0.0.1') - tasks.append(task) - - # Create a file that contains the set of directories with root - # markers in the published site that do not have root markers - # in the built site. - exclude_command = "/usr/bin/comm -23 {dst} {src} > {exclude}".format( - src=src_markers_file, - dst=dst_markers_file, - exclude=exclude_file) - task = dict(name='produce list of root maker differences', - shell=exclude_command, - when='success|bool', - delegate_to='127.0.0.1') - tasks.append(task) - - # Create a filter list for rsync so that we copy exactly the - # directories we want to without deleting any existing - # directories in the published site that were placed there by - # previous builds. - - # The first group of items in the filter list are the - # directories in the current build with root markers, except - # for the root of the build. This is so that if, later, the - # build root ends up as an exclude, we still copy the - # directories in this build underneath it (since these - # includes will have matched first). We can't include the - # build root itself here, even if we do want to synchronize - # it, since that would defeat later excludes. In other words, - # if the build produces a root marker in "/subdir" but not in - # "/", this section is needed so that "/subdir" is copied at - # all, since "/" will be excluded later. - - command = ("/bin/grep -v '^/$' {src} | " - "/bin/sed -e 's/^/+ /' > {filter}".format( - src=src_markers_file, - filter=filter_file)) - task = dict(name='produce first filter list', - shell=command, - when='success|bool', - delegate_to='127.0.0.1') - tasks.append(task) - - # The second group is the set of directories that are in the - # published site but not in the built site. This is so that - # if the built site does contain a marker at root (meaning - # that there is content that should be copied into the root) - # that we don't delete everything else previously built - # underneath the root. 
- - command = ("/bin/grep -v '^/$' {exclude} | " - "/bin/sed -e 's/^/- /' >> {filter}".format( - exclude=exclude_file, - filter=filter_file)) - task = dict(name='produce second filter list', - shell=command, - when='success|bool', - delegate_to='127.0.0.1') - tasks.append(task) - - # The last entry in the filter file is for the build root. If - # there is no marker in the build root, then we need to - # exclude it from the rsync, so we add it here. It needs to - # be in the form of '/*' so that it matches all of the files - # in the build root. If there is no marker at the build root, - # then we should omit the '/*' exclusion so that it is - # implicitly included. - - command = ("/bin/grep '^/$' {exclude} && " - "echo '- /*' >> {filter} || " - "/bin/true".format( - exclude=exclude_file, - filter=filter_file)) - task = dict(name='produce third filter list', - shell=command, - when='success|bool', - delegate_to='127.0.0.1') - tasks.append(task) - - task = dict(name='cat filter list', - shell='cat {filter}'.format(filter=filter_file), - when='success|bool', - delegate_to='127.0.0.1') - tasks.append(task) - - # Perform the rsync with the filter list. - rsync_cmd = ' '.join([ - '/usr/bin/rsync', '-rtp', '--safe-links', '--delete-after', - "--out-format='<>%i %n%L'", - "--filter='merge {filter}'", '{src}/', '{dst}/', - ]) - mkdir_cmd = ' '.join(['mkdir', '-p', '{dst}/']) - bash_cmd = ' '.join([ - '/bin/bash', '-c', '"{mkdir_cmd} && {rsync_cmd}"' - ]).format( - mkdir_cmd=mkdir_cmd, - rsync_cmd=rsync_cmd) - - k5start_cmd = ' '.join([ - '/usr/bin/k5start', '-t', '-f', '{keytab}', '{user}', '--', - bash_cmd, - ]) - - shellargs = k5start_cmd.format( - src=afssource, - dst=afstarget, - filter=filter_file, - user=site['user'], - keytab=site['keytab']) - - task = dict(name='k5start write files to AFS', - shell=shellargs, + task = dict(name='Synchronize files to AFS', + zuul_afs=afsargs, when='success|bool', delegate_to='127.0.0.1') tasks.append(task) From 71d98174067951bd2dbe24b57747159e24a0a42f Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Tue, 8 Nov 2016 10:56:31 -0500 Subject: [PATCH 28/29] Add attempts logic for jobs Today, if a job is aborted, zuul will launch said job until success / failure. If the job continues to abort, it will loop forever. As a result, we now added the ability to limit this. By default we'll try to relaunch an aborted job a total of 3 times, before RETRY_LIMIT is returned as the result. Change-Id: Ie26fdc29c07430ebfb3df8be8ac1786d63d7e0fe Signed-off-by: Paul Belanger --- doc/source/zuul.rst | 5 ++++ tests/base.py | 3 +++ tests/fixtures/layout-abort-attempts.yaml | 30 +++++++++++++++++++++ tests/test_scheduler.py | 33 +++++++++++++++++++++++ zuul/launcher/gearman.py | 6 +++++ zuul/layoutvalidator.py | 1 + zuul/model.py | 9 +++++++ zuul/scheduler.py | 1 + 8 files changed, 88 insertions(+) create mode 100644 tests/fixtures/layout-abort-attempts.yaml diff --git a/doc/source/zuul.rst b/doc/source/zuul.rst index 2285ecbb66..e8279d9b44 100644 --- a/doc/source/zuul.rst +++ b/doc/source/zuul.rst @@ -803,6 +803,11 @@ each job as it builds a list from the project specification. Boolean value (``true`` or ``false``) that indicates whatever a job is voting or not. Default: ``true``. +**attempts (optional)** + Number of attempts zuul will launch a job. Once reached, zuul will report + RETRY_LIMIT as the job result. + Defaults to 3. + **tags (optional)** A list of arbitrary strings which will be associated with the job. 
Can be used by the parameter-function to alter behavior based on diff --git a/tests/base.py b/tests/base.py index c5b5b780cd..a14b4a9ea0 100755 --- a/tests/base.py +++ b/tests/base.py @@ -540,6 +540,7 @@ class FakeBuild(threading.Thread): self.wait_condition = threading.Condition() self.waiting = False self.aborted = False + self.requeue = False self.created = time.time() self.description = '' self.run_error = False @@ -602,6 +603,8 @@ class FakeBuild(threading.Thread): result = 'FAILURE' if self.aborted: result = 'ABORTED' + if self.requeue: + result = None if self.run_error: work_fail = True diff --git a/tests/fixtures/layout-abort-attempts.yaml b/tests/fixtures/layout-abort-attempts.yaml new file mode 100644 index 0000000000..86d9d783d9 --- /dev/null +++ b/tests/fixtures/layout-abort-attempts.yaml @@ -0,0 +1,30 @@ +pipelines: + - name: check + manager: IndependentPipelineManager + trigger: + gerrit: + - event: patchset-created + success: + gerrit: + verified: 1 + failure: + gerrit: + verified: -1 + + - name: post + manager: IndependentPipelineManager + trigger: + gerrit: + - event: ref-updated + ref: ^(?!refs/).*$ + +jobs: + - name: project-test1 + attempts: 4 + +projects: + - name: org/project + check: + - project-merge: + - project-test1 + - project-test2 diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 335f98741d..b6fa4a35f2 100755 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -4481,3 +4481,36 @@ For CI problems and help debugging, contact ci@example.org""" self.assertIn( '- docs-draft-test2 https://server/job/docs-draft-test2/1/', body[3]) + + def test_rerun_on_abort(self): + "Test that if a worker fails to run a job, it is run again" + + self.config.set('zuul', 'layout_config', + 'tests/fixtures/layout-abort-attempts.yaml') + self.sched.reconfigure(self.config) + self.worker.hold_jobs_in_build = True + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + self.worker.release('.*-merge') + self.waitUntilSettled() + + self.assertEqual(len(self.builds), 2) + self.builds[0].requeue = True + self.worker.release('.*-test*') + self.waitUntilSettled() + + for x in range(3): + self.assertEqual(len(self.builds), 1) + self.builds[0].requeue = True + self.worker.release('.*-test1') + self.waitUntilSettled() + + self.worker.hold_jobs_in_build = False + self.worker.release() + self.waitUntilSettled() + self.assertEqual(len(self.history), 6) + self.assertEqual(self.countJobResults(self.history, 'SUCCESS'), 2) + self.assertEqual(A.reported, 1) + self.assertIn('RETRY_LIMIT', A.messages[0]) diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py index 02f78fdbac..2840ba6523 100644 --- a/zuul/launcher/gearman.py +++ b/zuul/launcher/gearman.py @@ -367,6 +367,12 @@ class Gearman(object): self.onBuildCompleted(gearman_job, 'NOT_REGISTERED') return build + # NOTE(pabelanger): Rather then looping forever, check to see if job + # has passed attempts limit. 
+ if item.current_build_set.getTries(job.name) > job.attempts: + self.onBuildCompleted(gearman_job, 'RETRY_LIMIT') + return build + if pipeline.precedence == zuul.model.PRECEDENCE_NORMAL: precedence = gear.PRECEDENCE_NORMAL elif pipeline.precedence == zuul.model.PRECEDENCE_HIGH: diff --git a/zuul/layoutvalidator.py b/zuul/layoutvalidator.py index e1e8ac6048..91e15d1fed 100644 --- a/zuul/layoutvalidator.py +++ b/zuul/layoutvalidator.py @@ -103,6 +103,7 @@ class LayoutSchema(object): 'success-pattern': str, 'hold-following-changes': bool, 'voting': bool, + 'attempts': int, 'mutex': str, 'tags': toList(str), 'parameter-function': str, diff --git a/zuul/model.py b/zuul/model.py index 46b0b98c95..b24a06bd25 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -466,6 +466,8 @@ class Job(object): self._files = [] self.skip_if_matcher = None self.swift = {} + # Number of attempts to launch a job before giving up. + self.attempts = 3 def __str__(self): return self.name @@ -646,6 +648,7 @@ class BuildSet(object): self.unable_to_merge = False self.failing_reasons = [] self.merge_state = self.NEW + self.tries = {} def __repr__(self): return '' % ( @@ -671,9 +674,12 @@ class BuildSet(object): def addBuild(self, build): self.builds[build.job.name] = build + if build.job.name not in self.tries: + self.tries[build.job.name] = 1 build.build_set = self def removeBuild(self, build): + self.tries[build.job.name] += 1 del self.builds[build.job.name] def getBuild(self, job_name): @@ -684,6 +690,9 @@ class BuildSet(object): keys.sort() return [self.builds.get(x) for x in keys] + def getTries(self, job_name): + return self.tries.get(job_name) + class QueueItem(object): """A changish inside of a Pipeline queue""" diff --git a/zuul/scheduler.py b/zuul/scheduler.py index b52931ee69..8c265415a4 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -529,6 +529,7 @@ class Scheduler(threading.Thread): m = config_job.get('hold-following-changes', False) if m: job.hold_following_changes = True + job.attempts = config_job.get('attempts', 3) m = config_job.get('voting', None) if m is not None: job.voting = m From 63a595bae3f3db85582ddba2d51ad4124877034f Mon Sep 17 00:00:00 2001 From: Clark Boylan Date: Wed, 16 Nov 2016 11:46:18 -0800 Subject: [PATCH 29/29] Don't retry when using synchronize module There is a bug (https://github.com/ansible/ansible/issues/18281) in the ansible synchronize module that causes any retry attempt at synchronizing to fail because the paths get munged resulting in invalid paths. Unfortunately this also means that the error message we get is not for the first failed sync attempt but for the last making it hard to debug why things failed in the first place. Address this by not attempting to retry until ansible is fixed. This way we get accurate error messages more quickly (as we don't retry over and over and generate a bad error message at the end). 
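Concretely, the copy task is now emitted without any retry arguments, so the error Ansible reports comes from the first (and only) attempt rather than from a retry whose paths were mangled by the upstream bug. A minimal sketch of how such a task is assembled, with a hypothetical helper name and simplified arguments:

    def make_copy_task(source, dest, rsync_opts=None):
        # Build the synchronize task with no retry/until arguments,
        # per https://github.com/ansible/ansible/issues/18281.
        syncargs = dict(src=source, dest=dest,
                        copy_links='yes', mode='pull')
        if rsync_opts:
            syncargs['rsync_opts'] = rsync_opts
        task = dict(name='copy files from node',
                    synchronize=syncargs,
                    when='success|bool')
        # Deliberately no task.update(retry_args) here.
        return task
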
Change-Id: I545c44b11f37576edc8768a3ed78962ff870995f --- zuul/launcher/ansiblelaunchserver.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 49d9bcf931..eeea8f8f51 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1009,7 +1009,11 @@ class NodeWorker(object): synchronize=syncargs) if not scpfile.get('copy-after-failure'): task['when'] = 'success|bool' - task.update(self.retry_args) + # We don't use retry_args here because there is a bug in + # the synchronize module that breaks subsequent attempts at + # retrying. Better to try once and get an accurate error + # message if it fails. + # https://github.com/ansible/ansible/issues/18281 tasks.append(task) task = self._makeSCPTaskLocalAction( @@ -1084,7 +1088,10 @@ class NodeWorker(object): task = dict(name='copy files from node', synchronize=syncargs, when='success|bool') - task.update(self.retry_args) + # We don't use retry_args here because there is a bug in the + # synchronize module that breaks subsequent attempts at retrying. + # Better to try once and get an accurate error message if it fails. + # https://github.com/ansible/ansible/issues/18281 tasks.append(task) task = dict(name='FTP files to server', shell='lftp -f %s' % ftpscript, @@ -1142,7 +1149,10 @@ class NodeWorker(object): task = dict(name='copy files from node', synchronize=syncargs, when='success|bool') - task.update(self.retry_args) + # We don't use retry_args here because there is a bug in the + # synchronize module that breaks subsequent attempts at retrying. + # Better to try once and get an accurate error message if it fails. + # https://github.com/ansible/ansible/issues/18281 tasks.append(task) afstarget = afs['target'].lstrip('/')