From d9ac4c18b1da7d74e85de3d2c989c67d5629a434 Mon Sep 17 00:00:00 2001 From: Antoine Musso Date: Thu, 24 Jul 2014 14:44:02 +0200 Subject: [PATCH 001/152] Zuul references cleaner Zuul mergers create a vast number of git references under /refs/zuul which are never garbage collected. With hundred of thousands of references, that makes git fetch operations very slow since git uploads all references to Gerrit to synchronize the Zuul maintained repository. On one of Wikimedia busy repository (mediawiki/core) we had 55000 such references and it can take up to 18 seconds for a fetch to complete. I have seen occurences of a merge taking 2 minutes to complete. As such, this tiny script clears out references for which the commit date of the pointed commit object is older than 360 days (the default). It is not perfect since a recent reference can well point to an old object. That would be the case on repositories that are barely active. In such case the ref will be gone despite it being recently created. A better way would be to vary Zuul references by using month/day which will let one easily garbage collect them. But I am being lazy and that would not let us clear out references using the current scheme. Example usage: zuul-clear-refs.py --verbose --dry-run --until 90 /srv/zuul/git/project Would show a list of references pointing to commit dates older than 90 days and output a message whenever the script would delete them. Hint about the utility in our merger documentation. 
Reference: https://phabricator.wikimedia.org/T70481 Change-Id: Id4e55f5d571ebd5e8271e516f53f8e05c1f78c1a --- doc/source/merger.rst | 14 ++++++ tools/zuul-clear-refs.py | 94 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100755 tools/zuul-clear-refs.py diff --git a/doc/source/merger.rst b/doc/source/merger.rst index e01bc8c7df..82e204b2cc 100644 --- a/doc/source/merger.rst +++ b/doc/source/merger.rst @@ -58,3 +58,17 @@ instance, a clone will produce a repository in an unpredictable state depending on what the state of Zuul's repository is when the clone happens). They are, however, suitable for automated systems that respond to Zuul triggers. + +Clearing old references +~~~~~~~~~~~~~~~~~~~~~~~ + +The references created under refs/zuul are not garbage collected. Since +git fetch send them all to Gerrit to sync the repositories, the time +spent on merge will slightly grow overtime and start being noticeable. + +To clean them you can use the ``tools/zuul-clear-refs.py`` script on +each repositories. It will delete Zuul references that point to commits +for which the commit date is older than a given amount of days (default +360):: + + ./tools/zuul-clear-refs.py /path/to/zuul/git/repo diff --git a/tools/zuul-clear-refs.py b/tools/zuul-clear-refs.py new file mode 100755 index 0000000000..60ce74422f --- /dev/null +++ b/tools/zuul-clear-refs.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +# Copyright 2014-2015 Antoine "hashar" Musso +# Copyright 2014-2015 Wikimedia Foundation Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations +# under the License. + +# pylint: disable=locally-disabled, invalid-name + +""" +Zuul references cleaner. + +Clear up references under /refs/zuul/ by inspecting the age of the commit the +reference points to. If the commit date is older than a number of days +specificed by --until, the reference is deleted from the git repository. + +Use --dry-run --verbose to finely inspect the script behavior. +""" + +import argparse +import git +import logging +import time +import sys + +NOW = int(time.time()) +DEFAULT_DAYS = 360 +ZUUL_REF_PREFIX = 'refs/zuul/' + +parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, +) +parser.add_argument('--until', dest='days_ago', default=DEFAULT_DAYS, type=int, + help='references older than this number of day will ' + 'be deleted. Default: %s' % DEFAULT_DAYS) +parser.add_argument('-n', '--dry-run', dest='dryrun', action='store_true', + help='do not delete references') +parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', + help='set log level from info to debug') +parser.add_argument('gitrepo', help='path to a Zuul git repository') +args = parser.parse_args() + +logging.basicConfig() +log = logging.getLogger('zuul-clear-refs') +if args.verbose: + log.setLevel(logging.DEBUG) +else: + log.setLevel(logging.INFO) + +try: + repo = git.Repo(args.gitrepo) +except git.exc.InvalidGitRepositoryError: + log.error("Invalid git repo: %s" % args.gitrepo) + sys.exit(1) + +for ref in repo.references: + + if not ref.path.startswith(ZUUL_REF_PREFIX): + continue + if type(ref) is not git.refs.reference.Reference: + # Paranoia: ignore heads/tags/remotes .. + continue + + try: + commit_ts = ref.commit.committed_date + except LookupError: + # GitPython does not properly handle PGP signed tags + log.exception("Error in commit: %s, ref: %s. 
Type: %s", + ref.commit, ref.path, type(ref)) + continue + + commit_age = int((NOW - commit_ts) / 86400) # days + log.debug( + "%s at %s is %3s days old", + ref.commit, + ref.path, + commit_age, + ) + if commit_age > args.days_ago: + if args.dryrun: + log.info("Would delete old ref: %s (%s)", ref.path, ref.commit) + else: + log.info("Deleting old ref: %s (%s)", ref.path, ref.commit) + ref.delete(repo, ref.path) From f9af4cd5106704fb617adbccbbaf681d2e6d0177 Mon Sep 17 00:00:00 2001 From: Bruno Tavares Date: Thu, 15 Oct 2015 14:55:51 -0300 Subject: [PATCH 002/152] Update documentation on statsd metrics generated. Updates the metrics documentation to reflect the statistics generated on the scheduler code. Add the hierarchy of metrics, type of each metric and unit of measure. Co-Authored-By: Danilo Ramalho Change-Id: I9eb98efe85317fba404d9c5c087c33762f7eb4ab --- doc/source/statsd.rst | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/statsd.rst b/doc/source/statsd.rst index f789d612a1..b3bf99f329 100644 --- a/doc/source/statsd.rst +++ b/doc/source/statsd.rst @@ -31,7 +31,7 @@ Metrics The metrics are emitted by the Zuul scheduler (`zuul/scheduler.py`): -**gerrit.events. (counters)** +**gerrit.event. (counters)** Gerrit emits different kind of message over its `stream-events` interface. As a convenience, Zuul emits metrics to statsd which save you from having to use a different daemon to measure Gerrit events. @@ -52,6 +52,18 @@ The metrics are emitted by the Zuul scheduler (`zuul/scheduler.py`): Refer to your Gerrit installation documentation for an exhaustive list of Gerrit event types. +**zuul.node_type.** + Holds metrics specifc to build nodes per label. The hierarchy is: + + #. **** each of the labels associated to a build in + Jenkins. It contains: + + #. **job.** subtree detailing per job statistics: + + #. 
**wait_time** counter and timer of the wait time, with the + difference of the job start time and the launch time, in + milliseconds. + **zuul.pipeline.** Holds metrics specific to jobs. The hierarchy is: @@ -75,10 +87,13 @@ The metrics are emitted by the Zuul scheduler (`zuul/scheduler.py`): known by Zuul (which includes build time and Zuul overhead). #. **total_changes** counter of the number of change proceeding since Zuul started. + #. **wait_time** counter and timer of the wait time, with the difference + of the job start time and the launch time, in milliseconds. Additionally, the `zuul.pipeline.` hierarchy contains - `current_changes` and `resident_time` metrics for each projects. The slash - separator used in Gerrit name being replaced by dots. + `current_changes` (gauge), `resident_time` (timing) and `total_changes` + (counter) metrics for each projects. The slash separator used in Gerrit name + being replaced by dots. As an example, given a job named `myjob` triggered by the `gate` pipeline which took 40 seconds to build, the Zuul scheduler will emit the following From 371ed667c8588543559f04ab11cc13953a6bb25f Mon Sep 17 00:00:00 2001 From: Jan Hruban Date: Thu, 17 Dec 2015 16:32:30 +0100 Subject: [PATCH 003/152] Clarify zuul-cloner message about missing branch Inform correctly about the missing branch in case that ZUUL_BRANCH is set, but options --branch is empty. Before: INFO:zuul.Cloner:upstream repo is missing branch None After: INFO:zuul.Cloner:upstream repo is missing branch stable Also do not print out anything when ZUUL_BRANCH nor --branch is given. 
Change-Id: I74b3e72b3066f03bcf3a10af3657bdf494d3cc6b --- zuul/lib/cloner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/zuul/lib/cloner.py b/zuul/lib/cloner.py index 0ac7f0fdb7..1ba31b77b0 100644 --- a/zuul/lib/cloner.py +++ b/zuul/lib/cloner.py @@ -148,8 +148,9 @@ class Cloner(object): self.log.info("upstream repo has branch %s", indicated_branch) fallback_branch = indicated_branch else: - self.log.info("upstream repo is missing branch %s", - self.branch) + if indicated_branch: + self.log.info("upstream repo is missing branch %s", + indicated_branch) # FIXME should be origin HEAD branch which might not be 'master' fallback_branch = 'master' From 88ef0ea8ab5356c477dee090d18a69296f36f57e Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Wed, 23 Dec 2015 11:57:02 -0500 Subject: [PATCH 004/152] Expose webapp listen_address and port Move the hardcoded settings into the zuul.conf file to allow a user to better customize them. Change-Id: I9c817efc615ac3e8f8a7f4680dad14ef6cf3cc3b Signed-off-by: Paul Belanger --- doc/source/zuul.rst | 11 +++++++++++ etc/zuul.conf-sample | 4 ++++ tests/base.py | 3 ++- zuul/cmd/server.py | 15 ++++++++++++++- zuul/webapp.py | 9 ++++++--- 5 files changed, 37 insertions(+), 5 deletions(-) diff --git a/doc/source/zuul.rst b/doc/source/zuul.rst index ad8ec2e103..a806c5845d 100644 --- a/doc/source/zuul.rst +++ b/doc/source/zuul.rst @@ -64,6 +64,17 @@ gearman_server Path to log config file for internal Gearman server. ``log_config=/etc/zuul/gearman-logging.yaml`` +webapp +"""""" + +**listen_address** + IP address or domain name on which to listen (default: 0.0.0.0). + ``listen_address=127.0.0.1`` + +**port** + Port on which the webapp is listening (default: 8001). 
+ ``port=8008`` + zuul """" diff --git a/etc/zuul.conf-sample b/etc/zuul.conf-sample index 21c1317d6d..d7b8eaeb55 100644 --- a/etc/zuul.conf-sample +++ b/etc/zuul.conf-sample @@ -26,6 +26,10 @@ default_container=logs region_name=EXP logserver_prefix=http://logs.example.org/server.app/ +[webapp] +listen_address=0.0.0.0 +port=8001 + [connection gerrit] driver=gerrit server=review.example.com diff --git a/tests/base.py b/tests/base.py index f3bfa4ea8a..c866ddc345 100755 --- a/tests/base.py +++ b/tests/base.py @@ -962,7 +962,8 @@ class ZuulTestCase(BaseTestCase): self.sched.setLauncher(self.launcher) self.sched.setMerger(self.merge_client) - self.webapp = zuul.webapp.WebApp(self.sched, port=0) + self.webapp = zuul.webapp.WebApp( + self.sched, port=0, listen_address='127.0.0.1') self.rpc = zuul.rpclistener.RPCListener(self.config, self.sched) self.sched.start() diff --git a/zuul/cmd/server.py b/zuul/cmd/server.py index 2aca4f2c02..550fad9bb2 100755 --- a/zuul/cmd/server.py +++ b/zuul/cmd/server.py @@ -174,7 +174,20 @@ class Server(zuul.cmd.ZuulApp): cache_expiry = self.config.getint('zuul', 'status_expiry') else: cache_expiry = 1 - webapp = zuul.webapp.WebApp(self.sched, cache_expiry=cache_expiry) + + if self.config.has_option('webapp', 'listen_address'): + listen_address = self.config.get('webapp', 'listen_address') + else: + listen_address = '0.0.0.0' + + if self.config.has_option('webapp', 'port'): + port = self.config.getint('webapp', 'port') + else: + port = 8001 + + webapp = zuul.webapp.WebApp( + self.sched, port=port, cache_expiry=cache_expiry, + listen_address=listen_address) rpc = zuul.rpclistener.RPCListener(self.config, self.sched) self.configure_connections() diff --git a/zuul/webapp.py b/zuul/webapp.py index 44c333bf95..c1c848b211 100644 --- a/zuul/webapp.py +++ b/zuul/webapp.py @@ -43,16 +43,19 @@ array of changes, they will not include the queue structure. 
class WebApp(threading.Thread): log = logging.getLogger("zuul.WebApp") - def __init__(self, scheduler, port=8001, cache_expiry=1): + def __init__(self, scheduler, port=8001, cache_expiry=1, + listen_address='0.0.0.0'): threading.Thread.__init__(self) self.scheduler = scheduler + self.listen_address = listen_address self.port = port self.cache_expiry = cache_expiry self.cache_time = 0 self.cache = None self.daemon = True - self.server = httpserver.serve(dec.wsgify(self.app), host='0.0.0.0', - port=self.port, start_loop=False) + self.server = httpserver.serve( + dec.wsgify(self.app), host=self.listen_address, port=self.port, + start_loop=False) def run(self): self.server.serve_forever() From c7bbf5857ba8733b2445b7364eb159082aae704b Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Wed, 13 Jan 2016 10:51:28 -0500 Subject: [PATCH 005/152] Log 'Received unrecognized event type' as warning Since zuul continues to function when it receives an unknown event, let's reduce the log level to a warning. Change-Id: If0f763f47b3d775410f608876babb5fa8f69ae96 Signed-off-by: Paul Belanger --- zuul/connection/gerrit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/connection/gerrit.py b/zuul/connection/gerrit.py index f8e5add617..4671ff9abb 100644 --- a/zuul/connection/gerrit.py +++ b/zuul/connection/gerrit.py @@ -94,7 +94,7 @@ class GerritEventConnector(threading.Thread): try: event.account = data.get(accountfield_from_type[event.type]) except KeyError: - self.log.error("Received unrecognized event type '%s' from Gerrit.\ + self.log.warning("Received unrecognized event type '%s' from Gerrit.\ Can not get account information." % event.type) event.account = None From 4b4e28f0a346bc653bd07f3560df7db5eea212ef Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Thu, 4 Feb 2016 11:05:32 -0800 Subject: [PATCH 006/152] Use high precedence when manually submitting jobs Change-Id: Ida30e4e2cfad78c38681631bde46a907f8d25bbe --- tools/trigger-job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/trigger-job.py b/tools/trigger-job.py index dff4e3fd1d..7123afce86 100755 --- a/tools/trigger-job.py +++ b/tools/trigger-job.py @@ -68,7 +68,7 @@ def main(): job = gear.Job("build:%s" % args.job, json.dumps(data), unique=data['ZUUL_UUID']) - c.submitJob(job) + c.submitJob(job, precedence=gear.PRECEDENCE_HIGH) while not job.complete: time.sleep(1) From fe20ccf98ba21e904e285aeb3179b24abc0876c8 Mon Sep 17 00:00:00 2001 From: Antoine Musso Date: Thu, 16 Oct 2014 15:32:14 +0200 Subject: [PATCH 007/152] Update merge status after merge:merge is submitted When preparing a reference, we set the merge state to PENDING before emitting the merge:merge function. If any exception occurs when submitting the merge:merge job, the buildset is left PENDING and is never retried because prepareRef() early exit in such case. Move the merge_state change after the job has been submitted. An exception would let the state as is (ie NEW) and thus indicate it should be retried. 
Closes-Bug: #1358517 Change-Id: I4d91a15aaae878ed231d50ab5f4f7a65f0d0e830 --- zuul/scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zuul/scheduler.py b/zuul/scheduler.py index e1aa0c213b..28b42d3c0a 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -1441,7 +1441,6 @@ class BasePipelineManager(object): return True if build_set.merge_state == build_set.PENDING: return False - build_set.merge_state = build_set.PENDING ref = build_set.ref if hasattr(item.change, 'refspec') and not ref: self.log.debug("Preparing ref for: %s" % item.change) @@ -1459,6 +1458,8 @@ class BasePipelineManager(object): self.sched.merger.updateRepo(item.change.project.name, url, build_set, self.pipeline.precedence) + # merge:merge has been emitted properly: + build_set.merge_state = build_set.PENDING return False def _launchJobs(self, item, jobs): From 2c9498e65a826cceee1423ad44a01b93ecf5138f Mon Sep 17 00:00:00 2001 From: Tobias Henkel Date: Fri, 4 Mar 2016 08:01:05 +0100 Subject: [PATCH 008/152] Cloner: Don't fall back on infrastructure failure Currently zuul-cloner falls back to the fallback branch when fetching of the zuul ref fails. This is intended when the zuul ref is not found on the remote. But in case the fetch fails due to infrastructure reasons (e.g. zuul-merger is not reachable or certificate verification failed) it should bail out with an error. Otherwise a already merged and tested patch will be verified which could lead to broken patches being merged. 
Change-Id: Iefc82603de279e36ad5972ce341b102c8d38f69e --- zuul/lib/cloner.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/zuul/lib/cloner.py b/zuul/lib/cloner.py index 257b95ded7..f0235a6965 100644 --- a/zuul/lib/cloner.py +++ b/zuul/lib/cloner.py @@ -103,7 +103,14 @@ class Cloner(object): repo.fetchFrom(zuul_remote, ref) self.log.debug("Fetched ref %s from %s", ref, project) return True - except (ValueError, GitCommandError): + except ValueError: + self.log.debug("Project %s in Zuul does not have ref %s", + project, ref) + return False + except GitCommandError as error: + # Bail out if fetch fails due to infrastructure reasons + if error.stderr.startswith('fatal: unable to access'): + raise self.log.debug("Project %s in Zuul does not have ref %s", project, ref) return False From 057aed1ee1b97e7b7f493fc98818acaa5c69c5ce Mon Sep 17 00:00:00 2001 From: Alexander Evseev Date: Wed, 28 Oct 2015 17:17:04 +0300 Subject: [PATCH 009/152] Split pipeline description by double newlines on status page Change-Id: If359702a60541cdaaeb273df3bea4c85f1b45eb1 --- etc/status/public_html/jquery.zuul.js | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/etc/status/public_html/jquery.zuul.js b/etc/status/public_html/jquery.zuul.js index c63700a5fe..9df44cee7a 100644 --- a/etc/status/public_html/jquery.zuul.js +++ b/etc/status/public_html/jquery.zuul.js @@ -490,10 +490,12 @@ $header_div.append($heading); if (typeof pipeline.description === 'string') { + var descr = $('') + $.each( pipeline.description.split(/\r?\n\r?\n/), function(index, descr_part){ + descr.append($('

').text(descr_part)); + }); $header_div.append( - $('

').append( - $('').text(pipeline.description) - ) + $('

').append(descr) ); } return $header_div; From dc963fc37b502ba1d7ee1abdef7758505a6fa3a7 Mon Sep 17 00:00:00 2001 From: Sachi King Date: Wed, 23 Mar 2016 16:00:33 +1100 Subject: [PATCH 010/152] Fix test for new WebOb In the change from WebOb 1.5.1 to 1.6.0, WebOb no longer adds a charset when the content-type is application/json. http://docs.webob.org/en/stable/whatsnew-1.6.html#bugfixes Change-Id: I78431f295e891ddaf38809d7de55a7195fb9432d --- tests/test_scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 499786ccb8..b585feabef 100755 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -2242,8 +2242,8 @@ class TestScheduler(ZuulTestCase): headers = f.info() self.assertIn('Content-Length', headers) self.assertIn('Content-Type', headers) - self.assertEqual(headers['Content-Type'], - 'application/json; charset=UTF-8') + self.assertIsNotNone(re.match('^application/json(; charset=UTF-8)?$', + headers['Content-Type'])) self.assertIn('Access-Control-Allow-Origin', headers) self.assertIn('Cache-Control', headers) self.assertIn('Last-Modified', headers) From 9a256753705ec40b695931a1dae298d6e6e746f4 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Mon, 4 Apr 2016 13:38:51 +1000 Subject: [PATCH 011/152] Register connections when testing configuration The layout validation requires the connections to be registered with the scheduler to know what connection names are valid in the layout.yaml. However avoid starting the connections so that things like the gerrit connection don't start streaming from gerrit yet. 
Change-Id: Ie9a03287835c6966f5ac32cac020cf2642ce27d5 --- zuul/cmd/server.py | 1 + zuul/scheduler.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/zuul/cmd/server.py b/zuul/cmd/server.py index 2aca4f2c02..850fecbe6f 100755 --- a/zuul/cmd/server.py +++ b/zuul/cmd/server.py @@ -91,6 +91,7 @@ class Server(zuul.cmd.ZuulApp): logging.basicConfig(level=logging.DEBUG) self.sched = zuul.scheduler.Scheduler(self.config) self.configure_connections() + self.sched.registerConnections(self.connections, load=False) layout = self.sched.testConfig(self.config.get('zuul', 'layout_config'), self.connections) diff --git a/zuul/scheduler.py b/zuul/scheduler.py index 118cbfc7eb..93016aced1 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -313,11 +313,14 @@ class Scheduler(threading.Thread): # Any skip-if predicate can be matched to trigger a skip return cm.MatchAny(skip_matchers) - def registerConnections(self, connections): + def registerConnections(self, connections, load=True): + # load: whether or not to trigger the onLoad for the connection. This + # is useful for not doing a full load during layout validation. self.connections = connections for connection_name, connection in self.connections.items(): connection.registerScheduler(self) - connection.onLoad() + if load: + connection.onLoad() def stopConnections(self): for connection_name, connection in self.connections.items(): From 92464a2291a032b9253fd02fbdd1f61569a4522c Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 5 Apr 2016 10:21:26 -0700 Subject: [PATCH 012/152] Detect dependency cycles introduced with new patchsets When Zuul updates its copy of data about a change, it protects itself from inifinite loops by detecting dependency cycles. However, this only happens when updating a change. If a change depends on another change already in Zuul's cache, it will not necessarily update the cached change, and the dependency cycle detection code will not run. 
This can later cause problems when Zuul attempts to work with these changes. Correct this by always performing a dependency cycle check, even on cached changes which are not updated. A test is added for this, and it also ensures that the situation can still be corrected by the user by removing the dependency cycle. Many debug log lines in the Gerrit source driver are updated to make it more clear what change is being updated in the updateChange method, since this method is recursive and otherwise logs can get somewhat confusing. Change-Id: I6ab570f734d3abed2f71d547f130d9c392b976d6 --- tests/test_scheduler.py | 39 ++++++++++++++++++++++ zuul/source/gerrit.py | 74 ++++++++++++++++++++++++++++++----------- 2 files changed, 94 insertions(+), 19 deletions(-) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 499786ccb8..71cfd0e2c2 100755 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -4215,6 +4215,45 @@ For CI problems and help debugging, contact ci@example.org""" self.waitUntilSettled() self.assertEqual(self.history[-1].changes, '3,2 2,1 1,2') + def test_crd_cycle_join(self): + "Test an updated change creates a cycle" + A = self.fake_gerrit.addFakeChange('org/project2', 'master', 'A') + + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + # Create B->A + B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B') + B.data['commitMessage'] = '%s\n\nDepends-On: %s\n' % ( + B.subject, A.data['id']) + self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + # Update A to add A->B (a cycle). 
+ A.addPatchset() + A.data['commitMessage'] = '%s\n\nDepends-On: %s\n' % ( + A.subject, B.data['id']) + # Normally we would submit the patchset-created event for + # processing here, however, we have no way of noting whether + # the dependency cycle detection correctly raised an + # exception, so instead, we reach into the source driver and + # call the method that would ultimately be called by the event + # processing. + + source = self.sched.layout.pipelines['gate'].source + with testtools.ExpectedException( + Exception, "Dependency cycle detected"): + source._getChange(u'1', u'2', True) + self.log.debug("Got expected dependency cycle exception") + + # Now if we update B to remove the depends-on, everything + # should be okay. B; A->B + + B.addPatchset() + B.data['commitMessage'] = '%s\n' % (B.subject,) + source._getChange(u'1', u'2', True) + source._getChange(u'2', u'2', True) + def test_disable_at(self): "Test a pipeline will only report to the disabled trigger when failing" diff --git a/zuul/source/gerrit.py b/zuul/source/gerrit.py index eb8705d9a2..73cf726eee 100644 --- a/zuul/source/gerrit.py +++ b/zuul/source/gerrit.py @@ -20,6 +20,20 @@ from zuul.model import Change, Ref, NullChange from zuul.source import BaseSource +# Walk the change dependency tree to find a cycle +def detect_cycle(change, history=None): + if history is None: + history = [] + else: + history = history[:] + history.append(change.number) + for dep in change.needs_changes: + if dep.number in history: + raise Exception("Dependency cycle detected: %s in %s" % ( + dep.number, history)) + detect_cycle(dep, history) + + class GerritSource(BaseSource): name = 'gerrit' log = logging.getLogger("zuul.source.Gerrit") @@ -60,6 +74,10 @@ class GerritSource(BaseSource): data = self.connection.query(change.number) change._data = data change.is_merged = self._isMerged(change) + if change.is_merged: + self.log.debug("Change %s is merged" % (change,)) + else: + self.log.debug("Change %s is not merged" % 
(change,)) if not head: return change.is_merged if not change.is_merged: @@ -82,7 +100,6 @@ class GerritSource(BaseSource): status = data.get('status') if not status: return False - self.log.debug("Change %s status: %s" % (change, status)) if status == 'MERGED': return True return False @@ -177,7 +194,7 @@ class GerritSource(BaseSource): (record.get('number'),)) return changes - def _getDependsOnFromCommit(self, message): + def _getDependsOnFromCommit(self, message, change): records = [] seen = set() for match in self.depends_on_re.findall(message): @@ -187,17 +204,19 @@ class GerritSource(BaseSource): continue seen.add(match) query = "change:%s" % (match,) - self.log.debug("Running query %s to find needed changes" % - (query,)) + self.log.debug("Updating %s: Running query %s " + "to find needed changes" % + (change, query,)) records.extend(self.connection.simpleQuery(query)) return records - def _getNeededByFromCommit(self, change_id): + def _getNeededByFromCommit(self, change_id, change): records = [] seen = set() query = 'message:%s' % change_id - self.log.debug("Running query %s to find changes needed-by" % - (query,)) + self.log.debug("Updating %s: Running query %s " + "to find changes needed-by" % + (change, query,)) results = self.connection.simpleQuery(query) for result in results: for match in self.depends_on_re.findall( @@ -207,15 +226,15 @@ class GerritSource(BaseSource): key = (result['number'], result['currentPatchSet']['number']) if key in seen: continue - self.log.debug("Found change %s,%s needs %s from commit" % - (key[0], key[1], change_id)) + self.log.debug("Updating %s: Found change %s,%s " + "needs %s from commit" % + (change, key[0], key[1], change_id)) seen.add(key) records.append(result) return records def _updateChange(self, change, history=None): - self.log.info("Updating information for %s,%s" % - (change.number, change.patchset)) + self.log.info("Updating %s" % (change,)) data = self.connection.query(change.number) change._data = data @@ 
-255,6 +274,7 @@ class GerritSource(BaseSource): if change.is_merged: # This change is merged, so we don't need to look any further # for dependencies. + self.log.debug("Updating %s: change is merged" % (change,)) return change if history is None: @@ -270,21 +290,35 @@ class GerritSource(BaseSource): if dep_num in history: raise Exception("Dependency cycle detected: %s in %s" % ( dep_num, history)) - self.log.debug("Getting git-dependent change %s,%s" % - (dep_num, dep_ps)) + self.log.debug("Updating %s: Getting git-dependent change %s,%s" % + (change, dep_num, dep_ps)) dep = self._getChange(dep_num, dep_ps, history=history) + # Because we are not forcing a refresh in _getChange, it + # may return without executing this code, so if we are + # updating our change to add ourselves to a dependency + # cycle, we won't detect it. By explicitly performing a + # walk of the dependency tree, we will. + detect_cycle(dep, history) if (not dep.is_merged) and dep not in needs_changes: needs_changes.append(dep) - for record in self._getDependsOnFromCommit(data['commitMessage']): + for record in self._getDependsOnFromCommit(data['commitMessage'], + change): dep_num = record['number'] dep_ps = record['currentPatchSet']['number'] if dep_num in history: raise Exception("Dependency cycle detected: %s in %s" % ( dep_num, history)) - self.log.debug("Getting commit-dependent change %s,%s" % - (dep_num, dep_ps)) + self.log.debug("Updating %s: Getting commit-dependent " + "change %s,%s" % + (change, dep_num, dep_ps)) dep = self._getChange(dep_num, dep_ps, history=history) + # Because we are not forcing a refresh in _getChange, it + # may return without executing this code, so if we are + # updating our change to add ourselves to a dependency + # cycle, we won't detect it. By explicitly performing a + # walk of the dependency tree, we will. 
+ detect_cycle(dep, history) if (not dep.is_merged) and dep not in needs_changes: needs_changes.append(dep) change.needs_changes = needs_changes @@ -294,15 +328,17 @@ class GerritSource(BaseSource): for needed in data['neededBy']: parts = needed['ref'].split('/') dep_num, dep_ps = parts[3], parts[4] + self.log.debug("Updating %s: Getting git-needed change %s,%s" % + (change, dep_num, dep_ps)) dep = self._getChange(dep_num, dep_ps) if (not dep.is_merged) and dep.is_current_patchset: needed_by_changes.append(dep) - for record in self._getNeededByFromCommit(data['id']): + for record in self._getNeededByFromCommit(data['id'], change): dep_num = record['number'] dep_ps = record['currentPatchSet']['number'] - self.log.debug("Getting commit-needed change %s,%s" % - (dep_num, dep_ps)) + self.log.debug("Updating %s: Getting commit-needed change %s,%s" % + (change, dep_num, dep_ps)) # Because a commit needed-by may be a cross-repo # dependency, cause that change to refresh so that it will # reference the latest patchset of its Depends-On (this From c4997159bb3f5d3a473854ab9d725d4b8c19105f Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Wed, 17 Feb 2016 21:04:18 +1100 Subject: [PATCH 013/152] Don't reload connections on HUP Previous to the connection work zuul wouldn't stop triggers or therefore the gerrit watcher on reload. Reloading the gerrit connection causes challenges for the cache management which are currently not handled and likely affecting memory usage. Rather than support dynamically reloading all of zuul.conf, just reload the layout and logging and require a full restart for connection changes to take effect. This isn't a feature regression as particular changes to zuul.conf weren't previously picked up (for example, gerrit credentials). 
Change-Id: Iff77fbca6ab15c9636f6ae8f12bc1bad78f79aa9 --- doc/source/zuul.rst | 5 ++--- zuul/cmd/server.py | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/doc/source/zuul.rst b/doc/source/zuul.rst index d8d72e69ce..98e4bb8a2a 100644 --- a/doc/source/zuul.rst +++ b/doc/source/zuul.rst @@ -997,9 +997,8 @@ normal operation, omit ``-d`` and let Zuul run as a daemon. If you send signal 1 (SIGHUP) to the zuul-server process, Zuul will stop executing new jobs, wait until all executing jobs are finished, -reload its configuration, and resume. Any values in any of the -configuration files may be changed, except the location of Zuul's PID -file (a change to that will be ignored until Zuul is restarted). +reload its layout.yaml, and resume. Changes to any connections or +the PID file will be ignored until Zuul is restarted. If you send a SIGUSR1 to the zuul-server process, Zuul will stop executing new jobs, wait until all executing jobs are finished, diff --git a/zuul/cmd/server.py b/zuul/cmd/server.py index 2aca4f2c02..b1cd050808 100755 --- a/zuul/cmd/server.py +++ b/zuul/cmd/server.py @@ -61,12 +61,9 @@ class Server(zuul.cmd.ZuulApp): def reconfigure_handler(self, signum, frame): signal.signal(signal.SIGHUP, signal.SIG_IGN) self.log.debug("Reconfiguration triggered") - self.sched.stopConnections() self.read_config() self.setup_logging('zuul', 'log_config') try: - self.configure_connections() - self.sched.registerConnections(self.connections) self.sched.reconfigure(self.config) except Exception: self.log.exception("Reconfiguration failed:") From 266cbb8a2fd7c150e6c3bf710f365c6a59785d6f Mon Sep 17 00:00:00 2001 From: Thanh Ha Date: Sun, 24 Jan 2016 20:57:59 -0500 Subject: [PATCH 014/152] Add additional zuul.conf information Change-Id: I78c7b4adbcc6967f22d7839808e55314275f74db Signed-off-by: Thanh Ha --- doc/source/zuul.rst | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/doc/source/zuul.rst 
b/doc/source/zuul.rst index d8d72e69ce..563aa238fd 100644 --- a/doc/source/zuul.rst +++ b/doc/source/zuul.rst @@ -10,11 +10,11 @@ Zuul has three configuration files: **zuul.conf** Connection information for Gerrit and Gearman, locations of the - other config files. + other config files. (required) **layout.yaml** - Project and pipeline configuration -- what Zuul does. + Project and pipeline configuration -- what Zuul does. (required) **logging.conf** - Python logging config. + Python logging config. (optional) Examples of each of the three files can be found in the etc/ directory of the source distribution. @@ -41,17 +41,23 @@ You can also find an example zuul.conf file in the git gearman """"""" +Client connection information for gearman. If using Zuul's builtin gearmand +server just set **server** to 127.0.0.1. + **server** Hostname or IP address of the Gearman server. - ``server=gearman.example.com`` + ``server=gearman.example.com`` (required) **port** Port on which the Gearman server is listening. - ``port=4730`` + ``port=4730`` (optional) gearman_server """""""""""""" +The builtin gearman server. Zuul can fork a gearman process from itself rather +than connecting to an external one. + **start** Whether to start the internal Gearman server (default: False). ``start=true`` @@ -67,6 +73,11 @@ gearman_server zuul """" +Zuul's main configuration section. At minimum zuul must be able to find +layout.yaml to be useful. + +.. note:: Must be provided when running zuul-server + .. _layout_config: **layout_config** @@ -118,6 +129,13 @@ zuul merger """""" +The zuul-merger process configuration. Detailed documentation on this process +can be found on the :doc:`merger` page. + +.. note:: Must be provided when running zuul-merger. Both services may share the + same configuration (and even host) or otherwise have an individual + zuul.conf. + **git_dir** Directory that Zuul should clone local git repositories to. 
``git_dir=/var/lib/zuul/git`` From 33ccffe30235d8ce9e55d2b4745c219cc7c63de6 Mon Sep 17 00:00:00 2001 From: Thanh Ha Date: Sun, 24 Jan 2016 21:32:25 -0500 Subject: [PATCH 015/152] Add quick-start guide Change-Id: I718b2dd6fc6c71f67fbb1774ae4b0b625fdf053d Signed-off-by: Thanh Ha --- doc/source/index.rst | 1 + doc/source/quick-start.rst | 162 +++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 doc/source/quick-start.rst diff --git a/doc/source/index.rst b/doc/source/index.rst index 61f9e4f579..3c793dac04 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -13,6 +13,7 @@ Contents: .. toctree:: :maxdepth: 2 + quick-start gating connections triggers diff --git a/doc/source/quick-start.rst b/doc/source/quick-start.rst new file mode 100644 index 0000000000..82779c6703 --- /dev/null +++ b/doc/source/quick-start.rst @@ -0,0 +1,162 @@ +Quick Start Guide +================= + +System Requirements +------------------- + +For most deployments zuul only needs 1-2GB. OpenStack uses a 30GB setup. + +Install Zuul +------------ + +You can get zuul from pypi via:: + + pip install zuul + +Zuul Components +--------------- + +Zuul provides the following components: + + - **zuul-server**: scheduler daemon which communicates with Gerrit and + Gearman. Handles receiving events, launching jobs, collecting results + and posting reports. + - **zuul-merger**: speculative-merger which communicates with Gearman. + Prepares Git repositories for jobs to test against. This additionally + requires a web server hosting the Git repositories which can be cloned + by the jobs. + - **zuul-cloner**: client side script used to setup job workspace. It is + used to clone the repositories prepared by the zuul-merger described + previously.
+ - **gearmand**: optional builtin gearman daemon provided by zuul-server + +External components: + + - Jenkins Gearman plugin: Used by Jenkins to connect to Gearman + +Zuul Communication +------------------ + +All the Zuul components communicate with each other using Gearman. As well as +the following communication channels: + +zuul-server: + + - Gerrit + - Gearman Daemon + +zuul-merger: + + - Gerrit + - Gearman Daemon + +zuul-cloner: + + - http hosted zuul-merger git repos + +Jenkins: + + - Gearman Daemon via Jenkins Gearman Plugin + +Zuul Setup +---------- + +At minimum we need to provide **zuul.conf** and **layout.yaml** and place them +in the /etc/zuul/ directory. You will also need a zuul user and ssh key for the +zuul user in Gerrit. The following example uses the builtin gearmand service +in zuul. + +**zuul.conf**:: + + [zuul] + layout_config=/etc/zuul/layout.yaml + + [merger] + git_dir=/git + zuul_url=http://zuul.example.com/p + + [gearman_server] + start=true + + [gearman] + server=127.0.0.1 + + [connection gerrit] + driver=gerrit + server=git.example.com + port=29418 + baseurl=https://git.example.com/gerrit/ + user=zuul + sshkey=/home/zuul/.ssh/id_rsa + +See :doc:`zuul` for more details. + +The following sets up a basic timer triggered job using zuul. + +**layout.yaml**:: + + pipelines: + - name: periodic + source: gerrit + manager: IndependentPipelineManager + trigger: + timer: + - time: '0 * * * *' + + projects: + - name: aproject + periodic: + - aproject-periodic-build + +Starting Zuul +------------- + +You can run zuul-server with the **-d** option to make it not daemonize. It's +a good idea at first to confirm there's no issues with your configuration. + +Simply run:: + + zuul-server + +Once run you should have 2 zuul-server processes:: + + zuul 12102 1 0 Jan21 ? 00:15:45 /home/zuul/zuulvenv/bin/python /home/zuul/zuulvenv/bin/zuul-server -d + zuul 12107 12102 0 Jan21 ? 
00:00:01 /home/zuul/zuulvenv/bin/python /home/zuul/zuulvenv/bin/zuul-server -d + +Note: In this example zuul was installed in a virtualenv. + +The 2nd zuul-server process is gearmand running if you are using the builtin +gearmand server, otherwise there will only be 1 process. + +Zuul won't actually process your Job queue however unless you also have a +zuul-merger process running. + +Simply run:: + + zuul-merger + +Zuul should now be able to process your periodic job as configured above once +the Jenkins side of things is configured. + +Jenkins Setup +------------- + +Install the Jenkins Gearman Plugin via Jenkins Plugin management interface. +Then navigate to **Manage > Configuration > Gearman** and setup the Jenkins +server hostname/ip and port to connect to gearman. + +At this point gearman should be running your Jenkins jobs. + +Troubleshooting +--------------- + +Checking Gearman function registration (jobs). You can use telnet to connect +to gearman to check that Jenkins is registering your configured jobs in +gearman:: + + telnet <gearman_server> 4730 + +Useful commands are **workers** and **status** which you can run by just +typing those commands once connected to gearman. Every job in your Jenkins +master must appear when you run **workers** for Zuul to be able to run jobs +against your Jenkins instance. From b7273ef849e7070e21c1d51a9b4190237f385027 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 19 Apr 2016 08:58:51 -0700 Subject: [PATCH 016/152] Add report URL to status.json So that the final reporting URL (eg, logs.o.o) can be included on the status page for completed jobs. 
Change-Id: Ida811e2b097a32b8d560aad8a0b18c8048a36279 --- tests/test_scheduler.py | 27 ++++++++++++++++++++----- zuul/model.py | 42 +++++++++++++++++++++++++++++++++------ zuul/reporter/__init__.py | 20 +------------------ zuul/scheduler.py | 7 ++++++- 4 files changed, 65 insertions(+), 31 deletions(-) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index ec1e32a939..fe7c7cc4fa 100755 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -2235,6 +2235,9 @@ class TestScheduler(ZuulTestCase): self.fake_gerrit.addEvent(A.addApproval('APRV', 1)) self.waitUntilSettled() + self.worker.release('project-merge') + self.waitUntilSettled() + port = self.webapp.server.socket.getsockname()[1] req = urllib2.Request("http://localhost:%s/status.json" % port) @@ -2255,7 +2258,7 @@ class TestScheduler(ZuulTestCase): self.waitUntilSettled() data = json.loads(data) - status_jobs = set() + status_jobs = [] for p in data['pipelines']: for q in p['change_queues']: if p['name'] in ['gate', 'conflict']: @@ -2267,10 +2270,24 @@ class TestScheduler(ZuulTestCase): self.assertTrue(change['active']) self.assertEqual(change['id'], '1,1') for job in change['jobs']: - status_jobs.add(job['name']) - self.assertIn('project-merge', status_jobs) - self.assertIn('project-test1', status_jobs) - self.assertIn('project-test2', status_jobs) + status_jobs.append(job) + self.assertEqual('project-merge', status_jobs[0]['name']) + self.assertEqual('https://server/job/project-merge/0/', + status_jobs[0]['url']) + self.assertEqual('http://logs.example.com/1/1/gate/project-merge/0', + status_jobs[0]['report_url']) + + self.assertEqual('project-test1', status_jobs[1]['name']) + self.assertEqual('https://server/job/project-test1/1/', + status_jobs[1]['url']) + self.assertEqual('http://logs.example.com/1/1/gate/project-test1/1', + status_jobs[1]['report_url']) + + self.assertEqual('project-test2', status_jobs[2]['name']) + self.assertEqual('https://server/job/project-test2/2/', + 
status_jobs[2]['url']) + self.assertEqual('http://logs.example.com/1/1/gate/project-test2/2', + status_jobs[2]['report_url']) def test_merging_queues(self): "Test that transitively-connected change queues are merged" diff --git a/zuul/model.py b/zuul/model.py index d2cf13bc7f..5bea5d03bb 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -266,7 +266,7 @@ class Pipeline(object): items.extend(shared_queue.queue) return items - def formatStatusJSON(self): + def formatStatusJSON(self, url_pattern=None): j_pipeline = dict(name=self.name, description=self.description) j_queues = [] @@ -283,7 +283,7 @@ class Pipeline(object): if j_changes: j_queue['heads'].append(j_changes) j_changes = [] - j_changes.append(e.formatJSON()) + j_changes.append(e.formatJSON(url_pattern)) if (len(j_changes) > 1 and (j_changes[-2]['remaining_time'] is not None) and (j_changes[-1]['remaining_time'] is not None)): @@ -724,7 +724,34 @@ class QueueItem(object): def setReportedResult(self, result): self.current_build_set.result = result - def formatJSON(self): + def formatJobResult(self, job, url_pattern=None): + build = self.current_build_set.getBuild(job.name) + result = build.result + pattern = url_pattern + if result == 'SUCCESS': + if job.success_message: + result = job.success_message + if job.success_pattern: + pattern = job.success_pattern + elif result == 'FAILURE': + if job.failure_message: + result = job.failure_message + if job.failure_pattern: + pattern = job.failure_pattern + url = None + if pattern: + try: + url = pattern.format(change=self.change, + pipeline=self.pipeline, + job=job, + build=build) + except Exception: + pass # FIXME: log this or something? 
+ if not url: + url = build.url or job.name + return (result, url) + + def formatJSON(self, url_pattern=None): changeish = self.change ret = {} ret['active'] = self.active @@ -761,11 +788,13 @@ class QueueItem(object): elapsed = None remaining = None result = None - url = None + build_url = None + report_url = None worker = None if build: result = build.result - url = build.url + build_url = build.url + (unused, report_url) = self.formatJobResult(job, url_pattern) if build.start_time: if build.end_time: elapsed = int((build.end_time - @@ -793,7 +822,8 @@ class QueueItem(object): 'name': job.name, 'elapsed_time': elapsed, 'remaining_time': remaining, - 'url': url, + 'url': build_url, + 'report_url': report_url, 'result': result, 'voting': job.voting, 'uuid': build.uuid if build else None, diff --git a/zuul/reporter/__init__.py b/zuul/reporter/__init__.py index fd7917400e..0569fbe748 100644 --- a/zuul/reporter/__init__.py +++ b/zuul/reporter/__init__.py @@ -113,25 +113,7 @@ class BaseReporter(object): for job in pipeline.getJobs(item): build = item.current_build_set.getBuild(job.name) - result = build.result - pattern = url_pattern - if result == 'SUCCESS': - if job.success_message: - result = job.success_message - if job.success_pattern: - pattern = job.success_pattern - elif result == 'FAILURE': - if job.failure_message: - result = job.failure_message - if job.failure_pattern: - pattern = job.failure_pattern - if pattern: - url = pattern.format(change=item.change, - pipeline=pipeline, - job=job, - build=build) - else: - url = build.url or job.name + (result, url) = item.formatJobResult(job, url_pattern) if not job.voting: voting = ' (non-voting)' else: diff --git a/zuul/scheduler.py b/zuul/scheduler.py index 48bb5e318c..aea9a67e96 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -1097,6 +1097,11 @@ class Scheduler(threading.Thread): pipeline.manager.onMergeCompleted(event) def formatStatusJSON(self): + if self.config.has_option('zuul', 'url_pattern'): + 
url_pattern = self.config.get('zuul', 'url_pattern') + else: + url_pattern = None + data = {} data['zuul_version'] = self.zuul_version @@ -1122,7 +1127,7 @@ class Scheduler(threading.Thread): pipelines = [] data['pipelines'] = pipelines for pipeline in self.layout.pipelines.values(): - pipelines.append(pipeline.formatStatusJSON()) + pipelines.append(pipeline.formatStatusJSON(url_pattern)) return json.dumps(data) From 9f36519297a69d144b19a67f6b6dfea0aad38faa Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 18 Apr 2016 10:34:48 -0700 Subject: [PATCH 017/152] Add ansible launch server As part of the Zuulv2.5 effort, this service is not designed to be long-lived. Change-Id: Idbac79f60cb694a9a707d204cd274328289bc20d --- setup.cfg | 1 + zuul/cmd/launcher.py | 98 +++++++++++++++ zuul/launcher/ansiblelaunchserver.py | 178 +++++++++++++++++++++++++++ 3 files changed, 277 insertions(+) create mode 100644 zuul/cmd/launcher.py create mode 100644 zuul/launcher/ansiblelaunchserver.py diff --git a/setup.cfg b/setup.cfg index 620e1ac5c8..7ddeb84be2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,6 +25,7 @@ console_scripts = zuul-merger = zuul.cmd.merger:main zuul = zuul.cmd.client:main zuul-cloner = zuul.cmd.cloner:main + zuul-launcher = zuul.cmd.launcher:main [build_sphinx] source-dir = doc/source diff --git a/zuul/cmd/launcher.py b/zuul/cmd/launcher.py new file mode 100644 index 0000000000..7f9231bba3 --- /dev/null +++ b/zuul/cmd/launcher.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# Copyright 2012 Hewlett-Packard Development Company, L.P. +# Copyright 2013-2014 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import argparse +import daemon +import extras + +# as of python-daemon 1.6 it doesn't bundle pidlockfile anymore +# instead it depends on lockfile-0.9.1 which uses pidfile. +pid_file_module = extras.try_imports(['daemon.pidlockfile', 'daemon.pidfile']) + +import os +import sys +import signal + +import zuul.cmd + +# No zuul imports here because they pull in paramiko which must not be +# imported until after the daemonization. +# https://github.com/paramiko/paramiko/issues/59 +# Similar situation with gear and statsd. + + +class Launcher(zuul.cmd.ZuulApp): + + def parse_arguments(self): + parser = argparse.ArgumentParser(description='Zuul launch worker.') + parser.add_argument('-c', dest='config', + help='specify the config file') + parser.add_argument('-d', dest='nodaemon', action='store_true', + help='do not run as a daemon') + parser.add_argument('--version', dest='version', action='version', + version=self._get_version(), + help='show zuul version') + self.args = parser.parse_args() + + def exit_handler(self, signum, frame): + signal.signal(signal.SIGUSR1, signal.SIG_IGN) + self.launcher.stop() + self.launcher.join() + + def main(self): + # See comment at top of file about zuul imports + import zuul.launcher.ansiblelaunchserver + + self.setup_logging('launcher', 'log_config') + + LaunchServer = zuul.launcher.ansiblelaunchserver.LaunchServer + self.launcher = LaunchServer(self.config) + self.launcher.start() + + signal.signal(signal.SIGUSR1, self.exit_handler) + signal.signal(signal.SIGUSR2, zuul.cmd.stack_dump_handler) + while True: + try: + 
signal.pause() + except KeyboardInterrupt: + print "Ctrl + C: asking launcher to exit nicely...\n" + self.exit_handler(signal.SIGINT, None) + sys.exit(0) + + +def main(): + server = Launcher() + server.parse_arguments() + + server.read_config() + server.configure_connections() + + if server.config.has_option('launcher', 'pidfile'): + pid_fn = os.path.expanduser(server.config.get('launcher', 'pidfile')) + else: + pid_fn = '/var/run/zuul-launcher/zuul-launcher.pid' + pid = pid_file_module.TimeoutPIDLockFile(pid_fn, 10) + + if server.args.nodaemon: + server.main() + else: + with daemon.DaemonContext(pidfile=pid): + server.main() + + +if __name__ == "__main__": + sys.path.insert(0, '.') + main() diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py new file mode 100644 index 0000000000..2dd010c0c3 --- /dev/null +++ b/zuul/launcher/ansiblelaunchserver.py @@ -0,0 +1,178 @@ +# Copyright 2014 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import json +import logging +import os +import shutil +import socket +import subprocess +import tempfile +import threading +import traceback + +import gear +import yaml + + +class JobDir(object): + def __init__(self): + self.root = tempfile.mkdtemp() + self.git_root = os.path.join(self.root, 'git') + os.makedirs(self.git_root) + self.ansible_root = os.path.join(self.root, 'ansible') + os.makedirs(self.ansible_root) + self.inventory = os.path.join(self.ansible_root, 'inventory') + self.playbook = os.path.join(self.ansible_root, 'playbook') + self.config = os.path.join(self.ansible_root, 'ansible.cfg') + + def __enter__(self): + return self + + def __exit__(self, etype, value, tb): + shutil.rmtree(self.root) + + +class LaunchServer(object): + log = logging.getLogger("zuul.LaunchServer") + + def __init__(self, config): + self.config = config + self.hostname = socket.gethostname() + + def start(self): + self._running = True + server = self.config.get('gearman', 'server') + if self.config.has_option('gearman', 'port'): + port = self.config.get('gearman', 'port') + else: + port = 4730 + self.worker = gear.Worker('Zuul Launch Server') + self.worker.addServer(server, port) + self.log.debug("Waiting for server") + self.worker.waitForServer() + self.log.debug("Registering") + self.register() + self.log.debug("Starting worker") + self.thread = threading.Thread(target=self.run) + self.thread.daemon = True + self.thread.start() + + def register(self): + self.worker.registerFunction("node-assign:zuul") + + def stop(self): + self.log.debug("Stopping") + self._running = False + self.worker.shutdown() + self.log.debug("Stopped") + + def join(self): + self.thread.join() + + def run(self): + self.log.debug("Starting launch listener") + while self._running: + try: + job = self.worker.getJob() + try: + if job.name.startswith('node-assign:'): + self.log.debug("Got assign-node job: %s" % job.unique) + self.assignNode(job) + else: + self.log.error("Unable to handle job %s" % job.name) 
+ job.sendWorkFail() + except Exception: + self.log.exception("Exception while running job") + job.sendWorkException(traceback.format_exc()) + except Exception: + self.log.exception("Exception while getting job") + + def assignNode(self, job): + data = dict(manager=self.hostname) + job.sendWorkData(json.dumps(data)) + job.sendWorkComplete() + + def launch(self, job): + thread = threading.Thread(target=self._launch, args=(job,)) + thread.start() + + def _launch(self, job): + self.log.debug("Job %s: beginning" % (job.unique,)) + with JobDir() as jobdir: + self.log.debug("Job %s: job root at %s" % + (job.unique, jobdir.root)) + args = json.loads(job.arguments) + tasks = [] + for project in args['projects']: + self.log.debug("Job %s: updating project %s" % + (job.unique, project['name'])) + tasks.append(self.update(project['name'], project['url'])) + for task in tasks: + task.wait() + self.log.debug("Job %s: git updates complete" % (job.unique,)) + merger = self._getMerger(jobdir.git_root) + commit = merger.mergeChanges(args['items']) # noqa + + # TODOv3: Ansible the ansible thing here. + self.prepareAnsibleFiles(jobdir, args) + result = self.runAnsible(jobdir) + + data = { + 'url': 'https://server/job', + 'number': 1 + } + job.sendWorkData(json.dumps(data)) + job.sendWorkStatus(0, 100) + + result = dict(result=result) + job.sendWorkComplete(json.dumps(result)) + + def getHostList(self, args): + # TODOv3: This should get the appropriate nodes from nodepool, + # or in the unit tests, be overriden to return localhost. 
+ return [('localhost', dict(ansible_connection='local'))] + + def prepareAnsibleFiles(self, jobdir, args): + with open(jobdir.inventory, 'w') as inventory: + for host_name, host_vars in self.getHostList(args): + inventory.write(host_name) + inventory.write(' ') + for k, v in host_vars.items(): + inventory.write('%s=%s' % (k, v)) + inventory.write('\n') + with open(jobdir.playbook, 'w') as playbook: + play = dict(hosts='localhost', + tasks=[dict(name='test', + shell='echo Hello world')]) + playbook.write(yaml.dump([play])) + with open(jobdir.config, 'w') as config: + config.write('[defaults]\n') + config.write('hostfile = %s\n' % jobdir.inventory) + + def runAnsible(self, jobdir): + proc = subprocess.Popen( + ['ansible-playbook', jobdir.playbook], + cwd=jobdir.ansible_root, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + (out, err) = proc.communicate() + ret = proc.wait() + print out + print err + if ret == 0: + return 'SUCCESS' + else: + return 'FAILURE' From d6dbd68ba39be37ebad4837a03caa30c7ba40195 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Tue, 22 Dec 2015 10:06:54 +1100 Subject: [PATCH 018/152] Add extra test for bad url patterns Report can have parameters from a job substituted into the message. If a reporter fails for whatever reason it is logged to zuul and otherwise ignored quietly to the user. If bad variables are attempted to be substituted into a message log a warning and fall back to the result url for the reporter to continue. This was fixed in b7273ef849e7070e21c1d51a9b4190237f385027 so this change is to just put in an extra test for it. 
Change-Id: I521cfbb5873973014c43f9780722d2f80a7c12f0 --- tests/fixtures/layout-success-pattern.yaml | 21 +++++++++++++ tests/test_scheduler.py | 35 ++++++++++++++++++++++ zuul/reporter/__init__.py | 3 ++ 3 files changed, 59 insertions(+) create mode 100644 tests/fixtures/layout-success-pattern.yaml diff --git a/tests/fixtures/layout-success-pattern.yaml b/tests/fixtures/layout-success-pattern.yaml new file mode 100644 index 0000000000..cea15f123a --- /dev/null +++ b/tests/fixtures/layout-success-pattern.yaml @@ -0,0 +1,21 @@ +pipelines: + - name: check + manager: IndependentPipelineManager + trigger: + gerrit: + - event: patchset-created + success: + smtp: + to: me@example.org + +jobs: + - name: docs-draft-test + success-pattern: http://docs-draft.example.org/{build.parameters[LOG_PATH]}/publish-docs/ + - name: docs-draft-test2 + success-pattern: http://docs-draft.example.org/{NOPE}/{build.parameters[BAD]}/publish-docs/ + +projects: + - name: org/docs + check: + - docs-draft-test: + - docs-draft-test2 diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index fe7c7cc4fa..8ac53683b9 100755 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -4392,3 +4392,38 @@ For CI problems and help debugging, contact ci@example.org""" self.assertIn('Build failed.', K.messages[0]) # No more messages reported via smtp self.assertEqual(3, len(self.smtp_messages)) + + def test_success_pattern(self): + "Ensure bad build params are ignored" + + # Use SMTP reporter to grab the result message easier + self.init_repo("org/docs") + self.config.set('zuul', 'layout_config', + 'tests/fixtures/layout-success-pattern.yaml') + self.sched.reconfigure(self.config) + self.worker.hold_jobs_in_build = True + self.registerJobs() + + A = self.fake_gerrit.addFakeChange('org/docs', 'master', 'A') + self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) + self.waitUntilSettled() + + # Grab build id + self.assertEqual(len(self.builds), 1) + uuid = self.builds[0].unique[:7] + + 
self.worker.hold_jobs_in_build = False + self.worker.release() + self.waitUntilSettled() + + self.assertEqual(len(self.smtp_messages), 1) + body = self.smtp_messages[0]['body'].splitlines() + self.assertEqual('Build succeeded.', body[0]) + + self.assertIn( + '- docs-draft-test http://docs-draft.example.org/1/1/1/check/' + 'docs-draft-test/%s/publish-docs/' % uuid, + body[2]) + self.assertIn( + '- docs-draft-test2 https://server/job/docs-draft-test2/1/', + body[3]) diff --git a/zuul/reporter/__init__.py b/zuul/reporter/__init__.py index 0569fbe748..0c9a8d8b6b 100644 --- a/zuul/reporter/__init__.py +++ b/zuul/reporter/__init__.py @@ -13,6 +13,7 @@ # under the License. import abc +import logging import six @@ -24,6 +25,8 @@ class BaseReporter(object): Defines the exact public methods that must be supplied. """ + log = logging.getLogger("zuul.reporter.BaseReporter") + def __init__(self, reporter_config={}, sched=None, connection=None): self.reporter_config = reporter_config self.sched = sched From fc1b58ae05a81400e375c54d20e6edbd67c12db5 Mon Sep 17 00:00:00 2001 From: Tristan Cacqueray Date: Sun, 31 Jan 2016 11:15:36 -0500 Subject: [PATCH 019/152] GerritWatcher: add poll_timeout So that zuul reload process can happen when gerrit stream is silent. Otherwise the main loop doesn't exit out of the poll call and doesn't evaluate the "while not self._stopped" location. 
Change-Id: If3129a65da6119acf69ad00e2e78c7ec82a49941 Closes-Bug: https://storyboard.openstack.org/#!/story/2000472 --- zuul/connection/gerrit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zuul/connection/gerrit.py b/zuul/connection/gerrit.py index 4671ff9abb..a1854f4ba4 100644 --- a/zuul/connection/gerrit.py +++ b/zuul/connection/gerrit.py @@ -132,6 +132,7 @@ class GerritEventConnector(threading.Thread): class GerritWatcher(threading.Thread): log = logging.getLogger("gerrit.GerritWatcher") + poll_timeout = 500 def __init__(self, gerrit_connection, username, hostname, port=29418, keyfile=None): @@ -154,7 +155,7 @@ class GerritWatcher(threading.Thread): poll = select.poll() poll.register(stdout.channel) while not self._stopped: - ret = poll.poll() + ret = poll.poll(self.poll_timeout) for (fd, event) in ret: if fd == stdout.channel.fileno(): if event == select.POLLIN: From 30a433b202a65dd7ddcc6eebf4d34041f6743740 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sun, 1 May 2016 10:35:29 -0500 Subject: [PATCH 020/152] Pin paramiko < 2.0.0 We need to pin paramiko until we fix an exception in zuul/connection/gerrit.py client.load_system_host_keys() Traceback (most recent call last): File "/opt/ansible-role-zuul/git/openstack-infra/zuul/zuul/connection/gerrit.py", line 168, in _run client.load_system_host_keys() File "/usr/lib/python2.7/site-packages/paramiko/client.py", line 101, in load_system_host_keys self._system_host_keys.load(filename) File "/usr/lib/python2.7/site-packages/paramiko/hostkeys.py", line 101, in load e = HostKeyEntry.from_line(line, lineno) File "/usr/lib/python2.7/site-packages/paramiko/hostkeys.py", line 331, in from_line key = RSAKey(data=decodebytes(key)) File "/usr/lib/python2.7/site-packages/paramiko/rsakey.py", line 58, in __init__ ).public_key(default_backend()) File "/usr/lib64/python2.7/site-packages/cryptography/hazmat/backends/__init__.py", line 35, in default_backend _default_backend = 
MultiBackend(_available_backends()) File "/usr/lib64/python2.7/site-packages/cryptography/hazmat/backends/__init__.py", line 22, in _available_backends "cryptography.backends" AttributeError: 'EntryPoint' object has no attribute 'resolve' Change-Id: Ifc12a1ec9f26c0b236ed00e128b707c6fba58b1e Signed-off-by: Paul Belanger --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8388f0bd2b..77ac0a58a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ pbr>=1.1.0 PyYAML>=3.1.0 Paste WebOb>=1.2.3 -paramiko>=1.8.0 +paramiko>=1.8.0,<2.0.0 GitPython>=0.3.3 ordereddict python-daemon>=2.0.4,<2.1.0 From 9b41a0b502c4b8ffc6c876bfcbed1de9a3c5c15c Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 12 May 2016 11:30:05 -0700 Subject: [PATCH 021/152] Properly shutdown apscheduler on reconfigure This addresses a recent increase in test_idle failures. The existing apscheduler was not being shut down during reconfigurations which caused us to end up with two apschedulers running during the test. This could cause a trigger event to fire while the test was expecting the system to be idle. The triggers were not previously shut down during reconfigurations. Instead, the timer trigger relied on having its postConfig method called on each reconfiguration, where it would begin by cleaning up any existing jobs. However, since the connections changes, triggers are stopped, discarded, and recreated during reconfiguration. Because the stop method of the timer trigger was not actually being called, a new trigger was created each time we reconfigured, and old ones were never cleaned up. This likely had a production impact as well, however, it was not likely to be visible unless a configuration change altered the scheduled times for periodic queues (in that case, we would see jobs run at both the old and new times). 
Change-Id: Ia7c61984a9c47a9b1554a4ccb99309674dffec11 --- zuul/trigger/timer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zuul/trigger/timer.py b/zuul/trigger/timer.py index d42e3db296..f81312e595 100644 --- a/zuul/trigger/timer.py +++ b/zuul/trigger/timer.py @@ -40,8 +40,8 @@ class TimerTrigger(BaseTrigger): self.log.debug("Adding event %s" % event) self.sched.addEvent(event) - def _shutdown(self): - self.apsched.stop() + def stop(self): + self.apsched.shutdown() def getEventFilters(self, trigger_conf): def toList(item): From 119acf362f1c82fa90cfbd10ffac2543c1a631c3 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 18 Apr 2016 15:34:36 -0700 Subject: [PATCH 022/152] Ansible launcher: register jobs from JJB Use jenkins-job-builder to parse the yaml files, then when nodes are added, register jobs with gearman accordingly. Change-Id: I4947671a8e2c0ddbce50a47b8666b4c7ef72a974 --- zuul/cmd/launcher.py | 15 +++ zuul/launcher/ansiblelaunchserver.py | 150 +++++++++++++++++++++++++-- 2 files changed, 154 insertions(+), 11 deletions(-) diff --git a/zuul/cmd/launcher.py b/zuul/cmd/launcher.py index 7f9231bba3..86266b3977 100644 --- a/zuul/cmd/launcher.py +++ b/zuul/cmd/launcher.py @@ -22,6 +22,7 @@ import extras # instead it depends on lockfile-0.9.1 which uses pidfile. 
pid_file_module = extras.try_imports(['daemon.pidlockfile', 'daemon.pidfile']) +import logging import os import sys import signal @@ -47,6 +48,17 @@ class Launcher(zuul.cmd.ZuulApp): help='show zuul version') self.args = parser.parse_args() + def reconfigure_handler(self, signum, frame): + signal.signal(signal.SIGHUP, signal.SIG_IGN) + self.log.debug("Reconfiguration triggered") + self.read_config() + self.setup_logging('launcher', 'log_config') + try: + self.launcher.reconfigure(self.config) + except Exception: + self.log.exception("Reconfiguration failed:") + signal.signal(signal.SIGHUP, self.reconfigure_handler) + def exit_handler(self, signum, frame): signal.signal(signal.SIGUSR1, signal.SIG_IGN) self.launcher.stop() @@ -58,10 +70,13 @@ class Launcher(zuul.cmd.ZuulApp): self.setup_logging('launcher', 'log_config') + self.log = logging.getLogger("zuul.Launcher") + LaunchServer = zuul.launcher.ansiblelaunchserver.LaunchServer self.launcher = LaunchServer(self.config) self.launcher.start() + signal.signal(signal.SIGHUP, self.reconfigure_handler) signal.signal(signal.SIGUSR1, self.exit_handler) signal.signal(signal.SIGUSR2, zuul.cmd.stack_dump_handler) while True: diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 2dd010c0c3..19e8d3ff09 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -14,6 +14,7 @@ import json import logging +import multiprocessing import os import shutil import socket @@ -24,6 +25,7 @@ import traceback import gear import yaml +import jenkins_jobs.builder class JobDir(object): @@ -50,6 +52,9 @@ class LaunchServer(object): def __init__(self, config): self.config = config self.hostname = socket.gethostname() + self.node_workers = {} + self.mpmanager = multiprocessing.Manager() + self.jobs = self.mpmanager.dict() def start(self): self._running = True @@ -64,18 +69,41 @@ class LaunchServer(object): self.worker.waitForServer() self.log.debug("Registering") 
self.register() + self.loadJobs() self.log.debug("Starting worker") self.thread = threading.Thread(target=self.run) self.thread.daemon = True self.thread.start() + def loadJobs(self): + self.log.debug("Loading jobs") + builder = JJB() + path = self.config.get('launcher', 'jenkins_jobs') + builder.load_files([path]) + builder.parser.expandYaml() + unseen = set(self.jobs.keys()) + for job in builder.parser.jobs: + self.jobs[job['name']] = job + unseen.discard(job['name']) + for name in unseen: + del self.jobs[name] + def register(self): self.worker.registerFunction("node-assign:zuul") + def reconfigure(self, config): + self.log.debug("Reconfiguring") + self.config = config + self.loadJobs() + for node in self.node_workers.values(): + node.queue.put(dict(action='reconfigure')) + def stop(self): self.log.debug("Stopping") self._running = False self.worker.shutdown() + for node in self.node_workers.values(): + node.queue.put(dict(action='stop')) self.log.debug("Stopped") def join(self): @@ -100,32 +128,126 @@ class LaunchServer(object): self.log.exception("Exception while getting job") def assignNode(self, job): + args = json.loads(job.arguments) + worker = NodeWorker(self.config, self.jobs, + args['name'], args['host'], + args['description'], args['labels']) + self.node_workers[worker.name] = worker + + worker.process = multiprocessing.Process(target=worker.run) + worker.process.start() + data = dict(manager=self.hostname) job.sendWorkData(json.dumps(data)) job.sendWorkComplete() + +class NodeWorker(object): + log = logging.getLogger("zuul.NodeWorker") + + def __init__(self, config, jobs, name, host, description, labels): + self.config = config + self.jobs = jobs + self.name = name + self.host = host + self.description = description + if not isinstance(labels, list): + labels = [labels] + self.labels = labels + self.registered_functions = set() + self._running = True + self.queue = multiprocessing.Queue() + + def run(self): + self._running_job = False + server = 
self.config.get('gearman', 'server') + if self.config.has_option('gearman', 'port'): + port = self.config.get('gearman', 'port') + else: + port = 4730 + self.worker = gear.Worker(self.name) + self.worker.addServer(server, port) + self.log.debug("Waiting for server") + self.worker.waitForServer() + self.register() + + self.gearman_thread = threading.Thread(target=self.run_gearman) + self.gearman_thread.daemon = True + self.gearman_thread.start() + + while self._running: + try: + self._run_queue() + except Exception: + self.log.exception("Exception in queue manager:") + + def _run_queue(self): + item = self.queue.get() + if item['action'] == 'stop': + self._running = False + self.worker.shutdown() + elif item['action'] == 'reconfigure': + self.register() + + def run_gearman(self): + while self._running: + try: + self._run_gearman() + except Exception: + self.log.exception("Exception in gearman manager:") + + def _run_gearman(self): + job = self.worker.getJob() + try: + if job.name not in self.registered_functions: + self.log.error("Unable to handle job %s" % job.name) + job.sendWorkFail() + return + self.launch(job) + except Exception: + self.log.exception("Exception while running job") + job.sendWorkException(traceback.format_exc()) + + def generateFunctionNames(self, job): + # This only supports "node: foo" and "node: foo || bar" + ret = set() + job_labels = job.get('node') + matching_labels = set() + if job_labels: + job_labels = [x.strip() for x in job_labels.split('||')] + matching_labels = set(self.labels) & set(job_labels) + if not matching_labels: + return ret + ret.add('build:%s' % (job['name'],)) + for label in matching_labels: + ret.add('build:%s:%s' % (job['name'], label)) + return ret + + def register(self): + if self._running_job: + return + new_functions = set() + for job in self.jobs.values(): + new_functions |= self.generateFunctionNames(job) + for function in new_functions - self.registered_functions: + self.worker.registerFunction(function) + for 
function in self.registered_functions - new_functions: + self.worker.unRegisterFunction(function) + self.registered_functions = new_functions + def launch(self, job): + self._running_job = True thread = threading.Thread(target=self._launch, args=(job,)) thread.start() def _launch(self, job): self.log.debug("Job %s: beginning" % (job.unique,)) + return # TODO with JobDir() as jobdir: self.log.debug("Job %s: job root at %s" % (job.unique, jobdir.root)) args = json.loads(job.arguments) - tasks = [] - for project in args['projects']: - self.log.debug("Job %s: updating project %s" % - (job.unique, project['name'])) - tasks.append(self.update(project['name'], project['url'])) - for task in tasks: - task.wait() - self.log.debug("Job %s: git updates complete" % (job.unique,)) - merger = self._getMerger(jobdir.git_root) - commit = merger.mergeChanges(args['items']) # noqa - # TODOv3: Ansible the ansible thing here. self.prepareAnsibleFiles(jobdir, args) result = self.runAnsible(jobdir) @@ -176,3 +298,9 @@ class LaunchServer(object): return 'SUCCESS' else: return 'FAILURE' + + +class JJB(jenkins_jobs.builder.Builder): + def __init__(self): + self.global_config = None + self._plugins_list = [] From 083f545bb8fb6fbee73ce702ab74396c6558d104 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 19 Apr 2016 15:14:29 -0700 Subject: [PATCH 023/152] Ansible launcher: send ZMQ start/complete events We have a lot of infrastructure expecting ZMQ events related to job start/stops. Send them. Note, when the ansible launcher shuts down, it will send fake completion events for each node it has so that nodepool will delete those nodes. 
Change-Id: Ib38c5c77452743a49f6d1cda69c914b5d491134b --- zuul/launcher/ansiblelaunchserver.py | 182 ++++++++++++++++++++++++--- 1 file changed, 163 insertions(+), 19 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 19e8d3ff09..36986ec019 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -17,6 +17,7 @@ import logging import multiprocessing import os import shutil +import signal import socket import subprocess import tempfile @@ -26,6 +27,7 @@ import traceback import gear import yaml import jenkins_jobs.builder +import zmq class JobDir(object): @@ -55,9 +57,17 @@ class LaunchServer(object): self.node_workers = {} self.mpmanager = multiprocessing.Manager() self.jobs = self.mpmanager.dict() + self.zmq_send_queue = multiprocessing.Queue() def start(self): self._running = True + + # Setup ZMQ + self.zcontext = zmq.Context() + self.zsocket = self.zcontext.socket(zmq.PUB) + self.zsocket.bind("tcp://*:8881") + + # Setup Gearman server = self.config.get('gearman', 'server') if self.config.has_option('gearman', 'port'): port = self.config.get('gearman', 'port') @@ -69,11 +79,21 @@ class LaunchServer(object): self.worker.waitForServer() self.log.debug("Registering") self.register() + + # Load JJB config self.loadJobs() + + # Start ZMQ worker thread + self.log.debug("Starting ZMQ processor") + self.zmq_thread = threading.Thread(target=self.run_zmq) + self.zmq_thread.daemon = True + self.zmq_thread.start() + + # Start Gearman worker thread self.log.debug("Starting worker") - self.thread = threading.Thread(target=self.run) - self.thread.daemon = True - self.thread.start() + self.gearman_thread = threading.Thread(target=self.run) + self.gearman_thread.daemon = True + self.gearman_thread.start() def loadJobs(self): self.log.debug("Loading jobs") @@ -103,14 +123,24 @@ class LaunchServer(object): self._running = False self.worker.shutdown() for node in self.node_workers.values(): 
- node.queue.put(dict(action='stop')) + node.stop() self.log.debug("Stopped") def join(self): - self.thread.join() + self.gearman_thread.join() + + def run_zmq(self): + while self._running: + try: + item = self.zmq_send_queue.get() + self.log.debug("Got ZMQ event %s" % (item,)) + if item is None: + continue + self.zsocket.send(item) + except Exception: + self.log.exception("Exception while processing ZMQ events") def run(self): - self.log.debug("Starting launch listener") while self._running: try: job = self.worker.getJob() @@ -124,6 +154,8 @@ class LaunchServer(object): except Exception: self.log.exception("Exception while running job") job.sendWorkException(traceback.format_exc()) + except gear.InterruptedError: + return except Exception: self.log.exception("Exception while getting job") @@ -131,7 +163,8 @@ class LaunchServer(object): args = json.loads(job.arguments) worker = NodeWorker(self.config, self.jobs, args['name'], args['host'], - args['description'], args['labels']) + args['description'], args['labels'], + self.hostname, self.zmq_send_queue) self.node_workers[worker.name] = worker worker.process = multiprocessing.Process(target=worker.run) @@ -145,7 +178,9 @@ class LaunchServer(object): class NodeWorker(object): log = logging.getLogger("zuul.NodeWorker") - def __init__(self, config, jobs, name, host, description, labels): + def __init__(self, config, jobs, name, host, description, labels, + manager_name, zmq_send_queue): + self.log.debug("Creating node worker %s" % (name,)) self.config = config self.jobs = jobs self.name = name @@ -157,9 +192,14 @@ class NodeWorker(object): self.registered_functions = set() self._running = True self.queue = multiprocessing.Queue() + self.manager_name = manager_name + self.zmq_send_queue = zmq_send_queue + self.running_job_lock = threading.Lock() + self._running_job = False def run(self): - self._running_job = False + signal.signal(signal.SIGINT, signal.SIG_IGN) + self.log.debug("Node worker %s starting" % (self.name,)) 
server = self.config.get('gearman', 'server') if self.config.has_option('gearman', 'port'): port = self.config.get('gearman', 'port') @@ -181,11 +221,22 @@ class NodeWorker(object): except Exception: self.log.exception("Exception in queue manager:") + def stop(self): + # If this is called locally, setting _running will be + # effictive, if it's called remotely, it will not be, but it + # will be set by the queue thread. + self.log.debug("Submitting stop request") + self._running = False + self.queue.put(dict(action='stop')) + def _run_queue(self): item = self.queue.get() if item['action'] == 'stop': + self.log.debug("Received stop request") self._running = False self.worker.shutdown() + if not self.abortRunningJob(): + self.sendFakeCompleteEvent() elif item['action'] == 'reconfigure': self.register() @@ -197,7 +248,11 @@ class NodeWorker(object): self.log.exception("Exception in gearman manager:") def _run_gearman(self): - job = self.worker.getJob() + try: + job = self.worker.getJob() + except gear.InterruptedError: + return + self.log.debug("Node worker %s got job %s" % (self.name, job.name)) try: if job.name not in self.registered_functions: self.log.error("Unable to handle job %s" % job.name) @@ -235,14 +290,102 @@ class NodeWorker(object): self.worker.unRegisterFunction(function) self.registered_functions = new_functions - def launch(self, job): - self._running_job = True - thread = threading.Thread(target=self._launch, args=(job,)) - thread.start() + def abortRunningJob(self): + aborted = False + self.log.debug("Abort: acquiring job lock") + with self.running_job_lock: + if self._running_job: + self.log.debug("Abort: a job is running") + proc = self.ansible_proc + if proc: + self.log.debug("Abort: sending kill signal to job process") + try: + proc.kill() + aborted = True + except Exception: + self.log.exception("Exception while killing " + "ansible process:") + else: + self.log.debug("Abort: no job is running") + + return aborted + + def launch(self, job): + 
self.log.debug("Node worker %s launching job %s" % + (self.name, job.name)) + + # Make sure we can parse what we need from the job first + args = json.loads(job.arguments) + # This may be configurable later, or we may choose to honor + # OFFLINE_NODE_WHEN_COMPLETE + offline = True + job_name = job.name.split(':')[1] + + # Initialize the result so we have something regardless of + # whether the job actually runs + result = None + + try: + self.sendStartEvent(job_name, args) + except Exception: + self.log.exception("Exception while sending job start event") + + try: + result = self.runJob() + except Exception: + self.log.exception("Exception while launching job thread") + + try: + job.sendWorkComplete() + except Exception: + self.log.exception("Exception while sending job completion packet") + + try: + self.sendCompleteEvent(job_name, result, args) + except Exception: + self.log.exception("Exception while sending job completion event") + + if offline: + self.stop() + + def sendStartEvent(self, name, parameters): + build = dict(node_name=self.name, + host_name=self.manager_name, + parameters=parameters) + + event = dict(name=name, + build=build) + + item = "onStarted %s" % json.dumps(event) + self.log.debug("Sending over ZMQ: %s" % (item,)) + self.zmq_send_queue.put(item) + + def sendCompleteEvent(self, name, status, parameters): + build = dict(status=status, + node_name=self.name, + host_name=self.manager_name, + parameters=parameters) + + event = dict(name=name, + build=build) + + item = "onFinalized %s" % json.dumps(event) + self.log.debug("Sending over ZMQ: %s" % (item,)) + self.zmq_send_queue.put(item) + + def sendFakeCompleteEvent(self): + self.sendCompleteEvent('zuul:launcher-shutdown', + 'SUCCESS', {}) + + def runJob(self, job): + self.ansible_proc = None + with self.running_job_lock: + if not self._running: + return + self._running_job = True - def _launch(self, job): self.log.debug("Job %s: beginning" % (job.unique,)) - return # TODO + return 'SUCCESS' # 
TODO with JobDir() as jobdir: self.log.debug("Job %s: job root at %s" % (job.unique, jobdir.root)) @@ -284,14 +427,15 @@ class NodeWorker(object): config.write('hostfile = %s\n' % jobdir.inventory) def runAnsible(self, jobdir): - proc = subprocess.Popen( + self.ansible_proc = subprocess.Popen( ['ansible-playbook', jobdir.playbook], cwd=jobdir.ansible_root, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) - (out, err) = proc.communicate() - ret = proc.wait() + (out, err) = self.ansible_proc.communicate() + ret = self.ansible_proc.wait() + self.ansible_proc = None print out print err if ret == 0: From 323b64b1d0940368c4f1eb756ebeb5cffe7937f7 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 19 Apr 2016 17:00:04 -0700 Subject: [PATCH 024/152] Ansible launcher: run ansible Change-Id: I2352e07c168de71c616ab37ce2be62dba4b258fb --- zuul/launcher/ansiblelaunchserver.py | 32 ++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 36986ec019..690659e8b7 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -161,6 +161,7 @@ class LaunchServer(object): def assignNode(self, job): args = json.loads(job.arguments) + self.log.debug("Assigned node with arguments: %s" % (args,)) worker = NodeWorker(self.config, self.jobs, args['name'], args['host'], args['description'], args['labels'], @@ -196,6 +197,7 @@ class NodeWorker(object): self.zmq_send_queue = zmq_send_queue self.running_job_lock = threading.Lock() self._running_job = False + self._sent_complete_event = False def run(self): signal.signal(signal.SIGINT, signal.SIG_IGN) @@ -324,6 +326,7 @@ class NodeWorker(object): # Initialize the result so we have something regardless of # whether the job actually runs result = None + self._sent_complete_event = False try: self.sendStartEvent(job_name, args) @@ -331,12 +334,16 @@ class NodeWorker(object): 
self.log.exception("Exception while sending job start event") try: - result = self.runJob() + result = self.runJob(job) except Exception: self.log.exception("Exception while launching job thread") + self._running_job = False + if not result: + result = b'' + try: - job.sendWorkComplete() + job.sendWorkComplete(result) except Exception: self.log.exception("Exception while sending job completion packet") @@ -372,27 +379,30 @@ class NodeWorker(object): item = "onFinalized %s" % json.dumps(event) self.log.debug("Sending over ZMQ: %s" % (item,)) self.zmq_send_queue.put(item) + self._sent_complete_event = True def sendFakeCompleteEvent(self): + if self._sent_complete_event: + return self.sendCompleteEvent('zuul:launcher-shutdown', 'SUCCESS', {}) def runJob(self, job): self.ansible_proc = None + result = None with self.running_job_lock: if not self._running: - return + return result self._running_job = True self.log.debug("Job %s: beginning" % (job.unique,)) - return 'SUCCESS' # TODO with JobDir() as jobdir: self.log.debug("Job %s: job root at %s" % (job.unique, jobdir.root)) args = json.loads(job.arguments) self.prepareAnsibleFiles(jobdir, args) - result = self.runAnsible(jobdir) + status = self.runAnsible(jobdir) data = { 'url': 'https://server/job', @@ -401,13 +411,12 @@ class NodeWorker(object): job.sendWorkData(json.dumps(data)) job.sendWorkStatus(0, 100) - result = dict(result=result) - job.sendWorkComplete(json.dumps(result)) + result = json.dumps(dict(result=status)) + + return result def getHostList(self, args): - # TODOv3: This should get the appropriate nodes from nodepool, - # or in the unit tests, be overriden to return localhost. 
- return [('localhost', dict(ansible_connection='local'))] + return [('node', dict(ansible_host=self.host))] def prepareAnsibleFiles(self, jobdir, args): with open(jobdir.inventory, 'w') as inventory: @@ -418,13 +427,14 @@ class NodeWorker(object): inventory.write('%s=%s' % (k, v)) inventory.write('\n') with open(jobdir.playbook, 'w') as playbook: - play = dict(hosts='localhost', + play = dict(hosts='node', tasks=[dict(name='test', shell='echo Hello world')]) playbook.write(yaml.dump([play])) with open(jobdir.config, 'w') as config: config.write('[defaults]\n') config.write('hostfile = %s\n' % jobdir.inventory) + config.write('host_key_checking = False\n') def runAnsible(self, jobdir): self.ansible_proc = subprocess.Popen( From feade500ffdc614806204cfe2fc87d3fd3ceeeb6 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 19 Apr 2016 17:45:40 -0700 Subject: [PATCH 025/152] Ansible launcher: run shell builders defined in JJB Change-Id: I01bffa94715a83aae35e843131775c34475798a8 --- zuul/launcher/ansiblelaunchserver.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 690659e8b7..6390d18c74 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -23,6 +23,7 @@ import subprocess import tempfile import threading import traceback +import uuid import gear import yaml @@ -40,6 +41,8 @@ class JobDir(object): self.inventory = os.path.join(self.ansible_root, 'inventory') self.playbook = os.path.join(self.ansible_root, 'playbook') self.config = os.path.join(self.ansible_root, 'ansible.cfg') + self.script_root = os.path.join(self.ansible_root, 'scripts') + os.makedirs(self.script_root) def __enter__(self): return self @@ -399,9 +402,8 @@ class NodeWorker(object): with JobDir() as jobdir: self.log.debug("Job %s: job root at %s" % (job.unique, jobdir.root)) - args = json.loads(job.arguments) - 
self.prepareAnsibleFiles(jobdir, args) + self.prepareAnsibleFiles(jobdir, job) status = self.runAnsible(jobdir) data = { @@ -415,21 +417,31 @@ class NodeWorker(object): return result - def getHostList(self, args): + def getHostList(self): return [('node', dict(ansible_host=self.host))] - def prepareAnsibleFiles(self, jobdir, args): + def prepareAnsibleFiles(self, jobdir, gearman_job): with open(jobdir.inventory, 'w') as inventory: - for host_name, host_vars in self.getHostList(args): + for host_name, host_vars in self.getHostList(): inventory.write(host_name) inventory.write(' ') for k, v in host_vars.items(): inventory.write('%s=%s' % (k, v)) inventory.write('\n') + job_name = gearman_job.name.split(':')[1] + jjb_job = self.jobs[job_name] with open(jobdir.playbook, 'w') as playbook: - play = dict(hosts='node', - tasks=[dict(name='test', - shell='echo Hello world')]) + tasks = [] + for builder in jjb_job.get('builders', []): + if 'shell' in builder: + script_fn = '%s.sh' % str(uuid.uuid4().hex) + script_fn = os.path.join(jobdir.script_root, script_fn) + with open(script_fn, 'w') as script: + script.write(builder['shell']) + tasks.append(dict(script='%s >> /tmp/console.log 2>&1' % + script_fn)) + play = dict(hosts='node', name='Job body', + tasks=tasks) playbook.write(yaml.dump([play])) with open(jobdir.config, 'w') as config: config.write('[defaults]\n') From 5bc39f435c0f742d530603bdc991eb0cda5f377e Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Wed, 20 Apr 2016 10:31:45 -0700 Subject: [PATCH 026/152] Ansible launcher: support job timeouts Change-Id: I522ba92dd0dcb43a49b6347f970e6e32afaf3f49 --- zuul/launcher/ansiblelaunchserver.py | 162 +++++++++++++++++++++------ 1 file changed, 127 insertions(+), 35 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 6390d18c74..a69acf6b28 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -16,12 +16,14 @@ import json import logging import multiprocessing import os +import Queue import shutil import signal import socket import subprocess import tempfile import threading +import time import traceback import uuid @@ -60,10 +62,13 @@ class LaunchServer(object): self.node_workers = {} self.mpmanager = multiprocessing.Manager() self.jobs = self.mpmanager.dict() - self.zmq_send_queue = multiprocessing.Queue() + self.zmq_send_queue = multiprocessing.JoinableQueue() + self.termination_queue = multiprocessing.JoinableQueue() def start(self): - self._running = True + self._gearman_running = True + self._zmq_running = True + self._reaper_running = True # Setup ZMQ self.zcontext = zmq.Context() @@ -88,10 +93,16 @@ class LaunchServer(object): # Start ZMQ worker thread self.log.debug("Starting ZMQ processor") - self.zmq_thread = threading.Thread(target=self.run_zmq) + self.zmq_thread = threading.Thread(target=self.runZMQ) self.zmq_thread.daemon = True self.zmq_thread.start() + # Start node worker reaper thread + self.log.debug("Starting reaper") + self.reaper_thread = threading.Thread(target=self.runReaper) + self.reaper_thread.daemon = True + self.reaper_thread.start() + # Start Gearman worker thread self.log.debug("Starting worker") self.gearman_thread = threading.Thread(target=self.run) @@ -119,21 +130,34 @@ class LaunchServer(object): self.config = config self.loadJobs() for node in self.node_workers.values(): - node.queue.put(dict(action='reconfigure')) + try: + if 
node.isAlive(): + node.queue.put(dict(action='reconfigure')) + except Exception: + self.log.exception("Exception sending reconfigure command " + "to worker:") def stop(self): self.log.debug("Stopping") - self._running = False + self._gearman_running = False + self._reaper_running = False self.worker.shutdown() for node in self.node_workers.values(): - node.stop() + try: + if node.isAlive(): + node.stop() + except Exception: + self.log.exception("Exception sending stop command to worker:") + self._zmq_running = False + self.zmq_send_queue.put(None) + self.zmq_send_queue.join() self.log.debug("Stopped") def join(self): self.gearman_thread.join() - def run_zmq(self): - while self._running: + def runZMQ(self): + while self._zmq_running or not self.zmq_send_queue.empty(): try: item = self.zmq_send_queue.get() self.log.debug("Got ZMQ event %s" % (item,)) @@ -142,9 +166,11 @@ class LaunchServer(object): self.zsocket.send(item) except Exception: self.log.exception("Exception while processing ZMQ events") + finally: + self.zmq_send_queue.task_done() def run(self): - while self._running: + while self._gearman_running: try: job = self.worker.getJob() try: @@ -168,7 +194,8 @@ class LaunchServer(object): worker = NodeWorker(self.config, self.jobs, args['name'], args['host'], args['description'], args['labels'], - self.hostname, self.zmq_send_queue) + self.hostname, self.zmq_send_queue, + self.termination_queue) self.node_workers[worker.name] = worker worker.process = multiprocessing.Process(target=worker.run) @@ -178,12 +205,27 @@ class LaunchServer(object): job.sendWorkData(json.dumps(data)) job.sendWorkComplete() + def runReaper(self): + # We don't actually care if all the events are processed + while self._reaper_running: + try: + item = self.termination_queue.get() + self.log.debug("Got termination event %s" % (item,)) + if item is None: + continue + del self.node_workers[item] + except Exception: + self.log.exception("Exception while processing " + "termination events:") + 
finally: + self.termination_queue.task_done() + class NodeWorker(object): log = logging.getLogger("zuul.NodeWorker") def __init__(self, config, jobs, name, host, description, labels, - manager_name, zmq_send_queue): + manager_name, zmq_send_queue, termination_queue): self.log.debug("Creating node worker %s" % (name,)) self.config = config self.jobs = jobs @@ -193,14 +235,25 @@ class NodeWorker(object): if not isinstance(labels, list): labels = [labels] self.labels = labels + self.process = None self.registered_functions = set() self._running = True - self.queue = multiprocessing.Queue() + self.queue = multiprocessing.JoinableQueue() self.manager_name = manager_name self.zmq_send_queue = zmq_send_queue + self.termination_queue = termination_queue self.running_job_lock = threading.Lock() + self._job_complete_event = threading.Event() self._running_job = False self._sent_complete_event = False + self._job_timeout = None + self._job_start_time = None + + def isAlive(self): + # Meant to be called from the manager + if self.process and self.process.is_alive(): + return True + return False def run(self): signal.signal(signal.SIGINT, signal.SIG_IGN) @@ -216,13 +269,13 @@ class NodeWorker(object): self.worker.waitForServer() self.register() - self.gearman_thread = threading.Thread(target=self.run_gearman) + self.gearman_thread = threading.Thread(target=self.runGearman) self.gearman_thread.daemon = True self.gearman_thread.start() - while self._running: + while self._running or not self.queue.empty(): try: - self._run_queue() + self._runQueue() except Exception: self.log.exception("Exception in queue manager:") @@ -233,26 +286,49 @@ class NodeWorker(object): self.log.debug("Submitting stop request") self._running = False self.queue.put(dict(action='stop')) + self.queue.join() - def _run_queue(self): - item = self.queue.get() - if item['action'] == 'stop': - self.log.debug("Received stop request") - self._running = False - self.worker.shutdown() - if not 
self.abortRunningJob(): - self.sendFakeCompleteEvent() - elif item['action'] == 'reconfigure': - self.register() + def _runQueue(self): + # This also runs the timeout function if needed + try: + item = self.queue.get(True, 10) # 10 second resolution on timeout + except Queue.Empty: + # We don't need these in a critical section, but we do + # need them not to change while we evaluate them, so make + # local copies. + running = self._running_job + start = self._job_start_time + timeout = self._job_timeout + now = time.time() + if (running and timeout and start + and now - start >= timeout): + self.log.info("Job timed out after %s seconds" % + (now - start,)) + self.abortRunningJob() + return + try: + if item['action'] == 'stop': + self.log.debug("Received stop request") + self._running = False + self.termination_queue.put(self.name) + if not self.abortRunningJob(): + self.sendFakeCompleteEvent() + else: + self._job_complete_event.wait() + self.worker.shutdown() + elif item['action'] == 'reconfigure': + self.register() + finally: + self.queue.task_done() - def run_gearman(self): + def runGearman(self): while self._running: try: - self._run_gearman() + self._runGearman() except Exception: self.log.exception("Exception in gearman manager:") - def _run_gearman(self): + def _runGearman(self): try: job = self.worker.getJob() except gear.InterruptedError: @@ -303,9 +379,11 @@ class NodeWorker(object): self.log.debug("Abort: a job is running") proc = self.ansible_proc if proc: - self.log.debug("Abort: sending kill signal to job process") + self.log.debug("Abort: sending kill signal to job " + "process group") try: - proc.kill() + pgid = os.getpgid(proc.pid) + os.killpg(pgid, signal.SIGKILL) aborted = True except Exception: self.log.exception("Exception while killing " @@ -316,8 +394,8 @@ class NodeWorker(object): return aborted def launch(self, job): - self.log.debug("Node worker %s launching job %s" % - (self.name, job.name)) + self.log.info("Node worker %s launching job 
%s" % + (self.name, job.name)) # Make sure we can parse what we need from the job first args = json.loads(job.arguments) @@ -342,6 +420,8 @@ class NodeWorker(object): self.log.exception("Exception while launching job thread") self._running_job = False + self._job_timeout = None + self._job_start_time = None if not result: result = b'' @@ -355,7 +435,8 @@ class NodeWorker(object): except Exception: self.log.exception("Exception while sending job completion event") - if offline: + self._job_complete_event.set() + if offline and self._running: self.stop() def sendStartEvent(self, name, parameters): @@ -397,6 +478,7 @@ class NodeWorker(object): if not self._running: return result self._running_job = True + self._job_complete_event.clear() self.log.debug("Job %s: beginning" % (job.unique,)) with JobDir() as jobdir: @@ -404,6 +486,8 @@ class NodeWorker(object): (job.unique, jobdir.root)) self.prepareAnsibleFiles(jobdir, job) + + self._job_start_time = time.time() status = self.runAnsible(jobdir) data = { @@ -430,6 +514,15 @@ class NodeWorker(object): inventory.write('\n') job_name = gearman_job.name.split(':')[1] jjb_job = self.jobs[job_name] + + for wrapper in jjb_job.get('wrappers', []): + if isinstance(wrapper, dict): + timeout = wrapper.get('build-timeout', {}) + if isinstance(timeout, dict): + timeout = timeout.get('timeout') + if timeout: + self._job_timeout = timeout * 60 + with open(jobdir.playbook, 'w') as playbook: tasks = [] for builder in jjb_job.get('builders', []): @@ -454,12 +547,11 @@ class NodeWorker(object): cwd=jobdir.ansible_root, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + preexec_fn=os.setsid, ) (out, err) = self.ansible_proc.communicate() ret = self.ansible_proc.wait() self.ansible_proc = None - print out - print err if ret == 0: return 'SUCCESS' else: From 79be4baf4810560384c642fcd059fa554baeef46 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Wed, 20 Apr 2016 15:21:58 -0700 Subject: [PATCH 027/152] Ansible launcher: support scp publisher Change-Id: I6d32e37f9bf9d55fe2df4173e69397e9e8da9f60 --- zuul/launcher/ansiblelaunchserver.py | 49 ++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index a69acf6b28..35b5df6059 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -42,6 +42,7 @@ class JobDir(object): os.makedirs(self.ansible_root) self.inventory = os.path.join(self.ansible_root, 'inventory') self.playbook = os.path.join(self.ansible_root, 'playbook') + self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') self.config = os.path.join(self.ansible_root, 'ansible.cfg') self.script_root = os.path.join(self.ansible_root, 'scripts') os.makedirs(self.script_root) @@ -488,7 +489,6 @@ class NodeWorker(object): self.prepareAnsibleFiles(jobdir, job) self._job_start_time = time.time() - status = self.runAnsible(jobdir) data = { 'url': 'https://server/job', @@ -497,6 +497,13 @@ class NodeWorker(object): job.sendWorkData(json.dumps(data)) job.sendWorkStatus(0, 100) + job_status = self.runAnsiblePlaybook(jobdir) + post_status = self.runAnsiblePostPlaybook(jobdir, job_status) + if job_status and post_status: + status = 'SUCCESS' + else: + status = 'FAILURE' + result = json.dumps(dict(result=status)) return result @@ -536,12 +543,33 @@ class NodeWorker(object): play = dict(hosts='node', name='Job body', tasks=tasks) playbook.write(yaml.dump([play])) + + with open(jobdir.post_playbook, 'w') as playbook: + tasks = [] + for publisher in jjb_job.get('publishers', []): + if 'scp' in publisher: + for scpfile in publisher['scp']['files']: + if scpfile.get('copy-console'): + src = '/tmp/console.log' + else: + src = scpfile['source'] + syncargs = dict(src=src, + dest=scpfile['target']) + task = dict(synchronize=syncargs, + 
delegate_to=publisher['scp']['site']) + if not scpfile.get('copy-after-failure'): + task['when'] = 'success' + tasks.append(task) + play = dict(hosts='node', name='Publishers', + tasks=tasks) + playbook.write(yaml.dump([play])) + with open(jobdir.config, 'w') as config: config.write('[defaults]\n') config.write('hostfile = %s\n' % jobdir.inventory) config.write('host_key_checking = False\n') - def runAnsible(self, jobdir): + def runAnsiblePlaybook(self, jobdir): self.ansible_proc = subprocess.Popen( ['ansible-playbook', jobdir.playbook], cwd=jobdir.ansible_root, @@ -552,10 +580,19 @@ class NodeWorker(object): (out, err) = self.ansible_proc.communicate() ret = self.ansible_proc.wait() self.ansible_proc = None - if ret == 0: - return 'SUCCESS' - else: - return 'FAILURE' + return ret == 0 + + def runAnsiblePostPlaybook(self, jobdir, success): + proc = subprocess.Popen( + ['ansible-playbook', jobdir.post_playbook, + '-e', 'success=%s' % success], + cwd=jobdir.ansible_root, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + preexec_fn=os.setsid, + ) + (out, err) = proc.communicate() + return proc.wait() == 0 class JJB(jenkins_jobs.builder.Builder): From 08d7d4b49b40302ffb3a7f8e754bdfbf52f43a94 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Thu, 21 Apr 2016 09:43:10 -0700 Subject: [PATCH 028/152] Ansible launcher: support ftp publisher Change-Id: I53af72a68659e886d3a4bc974b1b638644128aa5 --- zuul/launcher/ansiblelaunchserver.py | 86 +++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 35b5df6059..62d7fc02f3 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -17,6 +17,7 @@ import logging import multiprocessing import os import Queue +import re import shutil import signal import socket @@ -56,6 +57,7 @@ class JobDir(object): class LaunchServer(object): log = logging.getLogger("zuul.LaunchServer") + section_re = re.compile('site "(.*?)"') def __init__(self, config): self.config = config @@ -65,6 +67,17 @@ class LaunchServer(object): self.jobs = self.mpmanager.dict() self.zmq_send_queue = multiprocessing.JoinableQueue() self.termination_queue = multiprocessing.JoinableQueue() + self.sites = {} + + for section in config.sections(): + m = self.section_re.match(section) + if m: + sitename = m.group(1) + d = {} + d['host'] = config.get(section, 'host') + d['user'] = config.get(section, 'user') + d['pass'] = config.get(section, 'pass') + self.sites[sitename] = d def start(self): self._gearman_running = True @@ -192,7 +205,7 @@ class LaunchServer(object): def assignNode(self, job): args = json.loads(job.arguments) self.log.debug("Assigned node with arguments: %s" % (args,)) - worker = NodeWorker(self.config, self.jobs, + worker = NodeWorker(self.config, self.jobs, self.sites, args['name'], args['host'], args['description'], args['labels'], self.hostname, self.zmq_send_queue, @@ -225,11 +238,12 @@ class LaunchServer(object): class NodeWorker(object): log = logging.getLogger("zuul.NodeWorker") - def __init__(self, config, jobs, name, host, description, labels, + def __init__(self, config, jobs, sites, name, host, description, labels, 
manager_name, zmq_send_queue, termination_queue): self.log.debug("Creating node worker %s" % (name,)) self.config = config self.jobs = jobs + self.sites = sites self.name = name self.host = host self.description = description @@ -511,6 +525,59 @@ class NodeWorker(object): def getHostList(self): return [('node', dict(ansible_host=self.host))] + def _makeSCPTask(self, publisher): + tasks = [] + for scpfile in publisher['scp']['files']: + site = publisher['scp']['site'] + if site not in self.sites: + raise Exception("Undefined SCP site: %s" % (site,)) + site = self.sites[site] + if scpfile.get('copy-console'): + src = '/tmp/console.log' + else: + src = scpfile['source'] + syncargs = dict(src=src, + dest=scpfile['target']) + task = dict(synchronize=syncargs, + delegate_to=site['host']) + if not scpfile.get('copy-after-failure'): + task['when'] = 'success' + tasks.append(task) + return tasks + + def _makeFTPTask(self, jobdir, publisher): + tasks = [] + ftp = publisher['ftp'] + site = ftp['site'] + if site not in self.sites: + raise Exception("Undefined FTP site: %s" % site) + site = self.sites[site] + ftproot = tempfile.mkdtemp(dir=jobdir.ansible_root) + ftpcontent = os.path.join(ftproot, 'content') + os.makedirs(ftpcontent) + ftpscript = os.path.join(ftproot, 'script') + syncargs = dict(src=ftp['source'], + dest=ftpcontent) + task = dict(synchronize=syncargs, + when='success') + tasks.append(task) + task = dict(shell='lftp -f %s' % ftpscript, + when='success') + ftpsource = ftpcontent + if ftp.get('remove-prefix'): + ftpsource = os.path.join(ftpcontent, ftp['remove-prefix']) + while ftpsource[-1] == '/': + ftpsource = ftpsource[:-1] + ftptarget = ftp['target'] + while ftptarget[-1] == '/': + ftptarget = ftptarget[:-1] + with open(ftpscript, 'w') as script: + script.write('open %s\n' % site['host']) + script.write('user %s %s\n' % (site['user'], site['pass'])) + script.write('mirror -R %s %s\n' % (ftpsource, ftptarget)) + tasks.append(task) + return tasks + def 
prepareAnsibleFiles(self, jobdir, gearman_job): with open(jobdir.inventory, 'w') as inventory: for host_name, host_vars in self.getHostList(): @@ -548,18 +615,9 @@ class NodeWorker(object): tasks = [] for publisher in jjb_job.get('publishers', []): if 'scp' in publisher: - for scpfile in publisher['scp']['files']: - if scpfile.get('copy-console'): - src = '/tmp/console.log' - else: - src = scpfile['source'] - syncargs = dict(src=src, - dest=scpfile['target']) - task = dict(synchronize=syncargs, - delegate_to=publisher['scp']['site']) - if not scpfile.get('copy-after-failure'): - task['when'] = 'success' - tasks.append(task) + tasks.extend(self._makeSCPTask(publisher)) + if 'ftp' in publisher: + tasks.extend(self._makeFTPTask(jobdir, publisher)) play = dict(hosts='node', name='Publishers', tasks=tasks) playbook.write(yaml.dump([play])) From 19233fbff5ee1b252ab014d2ab908fb66d70751a Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 21 Apr 2016 11:26:02 -0700 Subject: [PATCH 029/152] Ansible launcher: support zuul stop commands Change-Id: I02f43162fb9c5d691f0d24841c196f5cb5e2e43e --- zuul/launcher/ansiblelaunchserver.py | 57 ++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 62d7fc02f3..4e652d26f8 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -65,6 +65,7 @@ class LaunchServer(object): self.node_workers = {} self.mpmanager = multiprocessing.Manager() self.jobs = self.mpmanager.dict() + self.builds = self.mpmanager.dict() self.zmq_send_queue = multiprocessing.JoinableQueue() self.termination_queue = multiprocessing.JoinableQueue() self.sites = {} @@ -138,6 +139,7 @@ class LaunchServer(object): def register(self): self.worker.registerFunction("node-assign:zuul") + self.worker.registerFunction("stop:%s" % self.hostname) def reconfigure(self, config): self.log.debug("Reconfiguring") @@ -189,8 
+191,11 @@ class LaunchServer(object): job = self.worker.getJob() try: if job.name.startswith('node-assign:'): - self.log.debug("Got assign-node job: %s" % job.unique) + self.log.debug("Got node-assign job: %s" % job.unique) self.assignNode(job) + elif job.name.startswith('stop:'): + self.log.debug("Got stop job: %s" % job.unique) + self.stopJob(job) else: self.log.error("Unable to handle job %s" % job.name) job.sendWorkFail() @@ -205,8 +210,8 @@ class LaunchServer(object): def assignNode(self, job): args = json.loads(job.arguments) self.log.debug("Assigned node with arguments: %s" % (args,)) - worker = NodeWorker(self.config, self.jobs, self.sites, - args['name'], args['host'], + worker = NodeWorker(self.config, self.jobs, self.builds, + self.sites, args['name'], args['host'], args['description'], args['labels'], self.hostname, self.zmq_send_queue, self.termination_queue) @@ -219,6 +224,31 @@ class LaunchServer(object): job.sendWorkData(json.dumps(data)) job.sendWorkComplete() + def stopJob(self, job): + try: + args = json.loads(job.arguments) + self.log.debug("Stop job with arguments: %s" % (args,)) + unique = args['number'] + build_worker_name = self.builds.get(unique) + if not build_worker_name: + self.log.debug("Unable to find build for job %s" % (unique,)) + return + node = self.node_workers.get(build_worker_name) + if not node: + self.log.debug("Unable to find worker for job %s" % (unique,)) + return + try: + if node.isAlive(): + node.queue.put(dict(action='abort')) + else: + self.log.debug("Node %s is not alive while aborting job" % + (node.name,)) + except Exception: + self.log.exception("Exception sending abort command " + "to worker:") + finally: + job.sendWorkComplete() + def runReaper(self): # We don't actually care if all the events are processed while self._reaper_running: @@ -238,11 +268,13 @@ class LaunchServer(object): class NodeWorker(object): log = logging.getLogger("zuul.NodeWorker") - def __init__(self, config, jobs, sites, name, host, 
description, labels, - manager_name, zmq_send_queue, termination_queue): + def __init__(self, config, jobs, builds, sites, name, host, + description, labels, manager_name, zmq_send_queue, + termination_queue): self.log.debug("Creating node worker %s" % (name,)) self.config = config self.jobs = jobs + self.builds = builds self.sites = sites self.name = name self.host = host @@ -332,7 +364,11 @@ class NodeWorker(object): self._job_complete_event.wait() self.worker.shutdown() elif item['action'] == 'reconfigure': + self.log.debug("Received reconfigure request") self.register() + elif item['action'] == 'abort': + self.log.debug("Received abort request") + self.abortRunningJob() finally: self.queue.task_done() @@ -450,6 +486,11 @@ class NodeWorker(object): except Exception: self.log.exception("Exception while sending job completion event") + try: + del self.builds[job.unique] + except Exception: + self.log.exception("Exception while clearing build record") + self._job_complete_event.set() if offline and self._running: self.stop() @@ -496,6 +537,7 @@ class NodeWorker(object): self._job_complete_event.clear() self.log.debug("Job %s: beginning" % (job.unique,)) + self.builds[job.unique] = self.name with JobDir() as jobdir: self.log.debug("Job %s: job root at %s" % (job.unique, jobdir.root)) @@ -505,8 +547,9 @@ class NodeWorker(object): self._job_start_time = time.time() data = { - 'url': 'https://server/job', - 'number': 1 + 'manager': self.manager_name, + 'number': job.unique, + # 'url': '', } job.sendWorkData(json.dumps(data)) job.sendWorkStatus(0, 100) From 47ef69f941d11d7d3978a4d507b8052b8c52bd6d Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 21 Apr 2016 17:28:58 -0700 Subject: [PATCH 030/152] Ansible launcher: add zuul_runner module This runs the commands asynchronously (but waits for their completion). This is more robust for long-running commands because it avoids the built-in ssh timeout. 
This adds an ansible module to actually run the remote command so that we can: * process the console log * use ansible async (the script module does not support it) * control the environment variables of the script being run It also adds a callback plugin to track the elapsed time so that we can use the built-in timeout features of async commands. Note that the module and plugin are GPL licensed. Change-Id: I19b2b6a5c362bb9d843e7802aefe0eb5df9c5ed7 --- zuul/ansible/__init__.py | 0 zuul/ansible/library/__init__.py | 0 zuul/ansible/library/zuul_runner.py | 74 +++++++++++ zuul/ansible/plugins/__init__.py | 0 .../plugins/callback_plugins/__init__.py | 0 .../plugins/callback_plugins/timeout.py | 57 ++++++++ zuul/launcher/ansiblelaunchserver.py | 122 ++++++++++++------ 7 files changed, 211 insertions(+), 42 deletions(-) create mode 100644 zuul/ansible/__init__.py create mode 100644 zuul/ansible/library/__init__.py create mode 100644 zuul/ansible/library/zuul_runner.py create mode 100644 zuul/ansible/plugins/__init__.py create mode 100644 zuul/ansible/plugins/callback_plugins/__init__.py create mode 100644 zuul/ansible/plugins/callback_plugins/timeout.py diff --git a/zuul/ansible/__init__.py b/zuul/ansible/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/zuul/ansible/library/__init__.py b/zuul/ansible/library/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py new file mode 100644 index 0000000000..75542445ee --- /dev/null +++ b/zuul/ansible/library/zuul_runner.py @@ -0,0 +1,74 @@ +#!/usr/bin/python + +# Copyright (c) 2016 IBM Corp. +# +# This module is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This software is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this software. If not, see . + +import datetime +import subprocess + + +class Console(object): + def __enter__(self): + self.logfile = open('/tmp/console.log', 'w+') + return self + + def __exit__(self, etype, value, tb): + self.logfile.close() + + def addLine(self, ln): + ts = datetime.datetime.now() + outln = '%s %s' % (str(ts), ln) + self.logfile.write(outln) + + +def run(cwd, cmd, args): + proc = subprocess.Popen( + [cmd], + cwd=cwd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + env=args, + ) + + with Console() as console: + while True: + line = proc.stdout.readline() + if not line: + break + console.addLine(line) + + ret = proc.wait() + return ret + + +def main(): + module = AnsibleModule( + argument_spec=dict( + command=dict(required=True, default=None), + cwd=dict(required=True, default=None), + parameters=dict(default={}, type='dict') + ) + ) + + p = module.params + ret = run(p['cwd'], p['command'], p['parameters']) + if ret == 0: + module.exit_json(changed=True, rc=ret) + else: + module.fail_json(msg="Exit code %s" % ret, rc=ret) + +from ansible.module_utils.basic import * # noqa + +main() diff --git a/zuul/ansible/plugins/__init__.py b/zuul/ansible/plugins/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/zuul/ansible/plugins/callback_plugins/__init__.py b/zuul/ansible/plugins/callback_plugins/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/zuul/ansible/plugins/callback_plugins/timeout.py b/zuul/ansible/plugins/callback_plugins/timeout.py new file mode 100644 index 0000000000..245e9884ec --- /dev/null +++ b/zuul/ansible/plugins/callback_plugins/timeout.py @@ -0,0 
+1,57 @@ +# Copyright 2016 IBM Corp. +# +# This file is part of Zuul +# +# This file is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This file is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this file. If not, see . + +import time + +from ansible.executor.task_result import TaskResult +from ansible.plugins.callback import CallbackBase + + +class CallbackModule(CallbackBase): + def __init__(self, *args, **kw): + super(CallbackModule, self).__init__(*args, **kw) + self._elapsed_time = 0.0 + self._task_start_time = None + self._play = None + + def v2_playbook_on_play_start(self, play): + self._play = play + + def playbook_on_task_start(self, name, is_conditional): + self._task_start_time = time.time() + + def v2_on_any(self, *args, **kw): + result = None + if args and isinstance(args[0], TaskResult): + result = args[0] + if not result: + return + + if self._task_start_time is not None: + task_time = time.time() - self._task_start_time + self._elapsed_time += task_time + if self._play and result._host: + manager = self._play.get_variable_manager() + facts = dict(elapsed_time=self._elapsed_time) + + overall_timeout = manager.extra_vars.get('timeout') + if overall_timeout is not None: + timeout = int(overall_timeout) - int(self._elapsed_time) + facts['timeout'] = timeout + + manager.set_nonpersistent_facts(result._host, facts) + self._task_start_time = None diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 4e652d26f8..8eb0374627 100644 --- 
a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -16,7 +16,6 @@ import json import logging import multiprocessing import os -import Queue import re import shutil import signal @@ -24,7 +23,6 @@ import socket import subprocess import tempfile import threading -import time import traceback import uuid @@ -33,6 +31,9 @@ import yaml import jenkins_jobs.builder import zmq +import zuul.ansible.library +import zuul.ansible.plugins.callback_plugins + class JobDir(object): def __init__(self): @@ -41,6 +42,8 @@ class JobDir(object): os.makedirs(self.git_root) self.ansible_root = os.path.join(self.root, 'ansible') os.makedirs(self.ansible_root) + self.plugins_root = os.path.join(self.ansible_root, 'plugins') + os.makedirs(self.plugins_root) self.inventory = os.path.join(self.ansible_root, 'inventory') self.playbook = os.path.join(self.ansible_root, 'playbook') self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') @@ -293,8 +296,7 @@ class NodeWorker(object): self._job_complete_event = threading.Event() self._running_job = False self._sent_complete_event = False - self._job_timeout = None - self._job_start_time = None + self.workspace_root = config.get('launcher', 'workspace_root') def isAlive(self): # Meant to be called from the manager @@ -336,23 +338,7 @@ class NodeWorker(object): self.queue.join() def _runQueue(self): - # This also runs the timeout function if needed - try: - item = self.queue.get(True, 10) # 10 second resolution on timeout - except Queue.Empty: - # We don't need these in a critical section, but we do - # need them not to change while we evaluate them, so make - # local copies. 
- running = self._running_job - start = self._job_start_time - timeout = self._job_timeout - now = time.time() - if (running and timeout and start - and now - start >= timeout): - self.log.info("Job timed out after %s seconds" % - (now - start,)) - self.abortRunningJob() - return + item = self.queue.get() try: if item['action'] == 'stop': self.log.debug("Received stop request") @@ -466,13 +452,11 @@ class NodeWorker(object): self.log.exception("Exception while sending job start event") try: - result = self.runJob(job) + result = self.runJob(job, args) except Exception: self.log.exception("Exception while launching job thread") self._running_job = False - self._job_timeout = None - self._job_start_time = None if not result: result = b'' @@ -527,7 +511,7 @@ class NodeWorker(object): self.sendCompleteEvent('zuul:launcher-shutdown', 'SUCCESS', {}) - def runJob(self, job): + def runJob(self, job, args): self.ansible_proc = None result = None with self.running_job_lock: @@ -541,10 +525,7 @@ class NodeWorker(object): with JobDir() as jobdir: self.log.debug("Job %s: job root at %s" % (job.unique, jobdir.root)) - - self.prepareAnsibleFiles(jobdir, job) - - self._job_start_time = time.time() + timeout = self.prepareAnsibleFiles(jobdir, job, args) data = { 'manager': self.manager_name, @@ -554,7 +535,7 @@ class NodeWorker(object): job.sendWorkData(json.dumps(data)) job.sendWorkStatus(0, 100) - job_status = self.runAnsiblePlaybook(jobdir) + job_status = self.runAnsiblePlaybook(jobdir, timeout) post_status = self.runAnsiblePostPlaybook(jobdir, job_status) if job_status and post_status: status = 'SUCCESS' @@ -621,7 +602,46 @@ class NodeWorker(object): tasks.append(task) return tasks - def prepareAnsibleFiles(self, jobdir, gearman_job): + def _makeBuilderTask(self, jobdir, builder, parameters, timeout): + tasks = [] + script_fn = '%s.sh' % str(uuid.uuid4().hex) + script_path = os.path.join(jobdir.script_root, script_fn) + with open(script_path, 'w') as script: + 
script.write(builder['shell']) + + remote_path = os.path.join('/tmp', script_fn) + copy = dict(src=script_path, + dest=remote_path, + mode=0555) + task = dict(copy=copy) + tasks.append(task) + + runner = dict(command=remote_path, + cwd=parameters['WORKSPACE'], + parameters=parameters) + task = dict(zuul_runner=runner) + if timeout: + task['when'] = '{{ timeout | int > 0 }}' + task['async'] = '{{ timeout }}' + else: + task['async'] = 2 * 60 * 60 # 2 hour default timeout + task['poll'] = 5 + tasks.append(task) + + filetask = dict(path=remote_path, + state='absent') + task = dict(file=filetask) + tasks.append(task) + + return tasks + + def prepareAnsibleFiles(self, jobdir, gearman_job, args): + job_name = gearman_job.name.split(':')[1] + jjb_job = self.jobs[job_name] + + parameters = args.copy() + parameters['WORKSPACE'] = os.path.join(self.workspace_root, job_name) + with open(jobdir.inventory, 'w') as inventory: for host_name, host_vars in self.getHostList(): inventory.write(host_name) @@ -629,27 +649,30 @@ class NodeWorker(object): for k, v in host_vars.items(): inventory.write('%s=%s' % (k, v)) inventory.write('\n') - job_name = gearman_job.name.split(':')[1] - jjb_job = self.jobs[job_name] + timeout = None for wrapper in jjb_job.get('wrappers', []): if isinstance(wrapper, dict): timeout = wrapper.get('build-timeout', {}) if isinstance(timeout, dict): timeout = timeout.get('timeout') if timeout: - self._job_timeout = timeout * 60 + timeout = timeout * 60 with open(jobdir.playbook, 'w') as playbook: tasks = [] + + task = dict(file=dict(path='/tmp/console.log', state='absent')) + tasks.append(task) + + task = dict(file=dict(path=parameters['WORKSPACE'], + state='directory')) + tasks.append(task) + for builder in jjb_job.get('builders', []): if 'shell' in builder: - script_fn = '%s.sh' % str(uuid.uuid4().hex) - script_fn = os.path.join(jobdir.script_root, script_fn) - with open(script_fn, 'w') as script: - script.write(builder['shell']) - tasks.append(dict(script='%s 
>> /tmp/console.log 2>&1' % - script_fn)) + tasks.extend(self._makeBuilderTask(jobdir, builder, + parameters, timeout)) play = dict(hosts='node', name='Job body', tasks=tasks) playbook.write(yaml.dump([play])) @@ -670,15 +693,30 @@ class NodeWorker(object): config.write('hostfile = %s\n' % jobdir.inventory) config.write('host_key_checking = False\n') - def runAnsiblePlaybook(self, jobdir): + callback_path = zuul.ansible.plugins.callback_plugins.__file__ + callback_path = os.path.abspath(callback_path) + callback_path = os.path.dirname(callback_path) + config.write('callback_plugins = %s\n' % callback_path) + + library_path = zuul.ansible.library.__file__ + library_path = os.path.abspath(library_path) + library_path = os.path.dirname(library_path) + config.write('library = %s\n' % library_path) + + return timeout + + def runAnsiblePlaybook(self, jobdir, timeout): self.ansible_proc = subprocess.Popen( - ['ansible-playbook', jobdir.playbook], + ['ansible-playbook', jobdir.playbook, + '-e', 'timeout=%s' % timeout, '-v'], cwd=jobdir.ansible_root, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=os.setsid, ) (out, err) = self.ansible_proc.communicate() + self.log.debug("Ansible stdout:\n%s" % out) + self.log.debug("Ansible stderr:\n%s" % err) ret = self.ansible_proc.wait() self.ansible_proc = None return ret == 0 From c4b1937ed67d884ef5f9d0db28c8e7df287ec8a8 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 22 Apr 2016 09:26:37 -0700 Subject: [PATCH 031/152] Ansible launcher: add streaming console support Note the GPL license as it is an ansible module. 
Change-Id: I5878c626beebe9cc131a431fa46bf67f3eb9c426 --- zuul/ansible/library/zuul_console.py | 194 +++++++++++++++++++++++++++ zuul/launcher/ansiblelaunchserver.py | 5 +- 2 files changed, 198 insertions(+), 1 deletion(-) create mode 100644 zuul/ansible/library/zuul_console.py diff --git a/zuul/ansible/library/zuul_console.py b/zuul/ansible/library/zuul_console.py new file mode 100644 index 0000000000..0e3e0668c7 --- /dev/null +++ b/zuul/ansible/library/zuul_console.py @@ -0,0 +1,194 @@ +#!/usr/bin/python + +# Copyright (c) 2016 IBM Corp. +# +# This module is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this software. If not, see . + +import os +import sys +import socket +import threading + + +def daemonize(): + # A really basic daemonize method that should work well enough for + # now in this circumstance. 
Based on the public domain code at: + # http://web.archive.org/web/20131017130434/http://www.jejik.com/articles/2007/02/a_simple_unix_linux_daemon_in_python/ + + pid = os.fork() + if pid > 0: + return True + + os.chdir('/') + os.setsid() + os.umask(0) + + pid = os.fork() + if pid > 0: + sys.exit(0) + + sys.stdout.flush() + sys.stderr.flush() + i = open('/dev/null', 'r') + o = open('/dev/null', 'a+') + e = open('/dev/null', 'a+', 0) + os.dup2(i.fileno(), sys.stdin.fileno()) + os.dup2(o.fileno(), sys.stdout.fileno()) + os.dup2(e.fileno(), sys.stderr.fileno()) + return False + + +class Console(object): + def __init__(self, path): + self.path = path + self.file = open(path) + self.stat = os.stat(path) + self.size = self.stat.st_size + + +class Server(object): + def __init__(self, path, port): + self.path = path + s = None + for res in socket.getaddrinfo(None, port, socket.AF_UNSPEC, + socket.SOCK_STREAM, 0, + socket.AI_PASSIVE): + af, socktype, proto, canonname, sa = res + try: + s = socket.socket(af, socktype, proto) + s.setsockopt(socket.SOL_SOCKET, + socket.SO_REUSEADDR, 1) + except socket.error: + s = None + continue + try: + s.bind(sa) + s.listen(1) + except socket.error: + s.close() + s = None + continue + break + if s is None: + sys.exit(1) + self.socket = s + + def accept(self): + conn, addr = self.socket.accept() + return conn + + def run(self): + while True: + conn = self.accept() + t = threading.Thread(target=self.handleOneConnection, args=(conn,)) + t.daemon = True + t.start() + + def chunkConsole(self, conn): + try: + console = Console(self.path) + except Exception: + return + while True: + chunk = console.file.read(4096) + if not chunk: + break + conn.send(chunk) + return console + + def followConsole(self, console, conn): + while True: + r = [console.file, conn] + e = [console.file, conn] + r, w, e = select.select(r, [], e) + + if console.file in e: + return True + if conn in e: + return False + if conn in r: + ret = conn.recv(1024) + # Discard anything 
read, if input is eof, it has + # disconnected. + if not ret: + return False + + if console.file in r: + line = console.file.readline() + if line: + conn.send(line) + time.sleep(0.5) + try: + st = os.stat(console.path) + if (st.st_ino != console.stat.st_ino or + st.st_size < console.size): + return True + except Exception: + return True + console.size = st.st_size + + def handleOneConnection(self, conn): + # FIXME: this won't notice disconnects until it tries to send + console = None + try: + while True: + if console is not None: + try: + console.file.close() + except: + pass + while True: + console = self.chunkConsole(conn) + if console: + break + time.sleep(0.5) + while True: + if self.followConsole(console, conn): + break + else: + return + finally: + try: + conn.close() + except Exception: + pass + + +def test(): + s = Server('/tmp/console.log', 8088) + s.run() + + +def main(): + module = AnsibleModule( + argument_spec=dict( + path=dict(default='/tmp/console.log'), + port=dict(default=8088, type='int'), + ) + ) + + p = module.params + path = p['path'] + port = p['port'] + + if daemonize(): + module.exit_json() + + s = Server(path, port) + s.run() + +from ansible.module_utils.basic import * # noqa + +main() +# test() diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 8eb0374627..9482f26c52 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -530,7 +530,7 @@ class NodeWorker(object): data = { 'manager': self.manager_name, 'number': job.unique, - # 'url': '', + 'url': 'telnet://%s:8088' % self.host, } job.sendWorkData(json.dumps(data)) job.sendWorkStatus(0, 100) @@ -665,6 +665,9 @@ class NodeWorker(object): task = dict(file=dict(path='/tmp/console.log', state='absent')) tasks.append(task) + task = dict(zuul_console=dict(path='/tmp/console.log', port=8088)) + tasks.append(task) + task = dict(file=dict(path=parameters['WORKSPACE'], state='directory')) tasks.append(task) From 
e378cab3f2565bb9443bc3dbe5707f1774d10d90 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sun, 1 May 2016 18:13:43 -0500 Subject: [PATCH 032/152] Add private_key_file configuration for launcher Expose the ability for a user to control which SSH private key to use with ansible-playbook. Change-Id: If0bd3941f26d6ed28e8eede539e9a664a34ba8b3 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 9482f26c52..f58a82b956 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -297,6 +297,10 @@ class NodeWorker(object): self._running_job = False self._sent_complete_event = False self.workspace_root = config.get('launcher', 'workspace_root') + if self.config.has_option('launcher', 'private_key_file'): + self.private_key_file = config.get('launcher', 'private_key_file') + else: + self.private_key_file = '~/.ssh/id_rsa' def isAlive(self): # Meant to be called from the manager @@ -695,6 +699,7 @@ class NodeWorker(object): config.write('[defaults]\n') config.write('hostfile = %s\n' % jobdir.inventory) config.write('host_key_checking = False\n') + config.write('private_key_file = %s\n' % self.private_key_file) callback_path = zuul.ansible.plugins.callback_plugins.__file__ callback_path = os.path.abspath(callback_path) From 16ff024e9a3d8b51bf059e961b1143d2792b3b94 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Mon, 2 May 2016 07:59:57 -0400 Subject: [PATCH 033/152] Add username configuration option for launcher Expose the ability for a user to change the ansible_user setting. This controls the username ansible will use to SSH into the remote node. 
Change-Id: I7a5788ba2c22d7e7f6e87c43ab1762658451b205 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f58a82b956..52576ff4d6 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -301,6 +301,10 @@ class NodeWorker(object): self.private_key_file = config.get('launcher', 'private_key_file') else: self.private_key_file = '~/.ssh/id_rsa' + if self.config.has_option('launcher', 'username'): + self.username = config.get('launcher', 'username') + else: + self.username = 'zuul' def isAlive(self): # Meant to be called from the manager @@ -551,7 +555,8 @@ class NodeWorker(object): return result def getHostList(self): - return [('node', dict(ansible_host=self.host))] + return [('node', dict( + ansible_host=self.host, ansible_user=self.username))] def _makeSCPTask(self, publisher): tasks = [] @@ -649,9 +654,8 @@ class NodeWorker(object): with open(jobdir.inventory, 'w') as inventory: for host_name, host_vars in self.getHostList(): inventory.write(host_name) - inventory.write(' ') for k, v in host_vars.items(): - inventory.write('%s=%s' % (k, v)) + inventory.write(' %s=%s' % (k, v)) inventory.write('\n') timeout = None From 1849e725ed87fc93ca11f86fbd9399199ba2cef7 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Thu, 12 May 2016 14:47:45 -0700 Subject: [PATCH 034/152] Ansible launcher: Support configurable site roots for ftp/scp Change-Id: Ic239d8bbff7f12406730c6d523127464c1f83f4d --- zuul/launcher/ansiblelaunchserver.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 52576ff4d6..844b390619 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -80,7 +80,8 @@ class LaunchServer(object): d = {} d['host'] = config.get(section, 'host') d['user'] = config.get(section, 'user') - d['pass'] = config.get(section, 'pass') + d['pass'] = config.get(section, 'pass', '') + d['root'] = config.get(section, 'root', '/') self.sites[sitename] = d def start(self): @@ -569,8 +570,13 @@ class NodeWorker(object): src = '/tmp/console.log' else: src = scpfile['source'] + dest = os.path.join(site['root'], scpfile['target']) + dest = os.path.normpath(dest) + if not dest.startswith(site['root']): + raise Exception("Target path %s is not below site root" % + (dest,)) syncargs = dict(src=src, - dest=scpfile['target']) + dest=dest) task = dict(synchronize=syncargs, delegate_to=site['host']) if not scpfile.get('copy-after-failure'): @@ -602,6 +608,11 @@ class NodeWorker(object): while ftpsource[-1] == '/': ftpsource = ftpsource[:-1] ftptarget = ftp['target'] + ftptarget = os.path.join(site['root'], ftp['target']) + ftptarget = os.path.normpath(ftptarget) + if not ftptarget.startswith(site['root']): + raise Exception("Target path %s is not below site root" % + (ftptarget,)) while ftptarget[-1] == '/': ftptarget = ftptarget[:-1] with open(ftpscript, 'w') as script: From de2ca126fb315a8e03229033ed7932430db27b94 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Fri, 13 May 2016 12:49:27 -0400 Subject: [PATCH 035/152] Ansible launcher: Fix module main calls Soon-to-be-released Ansible 2.1 can sometimes hate you for not surrounding your 
call to main() with the proper conditional check. Let's fix that. Change-Id: If9eca7866d8d2057d200110186e03772cd77a8fb --- zuul/ansible/library/zuul_console.py | 3 ++- zuul/ansible/library/zuul_runner.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/zuul/ansible/library/zuul_console.py b/zuul/ansible/library/zuul_console.py index 0e3e0668c7..bb6ec7bf3e 100644 --- a/zuul/ansible/library/zuul_console.py +++ b/zuul/ansible/library/zuul_console.py @@ -190,5 +190,6 @@ def main(): from ansible.module_utils.basic import * # noqa -main() +if __name__ == '__main__': + main() # test() diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index 75542445ee..6fc8f2daf6 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -71,4 +71,5 @@ def main(): from ansible.module_utils.basic import * # noqa -main() +if __name__ == '__main__': + main() From cdada263178fa79aadbe123675a6b91ac230925c Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Mon, 16 May 2016 20:26:46 -0400 Subject: [PATCH 036/152] Set default_flow_style=False for readability A cosmetic change to format the playbook as block style. 
Change-Id: I458cdaccfe903780282dd29577eedd4f6995fcbe Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 844b390619..d1171f938b 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -697,7 +697,7 @@ class NodeWorker(object): parameters, timeout)) play = dict(hosts='node', name='Job body', tasks=tasks) - playbook.write(yaml.dump([play])) + playbook.write(yaml.dump([play], default_flow_style=False)) with open(jobdir.post_playbook, 'w') as playbook: tasks = [] @@ -708,7 +708,7 @@ class NodeWorker(object): tasks.extend(self._makeFTPTask(jobdir, publisher)) play = dict(hosts='node', name='Publishers', tasks=tasks) - playbook.write(yaml.dump([play])) + playbook.write(yaml.dump([play], default_flow_style=False)) with open(jobdir.config, 'w') as config: config.write('[defaults]\n') From b5aa68ffee1292bed5fb80f942150e57373d51f7 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Wed, 18 May 2016 18:14:52 -0700 Subject: [PATCH 037/152] Ansible launcher: several fixes These were developed together in situ: * Fix tailing the console log * Change console log name from console.log to console.txt (for better auto content typing) * Expand JJB macros for builders and publishers * Use a two-stage SCP copy (worker -> controller; controller -> site); a one-stage copy is possible but will require installing a key on the remote site * Substitute parameters (eg $LOG_SERVER) into scp/ftp site paths * Better worker logging (use the worker name in the logger name) Change-Id: I98e5603f7a3210c1322640a66ecdeadb24ce74fe --- zuul/ansible/library/zuul_console.py | 42 +++++----- zuul/ansible/library/zuul_runner.py | 2 +- zuul/launcher/ansiblelaunchserver.py | 115 +++++++++++++++++++++++---- 3 files changed, 122 insertions(+), 37 deletions(-) diff --git a/zuul/ansible/library/zuul_console.py b/zuul/ansible/library/zuul_console.py index 0e3e0668c7..e0d1d6eea6 100644 --- a/zuul/ansible/library/zuul_console.py +++ b/zuul/ansible/library/zuul_console.py @@ -109,12 +109,20 @@ class Server(object): def followConsole(self, console, conn): while True: - r = [console.file, conn] - e = [console.file, conn] - r, w, e = select.select(r, [], e) + # As long as we have unread data, keep reading/sending + while True: + chunk = console.file.read(4096) + if chunk: + conn.send(chunk) + else: + break - if console.file in e: - return True + # At this point, we are waiting for more data to be written + time.sleep(0.5) + + # Check to see if the remote end has sent any data, if so, + # discard + r, w, e = select.select([conn], [], [conn], 0) if conn in e: return False if conn in r: @@ -124,19 +132,15 @@ class Server(object): if not ret: return False - if console.file in r: - line = console.file.readline() - if line: - conn.send(line) - time.sleep(0.5) - try: - st = os.stat(console.path) - if (st.st_ino != console.stat.st_ino or - st.st_size < console.size): - return True - 
except Exception: + # See if the file has been truncated + try: + st = os.stat(console.path) + if (st.st_ino != console.stat.st_ino or + st.st_size < console.size): return True - console.size = st.st_size + except Exception: + return True + console.size = st.st_size def handleOneConnection(self, conn): # FIXME: this won't notice disconnects until it tries to send @@ -166,14 +170,14 @@ class Server(object): def test(): - s = Server('/tmp/console.log', 8088) + s = Server('/tmp/console.txt', 8088) s.run() def main(): module = AnsibleModule( argument_spec=dict( - path=dict(default='/tmp/console.log'), + path=dict(default='/tmp/console.txt'), port=dict(default=8088, type='int'), ) ) diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index 75542445ee..955469fd12 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -21,7 +21,7 @@ import subprocess class Console(object): def __enter__(self): - self.logfile = open('/tmp/console.log', 'w+') + self.logfile = open('/tmp/console.txt', 'w+', 0) return self def __exit__(self, etype, value, tb): diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 844b390619..75b491136e 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -29,6 +29,7 @@ import uuid import gear import yaml import jenkins_jobs.builder +import jenkins_jobs.formatter import zmq import zuul.ansible.library @@ -136,6 +137,7 @@ class LaunchServer(object): builder.parser.expandYaml() unseen = set(self.jobs.keys()) for job in builder.parser.jobs: + builder.expandMacros(job) self.jobs[job['name']] = job unseen.discard(job['name']) for name in unseen: @@ -261,6 +263,10 @@ class LaunchServer(object): self.log.debug("Got termination event %s" % (item,)) if item is None: continue + worker = self.node_workers[item] + self.log.debug("Joining %s" % (item,)) + worker.process.join() + self.log.debug("Joined %s" % (item,)) 
del self.node_workers[item] except Exception: self.log.exception("Exception while processing " @@ -270,11 +276,10 @@ class LaunchServer(object): class NodeWorker(object): - log = logging.getLogger("zuul.NodeWorker") - def __init__(self, config, jobs, builds, sites, name, host, description, labels, manager_name, zmq_send_queue, termination_queue): + self.log = logging.getLogger("zuul.NodeWorker.%s" % (name,)) self.log.debug("Creating node worker %s" % (name,)) self.config = config self.jobs = jobs @@ -325,12 +330,15 @@ class NodeWorker(object): self.worker.addServer(server, port) self.log.debug("Waiting for server") self.worker.waitForServer() + self.log.debug("Registering") self.register() self.gearman_thread = threading.Thread(target=self.runGearman) self.gearman_thread.daemon = True self.gearman_thread.start() + self.log.debug("Started") + while self._running or not self.queue.empty(): try: self._runQueue() @@ -559,7 +567,12 @@ class NodeWorker(object): return [('node', dict( ansible_host=self.host, ansible_user=self.username))] - def _makeSCPTask(self, publisher): + def _substituteVariables(self, text, variables): + def lookup(match): + return variables.get(match.group(1), '') + return re.sub('\$([A-Za-z0-9_]+)', lookup, text) + + def _makeSCPTask(self, jobdir, publisher, parameters): tasks = [] for scpfile in publisher['scp']['files']: site = publisher['scp']['site'] @@ -567,35 +580,64 @@ class NodeWorker(object): raise Exception("Undefined SCP site: %s" % (site,)) site = self.sites[site] if scpfile.get('copy-console'): - src = '/tmp/console.log' + src = '/tmp/console.txt' else: src = scpfile['source'] - dest = os.path.join(site['root'], scpfile['target']) + src = self._substituteVariables(src, parameters) + src = os.path.join(parameters['WORKSPACE'], src) + scproot = tempfile.mkdtemp(dir=jobdir.ansible_root) + os.chmod(scproot, 0o755) + syncargs = dict(src=src, + dest=scproot, + mode='pull') + task = dict(synchronize=syncargs) + if not 
scpfile.get('copy-after-failure'): + task['when'] = 'success' + tasks.append(task) + + dest = scpfile['target'] + dest = self._substituteVariables(dest, parameters) + dest = os.path.join(site['root'], dest) dest = os.path.normpath(dest) if not dest.startswith(site['root']): raise Exception("Target path %s is not below site root" % (dest,)) - syncargs = dict(src=src, - dest=dest) - task = dict(synchronize=syncargs, - delegate_to=site['host']) + local_args = [ + 'command', '/usr/bin/rsync', '--delay-updates', '-F', + '--compress', '-rt', '--safe-links', '--rsh', + '"/usr/bin/ssh -i {private_key_file} -S none ' + '-o StrictHostKeyChecking=no"', + '--out-format="<>%i %n%L"', + '"{source}/"', '"{user}@{host}:{dest}"' + ] + local_action = ' '.join(local_args).format( + source=scproot, + dest=dest, + private_key_file=self.private_key_file, + host=site['host'], + user=site['user']) + task = dict(local_action=local_action) if not scpfile.get('copy-after-failure'): task['when'] = 'success' tasks.append(task) return tasks - def _makeFTPTask(self, jobdir, publisher): + def _makeFTPTask(self, jobdir, publisher, parameters): tasks = [] ftp = publisher['ftp'] site = ftp['site'] if site not in self.sites: raise Exception("Undefined FTP site: %s" % site) site = self.sites[site] + ftproot = tempfile.mkdtemp(dir=jobdir.ansible_root) ftpcontent = os.path.join(ftproot, 'content') os.makedirs(ftpcontent) ftpscript = os.path.join(ftproot, 'script') - syncargs = dict(src=ftp['source'], + src = ftp['source'] + src = self._substituteVariables(src, parameters) + src = os.path.join(parameters['WORKSPACE'], src) + syncargs = dict(src=src, dest=ftpcontent) task = dict(synchronize=syncargs, when='success') @@ -608,6 +650,7 @@ class NodeWorker(object): while ftpsource[-1] == '/': ftpsource = ftpsource[:-1] ftptarget = ftp['target'] + ftptarget = self._substituteVariables(ftptarget, parameters) ftptarget = os.path.join(site['root'], ftp['target']) ftptarget = os.path.normpath(ftptarget) if not 
ftptarget.startswith(site['root']): @@ -627,7 +670,10 @@ class NodeWorker(object): script_fn = '%s.sh' % str(uuid.uuid4().hex) script_path = os.path.join(jobdir.script_root, script_fn) with open(script_path, 'w') as script: - script.write(builder['shell']) + data = builder['shell'] + if not data.startswith('#!'): + data = '#!/bin/bash -x\n %s' % (data,) + script.write(data) remote_path = os.path.join('/tmp', script_fn) copy = dict(src=script_path, @@ -681,10 +727,10 @@ class NodeWorker(object): with open(jobdir.playbook, 'w') as playbook: tasks = [] - task = dict(file=dict(path='/tmp/console.log', state='absent')) + task = dict(file=dict(path='/tmp/console.txt', state='absent')) tasks.append(task) - task = dict(zuul_console=dict(path='/tmp/console.log', port=8088)) + task = dict(zuul_console=dict(path='/tmp/console.txt', port=8088)) tasks.append(task) task = dict(file=dict(path=parameters['WORKSPACE'], @@ -703,9 +749,11 @@ class NodeWorker(object): tasks = [] for publisher in jjb_job.get('publishers', []): if 'scp' in publisher: - tasks.extend(self._makeSCPTask(publisher)) + tasks.extend(self._makeSCPTask(jobdir, publisher, + parameters)) if 'ftp' in publisher: - tasks.extend(self._makeFTPTask(jobdir, publisher)) + tasks.extend(self._makeFTPTask(jobdir, publisher, + parameters)) play = dict(hosts='node', name='Publishers', tasks=tasks) playbook.write(yaml.dump([play])) @@ -747,13 +795,15 @@ class NodeWorker(object): def runAnsiblePostPlaybook(self, jobdir, success): proc = subprocess.Popen( ['ansible-playbook', jobdir.post_playbook, - '-e', 'success=%s' % success], + '-e', 'success=%s' % success, '-v'], cwd=jobdir.ansible_root, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=os.setsid, ) (out, err) = proc.communicate() + self.log.debug("Ansible post stdout:\n%s" % out) + self.log.debug("Ansible post stderr:\n%s" % err) return proc.wait() == 0 @@ -761,3 +811,34 @@ class JJB(jenkins_jobs.builder.Builder): def __init__(self): self.global_config = None 
self._plugins_list = [] + + def expandComponent(self, component_type, component, template_data): + component_list_type = component_type + 's' + new_components = [] + if isinstance(component, dict): + name, component_data = next(iter(component.items())) + if template_data: + component_data = jenkins_jobs.formatter.deep_format( + component_data, template_data, True) + else: + name = component + component_data = {} + + new_component = self.parser.data[component_type].get(name) + if new_component: + for new_sub_component in new_component[component_list_type]: + new_components.extend( + self.expandComponent(component_type, + new_sub_component, component_data)) + else: + new_components.append({name: component_data}) + return new_components + + def expandMacros(self, job): + for component_type in ['builder', 'publisher']: + component_list_type = component_type + 's' + new_components = [] + for new_component in job.get(component_list_type, []): + new_components.extend(self.expandComponent(component_type, + new_component, {})) + job[component_list_type] = new_components From ce8a213fcde091077be644ce424816bb444d329f Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 19 May 2016 15:21:52 -0700 Subject: [PATCH 038/152] Estimate job runtimes internally Rather than relying on the workers to supply estimated job runtimes, record the last 10 successful run times and use those to estimate the run time of each job. This means that workers (which may be highly distributed and lack access to a substantial job history) no longer need to provide these values, and the central scheduler, which is better placed to do so since italready sees all job run times, will. Failure times and a scoreboard of results are kept for each job as well for potential future use in evaluating likelihood of job success. 
Change-Id: If0955e15a3da9eb842dbee02a4750a177a092d3e --- tests/base.py | 3 ++ tests/test_model.py | 78 +++++++++++++++++++++++++++++ tests/test_scheduler.py | 105 ++++++++++++++++++++++++++------------- zuul/launcher/gearman.py | 3 -- zuul/model.py | 77 ++++++++++++++++++++++++++++ zuul/scheduler.py | 25 ++++++++++ 6 files changed, 253 insertions(+), 38 deletions(-) diff --git a/tests/base.py b/tests/base.py index 405caa0ded..585f2d203b 100755 --- a/tests/base.py +++ b/tests/base.py @@ -876,11 +876,13 @@ class ZuulTestCase(BaseTestCase): self.test_root = os.path.join(tmp_root, "zuul-test") self.upstream_root = os.path.join(self.test_root, "upstream") self.git_root = os.path.join(self.test_root, "git") + self.state_root = os.path.join(self.test_root, "lib") if os.path.exists(self.test_root): shutil.rmtree(self.test_root) os.makedirs(self.test_root) os.makedirs(self.upstream_root) + os.makedirs(self.state_root) # Make per test copy of Configuration. self.setup_config() @@ -888,6 +890,7 @@ class ZuulTestCase(BaseTestCase): os.path.join(FIXTURE_DIR, self.config.get('zuul', 'layout_config'))) self.config.set('merger', 'git_dir', self.git_root) + self.config.set('zuul', 'state_dir', self.state_root) # For each project in config: self.init_repo("org/project") diff --git a/tests/test_model.py b/tests/test_model.py index 271161869f..ac19383ff8 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -12,6 +12,11 @@ # License for the specific language governing permissions and limitations # under the License. 
+import os +import random + +import fixtures + from zuul import change_matcher as cm from zuul import model @@ -62,3 +67,76 @@ class TestJob(BaseTestCase): metajob = model.Job('^job') job.copy(metajob) self._assert_job_booleans_are_not_none(job) + + +class TestJobTimeData(BaseTestCase): + def setUp(self): + super(TestJobTimeData, self).setUp() + self.tmp_root = self.useFixture(fixtures.TempDir( + rootdir=os.environ.get("ZUUL_TEST_ROOT")) + ).path + + def test_empty_timedata(self): + path = os.path.join(self.tmp_root, 'job-name') + self.assertFalse(os.path.exists(path)) + self.assertFalse(os.path.exists(path + '.tmp')) + td = model.JobTimeData(path) + self.assertEqual(td.success_times, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.assertEqual(td.failure_times, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.assertEqual(td.results, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + + def test_save_reload(self): + path = os.path.join(self.tmp_root, 'job-name') + self.assertFalse(os.path.exists(path)) + self.assertFalse(os.path.exists(path + '.tmp')) + td = model.JobTimeData(path) + self.assertEqual(td.success_times, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.assertEqual(td.failure_times, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.assertEqual(td.results, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + success_times = [] + failure_times = [] + results = [] + for x in range(10): + success_times.append(int(random.random() * 1000)) + failure_times.append(int(random.random() * 1000)) + results.append(0) + results.append(1) + random.shuffle(results) + s = f = 0 + for result in results: + if result: + td.add(failure_times[f], 'FAILURE') + f += 1 + else: + td.add(success_times[s], 'SUCCESS') + s += 1 + self.assertEqual(td.success_times, success_times) + self.assertEqual(td.failure_times, failure_times) + self.assertEqual(td.results, results[10:]) + td.save() + self.assertTrue(os.path.exists(path)) + self.assertFalse(os.path.exists(path + '.tmp')) + td = model.JobTimeData(path) + td.load() + self.assertEqual(td.success_times, 
success_times) + self.assertEqual(td.failure_times, failure_times) + self.assertEqual(td.results, results[10:]) + + +class TestTimeDataBase(BaseTestCase): + def setUp(self): + super(TestTimeDataBase, self).setUp() + self.tmp_root = self.useFixture(fixtures.TempDir( + rootdir=os.environ.get("ZUUL_TEST_ROOT")) + ).path + self.db = model.TimeDataBase(self.tmp_root) + + def test_timedatabase(self): + self.assertEqual(self.db.getEstimatedTime('job-name'), 0) + self.db.update('job-name', 50, 'SUCCESS') + self.assertEqual(self.db.getEstimatedTime('job-name'), 50) + self.db.update('job-name', 100, 'SUCCESS') + self.assertEqual(self.db.getEstimatedTime('job-name'), 75) + for x in range(10): + self.db.update('job-name', 100, 'SUCCESS') + self.assertEqual(self.db.getEstimatedTime('job-name'), 100) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index fe7c7cc4fa..15d33c8aad 100755 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -34,7 +34,6 @@ import zuul.reporter.gerrit import zuul.reporter.smtp from tests.base import ( - BaseTestCase, ZuulTestCase, repack_repo, ) @@ -44,40 +43,6 @@ logging.basicConfig(level=logging.DEBUG, '%(levelname)-8s %(message)s') -class TestSchedulerConfigParsing(BaseTestCase): - - def test_parse_skip_if(self): - job_yaml = """ -jobs: - - name: job_name - skip-if: - - project: ^project_name$ - branch: ^stable/icehouse$ - all-files-match-any: - - ^filename$ - - project: ^project2_name$ - all-files-match-any: - - ^filename2$ - """.strip() - data = yaml.load(job_yaml) - config_job = data.get('jobs')[0] - sched = zuul.scheduler.Scheduler({}) - cm = zuul.change_matcher - expected = cm.MatchAny([ - cm.MatchAll([ - cm.ProjectMatcher('^project_name$'), - cm.BranchMatcher('^stable/icehouse$'), - cm.MatchAllFiles([cm.FileMatcher('^filename$')]), - ]), - cm.MatchAll([ - cm.ProjectMatcher('^project2_name$'), - cm.MatchAllFiles([cm.FileMatcher('^filename2$')]), - ]), - ]) - matcher = sched._parseSkipIf(config_job) - 
self.assertEqual(expected, matcher) - - class TestScheduler(ZuulTestCase): def test_jobs_launched(self): @@ -495,6 +460,46 @@ class TestScheduler(ZuulTestCase): self.assertEqual(B.reported, 2) self.assertEqual(C.reported, 2) + def _test_time_database(self, iteration): + self.worker.hold_jobs_in_build = True + A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') + A.addApproval('CRVW', 2) + self.fake_gerrit.addEvent(A.addApproval('APRV', 1)) + self.waitUntilSettled() + time.sleep(2) + + data = json.loads(self.sched.formatStatusJSON()) + found_job = None + for pipeline in data['pipelines']: + if pipeline['name'] != 'gate': + continue + for queue in pipeline['change_queues']: + for head in queue['heads']: + for item in head: + for job in item['jobs']: + if job['name'] == 'project-merge': + found_job = job + break + + self.assertIsNotNone(found_job) + if iteration == 1: + self.assertIsNotNone(found_job['estimated_time']) + self.assertIsNone(found_job['remaining_time']) + else: + self.assertIsNotNone(found_job['estimated_time']) + self.assertTrue(found_job['estimated_time'] >= 2) + self.assertIsNotNone(found_job['remaining_time']) + + self.worker.hold_jobs_in_build = False + self.worker.release() + self.waitUntilSettled() + + def test_time_database(self): + "Test the time database" + + self._test_time_database(1) + self._test_time_database(2) + def test_two_failed_changes_at_head(self): "Test that changes are reparented correctly if 2 fail at head" @@ -600,6 +605,36 @@ class TestScheduler(ZuulTestCase): self.assertEqual(B.reported, 2) self.assertEqual(C.reported, 2) + def test_parse_skip_if(self): + job_yaml = """ +jobs: + - name: job_name + skip-if: + - project: ^project_name$ + branch: ^stable/icehouse$ + all-files-match-any: + - ^filename$ + - project: ^project2_name$ + all-files-match-any: + - ^filename2$ + """.strip() + data = yaml.load(job_yaml) + config_job = data.get('jobs')[0] + cm = zuul.change_matcher + expected = cm.MatchAny([ + cm.MatchAll([ + 
cm.ProjectMatcher('^project_name$'), + cm.BranchMatcher('^stable/icehouse$'), + cm.MatchAllFiles([cm.FileMatcher('^filename$')]), + ]), + cm.MatchAll([ + cm.ProjectMatcher('^project2_name$'), + cm.MatchAllFiles([cm.FileMatcher('^filename2$')]), + ]), + ]) + matcher = self.sched._parseSkipIf(config_job) + self.assertEqual(expected, matcher) + def test_patch_order(self): "Test that dependent patches are tested in the right order" A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py index 69fb71bc18..f3b867ce93 100644 --- a/zuul/launcher/gearman.py +++ b/zuul/launcher/gearman.py @@ -456,9 +456,6 @@ class Gearman(object): build.number = data.get('number') build.__gearman_manager = data.get('manager') self.sched.onBuildStarted(build) - - if job.denominator: - build.estimated_time = float(job.denominator) / 1000 else: self.log.error("Unable to find build %s" % job.unique) diff --git a/zuul/model.py b/zuul/model.py index 5bea5d03bb..3fb0577f23 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -13,7 +13,9 @@ # under the License. 
import copy +import os import re +import struct import time from uuid import uuid4 import extras @@ -1380,3 +1382,78 @@ class Layout(object): job.copy(metajob) self.jobs[name] = job return job + + +class JobTimeData(object): + format = 'B10H10H10B' + version = 0 + + def __init__(self, path): + self.path = path + self.success_times = [0 for x in range(10)] + self.failure_times = [0 for x in range(10)] + self.results = [0 for x in range(10)] + + def load(self): + if not os.path.exists(self.path): + return + with open(self.path) as f: + data = struct.unpack(self.format, f.read()) + version = data[0] + if version != self.version: + raise Exception("Unkown data version") + self.success_times = list(data[1:11]) + self.failure_times = list(data[11:21]) + self.results = list(data[21:32]) + + def save(self): + tmpfile = self.path + '.tmp' + data = [self.version] + data.extend(self.success_times) + data.extend(self.failure_times) + data.extend(self.results) + data = struct.pack(self.format, *data) + with open(tmpfile, 'w') as f: + f.write(data) + os.rename(tmpfile, self.path) + + def add(self, elapsed, result): + elapsed = int(elapsed) + if result == 'SUCCESS': + self.success_times.append(elapsed) + self.success_times.pop(0) + result = 0 + else: + self.failure_times.append(elapsed) + self.failure_times.pop(0) + result = 1 + self.results.append(result) + self.results.pop(0) + + def getEstimatedTime(self): + times = [x for x in self.success_times if x] + if times: + return float(sum(times)) / len(times) + return 0.0 + + +class TimeDataBase(object): + def __init__(self, root): + self.root = root + self.jobs = {} + + def _getTD(self, name): + td = self.jobs.get(name) + if not td: + td = JobTimeData(os.path.join(self.root, name)) + self.jobs[name] = td + td.load() + return td + + def getEstimatedTime(self, name): + return self._getTD(name).getEstimatedTime() + + def update(self, name, elapsed, result): + td = self._getTD(name) + td.add(elapsed, result) + td.save() diff --git 
a/zuul/scheduler.py b/zuul/scheduler.py index aea9a67e96..ee5cd2bca0 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -262,6 +262,9 @@ class Scheduler(threading.Thread): self.management_event_queue = Queue.Queue() self.layout = model.Layout() + time_dir = self._get_time_database_dir() + self.time_database = model.TimeDataBase(time_dir) + self.zuul_version = zuul_version.version_info.release_string() self.last_reconfigured = None @@ -740,6 +743,17 @@ class Scheduler(threading.Thread): state_dir = '/var/lib/zuul' return os.path.join(state_dir, 'queue.pickle') + def _get_time_database_dir(self): + if self.config.has_option('zuul', 'state_dir'): + state_dir = os.path.expanduser(self.config.get('zuul', + 'state_dir')) + else: + state_dir = '/var/lib/zuul' + d = os.path.join(state_dir, 'times') + if not os.path.exists(d): + os.mkdir(d) + return d + def _save_queue(self): pickle_file = self._get_queue_pickle_file() events = [] @@ -1069,6 +1083,11 @@ class Scheduler(threading.Thread): self.log.warning("Build %s is not associated with a pipeline" % (build,)) return + try: + build.estimated_time = float(self.time_database.getEstimatedTime( + build.job.name)) + except Exception: + self.log.exception("Exception estimating build time:") pipeline.manager.onBuildStarted(event.build) def _doBuildCompletedEvent(self, event): @@ -1082,6 +1101,12 @@ class Scheduler(threading.Thread): self.log.warning("Build %s is not associated with a pipeline" % (build,)) return + if build.end_time and build.start_time and build.result: + duration = build.end_time - build.start_time + try: + self.time_database.update(build.job.name, duration, build.result) + except Exception: + self.log.exception("Exception recording build time:") pipeline.manager.onBuildCompleted(event.build) def _doMergeCompletedEvent(self, event): From dbe6fab14f446b1929b2b30c31a18d489d16b272 Mon Sep 17 00:00:00 2001 From: Alexander Evseev Date: Thu, 19 Nov 2015 12:46:34 +0300 Subject: [PATCH 039/152] Don't take into 
account commit message for skip-if filter If there is some skip-if condition containing all-files-match-any, then Zuul skips jobs for changes without modified files (merge commits), because it always matches '/COMMIT_MSG'. So test files for regexes only if CR has more than one modified file, because '/COMMIT_MSG' is always included even for empty merge commits. Change-Id: Iad78d9eb8212beea3238728321c1ba74efa991e2 --- doc/source/zuul.rst | 5 ++++- tests/test_change_matcher.py | 8 ++++---- tests/test_model.py | 4 ++-- zuul/change_matcher.py | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/doc/source/zuul.rst b/doc/source/zuul.rst index 98e4bb8a2a..7132407125 100644 --- a/doc/source/zuul.rst +++ b/doc/source/zuul.rst @@ -759,7 +759,10 @@ each job as it builds a list from the project specification. expressions. The pattern for '/COMMIT_MSG' is always matched on and does not - have to be included. + have to be included. Exception is merge commits (without modified + files), in this case '/COMMIT_MSG' is not matched, and job is not + skipped. In case of merge commits it's assumed that list of modified + files isn't predictible and CI should be run. 
**voting (optional)** Boolean value (``true`` or ``false``) that indicates whatever diff --git a/tests/test_change_matcher.py b/tests/test_change_matcher.py index 1f4ab93d61..05853223a5 100644 --- a/tests/test_change_matcher.py +++ b/tests/test_change_matcher.py @@ -123,13 +123,13 @@ class TestMatchAllFiles(BaseTestMatcher): self._test_matches(False) def test_matches_returns_false_when_not_all_files_match(self): - self._test_matches(False, files=['docs/foo', 'foo/bar']) + self._test_matches(False, files=['/COMMIT_MSG', 'docs/foo', 'foo/bar']) - def test_matches_returns_true_when_commit_message_matches(self): - self._test_matches(True, files=['/COMMIT_MSG']) + def test_matches_returns_false_when_commit_message_matches(self): + self._test_matches(False, files=['/COMMIT_MSG']) def test_matches_returns_true_when_all_files_match(self): - self._test_matches(True, files=['docs/foo']) + self._test_matches(True, files=['/COMMIT_MSG', 'docs/foo']) class TestMatchAll(BaseTestMatcher): diff --git a/tests/test_model.py b/tests/test_model.py index 271161869f..739eef3714 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -31,12 +31,12 @@ class TestJob(BaseTestCase): def test_change_matches_returns_false_for_matched_skip_if(self): change = model.Change('project') - change.files = ['docs/foo'] + change.files = ['/COMMIT_MSG', 'docs/foo'] self.assertFalse(self.job.changeMatches(change)) def test_change_matches_returns_true_for_unmatched_skip_if(self): change = model.Change('project') - change.files = ['foo'] + change.files = ['/COMMIT_MSG', 'foo'] self.assertTrue(self.job.changeMatches(change)) def test_copy_retains_skip_if(self): diff --git a/zuul/change_matcher.py b/zuul/change_matcher.py index ed380f0ae5..ca2d93f375 100644 --- a/zuul/change_matcher.py +++ b/zuul/change_matcher.py @@ -101,7 +101,7 @@ class MatchAllFiles(AbstractMatcherCollection): yield self.commit_regex def matches(self, change): - if not (hasattr(change, 'files') and change.files): + if not 
(hasattr(change, 'files') and len(change.files) > 1): return False for file_ in change.files: matched_file = False From 53cbfb9158668483fd5c7237e5b4a140a2254493 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 18 May 2016 19:16:46 -0700 Subject: [PATCH 040/152] Ansible launcher: handle Ant paths in publishers Change-Id: I3f821bba0165f16da1aed3db44c3bc1e6de100a8 --- zuul/launcher/ansiblelaunchserver.py | 47 ++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 75b491136e..7b0a10855c 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -572,6 +572,30 @@ class NodeWorker(object): return variables.get(match.group(1), '') return re.sub('\$([A-Za-z0-9_]+)', lookup, text) + def _getRsyncOptions(self, source, parameters): + # Treat the publisher source as a filter; ant and rsync behave + # fairly close in this manner, except for leading directories. + source = self._substituteVariables(source, parameters) + # If the source starts with ** then we want to match any + # number of directories, so don't anchor the include filter. + # If it does not start with **, then the intent is likely to + # at least start by matching an immediate file or subdirectory + # (even if later we have a ** in the middle), so in this case, + # anchor it to the root of the transfer (the workspace). + if ((not source.startswith('**')) and + (not source.startswith('/'))): + source = '/' + source + # These options mean: include the thing we want, include any + # directories (so that we continue to search for the thing we + # want no matter how deep it is), exclude anything that + # doesn't match the thing we want or is a directory, then get + # rid of empty directories left over at the end. 
+ rsync_opts = ['--include="%s"' % source, + '--include="*/"', + '--exclude="*"', + '--prune-empty-dirs'] + return rsync_opts + def _makeSCPTask(self, jobdir, publisher, parameters): tasks = [] for scpfile in publisher['scp']['files']: @@ -581,15 +605,21 @@ class NodeWorker(object): site = self.sites[site] if scpfile.get('copy-console'): src = '/tmp/console.txt' + rsync_opts = [] else: - src = scpfile['source'] - src = self._substituteVariables(src, parameters) - src = os.path.join(parameters['WORKSPACE'], src) + src = parameters['WORKSPACE'] + if not src.endswith('/'): + src = src + '/' + rsync_opts = self._getRsyncOptions(scpfile['source'], + parameters) + scproot = tempfile.mkdtemp(dir=jobdir.ansible_root) os.chmod(scproot, 0o755) syncargs = dict(src=src, dest=scproot, mode='pull') + if rsync_opts: + syncargs['rsync_opts'] = rsync_opts task = dict(synchronize=syncargs) if not scpfile.get('copy-after-failure'): task['when'] = 'success' @@ -634,11 +664,16 @@ class NodeWorker(object): ftpcontent = os.path.join(ftproot, 'content') os.makedirs(ftpcontent) ftpscript = os.path.join(ftproot, 'script') - src = ftp['source'] - src = self._substituteVariables(src, parameters) - src = os.path.join(parameters['WORKSPACE'], src) + + src = parameters['WORKSPACE'] + if not src.endswith('/'): + src = src + '/' + rsync_opts = self._getRsyncOptions(ftp['source'], + parameters) syncargs = dict(src=src, dest=ftpcontent) + if rsync_opts: + syncargs['rsync_opts'] = rsync_opts task = dict(synchronize=syncargs, when='success') tasks.append(task) From faa787729286b035c73ede558320bd6eecd0c34a Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Thu, 19 May 2016 16:25:38 -0700 Subject: [PATCH 041/152] Ansible launcher: include PATH Change-Id: Ia5becf5d6f494d759ac182e917556c6398961d81 --- zuul/launcher/ansiblelaunchserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 7b0a10855c..73c208e0ff 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -742,6 +742,7 @@ class NodeWorker(object): parameters = args.copy() parameters['WORKSPACE'] = os.path.join(self.workspace_root, job_name) + parameters['PATH'] = '/usr/local/bin:/usr/bin:/bin' with open(jobdir.inventory, 'w') as inventory: for host_name, host_vars in self.getHostList(): From 3d428d326449442e705d0ee3259acfa149528980 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 19 May 2016 16:41:37 -0700 Subject: [PATCH 042/152] Ansible launcher: create parent directories for SCP Change-Id: I011d29f4d1f68b032134b339eac0c2e6ed25dc95 --- zuul/launcher/ansiblelaunchserver.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 73c208e0ff..f66b09631f 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -632,10 +632,12 @@ class NodeWorker(object): if not dest.startswith(site['root']): raise Exception("Target path %s is not below site root" % (dest,)) + local_args = [ 'command', '/usr/bin/rsync', '--delay-updates', '-F', - '--compress', '-rt', '--safe-links', '--rsh', - '"/usr/bin/ssh -i {private_key_file} -S none ' + '--compress', '-rt', '--safe-links', + '--rsync-path="mkdir -p {dest} && rsync"', + '--rsh="/usr/bin/ssh -i {private_key_file} -S none ' '-o StrictHostKeyChecking=no"', '--out-format="<>%i %n%L"', '"{source}/"', '"{user}@{host}:{dest}"' From 3bdf886b0cab26d706b088b8289e5484b4b59857 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Thu, 19 May 2016 16:48:34 -0700 Subject: [PATCH 043/152] Ansible launcher: add -q to rsync ssh command Since we're in control of ssh here, we get the warnings, but we can stop them with -q. Change-Id: I24215fc3ce3dcbe2a0ce8574fadd8d16627fbadf --- zuul/launcher/ansiblelaunchserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f66b09631f..d11a4af596 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -638,7 +638,7 @@ class NodeWorker(object): '--compress', '-rt', '--safe-links', '--rsync-path="mkdir -p {dest} && rsync"', '--rsh="/usr/bin/ssh -i {private_key_file} -S none ' - '-o StrictHostKeyChecking=no"', + '-o StrictHostKeyChecking=no -q"', '--out-format="<>%i %n%L"', '"{source}/"', '"{user}@{host}:{dest}"' ] From 61e3657b9df1dc10f90cb44114818c09f84c47e5 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 19 May 2016 17:12:33 -0700 Subject: [PATCH 044/152] Ansible launcher: set $HOME Change-Id: I1fd8881a88da81b8a1337c2d2f4d3caad36ffd33 --- zuul/ansible/library/zuul_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index 955469fd12..68318e79be 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -16,6 +16,7 @@ # along with this software. If not, see . import datetime +import os import subprocess @@ -63,7 +64,9 @@ def main(): ) p = module.params - ret = run(p['cwd'], p['command'], p['parameters']) + env = p['parameters'].copy() + env['HOME'] = os.path.expanduser('~') + ret = run(p['cwd'], p['command'], env) if ret == 0: module.exit_json(changed=True, rc=ret) else: From 1c5e69a24c922381e89e4946e5748fea7d23e594 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Fri, 20 May 2016 16:06:33 -0700 Subject: [PATCH 045/152] Ansible launcher: Append to console log, rather than truncating Jobs end up having lots of builders, each should append to the log. We remove the log before the job starts so we don't need to worry about truncating it here on re-used nodes. Change-Id: I287968b600de0eae48c1bdcf2f91c4383578731c --- zuul/ansible/library/zuul_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index 68318e79be..cba640f849 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -22,7 +22,7 @@ import subprocess class Console(object): def __enter__(self): - self.logfile = open('/tmp/console.txt', 'w+', 0) + self.logfile = open('/tmp/console.txt', 'a', 0) return self def __exit__(self, etype, value, tb): From 434a9d9200f72171e69c2495558d34fce687124e Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 24 May 2016 08:24:42 -0700 Subject: [PATCH 046/152] Ansible launcher: copy symlinks in publishers This instructs rsync to copy the contents of links, rather than as symlinks. We apparently rely on this behavior in devstack-gate where we symlink the logs/ directory outside of the workspace. 
Change-Id: I537ac91ba158c01aecbacef54b80071988460a26 --- zuul/launcher/ansiblelaunchserver.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index d11a4af596..95ce4ba15c 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -617,6 +617,7 @@ class NodeWorker(object): os.chmod(scproot, 0o755) syncargs = dict(src=src, dest=scproot, + copy_links='yes', mode='pull') if rsync_opts: syncargs['rsync_opts'] = rsync_opts @@ -673,7 +674,8 @@ class NodeWorker(object): rsync_opts = self._getRsyncOptions(ftp['source'], parameters) syncargs = dict(src=src, - dest=ftpcontent) + dest=ftpcontent, + copy_links='yes') if rsync_opts: syncargs['rsync_opts'] = rsync_opts task = dict(synchronize=syncargs, From e9c93ee2be1221a96394d4a10c9f462bbdd51573 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 24 May 2016 08:32:21 -0700 Subject: [PATCH 047/152] Ansible launcher: run command in login shell Let the login shell set basic env variables (like HOME, PATH, LOCALE, etc). Change-Id: I71384b9baaf091ac26d8e8c1b68779ca7e1d17c9 --- zuul/ansible/library/zuul_runner.py | 23 ++++++++++++++++++++--- zuul/launcher/ansiblelaunchserver.py | 1 - 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index cba640f849..784c7fdb89 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -16,6 +16,7 @@ # along with this software. If not, see . 
import datetime +import getpass import os import subprocess @@ -34,13 +35,30 @@ class Console(object): self.logfile.write(outln) +def get_env(): + env = {} + env['HOME'] = os.path.expanduser('~') + env['USER'] = getpass.getuser() + + # Known locations for PAM mod_env sources + for fn in ['/etc/environment', '/etc/default/locale']: + if os.path.exists(fn): + with open(fn) as f: + for line in f: + k, v = line.strip().split('=') + env[k] = v + return env + + def run(cwd, cmd, args): + env = get_env() + env.update(args) proc = subprocess.Popen( - [cmd], + ['/bin/bash', '-l', '-c', cmd], cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - env=args, + env=env, ) with Console() as console: @@ -65,7 +83,6 @@ def main(): p = module.params env = p['parameters'].copy() - env['HOME'] = os.path.expanduser('~') ret = run(p['cwd'], p['command'], env) if ret == 0: module.exit_json(changed=True, rc=ret) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 95ce4ba15c..5d1b0006f5 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -746,7 +746,6 @@ class NodeWorker(object): parameters = args.copy() parameters['WORKSPACE'] = os.path.join(self.workspace_root, job_name) - parameters['PATH'] = '/usr/local/bin:/usr/bin:/bin' with open(jobdir.inventory, 'w') as inventory: for host_name, host_vars in self.getHostList(): From fbd1ffb8227874aa942a5819c10152e312e1f06e Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 25 May 2016 07:46:47 -0700 Subject: [PATCH 048/152] Ansible launcher: log reconfiguration complete So we're not on the edge of our seats. 
Change-Id: I4117142e63d8edd6ac6f163883bb4c8e2b7e06da --- zuul/launcher/ansiblelaunchserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 5d1b0006f5..f5dc158434 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -158,6 +158,7 @@ class LaunchServer(object): except Exception: self.log.exception("Exception sending reconfigure command " "to worker:") + self.log.debug("Reconfiguration complete") def stop(self): self.log.debug("Stopping") From 2a81aa2c2db9a359aaaed0b642e84d10c423ec1d Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 25 May 2016 08:37:26 -0700 Subject: [PATCH 049/152] Ansible launcher: install libselinux-python if necessary This is temporary until we rebuild images with the zuul-worker dib element. Change-Id: Ic1c2a60ab87ad96a0b2b050c6f568d3946b30466 --- zuul/launcher/ansiblelaunchserver.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f5dc158434..3a3c580569 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -777,6 +777,11 @@ class NodeWorker(object): state='directory')) tasks.append(task) + # TODO: remove once zuul-worker DIB element has landed + tasks.append(dict(shell="[ -f /usr/bin/yum ] && " + "sudo /usr/bin/yum install libselinux-python || " + "/bin/true")) + for builder in jjb_job.get('builders', []): if 'shell' in builder: tasks.extend(self._makeBuilderTask(jobdir, builder, From f87c5ce0e064c82af1dc313eea120c5cb5af6be9 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 25 May 2016 08:43:37 -0700 Subject: [PATCH 050/152] Ansible launcher: add option to keep jobdir To aid in debugging. 
Change-Id: Ice4478fd7b6d4121ed9f1c7c9bce1dcc57699a3a --- zuul/cmd/launcher.py | 6 +++++- zuul/launcher/ansiblelaunchserver.py | 16 ++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/zuul/cmd/launcher.py b/zuul/cmd/launcher.py index 86266b3977..c9516f8d5c 100644 --- a/zuul/cmd/launcher.py +++ b/zuul/cmd/launcher.py @@ -46,6 +46,9 @@ class Launcher(zuul.cmd.ZuulApp): parser.add_argument('--version', dest='version', action='version', version=self._get_version(), help='show zuul version') + parser.add_argument('--keep-jobdir', dest='keep_jobdir', + action='store_true', + help='keep local jobdirs after run completes') self.args = parser.parse_args() def reconfigure_handler(self, signum, frame): @@ -73,7 +76,8 @@ class Launcher(zuul.cmd.ZuulApp): self.log = logging.getLogger("zuul.Launcher") LaunchServer = zuul.launcher.ansiblelaunchserver.LaunchServer - self.launcher = LaunchServer(self.config) + self.launcher = LaunchServer(self.config, + keep_jobdir=self.args.keep_jobdir) self.launcher.start() signal.signal(signal.SIGHUP, self.reconfigure_handler) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 3a3c580569..b068eab2ae 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -37,7 +37,8 @@ import zuul.ansible.plugins.callback_plugins class JobDir(object): - def __init__(self): + def __init__(self, keep=False): + self.keep = keep self.root = tempfile.mkdtemp() self.git_root = os.path.join(self.root, 'git') os.makedirs(self.git_root) @@ -56,15 +57,17 @@ class JobDir(object): return self def __exit__(self, etype, value, tb): - shutil.rmtree(self.root) + if not self.keep: + shutil.rmtree(self.root) class LaunchServer(object): log = logging.getLogger("zuul.LaunchServer") section_re = re.compile('site "(.*?)"') - def __init__(self, config): + def __init__(self, config, keep_jobdir=False): self.config = config + self.keep_jobdir = keep_jobdir self.hostname = 
socket.gethostname() self.node_workers = {} self.mpmanager = multiprocessing.Manager() @@ -221,7 +224,7 @@ class LaunchServer(object): self.sites, args['name'], args['host'], args['description'], args['labels'], self.hostname, self.zmq_send_queue, - self.termination_queue) + self.termination_queue, self.keep_jobdir) self.node_workers[worker.name] = worker worker.process = multiprocessing.Process(target=worker.run) @@ -279,7 +282,7 @@ class LaunchServer(object): class NodeWorker(object): def __init__(self, config, jobs, builds, sites, name, host, description, labels, manager_name, zmq_send_queue, - termination_queue): + termination_queue, keep_jobdir): self.log = logging.getLogger("zuul.NodeWorker.%s" % (name,)) self.log.debug("Creating node worker %s" % (name,)) self.config = config @@ -299,6 +302,7 @@ class NodeWorker(object): self.manager_name = manager_name self.zmq_send_queue = zmq_send_queue self.termination_queue = termination_queue + self.keep_jobdir = keep_jobdir self.running_job_lock = threading.Lock() self._job_complete_event = threading.Event() self._running_job = False @@ -540,7 +544,7 @@ class NodeWorker(object): self.log.debug("Job %s: beginning" % (job.unique,)) self.builds[job.unique] = self.name - with JobDir() as jobdir: + with JobDir(self.keep_jobdir) as jobdir: self.log.debug("Job %s: job root at %s" % (job.unique, jobdir.root)) timeout = self.prepareAnsibleFiles(jobdir, job, args) From 38fc2502922f2f22e7c3694c61145694fcfb71cf Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Wed, 25 May 2016 09:12:51 -0700 Subject: [PATCH 051/152] Ansible launcher: log exit code from each builder Change-Id: I6871c2b9ea9da025bcb60a5b6e25a04d442308bf --- zuul/ansible/library/zuul_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index 784c7fdb89..a230448be6 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -68,7 +68,8 @@ def run(cwd, cmd, args): break console.addLine(line) - ret = proc.wait() + ret = proc.wait() + console.addLine("[Zuul] Exit code: %s\n" % ret) return ret From 7c85204e02042c1d0130231f0b6fd63e538f9ea8 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 25 May 2016 10:20:57 -0700 Subject: [PATCH 052/152] Ansible launcher: add job result to console log As well as a nice generalized way to append to the log with anisble. Change-Id: If1c5810a9219abaf9b80be0e075aa3056a415004 --- zuul/ansible/library/zuul_log.py | 55 ++++++++++++++++++++++++++++ zuul/ansible/library/zuul_runner.py | 2 +- zuul/launcher/ansiblelaunchserver.py | 28 ++++++++++---- 3 files changed, 76 insertions(+), 9 deletions(-) create mode 100644 zuul/ansible/library/zuul_log.py diff --git a/zuul/ansible/library/zuul_log.py b/zuul/ansible/library/zuul_log.py new file mode 100644 index 0000000000..e3a3458cc9 --- /dev/null +++ b/zuul/ansible/library/zuul_log.py @@ -0,0 +1,55 @@ +#!/usr/bin/python + +# Copyright (c) 2016 IBM Corp. +# Copyright (c) 2016 Red Hat +# +# This module is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this software. If not, see . + +import datetime + + +class Console(object): + def __enter__(self): + self.logfile = open('/tmp/console.txt', 'a', 0) + return self + + def __exit__(self, etype, value, tb): + self.logfile.close() + + def addLine(self, ln): + ts = datetime.datetime.now() + outln = '%s %s' % (str(ts), ln) + self.logfile.write(outln) + + +def log(msg): + with Console() as console: + console.addLine("[Zuul] %s\n" % msg) + + +def main(): + module = AnsibleModule( + argument_spec=dict( + msg=dict(required=True, default=''), + ) + ) + + p = module.params + ret = log(p['msg']) + module.exit_json(changed=True, rc=ret) + +from ansible.module_utils.basic import * # noqa + +if __name__ == '__main__': + main() diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index a230448be6..bf32a5cf6a 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -69,7 +69,7 @@ def run(cwd, cmd, args): console.addLine(line) ret = proc.wait() - console.addLine("[Zuul] Exit code: %s\n" % ret) + console.addLine("[Zuul] Task exit code: %s\n" % ret) return ret diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index b068eab2ae..a6f18c6745 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -770,26 +770,38 @@ class NodeWorker(object): with open(jobdir.playbook, 'w') as playbook: tasks = [] + main_block = [] + error_block = [] + tasks.append(dict(block=main_block, + rescue=error_block)) task = dict(file=dict(path='/tmp/console.txt', state='absent')) - tasks.append(task) + main_block.append(task) task = dict(zuul_console=dict(path='/tmp/console.txt', port=8088)) - tasks.append(task) + main_block.append(task) task = dict(file=dict(path=parameters['WORKSPACE'], state='directory')) - 
tasks.append(task) + main_block.append(task) # TODO: remove once zuul-worker DIB element has landed - tasks.append(dict(shell="[ -f /usr/bin/yum ] && " - "sudo /usr/bin/yum install libselinux-python || " - "/bin/true")) + main_block.append(dict(shell="[ -f /usr/bin/yum ] && " + "sudo /usr/bin/yum install " + "libselinux-python || " + "/bin/true")) for builder in jjb_job.get('builders', []): if 'shell' in builder: - tasks.extend(self._makeBuilderTask(jobdir, builder, - parameters, timeout)) + main_block.extend( + self._makeBuilderTask(jobdir, builder, + parameters, timeout)) + task = dict(zuul_log=dict(msg="Job complete, result: SUCCESS")) + main_block.append(task) + + task = dict(zuul_log=dict(msg="Job complete, result: FAILURE")) + error_block.append(task) + play = dict(hosts='node', name='Job body', tasks=tasks) playbook.write(yaml.dump([play])) From 6c6ffee12ff90148fadf58dad282e4fc5f7294de Mon Sep 17 00:00:00 2001 From: "Timothy R. Chavez" Date: Wed, 25 May 2016 16:49:45 -0500 Subject: [PATCH 053/152] Simplify logic to ensure absolute path We can simplify the conditional logic to ensure a path is absolute, by using os.path.join instead of string concatenation. Change-Id: I3d2441bbb235fef4775d4bc481574d85866a7fb2 --- zuul/launcher/ansiblelaunchserver.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index c3a5dfe8f7..40bad52367 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -582,9 +582,8 @@ class NodeWorker(object): # at least start by matching an immediate file or subdirectory # (even if later we have a ** in the middle), so in this case, # anchor it to the root of the transfer (the workspace). 
- if ((not source.startswith('**')) and - (not source.startswith('/'))): - source = '/' + source + if not source.startswith('**'): + source = os.path.join('/', source) # These options mean: include the thing we want, include any # directories (so that we continue to search for the thing we # want no matter how deep it is), exclude anything that From e4d229cc6ff05bb4986b1c8b1be602cc87fcf1a9 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 25 May 2016 15:25:41 -0700 Subject: [PATCH 054/152] Don't try to create time database dir on config test Change-Id: Ibf9efca828017b6b1f172c074499fb05deefed6a --- zuul/cmd/server.py | 3 ++- zuul/scheduler.py | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/zuul/cmd/server.py b/zuul/cmd/server.py index b1cd050808..6db15a21d6 100755 --- a/zuul/cmd/server.py +++ b/zuul/cmd/server.py @@ -86,7 +86,8 @@ class Server(zuul.cmd.ZuulApp): import zuul.trigger.gerrit logging.basicConfig(level=logging.DEBUG) - self.sched = zuul.scheduler.Scheduler(self.config) + self.sched = zuul.scheduler.Scheduler(self.config, + testonly=True) self.configure_connections() layout = self.sched.testConfig(self.config.get('zuul', 'layout_config'), diff --git a/zuul/scheduler.py b/zuul/scheduler.py index ee5cd2bca0..30a6c81437 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -236,7 +236,7 @@ def toList(item): class Scheduler(threading.Thread): log = logging.getLogger("zuul.Scheduler") - def __init__(self, config): + def __init__(self, config, testonly=False): threading.Thread.__init__(self) self.daemon = True self.wake_event = threading.Event() @@ -262,8 +262,9 @@ class Scheduler(threading.Thread): self.management_event_queue = Queue.Queue() self.layout = model.Layout() - time_dir = self._get_time_database_dir() - self.time_database = model.TimeDataBase(time_dir) + if not testonly: + time_dir = self._get_time_database_dir() + self.time_database = model.TimeDataBase(time_dir) self.zuul_version = 
zuul_version.version_info.release_string() self.last_reconfigured = None From 67791bd731abd89126ca56334a43bba75ebeb380 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 25 May 2016 13:52:49 -0700 Subject: [PATCH 055/152] Ansible launcher: log stdout/stderr one line at a time For better grepping. Change-Id: Idd49bb25c2120d8fa4b8a7df284df4f4c543846d --- zuul/launcher/ansiblelaunchserver.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 8c701dedfe..1cb4f989fb 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -846,8 +846,10 @@ class NodeWorker(object): preexec_fn=os.setsid, ) (out, err) = self.ansible_proc.communicate() - self.log.debug("Ansible stdout:\n%s" % out) - self.log.debug("Ansible stderr:\n%s" % err) + for line in out.split('\n'): + self.log.debug("Ansible stdout:\n%s" % line) + for line in err.split('\n'): + self.log.debug("Ansible stderr:\n%s" % line) ret = self.ansible_proc.wait() self.ansible_proc = None return ret == 0 @@ -862,8 +864,10 @@ class NodeWorker(object): preexec_fn=os.setsid, ) (out, err) = proc.communicate() - self.log.debug("Ansible post stdout:\n%s" % out) - self.log.debug("Ansible post stderr:\n%s" % err) + for line in out.split('\n'): + self.log.debug("Ansible post stdout:\n%s" % line) + for line in err.split('\n'): + self.log.debug("Ansible post stderr:\n%s" % line) return proc.wait() == 0 From 42465ee3ff37c26d8d49b35162cb62bbdabfffa5 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 25 May 2016 17:42:38 -0700 Subject: [PATCH 056/152] Ansible launcher: use console.html Some of our tooling expects console.html; the easiest thing to do at this point is just to call it that everywhere. 
Change-Id: I2ac3d03bf7e263e4f30d6f0d94974f1699ec8f24 --- zuul/ansible/library/zuul_console.py | 4 ++-- zuul/ansible/library/zuul_log.py | 2 +- zuul/ansible/library/zuul_runner.py | 2 +- zuul/launcher/ansiblelaunchserver.py | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/zuul/ansible/library/zuul_console.py b/zuul/ansible/library/zuul_console.py index 42599d80d4..78f3249c88 100644 --- a/zuul/ansible/library/zuul_console.py +++ b/zuul/ansible/library/zuul_console.py @@ -170,14 +170,14 @@ class Server(object): def test(): - s = Server('/tmp/console.txt', 8088) + s = Server('/tmp/console.html', 8088) s.run() def main(): module = AnsibleModule( argument_spec=dict( - path=dict(default='/tmp/console.txt'), + path=dict(default='/tmp/console.html'), port=dict(default=8088, type='int'), ) ) diff --git a/zuul/ansible/library/zuul_log.py b/zuul/ansible/library/zuul_log.py index e3a3458cc9..62094730d3 100644 --- a/zuul/ansible/library/zuul_log.py +++ b/zuul/ansible/library/zuul_log.py @@ -21,7 +21,7 @@ import datetime class Console(object): def __enter__(self): - self.logfile = open('/tmp/console.txt', 'a', 0) + self.logfile = open('/tmp/console.html', 'a', 0) return self def __exit__(self, etype, value, tb): diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index fd5402f5c6..bc38376c21 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -23,7 +23,7 @@ import subprocess class Console(object): def __enter__(self): - self.logfile = open('/tmp/console.txt', 'a', 0) + self.logfile = open('/tmp/console.html', 'a', 0) return self def __exit__(self, etype, value, tb): diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 1cb4f989fb..5adde3f370 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -608,7 +608,7 @@ class NodeWorker(object): raise Exception("Undefined SCP site: %s" % (site,)) site = 
self.sites[site] if scpfile.get('copy-console'): - src = '/tmp/console.txt' + src = '/tmp/console.html' rsync_opts = [] else: src = parameters['WORKSPACE'] @@ -774,10 +774,10 @@ class NodeWorker(object): tasks.append(dict(block=main_block, rescue=error_block)) - task = dict(file=dict(path='/tmp/console.txt', state='absent')) + task = dict(file=dict(path='/tmp/console.html', state='absent')) main_block.append(task) - task = dict(zuul_console=dict(path='/tmp/console.txt', port=8088)) + task = dict(zuul_console=dict(path='/tmp/console.html', port=8088)) main_block.append(task) task = dict(file=dict(path=parameters['WORKSPACE'], From 46d229c1641ccc587b467145873d8bb324b8fd96 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 26 May 2016 14:42:17 -0700 Subject: [PATCH 057/152] Ansible launcher: remove excess newlines in ansible logs This had the opposite of the intended effect. Change-Id: Ie1c39cbfa06497ff4b618a7da8b1e9c47cf4b19b --- zuul/launcher/ansiblelaunchserver.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 5adde3f370..d5dc3557bc 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -847,9 +847,9 @@ class NodeWorker(object): ) (out, err) = self.ansible_proc.communicate() for line in out.split('\n'): - self.log.debug("Ansible stdout:\n%s" % line) + self.log.debug("Ansible stdout: %s" % line) for line in err.split('\n'): - self.log.debug("Ansible stderr:\n%s" % line) + self.log.debug("Ansible stderr: %s" % line) ret = self.ansible_proc.wait() self.ansible_proc = None return ret == 0 @@ -865,9 +865,9 @@ class NodeWorker(object): ) (out, err) = proc.communicate() for line in out.split('\n'): - self.log.debug("Ansible post stdout:\n%s" % line) + self.log.debug("Ansible post stdout: %s" % line) for line in err.split('\n'): - self.log.debug("Ansible post stderr:\n%s" % line) + self.log.debug("Ansible post stderr: 
%s" % line) return proc.wait() == 0 From 978601149fae16cc4cd1a506c04f121198a81583 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 26 May 2016 14:56:46 -0700 Subject: [PATCH 058/152] Ansible launcher: Do not log long ansible lines 1K should be enough for anybody. Change-Id: I576ebf5ad54aa21d97294723e66e1fc57877b0ea --- zuul/launcher/ansiblelaunchserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index d5dc3557bc..61ef88e17f 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -847,8 +847,10 @@ class NodeWorker(object): ) (out, err) = self.ansible_proc.communicate() for line in out.split('\n'): + line = line[:1024] self.log.debug("Ansible stdout: %s" % line) for line in err.split('\n'): + line = line[:1024] self.log.debug("Ansible stderr: %s" % line) ret = self.ansible_proc.wait() self.ansible_proc = None @@ -865,8 +867,10 @@ class NodeWorker(object): ) (out, err) = proc.communicate() for line in out.split('\n'): + line = line[:1024] self.log.debug("Ansible post stdout: %s" % line) for line in err.split('\n'): + line = line[:1024] self.log.debug("Ansible post stderr: %s" % line) return proc.wait() == 0 From b47f565008213f4073236928d581c29034a304ba Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Mon, 30 May 2016 19:21:17 -0400 Subject: [PATCH 059/152] Move a debug logging line by one If, for some reason, the function name that comes in does not have a :, which should never happen, we won't see the bogus line in the log because we split it before we log it. This seriously doesn't matter - we don't register functions without :'s in them. But my eyes just can't stop staring at it. 
Change-Id: I089ec1969b28367078ce969bfbfc5272f70318e6 --- zuul/rpclistener.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/rpclistener.py b/zuul/rpclistener.py index d54da9f38e..5329c41932 100644 --- a/zuul/rpclistener.py +++ b/zuul/rpclistener.py @@ -66,8 +66,8 @@ class RPCListener(object): while self._running: try: job = self.worker.getJob() - z, jobname = job.name.split(':') self.log.debug("Received job %s" % job.name) + z, jobname = job.name.split(':') attrname = 'handle_' + jobname if hasattr(self, attrname): f = getattr(self, attrname) From fc47fec61d1f44f0d8f980cb23de4460f8e40698 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 26 May 2016 07:17:43 -0700 Subject: [PATCH 060/152] Ansible launcher: support static workers Change-Id: I7775124f89be94dc184586151e371ab1910ca0f1 --- zuul/launcher/ansiblelaunchserver.py | 71 +++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 61ef88e17f..232bd0cbe1 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -23,6 +23,7 @@ import socket import subprocess import tempfile import threading +import time import traceback import uuid @@ -36,6 +37,12 @@ import zuul.ansible.library import zuul.ansible.plugins.callback_plugins +def boolify(x): + if isinstance(x, str): + return bool(int(x)) + return bool(x) + + class JobDir(object): def __init__(self, keep=False): self.keep = keep @@ -63,7 +70,8 @@ class JobDir(object): class LaunchServer(object): log = logging.getLogger("zuul.LaunchServer") - section_re = re.compile('site "(.*?)"') + site_section_re = re.compile('site "(.*?)"') + node_section_re = re.compile('node "(.*?)"') def __init__(self, config, keep_jobdir=False): self.config = config @@ -76,17 +84,45 @@ class LaunchServer(object): self.zmq_send_queue = multiprocessing.JoinableQueue() self.termination_queue = 
multiprocessing.JoinableQueue() self.sites = {} + self.static_nodes = {} + if config.has_option('launcher', 'accept-nodes'): + self.accept_nodes = config.get('launcher', 'accept-nodes') + else: + self.accept_nodes = True for section in config.sections(): - m = self.section_re.match(section) + m = self.site_section_re.match(section) if m: sitename = m.group(1) d = {} d['host'] = config.get(section, 'host') d['user'] = config.get(section, 'user') - d['pass'] = config.get(section, 'pass', '') - d['root'] = config.get(section, 'root', '/') + if config.has_option(section, 'pass'): + d['pass'] = config.get(section, 'pass') + else: + d['pass'] = '' + if config.has_option(section, 'root'): + d['root'] = config.get(section, 'root') + else: + d['root'] = '/' self.sites[sitename] = d + continue + m = self.node_section_re.match(section) + if m: + nodename = m.group(1) + d = {} + d['name'] = nodename + d['host'] = config.get(section, 'host') + if config.has_option(section, 'description'): + d['description'] = config.get(section, 'description') + else: + d['description'] = '' + if config.has_option(section, 'labels'): + d['labels'] = config.get(section, 'labels').split(',') + else: + d['labels'] = [] + self.static_nodes[nodename] = d + continue def start(self): self._gearman_running = True @@ -132,6 +168,16 @@ class LaunchServer(object): self.gearman_thread.daemon = True self.gearman_thread.start() + # FIXME: Without this, sometimes the subprocess module does + # not actually launch any subprocesses. I have no + # explanation. 
-corvus + time.sleep(1) + + # Start static workers + for node in self.static_nodes.values(): + self.log.debug("Creating static node with arguments: %s" % (node,)) + self._launchWorker(node) + def loadJobs(self): self.log.debug("Loading jobs") builder = JJB() @@ -147,7 +193,8 @@ class LaunchServer(object): del self.jobs[name] def register(self): - self.worker.registerFunction("node-assign:zuul") + if self.accept_nodes: + self.worker.registerFunction("node-assign:zuul") self.worker.registerFunction("stop:%s" % self.hostname) def reconfigure(self, config): @@ -220,6 +267,12 @@ class LaunchServer(object): def assignNode(self, job): args = json.loads(job.arguments) self.log.debug("Assigned node with arguments: %s" % (args,)) + self._launchWorker(args) + data = dict(manager=self.hostname) + job.sendWorkData(json.dumps(data)) + job.sendWorkComplete() + + def _launchWorker(self, args): worker = NodeWorker(self.config, self.jobs, self.builds, self.sites, args['name'], args['host'], args['description'], args['labels'], @@ -230,10 +283,6 @@ class LaunchServer(object): worker.process = multiprocessing.Process(target=worker.run) worker.process.start() - data = dict(manager=self.hostname) - job.sendWorkData(json.dumps(data)) - job.sendWorkComplete() - def stopJob(self, job): try: args = json.loads(job.arguments) @@ -458,9 +507,7 @@ class NodeWorker(object): # Make sure we can parse what we need from the job first args = json.loads(job.arguments) - # This may be configurable later, or we may choose to honor - # OFFLINE_NODE_WHEN_COMPLETE - offline = True + offline = boolify(args.get('OFFLINE_NODE_WHEN_COMPLETE', False)) job_name = job.name.split(':')[1] # Initialize the result so we have something regardless of From 7ab2be381bdddb4bef9b2f5c123c52710a237888 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Fri, 27 May 2016 14:47:23 -0700 Subject: [PATCH 061/152] Ansible launcher: some ansible fixes These were causing ansible errors: * required and default are mutually exclusize * don't return rc from the zuul_log module * the initial timeout, if not set, will arrive as the string 'None' Change-Id: I0129c01d5fb365b81812553eeaf6b3d91acf8f55 --- zuul/ansible/library/zuul_log.py | 6 +++--- zuul/ansible/plugins/callback_plugins/timeout.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/zuul/ansible/library/zuul_log.py b/zuul/ansible/library/zuul_log.py index 62094730d3..8978275abc 100644 --- a/zuul/ansible/library/zuul_log.py +++ b/zuul/ansible/library/zuul_log.py @@ -41,13 +41,13 @@ def log(msg): def main(): module = AnsibleModule( argument_spec=dict( - msg=dict(required=True, default=''), + msg=dict(required=True), ) ) p = module.params - ret = log(p['msg']) - module.exit_json(changed=True, rc=ret) + log(p['msg']) + module.exit_json(changed=True) from ansible.module_utils.basic import * # noqa diff --git a/zuul/ansible/plugins/callback_plugins/timeout.py b/zuul/ansible/plugins/callback_plugins/timeout.py index 245e9884ec..030ecc87a0 100644 --- a/zuul/ansible/plugins/callback_plugins/timeout.py +++ b/zuul/ansible/plugins/callback_plugins/timeout.py @@ -49,7 +49,7 @@ class CallbackModule(CallbackBase): facts = dict(elapsed_time=self._elapsed_time) overall_timeout = manager.extra_vars.get('timeout') - if overall_timeout is not None: + if str(overall_timeout) != 'None': timeout = int(overall_timeout) - int(self._elapsed_time) facts['timeout'] = timeout From f5922b67cd66759ff99a4b7c38711f93c2e5d23d Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 27 May 2016 14:53:26 -0700 Subject: [PATCH 062/152] Ansible launcher: handle JJB with no macros Handle the case where there are no 'builder' macros defined. 
Change-Id: I5e4db4834df89ce1629000f116245c063ad96a3f --- zuul/launcher/ansiblelaunchserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 232bd0cbe1..8061176dd7 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -939,7 +939,7 @@ class JJB(jenkins_jobs.builder.Builder): name = component component_data = {} - new_component = self.parser.data[component_type].get(name) + new_component = self.parser.data.get(component_type, {}).get(name) if new_component: for new_sub_component in new_component[component_list_type]: new_components.extend( From c4b2041cffdacbadcc8167231f3d27bbf8c7a19b Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 27 May 2016 16:45:26 -0700 Subject: [PATCH 063/152] Ansible launcher: use a socket for reconfiguration Signals and multiprocessing don't mix well. Instead of using signals for things like stop and reconfiguration, use a socket that accepts simple commands, and use the zuul-launcher command to send them. This implements reconfiguration and stopping. Other commands (eg, graceful stop, pause, etc) can be implemented later. 
Change-Id: I14b1fdc5e3a20f4b1161dbc14705ad424ad13fbd --- zuul/cmd/launcher.py | 69 ++++++++++++++--------- zuul/launcher/ansiblelaunchserver.py | 46 ++++++++++++++- zuul/lib/commandsocket.py | 83 ++++++++++++++++++++++++++++ 3 files changed, 170 insertions(+), 28 deletions(-) create mode 100644 zuul/lib/commandsocket.py diff --git a/zuul/cmd/launcher.py b/zuul/cmd/launcher.py index c9516f8d5c..2ba4b85af1 100644 --- a/zuul/cmd/launcher.py +++ b/zuul/cmd/launcher.py @@ -24,6 +24,7 @@ pid_file_module = extras.try_imports(['daemon.pidlockfile', 'daemon.pidfile']) import logging import os +import socket import sys import signal @@ -49,25 +50,35 @@ class Launcher(zuul.cmd.ZuulApp): parser.add_argument('--keep-jobdir', dest='keep_jobdir', action='store_true', help='keep local jobdirs after run completes') + parser.add_argument('command', choices=['reconfigure', 'stop'], + nargs='?') + self.args = parser.parse_args() - def reconfigure_handler(self, signum, frame): - signal.signal(signal.SIGHUP, signal.SIG_IGN) - self.log.debug("Reconfiguration triggered") - self.read_config() - self.setup_logging('launcher', 'log_config') - try: - self.launcher.reconfigure(self.config) - except Exception: - self.log.exception("Reconfiguration failed:") - signal.signal(signal.SIGHUP, self.reconfigure_handler) + def send_command(self, cmd): + if self.config.has_option('zuul', 'state_dir'): + state_dir = os.path.expanduser( + self.config.get('zuul', 'state_dir')) + else: + state_dir = '/var/lib/zuul' + path = os.path.join(state_dir, 'launcher.socket') + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.connect(path) + s.sendall('%s\n' % cmd) - def exit_handler(self, signum, frame): - signal.signal(signal.SIGUSR1, signal.SIG_IGN) + def send_reconfigure(self): + self.send_command('reconfigure') + sys.exit(0) + + def send_stop(self): + self.send_command('stop') + sys.exit(0) + + def exit_handler(self): self.launcher.stop() self.launcher.join() - def main(self): + def main(self, 
daemon=True): # See comment at top of file about zuul imports import zuul.launcher.ansiblelaunchserver @@ -80,23 +91,31 @@ class Launcher(zuul.cmd.ZuulApp): keep_jobdir=self.args.keep_jobdir) self.launcher.start() - signal.signal(signal.SIGHUP, self.reconfigure_handler) - signal.signal(signal.SIGUSR1, self.exit_handler) signal.signal(signal.SIGUSR2, zuul.cmd.stack_dump_handler) - while True: - try: - signal.pause() - except KeyboardInterrupt: - print "Ctrl + C: asking launcher to exit nicely...\n" - self.exit_handler(signal.SIGINT, None) - sys.exit(0) + if daemon: + self.launcher.join() + else: + while True: + try: + signal.pause() + except KeyboardInterrupt: + print "Ctrl + C: asking launcher to exit nicely...\n" + self.exit_handler() + sys.exit(0) def main(): server = Launcher() server.parse_arguments() - server.read_config() + + if server.args.command == 'reconfigure': + server.send_reconfigure() + sys.exit(0) + elif server.args.command == 'stop': + server.send_stop() + sys.exit(0) + server.configure_connections() if server.config.has_option('launcher', 'pidfile'): @@ -106,10 +125,10 @@ def main(): pid = pid_file_module.TimeoutPIDLockFile(pid_fn, 10) if server.args.nodaemon: - server.main() + server.main(False) else: with daemon.DaemonContext(pidfile=pid): - server.main() + server.main(True) if __name__ == "__main__": diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 8061176dd7..e6a066c53e 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -35,6 +35,7 @@ import zmq import zuul.ansible.library import zuul.ansible.plugins.callback_plugins +from zuul.lib import commandsocket def boolify(x): @@ -78,6 +79,9 @@ class LaunchServer(object): self.keep_jobdir = keep_jobdir self.hostname = socket.gethostname() self.node_workers = {} + # This has the side effect of creating the logger; our logging + # config will handle the rest. 
+ multiprocessing.get_logger() self.mpmanager = multiprocessing.Manager() self.jobs = self.mpmanager.dict() self.builds = self.mpmanager.dict() @@ -90,6 +94,14 @@ class LaunchServer(object): else: self.accept_nodes = True + if self.config.has_option('zuul', 'state_dir'): + state_dir = os.path.expanduser( + self.config.get('zuul', 'state_dir')) + else: + state_dir = '/var/lib/zuul' + path = os.path.join(state_dir, 'launcher.socket') + self.command_socket = commandsocket.CommandSocket(path) + for section in config.sections(): m = self.site_section_re.match(section) if m: @@ -128,6 +140,7 @@ class LaunchServer(object): self._gearman_running = True self._zmq_running = True self._reaper_running = True + self._command_running = True # Setup ZMQ self.zcontext = zmq.Context() @@ -147,6 +160,13 @@ class LaunchServer(object): self.log.debug("Registering") self.register() + # Start command socket + self.log.debug("Starting command processor") + self.command_socket.start() + self.command_thread = threading.Thread(target=self.runCommand) + self.command_thread.daemon = True + self.command_thread.start() + # Load JJB config self.loadJobs() @@ -197,9 +217,8 @@ class LaunchServer(object): self.worker.registerFunction("node-assign:zuul") self.worker.registerFunction("stop:%s" % self.hostname) - def reconfigure(self, config): + def reconfigure(self): self.log.debug("Reconfiguring") - self.config = config self.loadJobs() for node in self.node_workers.values(): try: @@ -212,22 +231,43 @@ class LaunchServer(object): def stop(self): self.log.debug("Stopping") + # First, stop accepting new jobs self._gearman_running = False self._reaper_running = False self.worker.shutdown() + # Then stop all of the workers for node in self.node_workers.values(): try: if node.isAlive(): node.stop() except Exception: self.log.exception("Exception sending stop command to worker:") + # Stop ZMQ afterwords so that the send queue is flushed self._zmq_running = False self.zmq_send_queue.put(None) 
self.zmq_send_queue.join() + # Stop command processing + self._command_running = False + self.command_socket.stop() + # Join the gearman thread which was stopped earlier. + self.gearman_thread.join() + # The command thread is joined in the join() method of this + # class, which is called by the command shell. self.log.debug("Stopped") def join(self): - self.gearman_thread.join() + self.command_thread.join() + + def runCommand(self): + while self._command_running: + try: + command = self.command_socket.get() + if command == 'reconfigure': + self.reconfigure() + if command == 'stop': + self.stop() + except Exception: + self.log.exception("Exception while processing command") def runZMQ(self): while self._zmq_running or not self.zmq_send_queue.empty(): diff --git a/zuul/lib/commandsocket.py b/zuul/lib/commandsocket.py new file mode 100644 index 0000000000..1b7fed915b --- /dev/null +++ b/zuul/lib/commandsocket.py @@ -0,0 +1,83 @@ +# Copyright 2014 OpenStack Foundation +# Copyright 2014 Hewlett-Packard Development Company, L.P. +# Copyright 2016 Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import logging +import os +import socket +import threading +import Queue + + +class CommandSocket(object): + log = logging.getLogger("zuul.CommandSocket") + + def __init__(self, path): + self.running = False + self.path = path + self.queue = Queue.Queue() + + def start(self): + self.running = True + if os.path.exists(self.path): + os.unlink(self.path) + self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + self.socket.bind(self.path) + self.socket.listen(1) + self.socket_thread = threading.Thread(target=self._socketListener) + self.socket_thread.daemon = True + self.socket_thread.start() + + def stop(self): + # First, wake up our listener thread with a connection and + # tell it to stop running. + self.running = False + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + s.connect(self.path) + s.sendall('_stop\n') + # The command '_stop' will be ignored by our listener, so + # directly inject it into the queue so that consumers of this + # class which are waiting in .get() are awakened. They can + # either handle '_stop' or just ignore the unknown command and + # then check to see if they should continue to run before + # re-entering their loop. + self.queue.put('_stop') + self.socket_thread.join() + + def _socketListener(self): + while self.running: + try: + s, addr = self.socket.accept() + self.log.debug("Accepted socket connection %s" % (s,)) + buf = '' + while True: + buf += s.recv(1) + if buf[-1] == '\n': + break + buf = buf.strip() + self.log.debug("Received %s from socket" % (buf,)) + s.close() + # Because we use '_stop' internally to wake up a + # waiting thread, don't allow it to actually be + # injected externally. 
+ if buf != '_stop': + self.queue.put(buf) + except Exception: + self.log.exception("Exception in socket handler") + + def get(self): + if not self.running: + raise Exception("CommandSocket.get called while stopped") + return self.queue.get() From ab461455947f00a98fda071b590179239ddc4d6c Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 31 May 2016 15:07:09 -0700 Subject: [PATCH 064/152] Ansible launcher: use correct ZMQ port We use 8888. Where did 8881 even come from? (This should probably be configurable.) Change-Id: I94311fc0ba63b9c29a9e7bc92574ef3a63750c95 --- zuul/launcher/ansiblelaunchserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index e6a066c53e..792d1b9880 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -145,7 +145,7 @@ class LaunchServer(object): # Setup ZMQ self.zcontext = zmq.Context() self.zsocket = self.zcontext.socket(zmq.PUB) - self.zsocket.bind("tcp://*:8881") + self.zsocket.bind("tcp://*:8888") # Setup Gearman server = self.config.get('gearman', 'server') From 8fc762bc5e75931f18188605891bf4180db3eb9c Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 1 Jun 2016 10:59:14 -0700 Subject: [PATCH 065/152] Ansible launcher: Use threads for workers The SyncManager from the multiprocessing module seems to be exiting under high load, and not leaving any clues as to why. We can probably handle the scale we anticipate using threads for the moment at least, so switch to that. 
Change-Id: If235cf802bb50874ecbe8bc234f67bc66a36ee22 --- zuul/launcher/ansiblelaunchserver.py | 33 ++++++++++------------------ 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 792d1b9880..5ecc954a48 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -14,7 +14,6 @@ import json import logging -import multiprocessing import os import re import shutil @@ -23,8 +22,8 @@ import socket import subprocess import tempfile import threading -import time import traceback +import Queue import uuid import gear @@ -79,14 +78,10 @@ class LaunchServer(object): self.keep_jobdir = keep_jobdir self.hostname = socket.gethostname() self.node_workers = {} - # This has the side effect of creating the logger; our logging - # config will handle the rest. - multiprocessing.get_logger() - self.mpmanager = multiprocessing.Manager() - self.jobs = self.mpmanager.dict() - self.builds = self.mpmanager.dict() - self.zmq_send_queue = multiprocessing.JoinableQueue() - self.termination_queue = multiprocessing.JoinableQueue() + self.jobs = {} + self.builds = {} + self.zmq_send_queue = Queue.Queue() + self.termination_queue = Queue.Queue() self.sites = {} self.static_nodes = {} if config.has_option('launcher', 'accept-nodes'): @@ -188,11 +183,6 @@ class LaunchServer(object): self.gearman_thread.daemon = True self.gearman_thread.start() - # FIXME: Without this, sometimes the subprocess module does - # not actually launch any subprocesses. I have no - # explanation. 
-corvus - time.sleep(1) - # Start static workers for node in self.static_nodes.values(): self.log.debug("Creating static node with arguments: %s" % (node,)) @@ -320,8 +310,8 @@ class LaunchServer(object): self.termination_queue, self.keep_jobdir) self.node_workers[worker.name] = worker - worker.process = multiprocessing.Process(target=worker.run) - worker.process.start() + worker.thread = threading.Thread(target=worker.run) + worker.thread.start() def stopJob(self, job): try: @@ -358,7 +348,7 @@ class LaunchServer(object): continue worker = self.node_workers[item] self.log.debug("Joining %s" % (item,)) - worker.process.join() + worker.thread.join() self.log.debug("Joined %s" % (item,)) del self.node_workers[item] except Exception: @@ -384,10 +374,10 @@ class NodeWorker(object): if not isinstance(labels, list): labels = [labels] self.labels = labels - self.process = None + self.thread = None self.registered_functions = set() self._running = True - self.queue = multiprocessing.JoinableQueue() + self.queue = Queue.Queue() self.manager_name = manager_name self.zmq_send_queue = zmq_send_queue self.termination_queue = termination_queue @@ -408,12 +398,11 @@ class NodeWorker(object): def isAlive(self): # Meant to be called from the manager - if self.process and self.process.is_alive(): + if self.thread and self.thread.is_alive(): return True return False def run(self): - signal.signal(signal.SIGINT, signal.SIG_IGN) self.log.debug("Node worker %s starting" % (self.name,)) server = self.config.get('gearman', 'server') if self.config.has_option('gearman', 'port'): From a6a500402956b2812b764dc06546d64816c0c74a Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Wed, 1 Jun 2016 15:33:54 -0700 Subject: [PATCH 066/152] Ansible launcher: add pause/unpause support Change-Id: I55c68153a80477c657d7bc5d22e463c37a494eb6 --- zuul/cmd/launcher.py | 22 +++------ zuul/launcher/ansiblelaunchserver.py | 69 ++++++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/zuul/cmd/launcher.py b/zuul/cmd/launcher.py index 2ba4b85af1..1dbc3ee232 100644 --- a/zuul/cmd/launcher.py +++ b/zuul/cmd/launcher.py @@ -29,8 +29,9 @@ import sys import signal import zuul.cmd +import zuul.launcher.ansiblelaunchserver -# No zuul imports here because they pull in paramiko which must not be +# No zuul imports that pull in paramiko here; it must not be # imported until after the daemonization. # https://github.com/paramiko/paramiko/issues/59 # Similar situation with gear and statsd. @@ -50,7 +51,8 @@ class Launcher(zuul.cmd.ZuulApp): parser.add_argument('--keep-jobdir', dest='keep_jobdir', action='store_true', help='keep local jobdirs after run completes') - parser.add_argument('command', choices=['reconfigure', 'stop'], + parser.add_argument('command', + choices=zuul.launcher.ansiblelaunchserver.COMMANDS, nargs='?') self.args = parser.parse_args() @@ -66,21 +68,12 @@ class Launcher(zuul.cmd.ZuulApp): s.connect(path) s.sendall('%s\n' % cmd) - def send_reconfigure(self): - self.send_command('reconfigure') - sys.exit(0) - - def send_stop(self): - self.send_command('stop') - sys.exit(0) - def exit_handler(self): self.launcher.stop() self.launcher.join() def main(self, daemon=True): # See comment at top of file about zuul imports - import zuul.launcher.ansiblelaunchserver self.setup_logging('launcher', 'log_config') @@ -109,11 +102,8 @@ def main(): server.parse_arguments() server.read_config() - if server.args.command == 'reconfigure': - server.send_reconfigure() - sys.exit(0) - elif server.args.command == 'stop': - server.send_stop() + if server.args.command in zuul.launcher.ansiblelaunchserver.COMMANDS: + 
server.send_command(server.args.command) sys.exit(0) server.configure_connections() diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 5ecc954a48..c4c6ffc4e6 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -37,6 +37,9 @@ import zuul.ansible.plugins.callback_plugins from zuul.lib import commandsocket +COMMANDS = ['reconfigure', 'stop', 'pause', 'unpause'] + + def boolify(x): if isinstance(x, str): return bool(int(x)) @@ -77,6 +80,7 @@ class LaunchServer(object): self.config = config self.keep_jobdir = keep_jobdir self.hostname = socket.gethostname() + self.registered_functions = set() self.node_workers = {} self.jobs = {} self.builds = {} @@ -203,9 +207,16 @@ class LaunchServer(object): del self.jobs[name] def register(self): + new_functions = set() if self.accept_nodes: - self.worker.registerFunction("node-assign:zuul") - self.worker.registerFunction("stop:%s" % self.hostname) + new_functions.add("node-assign:zuul") + new_functions.add("stop:%s" % self.hostname) + + for function in new_functions - self.registered_functions: + self.worker.registerFunction(function) + for function in self.registered_functions - new_functions: + self.worker.unRegisterFunction(function) + self.registered_functions = new_functions def reconfigure(self): self.log.debug("Reconfiguring") @@ -219,6 +230,32 @@ class LaunchServer(object): "to worker:") self.log.debug("Reconfiguration complete") + def pause(self): + self.log.debug("Pausing") + self.accept_nodes = False + self.register() + for node in self.node_workers.values(): + try: + if node.isAlive(): + node.queue.put(dict(action='pause')) + except Exception: + self.log.exception("Exception sending pause command " + "to worker:") + self.log.debug("Paused") + + def unpause(self): + self.log.debug("Unpausing") + self.accept_nodes = True + self.register() + for node in self.node_workers.values(): + try: + if node.isAlive(): + 
node.queue.put(dict(action='unpause')) + except Exception: + self.log.exception("Exception sending unpause command " + "to worker:") + self.log.debug("Unpaused") + def stop(self): self.log.debug("Stopping") # First, stop accepting new jobs @@ -254,8 +291,12 @@ class LaunchServer(object): command = self.command_socket.get() if command == 'reconfigure': self.reconfigure() - if command == 'stop': + elif command == 'stop': self.stop() + elif command == 'pause': + self.pause() + elif command == 'unpause': + self.unpause() except Exception: self.log.exception("Exception while processing command") @@ -376,6 +417,10 @@ class NodeWorker(object): self.labels = labels self.thread = None self.registered_functions = set() + # If the unpaused Event is set, that means we should run jobs. + # If it is clear, then we are paused and should not run jobs. + self.unpaused = threading.Event() + self.unpaused.set() self._running = True self.queue = Queue.Queue() self.manager_name = manager_name @@ -434,9 +479,17 @@ class NodeWorker(object): # will be set by the queue thread. 
self.log.debug("Submitting stop request") self._running = False + self.unpaused.set() self.queue.put(dict(action='stop')) self.queue.join() + def pause(self): + self.unpaused.clear() + self.worker.stopWaitingForJobs() + + def unpause(self): + self.unpaused.set() + def _runQueue(self): item = self.queue.get() try: @@ -449,6 +502,12 @@ class NodeWorker(object): else: self._job_complete_event.wait() self.worker.shutdown() + if item['action'] == 'pause': + self.log.debug("Received pause request") + self.pause() + if item['action'] == 'unpause': + self.log.debug("Received unpause request") + self.unpause() elif item['action'] == 'reconfigure': self.log.debug("Received reconfigure request") self.register() @@ -461,7 +520,9 @@ class NodeWorker(object): def runGearman(self): while self._running: try: - self._runGearman() + self.unpaused.wait() + if self._running: + self._runGearman() except Exception: self.log.exception("Exception in gearman manager:") From 4c6a774419f41c73dd185ddc0b1c7c4ed900915e Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Fri, 27 May 2016 08:42:17 -0700 Subject: [PATCH 067/152] Python 3 Fixes: Use print() not print For Python 3 fixes, use print() instead of the python 2 "print" bareword syntax. 
Change-Id: Ib77544d111aab6a1bd52555bcbd1dad9955d0074 --- tests/base.py | 9 ++++----- tests/test_layoutvalidator.py | 6 +++--- tests/test_scheduler.py | 6 +++--- tools/zuul-changes.py | 2 +- zuul/cmd/client.py | 4 ++-- zuul/cmd/launcher.py | 2 +- zuul/cmd/merger.py | 6 ++---- zuul/cmd/server.py | 4 ++-- 8 files changed, 18 insertions(+), 21 deletions(-) diff --git a/tests/base.py b/tests/base.py index 585f2d203b..de5601bbb4 100755 --- a/tests/base.py +++ b/tests/base.py @@ -1296,9 +1296,8 @@ class ZuulTestCase(BaseTestCase): start = time.time() while True: if time.time() - start > 10: - print 'queue status:', - print ' '.join(self.eventQueuesEmpty()) - print self.areAllBuildsWaiting() + print('queue status:', ''.join(self.eventQueuesEmpty())) + print(self.areAllBuildsWaiting()) raise Exception("Timeout waiting for Zuul to settle") # Make sure no new events show up while we're checking self.worker.lock.acquire() @@ -1336,8 +1335,8 @@ class ZuulTestCase(BaseTestCase): for pipeline in self.sched.layout.pipelines.values(): for queue in pipeline.queues: if len(queue.queue) != 0: - print 'pipeline %s queue %s contents %s' % ( - pipeline.name, queue.name, queue.queue) + print('pipeline %s queue %s contents %s' % ( + pipeline.name, queue.name, queue.queue)) self.assertEqual(len(queue.queue), 0, "Pipelines queues should be empty") diff --git a/tests/test_layoutvalidator.py b/tests/test_layoutvalidator.py index 3dc3234a88..3de4a94d56 100644 --- a/tests/test_layoutvalidator.py +++ b/tests/test_layoutvalidator.py @@ -33,13 +33,13 @@ LAYOUT_RE = re.compile(r'^(good|bad)_.*\.yaml$') class TestLayoutValidator(testtools.TestCase): def test_layouts(self): """Test layout file validation""" - print + print() errors = [] for fn in os.listdir(os.path.join(FIXTURE_DIR, 'layouts')): m = LAYOUT_RE.match(fn) if not m: continue - print fn + print(fn) # Load any .conf file by the same name but .conf extension. 
config_file = ("%s.conf" % @@ -69,7 +69,7 @@ class TestLayoutValidator(testtools.TestCase): fn) except voluptuous.Invalid as e: error = str(e) - print ' ', error + print(' ', error) if error in errors: raise Exception("Error has already been tested: %s" % error) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 15d33c8aad..053b1ccd33 100755 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -1484,7 +1484,7 @@ jobs: self.worker.build_history = [] path = os.path.join(self.git_root, "org/project") - print repack_repo(path) + print(repack_repo(path)) A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') A.addApproval('CRVW', 2) @@ -1509,9 +1509,9 @@ jobs: A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A') A.addPatchset(large=True) path = os.path.join(self.upstream_root, "org/project1") - print repack_repo(path) + print(repack_repo(path)) path = os.path.join(self.git_root, "org/project1") - print repack_repo(path) + print(repack_repo(path)) A.addApproval('CRVW', 2) self.fake_gerrit.addEvent(A.addApproval('APRV', 1)) diff --git a/tools/zuul-changes.py b/tools/zuul-changes.py index 9dbf504e74..8b854c79de 100755 --- a/tools/zuul-changes.py +++ b/tools/zuul-changes.py @@ -35,7 +35,7 @@ for pipeline in data['pipelines']: if not change['live']: continue cid, cps = change['id'].split(',') - print ( + print( "zuul enqueue --trigger gerrit --pipeline %s " "--project %s --change %s,%s" % ( options.pipeline_name, diff --git a/zuul/cmd/client.py b/zuul/cmd/client.py index 59ac419f15..1ce2828f0b 100644 --- a/zuul/cmd/client.py +++ b/zuul/cmd/client.py @@ -154,7 +154,7 @@ class Client(zuul.cmd.ZuulApp): running_items = client.get_running_jobs() if len(running_items) == 0: - print "No jobs currently running" + print("No jobs currently running") return True all_fields = self._show_running_jobs_columns() @@ -181,7 +181,7 @@ class Client(zuul.cmd.ZuulApp): v += all_fields[f]['append'] values.append(v) table.add_row(values) - print 
table + print(table) return True def _epoch_to_relative_time(self, epoch): diff --git a/zuul/cmd/launcher.py b/zuul/cmd/launcher.py index 2ba4b85af1..bbda87dad6 100644 --- a/zuul/cmd/launcher.py +++ b/zuul/cmd/launcher.py @@ -99,7 +99,7 @@ class Launcher(zuul.cmd.ZuulApp): try: signal.pause() except KeyboardInterrupt: - print "Ctrl + C: asking launcher to exit nicely...\n" + print("Ctrl + C: asking launcher to exit nicely...\n") self.exit_handler() sys.exit(0) diff --git a/zuul/cmd/merger.py b/zuul/cmd/merger.py index df215fd80b..797a990b01 100644 --- a/zuul/cmd/merger.py +++ b/zuul/cmd/merger.py @@ -68,7 +68,7 @@ class Merger(zuul.cmd.ZuulApp): try: signal.pause() except KeyboardInterrupt: - print "Ctrl + C: asking merger to exit nicely...\n" + print("Ctrl + C: asking merger to exit nicely...\n") self.exit_handler(signal.SIGINT, None) @@ -89,9 +89,7 @@ def main(): f.close() os.unlink(test_fn) except Exception: - print - print "Unable to write to state directory: %s" % state_dir - print + print("\nUnable to write to state directory: %s\n" % state_dir) raise if server.config.has_option('merger', 'pidfile'): diff --git a/zuul/cmd/server.py b/zuul/cmd/server.py index 6db15a21d6..f4004362af 100755 --- a/zuul/cmd/server.py +++ b/zuul/cmd/server.py @@ -107,7 +107,7 @@ class Server(zuul.cmd.ZuulApp): jobs.add(v) for job in sorted(layout.jobs): if job not in jobs: - print "Job %s not defined" % job + print("Job %s not defined" % job) failure = True return failure @@ -196,7 +196,7 @@ class Server(zuul.cmd.ZuulApp): try: signal.pause() except KeyboardInterrupt: - print "Ctrl + C: asking scheduler to exit nicely...\n" + print("Ctrl + C: asking scheduler to exit nicely...\n") self.exit_handler(signal.SIGINT, None) From 46326de5ed2c7f28a33a367333385aacf7e3b95c Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Thu, 2 Jun 2016 10:21:12 -0700 Subject: [PATCH 068/152] When a playbook fails, don't succeed in the error_block Make sure that a call into the error_block for "rescue" 
does not cause the playbook to be considered a success. This uses the Ansible fail module to indicate failure. Change-Id: I07d2f06aa74cfe9dd0f7a1c3a64d4e51290bd570 Partial-Story: #2000619 --- zuul/launcher/ansiblelaunchserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 5ecc954a48..43b2c4e18f 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -876,6 +876,7 @@ class NodeWorker(object): task = dict(zuul_log=dict(msg="Job complete, result: FAILURE")) error_block.append(task) + error_block.append(dict(fail=dict(msg='FAILURE'))) play = dict(hosts='node', name='Job body', tasks=tasks) From c01b84dbb36a09413d1a0a3029847e03a7db9647 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 1 Jun 2016 16:40:42 -0700 Subject: [PATCH 069/152] Ansible launcher: add release command This relases idle workers back to nodepool. Change-Id: Id9d759d49a1c47ad920a34d473596340e88f418d --- zuul/launcher/ansiblelaunchserver.py | 48 +++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index c4c6ffc4e6..53da9e9bf8 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -37,7 +37,7 @@ import zuul.ansible.plugins.callback_plugins from zuul.lib import commandsocket -COMMANDS = ['reconfigure', 'stop', 'pause', 'unpause'] +COMMANDS = ['reconfigure', 'stop', 'pause', 'unpause', 'release'] def boolify(x): @@ -256,6 +256,19 @@ class LaunchServer(object): "to worker:") self.log.debug("Unpaused") + def release(self): + self.log.debug("Releasing idle nodes") + for node in self.node_workers.values(): + if node.name in self.static_nodes: + continue + try: + if node.isAlive(): + node.queue.put(dict(action='release')) + except Exception: + self.log.exception("Exception sending release command " + "to worker:") + 
self.log.debug("Finished releasing idle nodes") + def stop(self): self.log.debug("Stopping") # First, stop accepting new jobs @@ -297,6 +310,8 @@ class LaunchServer(object): self.pause() elif command == 'unpause': self.unpause() + elif command == 'release': + self.release() except Exception: self.log.exception("Exception while processing command") @@ -428,6 +443,8 @@ class NodeWorker(object): self.termination_queue = termination_queue self.keep_jobdir = keep_jobdir self.running_job_lock = threading.Lock() + self._get_job_lock = threading.Lock() + self._got_job = False self._job_complete_event = threading.Event() self._running_job = False self._sent_complete_event = False @@ -490,6 +507,20 @@ class NodeWorker(object): def unpause(self): self.unpaused.set() + def release(self): + # If this node is idle, stop it. + old_unpaused = self.unpaused.is_set() + if old_unpaused: + self.pause() + with self._get_job_lock: + if self._got_job: + self.log.debug("This worker is not idle") + if old_unpaused: + self.unpause() + return + self.log.debug("Stopping due to release command") + self.queue.put(dict(action='stop')) + def _runQueue(self): item = self.queue.get() try: @@ -508,6 +539,9 @@ class NodeWorker(object): if item['action'] == 'unpause': self.log.debug("Received unpause request") self.unpause() + if item['action'] == 'release': + self.log.debug("Received release request") + self.release() elif item['action'] == 'reconfigure': self.log.debug("Received reconfigure request") self.register() @@ -525,12 +559,16 @@ class NodeWorker(object): self._runGearman() except Exception: self.log.exception("Exception in gearman manager:") + with self._get_job_lock: + self._got_job = False def _runGearman(self): - try: - job = self.worker.getJob() - except gear.InterruptedError: - return + with self._get_job_lock: + try: + job = self.worker.getJob() + self._got_job = True + except gear.InterruptedError: + return self.log.debug("Node worker %s got job %s" % (self.name, job.name)) try: if 
job.name not in self.registered_functions: From 9b8ceb33b6aeb62f4cf0cb0d1a13dcb436b76d47 Mon Sep 17 00:00:00 2001 From: Jeremy Stanley Date: Fri, 3 Jun 2016 02:10:07 +0000 Subject: [PATCH 070/152] Support keep-hierarchy in launcher rsync publisher To maintain feature parity with the SCP publisher in JJB, only maintain the rsync source hierarchy at the destination if keep-hierarchy is true. The default (false) flattens the file tree completely at the destination. Change-Id: I5042f018342d4323c77ba50ce38a2614a065ba9d --- zuul/launcher/ansiblelaunchserver.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 5ecc954a48..ce75cb16fd 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -715,16 +715,20 @@ class NodeWorker(object): (dest,)) local_args = [ - 'command', '/usr/bin/rsync', '--delay-updates', '-F', + 'shell', '/usr/bin/rsync', '--delay-updates', '-F', '--compress', '-rt', '--safe-links', '--rsync-path="mkdir -p {dest} && rsync"', '--rsh="/usr/bin/ssh -i {private_key_file} -S none ' '-o StrictHostKeyChecking=no -q"', '--out-format="<>%i %n%L"', - '"{source}/"', '"{user}@{host}:{dest}"' + '{source}', '"{user}@{host}:{dest}"' ] + if scpfile.get('keep-hierarchy'): + source = '"%s/"' % scproot + else: + source = '`/usr/bin/find "%s" -type f`' % scproot local_action = ' '.join(local_args).format( - source=scproot, + source=source, dest=dest, private_key_file=self.private_key_file, host=site['host'], From 4c64983051bd720c3e18236fccf115c8fc0d6ea1 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 2 Jun 2016 14:31:33 -0700 Subject: [PATCH 071/152] Ansible launcher: Write console in thread Read the console stream from the pipe and write it to a file from within another thread. This way the main thread can wait on the subprocess, and if it exits for any reason, exit the ansible task. 
This corrects a problem where our immediate child would exit, but a grandchild may hold stdout open, preventing us from reaching the point at which we call wait() on our child. Change-Id: If983e5b8e0ef27c63490e4b9a135f13a15bf5ae3 Story: #2000619 --- zuul/ansible/library/zuul_runner.py | 36 +++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index bc38376c21..f49016291e 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -19,6 +19,7 @@ import datetime import getpass import os import subprocess +import threading class Console(object): @@ -50,6 +51,21 @@ def get_env(): return env +def follow(fd): + newline_warning = False + with Console() as console: + while True: + line = fd.readline() + if not line: + break + if not line.endswith('\n'): + line += '\n' + newline_warning = True + console.addLine(line) + if newline_warning: + console.addLine('[Zuul] No trailing newline\n') + + def run(cwd, cmd, args): env = get_env() env.update(args) @@ -61,14 +77,20 @@ def run(cwd, cmd, args): env=env, ) - with Console() as console: - while True: - line = proc.stdout.readline() - if not line: - break - console.addLine(line) + t = threading.Thread(target=follow, args=(proc.stdout,)) + t.daemon = True + t.start() - ret = proc.wait() + ret = proc.wait() + # Give the thread that is writing the console log up to 10 seconds + # to catch up and exit. If it hasn't done so by then, it is very + # likely stuck in readline() because it spawed a child that is + # holding stdout or stderr open. 
+ t.join(10) + with Console() as console: + if t.isAlive(): + console.addLine("[Zuul] standard output/error still open " + "after child exited") console.addLine("[Zuul] Task exit code: %s\n" % ret) return ret From e1ba7761ba571526b5d7dfdac070a9002da8c2ce Mon Sep 17 00:00:00 2001 From: Andreas Jaeger Date: Fri, 3 Jun 2016 15:08:25 +0200 Subject: [PATCH 072/152] Fix turbohipster git repo stackforge is dead, use proper location. Also, let's use https in the URL. Change-Id: Ic07d58b146d9bc71da99bc9e8082bd49ec012e0d --- doc/source/launchers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/launchers.rst b/doc/source/launchers.rst index c61cea8724..f368cb9420 100644 --- a/doc/source/launchers.rst +++ b/doc/source/launchers.rst @@ -6,7 +6,7 @@ https://wiki.jenkins-ci.org/display/JENKINS/Gearman+Plugin .. _`Turbo-Hipster`: - http://git.openstack.org/cgit/stackforge/turbo-hipster/ + https://git.openstack.org/cgit/openstack/turbo-hipster/ .. _`Turbo-Hipster Documentation`: http://turbo-hipster.rtfd.org/ From 1506b49e23810e7eb901793d7fe31a8924d35aae Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 3 Jun 2016 11:02:16 -0400 Subject: [PATCH 073/152] Stream subprocess output for ansible-playbook Because we want to see what ansible-playbook is doing it realtime, stream the output instead of waiting until ansible-playbook completes. 
Change-Id: Ic176ddd44cae144c9daeab46dfbb90c9937c4231 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 598758f66c..bd6f408afe 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -923,16 +923,12 @@ class NodeWorker(object): '-e', 'timeout=%s' % timeout, '-v'], cwd=jobdir.ansible_root, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stderr=subprocess.STDOUT, preexec_fn=os.setsid, ) - (out, err) = self.ansible_proc.communicate() - for line in out.split('\n'): + for line in iter(self.ansible_proc.stdout.readline, b''): line = line[:1024] - self.log.debug("Ansible stdout: %s" % line) - for line in err.split('\n'): - line = line[:1024] - self.log.debug("Ansible stderr: %s" % line) + self.log.debug(line) ret = self.ansible_proc.wait() self.ansible_proc = None return ret == 0 @@ -943,16 +939,12 @@ class NodeWorker(object): '-e', 'success=%s' % success, '-v'], cwd=jobdir.ansible_root, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stderr=subprocess.STDOUT, preexec_fn=os.setsid, ) - (out, err) = proc.communicate() - for line in out.split('\n'): + for line in iter(proc.stdout.readline, b''): line = line[:1024] - self.log.debug("Ansible post stdout: %s" % line) - for line in err.split('\n'): - line = line[:1024] - self.log.debug("Ansible post stderr: %s" % line) + self.log.debug(line) return proc.wait() == 0 From d6cb3aef11bf17cfead8ddc9dadd8123ded40a63 Mon Sep 17 00:00:00 2001 From: Joshua Hesketh Date: Fri, 3 Jun 2016 16:08:20 +1000 Subject: [PATCH 074/152] Ansible launcher: add watchdog for ansible Kill the ansible-playbook if it exceeds its timeout. Change-Id: I9f0a4277dc184fab6ce77ec508b77acbd59ec7ba Co-Authored-By: James E. 
Blair --- zuul/launcher/ansiblelaunchserver.py | 90 ++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 17 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index bd6f408afe..77e9e6c5f5 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -22,6 +22,7 @@ import socket import subprocess import tempfile import threading +import time import traceback import Queue import uuid @@ -36,6 +37,10 @@ import zuul.ansible.library import zuul.ansible.plugins.callback_plugins from zuul.lib import commandsocket +ANSIBLE_WATCHDOG_GRACE = 5 * 60 +ANSIBLE_DEFAULT_TIMEOUT = 2 * 60 * 60 +ANSIBLE_DEFAULT_POST_TIMEOUT = 10 * 60 + def boolify(x): if isinstance(x, str): @@ -43,6 +48,29 @@ def boolify(x): return bool(x) +class Watchdog(object): + def __init__(self, timeout, function, args): + self.timeout = timeout + self.function = function + self.args = args + self.thread = threading.Thread(target=self._run) + self.thread.daemon = True + + def _run(self): + while self._running and time.time() < self.end: + time.sleep(10) + if self._running: + self.function(*self.args) + + def start(self): + self._running = True + self.end = time.time() + self.timeout + self.thread.start() + + def stop(self): + self._running = False + + class JobDir(object): def __init__(self, keep=False): self.keep = keep @@ -509,12 +537,14 @@ class NodeWorker(object): self.registered_functions = new_functions def abortRunningJob(self): + return self.abortRunningProc(self.ansible_job_proc) + + def abortRunningProc(self, proc): aborted = False self.log.debug("Abort: acquiring job lock") with self.running_job_lock: if self._running_job: self.log.debug("Abort: a job is running") - proc = self.ansible_proc if proc: self.log.debug("Abort: sending kill signal to job " "process group") @@ -610,7 +640,8 @@ class NodeWorker(object): 'SUCCESS', {}) def runJob(self, job, args): - self.ansible_proc = None + self.ansible_job_proc 
= None + self.ansible_post_proc = None result = None with self.running_job_lock: if not self._running: @@ -809,11 +840,8 @@ class NodeWorker(object): cwd=parameters['WORKSPACE'], parameters=parameters) task = dict(zuul_runner=runner) - if timeout: - task['when'] = '{{ timeout | int > 0 }}' - task['async'] = '{{ timeout }}' - else: - task['async'] = 2 * 60 * 60 # 2 hour default timeout + task['when'] = '{{ timeout | int > 0 }}' + task['async'] = '{{ timeout }}' task['poll'] = 5 tasks.append(task) @@ -846,6 +874,8 @@ class NodeWorker(object): timeout = timeout.get('timeout') if timeout: timeout = timeout * 60 + if not timeout: + timeout = ANSIBLE_DEFAULT_TIMEOUT with open(jobdir.playbook, 'w') as playbook: tasks = [] @@ -917,8 +947,12 @@ class NodeWorker(object): return timeout + def _ansibleTimeout(self, proc, msg): + self.log.warning(msg) + self.abortRunningProc(proc) + def runAnsiblePlaybook(self, jobdir, timeout): - self.ansible_proc = subprocess.Popen( + self.ansible_job_proc = subprocess.Popen( ['ansible-playbook', jobdir.playbook, '-e', 'timeout=%s' % timeout, '-v'], cwd=jobdir.ansible_root, @@ -926,15 +960,25 @@ class NodeWorker(object): stderr=subprocess.STDOUT, preexec_fn=os.setsid, ) - for line in iter(self.ansible_proc.stdout.readline, b''): - line = line[:1024] - self.log.debug(line) - ret = self.ansible_proc.wait() + ret = None + watchdog = Watchdog(timeout + ANSIBLE_WATCHDOG_GRACE, + self._ansibleTimeout, + (self.ansible_job_proc, + "Ansible timeout exceeded")) + watchdog.start() + try: + for line in iter(self.ansible_job_proc.stdout.readline, b''): + line = line[:1024].rstrip() + self.log.debug("Ansible output: %s" % (line,)) + ret = self.ansible_job_proc.wait() + finally: + watchdog.stop() + self.log.debug("Ansible exit code: %s" % (ret,)) self.ansible_proc = None return ret == 0 def runAnsiblePostPlaybook(self, jobdir, success): - proc = subprocess.Popen( + self.ansible_post_proc = subprocess.Popen( ['ansible-playbook', jobdir.post_playbook, '-e', 
'success=%s' % success, '-v'], cwd=jobdir.ansible_root, @@ -942,10 +986,22 @@ class NodeWorker(object): stderr=subprocess.STDOUT, preexec_fn=os.setsid, ) - for line in iter(proc.stdout.readline, b''): - line = line[:1024] - self.log.debug(line) - return proc.wait() == 0 + ret = None + watchdog = Watchdog(ANSIBLE_DEFAULT_POST_TIMEOUT, + self._ansibleTimeout, + (self.ansible_post_proc, + "Ansible post timeout exceeded")) + watchdog.start() + try: + for line in iter(self.ansible_post_proc.stdout.readline, b''): + line = line[:1024].rstrip() + self.log.debug("Ansible post output: %s" % (line,)) + ret = self.ansible_post_proc.wait() + finally: + watchdog.stop() + self.log.debug("Ansible post exit code: %s" % (ret,)) + self.ansible_post_proc = None + return ret == 0 class JJB(jenkins_jobs.builder.Builder): From e23f599e4d6426f3284ee6fd0b196c78a7655df1 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 3 Jun 2016 23:44:15 -0400 Subject: [PATCH 075/152] Add support for bindep Change-Id: I2d80c547daf403c9467d8924d821c27dc4131606 Signed-off-by: Paul Belanger --- other-requirements.txt | 4 ++++ tox.ini | 8 ++++++++ 2 files changed, 12 insertions(+) create mode 100644 other-requirements.txt diff --git a/other-requirements.txt b/other-requirements.txt new file mode 100644 index 0000000000..1ade6557cd --- /dev/null +++ b/other-requirements.txt @@ -0,0 +1,4 @@ +mysql-client [test] +mysql-server [test] +postgresql [test] +postgresql-client [test] diff --git a/tox.ini b/tox.ini index 79ea939b11..6dc83f7133 100644 --- a/tox.ini +++ b/tox.ini @@ -17,6 +17,14 @@ deps = -r{toxinidir}/requirements.txt commands = python setup.py testr --slowest --testr-args='{posargs}' +[testenv:bindep] +# Do not install any requirements. We want this to be fast and work even if +# system dependencies are missing, since it's used to tell you what system +# dependencies are missing! This also means that bindep must be installed +# separately, outside of the requirements files. 
+deps = bindep +commands = bindep test + [testenv:pep8] commands = flake8 {posargs} From f53aafaf90749933d8d0e14f93e179006a6cc963 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 4 Jun 2016 12:36:56 -0400 Subject: [PATCH 076/152] Remove libselinux-python hack We are now running the zuul-worker element, so this logic can be removed. Change-Id: I7dd224a50817a638e2e65ba27aa8807a99d90702 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 45704c1460..fe2942906c 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -993,12 +993,6 @@ class NodeWorker(object): state='directory')) main_block.append(task) - # TODO: remove once zuul-worker DIB element has landed - main_block.append(dict(shell="[ -f /usr/bin/yum ] && " - "sudo /usr/bin/yum install " - "libselinux-python || " - "/bin/true")) - for builder in jjb_job.get('builders', []): if 'shell' in builder: main_block.extend( From 23aa196ac034999fe4e93fadc964898cf97cc75d Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 4 Jun 2016 12:48:18 -0400 Subject: [PATCH 077/152] Remove unused git_root variable Change-Id: Ie82b7c025d223c51f8d70c4bf010b568bd8bc226 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index fe2942906c..0f827c4d1a 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -78,8 +78,6 @@ class JobDir(object): def __init__(self, keep=False): self.keep = keep self.root = tempfile.mkdtemp() - self.git_root = os.path.join(self.root, 'git') - os.makedirs(self.git_root) self.ansible_root = os.path.join(self.root, 'ansible') os.makedirs(self.ansible_root) self.plugins_root = os.path.join(self.ansible_root, 'plugins') From 
293f7f839b91b0c45276d039f2d377b7203936e6 Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Mon, 30 May 2016 14:01:22 -0700 Subject: [PATCH 078/152] Python 3 Fix: use six.moves.urillib Use six.moves.urllib instead of urllib2 for python 3 compatability. Change-Id: Ia12a91d6164cd32080bafdeb5f4829e2d35f5e83 --- tests/base.py | 11 +++++------ tests/test_scheduler.py | 10 +++++----- tests/test_webapp.py | 23 ++++++++++++----------- zuul/connection/gerrit.py | 6 +++--- zuul/lib/swift.py | 4 ++-- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/base.py b/tests/base.py index de5601bbb4..3c28a726f7 100755 --- a/tests/base.py +++ b/tests/base.py @@ -22,6 +22,7 @@ import logging import os import pprint from six.moves import queue as Queue +from six.moves import urllib import random import re import select @@ -32,12 +33,10 @@ import subprocess import swiftclient import threading import time -import urllib2 import git import gear import fixtures -import six.moves.urllib.parse as urlparse import statsd import testtools from git import GitCommandError @@ -479,7 +478,7 @@ class FakeURLOpener(object): self.url = url def read(self): - res = urlparse.urlparse(self.url) + res = urllib.parse.urlparse(self.url) path = res.path project = '/'.join(path.split('/')[2:-2]) ret = '001e# service=git-upload-pack\n' @@ -947,12 +946,12 @@ class ZuulTestCase(BaseTestCase): self.sched.registerConnections(self.connections) def URLOpenerFactory(*args, **kw): - if isinstance(args[0], urllib2.Request): + if isinstance(args[0], urllib.request.Request): return old_urlopen(*args, **kw) return FakeURLOpener(self.upstream_root, *args, **kw) - old_urlopen = urllib2.urlopen - urllib2.urlopen = URLOpenerFactory + old_urlopen = urllib.request.urlopen + urllib.request.urlopen = URLOpenerFactory self.merge_server = zuul.merger.server.MergeServer(self.config, self.connections) diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index 053b1ccd33..ea512a2dbc 100755 --- 
a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -20,11 +20,10 @@ import os import re import shutil import time -import urllib -import urllib2 import yaml import git +from six.moves import urllib import testtools import zuul.change_matcher @@ -2275,8 +2274,8 @@ jobs: port = self.webapp.server.socket.getsockname()[1] - req = urllib2.Request("http://localhost:%s/status.json" % port) - f = urllib2.urlopen(req) + req = urllib.request.Request("http://localhost:%s/status.json" % port) + f = urllib.request.urlopen(req) headers = f.info() self.assertIn('Content-Length', headers) self.assertIn('Content-Type', headers) @@ -2881,7 +2880,8 @@ jobs: port = self.webapp.server.socket.getsockname()[1] - f = urllib.urlopen("http://localhost:%s/status.json" % port) + req = urllib.request.Request("http://localhost:%s/status.json" % port) + f = urllib.request.urlopen(req) data = f.read() self.worker.hold_jobs_in_build = False diff --git a/tests/test_webapp.py b/tests/test_webapp.py index b127c517e9..94f097a772 100644 --- a/tests/test_webapp.py +++ b/tests/test_webapp.py @@ -16,7 +16,8 @@ # under the License. import json -import urllib2 + +from six.moves import urllib from tests.base import ZuulTestCase @@ -44,41 +45,41 @@ class TestWebapp(ZuulTestCase): def test_webapp_status(self): "Test that we can filter to only certain changes in the webapp." 
- req = urllib2.Request( + req = urllib.request.Request( "http://localhost:%s/status" % self.port) - f = urllib2.urlopen(req) + f = urllib.request.urlopen(req) data = json.loads(f.read()) self.assertIn('pipelines', data) def test_webapp_status_compat(self): # testing compat with status.json - req = urllib2.Request( + req = urllib.request.Request( "http://localhost:%s/status.json" % self.port) - f = urllib2.urlopen(req) + f = urllib.request.urlopen(req) data = json.loads(f.read()) self.assertIn('pipelines', data) def test_webapp_bad_url(self): # do we 404 correctly - req = urllib2.Request( + req = urllib.request.Request( "http://localhost:%s/status/foo" % self.port) - self.assertRaises(urllib2.HTTPError, urllib2.urlopen, req) + self.assertRaises(urllib.error.HTTPError, urllib.request.urlopen, req) def test_webapp_find_change(self): # can we filter by change id - req = urllib2.Request( + req = urllib.request.Request( "http://localhost:%s/status/change/1,1" % self.port) - f = urllib2.urlopen(req) + f = urllib.request.urlopen(req) data = json.loads(f.read()) self.assertEqual(1, len(data), data) self.assertEqual("org/project", data[0]['project']) - req = urllib2.Request( + req = urllib.request.Request( "http://localhost:%s/status/change/2,1" % self.port) - f = urllib2.urlopen(req) + f = urllib.request.urlopen(req) data = json.loads(f.read()) self.assertEqual(1, len(data), data) diff --git a/zuul/connection/gerrit.py b/zuul/connection/gerrit.py index a1854f4ba4..ae1e3198b7 100644 --- a/zuul/connection/gerrit.py +++ b/zuul/connection/gerrit.py @@ -18,11 +18,11 @@ import select import json import time from six.moves import queue as Queue +from six.moves import urllib import paramiko import logging import pprint import voluptuous as v -import urllib2 from zuul.connection import BaseConnection from zuul.model import TriggerEvent @@ -388,10 +388,10 @@ class GerritConnection(BaseConnection): url = "%s/p/%s/info/refs?service=git-upload-pack" % ( self.baseurl, project) try: - 
data = urllib2.urlopen(url).read() + data = urllib.request.urlopen(url).read() except: self.log.error("Cannot get references from %s" % url) - raise # keeps urllib2 error informations + raise # keeps urllib error informations ret = {} read_headers = False read_advertisement = False diff --git a/zuul/lib/swift.py b/zuul/lib/swift.py index 3c411d3ff1..b5d3bc7164 100644 --- a/zuul/lib/swift.py +++ b/zuul/lib/swift.py @@ -19,8 +19,8 @@ from time import time import os import random import six +from six.moves import urllib import string -import urlparse class Swift(object): @@ -156,7 +156,7 @@ class Swift(object): url = os.path.join(self.storage_url, settings['container'], settings['file_path_prefix'], destination_prefix) - u = urlparse.urlparse(url) + u = urllib.parse.urlparse(url) hmac_body = '%s\n%s\n%s\n%s\n%s' % (u.path, redirect, settings['max_file_size'], From 1b9bd78a710bf9e078d773c4adde20781cb887aa Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Mon, 30 May 2016 14:03:30 -0700 Subject: [PATCH 079/152] Python 3 Fix: use six.reraise Use six.reraise to transfer exceptions across threads instead of the old python 2 syntax. 
Change-Id: Ie8d401811e12f9199a6fa88100ed2acd9385343c --- zuul/scheduler.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/zuul/scheduler.py b/zuul/scheduler.py index 30a6c81437..beab8792d7 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -20,6 +20,7 @@ import json import logging import os import pickle +import six from six.moves import queue as Queue import re import sys @@ -125,12 +126,10 @@ class ManagementEvent(object): """An event that should be processed within the main queue run loop""" def __init__(self): self._wait_event = threading.Event() - self._exception = None - self._traceback = None + self._exc_info = None - def exception(self, e, tb): - self._exception = e - self._traceback = tb + def exception(self, exc_info): + self._exc_info = exc_info self._wait_event.set() def done(self): @@ -138,8 +137,8 @@ class ManagementEvent(object): def wait(self, timeout=None): self._wait_event.wait(timeout) - if self._exception: - raise self._exception, None, self._traceback + if self._exc_info: + six.reraise(*self._exc_info) return self._wait_event.is_set() @@ -1053,8 +1052,8 @@ class Scheduler(threading.Thread): else: self.log.error("Unable to handle event %s" % event) event.done() - except Exception as e: - event.exception(e, sys.exc_info()[2]) + except Exception: + event.exception(sys.exc_info()) self.management_event_queue.task_done() def process_result_queue(self): From 822e807154fbb7b143b89c62ba067a6f0142c2a9 Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Mon, 30 May 2016 14:05:46 -0700 Subject: [PATCH 080/152] Python 3 Fix: use bytesIO instead of cStringIO With the move from cStringIO to the io module, use six..BytesIO as a python 3 compatible version of cStringIO module. 
Change-Id: Idc62004b28f79c5ef2e3aa0250fce30882f4be65 --- zuul/cmd/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zuul/cmd/__init__.py b/zuul/cmd/__init__.py index 2902c50630..5ffd431720 100644 --- a/zuul/cmd/__init__.py +++ b/zuul/cmd/__init__.py @@ -14,8 +14,8 @@ # License for the specific language governing permissions and limitations # under the License. +import six from six.moves import configparser as ConfigParser -import cStringIO import extras import logging import logging.config @@ -47,7 +47,7 @@ def stack_dump_handler(signum, frame): yappi.start() else: yappi.stop() - yappi_out = cStringIO.StringIO() + yappi_out = six.BytesIO() yappi.get_func_stats().print_all(out=yappi_out) yappi.get_thread_stats().print_all(out=yappi_out) log.debug(yappi_out.getvalue()) From a737cbdcdb70fb7dfcee46b21dda7bdb572c6214 Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Mon, 30 May 2016 14:16:11 -0700 Subject: [PATCH 081/152] Python 3 Fix: use proper octal notation Python3 requires octal notation such as 0o755. Use this as the compatible syntax instead of the old 0755 syntax for octal. 
Change-Id: I0ff2f62ba4b865531adbeba66beec9b7efdc22b4 --- zuul/launcher/ansiblelaunchserver.py | 2 +- zuul/merger/merger.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 5ecc954a48..6fa78b98b6 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -797,7 +797,7 @@ class NodeWorker(object): remote_path = os.path.join('/tmp', script_fn) copy = dict(src=script_path, dest=remote_path, - mode=0555) + mode=0o555) task = dict(copy=copy) tasks.append(task) diff --git a/zuul/merger/merger.py b/zuul/merger/merger.py index c6ae35d79a..3bc29e61ac 100644 --- a/zuul/merger/merger.py +++ b/zuul/merger/merger.py @@ -210,7 +210,7 @@ class Merger(object): fd.write('#!/bin/bash\n') fd.write('ssh -i %s $@\n' % key) fd.close() - os.chmod(name, 0755) + os.chmod(name, 0o755) def addProject(self, project, url): repo = None From 9c4700afa38758cdd554fc879f7deb0288557167 Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Mon, 30 May 2016 14:25:19 -0700 Subject: [PATCH 082/152] Python 3 Fix: Absolute Imports Use absolute imports instead of relative imports for python 3 compatability. 
Change-Id: I1203ad26c66ab799955c7cb46ea1a98d7f512a41 --- zuul/merger/server.py | 2 +- zuul/rpclistener.py | 2 +- zuul/scheduler.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/zuul/merger/server.py b/zuul/merger/server.py index 30cd732ecb..d56993c65a 100644 --- a/zuul/merger/server.py +++ b/zuul/merger/server.py @@ -19,7 +19,7 @@ import traceback import gear -import merger +from zuul.merger import merger class MergeServer(object): diff --git a/zuul/rpclistener.py b/zuul/rpclistener.py index d54da9f38e..83d119f02c 100644 --- a/zuul/rpclistener.py +++ b/zuul/rpclistener.py @@ -21,7 +21,7 @@ import traceback import gear import six -import model +from zuul import model class RPCListener(object): diff --git a/zuul/scheduler.py b/zuul/scheduler.py index beab8792d7..dcc5f88e87 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -28,10 +28,10 @@ import threading import time import yaml -import layoutvalidator -import model -from model import Pipeline, Project, ChangeQueue -from model import ChangeishFilter, NullChange +from zuul import layoutvalidator +from zuul import model +from zuul.model import Pipeline, Project, ChangeQueue +from zuul.model import ChangeishFilter, NullChange from zuul import change_matcher, exceptions from zuul import version as zuul_version From e77bf87fd4a00150feec13f783cc4bf0c5987625 Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Mon, 30 May 2016 14:31:08 -0700 Subject: [PATCH 083/152] Python 3 Fixes: use six.moves.configparser Use six.moves.configparser instead of ConfigParser for python3 compat. 
Change-Id: Ib96e3f1f6cb6959529b3982209e7998ec14d76f5 --- tests/test_layoutvalidator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_layoutvalidator.py b/tests/test_layoutvalidator.py index 3de4a94d56..46a8c7ccf3 100644 --- a/tests/test_layoutvalidator.py +++ b/tests/test_layoutvalidator.py @@ -14,7 +14,7 @@ # License for the specific language governing permissions and limitations # under the License. -import ConfigParser +from six.moves import configparser as ConfigParser import os import re From 74fa3865ac65d33f103b83974175cabe43cc75a8 Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Thu, 2 Jun 2016 07:39:49 +0300 Subject: [PATCH 084/152] Python 3 Fixes: Replace missing builtins There are a few different missing builtins in python3 that we're using. This shows up when running tox pep8 under python3 which is needed for the streamer work. Change-Id: I1b2ef0b7bdcd1a85895576682455745fe06e880b --- tests/base.py | 5 +++-- tox.ini | 2 ++ zuul/launcher/gearman.py | 3 ++- zuul/model.py | 6 +++++- zuul/scheduler.py | 4 +++- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/base.py b/tests/base.py index 3c28a726f7..e7da178f78 100755 --- a/tests/base.py +++ b/tests/base.py @@ -27,6 +27,7 @@ import random import re import select import shutil +from six.moves import reload_module import socket import string import subprocess @@ -916,8 +917,8 @@ class ZuulTestCase(BaseTestCase): os.environ['STATSD_PORT'] = str(self.statsd.port) self.statsd.start() # the statsd client object is configured in the statsd module import - reload(statsd) - reload(zuul.scheduler) + reload_module(statsd) + reload_module(zuul.scheduler) self.gearman_server = FakeGearmanServer() diff --git a/tox.ini b/tox.ini index 79ea939b11..443cc1ac49 100644 --- a/tox.ini +++ b/tox.ini @@ -18,6 +18,8 @@ commands = python setup.py testr --slowest --testr-args='{posargs}' [testenv:pep8] +# streamer is python3 only, so we need to run flake8 in python3 +basepython = 
python3 commands = flake8 {posargs} [testenv:cover] diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py index f3b867ce93..0d9b4fbc3e 100644 --- a/zuul/launcher/gearman.py +++ b/zuul/launcher/gearman.py @@ -17,6 +17,7 @@ import inspect import json import logging import os +import six import time import threading from uuid import uuid4 @@ -231,7 +232,7 @@ class Gearman(object): s_config = {} s_config.update((k, v.format(item=item, job=job, change=item.change)) - if isinstance(v, basestring) + if isinstance(v, six.string_types) else (k, v) for k, v in s.items()) diff --git a/zuul/model.py b/zuul/model.py index 3fb0577f23..542d0b6cb9 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -110,7 +110,11 @@ class Pipeline(object): return job_tree def getProjects(self): - return sorted(self.job_trees.keys(), lambda a, b: cmp(a.name, b.name)) + # cmp is not in python3, applied idiom from + # http://python-future.org/compatible_idioms.html#cmp + return sorted( + self.job_trees.keys(), + key=lambda p: p.name) def addQueue(self, queue): self.queues.append(queue) diff --git a/zuul/scheduler.py b/zuul/scheduler.py index dcc5f88e87..f08612d3e5 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -411,7 +411,9 @@ class Scheduler(threading.Thread): base = os.path.dirname(os.path.realpath(config_path)) fn = os.path.join(base, fn) fn = os.path.expanduser(fn) - execfile(fn, config_env) + with open(fn) as _f: + code = compile(_f.read(), fn, 'exec') + six.exec_(code, config_env) for conf_pipeline in data.get('pipelines', []): pipeline = Pipeline(conf_pipeline['name']) From 05d9266c6f7f7c7d70bac02450c7a5926dc76083 Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Sat, 4 Jun 2016 17:32:19 -0700 Subject: [PATCH 085/152] Python 3 Fix: dict().iteritems no longer exists Use six.iteritems() instead of dict().iteritems. 
Change-Id: I53f7d18a06b8f0b8dba906824db9d2a44d4335d1 --- zuul/lib/clonemapper.py | 9 ++++++--- zuul/lib/cloner.py | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/zuul/lib/clonemapper.py b/zuul/lib/clonemapper.py index ae558cd8d0..57ac177100 100644 --- a/zuul/lib/clonemapper.py +++ b/zuul/lib/clonemapper.py @@ -19,6 +19,9 @@ import logging import os import re +import six + + OrderedDict = extras.try_imports(['collections.OrderedDict', 'ordereddict.OrderedDict']) @@ -59,17 +62,17 @@ class CloneMapper(object): raise Exception("Expansion error. Check error messages above") self.log.info("Mapping projects to workspace...") - for project, dest in ret.iteritems(): + for project, dest in six.iteritems(ret): dest = os.path.normpath(os.path.join(workspace, dest[0])) ret[project] = dest self.log.info(" %s -> %s", project, dest) self.log.debug("Checking overlap in destination directories...") check = defaultdict(list) - for project, dest in ret.iteritems(): + for project, dest in six.iteritems(ret): check[dest].append(project) - dupes = dict((d, p) for (d, p) in check.iteritems() if len(p) > 1) + dupes = dict((d, p) for (d, p) in six.iteritems(check) if len(p) > 1) if dupes: raise Exception("Some projects share the same destination: %s", dupes) diff --git a/zuul/lib/cloner.py b/zuul/lib/cloner.py index f0235a6965..3155df68e5 100644 --- a/zuul/lib/cloner.py +++ b/zuul/lib/cloner.py @@ -19,6 +19,8 @@ import os import re import yaml +import six + from git import GitCommandError from zuul.lib.clonemapper import CloneMapper from zuul.merger.merger import Repo @@ -62,7 +64,7 @@ class Cloner(object): dests = mapper.expand(workspace=self.workspace) self.log.info("Preparing %s repositories", len(dests)) - for project, dest in dests.iteritems(): + for project, dest in six.iteritems(dests): self.prepareRepo(project, dest) self.log.info("Prepared all repositories") From 87e4ab075a1c90ae90bcc9f03d13e113c1a7b04f Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Wed, 
8 Jun 2016 14:17:20 -0400 Subject: [PATCH 086/152] Fix referenced before assignment for BuildCompletedEvent It was possible for duration to be referenced before assignment. Traceback (most recent call last): File "/usr/local/lib/python2.7/dist-packages/zuul/scheduler.py", line 1108, in _doBuildCompletedEvent except Exception: UnboundLocalError: local variable 'duration' referenced before assignment Change-Id: I48446f9cdb23bfd37c1a843dbcf90f94492a0678 Signed-off-by: Paul Belanger --- zuul/scheduler.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/zuul/scheduler.py b/zuul/scheduler.py index beab8792d7..c3fd3c9b6b 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -1103,10 +1103,11 @@ class Scheduler(threading.Thread): return if build.end_time and build.start_time and build.result: duration = build.end_time - build.start_time - try: - self.time_database.update(build.job.name, duration, build.result) - except Exception: - self.log.exception("Exception recording build time:") + try: + self.time_database.update( + build.job.name, duration, build.result) + except Exception: + self.log.exception("Exception recording build time:") pipeline.manager.onBuildCompleted(event.build) def _doMergeCompletedEvent(self, event): From 3e205c2164c046743870259e946e524229736445 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 8 Jun 2016 11:28:23 -0700 Subject: [PATCH 087/152] Ansible launcher: register a noop set_description job So that Zuul doesn't keep trying to see if it's registered. 
Change-Id: Ie9b690ae51017a84720e4b4991b53aeba41bb848 --- zuul/launcher/ansiblelaunchserver.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 0f827c4d1a..0b3dc46508 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -237,6 +237,7 @@ class LaunchServer(object): if self.accept_nodes: new_functions.add("node-assign:zuul") new_functions.add("stop:%s" % self.hostname) + new_functions.add("set_description:%s" % self.hostname) for function in new_functions - self.registered_functions: self.worker.registerFunction(function) @@ -365,6 +366,10 @@ class LaunchServer(object): elif job.name.startswith('stop:'): self.log.debug("Got stop job: %s" % job.unique) self.stopJob(job) + elif job.name.startswith('set_description:'): + self.log.debug("Got set_description job: %s" % + job.unique) + job.sendWorkComplete() else: self.log.error("Unable to handle job %s" % job.name) job.sendWorkFail() From 9208dc1c0a859642deece4f4be5f43fae065c945 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Wed, 8 Jun 2016 16:43:55 -0400 Subject: [PATCH 088/152] Make isJobRegistered for gearman optional With openstack-infra we actually validate our jobs exist before merging them into our zuul configuration. As a result, the registration check in zuul is redundant. So provide a way for the check to be enable / disabled. Change-Id: I06f2221770e04f958ce7ac4cfe8d5d92d7164cac Signed-off-by: Paul Belanger --- doc/source/zuul.rst | 5 +++++ zuul/launcher/gearman.py | 10 ++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/zuul.rst b/doc/source/zuul.rst index 98e4bb8a2a..07b777a2fe 100644 --- a/doc/source/zuul.rst +++ b/doc/source/zuul.rst @@ -49,6 +49,11 @@ gearman Port on which the Gearman server is listening. ``port=4730`` +**check_job_registration** + Check to see if job is registered with Gearman or not. 
When True + a build result of NOT_REGISTERED will be return if job is not found. + ``check_job_registration=True`` + gearman_server """""""""""""" diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py index f3b867ce93..3556b4540c 100644 --- a/zuul/launcher/gearman.py +++ b/zuul/launcher/gearman.py @@ -164,6 +164,11 @@ class Gearman(object): port = config.get('gearman', 'port') else: port = 4730 + if config.has_option('gearman', 'check_job_registration'): + self.job_registration = config.getboolean( + 'gearman', 'check_job_registration') + else: + self.job_registration = True self.gearman = ZuulGearmanClient(self) self.gearman.addServer(server, port) @@ -351,7 +356,8 @@ class Gearman(object): build.__gearman_job = gearman_job self.builds[uuid] = build - if not self.isJobRegistered(gearman_job.name): + if self.job_registration and not self.isJobRegistered( + gearman_job.name): self.log.error("Job %s is not registered with Gearman" % gearman_job) self.onBuildCompleted(gearman_job, 'NOT_REGISTERED') @@ -502,7 +508,7 @@ class Gearman(object): # us where the job is running. return False - if not self.isJobRegistered(name): + if self.job_registration and not self.isJobRegistered(name): return False desc_uuid = str(uuid4().hex) From 48bc91a16586dd64cb5ec6da4320d1afbd97a622 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 8 Jun 2016 14:11:30 -0700 Subject: [PATCH 089/152] Ansible launcher: use underscores where appropriate Zuul config file options and zuul gearman functions both use underscores. Switch to those before this gets out of hand. Also use getboolean for getting the boolean config option. 
Change-Id: Ibecf6b9f0786860e4a0d8653ce5c210dd11a9971 --- zuul/launcher/ansiblelaunchserver.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 0b3dc46508..9609a55a60 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -114,10 +114,16 @@ class LaunchServer(object): self.termination_queue = Queue.Queue() self.sites = {} self.static_nodes = {} - if config.has_option('launcher', 'accept-nodes'): - self.accept_nodes = config.get('launcher', 'accept-nodes') + if config.has_option('launcher', 'accept_nodes'): + self.accept_nodes = config.getboolean('launcher', + 'accept_nodes') else: - self.accept_nodes = True + # TODO(jeblair): remove deprecated form of option + if config.has_option('launcher', 'accept-nodes'): + self.accept_nodes = config.getboolean('launcher', + 'accept-nodes') + else: + self.accept_nodes = True if self.config.has_option('zuul', 'state_dir'): state_dir = os.path.expanduser( @@ -235,6 +241,8 @@ class LaunchServer(object): def register(self): new_functions = set() if self.accept_nodes: + new_functions.add("node_assign:zuul") + # TODO(jeblair): remove deprecated form new_functions.add("node-assign:zuul") new_functions.add("stop:%s" % self.hostname) new_functions.add("set_description:%s" % self.hostname) @@ -360,7 +368,11 @@ class LaunchServer(object): try: job = self.worker.getJob() try: - if job.name.startswith('node-assign:'): + if job.name.startswith('node_assign:'): + self.log.debug("Got node_assign job: %s" % job.unique) + self.assignNode(job) + elif job.name.startswith('node-assign:'): + # TODO(jeblair): remove deprecated form self.log.debug("Got node-assign job: %s" % job.unique) self.assignNode(job) elif job.name.startswith('stop:'): From e6febc79bb140c48b3160d90881b2407a5e8a1e5 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Thu, 9 Jun 2016 13:22:03 -0700 Subject: [PATCH 090/152] Ansible launcher: make get_env safer Previously, this function would fail on lines with comments. Change-Id: I694281777e6d8a51ceb943f8656189e75544ab05 --- zuul/ansible/library/zuul_runner.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index f49016291e..20b560018a 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -18,6 +18,7 @@ import datetime import getpass import os +import re import subprocess import threading @@ -46,7 +47,12 @@ def get_env(): if os.path.exists(fn): with open(fn) as f: for line in f: - k, v = line.strip().split('=') + line = re.sub('#.*', '', line).strip() + if not line: + continue + if '=' not in line: + continue + k, v = line.split('=') env[k] = v return env From 5d5082dd035bac36fd179afef6fb0d18e3344793 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 9 Jun 2016 13:42:40 -0700 Subject: [PATCH 091/152] Ansible launcher: don't report result on abort When a worker is asked to abort a job, do not send a result to zuul. If Zuul asked for the abort, Zuul will ignore the result anyway. If the operator asked for the abort (i.e., by shutting down the launcher) then the null result will suggest to Zuul it should relaunch the job. 
Change-Id: I8ad35e70f76ede31b78c67ebfb0104554b28e8c2 --- zuul/launcher/ansiblelaunchserver.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f7bf66f633..1acb1e1fef 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -490,6 +490,7 @@ class NodeWorker(object): self._got_job = False self._job_complete_event = threading.Event() self._running_job = False + self._aborted_job = False self._sent_complete_event = False self.workspace_root = config.get('launcher', 'workspace_root') if self.config.has_option('launcher', 'private_key_file'): @@ -651,6 +652,7 @@ class NodeWorker(object): self.registered_functions = new_functions def abortRunningJob(self): + self._aborted_job = True return self.abortRunningProc(self.ansible_job_proc) def abortRunningProc(self, proc): @@ -687,6 +689,7 @@ class NodeWorker(object): # whether the job actually runs result = None self._sent_complete_event = False + self._aborted_job = False try: self.sendStartEvent(job_name, args) @@ -785,7 +788,10 @@ class NodeWorker(object): else: status = 'FAILURE' - result = json.dumps(dict(result=status)) + if not self._aborted_job: + # A Null result will cause zuul to relaunch the job if + # it needs to. + result = json.dumps(dict(result=status)) return result From 5d58b206baf7332f38a81dd4afbc8b5db837733d Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 9 Jun 2016 14:01:11 -0700 Subject: [PATCH 092/152] Ansible launcher: initialize pid variables These variables were uninitialized, which could cause a problem if we stopped or released a node which had not run a job. We would try to call abortRunningProc before they had been defined. 
Change-Id: I79c6b14336b039e0f07727be977c3152fbf190af --- zuul/launcher/ansiblelaunchserver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 1acb1e1fef..f0b695d6d2 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -492,6 +492,8 @@ class NodeWorker(object): self._running_job = False self._aborted_job = False self._sent_complete_event = False + self.ansible_job_proc = None + self.ansible_post_proc = None self.workspace_root = config.get('launcher', 'workspace_root') if self.config.has_option('launcher', 'private_key_file'): self.private_key_file = config.get('launcher', 'private_key_file') From 7fac225244ab92e505182fe23446824b07da38c0 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 3 Jun 2016 14:21:48 -0700 Subject: [PATCH 093/152] Ansible launcher: Add graceful stop command Stops accepting new nodes/jobs, releases idle nodes, waits for running jobs to finish, then exits. 
Change-Id: I8d5dfc101fce0d3d6e1ebd1884ec29cbb1de7841 --- zuul/launcher/ansiblelaunchserver.py | 34 +++++++++++++++++++--------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f0b695d6d2..351067291e 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -42,7 +42,7 @@ ANSIBLE_DEFAULT_TIMEOUT = 2 * 60 * 60 ANSIBLE_DEFAULT_POST_TIMEOUT = 10 * 60 -COMMANDS = ['reconfigure', 'stop', 'pause', 'unpause', 'release'] +COMMANDS = ['reconfigure', 'stop', 'pause', 'unpause', 'release', 'graceful'] def boolify(x): @@ -114,6 +114,15 @@ class LaunchServer(object): self.termination_queue = Queue.Queue() self.sites = {} self.static_nodes = {} + self.command_map = dict( + reconfigure=self.reconfigure, + stop=self.stop, + pause=self.pause, + unpause=self.unpause, + release=self.release, + graceful=self.graceful, + ) + if config.has_option('launcher', 'accept_nodes'): self.accept_nodes = config.getboolean('launcher', 'accept_nodes') @@ -304,6 +313,18 @@ class LaunchServer(object): "to worker:") self.log.debug("Finished releasing idle nodes") + def graceful(self): + # Note: this is run in the command processing thread; no more + # external commands will be processed after this. 
+ self.log.debug("Gracefully stopping") + self.pause() + self.release() + self.log.debug("Waiting for all builds to finish") + while self.builds: + time.sleep(5) + self.log.debug("All builds are finished") + self.stop() + def stop(self): self.log.debug("Stopping") # First, stop accepting new jobs @@ -337,16 +358,7 @@ class LaunchServer(object): while self._command_running: try: command = self.command_socket.get() - if command == 'reconfigure': - self.reconfigure() - elif command == 'stop': - self.stop() - elif command == 'pause': - self.pause() - elif command == 'unpause': - self.unpause() - elif command == 'release': - self.release() + self.command_map[command]() except Exception: self.log.exception("Exception while processing command") From 48293986373d7f40f53024ccb55497786245faa5 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 8 Jun 2016 14:28:57 -0700 Subject: [PATCH 094/152] Ansible launcher: remove deprecated hyphens Change-Id: I140980f2a8857bf772f1314fad9cb24e0f79506a --- zuul/launcher/ansiblelaunchserver.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 351067291e..f644c14b62 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -127,12 +127,7 @@ class LaunchServer(object): self.accept_nodes = config.getboolean('launcher', 'accept_nodes') else: - # TODO(jeblair): remove deprecated form of option - if config.has_option('launcher', 'accept-nodes'): - self.accept_nodes = config.getboolean('launcher', - 'accept-nodes') - else: - self.accept_nodes = True + self.accept_nodes = True if self.config.has_option('zuul', 'state_dir'): state_dir = os.path.expanduser( @@ -251,8 +246,6 @@ class LaunchServer(object): new_functions = set() if self.accept_nodes: new_functions.add("node_assign:zuul") - # TODO(jeblair): remove deprecated form - new_functions.add("node-assign:zuul") new_functions.add("stop:%s" % 
self.hostname) new_functions.add("set_description:%s" % self.hostname) @@ -383,10 +376,6 @@ class LaunchServer(object): if job.name.startswith('node_assign:'): self.log.debug("Got node_assign job: %s" % job.unique) self.assignNode(job) - elif job.name.startswith('node-assign:'): - # TODO(jeblair): remove deprecated form - self.log.debug("Got node-assign job: %s" % job.unique) - self.assignNode(job) elif job.name.startswith('stop:'): self.log.debug("Got stop job: %s" % job.unique) self.stopJob(job) From fdae225f4e86dd142755ed81737e763da58f1bb3 Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Tue, 7 Jun 2016 20:13:20 -0700 Subject: [PATCH 095/152] Python 3 Fix: Divide operator returns float In Python3 division now explicitly returns a float instead of only if/when a float is one of the objects involved with division. Change-Id: I3632fd731b1538ce9fe8a39dce66269425b6076b --- zuul/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/model.py b/zuul/model.py index 542d0b6cb9..ca8f0987ca 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -421,7 +421,7 @@ class ChangeQueue(object): elif self.window_decrease_type == 'exponential': self.window = max( self.window_floor, - self.window / self.window_decrease_factor) + int(self.window / self.window_decrease_factor)) class Project(object): From 622c96873773e1ebef733c0eda733cd6c1521e05 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 9 Jun 2016 08:14:53 -0700 Subject: [PATCH 096/152] Fix timeout debug print in tests Commit 4c6a7744 introduced an error in formatting the queue status. This corrects that and also changes the print statements to debug logs so they are easier to follow. 
Change-Id: I412ad6c2e460c5ee15cc0e5a3956a513b7cd7138 --- tests/base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/base.py b/tests/base.py index e7da178f78..66bf16a3df 100755 --- a/tests/base.py +++ b/tests/base.py @@ -1296,8 +1296,11 @@ class ZuulTestCase(BaseTestCase): start = time.time() while True: if time.time() - start > 10: - print('queue status:', ''.join(self.eventQueuesEmpty())) - print(self.areAllBuildsWaiting()) + self.log.debug("Queue status:") + for queue in self.event_queues: + self.log.debug(" %s: %s" % (queue, queue.empty())) + self.log.debug("All builds waiting: %s" % + (self.areAllBuildsWaiting(),)) raise Exception("Timeout waiting for Zuul to settle") # Make sure no new events show up while we're checking self.worker.lock.acquire() From 03fcee46f92c3ccde2856f180608fa1b25b8ae64 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 10 Jun 2016 09:30:35 -0400 Subject: [PATCH 097/152] Implement BUILD_TIMEOUT env variable Looking at the code, I seen no way for this to be setup. Specifically, this is needed by devstack-gate and our build-timeout macro. 
Change-Id: I4087cc30a5bc888fa56b6c09190d36140b5b9174 Closes-Bug: #1591102 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f644c14b62..9647eb8f7a 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -990,15 +990,19 @@ class NodeWorker(object): inventory.write('\n') timeout = None + timeout_var = None for wrapper in jjb_job.get('wrappers', []): if isinstance(wrapper, dict): timeout = wrapper.get('build-timeout', {}) if isinstance(timeout, dict): timeout = timeout.get('timeout') + timeout_var = timeout.get('timeout-var', None) if timeout: timeout = timeout * 60 if not timeout: timeout = ANSIBLE_DEFAULT_TIMEOUT + if timeout_var: + parameters[timeout_var] = timeout with open(jobdir.playbook, 'w') as playbook: tasks = [] From 555ffe037d12fe7a6da1f36dab1ce9562bfd4a10 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 10 Jun 2016 11:14:34 -0400 Subject: [PATCH 098/152] Requeue jobs if AnsibleHostUnreachable return If ansible-playbook return AnsibleHostUnreachable (3), we can assume there was a network issue runing our playbook. Rather then continuing and potentially posting no logs, we can have zuul requeue the job to try again. Change-Id: I8c4d92fa7156c7d8591ca5b54be8cb30c249eacf Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f644c14b62..82e17171fc 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -785,6 +785,12 @@ class NodeWorker(object): job.sendWorkStatus(0, 100) job_status = self.runAnsiblePlaybook(jobdir, timeout) + if job_status == 3: + # AnsibleHostUnreachable: We had a network issue connecting to + # our zuul-worker. 
Rather then contiune, have zuul requeue the + # job. + return result + post_status = self.runAnsiblePostPlaybook(jobdir, job_status) if job_status and post_status: status = 'SUCCESS' From 5e17058546eeabb4ed91ff25d3442f8158ddb24c Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 10 Jun 2016 12:41:01 -0400 Subject: [PATCH 099/152] Use pull mode for synchronize in _makeFTPTask This brings the task inline with how our SCPTask works. And also fixes failures with our releasenotes jobs. Change-Id: I81200582f76a49398cbfbad9082a0181a524f5ff Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f644c14b62..3b9584a03a 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -913,7 +913,8 @@ class NodeWorker(object): parameters) syncargs = dict(src=src, dest=ftpcontent, - copy_links='yes') + copy_links='yes', + mode='pull') if rsync_opts: syncargs['rsync_opts'] = rsync_opts task = dict(synchronize=syncargs, From d34e0b4dc7f1717ce33fd65f2bffa292d053a0dd Mon Sep 17 00:00:00 2001 From: Morgan Fainberg Date: Thu, 9 Jun 2016 19:10:38 -0700 Subject: [PATCH 100/152] Reduce Log Size To reduce the testrepository.subunit output, eliminate debugging logs from gear.Server and gear.Client. This is handled via an ENV defined in the tox.ini called `OS_LOG_DEFAULTS`. Any module can be specified in the typicall python logging format (e.g. "gear.Server=INFO"). Each entry should be comma separated. For each valid entry, a fake logger is created with the log level set to that level. An invalid format will be skipped (expected: `=`). An invalid logging level will default to logging.DEBUG. Specifying OS_LOG_DEFAULT as an ENV var prior to running tox will override the default values defined in tox.ini. 
Change-Id: I893418435c538bfcedb803d12b57832c8111f06f --- .testr.conf | 2 +- tests/base.py | 22 ++++++++++++++++++++++ tox.ini | 1 + 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/.testr.conf b/.testr.conf index 5433c070e3..222ce97160 100644 --- a/.testr.conf +++ b/.testr.conf @@ -1,4 +1,4 @@ [DEFAULT] -test_command=OS_STDOUT_CAPTURE=${OS_STDOUT_CAPTURE:-1} OS_STDERR_CAPTURE=${OS_STDERR_CAPTURE:-1} OS_LOG_CAPTURE=${OS_LOG_CAPTURE:-1} ${PYTHON:-python} -m subunit.run discover -t ./ tests $LISTOPT $IDOPTION +test_command=OS_STDOUT_CAPTURE=${OS_STDOUT_CAPTURE:-1} OS_STDERR_CAPTURE=${OS_STDERR_CAPTURE:-1} OS_LOG_CAPTURE=${OS_LOG_CAPTURE:-1} OS_LOG_DEFAULTS=${OS_LOG_DEFAULTS:-""} ${PYTHON:-python} -m subunit.run discover -t ./ tests $LISTOPT $IDOPTION test_id_option=--load-list $IDFILE test_list_option=--list diff --git a/tests/base.py b/tests/base.py index e7da178f78..5b31eea62f 100755 --- a/tests/base.py +++ b/tests/base.py @@ -862,6 +862,28 @@ class BaseTestCase(testtools.TestCase): format='%(asctime)s %(name)-32s ' '%(levelname)-8s %(message)s')) + # NOTE(notmorgan): Extract logging overrides for specific libraries + # from the OS_LOG_DEFAULTS env and create FakeLogger fixtures for + # each. This is used to limit the output during test runs from + # libraries that zuul depends on such as gear. 
+ log_defaults_from_env = os.environ.get('OS_LOG_DEFAULTS') + + if log_defaults_from_env: + for default in log_defaults_from_env.split(','): + try: + name, level_str = default.split('=', 1) + level = getattr(logging, level_str, logging.DEBUG) + self.useFixture(fixtures.FakeLogger( + name=name, + level=level, + format='%(asctime)s %(name)-32s ' + '%(levelname)-8s %(message)s')) + except ValueError: + # NOTE(notmorgan): Invalid format of the log default, + # skip and don't try and apply a logger for the + # specified module + pass + class ZuulTestCase(BaseTestCase): diff --git a/tox.ini b/tox.ini index 443cc1ac49..a8767c273e 100644 --- a/tox.ini +++ b/tox.ini @@ -9,6 +9,7 @@ setenv = STATSD_HOST=127.0.0.1 STATSD_PORT=8125 VIRTUAL_ENV={envdir} OS_TEST_TIMEOUT=30 + OS_LOG_DEFAULTS={env:OS_LOG_DEFAULTS:gear.Server=INFO,gear.Client=INFO} passenv = ZUUL_TEST_ROOT usedevelop = True install_command = pip install {opts} {packages} From 28178bf6fc437b92001858b331faf6aa5db8fb2f Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 10 Jun 2016 13:39:31 -0700 Subject: [PATCH 101/152] Ansible launcher: fix timeout var We reassigned the timeout value over top of the dictionary. This leaves the dictionary alone so it can be reused. 
Change-Id: I4bb2fb24eecea5ba28ac2bb78fb477de6236331a --- zuul/launcher/ansiblelaunchserver.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index af6eac10d5..87898b6eb9 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -999,10 +999,10 @@ class NodeWorker(object): timeout_var = None for wrapper in jjb_job.get('wrappers', []): if isinstance(wrapper, dict): - timeout = wrapper.get('build-timeout', {}) - if isinstance(timeout, dict): - timeout = timeout.get('timeout') - timeout_var = timeout.get('timeout-var', None) + build_timeout = wrapper.get('build-timeout', {}) + if isinstance(build_timeout, dict): + timeout_var = build_timeout.get('timeout-var', None) + timeout = build_timeout.get('timeout') if timeout: timeout = timeout * 60 if not timeout: From a6bbd4001f5acfd129fe7f8e3042d6b80999b8ce Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 10 Jun 2016 13:50:18 -0700 Subject: [PATCH 102/152] Ansible launcher: be smarter about env quotes When reading the initial env from files, handled quoted values. Also, only ignore lines that only start with '#' (which matches pam_env behavior). 
Change-Id: Ie84db4fdb5d70b3da062b48920eb4ca603ebeba7 --- zuul/ansible/library/zuul_runner.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index 20b560018a..5a388073f7 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -18,7 +18,6 @@ import datetime import getpass import os -import re import subprocess import threading @@ -47,12 +46,16 @@ def get_env(): if os.path.exists(fn): with open(fn) as f: for line in f: - line = re.sub('#.*', '', line).strip() if not line: continue + if line[0] == '#': + continue if '=' not in line: continue - k, v = line.split('=') + k, v = line.strip().split('=') + for q in ["'", '"']: + if v[0] == q: + v = v.strip(q) env[k] = v return env From 490f4aa7d81ce497b7b6ed8854f63ad886f242cb Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 10 Jun 2016 14:39:34 -0700 Subject: [PATCH 103/152] Increase the Gerrit trailing delay We're starting to see inconsistent data from gerrit on uploads of (somewhat) long patch series to nova. Increase the trailing delay to give Gerrit time to make itself consistent. Hopefully we can reduce this in the future after we work out the GC issues on the nova repo. Change-Id: I644fcdc196b737909c2115071027dc2826c21c0b --- zuul/connection/gerrit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/connection/gerrit.py b/zuul/connection/gerrit.py index ae1e3198b7..62891cd9a2 100644 --- a/zuul/connection/gerrit.py +++ b/zuul/connection/gerrit.py @@ -32,7 +32,7 @@ class GerritEventConnector(threading.Thread): """Move events from Gerrit to the scheduler.""" log = logging.getLogger("zuul.GerritEventConnector") - delay = 5.0 + delay = 10.0 def __init__(self, connection): super(GerritEventConnector, self).__init__() From 7edacc6f4e35fa0d4da41998e13f38eb41515160 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Fri, 10 Jun 2016 16:43:23 -0700 Subject: [PATCH 104/152] Register functions in the RPC listener before running A gear worker should not grab jobs before it has registered its functions. A race condition could prevent it from picking up jobs assigned to it. Change-Id: I34a4e94c030d54800d592edf89ae06bb0b2627fc --- zuul/rpclistener.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zuul/rpclistener.py b/zuul/rpclistener.py index 83d119f02c..551dd03491 100644 --- a/zuul/rpclistener.py +++ b/zuul/rpclistener.py @@ -40,11 +40,11 @@ class RPCListener(object): port = 4730 self.worker = gear.Worker('Zuul RPC Listener') self.worker.addServer(server, port) + self.worker.waitForServer() + self.register() self.thread = threading.Thread(target=self.run) self.thread.daemon = True self.thread.start() - self.worker.waitForServer() - self.register() def register(self): self.worker.registerFunction("zuul:enqueue") From b55a4d15dd59ef90a1e0215ab581f065cd421c39 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 11 Jun 2016 12:01:56 -0400 Subject: [PATCH 105/152] Delegate FTP publisher to 127.0.0.1 We want ansible to run lftp on our zuul-launcher servers, not our remote nodes. 
Change-Id: I0ec9182ce431c9421df8633717f78fc0d3b683f3 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 5bef6d39e4..dbce8bf93a 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -927,7 +927,8 @@ class NodeWorker(object): when='success') tasks.append(task) task = dict(shell='lftp -f %s' % ftpscript, - when='success') + when='success', + delegate_to='127.0.0.1') ftpsource = ftpcontent if ftp.get('remove-prefix'): ftpsource = os.path.join(ftpcontent, ftp['remove-prefix']) From 955b226e4541ebc2990d9e3f8665dd0b85cef1eb Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 11 Jun 2016 12:29:32 -0400 Subject: [PATCH 106/152] Update SCPTask to also use delegate_to This brings both our FTP / SCP task inline using the same logic for delegate_to. Change-Id: I234069838e3feb2047b32dbbdb92e6ce7acc5077 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index dbce8bf93a..16493d35f8 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -874,8 +874,8 @@ class NodeWorker(object): raise Exception("Target path %s is not below site root" % (dest,)) - local_args = [ - 'shell', '/usr/bin/rsync', '--delay-updates', '-F', + rsync_cmd = [ + '/usr/bin/rsync', '--delay-updates', '-F', '--compress', '-rt', '--safe-links', '--rsync-path="mkdir -p {dest} && rsync"', '--rsh="/usr/bin/ssh -i {private_key_file} -S none ' @@ -887,13 +887,14 @@ class NodeWorker(object): source = '"%s/"' % scproot else: source = '`/usr/bin/find "%s" -type f`' % scproot - local_action = ' '.join(local_args).format( + shellargs = ' '.join(rsync_cmd).format( source=source, dest=dest, 
private_key_file=self.private_key_file, host=site['host'], user=site['user']) - task = dict(local_action=local_action) + task = dict(shell=shellargs, + delegate_to='127.0.0.1') if not scpfile.get('copy-after-failure'): task['when'] = 'success' tasks.append(task) From 01af7cd7d27f692c6615db3029cdb58612395ae4 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 11 Jun 2016 13:08:05 -0400 Subject: [PATCH 107/152] Don't create .retry files when our playbooks fail We don't use these, so disable them by default. Change-Id: Id4d9776160f181b6c3bee55153cb79dc0a5b54bc Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 16493d35f8..8063e5e8c1 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1064,6 +1064,7 @@ class NodeWorker(object): config.write('hostfile = %s\n' % jobdir.inventory) config.write('host_key_checking = False\n') config.write('private_key_file = %s\n' % self.private_key_file) + config.write('retry_files_enabled = False\n') callback_path = zuul.ansible.plugins.callback_plugins.__file__ callback_path = os.path.abspath(callback_path) From edcdcd3457b1771328dd464276659607cd2a88a0 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sun, 12 Jun 2016 13:26:01 -0400 Subject: [PATCH 108/152] Fix typo with ansible-playbook process Because we skipped setting our ansible_job_process to None, it was possible for our abortRunningProc to error: Traceback (most recent call last): File "/usr/local/lib/python2.7/dist-packages/zuul/launcher/ansiblelaunchserver.py", line 671, in abortRunningProc pgid = os.getpgid(proc.pid) OSError: [Errno 3] No such process Change-Id: Ie6da0614a75281782fb0f29f6c8046a02e1681b9 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py 
b/zuul/launcher/ansiblelaunchserver.py index 5bef6d39e4..5f5adcb7f9 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1102,7 +1102,7 @@ class NodeWorker(object): finally: watchdog.stop() self.log.debug("Ansible exit code: %s" % (ret,)) - self.ansible_proc = None + self.ansible_job_proc = None return ret == 0 def runAnsiblePostPlaybook(self, jobdir, success): From 4c5b10381f01b36507c862c72a052341b2fc0cc2 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Thu, 9 Jun 2016 17:56:48 -0400 Subject: [PATCH 109/152] Add POST_FAILURE status First check if we fail to run our post_playbook, if a failure set POST_FAILURE. This usually means as misconfigured job or infra problem. Then check if our job is successful or a failure. Change-Id: Icc6b8254ac7812c8df733e780a43a44a528f7b78 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f0b695d6d2..33e00b8f9c 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -785,7 +785,9 @@ class NodeWorker(object): job_status = self.runAnsiblePlaybook(jobdir, timeout) post_status = self.runAnsiblePostPlaybook(jobdir, job_status) - if job_status and post_status: + if not post_status: + status = 'POST_FAILURE' + elif job_status: status = 'SUCCESS' else: status = 'FAILURE' From 76978de4929164cb5b2068cf95995ea086bef9e6 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 11 Jun 2016 19:58:16 -0400 Subject: [PATCH 110/152] Bootstrap worker logs with zuul information In an effort to help us default failures, add some information about zuul-launcher and worker nodes into the logs. We also update zuul_log to optionally support lists, so we can pass more then 1 line into it. We default to raw now to make ansible 2.1 happier. 
Change-Id: Icb4d20e3ef0166eb43b1b45aec049040813b5d37 Signed-off-by: Paul Belanger --- zuul/ansible/library/zuul_log.py | 7 +++++-- zuul/launcher/ansiblelaunchserver.py | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/zuul/ansible/library/zuul_log.py b/zuul/ansible/library/zuul_log.py index 8978275abc..2072bc9634 100644 --- a/zuul/ansible/library/zuul_log.py +++ b/zuul/ansible/library/zuul_log.py @@ -34,14 +34,17 @@ class Console(object): def log(msg): + if not isinstance(msg, list): + msg = [msg] with Console() as console: - console.addLine("[Zuul] %s\n" % msg) + for line in msg: + console.addLine("[Zuul] %s\n" % line) def main(): module = AnsibleModule( argument_spec=dict( - msg=dict(required=True), + msg=dict(required=True, type='raw'), ) ) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 5bef6d39e4..cdda3aea43 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1028,6 +1028,13 @@ class NodeWorker(object): state='directory')) main_block.append(task) + msg = [ + "Launched by %s" % self.manager_name, + "Building remotely on %s in workspace %s" % ( + self.name, parameters['WORKSPACE'])] + task = dict(zuul_log=dict(msg=msg)) + main_block.append(task) + for builder in jjb_job.get('builders', []): if 'shell' in builder: main_block.extend( From d43715988766fa95b88af5fc2c9d2c2aa723b4f9 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 8 Jun 2016 16:55:04 -0700 Subject: [PATCH 111/152] Ansible launcher: add private gearman function OpenStack's Zuul's gear server is spending most of its time dealing with function registration (800 nodes, 20k functions each). Alleviate the pain by adding a private gearman function to register functions en masse. In preliminary testing, this causes function registration to take 27% of the current time. 
Change-Id: I0d2342cadca5e3d6d6c1964a119e0d50b0bcc548 --- zuul/cmd/server.py | 12 +++++----- zuul/launcher/ansiblelaunchserver.py | 20 ++++++++++++---- zuul/lib/gearserver.py | 35 ++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 11 deletions(-) create mode 100644 zuul/lib/gearserver.py diff --git a/zuul/cmd/server.py b/zuul/cmd/server.py index f4004362af..1fb4a3292c 100755 --- a/zuul/cmd/server.py +++ b/zuul/cmd/server.py @@ -117,18 +117,18 @@ class Server(zuul.cmd.ZuulApp): if child_pid == 0: os.close(pipe_write) self.setup_logging('gearman_server', 'log_config') - import gear + import zuul.lib.gearserver statsd_host = os.environ.get('STATSD_HOST') statsd_port = int(os.environ.get('STATSD_PORT', 8125)) if self.config.has_option('gearman_server', 'listen_address'): host = self.config.get('gearman_server', 'listen_address') else: host = None - gear.Server(4730, - host=host, - statsd_host=statsd_host, - statsd_port=statsd_port, - statsd_prefix='zuul.geard') + zuul.lib.gearserver.GearServer(4730, + host=host, + statsd_host=statsd_host, + statsd_port=statsd_port, + statsd_prefix='zuul.geard') # Keep running until the parent dies: pipe_read = os.fdopen(pipe_read) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f644c14b62..f002fedc77 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -51,6 +51,19 @@ def boolify(x): return bool(x) +class GearWorker(gear.Worker): + MASS_DO = 101 + + def sendMassDo(self, functions): + data = b'\x00'.join([gear.convert_to_bytes(x) for x in functions]) + self.broadcast_lock.acquire() + try: + p = gear.Packet(gear.constants.REQ, self.MASS_DO, data) + self.broadcast(p) + finally: + self.broadcast_lock.release() + + class Watchdog(object): def __init__(self, timeout, function, args): self.timeout = timeout @@ -518,7 +531,7 @@ class NodeWorker(object): port = self.config.get('gearman', 'port') else: port = 4730 - self.worker = 
gear.Worker(self.name) + self.worker = GearWorker(self.name) self.worker.addServer(server, port) self.log.debug("Waiting for server") self.worker.waitForServer() @@ -648,10 +661,7 @@ class NodeWorker(object): new_functions = set() for job in self.jobs.values(): new_functions |= self.generateFunctionNames(job) - for function in new_functions - self.registered_functions: - self.worker.registerFunction(function) - for function in self.registered_functions - new_functions: - self.worker.unRegisterFunction(function) + self.worker.sendMassDo(new_functions) self.registered_functions = new_functions def abortRunningJob(self): diff --git a/zuul/lib/gearserver.py b/zuul/lib/gearserver.py new file mode 100644 index 0000000000..9cddca346b --- /dev/null +++ b/zuul/lib/gearserver.py @@ -0,0 +1,35 @@ +# Copyright 2016 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import gear + +MASS_DO = 101 + + +class GearServer(gear.Server): + def handlePacket(self, packet): + if packet.ptype == MASS_DO: + self.log.info("Received packet from %s: %s" % (packet.connection, + packet)) + self.handleMassDo(packet) + else: + return super(GearServer, self).handlePacket(packet) + + def handleMassDo(self, packet): + packet.connection.functions = set() + for name in packet.data.split(b'\x00'): + self.log.debug("Adding function %s to %s" % ( + name, packet.connection)) + packet.connection.functions.add(name) + self.functions.add(name) From 71776d111fa7df09042191570ac6199090df540f Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 13 Jun 2016 13:11:50 -0700 Subject: [PATCH 112/152] Ansible launcher: fix check of ansible exit code runAnsible* only returns a boolean -- success or failure, but we were assuming it returned an exit code. Make runAnsible* tristate so that it returns success, failure, or indeterminate and move the exit code comparison closer to where ansible actually exits. Change-Id: I824681787f86c46b52817d473c37640ad19ee155 --- zuul/launcher/ansiblelaunchserver.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index aef54d35e8..b53f314d32 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -795,10 +795,9 @@ class NodeWorker(object): job.sendWorkStatus(0, 100) job_status = self.runAnsiblePlaybook(jobdir, timeout) - if job_status == 3: - # AnsibleHostUnreachable: We had a network issue connecting to - # our zuul-worker. Rather then contiune, have zuul requeue the - # job. + if job_status is None: + # The result of the job is indeterminate. Zuul will + # run it again. 
return result post_status = self.runAnsiblePostPlaybook(jobdir, job_status) @@ -1125,6 +1124,10 @@ class NodeWorker(object): watchdog.stop() self.log.debug("Ansible exit code: %s" % (ret,)) self.ansible_job_proc = None + if ret == 3: + # AnsibleHostUnreachable: We had a network issue connecting to + # our zuul-worker. + return None return ret == 0 def runAnsiblePostPlaybook(self, jobdir, success): From 0670a2764b3b9582c4b68273386c741cc4ec2a31 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 4 Jun 2016 13:05:16 -0400 Subject: [PATCH 113/152] Refactor local_action SCP task into function This allows use to reused the logic, for the following patchset. Change-Id: I147a7519a61537024ec6aef6ae4600bb443475e0 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 74 +++++++++++++++------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index b53f314d32..e5a8ae5ee1 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -851,9 +851,6 @@ class NodeWorker(object): tasks = [] for scpfile in publisher['scp']['files']: site = publisher['scp']['site'] - if site not in self.sites: - raise Exception("Undefined SCP site: %s" % (site,)) - site = self.sites[site] if scpfile.get('copy-console'): src = '/tmp/console.html' rsync_opts = [] @@ -877,40 +874,49 @@ class NodeWorker(object): task['when'] = 'success' tasks.append(task) - dest = scpfile['target'] - dest = self._substituteVariables(dest, parameters) - dest = os.path.join(site['root'], dest) - dest = os.path.normpath(dest) - if not dest.startswith(site['root']): - raise Exception("Target path %s is not below site root" % - (dest,)) - - rsync_cmd = [ - '/usr/bin/rsync', '--delay-updates', '-F', - '--compress', '-rt', '--safe-links', - '--rsync-path="mkdir -p {dest} && rsync"', - '--rsh="/usr/bin/ssh -i {private_key_file} -S none ' - '-o StrictHostKeyChecking=no -q"', - 
'--out-format="<>%i %n%L"', - '{source}', '"{user}@{host}:{dest}"' - ] - if scpfile.get('keep-hierarchy'): - source = '"%s/"' % scproot - else: - source = '`/usr/bin/find "%s" -type f`' % scproot - shellargs = ' '.join(rsync_cmd).format( - source=source, - dest=dest, - private_key_file=self.private_key_file, - host=site['host'], - user=site['user']) - task = dict(shell=shellargs, - delegate_to='127.0.0.1') - if not scpfile.get('copy-after-failure'): - task['when'] = 'success' + task = self._makeSCPTaskLocalAction( + site, scpfile, scproot, parameters) tasks.append(task) return tasks + def _makeSCPTaskLocalAction(self, site, scpfile, scproot, parameters): + if site not in self.sites: + raise Exception("Undefined SCP site: %s" % (site,)) + site = self.sites[site] + dest = scpfile['target'] + dest = self._substituteVariables(dest, parameters) + dest = os.path.join(site['root'], dest) + dest = os.path.normpath(dest) + if not dest.startswith(site['root']): + raise Exception("Target path %s is not below site root" % + (dest,)) + + rsync_cmd = [ + '/usr/bin/rsync', '--delay-updates', '-F', + '--compress', '-rt', '--safe-links', + '--rsync-path="mkdir -p {dest} && rsync"', + '--rsh="/usr/bin/ssh -i {private_key_file} -S none ' + '-o StrictHostKeyChecking=no -q"', + '--out-format="<>%i %n%L"', + '{source}', '"{user}@{host}:{dest}"' + ] + if scpfile.get('keep-hierarchy'): + source = '"%s/"' % scproot + else: + source = '`/usr/bin/find "%s" -type f`' % scproot + shellargs = ' '.join(rsync_cmd).format( + source=source, + dest=dest, + private_key_file=self.private_key_file, + host=site['host'], + user=site['user']) + task = dict(shell=shellargs, + delegate_to='127.0.0.1') + if not scpfile.get('copy-after-failure'): + task['when'] = 'success' + + return task + def _makeFTPTask(self, jobdir, publisher, parameters): tasks = [] ftp = publisher['ftp'] From 665f0aa2dd86f58519765e9c681549609cee3cd1 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Tue, 14 Jun 2016 10:12:29 -0700 Subject: [PATCH 114/152] Ansible launcher: Remove unused plugins dir This was being set up in the jobdir, but not used. Change-Id: I44017b0f2bbcabe4d5fde98f68c299f5f0d7247f --- zuul/launcher/ansiblelaunchserver.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index e5a8ae5ee1..fb10553668 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -93,8 +93,6 @@ class JobDir(object): self.root = tempfile.mkdtemp() self.ansible_root = os.path.join(self.root, 'ansible') os.makedirs(self.ansible_root) - self.plugins_root = os.path.join(self.ansible_root, 'plugins') - os.makedirs(self.plugins_root) self.inventory = os.path.join(self.ansible_root, 'inventory') self.playbook = os.path.join(self.ansible_root, 'playbook') self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') From c0968ef378e78e9dd586b8ed7ea63df0d3a4d5f5 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Tue, 14 Jun 2016 11:27:01 -0400 Subject: [PATCH 115/152] Add log_path to ansible.cfg Enable logging to ansible.log for ansible-playbook. We still need to upload the file to our logging server for users to view. 
Change-Id: I1f20bf93d9824a02c3521136ebedf36a901420c6 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index fb10553668..0a92327060 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -99,6 +99,8 @@ class JobDir(object): self.config = os.path.join(self.ansible_root, 'ansible.cfg') self.script_root = os.path.join(self.ansible_root, 'scripts') os.makedirs(self.script_root) + self.logs = os.path.join(self.ansible_root, 'logs') + os.makedirs(self.logs) def __enter__(self): return self @@ -1087,6 +1089,8 @@ class NodeWorker(object): config.write('host_key_checking = False\n') config.write('private_key_file = %s\n' % self.private_key_file) config.write('retry_files_enabled = False\n') + config.write('log_path = %s\n' % os.path.join( + jobdir.logs, 'ansible.log')) callback_path = zuul.ansible.plugins.callback_plugins.__file__ callback_path = os.path.abspath(callback_path) From b13425ac0f72079f6cf05d23022eff5a91f6efbb Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 14 Jun 2016 10:32:49 -0700 Subject: [PATCH 116/152] Ansible launcher: move ftp/scp staging out of ansible dir So that we can upload the entire ansible directory without extra copies of the locally staged scp and ftp content (as well as the ftp script which contains a password), stage ftp and scp local content outside of the jobdir's ansible directory. 
Change-Id: Ia03e60491055563ed31230a35532488dd296163a --- zuul/launcher/ansiblelaunchserver.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 0a92327060..bed099ca53 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -101,6 +101,8 @@ class JobDir(object): os.makedirs(self.script_root) self.logs = os.path.join(self.ansible_root, 'logs') os.makedirs(self.logs) + self.staging_root = os.path.join(self.root, 'staging') + os.makedirs(self.staging_root) def __enter__(self): return self @@ -861,7 +863,7 @@ class NodeWorker(object): rsync_opts = self._getRsyncOptions(scpfile['source'], parameters) - scproot = tempfile.mkdtemp(dir=jobdir.ansible_root) + scproot = tempfile.mkdtemp(dir=jobdir.staging_root) os.chmod(scproot, 0o755) syncargs = dict(src=src, dest=scproot, @@ -925,7 +927,7 @@ class NodeWorker(object): raise Exception("Undefined FTP site: %s" % site) site = self.sites[site] - ftproot = tempfile.mkdtemp(dir=jobdir.ansible_root) + ftproot = tempfile.mkdtemp(dir=jobdir.staging_root) ftpcontent = os.path.join(ftproot, 'content') os.makedirs(ftpcontent) ftpscript = os.path.join(ftproot, 'script') From 21e4968f0ddb528e2ffccfbe0cc4b61845827706 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Tue, 14 Jun 2016 14:52:43 -0400 Subject: [PATCH 117/152] Don't run post_playbook if ansible-playbook is aborted If ansible-playbook returns -9, the job has been aborted. No need to continue and upload logs. 
Change-Id: I9326f77ae51bb80cce30a825dc8360bf7be533c4 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index b53f314d32..83236f839b 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1128,6 +1128,9 @@ class NodeWorker(object): # AnsibleHostUnreachable: We had a network issue connecting to # our zuul-worker. return None + elif ret == -9: + # Received abort request. + return None return ret == 0 def runAnsiblePostPlaybook(self, jobdir, success): From 94bdf1cacd74462e7b7578a80f817eeb5dfb58d3 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Tue, 14 Jun 2016 13:17:18 -0400 Subject: [PATCH 118/152] Upload ansible playbook directory to logs server If an SCP console log publisher is configured, copy our ansible playbooks and logs into the directory we plan to upload. Change-Id: I0544bfa02bf22a7f94a6944dead5528a7f811164 Signed-off-by: Paul Belanger Co-Authored-by: James E. Blair --- zuul/launcher/ansiblelaunchserver.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index bed099ca53..f2115db8c8 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -852,8 +852,20 @@ class NodeWorker(object): def _makeSCPTask(self, jobdir, publisher, parameters): tasks = [] for scpfile in publisher['scp']['files']: + scproot = tempfile.mkdtemp(dir=jobdir.staging_root) + os.chmod(scproot, 0o755) + site = publisher['scp']['site'] if scpfile.get('copy-console'): + # Include the local ansible directory in the console + # upload. This uploads the playbook and ansible logs. 
+ copyargs = dict(src=jobdir.ansible_root + '/', + dest=os.path.join(scproot, '_zuul_ansible')) + task = dict(copy=copyargs, + delegate_to='127.0.0.1') + tasks.append(task) + + # Fetch the console log from the remote host. src = '/tmp/console.html' rsync_opts = [] else: @@ -863,8 +875,6 @@ class NodeWorker(object): rsync_opts = self._getRsyncOptions(scpfile['source'], parameters) - scproot = tempfile.mkdtemp(dir=jobdir.staging_root) - os.chmod(scproot, 0o755) syncargs = dict(src=src, dest=scproot, copy_links='yes', From 216e01485496f6c0c34245aade8e3eebc4dc5bc5 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 14 Jun 2016 11:00:22 -0700 Subject: [PATCH 119/152] Ansible launcher: ensure that console/ansible publishing is last Transform the jjb defined job to ensure that the console log publisher and its files section are both last in order to maximize the uploaded logs. Change-Id: I0abc98fd468b9ce0276b7f736ed94fb1de57691a --- zuul/launcher/ansiblelaunchserver.py | 31 +++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f2115db8c8..03fe61679b 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1013,6 +1013,33 @@ class NodeWorker(object): return tasks + def _transformPublishers(self, jjb_job): + early_publishers = [] + late_publishers = [] + old_publishers = jjb_job.get('publishers', []) + for publisher in old_publishers: + early_scpfiles = [] + late_scpfiles = [] + if 'scp' not in publisher: + early_publishers.append(publisher) + continue + copy_console = False + for scpfile in publisher['scp']['files']: + if scpfile.get('copy-console'): + late_scpfiles.append(scpfile) + copy_console = True + else: + early_scpfiles.append(scpfile) + publisher['scp']['files'] = early_scpfiles + late_scpfiles + if copy_console: + late_publishers.append(publisher) + else: + early_publishers.append(publisher) + 
publishers = early_publishers + late_publishers + if old_publishers != publishers: + self.log.debug("Transformed job publishers") + return publishers + def prepareAnsibleFiles(self, jobdir, gearman_job, args): job_name = gearman_job.name.split(':')[1] jjb_job = self.jobs[job_name] @@ -1082,9 +1109,11 @@ class NodeWorker(object): tasks=tasks) playbook.write(yaml.dump([play], default_flow_style=False)) + publishers = self._transformPublishers(jjb_job) + with open(jobdir.post_playbook, 'w') as playbook: tasks = [] - for publisher in jjb_job.get('publishers', []): + for publisher in publishers: if 'scp' in publisher: tasks.extend(self._makeSCPTask(jobdir, publisher, parameters)) From dce895135bca5637965881685856d4eaff84cd39 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 14 Jun 2016 11:14:26 -0700 Subject: [PATCH 120/152] Ansible launcher: ensure log publishing always runs But still report POST_FAILURE if any publisher fails. Change-Id: Ie95ae75a9d605eaa44d1112be88d52e06e251879 --- zuul/launcher/ansiblelaunchserver.py | 30 +++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 03fe61679b..3f2f77c8d1 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1038,7 +1038,7 @@ class NodeWorker(object): publishers = early_publishers + late_publishers if old_publishers != publishers: self.log.debug("Transformed job publishers") - return publishers + return early_publishers, late_publishers def prepareAnsibleFiles(self, jobdir, gearman_job, args): job_name = gearman_job.name.split(':')[1] @@ -1109,17 +1109,29 @@ class NodeWorker(object): tasks=tasks) playbook.write(yaml.dump([play], default_flow_style=False)) - publishers = self._transformPublishers(jjb_job) + early_publishers, late_publishers = self._transformPublishers(jjb_job) with open(jobdir.post_playbook, 'w') as playbook: + blocks = [] + for publishers 
in [early_publishers, late_publishers]: + block = [] + for publisher in publishers: + if 'scp' in publisher: + block.extend(self._makeSCPTask(jobdir, publisher, + parameters)) + if 'ftp' in publisher: + block.extend(self._makeFTPTask(jobdir, publisher, + parameters)) + blocks.append(block) + + # The 'always' section contains the log publishing tasks, + # the 'block' contains all the other publishers. This way + # we run the log publisher regardless of whether the rest + # of the publishers succeed. tasks = [] - for publisher in publishers: - if 'scp' in publisher: - tasks.extend(self._makeSCPTask(jobdir, publisher, - parameters)) - if 'ftp' in publisher: - tasks.extend(self._makeFTPTask(jobdir, publisher, - parameters)) + tasks.append(dict(block=blocks[0], + always=blocks[1])) + play = dict(hosts='node', name='Publishers', tasks=tasks) playbook.write(yaml.dump([play], default_flow_style=False)) From 89c41fd6402580f366410b73b5d9a419845ef934 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 14 Jun 2016 16:02:44 -0700 Subject: [PATCH 121/152] Ansible launcher: set keep-hierarchy on log copying We don't set keep-hierarchy in the log publisher because it only publishes one file. But we're tacking on the full ansible directory to that publisher. We want keep-hierarchy set so those files don't just end up in the root. Modify that in our job publisher transform. 
Change-Id: I188493168ff62f3711fe59fd93e8bd617864e391 --- zuul/launcher/ansiblelaunchserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 3f2f77c8d1..f0b85241f9 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1026,6 +1026,7 @@ class NodeWorker(object): copy_console = False for scpfile in publisher['scp']['files']: if scpfile.get('copy-console'): + scpfile['keep-hierarchy'] = True late_scpfiles.append(scpfile) copy_console = True else: From 95e0f72a42ca17c8480737b6730008283a8ceb84 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Tue, 14 Jun 2016 17:40:24 -0400 Subject: [PATCH 122/152] Turn off Ansible host fact gathering We don't appear to use host facts, so turning off fact gathering can help avoid unnecessary work: http://docs.ansible.com/ansible/intro_configuration.html#gathering http://docs.ansible.com/ansible/playbooks_variables.html#turning-off-facts Change-Id: I8ddc143b89c406730443ad914a5ccd5253a2a96f --- zuul/launcher/ansiblelaunchserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f0b85241f9..2bb7636344 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1145,6 +1145,7 @@ class NodeWorker(object): config.write('retry_files_enabled = False\n') config.write('log_path = %s\n' % os.path.join( jobdir.logs, 'ansible.log')) + config.write('gathering = explicit\n') callback_path = zuul.ansible.plugins.callback_plugins.__file__ callback_path = os.path.abspath(callback_path) From 1487ba2db621913297d31f1b030b2d081b85c943 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 15 Jun 2016 10:07:15 -0700 Subject: [PATCH 123/152] Ansible launcher: strip leading / from scp/ftp targets It seems that Jenkins does this. At least with FTP. 
We don't have any leading / on SCP targets, but do the same there for symmetry. Change-Id: Ie69593f6f3336cc943cff17a97530df8461fdbaa --- zuul/launcher/ansiblelaunchserver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 2bb7636344..ce228e5f7d 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -895,7 +895,7 @@ class NodeWorker(object): if site not in self.sites: raise Exception("Undefined SCP site: %s" % (site,)) site = self.sites[site] - dest = scpfile['target'] + dest = scpfile['target'].lstrip('/') dest = self._substituteVariables(dest, parameters) dest = os.path.join(site['root'], dest) dest = os.path.normpath(dest) @@ -964,7 +964,7 @@ class NodeWorker(object): ftpsource = os.path.join(ftpcontent, ftp['remove-prefix']) while ftpsource[-1] == '/': ftpsource = ftpsource[:-1] - ftptarget = ftp['target'] + ftptarget = ftp['target'].lstrip('/') ftptarget = self._substituteVariables(ftptarget, parameters) ftptarget = os.path.join(site['root'], ftp['target']) ftptarget = os.path.normpath(ftptarget) From a48fa02abf65a5ae3e0c82108227ea4eec686938 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 15 Jun 2016 14:04:37 -0700 Subject: [PATCH 124/152] Ansible launcher: use .txt as logfile extension To make it unecessary to configure apache on our log server to treat this file as text/plain. 
Change-Id: Ib26a8a2781fda4061be48098d38a0036b4ea19fa --- zuul/launcher/ansiblelaunchserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index ce228e5f7d..b792a2c2cd 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1144,7 +1144,7 @@ class NodeWorker(object): config.write('private_key_file = %s\n' % self.private_key_file) config.write('retry_files_enabled = False\n') config.write('log_path = %s\n' % os.path.join( - jobdir.logs, 'ansible.log')) + jobdir.logs, 'ansible.txt')) config.write('gathering = explicit\n') callback_path = zuul.ansible.plugins.callback_plugins.__file__ From 06770a85fcff810fc3e1673120710100fc7b0601 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 16 Jun 2016 07:49:34 -0700 Subject: [PATCH 125/152] Ansible launcher: update registration between jobs On a long-running worker, reconfiguration update events received during job runs would be ignored. On busy workers, that could mean we go quite some time without updating our job registration. Instead, set a flag if we ignore a reconfiguration event, and then, before we go to collect our next job, check it and re-register if need be. 
Change-Id: Ib4af4944c0f0f73d07e54c729c9d9f010552cfdd --- zuul/launcher/ansiblelaunchserver.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index ce228e5f7d..63f3856cce 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -502,6 +502,8 @@ class NodeWorker(object): self.termination_queue = termination_queue self.keep_jobdir = keep_jobdir self.running_job_lock = threading.Lock() + self.pending_registration = False + self.registration_lock = threading.Lock() self._get_job_lock = threading.Lock() self._got_job = False self._job_complete_event = threading.Event() @@ -625,6 +627,8 @@ class NodeWorker(object): self._got_job = False def _runGearman(self): + if self.pending_registration: + self.register() with self._get_job_lock: try: job = self.worker.getJob() @@ -658,13 +662,23 @@ class NodeWorker(object): return ret def register(self): - if self._running_job: + if not self.registration_lock.acquire(False): + self.log.debug("Registration already in progress") return - new_functions = set() - for job in self.jobs.values(): - new_functions |= self.generateFunctionNames(job) - self.worker.sendMassDo(new_functions) - self.registered_functions = new_functions + try: + if self._running_job: + self.pending_registration = True + self.log.debug("Ignoring registration due to running job") + return + self.log.debug("Updating registration") + self.pending_registration = False + new_functions = set() + for job in self.jobs.values(): + new_functions |= self.generateFunctionNames(job) + self.worker.sendMassDo(new_functions) + self.registered_functions = new_functions + finally: + self.registration_lock.release() def abortRunningJob(self): self._aborted_job = True From 6870b1293261942c99a15f5e150c7b99bf32b45b Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 16 Jun 2016 12:25:16 -0400 Subject: [PATCH 126/152] Use 
safe_dump() for Ansible playbooks Change-Id: I7df6f8c1c3dbad5adcf741aee11f27073d7b50ea --- zuul/launcher/ansiblelaunchserver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index ce228e5f7d..00a01d550c 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1108,7 +1108,7 @@ class NodeWorker(object): play = dict(hosts='node', name='Job body', tasks=tasks) - playbook.write(yaml.dump([play], default_flow_style=False)) + playbook.write(yaml.safe_dump([play], default_flow_style=False)) early_publishers, late_publishers = self._transformPublishers(jjb_job) @@ -1135,7 +1135,7 @@ class NodeWorker(object): play = dict(hosts='node', name='Publishers', tasks=tasks) - playbook.write(yaml.dump([play], default_flow_style=False)) + playbook.write(yaml.safe_dump([play], default_flow_style=False)) with open(jobdir.config, 'w') as config: config.write('[defaults]\n') From b01ec5403ac78b78999b114d263380c587a2939f Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 16 Jun 2016 09:46:49 -0700 Subject: [PATCH 127/152] Make pipeline approval username requirement a regular expression This now matches the trigger username requirement which is already a regex. Change-Id: Ib88998de9057f6605b8bbb6c52ab9bd2486b2e83 --- doc/source/zuul.rst | 5 +++-- tests/fixtures/layout-requirement-username.yaml | 2 +- zuul/model.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/zuul.rst b/doc/source/zuul.rst index 07b777a2fe..be9570c351 100644 --- a/doc/source/zuul.rst +++ b/doc/source/zuul.rst @@ -399,11 +399,12 @@ explanation of each of the parameters:: approval matching all specified requirements. *username* - If present, an approval from this username is required. + If present, an approval from this username is required. It is + treated as a regular expression. 
*email* If present, an approval with this email address is required. It - is treated as a regular expression as above. + is treated as a regular expression. *email-filter* (deprecated) A deprecated alternate spelling of *email*. Only one of *email* or diff --git a/tests/fixtures/layout-requirement-username.yaml b/tests/fixtures/layout-requirement-username.yaml index 7a549f04b6..f9e647752d 100644 --- a/tests/fixtures/layout-requirement-username.yaml +++ b/tests/fixtures/layout-requirement-username.yaml @@ -3,7 +3,7 @@ pipelines: manager: IndependentPipelineManager require: approval: - - username: jenkins + - username: ^(jenkins|zuul)$ trigger: gerrit: - event: comment-added diff --git a/zuul/model.py b/zuul/model.py index ca8f0987ca..46b0b98c95 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -1079,7 +1079,7 @@ class BaseFilter(object): for a in approvals: for k, v in a.items(): if k == 'username': - pass + a['username'] = re.compile(v) elif k in ['email', 'email-filter']: a['email'] = re.compile(v) elif k == 'newer-than': @@ -1098,7 +1098,7 @@ class BaseFilter(object): by = approval.get('by', {}) for k, v in rapproval.items(): if k == 'username': - if (by.get('username', '') != v): + if (not v.search(by.get('username', ''))): return False elif k == 'email': if (not v.search(by.get('email', ''))): From ad45d8a74e1dcabee03ed4c21f459e30bd0a38d6 Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Thu, 16 Jun 2016 16:28:32 -0400 Subject: [PATCH 128/152] Set LOGNAME env var to fix Ansible logging The getpass.getuser() method, used by the Ansible logging file, can incorrectly report the user as it checks a list of environment variables in order to get the user name. Under some circumstances, e.g., daemons, you may not get the correct username because LOGNAME will not get set correctly. 
Change-Id: I8a4670ba9b3df20de73d878013eebf4a0ea568c0 --- zuul/launcher/ansiblelaunchserver.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 693d5a4db2..ee4ea06ac0 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1178,6 +1178,11 @@ class NodeWorker(object): self.abortRunningProc(proc) def runAnsiblePlaybook(self, jobdir, timeout): + # Set LOGNAME env variable so Ansible log_path log reports + # the correct user. + env_copy = os.environ.copy() + env_copy['LOGNAME'] = 'zuul' + self.ansible_job_proc = subprocess.Popen( ['ansible-playbook', jobdir.playbook, '-e', 'timeout=%s' % timeout, '-v'], @@ -1185,6 +1190,7 @@ class NodeWorker(object): stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=os.setsid, + env=env_copy, ) ret = None watchdog = Watchdog(timeout + ANSIBLE_WATCHDOG_GRACE, @@ -1208,6 +1214,11 @@ class NodeWorker(object): return ret == 0 def runAnsiblePostPlaybook(self, jobdir, success): + # Set LOGNAME env variable so Ansible log_path log reports + # the correct user. + env_copy = os.environ.copy() + env_copy['LOGNAME'] = 'zuul' + self.ansible_post_proc = subprocess.Popen( ['ansible-playbook', jobdir.post_playbook, '-e', 'success=%s' % success, '-v'], @@ -1215,6 +1226,7 @@ class NodeWorker(object): stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=os.setsid, + env=env_copy, ) ret = None watchdog = Watchdog(ANSIBLE_DEFAULT_POST_TIMEOUT, From f0291c244af7f27c6d2f4d1dbf6438b993cf972c Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 16 Jun 2016 14:02:33 -0700 Subject: [PATCH 129/152] Ansible launcher: delay node assignment under load Gearman wakes all available workers at the same time when a job is available. The first one to respond gets the job. 
To attempt to more evenly distribute nodes (which are assigned via a gearman job) across multiple zuul launchers, delay for a period related exponentially to the number of nodes this launcher currently has between the wake up and grab job packets. Change-Id: I307938f97b730b229c1622cd2f929fc5b65ccdad --- zuul/launcher/ansiblelaunchserver.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 00a01d550c..1457665237 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -51,7 +51,19 @@ def boolify(x): return bool(x) -class GearWorker(gear.Worker): +class LaunchGearWorker(gear.Worker): + def __init__(self, *args, **kw): + self.__launch_server = kw.pop('launch_server') + super(LaunchGearWorker, self).__init__(*args, **kw) + + def handleNoop(self, packet): + workers = len(self.__launch_server.node_workers) + delay = (workers ** 2) / 1000.0 + time.sleep(delay) + return super(LaunchGearWorker, self).handleNoop(packet) + + +class NodeGearWorker(gear.Worker): MASS_DO = 101 def sendMassDo(self, functions): @@ -203,7 +215,8 @@ class LaunchServer(object): port = self.config.get('gearman', 'port') else: port = 4730 - self.worker = gear.Worker('Zuul Launch Server') + self.worker = LaunchGearWorker('Zuul Launch Server', + launch_server=self) self.worker.addServer(server, port) self.log.debug("Waiting for server") self.worker.waitForServer() @@ -533,7 +546,7 @@ class NodeWorker(object): port = self.config.get('gearman', 'port') else: port = 4730 - self.worker = GearWorker(self.name) + self.worker = NodeGearWorker(self.name) self.worker.addServer(server, port) self.log.debug("Waiting for server") self.worker.waitForServer() From f997bbeab92982c9a34b2b67e827d8dec9210fd6 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Thu, 16 Jun 2016 14:21:55 -0700 Subject: [PATCH 130/152] Ansible launcher: Log ansible command line Change-Id: I070cf39e051487687dd8d077c2a56ec841f8dfc3 --- zuul/launcher/ansiblelaunchserver.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index ee4ea06ac0..ac524a8226 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1183,9 +1183,12 @@ class NodeWorker(object): env_copy = os.environ.copy() env_copy['LOGNAME'] = 'zuul' + cmd = ['ansible-playbook', jobdir.playbook, + '-e', 'timeout=%s' % timeout, '-v'] + self.log.debug("Ansible command: %s" % (cmd,)) + self.ansible_job_proc = subprocess.Popen( - ['ansible-playbook', jobdir.playbook, - '-e', 'timeout=%s' % timeout, '-v'], + cmd, cwd=jobdir.ansible_root, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, @@ -1219,9 +1222,12 @@ class NodeWorker(object): env_copy = os.environ.copy() env_copy['LOGNAME'] = 'zuul' + cmd = ['ansible-playbook', jobdir.post_playbook, + '-e', 'success=%s' % success, '-v'] + self.log.debug("Ansible post command: %s" % (cmd,)) + self.ansible_post_proc = subprocess.Popen( - ['ansible-playbook', jobdir.post_playbook, - '-e', 'success=%s' % success, '-v'], + cmd, cwd=jobdir.ansible_root, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, From 8c2bd786d98ab25b8321e4620e9f38dad4afae0b Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Thu, 16 Jun 2016 20:08:09 -0400 Subject: [PATCH 131/152] Enabled host_key_checking Here we are using a pre_task to properly setup our known_hosts files for our zuul worker. Specifically, we run ssh-keyscan on the worker node and add the results into our temp JobDir. 
Change-Id: Ibc2b0c2bdaaf2724edaeece136c2f9b3cace454b Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 2b76e0d7ba..f4cd5b6bf6 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -105,6 +105,7 @@ class JobDir(object): self.root = tempfile.mkdtemp() self.ansible_root = os.path.join(self.root, 'ansible') os.makedirs(self.ansible_root) + self.known_hosts = os.path.join(self.ansible_root, 'known_hosts') self.inventory = os.path.join(self.ansible_root, 'inventory') self.playbook = os.path.join(self.ansible_root, 'playbook') self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') @@ -1098,9 +1099,16 @@ class NodeWorker(object): parameters[timeout_var] = timeout with open(jobdir.playbook, 'w') as playbook: + pre_tasks = [] tasks = [] main_block = [] error_block = [] + + shellargs = "ssh-keyscan %s > %s" % ( + self.host, jobdir.known_hosts) + pre_tasks.append(dict(shell=shellargs, + delegate_to='127.0.0.1')) + tasks.append(dict(block=main_block, rescue=error_block)) @@ -1134,7 +1142,7 @@ class NodeWorker(object): error_block.append(dict(fail=dict(msg='FAILURE'))) play = dict(hosts='node', name='Job body', - tasks=tasks) + pre_tasks=pre_tasks, tasks=tasks) playbook.write(yaml.safe_dump([play], default_flow_style=False)) early_publishers, late_publishers = self._transformPublishers(jjb_job) @@ -1167,7 +1175,6 @@ class NodeWorker(object): with open(jobdir.config, 'w') as config: config.write('[defaults]\n') config.write('hostfile = %s\n' % jobdir.inventory) - config.write('host_key_checking = False\n') config.write('private_key_file = %s\n' % self.private_key_file) config.write('retry_files_enabled = False\n') config.write('log_path = %s\n' % os.path.join( @@ -1182,7 +1189,12 @@ class NodeWorker(object): library_path = 
zuul.ansible.library.__file__ library_path = os.path.abspath(library_path) library_path = os.path.dirname(library_path) - config.write('library = %s\n' % library_path) + config.write('library = %s\n\n' % library_path) + + config.write('[ssh_connection]\n') + ssh_args = "-o ControlMaster=auto -o ControlPersist=60s " \ + "-o UserKnownHostsFile=%s" % jobdir.known_hosts + config.write('ssh_args = %s\n' % ssh_args) return timeout From b133b326114e3049f031b00802d9de6221f121fa Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 17 Jun 2016 07:46:25 -0700 Subject: [PATCH 132/152] Ansible launcher: set timeout env var in ms Jenkins, being Java derived, uses milliseconds as a time unit. Change-Id: Ie1d058e258e7630c150b065fd22a55f0cffcd214 --- zuul/launcher/ansiblelaunchserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 2b76e0d7ba..6ec80c8d14 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1095,7 +1095,7 @@ class NodeWorker(object): if not timeout: timeout = ANSIBLE_DEFAULT_TIMEOUT if timeout_var: - parameters[timeout_var] = timeout + parameters[timeout_var] = timeout * 1000 with open(jobdir.playbook, 'w') as playbook: tasks = [] From 19dd834814593e5a76c077b6b72f90696e37eca5 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Fri, 17 Jun 2016 07:55:47 -0700 Subject: [PATCH 133/152] Ansible launcher: Fix timeout processing * Ensure variables are the correct type * Expand wrapper macros * Don't let later wrappers override the timeout wrapper * Correct the name of the timeout wrapper Co-Authored-By: Paul Belanger Change-Id: Iab26c018cac9b0d4f28ecf4af7c7e3590bdf61f8 --- zuul/launcher/ansiblelaunchserver.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 6ec80c8d14..c1cf16def6 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1086,16 +1086,16 @@ class NodeWorker(object): timeout_var = None for wrapper in jjb_job.get('wrappers', []): if isinstance(wrapper, dict): - build_timeout = wrapper.get('build-timeout', {}) + build_timeout = wrapper.get('timeout') if isinstance(build_timeout, dict): - timeout_var = build_timeout.get('timeout-var', None) + timeout_var = build_timeout.get('timeout-var') timeout = build_timeout.get('timeout') - if timeout: - timeout = timeout * 60 + if timeout is not None: + timeout = int(timeout) * 60 if not timeout: timeout = ANSIBLE_DEFAULT_TIMEOUT if timeout_var: - parameters[timeout_var] = timeout * 1000 + parameters[timeout_var] = str(timeout * 1000) with open(jobdir.playbook, 'w') as playbook: tasks = [] @@ -1293,7 +1293,7 @@ class JJB(jenkins_jobs.builder.Builder): return new_components def expandMacros(self, job): - for component_type in ['builder', 'publisher']: + for component_type in ['builder', 'publisher', 'wrapper']: component_list_type = component_type + 's' new_components = [] for new_component in job.get(component_list_type, []): From 3bc31c889b86ce9b1acefde94db5e3e8b53939be Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 17 Jun 2016 15:00:40 -0700 Subject: [PATCH 134/152] Revert "Enabled host_key_checking" This reverts commit 8c2bd786d98ab25b8321e4620e9f38dad4afae0b. 
This doesn't appear to work in local testing. Change-Id: I9a3890c2eea6b9a8c741723a200986c02b87229c --- zuul/launcher/ansiblelaunchserver.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 32fc832118..c1cf16def6 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -105,7 +105,6 @@ class JobDir(object): self.root = tempfile.mkdtemp() self.ansible_root = os.path.join(self.root, 'ansible') os.makedirs(self.ansible_root) - self.known_hosts = os.path.join(self.ansible_root, 'known_hosts') self.inventory = os.path.join(self.ansible_root, 'inventory') self.playbook = os.path.join(self.ansible_root, 'playbook') self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') @@ -1099,16 +1098,9 @@ class NodeWorker(object): parameters[timeout_var] = str(timeout * 1000) with open(jobdir.playbook, 'w') as playbook: - pre_tasks = [] tasks = [] main_block = [] error_block = [] - - shellargs = "ssh-keyscan %s > %s" % ( - self.host, jobdir.known_hosts) - pre_tasks.append(dict(shell=shellargs, - delegate_to='127.0.0.1')) - tasks.append(dict(block=main_block, rescue=error_block)) @@ -1142,7 +1134,7 @@ class NodeWorker(object): error_block.append(dict(fail=dict(msg='FAILURE'))) play = dict(hosts='node', name='Job body', - pre_tasks=pre_tasks, tasks=tasks) + tasks=tasks) playbook.write(yaml.safe_dump([play], default_flow_style=False)) early_publishers, late_publishers = self._transformPublishers(jjb_job) @@ -1175,6 +1167,7 @@ class NodeWorker(object): with open(jobdir.config, 'w') as config: config.write('[defaults]\n') config.write('hostfile = %s\n' % jobdir.inventory) + config.write('host_key_checking = False\n') config.write('private_key_file = %s\n' % self.private_key_file) config.write('retry_files_enabled = False\n') config.write('log_path = %s\n' % os.path.join( @@ -1189,12 +1182,7 @@ class NodeWorker(object): 
library_path = zuul.ansible.library.__file__ library_path = os.path.abspath(library_path) library_path = os.path.dirname(library_path) - config.write('library = %s\n\n' % library_path) - - config.write('[ssh_connection]\n') - ssh_args = "-o ControlMaster=auto -o ControlPersist=60s " \ - "-o UserKnownHostsFile=%s" % jobdir.known_hosts - config.write('ssh_args = %s\n' % ssh_args) + config.write('library = %s\n' % library_path) return timeout From 0ccec4b0480f897fea48c4b85a9a414dd92dd1e6 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 18 Jun 2016 12:16:38 -0400 Subject: [PATCH 135/152] Use ftptarget since leading / is stripped Change-Id: I67865da99fa89cd6bab0654ea9eb403c1c45ac4e Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 32fc832118..0ab1137188 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -994,7 +994,7 @@ class NodeWorker(object): ftpsource = ftpsource[:-1] ftptarget = ftp['target'].lstrip('/') ftptarget = self._substituteVariables(ftptarget, parameters) - ftptarget = os.path.join(site['root'], ftp['target']) + ftptarget = os.path.join(site['root'], ftptarget) ftptarget = os.path.normpath(ftptarget) if not ftptarget.startswith(site['root']): raise Exception("Target path %s is not below site root" % From 0a0f4a022e23e994f780bcd183d9435d8eb607b3 Mon Sep 17 00:00:00 2001 From: Monty Taylor Date: Fri, 17 Jun 2016 17:52:40 -0500 Subject: [PATCH 136/152] Do timeout calculation in jinja timeout set as a non-persistent fact does not override the timeout extra_var. Additionally, the setting of the value does not occur until the task has a result, which means that the callback setting the value happens after the async will have started. 
However, setting the elapsed_time for the previous task does carry over to the beginning of the subsequent task, so the elapsed_time value is available for calculations at the async invocation. If we do the timeout - elapsed_time calculation in jinja, all of the correct values should be in scope. Change-Id: If08f8c79622f40ba92c3455d393416d880dd1bef --- zuul/ansible/plugins/callback_plugins/timeout.py | 7 +------ zuul/launcher/ansiblelaunchserver.py | 4 ++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/zuul/ansible/plugins/callback_plugins/timeout.py b/zuul/ansible/plugins/callback_plugins/timeout.py index 030ecc87a0..1cfd10df09 100644 --- a/zuul/ansible/plugins/callback_plugins/timeout.py +++ b/zuul/ansible/plugins/callback_plugins/timeout.py @@ -46,12 +46,7 @@ class CallbackModule(CallbackBase): self._elapsed_time += task_time if self._play and result._host: manager = self._play.get_variable_manager() - facts = dict(elapsed_time=self._elapsed_time) - - overall_timeout = manager.extra_vars.get('timeout') - if str(overall_timeout) != 'None': - timeout = int(overall_timeout) - int(self._elapsed_time) - facts['timeout'] = timeout + facts = dict(elapsed_time=int(self._elapsed_time)) manager.set_nonpersistent_facts(result._host, facts) self._task_start_time = None diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index e0d1cbf11b..f07e6095a7 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1028,8 +1028,8 @@ class NodeWorker(object): cwd=parameters['WORKSPACE'], parameters=parameters) task = dict(zuul_runner=runner) - task['when'] = '{{ timeout | int > 0 }}' - task['async'] = '{{ timeout }}' + task['when'] = '{{ elapsed_time < timeout | int }}' + task['async'] = '{{ timeout | int - elapsed_time }}' task['poll'] = 5 tasks.append(task) From 749ca63eb49781e6daa3cbf51634cd01ee4307af Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Fri, 17 Jun 2016 17:03:03 -0400 Subject: 
[PATCH 137/152] Include timeout in name of zuul_runner tasks This outputs our current timeout value for async. Change-Id: I9025a867f66c7158048b070ac72f4700255f3d5d Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index f07e6095a7..1252e3b159 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1028,6 +1028,8 @@ class NodeWorker(object): cwd=parameters['WORKSPACE'], parameters=parameters) task = dict(zuul_runner=runner) + task['name'] = ('zuul_runner with {{ timeout | int - elapsed_time }} ' + 'second timeout') task['when'] = '{{ elapsed_time < timeout | int }}' task['async'] = '{{ timeout | int - elapsed_time }}' task['poll'] = 5 From 8122328e37f77ba3ba7307583a57dd9e63ca46a9 Mon Sep 17 00:00:00 2001 From: Doug Hellmann Date: Mon, 25 Jan 2016 15:41:35 -0500 Subject: [PATCH 138/152] highlight the error condition when reporting a config issue Add the word "ERROR" to make it easier to find the error when searching the loooong output produced by zuul-server when checking its config file, to make debugging failures of the validation tests in patches to project-config simpler. 
Change-Id: I38cf31619eb309ebb7fc49eca4437a9c36bcb1ce Signed-off-by: Doug Hellmann --- zuul/cmd/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/cmd/server.py b/zuul/cmd/server.py index 1fb4a3292c..5e675fd56b 100755 --- a/zuul/cmd/server.py +++ b/zuul/cmd/server.py @@ -107,7 +107,7 @@ class Server(zuul.cmd.ZuulApp): jobs.add(v) for job in sorted(layout.jobs): if job not in jobs: - print("Job %s not defined" % job) + print("FAILURE: Job %s not defined" % job) failure = True return failure From b7509206cd5810a29c2fc753fa63baecce08a1db Mon Sep 17 00:00:00 2001 From: Ian Wienand Date: Tue, 21 Jun 2016 10:13:41 +1000 Subject: [PATCH 139/152] Make console log output use deliminator Make the console log look more like the old Jenkins output so that log parsing scripts, etc, don't have to be modified. We have, however, retained the higher precision timestamp. I922283891c557e489e4d75f17e78e8c31a5ca2ca tests that os-loganalyze can handle the higher precision timestamp; logstash's built-in TIMESTAMP_ISO8601 regex will also parse it (see I16d3215c94f0cd5698fc23b1f1bf6252d9157246). Change-Id: Ia5356f867d52a2758aa179f5454cbe196bbf98a2 --- zuul/ansible/library/zuul_runner.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/zuul/ansible/library/zuul_runner.py b/zuul/ansible/library/zuul_runner.py index 5a388073f7..7689fb3c8e 100644 --- a/zuul/ansible/library/zuul_runner.py +++ b/zuul/ansible/library/zuul_runner.py @@ -31,8 +31,12 @@ class Console(object): self.logfile.close() def addLine(self, ln): + # Note this format with deliminator is "inspired" by the old + # Jenkins format but with microsecond resolution instead of + # millisecond. It is kept so log parsing/formatting remains + # consistent. ts = datetime.datetime.now() - outln = '%s %s' % (str(ts), ln) + outln = '%s | %s' % (ts, ln) self.logfile.write(outln) From b547ae95420e25bb8c35372d82fcf2c20dbfbcbe Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Tue, 21 Jun 2016 08:09:16 -0700 Subject: [PATCH 140/152] Ansible launcher: copy lib/plugins at startup Because we run Zuul continuously deployed from source and installed via pip, whenever a Zuul commit lands, pip re-installs Zuul. When that happens, there may be several minutes where part of the Zuul source code is no longer on disk. Because the ansible launcher points directly into the installed Zuul source for the zuul library and plugins, jobs launched during that window will fail. To correct this, make a copy of the necessary ansible files on startup and point the jobs to that location. Change-Id: Ib0b6f07967ba314d34fad4835c4e2e84584e446c --- zuul/launcher/ansiblelaunchserver.py | 38 +++++++++++++++++++--------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 1252e3b159..4898218cc2 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -163,6 +163,24 @@ class LaunchServer(object): state_dir = '/var/lib/zuul' path = os.path.join(state_dir, 'launcher.socket') self.command_socket = commandsocket.CommandSocket(path) + ansible_dir = os.path.join(state_dir, 'ansible') + plugins_dir = os.path.join(ansible_dir, 'plugins') + self.callback_dir = os.path.join(plugins_dir, 'callback_plugins') + if not os.path.exists(self.callback_dir): + os.makedirs(self.callback_dir) + self.library_dir = os.path.join(ansible_dir, 'library') + if not os.path.exists(self.library_dir): + os.makedirs(self.library_dir) + + callback_path = os.path.dirname(os.path.abspath( + zuul.ansible.plugins.callback_plugins.__file__)) + for fn in os.listdir(callback_path): + shutil.copy(os.path.join(callback_path, fn), self.callback_dir) + + library_path = os.path.dirname(os.path.abspath( + zuul.ansible.library.__file__)) + for fn in os.listdir(library_path): + shutil.copy(os.path.join(library_path, fn), self.library_dir) for section in config.sections(): m 
= self.site_section_re.match(section) @@ -435,7 +453,8 @@ class LaunchServer(object): self.sites, args['name'], args['host'], args['description'], args['labels'], self.hostname, self.zmq_send_queue, - self.termination_queue, self.keep_jobdir) + self.termination_queue, self.keep_jobdir, + self.callback_dir, self.library_dir) self.node_workers[worker.name] = worker worker.thread = threading.Thread(target=worker.run) @@ -489,7 +508,8 @@ class LaunchServer(object): class NodeWorker(object): def __init__(self, config, jobs, builds, sites, name, host, description, labels, manager_name, zmq_send_queue, - termination_queue, keep_jobdir): + termination_queue, keep_jobdir, callback_dir, + library_dir): self.log = logging.getLogger("zuul.NodeWorker.%s" % (name,)) self.log.debug("Creating node worker %s" % (name,)) self.config = config @@ -534,6 +554,8 @@ class NodeWorker(object): self.username = config.get('launcher', 'username') else: self.username = 'zuul' + self.callback_dir = callback_dir + self.library_dir = library_dir def isAlive(self): # Meant to be called from the manager @@ -1175,16 +1197,8 @@ class NodeWorker(object): config.write('log_path = %s\n' % os.path.join( jobdir.logs, 'ansible.txt')) config.write('gathering = explicit\n') - - callback_path = zuul.ansible.plugins.callback_plugins.__file__ - callback_path = os.path.abspath(callback_path) - callback_path = os.path.dirname(callback_path) - config.write('callback_plugins = %s\n' % callback_path) - - library_path = zuul.ansible.library.__file__ - library_path = os.path.abspath(library_path) - library_path = os.path.dirname(library_path) - config.write('library = %s\n' % library_path) + config.write('callback_plugins = %s\n' % self.callback_dir) + config.write('library = %s\n' % self.library_dir) return timeout From 4cbf88565bd61f2a1c9ec6f78ffd5b47e96e2248 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Tue, 21 Jun 2016 08:13:53 -0700 Subject: [PATCH 141/152] Ansible launcher: add '|' to zuul_log To match the format in zuul_runner. Change-Id: I863e4735ffebfc890782242bd3593a421bfb141b --- zuul/ansible/library/zuul_log.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zuul/ansible/library/zuul_log.py b/zuul/ansible/library/zuul_log.py index 2072bc9634..4b377d9079 100644 --- a/zuul/ansible/library/zuul_log.py +++ b/zuul/ansible/library/zuul_log.py @@ -29,7 +29,7 @@ class Console(object): def addLine(self, ln): ts = datetime.datetime.now() - outln = '%s %s' % (str(ts), ln) + outln = '%s | %s' % (str(ts), ln) self.logfile.write(outln) From 21ba08b40b00419afb0cbbbaf7fb8906cade5374 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Mon, 20 Jun 2016 14:32:38 -0400 Subject: [PATCH 142/152] Add local_tmp to ansible.cfg Since we are running more than 1 ansible-playbook process on our zuul-launcher, let's err on the side of caution. Include local_tmp into our job directory so as not to have other ansible-playbook processes working in the same tmp directory. 
Change-Id: Ide01d42679f676ed969363d627f94812dcb160e3 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 4898218cc2..6f48a05bc7 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1192,6 +1192,7 @@ class NodeWorker(object): config.write('[defaults]\n') config.write('hostfile = %s\n' % jobdir.inventory) config.write('host_key_checking = False\n') + config.write('local_tmp = %s/.ansible/tmp\n' % jobdir.root) config.write('private_key_file = %s\n' % self.private_key_file) config.write('retry_files_enabled = False\n') config.write('log_path = %s\n' % os.path.join( From a31a74e1698d1327bd202a4f2bd9cbaab5a2671e Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Mon, 20 Jun 2016 15:38:27 -0400 Subject: [PATCH 143/152] Enable keep_remote_files in ansible.cfg When ansible is not using pipelining, files are copied to the remote worker then run locally on said worker. Ansible being the good program that it is, attempts to clean up these files once finished with them. As a result, it may be possible for ansible async to delete these files before actually executing them[1]. Consideration should be taken for long lived workers, as we may want to update our remote_tmp setting to use the /tmp folder. We can discuss that in a follow up patch. 
[1] https://github.com/ansible/ansible/issues/10855 Change-Id: Idb1efcdde9d6fe6d74fc2131a4fecddcd1e46904 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 6f48a05bc7..feca816961 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1192,6 +1192,7 @@ class NodeWorker(object): config.write('[defaults]\n') config.write('hostfile = %s\n' % jobdir.inventory) config.write('host_key_checking = False\n') + config.write('keep_remote_files = True\n') config.write('local_tmp = %s/.ansible/tmp\n' % jobdir.root) config.write('private_key_file = %s\n' % self.private_key_file) config.write('retry_files_enabled = False\n') From 46877906b91778e1f3cec5cee289768823094c39 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 17 Jun 2016 10:27:53 -0700 Subject: [PATCH 144/152] Ansible launcher: add support for node revocation So that if we issue a nodepool delete on an assigned node, nodepool can notify the launcher that it no longer exists. 
Change-Id: Ib1dc0879e1cbbbdb6c0b16b6bb52ff9f8b98a25d --- zuul/launcher/ansiblelaunchserver.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index feca816961..0e66dd1ab5 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -294,6 +294,7 @@ class LaunchServer(object): new_functions.add("node_assign:zuul") new_functions.add("stop:%s" % self.hostname) new_functions.add("set_description:%s" % self.hostname) + new_functions.add("node_revoke:%s" % self.hostname) for function in new_functions - self.registered_functions: self.worker.registerFunction(function) @@ -429,6 +430,9 @@ class LaunchServer(object): self.log.debug("Got set_description job: %s" % job.unique) job.sendWorkComplete() + elif job.name.startswith('node_revoke:'): + self.log.debug("Got node_revoke job: %s" % job.unique) + self.revokeNode(job) else: self.log.error("Unable to handle job %s" % job.name) job.sendWorkFail() @@ -460,6 +464,27 @@ class LaunchServer(object): worker.thread = threading.Thread(target=worker.run) worker.thread.start() + def revokeNode(self, job): + try: + args = json.loads(job.arguments) + self.log.debug("Revoke job with arguments: %s" % (args,)) + name = args['name'] + node = self.node_workers.get(name) + if not node: + self.log.debug("Unable to find worker %s" % (name,)) + return + try: + if node.isAlive(): + node.queue.put(dict(action='stop')) + else: + self.log.debug("Node %s is not alive while revoking node" % + (node.name,)) + except Exception: + self.log.exception("Exception sending stop command " + "to worker:") + finally: + job.sendWorkComplete() + def stopJob(self, job): try: args = json.loads(job.arguments) From b964f4a47412c561259cb77493ddfaca61dedaa0 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 18 Jun 2016 11:25:09 -0400 Subject: [PATCH 145/152] Move job timeout into playbook as vars This is more a cosmetic change 
for the purpose of making our playbooks easier to debug. Currently timeout was passed into our ansible playbook via the CLI, however since we build up our playbooks from scratch we can simply change to a var inside the playbook. Our functionality should not change but provides users an easier way to determine the initial timeout value. Change-Id: Id5d68ce1ca3025757349fb5765993889c92fb309 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 0e66dd1ab5..3d58ed391c 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1150,6 +1150,8 @@ class NodeWorker(object): tasks = [] main_block = [] error_block = [] + variables = [] + tasks.append(dict(block=main_block, rescue=error_block)) @@ -1182,7 +1184,8 @@ class NodeWorker(object): error_block.append(task) error_block.append(dict(fail=dict(msg='FAILURE'))) - play = dict(hosts='node', name='Job body', + variables.append(dict(timeout=timeout)) + play = dict(hosts='node', name='Job body', vars=variables, tasks=tasks) playbook.write(yaml.safe_dump([play], default_flow_style=False)) @@ -1239,8 +1242,7 @@ class NodeWorker(object): env_copy = os.environ.copy() env_copy['LOGNAME'] = 'zuul' - cmd = ['ansible-playbook', jobdir.playbook, - '-e', 'timeout=%s' % timeout, '-v'] + cmd = ['ansible-playbook', jobdir.playbook, '-v'] self.log.debug("Ansible command: %s" % (cmd,)) self.ansible_job_proc = subprocess.Popen( From cf30031ccd55149d55918b8e189087c9629ceb91 Mon Sep 17 00:00:00 2001 From: Paul Belanger Date: Sat, 18 Jun 2016 12:47:58 -0400 Subject: [PATCH 146/152] Remove unused timeout args for _makeBuilderTask Change-Id: If8c6aca0735ca2716fe8402d5665c10973593344 Signed-off-by: Paul Belanger --- zuul/launcher/ansiblelaunchserver.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git
a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 3d58ed391c..a9b9733ece 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -1054,7 +1054,7 @@ class NodeWorker(object): tasks.append(task) return tasks - def _makeBuilderTask(self, jobdir, builder, parameters, timeout): + def _makeBuilderTask(self, jobdir, builder, parameters): tasks = [] script_fn = '%s.sh' % str(uuid.uuid4().hex) script_path = os.path.join(jobdir.script_root, script_fn) @@ -1175,8 +1175,7 @@ class NodeWorker(object): for builder in jjb_job.get('builders', []): if 'shell' in builder: main_block.extend( - self._makeBuilderTask(jobdir, builder, - parameters, timeout)) + self._makeBuilderTask(jobdir, builder, parameters)) task = dict(zuul_log=dict(msg="Job complete, result: SUCCESS")) main_block.append(task) From cb993deabed700cd1aff6a650ca6c717117ea066 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 21 Jun 2016 08:33:18 -0700 Subject: [PATCH 147/152] Revert "Revert "Enabled host_key_checking"" This reverts commit 3bc31c889b86ce9b1acefde94db5e3e8b53939be. 
Change-Id: Id8d5ae3d8bd764865ca89efb8926e80f1fcbb6a4 --- zuul/launcher/ansiblelaunchserver.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index a9b9733ece..7061e71bfa 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -105,6 +105,7 @@ class JobDir(object): self.root = tempfile.mkdtemp() self.ansible_root = os.path.join(self.root, 'ansible') os.makedirs(self.ansible_root) + self.known_hosts = os.path.join(self.ansible_root, 'known_hosts') self.inventory = os.path.join(self.ansible_root, 'inventory') self.playbook = os.path.join(self.ansible_root, 'playbook') self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') @@ -1147,11 +1148,17 @@ class NodeWorker(object): parameters[timeout_var] = str(timeout * 1000) with open(jobdir.playbook, 'w') as playbook: + pre_tasks = [] tasks = [] main_block = [] error_block = [] variables = [] + shellargs = "ssh-keyscan %s > %s" % ( + self.host, jobdir.known_hosts) + pre_tasks.append(dict(shell=shellargs, + delegate_to='127.0.0.1')) + tasks.append(dict(block=main_block, rescue=error_block)) @@ -1185,7 +1192,7 @@ class NodeWorker(object): variables.append(dict(timeout=timeout)) play = dict(hosts='node', name='Job body', vars=variables, - tasks=tasks) + pre_tasks=pre_tasks, tasks=tasks) playbook.write(yaml.safe_dump([play], default_flow_style=False)) early_publishers, late_publishers = self._transformPublishers(jjb_job) @@ -1218,7 +1225,6 @@ class NodeWorker(object): with open(jobdir.config, 'w') as config: config.write('[defaults]\n') config.write('hostfile = %s\n' % jobdir.inventory) - config.write('host_key_checking = False\n') config.write('keep_remote_files = True\n') config.write('local_tmp = %s/.ansible/tmp\n' % jobdir.root) config.write('private_key_file = %s\n' % self.private_key_file) @@ -1229,6 +1235,11 @@ class NodeWorker(object): 
config.write('callback_plugins = %s\n' % self.callback_dir) config.write('library = %s\n' % self.library_dir) + config.write('[ssh_connection]\n') + ssh_args = "-o ControlMaster=auto -o ControlPersist=60s " \ + "-o UserKnownHostsFile=%s" % jobdir.known_hosts + config.write('ssh_args = %s\n' % ssh_args) + return timeout def _ansibleTimeout(self, proc, msg): From 9ff15063e57f10b4c85122fc389586b12a0d8bc9 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 21 Jun 2016 10:45:03 -0700 Subject: [PATCH 148/152] Ansible launcher: move ansible log to ansible root There are no other logs, so instead of writing the ansible log dir in the logs/ directory, just put it in the ansible root. This is friendlier on the log server. Change-Id: I11c51e8dab1f061a3bb509965017ee6d352ed57b --- zuul/launcher/ansiblelaunchserver.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 7061e71bfa..e0458ea7e9 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -111,9 +111,8 @@ class JobDir(object): self.post_playbook = os.path.join(self.ansible_root, 'post_playbook') self.config = os.path.join(self.ansible_root, 'ansible.cfg') self.script_root = os.path.join(self.ansible_root, 'scripts') + self.ansible_log = os.path.join(self.ansible_root, 'ansible_log.txt') os.makedirs(self.script_root) - self.logs = os.path.join(self.ansible_root, 'logs') - os.makedirs(self.logs) self.staging_root = os.path.join(self.root, 'staging') os.makedirs(self.staging_root) @@ -1229,8 +1228,7 @@ class NodeWorker(object): config.write('local_tmp = %s/.ansible/tmp\n' % jobdir.root) config.write('private_key_file = %s\n' % self.private_key_file) config.write('retry_files_enabled = False\n') - config.write('log_path = %s\n' % os.path.join( - jobdir.logs, 'ansible.txt')) + config.write('log_path = %s\n' % jobdir.ansible_log) config.write('gathering = explicit\n') 
config.write('callback_plugins = %s\n' % self.callback_dir) config.write('library = %s\n' % self.library_dir) From 491e836afb64e315126e8cc9539fdc2b63de42e7 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 22 Jun 2016 16:12:38 -0700 Subject: [PATCH 149/152] Ansible launcher: add verbose command Add a command to enable verbose ansible output, as well as a second command to disable it. This way it is easy to enable/disable verbose output when debugging a problem without needing to restart. Change-Id: I9b2ba07c83868662542a8632620af11361b0f868 --- zuul/launcher/ansiblelaunchserver.py | 36 ++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index e0458ea7e9..569c96698e 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -42,7 +42,8 @@ ANSIBLE_DEFAULT_TIMEOUT = 2 * 60 * 60 ANSIBLE_DEFAULT_POST_TIMEOUT = 10 * 60 -COMMANDS = ['reconfigure', 'stop', 'pause', 'unpause', 'release', 'graceful'] +COMMANDS = ['reconfigure', 'stop', 'pause', 'unpause', 'release', 'graceful', + 'verbose', 'unverbose'] def boolify(x): @@ -131,6 +132,9 @@ class LaunchServer(object): def __init__(self, config, keep_jobdir=False): self.config = config + self.options = dict( + verbose=False + ) self.keep_jobdir = keep_jobdir self.hostname = socket.gethostname() self.registered_functions = set() @@ -148,6 +152,8 @@ class LaunchServer(object): unpause=self.unpause, release=self.release, graceful=self.graceful, + verbose=self.verboseOn, + unverbose=self.verboseOff, ) if config.has_option('launcher', 'accept_nodes'): @@ -391,6 +397,14 @@ class LaunchServer(object): # class, which is called by the command shell. 
self.log.debug("Stopped") + def verboseOn(self): + self.log.debug("Enabling verbose mode") + self.options['verbose'] = True + + def verboseOff(self): + self.log.debug("Disabling verbose mode") + self.options['verbose'] = False + def join(self): self.command_thread.join() @@ -458,7 +472,8 @@ class LaunchServer(object): args['description'], args['labels'], self.hostname, self.zmq_send_queue, self.termination_queue, self.keep_jobdir, - self.callback_dir, self.library_dir) + self.callback_dir, self.library_dir, + self.options) self.node_workers[worker.name] = worker worker.thread = threading.Thread(target=worker.run) @@ -534,7 +549,7 @@ class NodeWorker(object): def __init__(self, config, jobs, builds, sites, name, host, description, labels, manager_name, zmq_send_queue, termination_queue, keep_jobdir, callback_dir, - library_dir): + library_dir, options): self.log = logging.getLogger("zuul.NodeWorker.%s" % (name,)) self.log.debug("Creating node worker %s" % (name,)) self.config = config @@ -581,6 +596,7 @@ class NodeWorker(object): self.username = 'zuul' self.callback_dir = callback_dir self.library_dir = library_dir + self.options = options def isAlive(self): # Meant to be called from the manager @@ -1250,7 +1266,12 @@ class NodeWorker(object): env_copy = os.environ.copy() env_copy['LOGNAME'] = 'zuul' - cmd = ['ansible-playbook', jobdir.playbook, '-v'] + if self.options['verbose']: + verbose = '-vvv' + else: + verbose = '-v' + + cmd = ['ansible-playbook', jobdir.playbook, verbose] self.log.debug("Ansible command: %s" % (cmd,)) self.ansible_job_proc = subprocess.Popen( @@ -1291,8 +1312,13 @@ class NodeWorker(object): env_copy = os.environ.copy() env_copy['LOGNAME'] = 'zuul' + if self.options['verbose']: + verbose = '-vvv' + else: + verbose = '-v' + cmd = ['ansible-playbook', jobdir.post_playbook, - '-e', 'success=%s' % success, '-v'] + '-e', 'success=%s' % success, verbose] self.log.debug("Ansible post command: %s" % (cmd,)) self.ansible_post_proc = subprocess.Popen( 
From 7d27b985a81f5b0da8db8b897b9c31275f6dee02 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Fri, 24 Jun 2016 15:36:16 -0700 Subject: [PATCH 150/152] Ansible launcher: clean up result handling Have runJob always return a string or None for simplicity. Construct the result dictionary only when sending the work_complete packet, and send a dictionary with a result element every time, even if the value is None. Zuul will react the same whether there is no dictionary, or it has no result or the result is null, and retry the job. Finally, send a simple string as status rather than the result dictionary to ZMQ. If the job is aborted, status will be null. Change-Id: Ie093c070c2fb4baf67f538b79826a7d8e42b18d9 --- zuul/launcher/ansiblelaunchserver.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index 569c96698e..ed9c213908 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -807,11 +807,10 @@ class NodeWorker(object): self.log.exception("Exception while launching job thread") self._running_job = False - if not result: - result = b'' try: - job.sendWorkComplete(result) + data = json.dumps(dict(result=result)) + job.sendWorkComplete(data) except Exception: self.log.exception("Exception while sending job completion packet") @@ -894,16 +893,16 @@ class NodeWorker(object): post_status = self.runAnsiblePostPlaybook(jobdir, job_status) if not post_status: - status = 'POST_FAILURE' + result = 'POST_FAILURE' elif job_status: - status = 'SUCCESS' + result = 'SUCCESS' else: - status = 'FAILURE' + result = 'FAILURE' - if not self._aborted_job: + if self._aborted_job: # A Null result will cause zuul to relaunch the job if # it needs to. - result = json.dumps(dict(result=status)) + result = None return result From 4c51d9c6da82cfc64281450fd710f4d41df0c162 Mon Sep 17 00:00:00 2001 From: "James E. 
Blair" Date: Mon, 11 Jul 2016 15:54:39 -0700 Subject: [PATCH 151/152] Ansible launcher: fix pause/unpause accept nodes When unpausing the launcher, it would begin to accept nodes regardless of the original configuration setting. Keep a record of that setting so that we can return to it when we unpause. Change-Id: Ib8be2bda0cc53c7a6a37ed7107520a944330adc0 --- zuul/launcher/ansiblelaunchserver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zuul/launcher/ansiblelaunchserver.py b/zuul/launcher/ansiblelaunchserver.py index ed9c213908..95fc2fa46a 100644 --- a/zuul/launcher/ansiblelaunchserver.py +++ b/zuul/launcher/ansiblelaunchserver.py @@ -161,6 +161,7 @@ class LaunchServer(object): 'accept_nodes') else: self.accept_nodes = True + self.config_accept_nodes = self.accept_nodes if self.config.has_option('zuul', 'state_dir'): state_dir = os.path.expanduser( @@ -335,7 +336,7 @@ class LaunchServer(object): def unpause(self): self.log.debug("Unpausing") - self.accept_nodes = True + self.accept_nodes = self.config_accept_nodes self.register() for node in self.node_workers.values(): try: From 9f16d522a976fc32d841e2d2bd92d6a4216c92c5 Mon Sep 17 00:00:00 2001 From: Sachi King Date: Wed, 16 Mar 2016 12:20:45 +1100 Subject: [PATCH 152/152] Support post jobs by supporting rev checkout Currently zuul-cloner does not support post jobs, as it does not know what to checkout. This adds the ability on a per project basis to specify a revision to be checked out. When specified zuul-cloner will successfully check out the same repo as gerrit-git-prep.sh does in post jobs. 
Sample usage: clonemap: - name: openstack/neutron dest: ./neu - name: openstack/requirements dest: ./reqs export ZUUL_PROJECT="openstack/neutron" export ZUUL_NEWREV="a2Fhc2Rma2FzZHNkZjhkYXM4OWZhc25pb2FzODkK" export ZUUL_BRANCH="stable/liberty" zuul-cloner -m map.yaml git://git.openstack.org $ZUUL_PROJECT \ openstack/requirements This results with openstack/neutron checked out at rev a2Fhc2 and openstack/requirements at 'heads/stable/liberty' Change-Id: Ie9b03508a44f04adfbe2696cde136439ebffb9a6 --- tests/base.py | 11 +++++++++ tests/test_cloner.py | 54 ++++++++++++++++++++++++++++++++++++++++++++ zuul/cmd/cloner.py | 8 +++++++ zuul/exceptions.py | 9 ++++++++ zuul/lib/cloner.py | 48 +++++++++++++++++++++++++++++++-------- 5 files changed, 120 insertions(+), 10 deletions(-) diff --git a/tests/base.py b/tests/base.py index 405caa0ded..7945a0b838 100755 --- a/tests/base.py +++ b/tests/base.py @@ -1132,6 +1132,17 @@ class ZuulTestCase(BaseTestCase): zuul.merger.merger.reset_repo_to_head(repo) repo.git.clean('-x', '-f', '-d') + def create_commit(self, project): + path = os.path.join(self.upstream_root, project) + repo = git.Repo(path) + repo.head.reference = repo.heads['master'] + file_name = os.path.join(path, 'README') + with open(file_name, 'a') as f: + f.write('creating fake commit\n') + repo.index.add([file_name]) + commit = repo.index.commit('Creating a fake commit') + return commit.hexsha + def ref_has_change(self, ref, change): path = os.path.join(self.git_root, change.project) repo = git.Repo(path) diff --git a/tests/test_cloner.py b/tests/test_cloner.py index 137c1570ee..e3576bdad8 100644 --- a/tests/test_cloner.py +++ b/tests/test_cloner.py @@ -566,3 +566,57 @@ class TestCloner(ZuulTestCase): self.worker.hold_jobs_in_build = False self.worker.release() self.waitUntilSettled() + + def test_post_checkout(self): + project = "org/project" + path = os.path.join(self.upstream_root, project) + repo = git.Repo(path) + repo.head.reference = repo.heads['master'] + 
commits = [] + for i in range(0, 3): + commits.append(self.create_commit(project)) + newRev = commits[1] + + cloner = zuul.lib.cloner.Cloner( + git_base_url=self.upstream_root, + projects=[project], + workspace=self.workspace_root, + zuul_branch=None, + zuul_ref='master', + zuul_url=self.git_root, + zuul_project=project, + zuul_newrev=newRev, + ) + cloner.execute() + repos = self.getWorkspaceRepos([project]) + cloned_sha = repos[project].rev_parse('HEAD').hexsha + self.assertEqual(newRev, cloned_sha) + + def test_post_and_master_checkout(self): + project = "org/project1" + master_project = "org/project2" + path = os.path.join(self.upstream_root, project) + repo = git.Repo(path) + repo.head.reference = repo.heads['master'] + commits = [] + for i in range(0, 3): + commits.append(self.create_commit(project)) + newRev = commits[1] + + cloner = zuul.lib.cloner.Cloner( + git_base_url=self.upstream_root, + projects=[project, master_project], + workspace=self.workspace_root, + zuul_branch=None, + zuul_ref='master', + zuul_url=self.git_root, + zuul_project=project, + zuul_newrev=newRev + ) + cloner.execute() + repos = self.getWorkspaceRepos([project, master_project]) + cloned_sha = repos[project].rev_parse('HEAD').hexsha + self.assertEqual(newRev, cloned_sha) + self.assertEqual( + repos[master_project].rev_parse('HEAD').hexsha, + repos[master_project].rev_parse('master').hexsha) diff --git a/zuul/cmd/cloner.py b/zuul/cmd/cloner.py index c616aa145f..4f8b9f474a 100755 --- a/zuul/cmd/cloner.py +++ b/zuul/cmd/cloner.py @@ -27,6 +27,8 @@ ZUUL_ENV_SUFFIXES = ( 'branch', 'ref', 'url', + 'project', + 'newrev', ) @@ -98,6 +100,10 @@ class Cloner(zuul.cmd.ZuulApp): parser.error("Specifying a Zuul ref requires a Zuul url. " "Define Zuul arguments either via environment " "variables or using options above.") + if 'zuul_newrev' in zuul_args and 'zuul_project' not in zuul_args: + parser.error("ZUUL_NEWREV has been specified without " + "ZUUL_PROJECT. 
Please define a ZUUL_PROJECT or do " + "not set ZUUL_NEWREV.") self.args = args @@ -145,6 +151,8 @@ class Cloner(zuul.cmd.ZuulApp): clone_map_file=self.args.clone_map_file, project_branches=project_branches, cache_dir=self.args.cache_dir, + zuul_newrev=self.args.zuul_newrev, + zuul_project=self.args.zuul_project, ) cloner.execute() diff --git a/zuul/exceptions.py b/zuul/exceptions.py index 2bd2c6b4b6..40a1e40f52 100644 --- a/zuul/exceptions.py +++ b/zuul/exceptions.py @@ -22,5 +22,14 @@ class ChangeNotFound(Exception): super(ChangeNotFound, self).__init__(message) +class RevNotFound(Exception): + def __init__(self, project, rev): + self.project = project + self.revision = rev + message = ("Failed to checkout project '%s' at revision '%s'" + % (self.project, self.revision)) + super(RevNotFound, self).__init__(message) + + class MergeFailure(Exception): pass diff --git a/zuul/lib/cloner.py b/zuul/lib/cloner.py index f0235a6965..62ab9388ea 100644 --- a/zuul/lib/cloner.py +++ b/zuul/lib/cloner.py @@ -20,6 +20,7 @@ import re import yaml from git import GitCommandError +from zuul import exceptions from zuul.lib.clonemapper import CloneMapper from zuul.merger.merger import Repo @@ -29,7 +30,8 @@ class Cloner(object): def __init__(self, git_base_url, projects, workspace, zuul_branch, zuul_ref, zuul_url, branch=None, clone_map_file=None, - project_branches=None, cache_dir=None): + project_branches=None, cache_dir=None, zuul_newrev=None, + zuul_project=None): self.clone_map = [] self.dests = None @@ -43,6 +45,10 @@ class Cloner(object): self.zuul_ref = zuul_ref or '' self.zuul_url = zuul_url self.project_branches = project_branches or {} + self.project_revisions = {} + + if zuul_newrev and zuul_project: + self.project_revisions[zuul_project] = zuul_newrev if clone_map_file: self.readCloneMap(clone_map_file) @@ -119,10 +125,15 @@ class Cloner(object): """Clone a repository for project at dest and apply a reference suitable for testing. 
The reference lookup is attempted in this order: - 1) Zuul reference for the indicated branch - 2) Zuul reference for the master branch - 3) The tip of the indicated branch - 4) The tip of the master branch + 1) The indicated revision for specific project + 2) Zuul reference for the indicated branch + 3) Zuul reference for the master branch + 4) The tip of the indicated branch + 5) The tip of the master branch + + If an "indicated revision" is specified for this project, and we are + unable to meet this requirement, we stop attempting to check this + repo out and raise a zuul.exceptions.RevNotFound exception. The "indicated branch" is one of the following: @@ -142,6 +153,10 @@ class Cloner(object): # `git branch` is happy with. repo.reset() + indicated_revision = None + if project in self.project_revisions: + indicated_revision = self.project_revisions[project] + indicated_branch = self.branch or self.zuul_branch if project in self.project_branches: indicated_branch = self.project_branches[project] @@ -167,13 +182,26 @@ class Cloner(object): else: fallback_zuul_ref = None + # If the user has requested an explicit revision to be checked out, + # we use it above all else, and if we cannot satisfy this requirement + # we raise an error and do not attempt to continue. + if indicated_revision: + self.log.info("Attempting to check out revision %s for " + "project %s", indicated_revision, project) + try: + self.fetchFromZuul(repo, project, self.zuul_ref) + commit = repo.checkout(indicated_revision) + except (ValueError, GitCommandError): + raise exceptions.RevNotFound(project, indicated_revision) + self.log.info("Prepared '%s' repo at revision '%s'", project, + indicated_revision) # If we have a non empty zuul_ref to use, use it. Otherwise we fall # back to checking out the branch. 
- if ((override_zuul_ref and - self.fetchFromZuul(repo, project, override_zuul_ref)) or - (fallback_zuul_ref and - fallback_zuul_ref != override_zuul_ref and - self.fetchFromZuul(repo, project, fallback_zuul_ref))): + elif ((override_zuul_ref and + self.fetchFromZuul(repo, project, override_zuul_ref)) or + (fallback_zuul_ref and + fallback_zuul_ref != override_zuul_ref and + self.fetchFromZuul(repo, project, fallback_zuul_ref))): # Work around a bug in GitPython which can not parse FETCH_HEAD gitcmd = git.Git(dest) fetch_head = gitcmd.rev_parse('FETCH_HEAD')