From b2332081d8e0fda780bbc1f4dfd917f2546d1724 Mon Sep 17 00:00:00 2001 From: Timothy Chavez Date: Fri, 7 Aug 2015 20:08:04 -0500 Subject: [PATCH] Report the per-job build wait time to graphite Knowing how long a job waits to build in aggregate can give useful insights into the performance and capacity of the build system. This change also uses the node labels sent back from the gearman worker to submit metrics within that context. Change-Id: I173251c8b1569755124b7cb1a48b6274bf38c94b Depends-On: Ibca938fcf8a65facd7e39dab4eb994dfc637722a --- tests/base.py | 2 ++ zuul/cmd/client.py | 8 +++++++- zuul/launcher/gearman.py | 4 +++- zuul/model.py | 6 +++++- zuul/scheduler.py | 24 ++++++++++++++++++++++-- 5 files changed, 39 insertions(+), 5 deletions(-) diff --git a/tests/base.py b/tests/base.py index 535bb7fac4..fd27a89f87 100755 --- a/tests/base.py +++ b/tests/base.py @@ -598,6 +598,8 @@ class FakeBuild(threading.Thread): result = 'RUN_ERROR' else: data['result'] = result + data['node_labels'] = ['bare-necessities'] + data['node_name'] = 'foo' work_fail = False changes = None diff --git a/zuul/cmd/client.py b/zuul/cmd/client.py index bc2c152d80..10f28540ec 100644 --- a/zuul/cmd/client.py +++ b/zuul/cmd/client.py @@ -232,6 +232,12 @@ class Client(zuul.cmd.ZuulApp): 'number': { 'title': 'Number' }, + 'node_labels': { + 'title': 'Node Labels' + }, + 'node_name': { + 'title': 'Node Name' + }, 'worker.name': { 'title': 'Worker' }, @@ -245,7 +251,7 @@ class Client(zuul.cmd.ZuulApp): 'worker.fqdn': { 'title': 'Worker Domain' }, - 'worker.progam': { + 'worker.program': { 'title': 'Worker Program' }, 'worker.version': { diff --git a/zuul/launcher/gearman.py b/zuul/launcher/gearman.py index 57ac5cacd6..6c8b689060 100644 --- a/zuul/launcher/gearman.py +++ b/zuul/launcher/gearman.py @@ -423,9 +423,11 @@ class Gearman(object): build = self.builds.get(job.unique) if build: + data = getJobData(job) + build.node_labels = data.get('node_labels') + build.node_name = data.get('node_name') if not build.canceled: if result is None: - data = getJobData(job) result = data.get('result') if result is None: build.retry = True diff --git a/zuul/model.py b/zuul/model.py index 4b907c3e45..e852033372 100644 --- a/zuul/model.py +++ b/zuul/model.py @@ -578,6 +578,8 @@ class Build(object): self.retry = False self.parameters = {} self.worker = Worker() + self.node_labels = [] + self.node_name = None def __repr__(self): return ('' % @@ -799,7 +801,9 @@ class QueueItem(object): 'canceled': build.canceled if build else None, 'retry': build.retry if build else None, 'number': build.number if build else None, - 'worker': worker + 'node_labels': build.node_labels if build else [], + 'node_name': build.node_name if build else None, + 'worker': worker, }) if self.pipeline.haveAllJobsStarted(self): diff --git a/zuul/scheduler.py b/zuul/scheduler.py index 9620036a10..f3ecc033fc 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -533,6 +533,16 @@ class Scheduler(threading.Thread): def onBuildStarted(self, build): self.log.debug("Adding start event for build: %s" % build) build.start_time = time.time() + try: + if statsd and build.pipeline: + jobname = build.job.name.replace('.', '_') + key = 'zuul.pipeline.%s.job.%s.wait_time' % ( + build.pipeline.name, jobname) + dt = int((build.start_time - build.launch_time) * 1000) + statsd.timing(key, dt) + statsd.incr(key) + except: + self.log.exception("Exception reporting runtime stats") event = BuildStartedEvent(build) self.result_event_queue.put(event) self.wake_event.set() @@ -544,14 +554,24 @@ class Scheduler(threading.Thread): try: if statsd and build.pipeline: jobname = build.job.name.replace('.', '_') + key = 'zuul.pipeline.%s.all_jobs' % build.pipeline.name + for label in build.node_labels: + # Jenkins includes the node name in its list of labels, so + # we filter it out here, since that is not statistically + # interesting. + if label == build.node_name: + continue + dt = int((build.start_time - build.launch_time) * 1000) + key = 'zuul.node_type.%s.job.%s.wait_time' % ( + label, jobname) + statsd.timing(key, dt) + statsd.incr(key) key = 'zuul.pipeline.%s.job.%s.%s' % (build.pipeline.name, jobname, build.result) if build.result in ['SUCCESS', 'FAILURE'] and build.start_time: dt = int((build.end_time - build.start_time) * 1000) statsd.timing(key, dt) statsd.incr(key) - key = 'zuul.pipeline.%s.all_jobs' % build.pipeline.name - statsd.incr(key) except: self.log.exception("Exception reporting runtime stats") event = BuildCompletedEvent(build)