Report the per-job build wait time to graphite

Knowing how long a job waits to build in aggregate can give useful
insights into the performance and capacity of the build system. This
change also uses the node labels sent back from the gearman worker to
report the same wait-time metric per node type.

Change-Id: I173251c8b1569755124b7cb1a48b6274bf38c94b
Depends-On: Ibca938fcf8a65facd7e39dab4eb994dfc637722a
Timothy Chavez 2015-08-07 20:08:04 -05:00
parent f8e892567c
commit b2332081d8
5 changed files with 39 additions and 5 deletions
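For orientation before the diffs: the scheduler changes below report the gap between a build being launched and the build actually starting under two statsd key families, one per pipeline and job, one per node label. A minimal sketch of the key construction, mirroring the formats used in Scheduler.onBuildStarted and onBuildCompleted; the pipeline, job, and label names here are invented for illustration:

    def wait_time_keys(pipeline, job, node_labels, node_name):
        # Dots in job names become underscores, as in the scheduler code below.
        job = job.replace('.', '_')
        keys = ['zuul.pipeline.%s.job.%s.wait_time' % (pipeline, job)]
        for label in node_labels:
            # Jenkins reports the node name as one of its labels; it is
            # filtered out, matching the check in onBuildCompleted.
            if label == node_name:
                continue
            keys.append('zuul.node_type.%s.job.%s.wait_time' % (label, job))
        return keys

    wait_time_keys('check', 'gate-foo', ['bare-trusty', 'node1'], 'node1')
    # ['zuul.pipeline.check.job.gate-foo.wait_time',
    #  'zuul.node_type.bare-trusty.job.gate-foo.wait_time']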


@@ -598,6 +598,8 @@ class FakeBuild(threading.Thread):
result = 'RUN_ERROR'
else:
data['result'] = result
data['node_labels'] = ['bare-necessities']
data['node_name'] = 'foo'
work_fail = False
changes = None


@@ -232,6 +232,12 @@ class Client(zuul.cmd.ZuulApp):
'number': {
'title': 'Number'
},
'node_labels': {
'title': 'Node Labels'
},
'node_name': {
'title': 'Node Name'
},
'worker.name': {
'title': 'Worker'
},
@@ -245,7 +251,7 @@ class Client(zuul.cmd.ZuulApp):
'worker.fqdn': {
'title': 'Worker Domain'
},
-'worker.progam': {
+'worker.program': {
'title': 'Worker Program'
},
'worker.version': {


@@ -423,9 +423,11 @@ class Gearman(object):
build = self.builds.get(job.unique)
if build:
data = getJobData(job)
build.node_labels = data.get('node_labels')
build.node_name = data.get('node_name')
if not build.canceled:
if result is None:
data = getJobData(job)
result = data.get('result')
if result is None:
build.retry = True
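For context on the two new build attributes: the launcher reads them from the job data blob the gearman build worker sends back. A hedged sketch of such a payload, reusing the field values from the FakeBuild fixture above; the 'result' value and the framing around it are illustrative only:

    import json

    # Job data a build worker could return on WORK_COMPLETE; the new code
    # reads only 'node_labels' and 'node_name', alongside the existing 'result'.
    work_data = json.dumps({
        'result': 'SUCCESS',
        'node_labels': ['bare-necessities'],
        'node_name': 'foo',
    })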


@@ -578,6 +578,8 @@ class Build(object):
self.retry = False
self.parameters = {}
self.worker = Worker()
self.node_labels = []
self.node_name = None

def __repr__(self):
return ('<Build %s of %s on %s>' %
@@ -799,7 +801,9 @@ class QueueItem(object):
'canceled': build.canceled if build else None,
'retry': build.retry if build else None,
'number': build.number if build else None,
-'worker': worker
+'node_labels': build.node_labels if build else [],
+'node_name': build.node_name if build else None,
+'worker': worker,
})
if self.pipeline.haveAllJobsStarted(self):
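Each build entry in the status JSON therefore grows two fields. A rough sketch of the resulting shape, limited to the keys visible in this hunk; all values are invented, and the nested worker dict is a guess at what the Worker serialization looks like:

    job_status = {
        'canceled': False,
        'retry': False,
        'number': 1,
        'node_labels': ['bare-trusty'],
        'node_name': 'node1',
        'worker': {'name': 'worker01'},  # assumed shape, not shown in this hunk
    }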


@@ -533,6 +533,16 @@ class Scheduler(threading.Thread):
def onBuildStarted(self, build):
self.log.debug("Adding start event for build: %s" % build)
build.start_time = time.time()
try:
if statsd and build.pipeline:
jobname = build.job.name.replace('.', '_')
key = 'zuul.pipeline.%s.job.%s.wait_time' % (
build.pipeline.name, jobname)
dt = int((build.start_time - build.launch_time) * 1000)
statsd.timing(key, dt)
statsd.incr(key)
except:
self.log.exception("Exception reporting runtime stats")
event = BuildStartedEvent(build)
self.result_event_queue.put(event)
self.wake_event.set()
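The wait time itself is the difference between when the build was launched (build.launch_time) and when the worker reported it starting (build.start_time), converted to milliseconds for statsd. A worked example with invented timestamps:

    launch_time = 1438992000.0   # build handed to gearman (illustrative)
    start_time = 1438992012.5    # worker reports the build has started
    dt = int((start_time - launch_time) * 1000)
    # dt == 12500; reported as both a timer and a counter on the wait_time key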
@@ -544,14 +554,24 @@
try:
if statsd and build.pipeline:
jobname = build.job.name.replace('.', '_')
key = 'zuul.pipeline.%s.all_jobs' % build.pipeline.name
for label in build.node_labels:
# Jenkins includes the node name in its list of labels, so
# we filter it out here, since that is not statistically
# interesting.
if label == build.node_name:
continue
dt = int((build.start_time - build.launch_time) * 1000)
key = 'zuul.node_type.%s.job.%s.wait_time' % (
label, jobname)
statsd.timing(key, dt)
statsd.incr(key)
key = 'zuul.pipeline.%s.job.%s.%s' % (build.pipeline.name,
jobname, build.result)
if build.result in ['SUCCESS', 'FAILURE'] and build.start_time:
dt = int((build.end_time - build.start_time) * 1000)
statsd.timing(key, dt)
statsd.incr(key)
key = 'zuul.pipeline.%s.all_jobs' % build.pipeline.name
statsd.incr(key)
except:
self.log.exception("Exception reporting runtime stats")
event = BuildCompletedEvent(build)
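On the graphite side, assuming statsd's stock namespacing for timers (a 'stats.timers' prefix plus per-timer aggregates such as mean and upper_90), the new wait-time series land under paths along these lines, which is where the aggregate capacity picture from the commit message comes from; the label, pipeline, and job names are invented:

    stats.timers.zuul.pipeline.check.job.gate-foo.wait_time.mean
    stats.timers.zuul.node_type.bare-trusty.job.gate-foo.wait_time.upper_90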