Report the per-job build wait time to graphite

Knowing how long a job waits to build in aggregate can give useful
insights into the performance and capacity of the build system. This
change also uses the node labels sent back from the gearman worker to
report the same wait-time metric per node type.

Change-Id: I173251c8b1569755124b7cb1a48b6274bf38c94b
Depends-On: Ibca938fcf8a65facd7e39dab4eb994dfc637722a
Timothy Chavez 2015-08-07 20:08:04 -05:00
parent f8e892567c
commit b2332081d8
5 changed files with 39 additions and 5 deletions
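For orientation before the diffs: the scheduler changes below report the gap between a build being launched and the build actually starting under two statsd key families, one per pipeline and job, one per node label. A minimal sketch of the key construction, mirroring the formats used in Scheduler.onBuildStarted and onBuildCompleted; the pipeline, job, and label names here are invented for illustration:

    def wait_time_keys(pipeline, job, node_labels, node_name):
        # Dots in job names become underscores, as in the scheduler code below.
        job = job.replace('.', '_')
        keys = ['zuul.pipeline.%s.job.%s.wait_time' % (pipeline, job)]
        for label in node_labels:
            # Jenkins reports the node name as one of its labels; it is
            # filtered out, matching the check in onBuildCompleted.
            if label == node_name:
                continue
            keys.append('zuul.node_type.%s.job.%s.wait_time' % (label, job))
        return keys

    wait_time_keys('check', 'gate-foo', ['bare-trusty', 'node1'], 'node1')
    # ['zuul.pipeline.check.job.gate-foo.wait_time',
    #  'zuul.node_type.bare-trusty.job.gate-foo.wait_time']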


@@ -598,6 +598,8 @@ class FakeBuild(threading.Thread):
result = 'RUN_ERROR'
else:
data['result'] = result
data['node_labels'] = ['bare-necessities']
data['node_name'] = 'foo'
work_fail = False
changes = None


@@ -232,6 +232,12 @@ class Client(zuul.cmd.ZuulApp):
'number': {
'title': 'Number'
},
'node_labels': {
'title': 'Node Labels'
},
'node_name': {
'title': 'Node Name'
},
'worker.name': {
'title': 'Worker'
},
@@ -245,7 +251,7 @@ class Client(zuul.cmd.ZuulApp):
'worker.fqdn': {
'title': 'Worker Domain'
},
-'worker.progam': {
+'worker.program': {
'title': 'Worker Program'
},
'worker.version': {


@@ -423,9 +423,11 @@ class Gearman(object):
build = self.builds.get(job.unique)
if build:
data = getJobData(job)
build.node_labels = data.get('node_labels')
build.node_name = data.get('node_name')
if not build.canceled:
if result is None:
data = getJobData(job)
result = data.get('result')
if result is None:
build.retry = True
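For context on the two new build attributes: the launcher reads them from the job data blob the gearman build worker sends back. A hedged sketch of such a payload, reusing the field values from the FakeBuild fixture above; the 'result' value and the framing around it are illustrative only:

    import json

    # Job data a build worker could return on WORK_COMPLETE; the new code
    # reads only 'node_labels' and 'node_name', alongside the existing 'result'.
    work_data = json.dumps({
        'result': 'SUCCESS',
        'node_labels': ['bare-necessities'],
        'node_name': 'foo',
    })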


@@ -578,6 +578,8 @@ class Build(object):
self.retry = False
self.parameters = {}
self.worker = Worker()
self.node_labels = []
self.node_name = None

def __repr__(self):
return ('<Build %s of %s on %s>' %
@@ -799,7 +801,9 @@ class QueueItem(object):
'canceled': build.canceled if build else None,
'retry': build.retry if build else None,
'number': build.number if build else None,
-'worker': worker
+'node_labels': build.node_labels if build else [],
+'node_name': build.node_name if build else None,
+'worker': worker,
})
if self.pipeline.haveAllJobsStarted(self):
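Each build entry in the status JSON therefore grows two fields. A rough sketch of the resulting shape, limited to the keys visible in this hunk; all values are invented, and the nested worker dict is a guess at what the Worker serialization looks like:

    job_status = {
        'canceled': False,
        'retry': False,
        'number': 1,
        'node_labels': ['bare-trusty'],
        'node_name': 'node1',
        'worker': {'name': 'worker01'},  # assumed shape, not shown in this hunk
    }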


@@ -533,6 +533,16 @@ class Scheduler(threading.Thread):
def onBuildStarted(self, build):
self.log.debug("Adding start event for build: %s" % build)
build.start_time = time.time()
try:
if statsd and build.pipeline:
jobname = build.job.name.replace('.', '_')
key = 'zuul.pipeline.%s.job.%s.wait_time' % (
build.pipeline.name, jobname)
dt = int((build.start_time - build.launch_time) * 1000)
statsd.timing(key, dt)
statsd.incr(key)
except:
self.log.exception("Exception reporting runtime stats")
event = BuildStartedEvent(build)
self.result_event_queue.put(event)
self.wake_event.set()
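The wait time itself is the difference between when the build was launched (build.launch_time) and when the worker reported it starting (build.start_time), converted to milliseconds for statsd. A worked example with invented timestamps:

    launch_time = 1438992000.0   # build handed to gearman (illustrative)
    start_time = 1438992012.5    # worker reports the build has started
    dt = int((start_time - launch_time) * 1000)
    # dt == 12500; reported as both a timer and a counter on the wait_time key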
@@ -544,14 +554,24 @@
try:
if statsd and build.pipeline:
jobname = build.job.name.replace('.', '_')
key = 'zuul.pipeline.%s.all_jobs' % build.pipeline.name
for label in build.node_labels:
# Jenkins includes the node name in its list of labels, so
# we filter it out here, since that is not statistically
# interesting.
if label == build.node_name:
continue
dt = int((build.start_time - build.launch_time) * 1000)
key = 'zuul.node_type.%s.job.%s.wait_time' % (
label, jobname)
statsd.timing(key, dt)
statsd.incr(key)
key = 'zuul.pipeline.%s.job.%s.%s' % (build.pipeline.name,
jobname, build.result)
if build.result in ['SUCCESS', 'FAILURE'] and build.start_time:
dt = int((build.end_time - build.start_time) * 1000)
statsd.timing(key, dt)
statsd.incr(key)
key = 'zuul.pipeline.%s.all_jobs' % build.pipeline.name
statsd.incr(key)
except:
self.log.exception("Exception reporting runtime stats")
event = BuildCompletedEvent(build)
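On the graphite side, assuming statsd's stock namespacing for timers (a 'stats.timers' prefix plus per-timer aggregates such as mean and upper_90), the new wait-time series land under paths along these lines, which is where the aggregate capacity picture from the commit message comes from; the label, pipeline, and job names are invented:

    stats.timers.zuul.pipeline.check.job.gate-foo.wait_time.mean
    stats.timers.zuul.node_type.bare-trusty.job.gate-foo.wait_time.upper_90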