Merge "Add timer for starting_builds"

Zuul 2019-01-07 22:57:07 +00:00 committed by Gerrit Code Review
commit fd53894f73
4 changed files with 30 additions and 3 deletions

View File

@@ -145,10 +145,22 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
Incremented each time the executor starts a build.

.. stat:: starting_builds
   :type: gauge, timer

   The number of builds starting on this executor and a timer containing
   how long jobs were in this state. These are builds which have not yet
   begun their first pre-playbook.
   The timer needs to be interpreted with care because it aggregates all
   jobs. It is most useful when aggregated over a longer period of time
   (e.g. a day), where a fast-rising graph can indicate, for example, I/O
   problems on the machines the executors are running on. Note, however,
   that a rising graph can also indicate heavier use of complex jobs with
   more required projects. Comparing several executors can also give
   insight if their graphs differ significantly from each other. Jobs are
   typically distributed evenly across all executors (within the same zone
   when the zone feature is used), so the starting_builds timers, aggregated
   over a large enough interval, should not differ much between executors.
   A sketch of such a cross-executor comparison follows this hunk.
.. stat:: running_builds
   :type: gauge
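The cross-executor comparison suggested above could, for example, be run against Graphite, where statsd timers usually end up under stats.timers.<key>.<aggregate>. The following is a minimal sketch and not part of this change; the Graphite URL, the stats.timers namespace, and the use of the "mean" aggregate are assumptions about the local deployment:

import json
import urllib.parse
import urllib.request

GRAPHITE = "http://graphite.example.com"
# One series per executor; "mean" is one of the aggregates statsd derives
# from a timer.
TARGET = "summarize(stats.timers.zuul.executor.*.starting_builds.mean, '1d', 'avg')"

url = "{}/render?target={}&from=-7d&format=json".format(
    GRAPHITE, urllib.parse.quote(TARGET))
with urllib.request.urlopen(url) as resp:
    series = json.load(resp)

for s in series:
    # An executor whose daily average sits far above its peers is more likely
    # to have an I/O problem than to be running unusually complex jobs.
    values = [value for value, _ in s["datapoints"] if value is not None]
    if values:
        print(s["target"], sum(values) / len(values))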

View File

@@ -0,0 +1,6 @@
---
features:
  - |
    The executors emit a new timer
    :stat:`zuul.executor.<executor>.starting_builds` with the time jobs spent
    in the starting state.

View File

@@ -161,6 +161,8 @@ class TestScheduler(ZuulTestCase):
        exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
            '.', '_')
        self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
        self.assertReportedStat(exec_key + '.starting_builds', kind='g')
        self.assertReportedStat(exec_key + '.starting_builds', kind='ms')
        self.assertReportedStat(
            'zuul.nodepool.requests.requested.total', value='1', kind='c')
        self.assertReportedStat(

View File

@@ -648,6 +648,7 @@ class AnsibleJob(object):
        self.proc_lock = threading.Lock()
        self.running = False
        self.started = False  # Whether playbooks have started running
        self.time_starting_build = None
        self.paused = False
        self.aborted = False
        self.aborted_reason = None
@@ -736,6 +737,7 @@
    def execute(self):
        try:
            self.time_starting_build = time.monotonic()
            self.ssh_agent.start()
            self.ssh_agent.add(self.private_key_file)
            for key in self.arguments.get('ssh_keys', []):
@@ -1093,6 +1095,11 @@ class AnsibleJob(object):
        pre_failed = False
        success = False
        if self.executor_server.statsd:
            key = "zuul.executor.{hostname}.starting_builds"
            self.executor_server.statsd.timing(
                key, (time.monotonic() - self.time_starting_build) * 1000)
        self.started = True
        time_started = time.time()
        # timeout value is "total" job timeout which accounts for
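For readers unfamiliar with the pattern used above, here is a minimal standalone sketch of timing a phase and reporting it as a statsd timer. It assumes the "statsd" PyPI package's StatsClient rather than Zuul's own statsd wrapper, and do_build_setup is a hypothetical placeholder:

import time

import statsd  # the PyPI "statsd" package, used purely for illustration


def do_build_setup():
    # Hypothetical placeholder for the work a build does before its first
    # pre-playbook (ssh-agent setup, repo preparation, ...).
    time.sleep(0.1)


client = statsd.StatsClient('localhost', 8125,
                            prefix='zuul.executor.example_com')

# time.monotonic() is used rather than time.time() so the measurement is not
# skewed by wall-clock adjustments.
time_starting_build = time.monotonic()
do_build_setup()
elapsed_ms = (time.monotonic() - time_starting_build) * 1000
client.timing('starting_builds', elapsed_ms)  # reported with the "ms" kind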