Merge "Add timer for starting_builds"
This commit is contained in:
commit
fd53894f73
|
@ -145,10 +145,22 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
|
||||||
Incremented each time the executor starts a build.
|
Incremented each time the executor starts a build.
|
||||||
|
|
||||||
.. stat:: starting_builds
|
.. stat:: starting_builds
|
||||||
:type: gauge
|
:type: gauge, timer
|
||||||
|
|
||||||
The number of builds starting on this executor. These are
|
The number of builds starting on this executor and a timer containing
|
||||||
builds which have not yet begun their first pre-playbook.
|
how long jobs were in this state. These are builds which have not yet
|
||||||
|
begun their first pre-playbook.
|
||||||
|
|
||||||
|
The timer needs special thoughts when interpreting it because it
|
||||||
|
aggregates all jobs. It can be useful when aggregating it over a longer
|
||||||
|
period of time (maybe a day) where fast rising graphs could indicate e.g.
|
||||||
|
IO problems of the machines the executors are running on. But it has to
|
||||||
|
be noted that a rising graph also can indicate a higher usage of complex
|
||||||
|
jobs using more required projects. Also comparing several executors might
|
||||||
|
give insight if the graphs differ a lot from each other. Typically the
|
||||||
|
jobs are equally distributed over all executors (in the same zone when
|
||||||
|
using the zone feature) and as such the starting jobs timers (aggregated
|
||||||
|
over a large enough interval) should not differ much.
|
||||||
|
|
||||||
.. stat:: running_builds
|
.. stat:: running_builds
|
||||||
:type: gauge
|
:type: gauge
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
The executors emit a new timer
|
||||||
|
:stat:`zuul.executor.<executor>.starting_builds` with the time jobs spent
|
||||||
|
during starting.
|
|
@ -161,6 +161,8 @@ class TestScheduler(ZuulTestCase):
|
||||||
exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
|
exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
|
||||||
'.', '_')
|
'.', '_')
|
||||||
self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
|
self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
|
||||||
|
self.assertReportedStat(exec_key + '.starting_builds', kind='g')
|
||||||
|
self.assertReportedStat(exec_key + '.starting_builds', kind='ms')
|
||||||
self.assertReportedStat(
|
self.assertReportedStat(
|
||||||
'zuul.nodepool.requests.requested.total', value='1', kind='c')
|
'zuul.nodepool.requests.requested.total', value='1', kind='c')
|
||||||
self.assertReportedStat(
|
self.assertReportedStat(
|
||||||
|
|
|
@ -648,6 +648,7 @@ class AnsibleJob(object):
|
||||||
self.proc_lock = threading.Lock()
|
self.proc_lock = threading.Lock()
|
||||||
self.running = False
|
self.running = False
|
||||||
self.started = False # Whether playbooks have started running
|
self.started = False # Whether playbooks have started running
|
||||||
|
self.time_starting_build = None
|
||||||
self.paused = False
|
self.paused = False
|
||||||
self.aborted = False
|
self.aborted = False
|
||||||
self.aborted_reason = None
|
self.aborted_reason = None
|
||||||
|
@ -736,6 +737,7 @@ class AnsibleJob(object):
|
||||||
|
|
||||||
def execute(self):
|
def execute(self):
|
||||||
try:
|
try:
|
||||||
|
self.time_starting_build = time.monotonic()
|
||||||
self.ssh_agent.start()
|
self.ssh_agent.start()
|
||||||
self.ssh_agent.add(self.private_key_file)
|
self.ssh_agent.add(self.private_key_file)
|
||||||
for key in self.arguments.get('ssh_keys', []):
|
for key in self.arguments.get('ssh_keys', []):
|
||||||
|
@ -1093,6 +1095,11 @@ class AnsibleJob(object):
|
||||||
|
|
||||||
pre_failed = False
|
pre_failed = False
|
||||||
success = False
|
success = False
|
||||||
|
if self.executor_server.statsd:
|
||||||
|
key = "zuul.executor.{hostname}.starting_builds"
|
||||||
|
self.executor_server.statsd.timing(
|
||||||
|
key, (time.monotonic() - self.time_starting_build) * 1000)
|
||||||
|
|
||||||
self.started = True
|
self.started = True
|
||||||
time_started = time.time()
|
time_started = time.time()
|
||||||
# timeout value is "total" job timeout which accounts for
|
# timeout value is "total" job timeout which accounts for
|
||||||
|
|
Loading…
Reference in New Issue