Browse Source

Add timer for starting_builds

We currently have a gauge for starting_builds but no information about
how long jobs stay in the starting state. This adds a timer metric for
that, so we can see how the job startup time changes after changes to
the system.

Change-Id: I261f8bdc8de336967b9c8ecd6eafc68f0bfe6b78
tags/3.5.0
Tobias Henkel 6 months ago
parent
commit
d4f75ffac8
No account linked to committer's email address

+ 16
- 4
doc/source/admin/monitoring.rst View File

@@ -145,10 +145,22 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
145 145
       Incremented each time the executor starts a build.
146 146
 
147 147
    .. stat:: starting_builds
148
-      :type: gauge
149
-
150
-      The number of builds starting on this executor.  These are
151
-      builds which have not yet begun their first pre-playbook.
148
+      :type: gauge, timer
149
+
150
+      The number of builds starting on this executor and a timer containing
151
+      how long jobs were in this state. These are builds which have not yet
152
+      begun their first pre-playbook.
153
+
154
+      The timer needs to be interpreted with care because it
155
+      aggregates all jobs. It can be useful when aggregating it over a longer
156
+      period of time (maybe a day) where fast rising graphs could indicate e.g.
157
+      IO problems of the machines the executors are running on. But it has to
158
+      be noted that a rising graph can also indicate a higher usage of complex
159
+      jobs using more required projects. Also comparing several executors might
160
+      give insight if the graphs differ a lot from each other. Typically the
161
+      jobs are equally distributed over all executors (in the same zone when
162
+      using the zone feature) and as such the starting jobs timers (aggregated
163
+      over a large enough interval) should not differ much.
152 164
 
153 165
    .. stat:: running_builds
154 166
       :type: gauge

+ 6
- 0
releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml View File

@@ -0,0 +1,6 @@
1
+---
2
+features:
3
+  - |
4
+    The executors emit a new timer
5
+    :stat:`zuul.executor.<executor>.starting_builds` with the time jobs spent
6
+    in the starting state.

+ 2
- 0
tests/unit/test_scheduler.py View File

@@ -161,6 +161,8 @@ class TestScheduler(ZuulTestCase):
161 161
         exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
162 162
             '.', '_')
163 163
         self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
164
+        self.assertReportedStat(exec_key + '.starting_builds', kind='g')
165
+        self.assertReportedStat(exec_key + '.starting_builds', kind='ms')
164 166
         self.assertReportedStat(
165 167
             'zuul.nodepool.requests.requested.total', value='1', kind='c')
166 168
         self.assertReportedStat(

+ 7
- 0
zuul/executor/server.py View File

@@ -648,6 +648,7 @@ class AnsibleJob(object):
648 648
         self.proc_lock = threading.Lock()
649 649
         self.running = False
650 650
         self.started = False  # Whether playbooks have started running
651
+        self.time_starting_build = None
651 652
         self.paused = False
652 653
         self.aborted = False
653 654
         self.aborted_reason = None
@@ -736,6 +737,7 @@ class AnsibleJob(object):
736 737
 
737 738
     def execute(self):
738 739
         try:
740
+            self.time_starting_build = time.monotonic()
739 741
             self.ssh_agent.start()
740 742
             self.ssh_agent.add(self.private_key_file)
741 743
             for key in self.arguments.get('ssh_keys', []):
@@ -1093,6 +1095,11 @@ class AnsibleJob(object):
1093 1095
 
1094 1096
         pre_failed = False
1095 1097
         success = False
1098
+        if self.executor_server.statsd:
1099
+            key = "zuul.executor.{hostname}.starting_builds"
1100
+            self.executor_server.statsd.timing(
1101
+                key, (time.monotonic() - self.time_starting_build) * 1000)
1102
+
1096 1103
         self.started = True
1097 1104
         time_started = time.time()
1098 1105
         # timeout value is "total" job timeout which accounts for

Loading…
Cancel
Save