Add timer for starting_builds

We currently have a gauge for starting_builds but actually have no knowledge about how long jobs are in the starting state. This adds a metric for this so we can see changes in the job startup time after changes in the system. Change-Id: I261f8bdc8de336967b9c8ecd6eafc68f0bfe6b78
2018-12-07 13:22:20 +01:00 · 2018-12-07 13:22:20 +01:00 · d4f75ffac8
commit d4f75ffac8
parent ea3e777469
4 changed files with 30 additions and 3 deletions
--- a/doc/source/admin/monitoring.rst
+++ b/doc/source/admin/monitoring.rst
@ -145,10 +145,22 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
      Incremented each time the executor starts a build.

   .. stat:: starting_builds
-      :type: gauge
+      :type: gauge, timer

-      The number of builds starting on this executor.  These are
-      builds which have not yet begun their first pre-playbook.
+      The number of builds starting on this executor and a timer containing
+      how long jobs were in this state. These are builds which have not yet
+      begun their first pre-playbook.
+
+      The timer needs special thoughts when interpreting it because it
+      aggregates all jobs. It can be useful when aggregating it over a longer
+      period of time (maybe a day) where fast rising graphs could indicate e.g.
+      IO problems of the machines the executors are running on. But it has to
+      be noted that a rising graph also can indicate a higher usage of complex
+      jobs using more required projects. Also comparing several executors might
+      give insight if the graphs differ a lot from each other. Typically the
+      jobs are equally distributed over all executors (in the same zone when
+      using the zone feature) and as such the starting jobs timers (aggregated
+      over a large enough interval) should not differ much.

   .. stat:: running_builds
      :type: gauge
--- a/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml
+++ b/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml
@ -0,0 +1,6 @@
+---
+features:
+  - |
+    The executors emit a new timer
+    :stat:`zuul.executor.<executor>.starting_builds` with the time jobs spent
+    during starting.
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@ -161,6 +161,8 @@ class TestScheduler(ZuulTestCase):
        exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
            '.', '_')
        self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
+        self.assertReportedStat(exec_key + '.starting_builds', kind='g')
+        self.assertReportedStat(exec_key + '.starting_builds', kind='ms')
        self.assertReportedStat(
            'zuul.nodepool.requests.requested.total', value='1', kind='c')
        self.assertReportedStat(
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@ -648,6 +648,7 @@ class AnsibleJob(object):
        self.proc_lock = threading.Lock()
        self.running = False
        self.started = False  # Whether playbooks have started running
+        self.time_starting_build = None
        self.paused = False
        self.aborted = False
        self.aborted_reason = None
@ -736,6 +737,7 @@ class AnsibleJob(object):

    def execute(self):
        try:
+            self.time_starting_build = time.monotonic()
            self.ssh_agent.start()
            self.ssh_agent.add(self.private_key_file)
            for key in self.arguments.get('ssh_keys', []):
@ -1093,6 +1095,11 @@ class AnsibleJob(object):

        pre_failed = False
        success = False
+        if self.executor_server.statsd:
+            key = "zuul.executor.{hostname}.starting_builds"
+            self.executor_server.statsd.timing(
+                key, (time.monotonic() - self.time_starting_build) * 1000)
+
        self.started = True
        time_started = time.time()
        # timeout value is "total" job timeout which accounts for