From d4f75ffac8facbb14d0fe55d754db6f18b138f0a Mon Sep 17 00:00:00 2001
From: Tobias Henkel <tobias.henkel@bmw.de>
Date: Fri, 7 Dec 2018 13:22:20 +0100
Subject: [PATCH] Add timer for starting_builds

We currently have a gauge for starting_builds but actually have no
knowledge about how long jobs are in the starting state. This adds a
metric for this so we can see changes in the job startup time after
changes in the system.

Change-Id: I261f8bdc8de336967b9c8ecd6eafc68f0bfe6b78
---
 doc/source/admin/monitoring.rst                | 18 +++++++++++++++---
 ...starting-builds-timer-7f05fd11d5da3358.yaml |  6 ++++++
 tests/unit/test_scheduler.py                   |  2 ++
 zuul/executor/server.py                        |  7 +++++++
 4 files changed, 30 insertions(+), 3 deletions(-)
 create mode 100644 releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml
diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst
index bd47f7be7e..6680b0f413 100644
--- a/doc/source/admin/monitoring.rst
+++ b/doc/source/admin/monitoring.rst
@@ -145,10 +145,22 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
       Incremented each time the executor starts a build.
 
    .. stat:: starting_builds
-      :type: gauge
+      :type: gauge, timer
 
-      The number of builds starting on this executor.  These are
-      builds which have not yet begun their first pre-playbook.
+      The number of builds starting on this executor and a timer containing
+      how long jobs were in this state. These are builds which have not yet
+      begun their first pre-playbook.
+
+      The timer needs special thoughts when interpreting it because it
+      aggregates all jobs. It can be useful when aggregating it over a longer
+      period of time (maybe a day) where fast rising graphs could indicate e.g.
+      IO problems of the machines the executors are running on. But it has to
+      be noted that a rising graph also can indicate a higher usage of complex
+      jobs using more required projects. Also comparing several executors might
+      give insight if the graphs differ a lot from each other. Typically the
+      jobs are equally distributed over all executors (in the same zone when
+      using the zone feature) and as such the starting jobs timers (aggregated
+      over a large enough interval) should not differ much.
 
    .. stat:: running_builds
       :type: gauge
diff --git a/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml b/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml
new file mode 100644
index 0000000000..05bdddd49c
--- /dev/null
+++ b/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml
@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    The executors emit a new timer
+    :stat:`zuul.executor.<executor>.starting_builds` with the time jobs spent
+    during starting.
diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py
index 06d7b8df62..24cffb2cd6 100644
--- a/tests/unit/test_scheduler.py
+++ b/tests/unit/test_scheduler.py
@@ -161,6 +161,8 @@ class TestScheduler(ZuulTestCase):
         exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace(
             '.', '_')
         self.assertReportedStat(exec_key + '.builds', value='1', kind='c')
+        self.assertReportedStat(exec_key + '.starting_builds', kind='g')
+        self.assertReportedStat(exec_key + '.starting_builds', kind='ms')
         self.assertReportedStat(
             'zuul.nodepool.requests.requested.total', value='1', kind='c')
         self.assertReportedStat(
diff --git a/zuul/executor/server.py b/zuul/executor/server.py
index 79566282ce..0917f54ab1 100644
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@@ -648,6 +648,7 @@ class AnsibleJob(object):
         self.proc_lock = threading.Lock()
         self.running = False
         self.started = False  # Whether playbooks have started running
+        self.time_starting_build = None
         self.paused = False
         self.aborted = False
         self.aborted_reason = None
@@ -736,6 +737,7 @@ class AnsibleJob(object):
 
     def execute(self):
         try:
+            self.time_starting_build = time.monotonic()
             self.ssh_agent.start()
             self.ssh_agent.add(self.private_key_file)
             for key in self.arguments.get('ssh_keys', []):
@@ -1093,6 +1095,11 @@ class AnsibleJob(object):
 
         pre_failed = False
         success = False
+        if self.executor_server.statsd:
+            key = "zuul.executor.{hostname}.starting_builds"
+            self.executor_server.statsd.timing(
+                key, (time.monotonic() - self.time_starting_build) * 1000)
+
         self.started = True
         time_started = time.time()
         # timeout value is "total" job timeout which accounts for