From d4f75ffac8facbb14d0fe55d754db6f18b138f0a Mon Sep 17 00:00:00 2001 From: Tobias Henkel Date: Fri, 7 Dec 2018 13:22:20 +0100 Subject: [PATCH] Add timer for starting_builds We currently have a gauge for starting_builds but actually have no knowledge about how long jobs are in the starting state. This adds a metric for this so we can see changes in the job startup time after changes in the system. Change-Id: I261f8bdc8de336967b9c8ecd6eafc68f0bfe6b78 --- doc/source/admin/monitoring.rst | 18 +++++++++++++++--- ...starting-builds-timer-7f05fd11d5da3358.yaml | 6 ++++++ tests/unit/test_scheduler.py | 2 ++ zuul/executor/server.py | 7 +++++++ 4 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst index bd47f7be7e..6680b0f413 100644 --- a/doc/source/admin/monitoring.rst +++ b/doc/source/admin/monitoring.rst @@ -145,10 +145,22 @@ These metrics are emitted by the Zuul :ref:`scheduler`: Incremented each time the executor starts a build. .. stat:: starting_builds - :type: gauge + :type: gauge, timer - The number of builds starting on this executor. These are - builds which have not yet begun their first pre-playbook. + The number of builds starting on this executor and a timer containing + how long jobs were in this state. These are builds which have not yet + begun their first pre-playbook. + + The timer needs special thoughts when interpreting it because it + aggregates all jobs. It can be useful when aggregating it over a longer + period of time (maybe a day) where fast rising graphs could indicate e.g. + IO problems of the machines the executors are running on. But it has to + be noted that a rising graph also can indicate a higher usage of complex + jobs using more required projects. Also comparing several executors might + give insight if the graphs differ a lot from each other. Typically the + jobs are equally distributed over all executors (in the same zone when + using the zone feature) and as such the starting jobs timers (aggregated + over a large enough interval) should not differ much. .. stat:: running_builds :type: gauge diff --git a/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml b/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml new file mode 100644 index 0000000000..05bdddd49c --- /dev/null +++ b/releasenotes/notes/starting-builds-timer-7f05fd11d5da3358.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + The executors emit a new timer + :stat:`zuul.executor..starting_builds` with the time jobs spent + during starting. diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index 06d7b8df62..24cffb2cd6 100644 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -161,6 +161,8 @@ class TestScheduler(ZuulTestCase): exec_key = 'zuul.executor.%s' % self.executor_server.hostname.replace( '.', '_') self.assertReportedStat(exec_key + '.builds', value='1', kind='c') + self.assertReportedStat(exec_key + '.starting_builds', kind='g') + self.assertReportedStat(exec_key + '.starting_builds', kind='ms') self.assertReportedStat( 'zuul.nodepool.requests.requested.total', value='1', kind='c') self.assertReportedStat( diff --git a/zuul/executor/server.py b/zuul/executor/server.py index 79566282ce..0917f54ab1 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -648,6 +648,7 @@ class AnsibleJob(object): self.proc_lock = threading.Lock() self.running = False self.started = False # Whether playbooks have started running + self.time_starting_build = None self.paused = False self.aborted = False self.aborted_reason = None @@ -736,6 +737,7 @@ class AnsibleJob(object): def execute(self): try: + self.time_starting_build = time.monotonic() self.ssh_agent.start() self.ssh_agent.add(self.private_key_file) for key in self.arguments.get('ssh_keys', []): @@ -1093,6 +1095,11 @@ class AnsibleJob(object): pre_failed = False success = False + if self.executor_server.statsd: + key = "zuul.executor.{hostname}.starting_builds" + self.executor_server.statsd.timing( + key, (time.monotonic() - self.time_starting_build) * 1000) + self.started = True time_started = time.time() # timeout value is "total" job timeout which accounts for