From faf8198f2a360e9a780c3006c660d1178ad86d8a Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 10 Oct 2017 15:42:26 -0700 Subject: [PATCH] Emit some stats from executor Emit the load average, a counter for builds, and a gauge for running builds. Change-Id: I8541724f1322b8257b623b3b2cfd8f3e6b95574d --- doc/source/admin/monitoring.rst | 23 ++++++++++++++++++++++- tests/base.py | 6 ++++-- tests/unit/test_scheduler.py | 2 ++ zuul/executor/server.py | 18 +++++++++++++++++- 4 files changed, 45 insertions(+), 4 deletions(-) diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst index 7c2ac80063..dc6be0de3b 100644 --- a/doc/source/admin/monitoring.rst +++ b/doc/source/admin/monitoring.rst @@ -32,7 +32,7 @@ These metrics are emitted by the Zuul :ref:`scheduler`: Zuul will report counters for each type of event it receives from each of its configured drivers. -.. stat:: zuul.<tenant>.pipeline +.. stat:: zuul.tenant.<tenant>.pipeline Holds metrics specific to jobs. This hierarchy includes: @@ -125,6 +125,27 @@ These metrics are emitted by the Zuul :ref:`scheduler`: How long each item spent in the pipeline before its first job started. +.. stat:: zuul.executor.<executor> + + Holds metrics emitted by individual executors. The ``<executor>`` + component of the key will be replaced with the hostname of the + executor. + + .. stat:: builds + :type: counter + + Incremented each time the executor starts a build. + + .. stat:: running_builds + :type: gauge + + The number of builds currently running on this executor. + + .. stat:: load_average + :type: gauge + + The one-minute load average of this executor, multiplied by 100. 
+ As an example, given a job named `myjob` in `mytenant` triggered by a change to `myproject` on the `master` branch in the `gate` pipeline diff --git a/tests/base.py b/tests/base.py index 028a194e88..035ff0ca52 100755 --- a/tests/base.py +++ b/tests/base.py @@ -1429,6 +1429,9 @@ class RecordingExecutorServer(zuul.executor.server.ExecutorServer): be explicitly released. """ + + _job_class = RecordingAnsibleJob + def __init__(self, *args, **kw): self._run_ansible = kw.pop('_run_ansible', False) self._test_root = kw.pop('_test_root', False) @@ -1483,8 +1486,7 @@ class RecordingExecutorServer(zuul.executor.server.ExecutorServer): args = json.loads(job.arguments) args['zuul']['_test'] = dict(test_root=self._test_root) job.arguments = json.dumps(args) - self.job_workers[job.unique] = RecordingAnsibleJob(self, job) - self.job_workers[job.unique].run() + super(RecordingExecutorServer, self).executeJob(job) def stopJob(self, job): self.log.debug("handle stop") diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index c1f0a0ee84..ab0d0fd3e2 100755 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -117,6 +117,8 @@ class TestScheduler(ZuulTestCase): self.assertReportedStat( 'zuul.tenant.tenant-one.pipeline.gate.project.review_example_com.' 
'org_project.master.total_changes', value='1|c') + exec_key = 'zuul.executor.%s' % self.executor_server.hostname + self.assertReportedStat(exec_key + '.builds', value='1|c') for build in self.history: self.assertTrue(build.parameters['zuul']['voting']) diff --git a/zuul/executor/server.py b/zuul/executor/server.py index 670a42021a..d9edc249cc 100644 --- a/zuul/executor/server.py +++ b/zuul/executor/server.py @@ -29,6 +29,7 @@ import time import traceback from zuul.lib.yamlutil import yaml from zuul.lib.config import get_default +from zuul.lib.statsd import get_statsd try: import ara.plugins.callbacks as ara_callbacks @@ -1483,6 +1484,7 @@ class ExecutorExecuteWorker(gear.TextWorker): class ExecutorServer(object): log = logging.getLogger("zuul.ExecutorServer") + _job_class = AnsibleJob def __init__(self, config, connections={}, jobdir_root=None, keep_jobdir=False, log_streaming_port=DEFAULT_FINGER_PORT): @@ -1506,6 +1508,7 @@ class ExecutorServer(object): nokeep=self.nokeep, ) + self.statsd = get_statsd(config) self.merge_root = get_default(self.config, 'executor', 'git_dir', '/var/lib/zuul/executor-git') self.default_username = get_default(self.config, 'executor', @@ -1652,6 +1655,10 @@ class ExecutorServer(object): "to worker:") self.merger_worker.shutdown() self.executor_worker.shutdown() + if self.statsd: + base_key = 'zuul.executor.%s' % self.hostname + self.statsd.gauge(base_key + '.load_average', 0) + self.statsd.gauge(base_key + '.running_builds', 0) self.log.debug("Stopped") def pause(self): @@ -1776,7 +1783,10 @@ class ExecutorServer(object): self.manageLoad() def executeJob(self, job): - self.job_workers[job.unique] = AnsibleJob(self, job) + if self.statsd: + base_key = 'zuul.executor.%s' % self.hostname + self.statsd.incr(base_key + '.builds') + self.job_workers[job.unique] = self._job_class(self, job) self.job_workers[job.unique].run() def manageLoad(self): @@ -1795,6 +1805,12 @@ class ExecutorServer(object): "Re-registering as load is within limits {} 
<= {}".format( load_avg, self.max_load_avg)) self.register_work() + if self.statsd: + base_key = 'zuul.executor.%s' % self.hostname + self.statsd.gauge(base_key + '.load_average', + int(load_avg * 100)) + self.statsd.gauge(base_key + '.running_builds', + len(self.job_workers)) def finishJob(self, unique): del(self.job_workers[unique])