Browse Source

Emit some stats from executor

Emit the load average, a counter for builds, and a gauge for
running builds.

Change-Id: I8541724f1322b8257b623b3b2cfd8f3e6b95574d
changes/73/511073/9
James E. Blair 5 years ago
parent
commit
faf8198f2a
  1. 23
      doc/source/admin/monitoring.rst
  2. 6
      tests/base.py
  3. 2
      tests/unit/test_scheduler.py
  4. 18
      zuul/executor/server.py

23
doc/source/admin/monitoring.rst

@ -32,7 +32,7 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
Zuul will report counters for each type of event it receives from
each of its configured drivers.
.. stat:: zuul.<tenant>.pipeline
.. stat:: zuul.tenant.<tenant>.pipeline
Holds metrics specific to jobs. This hierarchy includes:
@ -125,6 +125,27 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
How long each item spent in the pipeline before its first job
started.
.. stat:: zuul.executor.<executor>
Holds metrics emitted by individual executors. The ``<executor>``
component of the key will be replaced with the hostname of the
executor.
.. stat:: builds
:type: counter
Incremented each time the executor starts a build.
.. stat:: running_builds
:type: gauge
The number of builds currently running on this executor.
.. stat:: load_average
:type: gauge
The one-minute load average of this executor, multiplied by 100.
As an example, given a job named `myjob` in `mytenant` triggered by a
change to `myproject` on the `master` branch in the `gate` pipeline

6
tests/base.py

@ -1429,6 +1429,9 @@ class RecordingExecutorServer(zuul.executor.server.ExecutorServer):
be explicitly released.
"""
_job_class = RecordingAnsibleJob
def __init__(self, *args, **kw):
self._run_ansible = kw.pop('_run_ansible', False)
self._test_root = kw.pop('_test_root', False)
@ -1483,8 +1486,7 @@ class RecordingExecutorServer(zuul.executor.server.ExecutorServer):
args = json.loads(job.arguments)
args['zuul']['_test'] = dict(test_root=self._test_root)
job.arguments = json.dumps(args)
self.job_workers[job.unique] = RecordingAnsibleJob(self, job)
self.job_workers[job.unique].run()
super(RecordingExecutorServer, self).executeJob(job)
def stopJob(self, job):
self.log.debug("handle stop")

2
tests/unit/test_scheduler.py

@ -117,6 +117,8 @@ class TestScheduler(ZuulTestCase):
self.assertReportedStat(
'zuul.tenant.tenant-one.pipeline.gate.project.review_example_com.'
'org_project.master.total_changes', value='1|c')
exec_key = 'zuul.executor.%s' % self.executor_server.hostname
self.assertReportedStat(exec_key + '.builds', value='1|c')
for build in self.history:
self.assertTrue(build.parameters['zuul']['voting'])

18
zuul/executor/server.py

@ -29,6 +29,7 @@ import time
import traceback
from zuul.lib.yamlutil import yaml
from zuul.lib.config import get_default
from zuul.lib.statsd import get_statsd
try:
import ara.plugins.callbacks as ara_callbacks
@ -1483,6 +1484,7 @@ class ExecutorExecuteWorker(gear.TextWorker):
class ExecutorServer(object):
log = logging.getLogger("zuul.ExecutorServer")
_job_class = AnsibleJob
def __init__(self, config, connections={}, jobdir_root=None,
keep_jobdir=False, log_streaming_port=DEFAULT_FINGER_PORT):
@ -1506,6 +1508,7 @@ class ExecutorServer(object):
nokeep=self.nokeep,
)
self.statsd = get_statsd(config)
self.merge_root = get_default(self.config, 'executor', 'git_dir',
'/var/lib/zuul/executor-git')
self.default_username = get_default(self.config, 'executor',
@ -1652,6 +1655,10 @@ class ExecutorServer(object):
"to worker:")
self.merger_worker.shutdown()
self.executor_worker.shutdown()
if self.statsd:
base_key = 'zuul.executor.%s' % self.hostname
self.statsd.gauge(base_key + '.load_average', 0)
self.statsd.gauge(base_key + '.running_builds', 0)
self.log.debug("Stopped")
def pause(self):
@ -1776,7 +1783,10 @@ class ExecutorServer(object):
self.manageLoad()
def executeJob(self, job):
self.job_workers[job.unique] = AnsibleJob(self, job)
if self.statsd:
base_key = 'zuul.executor.%s' % self.hostname
self.statsd.incr(base_key + '.builds')
self.job_workers[job.unique] = self._job_class(self, job)
self.job_workers[job.unique].run()
def manageLoad(self):
@ -1795,6 +1805,12 @@ class ExecutorServer(object):
"Re-registering as load is within limits {} <= {}".format(
load_avg, self.max_load_avg))
self.register_work()
if self.statsd:
base_key = 'zuul.executor.%s' % self.hostname
self.statsd.gauge(base_key + '.load_average',
int(load_avg * 100))
self.statsd.gauge(base_key + '.running_builds',
len(self.job_workers))
def finishJob(self, unique):
del(self.job_workers[unique])

Loading…
Cancel
Save