From 80ac158acd607b5404e36a58db3097801d8a679e Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 9 Oct 2017 07:02:40 -0700 Subject: [PATCH] Update statsd output for tenants Update the statsd output to account for tenants and other v3 changes. Change-Id: I984e1930ab63d9a551cf33be922bac447ad0df9d --- doc/source/admin/monitoring.rst | 108 ++++++++++++++++++-------------- tests/unit/test_scheduler.py | 31 +++++---- zuul/manager/__init__.py | 25 +++++--- zuul/scheduler.py | 39 ++++++------ 4 files changed, 118 insertions(+), 85 deletions(-) diff --git a/doc/source/admin/monitoring.rst b/doc/source/admin/monitoring.rst index 4fed1f9a23..a8b2324f0d 100644 --- a/doc/source/admin/monitoring.rst +++ b/doc/source/admin/monitoring.rst @@ -33,17 +33,13 @@ Metrics These metrics are emitted by the Zuul :ref:`scheduler`: -.. stat:: gerrit.event. +.. stat:: zuul.event..event. :type: counter - Gerrit emits different kinds of messages over its `stream-events` - interface. Zuul will report counters for each type of event it - receives from Gerrit. + Zuul will report counters for each type of event it receives from + each of its configured drivers. - Refer to your Gerrit installation documentation for a complete - list of Gerrit event types. - -.. stat:: zuul.pipeline +.. stat:: zuul..pipeline Holds metrics specific to jobs. This hierarchy includes: @@ -63,22 +59,60 @@ These metrics are emitted by the Zuul :ref:`scheduler`: The number of items currently being processed by this pipeline. - .. stat:: job + .. stat:: project - Subtree detailing per jobs statistics: + This hierarchy holds more specific metrics for each project + participating in the pipeline. - .. stat:: + .. stat:: - The triggered job name. + The canonical hostname for the triggering project. + Embedded ``.`` characters will be translated to ``_``. - .. stat:: - :type: counter, timer + .. stat:: - A counter for each type of result (e.g., ``SUCCESS`` or - ``FAILURE``, ``ERROR``, etc.) for the job. 
If the - result is ``SUCCESS`` or ``FAILURE``, Zuul will - additionally report the duration of the build as a - timer. + The name of the triggering project. Embedded ``/`` or + ``.`` characters will be translated to ``_``. + + .. stat:: <branch> + + The name of the triggering branch. Embedded ``/`` or + ``.`` characters will be translated to ``_``. + + .. stat:: job + + Subtree detailing per-project job statistics: + + .. stat:: <jobname> + + The triggered job name. + + .. stat:: <result> + :type: counter, timer + + A counter for each type of result (e.g., ``SUCCESS`` or + ``FAILURE``, ``ERROR``, etc.) for the job. If the + result is ``SUCCESS`` or ``FAILURE``, Zuul will + additionally report the duration of the build as a + timer. + + .. stat:: current_changes + :type: gauge + + The number of items of this project currently being + processed by this pipeline. + + .. stat:: resident_time + :type: timer + + A timer metric reporting how long each item for this + project has been in the pipeline. + + .. stat:: total_changes + :type: counter + + The number of changes for this project processed by the + pipeline since Zuul started. .. stat:: resident_time :type: timer @@ -98,34 +132,12 @@ These metrics are emitted by the Zuul :ref:`scheduler`: How long each item spent in the pipeline before its first job started. - .. stat:: <project> - This hierarchy holds more specific metrics for each project - participating in the pipeline. If the project name contains - a ``/`` character, it will be replaced with a ``.``. +As an example, given a job named `myjob` in `mytenant` triggered by a +change to `myproject` (hosted at `example.com`) on the `master` branch in the `gate` pipeline +which took 40 seconds to build, the Zuul scheduler will emit the +following statsd events: - .. stat:: current_changes - :type: gauge - - The number of items of this project currently being - processed by this pipeline. - - .. stat:: resident_time - :type: timer - - A timer metric reporting how long each item for this - project has been in the pipeline. - - ..
stat:: total_changes - :type: counter - - The number of changes for this project processed by the - pipeline since Zuul started. - -As an example, given a job named `myjob` triggered by the `gate` pipeline -which took 40 seconds to build, the Zuul scheduler will emit the following -statsd events: - - * ``zuul.pipeline.gate.job.myjob.SUCCESS`` +1 - * ``zuul.pipeline.gate.job.myjob`` 40 seconds - * ``zuul.pipeline.gate.all_jobs`` +1 + * ``zuul.tenant.mytenant.pipeline.gate.project.example_com.myproject.master.job.myjob.SUCCESS`` +1 + * ``zuul.tenant.mytenant.pipeline.gate.project.example_com.myproject.master.job.myjob.SUCCESS`` 40 seconds + * ``zuul.tenant.mytenant.pipeline.gate.all_jobs`` +1 diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index 6ab1bccf13..32039605e8 100755 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -89,25 +89,34 @@ class TestScheduler(ZuulTestCase): self.assertEqual(self.getJobFromHistory('project-test2').node, 'label1') + for stat in self.statsd.stats: + k, v = stat.decode('utf-8').split(':') + self.log.debug('stat %s:%s', k, v) # TODOv3(jeblair): we may want to report stats by tenant (also?). 
# Per-driver self.assertReportedStat('zuul.event.gerrit.comment-added', value='1|c') # Per-driver per-connection self.assertReportedStat('zuul.event.gerrit.gerrit.comment-added', value='1|c') - self.assertReportedStat('zuul.pipeline.gate.current_changes', - value='1|g') - self.assertReportedStat('zuul.pipeline.gate.job.project-merge.SUCCESS', - kind='ms') - self.assertReportedStat('zuul.pipeline.gate.job.project-merge.SUCCESS', - value='1|c') - self.assertReportedStat('zuul.pipeline.gate.resident_time', kind='ms') - self.assertReportedStat('zuul.pipeline.gate.total_changes', - value='1|c') self.assertReportedStat( - 'zuul.pipeline.gate.org.project.resident_time', kind='ms') + 'zuul.tenant.tenant-one.pipeline.gate.current_changes', + value='1|g') self.assertReportedStat( - 'zuul.pipeline.gate.org.project.total_changes', value='1|c') + 'zuul.tenant.tenant-one.pipeline.gate.project.review_example_com.' + 'org_project.master.job.project-merge.SUCCESS', kind='ms') + self.assertReportedStat( + 'zuul.tenant.tenant-one.pipeline.gate.project.review_example_com.' + 'org_project.master.job.project-merge.SUCCESS', value='1|c') + self.assertReportedStat( + 'zuul.tenant.tenant-one.pipeline.gate.resident_time', kind='ms') + self.assertReportedStat( + 'zuul.tenant.tenant-one.pipeline.gate.total_changes', value='1|c') + self.assertReportedStat( + 'zuul.tenant.tenant-one.pipeline.gate.project.review_example_com.' + 'org_project.master.resident_time', kind='ms') + self.assertReportedStat( + 'zuul.tenant.tenant-one.pipeline.gate.project.review_example_com.' 
+ 'org_project.master.total_changes', value='1|c') for build in self.history: self.assertTrue(build.parameters['zuul']['voting']) diff --git a/zuul/manager/__init__.py b/zuul/manager/__init__.py index 0c3d123daa..51851e139c 100644 --- a/zuul/manager/__init__.py +++ b/zuul/manager/__init__.py @@ -820,19 +820,28 @@ class PipelineManager(object): dt = None items = len(self.pipeline.getAllItems()) - # stats.timers.zuul.pipeline.NAME.resident_time - # stats_counts.zuul.pipeline.NAME.total_changes - # stats.gauges.zuul.pipeline.NAME.current_changes - key = 'zuul.pipeline.%s' % self.pipeline.name + tenant = self.pipeline.layout.tenant + basekey = 'zuul.tenant.%s' % tenant.name + key = '%s.pipeline.%s' % (basekey, self.pipeline.name) + # stats.timers.zuul.tenant..pipeline..resident_time + # stats_counts.zuul.tenant..pipeline..total_changes + # stats.gauges.zuul.tenant..pipeline..current_changes self.sched.statsd.gauge(key + '.current_changes', items) if dt: self.sched.statsd.timing(key + '.resident_time', dt) self.sched.statsd.incr(key + '.total_changes') - # stats.timers.zuul.pipeline.NAME.ORG.PROJECT.resident_time - # stats_counts.zuul.pipeline.NAME.ORG.PROJECT.total_changes - project_name = item.change.project.name.replace('/', '.') - key += '.%s' % project_name + hostname = (item.change.project.canonical_hostname. + replace('.', '_')) + projectname = (item.change.project.name. + replace('.', '_').replace('/', '.')) + projectname = projectname.replace('.', '_').replace('/', '.') + branchname = item.change.branch.replace('.', '_').replace('/', '.') + # stats.timers.zuul.tenant..pipeline.. + # project....resident_time + # stats_counts.zuul.tenant..pipeline.. 
+ # project....total_changes + key += '.project.%s.%s.%s' % (hostname, projectname, branchname) if dt: self.sched.statsd.timing(key + '.resident_time', dt) self.sched.statsd.incr(key + '.total_changes') diff --git a/zuul/scheduler.py b/zuul/scheduler.py index ab147bae0b..cfcd865c47 100644 --- a/zuul/scheduler.py +++ b/zuul/scheduler.py @@ -282,31 +282,34 @@ class Scheduler(threading.Thread): build.result = result try: if self.statsd and build.pipeline: - jobname = build.job.name.replace('.', '_') - key = 'zuul.pipeline.%s.all_jobs' % build.pipeline.name + tenant = build.pipeline.layout.tenant + jobname = build.job.name.replace('.', '_').replace('/', '_') + hostname = (build.build_set.item.change.project. + canonical_hostname.replace('.', '_')) + projectname = (build.build_set.item.change.project.name. + replace('.', '_').replace('/', '_')) + branchname = (build.build_set.item.change.branch. + replace('.', '_').replace('/', '_')) + basekey = 'zuul.tenant.%s' % tenant.name + pipekey = '%s.pipeline.%s' % (basekey, build.pipeline.name) + # zuul.tenant..pipeline..all_jobs + key = '%s.all_jobs' % pipekey self.statsd.incr(key) - for label in build.node_labels: - # Jenkins includes the node name in its list of labels, so - # we filter it out here, since that is not statistically - # interesting. - if label == build.node_name: - continue - dt = int((build.start_time - build.execute_time) * 1000) - key = 'zuul.pipeline.%s.label.%s.wait_time' % ( - build.pipeline.name, label) - self.statsd.timing(key, dt) - key = 'zuul.pipeline.%s.job.%s.%s' % (build.pipeline.name, - jobname, build.result) + jobkey = '%s.project.%s.%s.%s.job.%s' % ( + pipekey, hostname, projectname, branchname, jobname) + # zuul.tenant..pipeline..project. + # ...job.. 
+ key = '%s.%s' % (jobkey, build.result) if build.result in ['SUCCESS', 'FAILURE'] and build.start_time: dt = int((build.end_time - build.start_time) * 1000) self.statsd.timing(key, dt) self.statsd.incr(key) - - key = 'zuul.pipeline.%s.job.%s.wait_time' % ( - build.pipeline.name, jobname) + # zuul.tenant.<tenant>.pipeline.<pipeline>.project. + # <host>.<project>.<branch>.job.<job>.wait_time + key = '%s.wait_time' % jobkey dt = int((build.start_time - build.execute_time) * 1000) self.statsd.timing(key, dt) - except: + except Exception: self.log.exception("Exception reporting runtime stats") event = BuildCompletedEvent(build) self.result_event_queue.put(event)