From 0dbd8c07847635830c6d5bec38780d37fb14c979 Mon Sep 17 00:00:00 2001 From: Tristan Cacqueray Date: Sun, 2 Sep 2018 00:53:37 +0000 Subject: [PATCH] prometheus: add options to start the server and process collector This change adds a new prometheus_port option to start a metric server to be scraped by a prometheus service. By default, the server exposes process information. Change-Id: Ie329df6adc69768dfdb158d00283161f8b70f07a --- doc/source/discussion/components.rst | 11 +++++ doc/source/reference/monitoring.rst | 43 +++++++++++++++++++ etc/zuul.conf-sample | 5 +++ ...rometheus-monitoring-ffa6de30a483e4b5.yaml | 5 +++ requirements.txt | 1 + tests/base.py | 27 ++++++++++++ tests/unit/test_prometheus.py | 42 ++++++++++++++++++ zuul/cmd/__init__.py | 10 +++++ zuul/cmd/executor.py | 1 + zuul/cmd/merger.py | 1 + zuul/cmd/scheduler.py | 1 + zuul/cmd/web.py | 1 + 12 files changed, 148 insertions(+) create mode 100644 releasenotes/notes/prometheus-monitoring-ffa6de30a483e4b5.yaml create mode 100644 tests/unit/test_prometheus.py diff --git a/doc/source/discussion/components.rst b/doc/source/discussion/components.rst index d726968548..7b106a84b2 100644 --- a/doc/source/discussion/components.rst +++ b/doc/source/discussion/components.rst @@ -392,6 +392,17 @@ The following sections of ``zuul.conf`` are used by the scheduler: If a value higher than ``max_hold_expiration`` is supplied during hold request creation, it will be lowered to this value. + .. attr:: prometheus_port + + Set a TCP port to start the prometheus metrics client. + + .. attr:: prometheus_addr + :default: 0.0.0.0 + + The IPv4 address to listen on for prometheus metrics polls. + To use IPv6, python>3.8 is required `issue24209 <https://bugs.python.org/issue24209>`_.
+ + Operation ~~~~~~~~~ diff --git a/doc/source/reference/monitoring.rst b/doc/source/reference/monitoring.rst index 2a91252f05..8680abba81 100644 --- a/doc/source/reference/monitoring.rst +++ b/doc/source/reference/monitoring.rst @@ -503,3 +503,46 @@ following statsd events: * ``zuul.tenant.mytenant.pipeline.gate.project.example_com.myproject.master.job.myjob.SUCCESS`` +1 * ``zuul.tenant.mytenant.pipeline.gate.project.example_com.myproject.master.job.myjob.SUCCESS`` 40 seconds * ``zuul.tenant.mytenant.pipeline.gate.all_jobs`` +1 + + +Prometheus monitoring +--------------------- + +Zuul comes with support to start a prometheus_ metric server to be added as +prometheus's target. + +.. _prometheus: https://prometheus.io/docs/introduction/overview/ + + +Configuration +~~~~~~~~~~~~~ + +Prometheus support uses the ``prometheus_client`` python module. +Note that support is optional and Zuul will start without +the prometheus python module present. + +To enable the service, set the ``prometheus_port`` in a service section of +``zuul.conf``. For example, setting :attr:`scheduler.prometheus_port` to 9091 +starts an HTTP server to expose metrics to a prometheus service at: +http://scheduler:9091/metrics + + +Metrics +~~~~~~~ + +These metrics are exposed by default: + +.. stat:: process_virtual_memory_bytes + :type: gauge + +.. stat:: process_resident_memory_bytes + :type: gauge + +.. stat:: process_open_fds + :type: gauge + +.. stat:: process_start_time_seconds + :type: gauge + +..
stat:: process_cpu_seconds_total + :type: counter diff --git a/etc/zuul.conf-sample b/etc/zuul.conf-sample index 4a83e04ed8..040eef8267 100644 --- a/etc/zuul.conf-sample +++ b/etc/zuul.conf-sample @@ -26,22 +26,27 @@ tenant_config=/etc/zuul/main.yaml log_config=/etc/zuul/logging.conf pidfile=/var/run/zuul/zuul.pid state_dir=/var/lib/zuul +prometheus_port=9091 +;prometheus_addr=0.0.0.0 [merger] git_dir=/var/lib/zuul/git ;git_user_email=zuul@example.com ;git_user_name=zuul +prometheus_port=9092 [executor] default_username=zuul trusted_ro_paths=/opt/zuul-scripts:/var/cache trusted_rw_paths=/opt/zuul-logs +prometheus_port=9093 [web] listen_address=127.0.0.1 port=9000 static_cache_expiry=0 status_url=https://zuul.example.com/status +prometheus_port=9094 [webclient] url=https://zuul.example.com diff --git a/releasenotes/notes/prometheus-monitoring-ffa6de30a483e4b5.yaml b/releasenotes/notes/prometheus-monitoring-ffa6de30a483e4b5.yaml new file mode 100644 index 0000000000..58f09d5bed --- /dev/null +++ b/releasenotes/notes/prometheus-monitoring-ffa6de30a483e4b5.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + A new prometheus_port option for the services can be used to start the + prometheus python client and expose metrics.
diff --git a/requirements.txt b/requirements.txt index 2f7ac5d4d4..3ca11a6ad0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ GitPython>=2.1.8 python-daemon>=2.0.4 extras statsd>=3.0 +prometheus-client voluptuous>=0.10.2 gear>=0.13.0,<1.0.0,!=0.15.0 apscheduler>=3.0 diff --git a/tests/base.py b/tests/base.py index 09597f0659..f8fc14f599 100644 --- a/tests/base.py +++ b/tests/base.py @@ -65,6 +65,7 @@ from git.exc import NoSuchPathError from git.util import IterableList import yaml import paramiko +import prometheus_client.exposition from zuul.driver.sql.sqlconnection import DatabaseSession from zuul.model import Change @@ -868,6 +869,25 @@ class FakeGerritChange(object): self.reported += 1 +class PrometheusServer(object): + def start(self): + app = prometheus_client.make_wsgi_app(prometheus_client.REGISTRY) + self.httpd = prometheus_client.exposition.make_server( + "0.0.0.0", + 0, + app, + prometheus_client.exposition.ThreadingWSGIServer, + handler_class=prometheus_client.exposition._SilentHandler) + self.port = self.httpd.socket.getsockname()[1] + self.thread = threading.Thread(target=self.httpd.serve_forever) + self.thread.daemon = True + self.thread.start() + + def stop(self): + self.httpd.shutdown() + self.thread.join() + + class GerritWebServer(object): def __init__(self, fake_gerrit): @@ -4156,6 +4176,10 @@ class ZuulTestCase(BaseTestCase): server that all of the Zuul components in this test use to communicate with each other. + :ivar PrometheusServer prometheus_server: An instance of + :py:class:`~tests.base.PrometheusServer` which is the Prometheus + metrics endpoint. + :ivar RecordingExecutorServer executor_server: An instance of :py:class:`~tests.base.RecordingExecutorServer` which is the Ansible execute server used to run jobs for this test.
@@ -4279,6 +4303,8 @@ class ZuulTestCase(BaseTestCase): self.statsd.start() self.gearman_server = FakeGearmanServer(self.use_ssl) + self.prometheus_server = PrometheusServer() + self.prometheus_server.start() self.config.set('gearman', 'port', str(self.gearman_server.port)) self.log.info("Gearman server on port %s" % @@ -4673,6 +4699,7 @@ class ZuulTestCase(BaseTestCase): self.statsd.join() self.rpcclient.shutdown() self.gearman_server.shutdown() + self.prometheus_server.stop() self.fake_nodepool.stop() self.zk_client.disconnect() self.printHistory() diff --git a/tests/unit/test_prometheus.py b/tests/unit/test_prometheus.py new file mode 100644 index 0000000000..ac185361d2 --- /dev/null +++ b/tests/unit/test_prometheus.py @@ -0,0 +1,42 @@ +# Copyright 2019 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import requests + +from tests.base import ZuulTestCase + + +class BaseTestPrometheus(ZuulTestCase): + tenant_config_file = 'config/single-tenant/main.yaml' + + def get_metrics(self): + r = requests.get( + "http://localhost:%d" % self.prometheus_server.port) + metrics = {} + for line in r.text.split('\n'): + if not line or line.startswith("#"): + continue + try: + key, value = line.split() + except ValueError: + continue + metrics[key] = value + return metrics + + +class TestPrometheus(BaseTestPrometheus): + def test_prometheus_process_metrics(self): + metrics = self.get_metrics() + self.assertIn("process_resident_memory_bytes", metrics) + self.assertIn("process_open_fds", metrics) diff --git a/zuul/cmd/__init__.py b/zuul/cmd/__init__.py index e4f6803d93..46d5902ac7 100755 --- a/zuul/cmd/__init__.py +++ b/zuul/cmd/__init__.py @@ -28,6 +28,7 @@ import sys import traceback import threading +prometheus_client = extras.try_import('prometheus_client') yappi = extras.try_import('yappi') objgraph = extras.try_import('objgraph') @@ -199,6 +200,15 @@ class ZuulDaemonApp(ZuulApp, metaclass=abc.ABCMeta): "Configured logging: {version}".format( version=zuul_version_info.release_string())) + def setup_prometheus(self, section): + if self.config.has_option(section, 'prometheus_port'): + if not prometheus_client: + raise RuntimeError("prometheus_client library is missing.") + port = int(self.config.get(section, 'prometheus_port')) + addr = get_default( + self.config, section, 'prometheus_addr', '0.0.0.0') + prometheus_client.start_http_server(port, addr) + def main(self): self.parseArguments() self.readConfig() diff --git a/zuul/cmd/executor.py b/zuul/cmd/executor.py index 96d484afbc..15ff0272b6 100755 --- a/zuul/cmd/executor.py +++ b/zuul/cmd/executor.py @@ -87,6 +87,7 @@ class Executor(zuul.cmd.ZuulDaemonApp): os.mkdir(self.job_dir) self.setup_logging('executor', 'log_config') + self.setup_prometheus('executor') self.log = logging.getLogger("zuul.Executor") self.finger_port = 
int( diff --git a/zuul/cmd/merger.py b/zuul/cmd/merger.py index e5f4128269..5e318fb921 100755 --- a/zuul/cmd/merger.py +++ b/zuul/cmd/merger.py @@ -50,6 +50,7 @@ class Merger(zuul.cmd.ZuulDaemonApp): self.configure_connections(source_only=True) self.setup_logging('merger', 'log_config') + self.setup_prometheus('merger') self.merger = MergeServer(self.config, self.connections) self.merger.start() diff --git a/zuul/cmd/scheduler.py b/zuul/cmd/scheduler.py index 04ce746b7b..1a5c59438a 100755 --- a/zuul/cmd/scheduler.py +++ b/zuul/cmd/scheduler.py @@ -132,6 +132,7 @@ class Scheduler(zuul.cmd.ZuulDaemonApp): self.start_gear_server() self.setup_logging('scheduler', 'log_config') + self.setup_prometheus('scheduler') self.log = logging.getLogger("zuul.Scheduler") self.configure_connections(require_sql=True) diff --git a/zuul/cmd/web.py b/zuul/cmd/web.py index db63b77a94..8a74b7889a 100755 --- a/zuul/cmd/web.py +++ b/zuul/cmd/web.py @@ -84,6 +84,7 @@ class WebServer(zuul.cmd.ZuulDaemonApp): sys.exit(0) self.setup_logging('web', 'log_config') + self.setup_prometheus('web') self.log = logging.getLogger("zuul.WebServer") try: