prometheus: add options to start the server and process collector

This change adds a new prometheus_port option to start a metric server
to be scraped by a prometheus service. By default, the server exposes
process information.

Change-Id: Ie329df6adc69768dfdb158d00283161f8b70f07a
This commit is contained in:
Tristan Cacqueray 2018-09-02 00:53:37 +00:00
parent 3ca33f0686
commit 0dbd8c0784
12 changed files with 148 additions and 0 deletions

View File

@ -392,6 +392,17 @@ The following sections of ``zuul.conf`` are used by the scheduler:
If a value higher than ``max_hold_expiration`` is supplied during If a value higher than ``max_hold_expiration`` is supplied during
hold request creation, it will be lowered to this value. hold request creation, it will be lowered to this value.
.. attr:: prometheus_port
Set a TCP port to start the prometheus metrics client.
.. attr:: prometheus_addr
:default: 0.0.0.0
The IPv4 address to listen on for prometheus metrics polling.
To use IPv6, python>3.8 is required `issue24209 <https://bugs.python.org/issue24209>`_.
Operation Operation
~~~~~~~~~ ~~~~~~~~~

View File

@ -503,3 +503,46 @@ following statsd events:
* ``zuul.tenant.mytenant.pipeline.gate.project.example_com.myproject.master.job.myjob.SUCCESS`` +1 * ``zuul.tenant.mytenant.pipeline.gate.project.example_com.myproject.master.job.myjob.SUCCESS`` +1
* ``zuul.tenant.mytenant.pipeline.gate.project.example_com.myproject.master.job.myjob.SUCCESS`` 40 seconds * ``zuul.tenant.mytenant.pipeline.gate.project.example_com.myproject.master.job.myjob.SUCCESS`` 40 seconds
* ``zuul.tenant.mytenant.pipeline.gate.all_jobs`` +1 * ``zuul.tenant.mytenant.pipeline.gate.all_jobs`` +1
Prometheus monitoring
---------------------
Zuul comes with support to start a prometheus_ metric server to be added as
prometheus's target.
.. _prometheus: https://prometheus.io/docs/introduction/overview/
Configuration
~~~~~~~~~~~~~
Prometheus support uses the ``prometheus_client`` python module.
Note that support is optional and Zuul will start without
the prometheus python module present.
To enable the service, set the ``prometheus_port`` in a service section of
``zuul.conf``. For example setting :attr:`scheduler.prometheus_port` to 9091
starts an HTTP server to expose metrics to a prometheus service at:
http://scheduler:9091/metrics
Metrics
~~~~~~~
These metrics are exposed by default:
.. stat:: process_virtual_memory_bytes
:type: gauge
.. stat:: process_resident_memory_bytes
:type: gauge
.. stat:: process_open_fds
:type: gauge
.. stat:: process_start_time_seconds
:type: gauge
.. stat:: process_cpu_seconds_total
:type: counter

View File

@ -26,22 +26,27 @@ tenant_config=/etc/zuul/main.yaml
log_config=/etc/zuul/logging.conf log_config=/etc/zuul/logging.conf
pidfile=/var/run/zuul/zuul.pid pidfile=/var/run/zuul/zuul.pid
state_dir=/var/lib/zuul state_dir=/var/lib/zuul
prometheus_port=9091
;prometheus_addr=0.0.0.0
[merger] [merger]
git_dir=/var/lib/zuul/git git_dir=/var/lib/zuul/git
;git_user_email=zuul@example.com ;git_user_email=zuul@example.com
;git_user_name=zuul ;git_user_name=zuul
prometheus_port=9092
[executor] [executor]
default_username=zuul default_username=zuul
trusted_ro_paths=/opt/zuul-scripts:/var/cache trusted_ro_paths=/opt/zuul-scripts:/var/cache
trusted_rw_paths=/opt/zuul-logs trusted_rw_paths=/opt/zuul-logs
prometheus_port=9093
[web] [web]
listen_address=127.0.0.1 listen_address=127.0.0.1
port=9000 port=9000
static_cache_expiry=0 static_cache_expiry=0
status_url=https://zuul.example.com/status status_url=https://zuul.example.com/status
prometheus_port=9094
[webclient] [webclient]
url=https://zuul.example.com url=https://zuul.example.com

View File

@ -0,0 +1,5 @@
---
features:
- |
A new prometheus_port option for the services can be used to start the
prometheus python client and expose metrics.

View File

@ -10,6 +10,7 @@ GitPython>=2.1.8
python-daemon>=2.0.4 python-daemon>=2.0.4
extras extras
statsd>=3.0 statsd>=3.0
prometheus-client
voluptuous>=0.10.2 voluptuous>=0.10.2
gear>=0.13.0,<1.0.0,!=0.15.0 gear>=0.13.0,<1.0.0,!=0.15.0
apscheduler>=3.0 apscheduler>=3.0

View File

@ -65,6 +65,7 @@ from git.exc import NoSuchPathError
from git.util import IterableList from git.util import IterableList
import yaml import yaml
import paramiko import paramiko
import prometheus_client.exposition
from zuul.driver.sql.sqlconnection import DatabaseSession from zuul.driver.sql.sqlconnection import DatabaseSession
from zuul.model import Change from zuul.model import Change
@ -868,6 +869,25 @@ class FakeGerritChange(object):
self.reported += 1 self.reported += 1
class PrometheusServer(object):
    """HTTP endpoint used by the test suite to serve prometheus metrics.

    The server exposes ``prometheus_client.REGISTRY`` through a WSGI app.
    After :meth:`start`, the port it was bound to is available as
    ``self.port``.
    """

    def start(self):
        """Create the server, record its port and serve from a thread."""
        app = prometheus_client.make_wsgi_app(prometheus_client.REGISTRY)
        # Port 0 is requested so the port actually bound is not fixed in
        # advance; it is read back from the socket below.
        self.httpd = prometheus_client.exposition.make_server(
            "0.0.0.0",
            0,
            app,
            prometheus_client.exposition.ThreadingWSGIServer,
            handler_class=prometheus_client.exposition._SilentHandler)
        self.port = self.httpd.socket.getsockname()[1]
        self.thread = threading.Thread(target=self.httpd.serve_forever)
        # Daemon thread: a test that never reaches stop() must not keep the
        # interpreter alive.
        self.thread.daemon = True
        self.thread.start()

    def stop(self):
        """Stop serving, wait for the thread and close the socket."""
        self.httpd.shutdown()
        self.thread.join()
        # shutdown() only makes serve_forever() return; the listening socket
        # stays open until server_close() is called, which would otherwise
        # leak one file descriptor per test.
        self.httpd.server_close()
class GerritWebServer(object): class GerritWebServer(object):
def __init__(self, fake_gerrit): def __init__(self, fake_gerrit):
@ -4156,6 +4176,10 @@ class ZuulTestCase(BaseTestCase):
server that all of the Zuul components in this test use to server that all of the Zuul components in this test use to
communicate with each other. communicate with each other.
:ivar PrometheusServer prometheus_server: An instance of
:py:class:`~tests.base.PrometheusServer` which is the Prometheus
metrics endpoint.
:ivar RecordingExecutorServer executor_server: An instance of :ivar RecordingExecutorServer executor_server: An instance of
:py:class:`~tests.base.RecordingExecutorServer` which is the :py:class:`~tests.base.RecordingExecutorServer` which is the
Ansible execute server used to run jobs for this test. Ansible execute server used to run jobs for this test.
@ -4279,6 +4303,8 @@ class ZuulTestCase(BaseTestCase):
self.statsd.start() self.statsd.start()
self.gearman_server = FakeGearmanServer(self.use_ssl) self.gearman_server = FakeGearmanServer(self.use_ssl)
self.prometheus_server = PrometheusServer()
self.prometheus_server.start()
self.config.set('gearman', 'port', str(self.gearman_server.port)) self.config.set('gearman', 'port', str(self.gearman_server.port))
self.log.info("Gearman server on port %s" % self.log.info("Gearman server on port %s" %
@ -4673,6 +4699,7 @@ class ZuulTestCase(BaseTestCase):
self.statsd.join() self.statsd.join()
self.rpcclient.shutdown() self.rpcclient.shutdown()
self.gearman_server.shutdown() self.gearman_server.shutdown()
self.prometheus_server.stop()
self.fake_nodepool.stop() self.fake_nodepool.stop()
self.zk_client.disconnect() self.zk_client.disconnect()
self.printHistory() self.printHistory()

View File

@ -0,0 +1,42 @@
# Copyright 2019 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import requests
from tests.base import ZuulTestCase
class BaseTestPrometheus(ZuulTestCase):
    """Base class for tests that read the test prometheus endpoint."""

    tenant_config_file = 'config/single-tenant/main.yaml'

    def get_metrics(self):
        """Fetch the prometheus endpoint and return its samples as a dict.

        Empty lines, lines starting with ``#`` and lines that do not split
        into exactly two whitespace-separated fields are skipped. Both keys
        and values are returned as the strings found in the response.
        """
        url = "http://localhost:%d" % self.prometheus_server.port
        response = requests.get(url)
        samples = {}
        for entry in response.text.split('\n'):
            if not entry or entry.startswith("#"):
                continue
            fields = entry.split()
            # Only "<name> <value>" lines are kept.
            if len(fields) != 2:
                continue
            name, sample = fields
            samples[name] = sample
        return samples
class TestPrometheus(BaseTestPrometheus):
    """Checks on the metrics served by the test prometheus endpoint."""

    def test_prometheus_process_metrics(self):
        """Process-level metrics must be present in the scrape output."""
        exposed = self.get_metrics()
        for name in ("process_resident_memory_bytes", "process_open_fds"):
            self.assertIn(name, exposed)

View File

@ -28,6 +28,7 @@ import sys
import traceback import traceback
import threading import threading
prometheus_client = extras.try_import('prometheus_client')
yappi = extras.try_import('yappi') yappi = extras.try_import('yappi')
objgraph = extras.try_import('objgraph') objgraph = extras.try_import('objgraph')
@ -199,6 +200,15 @@ class ZuulDaemonApp(ZuulApp, metaclass=abc.ABCMeta):
"Configured logging: {version}".format( "Configured logging: {version}".format(
version=zuul_version_info.release_string())) version=zuul_version_info.release_string()))
def setup_prometheus(self, section):
    """Start the prometheus HTTP server when ``section`` configures one.

    Nothing is done unless ``prometheus_port`` is present in ``section``.
    The listen address comes from ``prometheus_addr`` and falls back to
    ``0.0.0.0``.

    :arg str section: Name of the configuration section to read.
    :raises RuntimeError: If a port is configured but the
        ``prometheus_client`` module could not be imported.
    """
    if not self.config.has_option(section, 'prometheus_port'):
        return
    if not prometheus_client:
        raise RuntimeError("prometheus_client library is missing.")
    listen_port = int(self.config.get(section, 'prometheus_port'))
    listen_addr = get_default(
        self.config, section, 'prometheus_addr', '0.0.0.0')
    prometheus_client.start_http_server(listen_port, listen_addr)
def main(self): def main(self):
self.parseArguments() self.parseArguments()
self.readConfig() self.readConfig()

View File

@ -87,6 +87,7 @@ class Executor(zuul.cmd.ZuulDaemonApp):
os.mkdir(self.job_dir) os.mkdir(self.job_dir)
self.setup_logging('executor', 'log_config') self.setup_logging('executor', 'log_config')
self.setup_prometheus('executor')
self.log = logging.getLogger("zuul.Executor") self.log = logging.getLogger("zuul.Executor")
self.finger_port = int( self.finger_port = int(

View File

@ -50,6 +50,7 @@ class Merger(zuul.cmd.ZuulDaemonApp):
self.configure_connections(source_only=True) self.configure_connections(source_only=True)
self.setup_logging('merger', 'log_config') self.setup_logging('merger', 'log_config')
self.setup_prometheus('merger')
self.merger = MergeServer(self.config, self.connections) self.merger = MergeServer(self.config, self.connections)
self.merger.start() self.merger.start()

View File

@ -132,6 +132,7 @@ class Scheduler(zuul.cmd.ZuulDaemonApp):
self.start_gear_server() self.start_gear_server()
self.setup_logging('scheduler', 'log_config') self.setup_logging('scheduler', 'log_config')
self.setup_prometheus('scheduler')
self.log = logging.getLogger("zuul.Scheduler") self.log = logging.getLogger("zuul.Scheduler")
self.configure_connections(require_sql=True) self.configure_connections(require_sql=True)

View File

@ -84,6 +84,7 @@ class WebServer(zuul.cmd.ZuulDaemonApp):
sys.exit(0) sys.exit(0)
self.setup_logging('web', 'log_config') self.setup_logging('web', 'log_config')
self.setup_prometheus('web')
self.log = logging.getLogger("zuul.WebServer") self.log = logging.getLogger("zuul.WebServer")
try: try: