Merge "prometheus: add options to start the server and process collector"

This commit is contained in:
Zuul 2021-05-28 16:44:50 +00:00 committed by Gerrit Code Review
commit d48201651a
12 changed files with 148 additions and 0 deletions

View File

@ -392,6 +392,17 @@ The following sections of ``zuul.conf`` are used by the scheduler:
If a value higher than ``max_hold_expiration`` is supplied during
hold request creation, it will be lowered to this value.
.. attr:: prometheus_port
Set a TCP port to start the prometheus metrics client.
.. attr:: prometheus_addr
:default: 0.0.0.0
The IPv4 address to listen on for prometheus metrics polling.
To use IPv6, python>3.8 is required `issue24209 <https://bugs.python.org/issue24209>`_.
Operation
~~~~~~~~~

View File

@ -523,3 +523,46 @@ following statsd events:
* ``zuul.tenant.mytenant.pipeline.gate.project.example_com.myproject.master.job.myjob.SUCCESS`` +1
* ``zuul.tenant.mytenant.pipeline.gate.project.example_com.myproject.master.job.myjob.SUCCESS`` 40 seconds
* ``zuul.tenant.mytenant.pipeline.gate.all_jobs`` +1
Prometheus monitoring
---------------------
Zuul comes with support to start a prometheus_ metric server to be added as
prometheus's target.
.. _prometheus: https://prometheus.io/docs/introduction/overview/
Configuration
~~~~~~~~~~~~~
Prometheus support uses the ``prometheus_client`` python module.
Note that support is optional and Zuul will start without
the prometheus python module present.
To enable the service, set the ``prometheus_port`` in a service section of
``zuul.conf``. For example setting :attr:`scheduler.prometheus_port` to 9091
starts an HTTP server to expose metrics to a prometheus service at:
http://scheduler:9091/metrics
Metrics
~~~~~~~
These metrics are exposed by default:
.. stat:: process_virtual_memory_bytes
:type: gauge
.. stat:: process_resident_memory_bytes
:type: gauge
.. stat:: process_open_fds
:type: gauge
.. stat:: process_start_time_seconds
:type: gauge
.. stat:: process_cpu_seconds_total
:type: counter

View File

@ -26,22 +26,27 @@ tenant_config=/etc/zuul/main.yaml
log_config=/etc/zuul/logging.conf
pidfile=/var/run/zuul/zuul.pid
state_dir=/var/lib/zuul
prometheus_port=9091
;prometheus_addr=0.0.0.0
[merger]
git_dir=/var/lib/zuul/git
;git_user_email=zuul@example.com
;git_user_name=zuul
prometheus_port=9092
[executor]
default_username=zuul
trusted_ro_paths=/opt/zuul-scripts:/var/cache
trusted_rw_paths=/opt/zuul-logs
prometheus_port=9093
[web]
listen_address=127.0.0.1
port=9000
static_cache_expiry=0
status_url=https://zuul.example.com/status
prometheus_port=9094
[webclient]
url=https://zuul.example.com

View File

@ -0,0 +1,5 @@
---
features:
- |
A new prometheus_port option for the services can be used to start the
prometheus python client and expose metrics.

View File

@ -10,6 +10,7 @@ GitPython>=2.1.8
python-daemon>=2.0.4
extras
statsd>=3.0
prometheus-client
voluptuous>=0.10.2
gear>=0.13.0,<1.0.0,!=0.15.0
apscheduler>=3.0

View File

@ -65,6 +65,7 @@ from git.exc import NoSuchPathError
from git.util import IterableList
import yaml
import paramiko
import prometheus_client.exposition
from zuul.driver.sql.sqlconnection import DatabaseSession
from zuul.model import Change
@ -871,6 +872,25 @@ class FakeGerritChange(object):
self.reported += 1
class PrometheusServer(object):
    """Serve the default prometheus registry over HTTP for tests.

    ``start()`` binds a threaded WSGI server to port 0 on all IPv4
    interfaces and serves it from a daemon thread.  The port actually
    bound is published as ``self.port`` so that tests can poll it.
    """

    def start(self):
        """Create the metrics HTTP server and serve it in the background."""
        app = prometheus_client.make_wsgi_app(prometheus_client.REGISTRY)
        self.httpd = prometheus_client.exposition.make_server(
            "0.0.0.0",
            0,
            app,
            prometheus_client.exposition.ThreadingWSGIServer,
            handler_class=prometheus_client.exposition._SilentHandler)
        # Port 0 was requested, so read the real port back from the socket.
        self.port = self.httpd.socket.getsockname()[1]
        self.thread = threading.Thread(target=self.httpd.serve_forever)
        self.thread.daemon = True
        self.thread.start()

    def stop(self):
        """Stop serving, wait for the thread, and release the socket."""
        self.httpd.shutdown()
        self.thread.join()
        # shutdown() only ends the serve_forever() loop; the listening
        # socket stays open until server_close() is called.  Without this
        # every test case leaks one bound socket.
        self.httpd.server_close()
class GerritWebServer(object):
def __init__(self, fake_gerrit):
@ -4168,6 +4188,10 @@ class ZuulTestCase(BaseTestCase):
server that all of the Zuul components in this test use to
communicate with each other.
:ivar PrometheusServer prometheus_server: An instance of
:py:class:`~tests.base.PrometheusServer` which is the Prometheus
metrics endpoint.
:ivar RecordingExecutorServer executor_server: An instance of
:py:class:`~tests.base.RecordingExecutorServer` which is the
Ansible execute server used to run jobs for this test.
@ -4291,6 +4315,8 @@ class ZuulTestCase(BaseTestCase):
self.statsd.start()
self.gearman_server = FakeGearmanServer(self.use_ssl)
self.prometheus_server = PrometheusServer()
self.prometheus_server.start()
self.config.set('gearman', 'port', str(self.gearman_server.port))
self.log.info("Gearman server on port %s" %
@ -4687,6 +4713,7 @@ class ZuulTestCase(BaseTestCase):
self.statsd.join()
self.rpcclient.shutdown()
self.gearman_server.shutdown()
self.prometheus_server.stop()
self.fake_nodepool.stop()
self.zk_client.disconnect()
self.printHistory()

View File

@ -0,0 +1,42 @@
# Copyright 2019 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import requests
from tests.base import ZuulTestCase
class BaseTestPrometheus(ZuulTestCase):
    """Base class for tests that read the prometheus metrics endpoint."""

    tenant_config_file = 'config/single-tenant/main.yaml'

    def get_metrics(self):
        """Fetch the metrics page and return it as a ``{name: value}`` dict.

        Blank lines, ``#`` comment lines and any line that does not split
        into exactly two whitespace-separated fields are ignored.  Values
        are kept as the strings found in the response.
        """
        url = "http://localhost:%d" % self.prometheus_server.port
        response = requests.get(url)
        samples = {}
        for raw in response.text.split('\n'):
            if raw == '' or raw.startswith("#"):
                continue
            fields = raw.split()
            # Only plain "<name> <value>" samples are recorded.
            if len(fields) != 2:
                continue
            name, value = fields
            samples[name] = value
        return samples
class TestPrometheus(BaseTestPrometheus):
    """Checks on the metrics exposed by the prometheus endpoint."""

    def test_prometheus_process_metrics(self):
        """The process collector metrics must appear in the output."""
        exposed = self.get_metrics()
        expected_names = (
            "process_resident_memory_bytes",
            "process_open_fds",
        )
        for metric_name in expected_names:
            self.assertIn(metric_name, exposed)

View File

@ -28,6 +28,7 @@ import sys
import traceback
import threading
prometheus_client = extras.try_import('prometheus_client')
yappi = extras.try_import('yappi')
objgraph = extras.try_import('objgraph')
@ -199,6 +200,15 @@ class ZuulDaemonApp(ZuulApp, metaclass=abc.ABCMeta):
"Configured logging: {version}".format(
version=zuul_version_info.release_string()))
def setup_prometheus(self, section):
    """Start the prometheus metrics HTTP server if *section* asks for it.

    Nothing happens unless ``prometheus_port`` is set in *section* of the
    configuration.  When it is set, the server is started on that port,
    listening on ``prometheus_addr`` (``0.0.0.0`` if unset).

    :arg str section: Name of the config section to read options from.
    :raises RuntimeError: If a port is configured but the
        ``prometheus_client`` module could not be imported.
    """
    if not self.config.has_option(section, 'prometheus_port'):
        return
    if not prometheus_client:
        raise RuntimeError("prometheus_client library is missing.")
    listen_port = int(self.config.get(section, 'prometheus_port'))
    listen_addr = get_default(
        self.config, section, 'prometheus_addr', '0.0.0.0')
    prometheus_client.start_http_server(listen_port, listen_addr)
def main(self):
self.parseArguments()
self.readConfig()

View File

@ -87,6 +87,7 @@ class Executor(zuul.cmd.ZuulDaemonApp):
os.mkdir(self.job_dir)
self.setup_logging('executor', 'log_config')
self.setup_prometheus('executor')
self.log = logging.getLogger("zuul.Executor")
self.finger_port = int(

View File

@ -50,6 +50,7 @@ class Merger(zuul.cmd.ZuulDaemonApp):
self.configure_connections(source_only=True)
self.setup_logging('merger', 'log_config')
self.setup_prometheus('merger')
self.merger = MergeServer(self.config, self.connections)
self.merger.start()

View File

@ -132,6 +132,7 @@ class Scheduler(zuul.cmd.ZuulDaemonApp):
self.start_gear_server()
self.setup_logging('scheduler', 'log_config')
self.setup_prometheus('scheduler')
self.log = logging.getLogger("zuul.Scheduler")
self.configure_connections(require_sql=True)

View File

@ -84,6 +84,7 @@ class WebServer(zuul.cmd.ZuulDaemonApp):
sys.exit(0)
self.setup_logging('web', 'log_config')
self.setup_prometheus('web')
self.log = logging.getLogger("zuul.WebServer")
try: