Add stats to web server
This adds metrics which report the number of thread workers in use as well as the number of requests queued at the start of each request in cherrypy. It also reports the number of streamers currently running. These can help us detect and diagnose problems with the web server. Change-Id: Iadf9479ae84167892ab11ae122f275637c0c6c6f
This commit is contained in:
parent
482338f70c
commit
b0d36267f3
|
@ -511,6 +511,34 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
|
|||
performance metric of how long the SQL query takes; it is not
|
||||
the estimated time value itself.
|
||||
|
||||
.. stat:: zuul.web
|
||||
|
||||
Holds metrics related to the Zuul web component.
|
||||
|
||||
.. stat:: server.<hostname>
|
||||
|
||||
Holds metrics from a specific zuul-web server.
|
||||
|
||||
.. stat:: threadpool
|
||||
|
||||
Metrics related to the web server thread pool.
|
||||
|
||||
.. stat:: idle
|
||||
:type: gauge
|
||||
|
||||
The number of idle workers.
|
||||
|
||||
.. stat:: queue
|
||||
:type: gauge
|
||||
|
||||
The number of requests queued for workers.
|
||||
|
||||
.. stat:: streamers
|
||||
:type: gauge
|
||||
|
||||
The number of log streamers currently in operation.
|
||||
|
||||
|
||||
As an example, given a job named `myjob` in `mytenant` triggered by a
|
||||
change to `myproject` on the `master` branch in the `gate` pipeline
|
||||
which took 40 seconds to build, the Zuul scheduler will emit the
|
||||
|
@ -559,6 +587,23 @@ These metrics are exposed by default:
|
|||
.. stat:: process_cpu_seconds_total
|
||||
:type: counter
|
||||
|
||||
On web servers the following additional metrics are exposed:
|
||||
|
||||
.. stat:: web_threadpool_idle
|
||||
:type: gauge
|
||||
|
||||
The number of idle workers in the thread pool.
|
||||
|
||||
.. stat:: web_threadpool_queue
|
||||
:type: gauge
|
||||
|
||||
The number of requests queued for thread pool workers.
|
||||
|
||||
.. stat:: web_streamers
|
||||
:type: gauge
|
||||
|
||||
The number of log streamers currently in operation.
|
||||
|
||||
.. _prometheus_liveness:
|
||||
|
||||
Liveness Probes
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
features:
|
||||
- |
|
||||
New monitoring metrics specific to zuul-web are available.
|
|
@ -36,6 +36,7 @@ from typing import Callable, Optional, Any, Iterable, Generator, List, Dict
|
|||
from unittest.case import skipIf
|
||||
import zlib
|
||||
|
||||
import prometheus_client
|
||||
import requests
|
||||
import select
|
||||
import shutil
|
||||
|
@ -4013,6 +4014,24 @@ class PostgresqlSchemaFixture(fixtures.Fixture):
|
|||
cur.execute("drop user %s" % self.name)
|
||||
|
||||
|
||||
class PrometheusFixture(fixtures.Fixture):
    """Remove Prometheus collectors registered while a test was running.

    The default ``prometheus_client`` registry is process-global, so a
    collector created by one test would otherwise still be registered
    when the next test tries to create it again.
    """

    def _setUp(self):
        # Remember which collectors are registered before the test
        # starts (ie, the standard prometheus_client collectors) so
        # that cleanup only touches the ones added afterwards.
        registry = prometheus_client.registry.REGISTRY
        self.collectors = list(registry._collector_to_names.keys())
        self.addCleanup(self._cleanup)

    def _cleanup(self):
        # Unregister everything added since _setUp; this avoids the
        # "Duplicated timeseries in CollectorRegistry" error.
        registry = prometheus_client.registry.REGISTRY
        # Work from a snapshot: unregistering changes the registry's
        # bookkeeping.
        current = list(registry._collector_to_names.keys())
        added = [c for c in current if c not in self.collectors]
        for collector in added:
            registry.unregister(collector)
|
||||
|
||||
|
||||
class FakeCPUTimes:
|
||||
def __init__(self):
|
||||
self.user = 0
|
||||
|
@ -4052,6 +4071,7 @@ class BaseTestCase(testtools.TestCase):
|
|||
|
||||
def setUp(self):
|
||||
super(BaseTestCase, self).setUp()
|
||||
self.useFixture(PrometheusFixture())
|
||||
test_timeout = os.environ.get('OS_TEST_TIMEOUT', 0)
|
||||
try:
|
||||
test_timeout = int(test_timeout)
|
||||
|
|
|
@ -28,6 +28,7 @@ import time
|
|||
import zuul.web
|
||||
import zuul.lib.log_streamer
|
||||
from zuul.lib.fingergw import FingerGateway
|
||||
from zuul.lib.statsd import normalize_statsd_name
|
||||
import tests.base
|
||||
from tests.base import iterate_timeout, ZuulWebFixture, FIXTURE_DIR
|
||||
|
||||
|
@ -391,6 +392,10 @@ class TestStreaming(TestStreamingBase):
|
|||
self.log.debug("\n\nStreamed: %s\n\n", client1.results)
|
||||
self.assertEqual(file_contents, client1.results)
|
||||
|
||||
hostname = normalize_statsd_name(socket.getfqdn())
|
||||
self.assertReportedStat(
|
||||
f'zuul.web.server.{hostname}.streamers', kind='g')
|
||||
|
||||
def test_websocket_streaming(self):
|
||||
# Start the web server
|
||||
web = self.useFixture(
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# Copyright 2014 Hewlett-Packard Development Company, L.P.
|
||||
# Copyright 2014 Rackspace Australia
|
||||
# Copyright 2021-2022 Acme Gating, LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
|
@ -26,6 +27,7 @@ from unittest import skip
|
|||
|
||||
import requests
|
||||
|
||||
from zuul.lib.statsd import normalize_statsd_name
|
||||
import zuul.web
|
||||
|
||||
from tests.base import ZuulTestCase, AnsibleZuulTestCase
|
||||
|
@ -232,6 +234,11 @@ class TestWeb(BaseTestWeb):
|
|||
self.assertEqual(len(status_jobs[2]['dependencies']), 1)
|
||||
self.assertIn('project-merge', status_jobs[1]['dependencies'])
|
||||
self.assertIn('project-merge', status_jobs[2]['dependencies'])
|
||||
hostname = normalize_statsd_name(socket.getfqdn())
|
||||
self.assertReportedStat(
|
||||
f'zuul.web.server.{hostname}.threadpool.idle', kind='g')
|
||||
self.assertReportedStat(
|
||||
f'zuul.web.server.{hostname}.threadpool.queue', kind='g')
|
||||
|
||||
def test_web_components(self):
|
||||
"Test that we can retrieve the list of connected components"
|
||||
|
|
|
@ -31,6 +31,7 @@ import select
|
|||
import ssl
|
||||
import threading
|
||||
import uuid
|
||||
import prometheus_client
|
||||
|
||||
import zuul.executor.common
|
||||
from zuul import exceptions
|
||||
|
@ -76,6 +77,7 @@ from zuul.zk.zkobject import LocalZKContext, ZKContext
|
|||
from zuul.lib.auth import AuthenticatorRegistry
|
||||
from zuul.lib.config import get_default
|
||||
from zuul.lib.logutil import get_annotated_logger
|
||||
from zuul.lib.statsd import get_statsd, normalize_statsd_name
|
||||
from zuul.web.logutil import ZuulCherrypyLogManager
|
||||
|
||||
STATIC_DIR = os.path.join(os.path.dirname(__file__), 'static')
|
||||
|
@ -143,6 +145,38 @@ cherrypy.tools.handle_options = cherrypy.Tool('on_start_resource',
|
|||
handle_options)
|
||||
|
||||
|
||||
class StatsTool(cherrypy.Tool):
    """CherryPy tool which reports web server thread pool statistics.

    The tool is attached to the ``on_start_resource`` hook, so
    :meth:`emitStats` is invoked as request processing starts.  The
    values are always written to the Prometheus gauges held by
    ``metrics``; they are additionally sent to statsd when a statsd
    client is available.

    :param statsd: A statsd client, or a false value (e.g. ``None``)
        to skip statsd reporting.
    :param metrics: An object providing ``threadpool_idle`` and
        ``threadpool_queue`` gauges (see ``WebMetrics``).
    """

    def __init__(self, statsd, metrics):
        self.statsd = statsd
        self.metrics = metrics
        # The normalized hostname is a component of the statsd keys.
        self.hostname = normalize_statsd_name(socket.getfqdn())
        cherrypy.Tool.__init__(self, 'on_start_resource',
                               self.emitStats)

    def emitStats(self):
        """Publish the idle worker count and the request queue size."""
        pool = cherrypy.server.httpserver.requests
        idle = pool.idle
        qsize = pool.qsize
        self.metrics.threadpool_idle.set(idle)
        # The queue gauge must report the queue size (it was previously
        # set to the idle count), matching the value sent to statsd.
        self.metrics.threadpool_queue.set(qsize)
        if self.statsd:
            prefix = f'zuul.web.server.{self.hostname}.threadpool'
            self.statsd.gauge(f'{prefix}.idle', idle)
            self.statsd.gauge(f'{prefix}.queue', qsize)
|
||||
|
||||
|
||||
class WebMetrics:
    """Container for the Prometheus gauges published by zuul-web."""

    def __init__(self):
        new_gauge = prometheus_client.Gauge
        # Thread pool gauges.
        self.threadpool_idle = new_gauge(
            'web_threadpool_idle',
            'The number of idle worker threads')
        self.threadpool_queue = new_gauge(
            'web_threadpool_queue',
            'The number of queued requests')
        # Log streaming gauge.
        self.streamers = new_gauge(
            'web_streamers',
            'The number of log streamers currently operating')
|
||||
|
||||
|
||||
# Custom JSONEncoder that combines the ZuulJSONEncoder with cherrypy's
|
||||
# JSON functionality.
|
||||
class ZuulWebJSONEncoder(ZuulJSONEncoder):
|
||||
|
@ -1563,7 +1597,10 @@ class StaticHandler(object):
|
|||
class StreamManager(object):
|
||||
log = logging.getLogger("zuul.web")
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, statsd, metrics):
|
||||
self.statsd = statsd
|
||||
self.metrics = metrics
|
||||
self.hostname = normalize_statsd_name(socket.getfqdn())
|
||||
self.streamers = {}
|
||||
self.poll = select.poll()
|
||||
self.bitmask = (select.POLLIN | select.POLLERR |
|
||||
|
@ -1605,11 +1642,19 @@ class StreamManager(object):
|
|||
except KeyError:
|
||||
pass
|
||||
|
||||
def emitStats(self):
    """Publish the number of currently registered streamers.

    The count is always written to the ``streamers`` gauge of
    ``self.metrics``; it is also sent to statsd when a statsd client
    is set.
    """
    count = len(self.streamers)
    self.metrics.streamers.set(count)
    if not self.statsd:
        return
    key = f'zuul.web.server.{self.hostname}.streamers'
    self.statsd.gauge(key, count)
|
||||
|
||||
def registerStreamer(self, streamer):
    """Start tracking ``streamer`` and add its socket to the poll set.

    The streamer is stored under the file descriptor of its finger
    socket, a newline is written to ``self.wake_write``, and the
    streamer statistics are re-emitted.
    """
    self.log.debug("Registering streamer %s", streamer)
    fd = streamer.finger_socket.fileno()
    self.streamers[fd] = streamer
    self.poll.register(fd, self.bitmask)
    # NOTE(review): presumably this wakes the polling loop so that it
    # notices the new descriptor -- confirm against the run loop.
    os.write(self.wake_write, b'\n')
    self.emitStats()
|
||||
|
||||
def unregisterStreamer(self, streamer):
|
||||
self.log.debug("Unregistering streamer %s", streamer)
|
||||
|
@ -1622,6 +1667,7 @@ class StreamManager(object):
|
|||
except KeyError:
|
||||
pass
|
||||
streamer.closeSocket()
|
||||
self.emitStats()
|
||||
|
||||
|
||||
class ZuulWeb(object):
|
||||
|
@ -1634,6 +1680,9 @@ class ZuulWeb(object):
|
|||
info: WebInfo = None):
|
||||
self.start_time = time.time()
|
||||
self.config = config
|
||||
self.metrics = WebMetrics()
|
||||
self.statsd = get_statsd(config)
|
||||
|
||||
self.listen_address = get_default(self.config,
|
||||
'web', 'listen_address',
|
||||
'127.0.0.1')
|
||||
|
@ -1681,7 +1730,7 @@ class ZuulWeb(object):
|
|||
|
||||
self.connections = connections
|
||||
self.authenticators = authenticators
|
||||
self.stream_manager = StreamManager()
|
||||
self.stream_manager = StreamManager(self.statsd, self.metrics)
|
||||
self.zone = get_default(self.config, 'web', 'zone')
|
||||
|
||||
self.management_events = TenantManagementEventQueue.createRegistry(
|
||||
|
@ -1836,9 +1885,12 @@ class ZuulWeb(object):
|
|||
controller=StaticHandler(self.static_path),
|
||||
action='default')
|
||||
|
||||
cherrypy.tools.stats = StatsTool(self.statsd, self.metrics)
|
||||
|
||||
conf = {
|
||||
'/': {
|
||||
'request.dispatch': route_map
|
||||
'request.dispatch': route_map,
|
||||
'tools.stats.on': True,
|
||||
}
|
||||
}
|
||||
cherrypy.config.update({
|
||||
|
|
Loading…
Reference in New Issue