Add stats to web server

This adds metrics which report the number of thread workers in use
as well as the number of requests queued at the start of each
request in cherrypy.

It also reports the number of streamers currently running.

These can help us detect and diagnose problems with the web server.

Change-Id: Iadf9479ae84167892ab11ae122f275637c0c6c6f
This commit is contained in:
James E. Blair 2022-02-02 16:05:15 -08:00
parent 482338f70c
commit b0d36267f3
6 changed files with 136 additions and 3 deletions

View File

@ -511,6 +511,34 @@ These metrics are emitted by the Zuul :ref:`scheduler`:
performance metric of how long the SQL query takes; it is not
the estimated time value itself.
.. stat:: zuul.web
Holds metrics related to the Zuul web component.
.. stat:: server.<hostname>
Holds metrics from a specific zuul-web server.
.. stat:: threadpool
Metrics related to the web server thread pool.
.. stat:: idle
:type: gauge
The number of idle workers.
.. stat:: queue
:type: gauge
The number of requests queued for workers.
.. stat:: streamers
:type: gauge
The number of log streamers currently in operation.
As an example, given a job named `myjob` in `mytenant` triggered by a
change to `myproject` on the `master` branch in the `gate` pipeline
which took 40 seconds to build, the Zuul scheduler will emit the
@ -559,6 +587,23 @@ These metrics are exposed by default:
.. stat:: process_cpu_seconds_total
:type: counter
On web servers the following additional metrics are exposed:
.. stat:: web_threadpool_idle
:type: gauge
The number of idle workers in the thread pool.
.. stat:: web_threadpool_queue
:type: gauge
The number of requests queued for thread pool workers.
.. stat:: web_streamers
:type: gauge
The number of log streamers currently in operation.
.. _prometheus_liveness:
Liveness Probes

View File

@ -0,0 +1,4 @@
---
features:
- |
New monitoring metrics specific to zuul-web are available.

View File

@ -36,6 +36,7 @@ from typing import Callable, Optional, Any, Iterable, Generator, List, Dict
from unittest.case import skipIf
import zlib
import prometheus_client
import requests
import select
import shutil
@ -4013,6 +4014,24 @@ class PostgresqlSchemaFixture(fixtures.Fixture):
cur.execute("drop user %s" % self.name)
class PrometheusFixture(fixtures.Fixture):
    """Restore the global prometheus registry to its pre-test contents.

    On setup the collectors currently present in the default registry
    are recorded; on cleanup every collector that was not in that
    snapshot is unregistered.
    """

    def _setUp(self):
        # Snapshot the collectors which exist before the test runs
        # (ie, the standard prometheus_client collectors).
        registry = prometheus_client.registry.REGISTRY
        self.collectors = list(registry._collector_to_names.keys())
        self.addCleanup(self._cleanup)

    def _cleanup(self):
        # Unregister anything the test added so that the next test
        # does not hit the "Duplicated timeseries in
        # CollectorRegistry" error.
        registry = prometheus_client.registry.REGISTRY
        present = list(registry._collector_to_names.keys())
        for candidate in present:
            if candidate in self.collectors:
                continue
            registry.unregister(candidate)
class FakeCPUTimes:
def __init__(self):
self.user = 0
@ -4052,6 +4071,7 @@ class BaseTestCase(testtools.TestCase):
def setUp(self):
super(BaseTestCase, self).setUp()
self.useFixture(PrometheusFixture())
test_timeout = os.environ.get('OS_TEST_TIMEOUT', 0)
try:
test_timeout = int(test_timeout)

View File

@ -28,6 +28,7 @@ import time
import zuul.web
import zuul.lib.log_streamer
from zuul.lib.fingergw import FingerGateway
from zuul.lib.statsd import normalize_statsd_name
import tests.base
from tests.base import iterate_timeout, ZuulWebFixture, FIXTURE_DIR
@ -391,6 +392,10 @@ class TestStreaming(TestStreamingBase):
self.log.debug("\n\nStreamed: %s\n\n", client1.results)
self.assertEqual(file_contents, client1.results)
hostname = normalize_statsd_name(socket.getfqdn())
self.assertReportedStat(
f'zuul.web.server.{hostname}.streamers', kind='g')
def test_websocket_streaming(self):
# Start the web server
web = self.useFixture(

View File

@ -1,5 +1,6 @@
# Copyright 2014 Hewlett-Packard Development Company, L.P.
# Copyright 2014 Rackspace Australia
# Copyright 2021-2022 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
@ -26,6 +27,7 @@ from unittest import skip
import requests
from zuul.lib.statsd import normalize_statsd_name
import zuul.web
from tests.base import ZuulTestCase, AnsibleZuulTestCase
@ -232,6 +234,11 @@ class TestWeb(BaseTestWeb):
self.assertEqual(len(status_jobs[2]['dependencies']), 1)
self.assertIn('project-merge', status_jobs[1]['dependencies'])
self.assertIn('project-merge', status_jobs[2]['dependencies'])
hostname = normalize_statsd_name(socket.getfqdn())
self.assertReportedStat(
f'zuul.web.server.{hostname}.threadpool.idle', kind='g')
self.assertReportedStat(
f'zuul.web.server.{hostname}.threadpool.queue', kind='g')
def test_web_components(self):
"Test that we can retrieve the list of connected components"

View File

@ -31,6 +31,7 @@ import select
import ssl
import threading
import uuid
import prometheus_client
import zuul.executor.common
from zuul import exceptions
@ -76,6 +77,7 @@ from zuul.zk.zkobject import LocalZKContext, ZKContext
from zuul.lib.auth import AuthenticatorRegistry
from zuul.lib.config import get_default
from zuul.lib.logutil import get_annotated_logger
from zuul.lib.statsd import get_statsd, normalize_statsd_name
from zuul.web.logutil import ZuulCherrypyLogManager
STATIC_DIR = os.path.join(os.path.dirname(__file__), 'static')
@ -143,6 +145,38 @@ cherrypy.tools.handle_options = cherrypy.Tool('on_start_resource',
handle_options)
class StatsTool(cherrypy.Tool):
    """CherryPy tool which reports thread pool gauges for each request.

    The tool hooks ``on_start_resource`` and, each time it fires,
    publishes the number of idle worker threads and the request queue
    size of the cherrypy HTTP server.

    :param statsd: A statsd client, or a false value to skip statsd
        reporting.
    :param metrics: An object with ``threadpool_idle`` and
        ``threadpool_queue`` gauges (each providing ``set()``).
    """

    def __init__(self, statsd, metrics):
        self.statsd = statsd
        self.metrics = metrics
        # The hostname is embedded in the statsd key, so normalize it
        # once here rather than on every request.
        self.hostname = normalize_statsd_name(socket.getfqdn())
        cherrypy.Tool.__init__(self, 'on_start_resource',
                               self.emitStats)

    def emitStats(self):
        """Read the thread pool state and publish it to both backends."""
        idle = cherrypy.server.httpserver.requests.idle
        qsize = cherrypy.server.httpserver.requests.qsize
        self.metrics.threadpool_idle.set(idle)
        # The queue gauge must carry the queue size; it previously
        # received the idle count, which made the prometheus value
        # disagree with the statsd gauge emitted below.
        self.metrics.threadpool_queue.set(qsize)
        if self.statsd:
            self.statsd.gauge(
                f'zuul.web.server.{self.hostname}.threadpool.idle',
                idle)
            self.statsd.gauge(
                f'zuul.web.server.{self.hostname}.threadpool.queue',
                qsize)
class WebMetrics:
    """Container for the prometheus gauges reported by the web server.

    Three gauges are created on construction: ``threadpool_idle``,
    ``threadpool_queue`` and ``streamers``.
    """

    def __init__(self):
        make_gauge = prometheus_client.Gauge
        self.threadpool_idle = make_gauge(
            'web_threadpool_idle',
            'The number of idle worker threads',
        )
        self.threadpool_queue = make_gauge(
            'web_threadpool_queue',
            'The number of queued requests',
        )
        self.streamers = make_gauge(
            'web_streamers',
            'The number of log streamers currently operating',
        )
# Custom JSONEncoder that combines the ZuulJSONEncoder with cherrypy's
# JSON functionality.
class ZuulWebJSONEncoder(ZuulJSONEncoder):
@ -1563,7 +1597,10 @@ class StaticHandler(object):
class StreamManager(object):
log = logging.getLogger("zuul.web")
def __init__(self):
def __init__(self, statsd, metrics):
self.statsd = statsd
self.metrics = metrics
self.hostname = normalize_statsd_name(socket.getfqdn())
self.streamers = {}
self.poll = select.poll()
self.bitmask = (select.POLLIN | select.POLLERR |
@ -1605,11 +1642,19 @@ class StreamManager(object):
except KeyError:
pass
def emitStats(self):
    """Publish the number of registered streamers.

    The prometheus gauge is always updated; the statsd gauge is only
    sent when a statsd client is configured.
    """
    count = len(self.streamers)
    self.metrics.streamers.set(count)
    if not self.statsd:
        return
    key = f'zuul.web.server.{self.hostname}.streamers'
    self.statsd.gauge(key, count)
def registerStreamer(self, streamer):
# Start tracking a streamer: it is stored keyed by the file descriptor
# of its finger socket, and that descriptor is added to the poll object
# using this manager's event bitmask.
self.log.debug("Registering streamer %s", streamer)
self.streamers[streamer.finger_socket.fileno()] = streamer
self.poll.register(streamer.finger_socket.fileno(), self.bitmask)
# NOTE(review): presumably this write wakes the polling loop so that it
# notices the newly registered descriptor -- confirm against whatever
# reads the other end of wake_write.
os.write(self.wake_write, b'\n')
# Refresh the streamer-count gauges now that the set has changed.
self.emitStats()
def unregisterStreamer(self, streamer):
self.log.debug("Unregistering streamer %s", streamer)
@ -1622,6 +1667,7 @@ class StreamManager(object):
except KeyError:
pass
streamer.closeSocket()
self.emitStats()
class ZuulWeb(object):
@ -1634,6 +1680,9 @@ class ZuulWeb(object):
info: WebInfo = None):
self.start_time = time.time()
self.config = config
self.metrics = WebMetrics()
self.statsd = get_statsd(config)
self.listen_address = get_default(self.config,
'web', 'listen_address',
'127.0.0.1')
@ -1681,7 +1730,7 @@ class ZuulWeb(object):
self.connections = connections
self.authenticators = authenticators
self.stream_manager = StreamManager()
self.stream_manager = StreamManager(self.statsd, self.metrics)
self.zone = get_default(self.config, 'web', 'zone')
self.management_events = TenantManagementEventQueue.createRegistry(
@ -1836,9 +1885,12 @@ class ZuulWeb(object):
controller=StaticHandler(self.static_path),
action='default')
cherrypy.tools.stats = StatsTool(self.statsd, self.metrics)
conf = {
'/': {
'request.dispatch': route_map
'request.dispatch': route_map,
'tools.stats.on': True,
}
}
cherrypy.config.update({