zuul/zuul/lib/monitoring.py
James E. Blair 704fef6cb9 Add readiness/liveness probes to prometheus server
To facilitate automation of rolling restarts, configure the prometheus
server to answer readiness and liveness probes.  We are 'live' if the
process is running, and we are 'ready' if our component state is
either running or paused (not initializing or stopped).

The prometheus_client library doesn't support this directly, so we need
to handle this ourselves.  We could create yet another HTTP server that
each component would need to start, or we could take advantage of the
fact that the prometheus_client is a standard WSGI service and just
wrap it in our own WSGI service that adds the extra endpoints needed.
Since that is far simpler and less resource intensive, that is what
this change does.

The prometheus_client will actually return the metrics on any path
given to it.  In order to reduce the chances of an operator configuring
a liveness probe with a typo (eg '/healthy/ready') and getting the
metrics page served with a 200 response, we restrict the metrics to
only the '/metrics' URI which is what we specified in our documentation,
and also '/' which is very likely accidentally used by users.

Change-Id: I154ca4896b69fd52eda655209480a75c8d7dbac3
2021-12-09 07:37:29 -08:00

86 lines
3.0 KiB
Python

# Copyright 2021 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import threading
import prometheus_client
from zuul.lib.config import get_default
class MonitoringServer:
    """Serve Prometheus metrics and health probes from one HTTP server.

    The prometheus_client WSGI application is wrapped by
    :meth:`handleRequest`, which adds the ``/health/live``,
    ``/health/ready`` and ``/health/status`` endpoints and only exposes
    the metrics page on ``/metrics`` and ``/``.

    When the configuration section has no ``prometheus_port`` option the
    server is disabled: ``httpd`` is ``None`` and ``start``, ``stop`` and
    ``join`` do nothing.
    """

    def __init__(self, config, section, component_info):
        """Create (but do not start) the HTTP server.

        :param config: Parsed configuration offering ``has_option`` and
            ``get``.
        :param section: Name of the configuration section to read
            ``prometheus_port`` and ``prometheus_addr`` from.
        :param component_info: Object whose ``state`` attribute is
            compared against its ``RUNNING`` and ``PAUSED`` attributes
            to answer readiness probes.
        """
        # Define every attribute up front so a disabled or never-started
        # server can still be stopped, joined and inspected without
        # raising AttributeError.
        self.component_info = component_info
        self.httpd = None
        self.thread = None
        self.port = None
        self.prometheus_app = None
        if not config.has_option(section, 'prometheus_port'):
            return

        port = int(config.get(section, 'prometheus_port'))
        addr = get_default(
            config, section, 'prometheus_addr', '0.0.0.0')
        self.prometheus_app = prometheus_client.make_wsgi_app(
            prometheus_client.registry.REGISTRY)
        self.httpd = prometheus_client.exposition.make_server(
            addr, port,
            self.handleRequest,
            prometheus_client.exposition.ThreadingWSGIServer,
            handler_class=prometheus_client.exposition._SilentHandler)
        # The unit tests pass in 0 for the port, so record the port the
        # socket was actually bound to.
        self.port = self.httpd.socket.getsockname()[1]

    def start(self):
        """Begin serving requests on a daemon thread (no-op if disabled)."""
        if self.httpd is None:
            return
        self.thread = threading.Thread(target=self.httpd.serve_forever)
        self.thread.daemon = True
        self.thread.start()

    def stop(self):
        """Ask the serving loop to exit.

        Does nothing when the server is disabled or was never started.
        """
        # shutdown() waits for serve_forever() to finish; calling it when
        # no serving thread exists would block forever.
        if self.httpd is None or self.thread is None:
            return
        self.httpd.shutdown()

    def join(self):
        """Wait for the serving thread to finish and close the socket."""
        if self.httpd is None:
            return
        if self.thread is not None:
            self.thread.join()
        # The listening socket is bound in __init__, so release it even
        # if the server was never started.
        self.httpd.socket.close()

    def handleRequest(self, environ, start_response):
        """WSGI entry point dispatching on the request path.

        * ``/health/live``: always ``200 OK``.
        * ``/health/ready``: ``200 OK`` when the component state is
          RUNNING or PAUSED, otherwise ``503 Service Unavailable``.
        * ``/health/status``: ``200 OK`` with the upper-cased state as
          plain text.
        * ``/metrics`` and ``/``: delegated to the prometheus app.
        * anything else: ``404 Not Found``.

        :param environ: WSGI environment dictionary.
        :param start_response: WSGI ``start_response`` callable.
        :returns: Iterable of byte strings forming the response body.
        """
        # PEP 3333 allows servers to omit PATH_INFO when it is empty, so
        # treat a missing key as the empty path rather than failing.
        path = environ.get('PATH_INFO', '')

        if path in ('/metrics', '/'):
            # The docs say '/metrics' but '/' worked and was likely
            # used by users, so let's support both for now.
            return self.prometheus_app(environ, start_response)

        headers = []
        output = b''
        if path == '/health/live':
            status = '200 OK'
        elif path == '/health/ready':
            ready_states = (self.component_info.RUNNING,
                            self.component_info.PAUSED)
            if self.component_info.state in ready_states:
                status = '200 OK'
            else:
                status = '503 Service Unavailable'
        elif path == '/health/status':
            status = '200 OK'
            headers = [('Content-Type', 'text/plain')]
            output = str(self.component_info.state).encode('utf8').upper()
        else:
            # Unknown paths get a 404 so that a mistyped probe URL is
            # not answered with the metrics page and a 200.
            status = '404 Not Found'

        start_response(status, headers)
        return [output]