Files
zuul/zuul/lib/streamer_utils.py
James E. Blair 317e42a2e5 Use a TreeCache for job request queues
To reduce ZK traffic and contention in sites with many executors
and jobs, we move the job request queue to a TreeCache.  This is
a structure we have been using successfully for some time in
nodepool, and we added it to Zuul for the nodepool-in-zuul work.

This uses a single watch for all items in the job request queue.
It will keep all the requests in the local cache of the queue
up-to-date.  We still do not cache the parameters and other data;
we only load those when needed.

A later change will take advantage of the fact that we can detect
whether a request is locked or not without locking it ourselves
to further reduce traffic and contention.  This change attempts
to minimize changes other than what is necessary for the tree
cache.

Change-Id: Ie3d62dadd9990c847df8302d4da801a990394d2c
2025-02-04 08:53:52 -08:00

238 lines
8.3 KiB
Python

# Copyright 2017 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
'''
This file contains code common to finger log streaming functionality.
The log streamer process within each executor, the finger gateway service,
and the web interface will all make use of this module.
'''
import logging
import os
import pwd
import random
import select
import socket
import socketserver
import ssl
import threading
import time
from zuul.exceptions import StreamingError
from zuul.zk.components import COMPONENT_REGISTRY
log = logging.getLogger("zuul.lib.streamer_utils")
class BaseFingerRequestHandler(socketserver.BaseRequestHandler):
'''
Base class for common methods for handling finger requests.
'''
MAX_REQUEST_LEN = 1024
REQUEST_TIMEOUT = 10
def getCommand(self):
poll = select.poll()
bitmask = (select.POLLIN | select.POLLERR |
select.POLLHUP | select.POLLNVAL)
poll.register(self.request, bitmask)
buffer = b''
ret = None
start = time.time()
while True:
elapsed = time.time() - start
timeout = max(self.REQUEST_TIMEOUT - elapsed, 0)
if not timeout:
raise Exception("Timeout while waiting for input")
for fd, event in poll.poll(timeout):
if event & select.POLLIN:
x = self.request.recv(self.MAX_REQUEST_LEN)
if not x:
# This will cause the caller to quietly shut down
raise BrokenPipeError
buffer += x
else:
raise Exception("Received error event")
if len(buffer) >= self.MAX_REQUEST_LEN:
raise Exception("Request too long")
try:
ret = buffer.decode('utf-8')
x = ret.find('\n')
if x > 0:
# rstrip to remove any other unnecessary chars (e.g. \r)
return ret[:x].rstrip()
except UnicodeDecodeError:
pass
class CustomThreadingTCPServer(socketserver.ThreadingTCPServer):
'''
Custom version that allows us to drop privileges after port binding.
'''
def __init__(self, *args, **kwargs):
# NOTE(pabelanger): Set up address_family for socketserver based on the
# fingergw.listen_address setting in zuul.conf.
# param tuple args[0]: The address and port to bind to for
# socketserver.
server_address = args[0]
address_family = None
for res in socket.getaddrinfo(
server_address[0], server_address[1], 0, self.socket_type):
if res[0] == socket.AF_INET6:
# If we get an IPv6 address, break our loop and use that first.
address_family = res[0]
break
elif res[0] == socket.AF_INET:
address_family = res[0]
# Check to see if getaddrinfo failed.
if not address_family:
raise Exception("getaddrinfo returns an empty list")
self.address_family = address_family
self.user = kwargs.pop('user', None)
self.pid_file = kwargs.pop('pid_file', None)
self.server_ssl_key = kwargs.pop('server_ssl_key', None)
self.server_ssl_cert = kwargs.pop('server_ssl_cert', None)
self.server_ssl_ca = kwargs.pop('server_ssl_ca', None)
socketserver.ThreadingTCPServer.__init__(self, *args, **kwargs)
def change_privs(self):
'''
Drop our privileges to another user.
'''
if os.getuid() != 0:
return
pw = pwd.getpwnam(self.user)
# Change owner on our pid file so it can be removed by us after
# dropping privileges. May not exist if not a daemon.
if self.pid_file and os.path.exists(self.pid_file):
os.chown(self.pid_file, pw.pw_uid, pw.pw_gid)
os.setgroups([])
os.setgid(pw.pw_gid)
os.setuid(pw.pw_uid)
os.umask(0o022)
def server_bind(self):
'''
Overridden from the base class to allow address reuse and to drop
privileges after binding to the listening socket.
'''
self.allow_reuse_address = True
socketserver.ThreadingTCPServer.server_bind(self)
if self.user:
self.change_privs()
def server_close(self):
'''
Overridden from base class to shutdown the socket immediately.
'''
try:
self.socket.shutdown(socket.SHUT_RD)
self.socket.close()
except socket.error as e:
# If it's already closed, don't error.
if e.errno == socket.EBADF:
return
raise
def process_request(self, request, client_address):
'''
Overridden from the base class to name the thread.
'''
t = threading.Thread(target=self.process_request_thread,
name='socketserver_Thread',
args=(request, client_address))
t.daemon = self.daemon_threads
t.start()
def get_request(self):
sock, addr = super().get_request()
if all([self.server_ssl_key, self.server_ssl_cert,
self.server_ssl_ca]):
context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
context.load_cert_chain(self.server_ssl_cert, self.server_ssl_key)
context.load_verify_locations(self.server_ssl_ca)
context.verify_mode = ssl.CERT_REQUIRED
sock = context.wrap_socket(sock, server_side=True)
return sock, addr
def getJobLogStreamAddress(executor_api, uuid, source_zone,
tenant_name=None):
"""
Looks up the log stream address for the given build UUID.
Try to find the build request for the given UUID in ZooKeeper
by searching through all available zones. If a build request
was found we use the worker information to build the log stream
address.
"""
build_request = executor_api.getRequest(uuid)
if build_request is None:
raise StreamingError("Build not found")
if tenant_name is not None and build_request.tenant_name != tenant_name:
# Intentionally the same error as above to avoid leaking
# out-of-tenant build information.
raise StreamingError("Build not found")
worker_info = build_request.worker_info
if not worker_info:
raise StreamingError("Build did not start yet")
worker_zone = worker_info.get("zone")
job_log_stream_address = {}
if worker_zone and source_zone != worker_zone:
info = _getFingerGatewayInZone(worker_zone)
if info:
job_log_stream_address['server'] = info.hostname
job_log_stream_address['port'] = info.public_port
job_log_stream_address['use_ssl'] = info.use_ssl
log.debug('Source (%s) and worker (%s) zone '
'are different, routing via %s:%s',
source_zone, worker_zone,
info.hostname, info.public_port)
else:
log.warning('Source (%s) and worker (%s) zone are different'
'but no fingergw in target zone found. '
'Falling back to direct connection.',
source_zone, worker_zone)
else:
log.debug('Source (%s) or worker zone (%s) undefined '
'or equal, no routing is needed.',
source_zone, worker_zone)
if 'server' not in job_log_stream_address:
job_log_stream_address['server'] = worker_info["hostname"]
job_log_stream_address['port'] = worker_info["log_port"]
return job_log_stream_address
def _getFingerGatewayInZone(zone):
registry = COMPONENT_REGISTRY.registry
gws = [gw for gw in registry.all('fingergw') if gw.zone == zone]
if gws:
return random.choice(gws)
return None