zuul/zuul/driver/util.py
James E. Blair fd9d6aa80c Add quota support to zuul-launcher
This does the following:

* Retries node requests without penalty if they hit an unexpected
  quota error.  The new node state "TEMPFAIL" is used for this.
* Calculates total provider quota with a method similar to nodepool's,
  caches the result, and refreshes it every 5 minutes.
* Keeps track of real-time zuul quota usage by way of a custom
  TreeCache that adds and removes quota as it sees events.  This
  lets us know real-time usage without needing to iterate over
  all nodes.
* Avoids starting a node state machine if the node is expected to put
  the provider over quota (see the sketch after this list).  This is
  similar to, but not quite the same as, nodepool's idea of a paused
  provider.
* Attempts to keep the percentage of quota used roughly equal across
  all providers while still including some randomization.  This is
  likely not the final allocation algorithm.
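
As an illustrative sketch (the names below are hypothetical, not the
actual implementation), the pre-launch quota check might look like:

    # 'used' is the cached real-time usage, 'needed' is the quota the
    # requested label requires, and 'limit' is the cached provider
    # total -- all hypothetical names.
    if used + needed > limit:
        # Expected to exceed quota: defer the request instead of
        # starting a launch that would fail.
        return
    self._startNodeStateMachine(node)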

This has some issues that will need to be resolved in subsequent
commits:

* It is possible for a node request to be starved if multiple launchers
  are running for a provider, and there are requests of different
  sizes for that provider (the large one may never run if the small
  ones are able to fit in the last bit of quota).
* Because we record quota failures as node objects in ZK, if a node
  request continually hits quota errors, our node records in ZK will
  grow without bound.

Despite these issues, this change is useful for continued testing
of the system, and since it is mainly focused on adding quota
support and only minimally changes the node/request handling algorithms,
it stands on its own.

Change-Id: I6d65db2b103cc3f0aec7e46a6a63a393399c91eb
2025-02-21 09:56:49 -08:00


# Copyright 2017 Red Hat, Inc.
# Copyright 2023-2024 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# Utility methods to promote consistent configuration among drivers.

import concurrent.futures
import os
import threading
import time

import voluptuous as vs

from zuul.configloader import ZUUL_REGEX, make_regex  # noqa


def time_to_seconds(s):
    if s.endswith('s'):
        return int(s[:-1])
    if s.endswith('m'):
        return int(s[:-1]) * 60
    if s.endswith('h'):
        return int(s[:-1]) * 60 * 60
    if s.endswith('d'):
        return int(s[:-1]) * 24 * 60 * 60
    if s.endswith('w'):
        return int(s[:-1]) * 7 * 24 * 60 * 60
    raise Exception("Unable to parse time value: %s" % s)
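
# Illustrative examples (not in the original module):
#   time_to_seconds('90s') == 90
#   time_to_seconds('15m') == 900
#   time_to_seconds('2h') == 7200
#   time_to_seconds('1w') == 604800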


def scalar_or_list(x):
    return vs.Any([x], x)


def to_list(item):
    if not item:
        return []
    if isinstance(item, list):
        return item
    return [item]
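
# Illustrative example (the schema below is hypothetical, not from this
# module): scalar_or_list lets a voluptuous schema accept a single
# value or a list of values, and to_list normalizes the result.
#   schema = vs.Schema({'host': scalar_or_list(str)})
#   hosts = to_list(schema({'host': 'example.org'})['host'])
#   # hosts == ['example.org']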


class RateLimitInstance:
    def __init__(self, limiter, logger, msg):
        self.limiter = limiter
        self.logger = logger
        self.msg = msg

    def __enter__(self):
        self.delay = self.limiter._enter()
        self.start_time = time.monotonic()

    def __exit__(self, etype, value, tb):
        end_time = time.monotonic()
        self.limiter._exit(etype, value, tb)
        self.logger("%s in %ss after %ss delay",
                    self.msg,
                    end_time - self.start_time,
                    self.delay)


class RateLimiter:
    """A rate limiter.

    :param str name: The provider name; used in logging.
    :param float rate_limit: The rate limit expressed in
        requests per second.

    Example:

    .. code:: python

        rate_limiter = RateLimiter('provider', 1.0)
        with rate_limiter:
            api_call()

    You can optionally use the limiter as a callable in which case it
    will log a supplied message with timing information.

    .. code:: python

        rate_limiter = RateLimiter('provider', 1.0)
        with rate_limiter(log.debug, "an API call"):
            api_call()
    """

    def __init__(self, name, rate_limit):
        self._running = True
        self.name = name
        if not rate_limit:
            self.delta = 0.0
        else:
            self.delta = 1.0 / rate_limit
        self.last_ts = None
        self.lock = threading.Lock()

    def __call__(self, logmethod, msg):
        return RateLimitInstance(self, logmethod, msg)

    def __enter__(self):
        self._enter()

    def _enter(self):
        with self.lock:
            total_delay = 0.0
            if self.last_ts is None:
                self.last_ts = time.monotonic()
                return total_delay
            while True:
                now = time.monotonic()
                delta = now - self.last_ts
                if delta >= self.delta:
                    break
                delay = self.delta - delta
                time.sleep(delay)
                total_delay += delay
            self.last_ts = time.monotonic()
            return total_delay

    def __exit__(self, etype, value, tb):
        self._exit(etype, value, tb)

    def _exit(self, etype, value, tb):
        pass


class LazyExecutorTTLCache:
    """This is a lazy executor TTL cache.

    It's lazy because if it has cached data, it will always return it
    instantly.

    It's executor based, which means that if a cache miss occurs, it
    will submit a task to an executor to fetch new data.

    Finally, it's a TTL cache, which means it automatically expires data.

    Since it is only expected to be used when caching provider
    resource listing methods, it assumes there will only be one entry
    and ignores arguments -- it will return the same cached data no
    matter what arguments are supplied; but it will pass on those
    arguments to the underlying method in a cache miss.

    :param numeric ttl: The cache timeout in seconds.
    :param concurrent.futures.Executor executor: An executor to use to
        update data asynchronously in case of a cache miss.
    """

    def __init__(self, ttl, executor):
        self.ttl = ttl
        self.executor = executor
        # If we have an outstanding update being run by the executor,
        # this is the future.
        self.future = None
        # The last time the underlying method completed.
        self.last_time = None
        # The last value from the underlying method.
        self.last_value = None
        # A lock to make all of this thread safe (especially to ensure
        # we don't fire off multiple updates).
        self.lock = threading.Lock()

    def __call__(self, func):
        def decorator(*args, **kw):
            with self.lock:
                now = time.monotonic()
                if self.future and self.future.done():
                    # If a previous call spawned an update, resolve
                    # that now so we can use the data.
                    try:
                        self.last_time, self.last_value = \
                            self.future.result()
                    finally:
                        # Clear the future regardless so we don't loop.
                        self.future = None
                if (self.last_time is not None and
                        now - self.last_time < self.ttl):
                    # A cache hit.
                    return self.last_value
                # The rest of the method is a cache miss.
                if self.last_time is not None:
                    if not self.future:
                        # Fire off an asynchronous update request.
                        # This second wrapper ensures that we record
                        # the time that the update is complete along
                        # with the value.
                        def func_with_time():
                            ret = func(*args, **kw)
                            now = time.monotonic()
                            return (now, ret)
                        self.future = self.executor.submit(func_with_time)
                else:
                    # This is the first time this method has been
                    # called; since we don't have any cached data, we
                    # will synchronously update the data.
                    self.last_value = func(*args, **kw)
                    self.last_time = time.monotonic()
                return self.last_value
        return decorator
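
# Illustrative usage sketch ('listResources' is hypothetical, not part
# of this module): wrap a slow provider listing method so callers get
# cached data instantly while updates run in the background.
#   executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
#   list_resources = LazyExecutorTTLCache(300, executor)(listResources)
#   data = list_resources()  # first call is synchronous; within the
#                            # TTL, later calls return cached data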


class Segment:
    def __init__(self, index, offset, data):
        self.index = index
        self.offset = offset
        self.data = data


class ImageUploader:
    """
    A helper class for drivers that upload large images in chunks.
    """

    # These values probably don't need to be changed
    error_retries = 3
    concurrency = 10
    # Subclasses must implement these
    segment_size = None

    def __init__(self, endpoint, log, path, image_name, metadata):
        if self.segment_size is None:
            raise Exception("Subclass must set block size")
        self.endpoint = endpoint
        self.log = log
        self.path = path
        self.size = os.path.getsize(path)
        self.image_name = image_name
        self.metadata = metadata
        self.timeout = None

    def shouldRetryException(self, exception):
        return True

    def uploadSegment(self, segment):
        pass

    def startUpload(self):
        pass

    def finishUpload(self):
        pass

    def abortUpload(self):
        pass

    # Main API
    def upload(self, timeout=None):
        if timeout:
            self.timeout = time.monotonic() + timeout
        self.startUpload()
        try:
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=self.concurrency) as executor:
                with open(self.path, 'rb') as image_file:
                    self._uploadInner(executor, image_file)
            return self.finishUpload()
        except Exception:
            self.log.exception("Error uploading image:")
            self.abortUpload()

    # Subclasses can use this helper method for wrapping retryable calls
    def retry(self, func, *args, **kw):
        for x in range(self.error_retries):
            try:
                return func(*args, **kw)
            except Exception as e:
                if not self.shouldRetryException(e):
                    raise
                if x + 1 >= self.error_retries:
                    raise
                time.sleep(2 * x)

    def getTimeout(self):
        if self.timeout is None:
            return None
        return self.timeout - time.monotonic()

    def checkTimeout(self):
        if self.timeout is None:
            return
        if self.getTimeout() < 0:
            raise Exception("Timed out uploading image")

    # Internal methods
    def _uploadInner(self, executor, image_file):
        futures = set()
        for index, offset in enumerate(
                range(0, self.size, self.segment_size)):
            segment = Segment(index, offset,
                              image_file.read(self.segment_size))
            future = executor.submit(self.uploadSegment, segment)
            futures.add(future)
            # Keep the pool of workers supplied with data but without
            # reading the entire file into memory.
            if len(futures) >= (self.concurrency * 2):
                (done, futures) = concurrent.futures.wait(
                    futures,
                    timeout=self.getTimeout(),
                    return_when=concurrent.futures.FIRST_COMPLETED)
                for future in done:
                    future.result()
                # Only check the timeout after waiting (not every pass
                # through the loop)
                self.checkTimeout()
        # We're done reading the file, wait for all uploads to finish
        (done, futures) = concurrent.futures.wait(
            futures,
            timeout=self.getTimeout())
        for future in done:
            future.result()
        self.checkTimeout()
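
# Illustrative subclass sketch (the 'endpoint.put_object' call is
# hypothetical, not a real driver API): a driver only needs to set
# segment_size and implement the hooks above.
#   class ExampleUploader(ImageUploader):
#       segment_size = 100 * 1024 * 1024  # upload in 100MiB chunks
#
#       def uploadSegment(self, segment):
#           self.retry(self.endpoint.put_object,
#                      self.image_name, segment.index, segment.data)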


class Timer:
    def __init__(self, log, msg):
        self.log = log
        self.msg = msg

    def __enter__(self):
        self.start = time.perf_counter()

    def __exit__(self, type, value, traceback):
        delta = time.perf_counter() - self.start
        self.log.debug(f'{self.msg} in {delta}')
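
# Illustrative usage (assuming a standard logging.Logger named 'log'):
#   with Timer(log, "Listed resources"):
#       list_resources()
# This emits a debug message like "Listed resources in 0.0123...".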