zuul/zuul/zk/cache.py
Simon Westphahl 2dc9ec644d Implement launcher relative priority handling
This change adds support for revising the relative priority of nodeset
requests, similar to how this is already possible with Nodepool.

As with build/merge requests, we are not allowed to modify the nodeset
request after it has been created. To still allow revising the
relative priority, we create a 'revision' subnode, which contains
the updated relative priority of the request.

The treecache was updated to handle the 'revision' subnode and update
the nodeset request accordingly.

More extensive tests that test the pipeline manager adjusting the
relative priority are added in a follow-up change to make review easier.

Change-Id: I8b154cd3626b1e0251c142e37b0a2e2ed7e9c0d5
2025-03-14 10:19:51 +01:00
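
The description implies a small write-side protocol: the nodeset request
znode stays immutable, and a child 'revision' node carries the mutable
relative priority. A minimal sketch of that idea in plain kazoo calls
(the revision path and payload field are assumptions for illustration,
not Zuul's actual schema):

    import json

    from kazoo.exceptions import NodeExistsError


    def revise_relative_priority(client, request_path, relative_priority):
        # The request znode itself is never modified after creation; the
        # updated priority goes into a subnode that may be created or
        # overwritten.
        payload = json.dumps(
            {"relative_priority": relative_priority}).encode("utf8")
        revision_path = f"{request_path}/revision"
        try:
            client.create(revision_path, payload)
        except NodeExistsError:
            client.set(revision_path, payload)

The tree cache below would then observe the subnode's CREATED/CHANGED
events and fold the new priority into the cached request object.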


# Copyright 2023 Acme Gating, LLC
# Copyright 2024 BMW Group
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import abc
import json
import logging
import queue
import threading
import time
from uuid import uuid4

from kazoo import exceptions as kze
from kazoo.protocol.states import (
    EventType,
    WatchedEvent,
    KazooState,
)

from zuul.zk.vendor.states import AddWatchMode


class CacheSyncSentinel:
    """Marker placed on the playback queue; its event is set once the
    playback worker has processed everything queued ahead of it."""

    def __init__(self):
        self.event = threading.Event()

    def set(self):
        return self.event.set()

    def wait(self, timeout=None):
        return self.event.wait(timeout)


class ZuulTreeCache(abc.ABC):
    log = logging.getLogger("zuul.zk.ZooKeeper")
    event_log = logging.getLogger("zuul.zk.cache.event")
    qsize_warning_threshold = 1024

    # Sentinel object that can be returned from preCacheHook to not
    # update the current object.
    STOP_OBJECT_UPDATE = object()

    def __init__(self, zk_client, root, async_worker=True):
        """Use watchers to keep a cache of local objects up to date.

        Typically this is used with async worker threads so that we
        release the kazoo callback resources as quickly as possible.
        This prevents large and active trees from starving other
        threads.

        In case the tree is small and rarely changed, async_worker may
        be set to False in order to avoid incurring the overhead of
        starting two extra threads.

        :param ZookeeperClient zk_client: The Zuul ZookeeperClient object
        :param str root: The root of the cache
        :param bool async_worker: Whether to start async worker threads
        """
        self.zk_client = zk_client
        self.client = zk_client.client
        self.root = root
        self.sync_path = f"{root}/_sync/{uuid4().hex}"
        self._last_event_warning = time.monotonic()
        self._last_playback_warning = time.monotonic()
        self._cached_objects = {}
        self._cached_paths = set()
        self._ready = threading.Event()
        self._init_lock = threading.Lock()
        self._stopped = False
        self._stop_workers = False
        self._event_queue = queue.Queue()
        self._playback_queue = queue.Queue()
        self._event_worker = None
        self._playback_worker = None
        self._async_worker = async_worker
        self.client.add_listener(self._sessionListener)
        self._start()

    def _bytesToDict(self, data):
        return json.loads(data.decode('utf8'))

    def _sessionListener(self, state):
        if state == KazooState.LOST:
            self._ready.clear()
            self._stop_workers = True
            self._event_queue.put(None)
            self._playback_queue.put(None)
        elif state == KazooState.CONNECTED and not self._stopped:
            self.client.handler.short_spawn(self._start)

    def _cacheListener(self, event):
        if self._async_worker:
            self._event_queue.put(event)
        else:
            self._handleCacheEvent(event)

    def _start(self):
        with self._init_lock:
            self.log.debug("Initialize cache at %s", self.root)
            self._ready.clear()
            self._stop_workers = True
            self._event_queue.put(None)
            self._playback_queue.put(None)
            # If we have an event worker (this is a re-init), then wait
            # for it to finish stopping.
            if self._event_worker:
                self._event_worker.join()
            # Replace the queue since any events from the previous
            # session aren't valid.
            self._event_queue = queue.Queue()
            # Prepare (but don't start) the new worker.
            if self._async_worker:
                self._event_worker = threading.Thread(
                    target=self._eventWorker)
                self._event_worker.daemon = True

            if self._playback_worker:
                self._playback_worker.join()
            self._playback_queue = queue.Queue()
            if self._async_worker:
                self._playback_worker = threading.Thread(
                    target=self._playbackWorker)
                self._playback_worker.daemon = True

            # Clear the stop flag and start the workers now that we
            # are sure that both have stopped and we have cleared the
            # queues.
            self._stop_workers = False
            if self._async_worker:
                self._event_worker.start()
                self._playback_worker.start()

            try:
                self.client.add_watch(
                    self.root, self._cacheListener,
                    AddWatchMode.PERSISTENT_RECURSIVE)
                self.client.ensure_path(self.root)
                self._walkTree()
                self._ready.set()
                self.log.debug("Cache at %s is ready", self.root)
            except Exception:
                self.log.exception("Error initializing cache at %s",
                                   self.root)
                self.client.handler.short_spawn(self._start)

    def stop(self):
        self._stopped = True
        self._event_queue.put(None)
        self._playback_queue.put(None)
        self.client.remove_listener(self._sessionListener)

    def _walkTree(self, root=None, seen_paths=None):
        # Recursively walk the tree and emit fake changed events for
        # every item in zk and fake deleted events for every item in
        # the cache that is not in zk
        exists = True
        am_root = False
        if root is None:
            am_root = True
            root = self.root
            seen_paths = set()
        if exists:
            seen_paths.add(root)
            event = WatchedEvent(EventType.NONE,
                                 self.client._state,
                                 root)
            self._cacheListener(event)

        try:
            for child in self.client.get_children(root):
                safe_root = root
                if safe_root == '/':
                    safe_root = ''
                new_path = '/'.join([safe_root, child])
                self._walkTree(new_path, seen_paths)
        except kze.NoNodeError:
            self.log.debug("Can't sync non-existent node %s", root)

        if am_root:
            for path in self._cached_paths.copy():
                if path not in seen_paths:
                    event = WatchedEvent(
                        EventType.NONE,
                        self.client._state,
                        path)
                    self._cacheListener(event)

    def _eventWorker(self):
        while not (self._stopped or self._stop_workers):
            event = self._event_queue.get()
            if event is None:
                self._event_queue.task_done()
                continue

            qsize = self._event_queue.qsize()
            if qsize > self.qsize_warning_threshold:
                now = time.monotonic()
                if now - self._last_event_warning > 60:
                    self.log.warning("Event queue size for cache at %s is %s",
                                     self.root, qsize)
                    self._last_event_warning = now

            try:
                self._handleCacheEvent(event)
            except Exception:
                self.log.exception("Error handling event %s:", event)
            self._event_queue.task_done()

    def _handleCacheEvent(self, event):
        # Ignore root node since we don't maintain a cached object for
        # it (all cached objects are under the root in our tree
        # caches).
        if event.path == self.root:
            return

        # Start by assuming we need to fetch data for the event.
        fetch = True
        if event.type == EventType.NONE:
            if event.path is None:
                # We're probably being told of a connection change; ignore.
                return
        elif (event.type == EventType.DELETED):
            # If this is a normal deleted event, we don't need to
            # fetch anything.
            fetch = False

        key = self.parsePath(event.path)
        if (key is None and event.type != EventType.NONE
                and event.path != self.sync_path):
            # The cache doesn't care about this path, so we don't need
            # to fetch (unless the type is none (re-initialization) in
            # which case we always need to fetch in order to determine
            # existence).
            fetch = False

        if fetch:
            future = self.client.get_async(event.path)
        else:
            future = None

        if self._async_worker:
            self._playback_queue.put((event, future, key))
        else:
            self._handlePlayback(event, future, key)

    def _playbackWorker(self):
        while not (self._stopped or self._stop_workers):
            item = self._playback_queue.get()
            if item is None:
                self._playback_queue.task_done()
                continue

            if isinstance(item, CacheSyncSentinel):
                item.set()
                self._playback_queue.task_done()
                continue

            qsize = self._playback_queue.qsize()
            if qsize > self.qsize_warning_threshold:
                now = time.monotonic()
                if now - self._last_playback_warning > 60:
                    self.log.warning(
                        "Playback queue size for cache at %s is %s",
                        self.root, qsize)
                    self._last_playback_warning = now

            event, future, key = item
            try:
                self._handlePlayback(event, future, key)
            except Exception:
                self.log.exception("Error playing back event %s:", event)
            self._playback_queue.task_done()

    def _handlePlayback(self, event, future, key):
        # self.event_log.debug("Cache playback event %s", event)
        exists = None
        data, stat = None, None

        if future:
            try:
                data, stat = future.get()
                exists = True
            except kze.NoNodeError:
                exists = False

        # We set "exists" above in case of cache re-initialization,
        # which happens out of sequence with the normal watch events,
        # and we can't be sure whether the node still exists or not by
        # the time we process it.  Later cache watch events may
        # supersede this (for example, we may process a NONE event
        # here which we interpret as a delete, which may be followed
        # by a normal delete event).  That case, and any other
        # variations, should be anticipated.

        # If the event tells us whether the node exists, prefer that
        # value, otherwise fall back to what we determined above.
        if (event.type in (EventType.CREATED, EventType.CHANGED)):
            exists = True
        elif (event.type == EventType.DELETED):
            exists = False

        # Keep the cached paths up to date
        if exists:
            self._cached_paths.add(event.path)
        else:
            self._cached_paths.discard(event.path)

        # Some caches have special handling for certain sub-objects
        if (self.preCacheHook(event, exists, data, stat)
                == self.STOP_OBJECT_UPDATE):
            return

        # If we don't actually cache this kind of object, return now
        if key is None:
            return

        obj = None
        if data:
            # Perform an in-place update of the cached object if possible
            obj = self._cached_objects.get(key)
            if obj:
                # Don't update to older data
                # Don't update a locked object
                if (stat.mzxid > obj._zstat.mzxid and
                        getattr(obj, 'lock', None) is None):
                    self.updateFromRaw(obj, key, data, stat)
            else:
                obj = self.objectFromRaw(key, data, stat)
                self._cached_objects[key] = obj
        else:
            try:
                obj = self._cached_objects[key]
                del self._cached_objects[key]
            except KeyError:
                # If it's already gone, don't care
                pass

        self.postCacheHook(event, data, stat, key, obj)

    def ensureReady(self):
        self._ready.wait()

    def waitForSync(self, timeout=None):
        sentinel = CacheSyncSentinel()
        self._playback_queue.put(sentinel)
        return sentinel.wait(timeout)
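
    # Example usage (illustrative; "cache" stands for any concrete
    # subclass instance): block until the initial tree walk finished,
    # then wait for all currently queued events to be applied:
    #
    #     cache.ensureReady()
    #     cache.waitForSync(timeout=30)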

    # Methods for subclasses:

    def preCacheHook(self, event, exists, data=None, stat=None):
        """Called before the cache is updated

        This is called for any add/update/remove event under the root,
        even for paths that are ignored, so users must test the
        relevance of the path in this method.

        The ``exists`` argument is provided in all cases. In the case
        of EventType.NONE events, it indicates whether the cache has
        seen the node in ZK immediately before calling this method.
        Otherwise, it indicates whether or not the EventType would
        cause the node to exist in ZK.

        Return the sentinel STOP_OBJECT_UPDATE to skip the object
        update step.

        :param EventType event: The event.
        :param bool exists: Whether the object exists in ZK.
        :param bytes data: The data when fetched, else None.
        :param ZnodeStat stat: The ZNode stat when the node exists, else None.
        """
        return None

    def postCacheHook(self, event, data, stat, key, obj):
        """Called after the cache has been updated"""
        return None

    @abc.abstractmethod
    def parsePath(self, path):
        """Parse the path and return a cache key

        The cache key is an opaque object ignored by the cache, but
        must be hashable.

        A convention is to use a tuple of relevant path components as
        the key.

        Return None to indicate the path is not relevant to the cache.
        """
        return None

    @abc.abstractmethod
    def objectFromRaw(self, key, data, stat):
        """Construct an object from ZooKeeper data

        Given data from ZK and the cache key, construct and return an
        object to insert into the cache.

        :param object key: The key as returned by parsePath.
        :param dict data: The raw data.
        :param Zstat stat: The zstat of the znode.
        """
        pass

    @abc.abstractmethod
    def updateFromRaw(self, obj, key, data, stat):
        """Update an object from ZooKeeper data

        Update an existing object with new data.

        :param object obj: The old object.
        :param object key: The key as returned by parsePath.
        :param dict data: The raw data.
        :param Zstat stat: The zstat of the znode.
        """
        pass
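

For orientation, the sketch below shows what a minimal concrete subclass
might look like. It is a hypothetical example, not a cache that exists in
Zuul: the Frobnicator object, its flat one-znode-per-object layout, and
the cache name are invented; only the use of _bytesToDict and the _zstat
attribute follow from the code above (_handlePlayback compares
obj._zstat.mzxid and objectFromRaw is expected to set it).

    class Frobnicator:
        # Hypothetical cached object; any attribute bag carrying a
        # _zstat attribute satisfies what _handlePlayback expects.
        def __init__(self, data):
            self.__dict__.update(data)


    class FrobnicatorCache(ZuulTreeCache):
        def parsePath(self, path):
            # Cache only direct children of the root (excluding the
            # internal _sync subtree); keys must be hashable, so use a
            # tuple of path components.
            relative = path[len(self.root):].strip('/')
            parts = relative.split('/')
            if len(parts) == 1 and parts[0] and parts[0] != '_sync':
                return (parts[0],)
            return None

        def objectFromRaw(self, key, data, stat):
            obj = Frobnicator(self._bytesToDict(data))
            obj._zstat = stat
            return obj

        def updateFromRaw(self, obj, key, data, stat):
            obj.__dict__.update(self._bytesToDict(data))
            obj._zstat = stat

A caller would then construct FrobnicatorCache(zk_client, '/frobnicators'),
call ensureReady(), and read objects through whatever accessor the
subclass chooses to expose.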