zuul/zuul/zk/launcher.py
Simon Westphahl c1f1d05f93 Fix 'is_locked' flag for items in ZKObject cache
This change fixes a problem with the LockableZKObjectCache and its
`is_locked` indicator.

The problem was that the pre-cache hook did not take into account
multiple lock contender nodes and was setting the `is_locked` flag to
false when any of the contenders was deleted.

To fix this, the items in the cache now keep a set of lock contenders
and consider the item locked as long as any contenders exist.

Change-Id: I4341498b09b34ada39fe19e6ceab2f66187b7361
2025-05-13 12:20:07 +02:00
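
To illustrate the behavior described above, here is a minimal, self-contained sketch of the contender-set approach; the class and method names are hypothetical, and only the real implementation in LockableZKObjectCache.preCacheHook below is authoritative:

# Minimal sketch of the contender-set idea (hypothetical names).
class CachedItem:
    def __init__(self):
        self._lock_contenders = set()
        self.is_locked = False

    def on_lock_node_event(self, contender, exists):
        # A lock contender znode was created (exists=True) or deleted
        # (exists=False); the item stays locked while any contender remains.
        if exists:
            self._lock_contenders.add(contender)
        else:
            self._lock_contenders.discard(contender)
        self.is_locked = bool(self._lock_contenders)


item = CachedItem()
item.on_lock_node_event("lock-0000000001", exists=True)
item.on_lock_node_event("lock-0000000002", exists=True)
item.on_lock_node_event("lock-0000000001", exists=False)
assert item.is_locked  # one contender remains, so the item is still locked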


# Copyright 2024 BMW Group
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import collections
import json
import logging
import threading

import mmh3
from kazoo.exceptions import NoNodeError

from zuul.model import NodesetRequest, ProviderNode, QuotaInformation
from zuul.zk.cache import ZuulTreeCache
from zuul.zk.zkobject import ZKContext


def _dictToBytes(data):
    return json.dumps(data).encode("utf-8")


def _bytesToDict(raw_data):
    return json.loads(raw_data.decode("utf-8"))


# Deterministic score for a (launcher name, item) pair; used below to rank
# launchers when deciding which one should handle a request or node.
def launcher_score(name, item):
    return mmh3.hash(f"{name}-{item.uuid}", signed=False)


class LockableZKObjectCache(ZuulTreeCache):
    """Tree cache for lockable ZKObjects.

    Besides the objects themselves, the cache watches the corresponding
    lock znodes so that each cached object's is_locked flag reflects
    whether any lock contenders currently exist.
    """

    def __init__(self, zk_client, updated_event, root, items_path,
                 locks_path, zkobject_class):
        self.updated_event = updated_event
        self.items_path = items_path
        self.locks_path = locks_path
        self.zkobject_class = zkobject_class
        super().__init__(zk_client, root)

    def _parsePath(self, path):
        if not path.startswith(self.root):
            return None
        path = path[len(self.root) + 1:]
        parts = path.split('/')
        # We are interested in requests with parts that look like:
        # ([<self.items_path>, <self.locks_path>], <uuid>, ...)
        if len(parts) < 2:
            return None
        return parts

    def parsePath(self, path):
        parts = self._parsePath(path)
        if parts is None:
            return None
        if len(parts) < 2:
            return None
        if parts[0] != self.items_path:
            return None
        item_uuid = parts[1]
        return (item_uuid,)

    def preCacheHook(self, event, exists, data=None, stat=None):
        parts = self._parsePath(event.path)
        if parts is None:
            return

        # Expecting (<self.locks_path>, <uuid>, <lock>,)
        if len(parts) != 3:
            return

        object_type, request_uuid, contender, *_ = parts
        if object_type != self.locks_path:
            return

        key = (request_uuid,)
        request = self._cached_objects.get(key)
        if not request:
            return

        # Track every lock contender node; the item counts as locked as
        # long as at least one contender remains.
        if exists:
            request._lock_contenders.add(contender)
        else:
            request._lock_contenders.discard(contender)

        is_locked = bool(request._lock_contenders)
        if request.is_locked != is_locked:
            request._set(is_locked=is_locked)
            if self.updated_event:
                self.updated_event()

    def postCacheHook(self, event, data, stat, key, obj):
        if self.updated_event:
            self.updated_event()

    def objectFromRaw(self, key, data, zstat):
        return self.zkobject_class._fromRaw(data, zstat, None)

    def updateFromRaw(self, obj, key, data, zstat):
        obj._updateFromRaw(data, zstat, None)

    def getItem(self, item_id):
        self.ensureReady()
        return self._cached_objects.get((item_id,))

    def getItems(self):
        # Get a copy of the values view to avoid runtime errors in case
        # the dict of cached objects gets updated while iterating.
        self.ensureReady()
        return list(self._cached_objects.values())

    def getKeys(self):
        self.ensureReady()
        return set(self._cached_objects.keys())


class RequestCache(LockableZKObjectCache):

    def preCacheHook(self, event, exists, data=None, stat=None):
        parts = self._parsePath(event.path)
        if parts is None:
            return

        # Expecting (<self.locks_path>, <uuid>, <lock>,)
        # or (<self.items_path>, <uuid>, revision,)
        if len(parts) != 3:
            return

        object_type, request_uuid, *_ = parts
        key = (request_uuid,)
        request = self._cached_objects.get(key)
        if not request:
            return

        if object_type == self.locks_path:
            request._set(is_locked=exists)
        elif data is not None:
            request._revision._updateFromRaw(data, stat, None)
            return self.STOP_OBJECT_UPDATE


class NodeCache(LockableZKObjectCache):

    def __init__(self, *args, **kw):
        # Key -> quota, for each cached object
        self._cached_quota = {}
        # Provider canonical name -> quota, for each provider
        self._provider_quota = collections.defaultdict(
            lambda: QuotaInformation())
        super().__init__(*args, **kw)

    def postCacheHook(self, event, data, stat, key, obj):
        # Have we previously added quota for this object?
        old_quota = self._cached_quota.get(key)
        if key in self._cached_objects:
            new_quota = obj.quota
        else:
            new_quota = None

        # Now that we've established whether we should count the quota
        # based on object presence, take node state into account.
        if obj is None or obj.state not in obj.ALLOCATED_STATES:
            new_quota = None

        if new_quota != old_quota:
            # Add the new value first so if another thread races these
            # two operations, it sees us go over quota and not under.
            if new_quota is not None:
                self._provider_quota[obj.provider].add(new_quota)
            if old_quota is not None:
                self._provider_quota[obj.provider].subtract(old_quota)
        self._cached_quota[key] = new_quota
        super().postCacheHook(event, data, stat, key, obj)

    def getQuota(self, provider):
        return self._provider_quota[provider.canonical_name]


class LauncherApi:
    log = logging.getLogger("zuul.LauncherApi")

    def __init__(self, zk_client, component_registry, component_info,
                 event_callback, connection_filter=None):
        self.zk_client = zk_client
        self.component_registry = component_registry
        self.component_info = component_info
        self.event_callback = event_callback
        self.requests_cache = RequestCache(
            self.zk_client,
            self.event_callback,
            root=NodesetRequest.ROOT,
            items_path=NodesetRequest.REQUESTS_PATH,
            locks_path=NodesetRequest.LOCKS_PATH,
            zkobject_class=NodesetRequest)
        self.nodes_cache = NodeCache(
            self.zk_client,
            self.event_callback,
            root=ProviderNode.ROOT,
            items_path=ProviderNode.NODES_PATH,
            locks_path=ProviderNode.LOCKS_PATH,
            zkobject_class=ProviderNode)
        self.connection_filter = connection_filter
        self.stop_event = threading.Event()

    def stop(self):
        self.stop_event.set()
        self.requests_cache.stop()
        self.nodes_cache.stop()

    def getMatchingRequests(self):
        candidate_launchers = {
            c.hostname: c for c in self.component_registry.all("launcher")}
        candidate_names = set(candidate_launchers.keys())

        for request in sorted(self.requests_cache.getItems()):
            if request.hasLock():
                # We are holding a lock, so short-circuit here.
                yield request
            if request.is_locked:
                # Request is locked by someone else
                continue
            if request.state == NodesetRequest.State.TEST_HOLD:
                continue

            score_launchers = (
                set(request._lscores.keys()) if request._lscores else set())
            missing_scores = candidate_names - score_launchers
            if missing_scores or request._lscores is None:
                # (Re-)compute launcher scores
                request._set(_lscores={launcher_score(n, request): n
                                       for n in candidate_names})

            launcher_scores = sorted(request._lscores.items())
            # self.log.debug("Launcher scores: %s", launcher_scores)
            for score, launcher_name in launcher_scores:
                launcher = candidate_launchers.get(launcher_name)
                if not launcher:
                    continue
                if launcher.state != launcher.RUNNING:
                    continue
                if launcher.hostname == self.component_info.hostname:
                    yield request
                break

    def getNodesetRequest(self, request_id):
        return self.requests_cache.getItem(request_id)

    def getNodesetRequests(self):
        return sorted(self.requests_cache.getItems())

    def getMatchingProviderNodes(self):
        all_launchers = {
            c.hostname: c for c in self.component_registry.all("launcher")}

        for node in self.nodes_cache.getItems():
            if node.hasLock():
                # We are holding a lock, so short-circuit here.
                yield node
            if node.is_locked:
                # Node is locked by someone else
                continue

            candidate_launchers = {
                n: c for n, c in all_launchers.items()
                if not c.connection_filter
                or node.connection_name in c.connection_filter}
            candidate_names = set(candidate_launchers)
            if node._lscores is None:
                missing_scores = candidate_names
            else:
                score_launchers = set(node._lscores.keys())
                missing_scores = candidate_names - score_launchers
            if missing_scores or node._lscores is None:
                # (Re-)compute launcher scores
                node._set(_lscores={launcher_score(n, node): n
                                    for n in candidate_names})

            launcher_scores = sorted(node._lscores.items())
            for score, launcher_name in launcher_scores:
                launcher = candidate_launchers.get(launcher_name)
                if not launcher:
                    # Launcher is no longer online
                    continue
                if launcher.state != launcher.RUNNING:
                    continue
                if launcher.hostname == self.component_info.hostname:
                    yield node
                break

    def getProviderNode(self, node_id):
        return self.nodes_cache.getItem(node_id)

    def getProviderNodes(self):
        return self.nodes_cache.getItems()

    def createZKContext(self, lock=None, log=None):
        return ZKContext(
            self.zk_client, lock, self.stop_event, log or self.log)

    def cleanupNodes(self):
        # TODO: This method currently just performs some basic cleanup and
        # might need to be extended in the future.
        for node in self.getProviderNodes():
            if node.state != ProviderNode.State.USED:
                continue
            # FIXME: check if the node request still exists
            if node.is_locked:
                continue
            if self.getNodesetRequest(node.request_id):
                continue
            with self.createZKContext(None) as outer_ctx:
                if lock := node.acquireLock(outer_ctx):
                    with self.createZKContext(lock) as ctx:
                        try:
                            node.delete(ctx)
                        except NoNodeError:
                            # Node is already deleted
                            pass
                        finally:
                            node.releaseLock(ctx)
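
For context, the score-based selection in getMatchingRequests and getMatchingProviderNodes behaves like rendezvous hashing: every launcher computes the same ranking from the item uuid, so the launchers agree on an assignment without coordinating. A small illustrative sketch using the launcher_score helper above (the launcher names and uuid are made up):

# Illustrative only; not part of the module above.
class FakeItem:
    def __init__(self, uuid):
        self.uuid = uuid


launchers = ["launcher-a.example.org", "launcher-b.example.org"]
item = FakeItem("8c6f0a2d9b4e4f6aa1c3d5e7f9012345")
ranked = sorted((launcher_score(name, item), name) for name in launchers)
# The lowest-scoring launcher that is registered and RUNNING handles the
# item; if it goes away, the next one in the same ranking takes over.
designated = ranked[0][1]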