529 lines
19 KiB
Python
529 lines
19 KiB
Python
# Copyright 2021 BMW Group
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import time
|
|
import json
|
|
import logging
|
|
from contextlib import suppress
|
|
from enum import Enum
|
|
|
|
from kazoo.exceptions import LockTimeout, NoNodeError
|
|
from kazoo.protocol.states import EventType
|
|
from kazoo.recipe.lock import Lock
|
|
|
|
from zuul.lib.jsonutil import json_dumps
|
|
from zuul.lib.logutil import get_annotated_logger
|
|
from zuul.model import BuildRequest
|
|
from zuul.zk import ZooKeeperSimpleBase
|
|
from zuul.zk.exceptions import BuildRequestNotFound
|
|
from zuul.zk import sharding
|
|
from zuul.zk.watchers import ExistingDataWatch
|
|
|
|
|
|
class BuildRequestEvent(Enum):
    """Event kinds that can be reported for a build request.

    Within this module, DELETED is passed to the build event callback
    when a cached build request znode is removed, and CANCELED / RESUMED
    are passed when a "cancel" / "resume" child node shows up below a
    locked build request.
    """

    # Numeric values: CREATED=0, UPDATED=1, RESUMED=2, CANCELED=3, DELETED=4
    CREATED, UPDATED, RESUMED, CANCELED, DELETED = range(5)
|
|
|
|
|
|
class ExecutorApi(ZooKeeperSimpleBase):
|
|
|
|
# ZooKeeper path below which the build request znodes are created; the
# per-zone sub-paths are derived from it in _getZoneRoot().
BUILD_REQUEST_ROOT = "/zuul/build-requests"
# ZooKeeper path below which the parameters of each build are stored,
# one child per build uuid (see _getParamsPath()).
BUILD_PARAMS_ROOT = "/zuul/build-params"
# ZooKeeper path below which one lock parent node per build uuid is
# created (see lock()); kept apart from the build request nodes.
LOCK_ROOT = "/zuul/build-request-locks"

log = logging.getLogger("zuul.zk.executor.ExecutorApi")
|
|
|
|
def __init__(self, client, zone_filter=None,
             build_request_callback=None,
             build_event_callback=None):
    """Initialize the local cache and register the zone watches.

    :param client: Handed unchanged to the base class constructor.
    :param zone_filter: Iterable of zones to watch. If None, every
        zone (including the unzoned root) is watched.
    :param build_request_callback: Optional callable, invoked without
        arguments when new build requests are detected.
    :param build_event_callback: Optional callable, invoked with
        ``(build_request, BuildRequestEvent)``.
    """
    super().__init__(client)

    self.zone_filter = zone_filter
    self._watched_zones = set()
    self.build_request_callback = build_request_callback
    self.build_event_callback = build_event_callback

    # Maps the znode path of a build request to its cached object.
    self._cached_build_requests = {}

    self.kazoo_client.ensure_path(self.BUILD_PARAMS_ROOT)

    # The watches are registered last so that the attributes they use
    # (callbacks, cache) are already in place.
    if zone_filter is not None:
        for zone in zone_filter:
            self.registerZone(zone)
    else:
        self.registerAllZones()
|
|
|
|
@property
def initial_state(self):
    """State given to build requests created by submit().

    Kept as a property to support holding build requests in tests.
    """
    state = BuildRequest.REQUESTED
    return state
|
|
|
|
def _getZoneRoot(self, zone):
|
|
if zone is None:
|
|
return "/".join([self.BUILD_REQUEST_ROOT, 'unzoned'])
|
|
else:
|
|
return "/".join([self.BUILD_REQUEST_ROOT, 'zones', zone])
|
|
|
|
def registerZone(self, zone):
    """Start watching the build requests of a zone (at most once).

    Ensures the zone root exists, installs a children watch on it and
    remembers the zone so repeated calls are no-ops.

    :param zone: Zone name, or None for the unzoned root.
    """
    if zone in self._watched_zones:
        # Already watching this zone.
        return

    zone_root = self._getZoneRoot(zone)
    self.log.debug("Registering for zone %s at %s", zone, zone_root)
    self.kazoo_client.ensure_path(zone_root)

    watcher = self._makeBuildRequestWatcher(zone_root)
    self.kazoo_client.ChildrenWatch(zone_root, watcher, send_event=True)
    self._watched_zones.add(zone)
|
|
|
|
def registerAllZones(self):
    """Watch the unzoned root plus every current and future zone."""
    zones_root = "/".join([self.BUILD_REQUEST_ROOT, 'zones'])

    self.kazoo_client.ensure_path(self.BUILD_REQUEST_ROOT)

    def on_zones_changed(children):
        # Invoked with the names of all zone nodes; registerZone()
        # ignores the ones that are already being watched.
        for zone_name in children:
            self.registerZone(zone_name)

    self.kazoo_client.ensure_path(zones_root)
    self.kazoo_client.ChildrenWatch(zones_root, on_zones_changed)
    # Build requests without a zone are stored outside of zones_root.
    self.registerZone(None)
|
|
|
|
def _makeBuildStateWatcher(self, path):
|
|
def watch(data, stat, event=None):
|
|
return self._watchBuildState(path, data, stat, event)
|
|
return watch
|
|
|
|
def _watchBuildState(self, path, data, stat, event=None):
|
|
if not event or event.type == EventType.CHANGED:
|
|
# Don't process change events w/o any data. This can happen when
|
|
# a "slow" change watch tried to retrieve the data of a znode that
|
|
# was deleted in the meantime.
|
|
if data is None:
|
|
return
|
|
# As we already get the data and the stat value, we can directly
|
|
# use it without asking ZooKeeper for the data again.
|
|
content = self._bytesToDict(data)
|
|
if not content:
|
|
return
|
|
|
|
# We need this one for the HOLD -> REQUESTED check further down
|
|
old_build_request = self._cached_build_requests.get(path)
|
|
|
|
build_request = BuildRequest.fromDict(content)
|
|
build_request.path = path
|
|
build_request._zstat = stat
|
|
self._cached_build_requests[path] = build_request
|
|
|
|
# NOTE (felix): This is a test-specific condition: For test cases
|
|
# which are using hold_jobs_in_queue the state change on the build
|
|
# request from HOLD to REQUESTED is done outside of the executor.
|
|
# Thus, we must also set the wake event (the callback) so the
|
|
# executor can pick up those builds after they are released. To not
|
|
# cause a thundering herd problem in production for each cache
|
|
# update, the callback is only called under this very specific
|
|
# condition that can only occur in the tests.
|
|
if (
|
|
self.build_request_callback
|
|
and old_build_request
|
|
and old_build_request.state == BuildRequest.HOLD
|
|
and build_request.state == BuildRequest.REQUESTED
|
|
):
|
|
self.build_request_callback()
|
|
|
|
elif event.type == EventType.DELETED:
|
|
build_request = self._cached_build_requests.get(path)
|
|
with suppress(KeyError):
|
|
del self._cached_build_requests[path]
|
|
|
|
if build_request and self.build_event_callback:
|
|
self.build_event_callback(
|
|
build_request, BuildRequestEvent.DELETED
|
|
)
|
|
|
|
# Return False to stop the datawatch as the build got deleted.
|
|
return False
|
|
|
|
def _makeBuildRequestWatcher(self, path):
|
|
def watch(build_requests, event=None):
|
|
return self._watchBuildRequests(path, build_requests, event)
|
|
return watch
|
|
|
|
def _watchBuildRequests(self, path, build_requests, event=None):
|
|
# The build_requests list always contains all active children. Thus, we
|
|
# first have to find the new ones by calculating the delta between the
|
|
# build_requests list and our current cache entries.
|
|
# NOTE (felix): We could also use this list to determine the deleted
|
|
# build requests, but it's easier to do this in the DataWatch for the
|
|
# single build request instead. Otherwise we have to deal with race
|
|
# conditions between the children and the data watch as one watch might
|
|
# update a cache entry while the other tries to remove it.
|
|
|
|
build_request_paths = {
|
|
f"{path}/{uuid}" for uuid in build_requests
|
|
}
|
|
|
|
new_build_requests = build_request_paths - set(
|
|
self._cached_build_requests.keys()
|
|
)
|
|
|
|
for req_path in new_build_requests:
|
|
ExistingDataWatch(self.kazoo_client,
|
|
req_path,
|
|
self._makeBuildStateWatcher(req_path))
|
|
|
|
# Notify the user about new build requests if a callback is provided,
|
|
# but only if there are new requests (we don't want to fire on the
|
|
# initial callback from kazoo from registering the datawatch).
|
|
if new_build_requests and self.build_request_callback:
|
|
self.build_request_callback()
|
|
|
|
def _iterBuildRequests(self):
|
|
# As the entries in the cache dictionary are added and removed via
|
|
# data and children watches, we can't simply iterate over it in here,
|
|
# as the values might change during iteration.
|
|
for key in list(self._cached_build_requests.keys()):
|
|
try:
|
|
build_request = self._cached_build_requests[key]
|
|
except KeyError:
|
|
continue
|
|
yield build_request
|
|
|
|
def inState(self, *states):
    """Return a sorted generator of cached build requests by state.

    :param states: States to match. If none are given, all states from
        ``BuildRequest.ALL_STATES`` match.
    :returns: Generator over the matching build requests in ascending
        sort order.
    """
    if not states:
        # No filter given: match every available state.
        states = BuildRequest.ALL_STATES

    matching = [
        request for request in self._iterBuildRequests()
        if request.state in states
    ]

    # Sort by precedence and creation time in ZooKeeper (ascending) to
    # prevent older builds from starving.
    matching.sort()
    return (request for request in matching)
|
|
|
|
def next(self):
    """Yield the cached build requests in state REQUESTED, sorted."""
    for build_request in self.inState(BuildRequest.REQUESTED):
        yield build_request
|
|
|
|
def submit(self, uuid, tenant_name, pipeline_name, params, zone,
           event_id, precedence=200):
    """Store a new build request and its parameters in ZooKeeper.

    The parameters are written (sharded) below BUILD_PARAMS_ROOT first;
    afterwards the build request znode is created below the zone root.

    :param uuid: Build uuid; also used as the znode name.
    :param params: Build parameters, serialized via _dictToBytes().
    :param zone: Zone name, or None for an unzoned build request.
    :returns: The result of ``kazoo_client.create()`` for the build
        request node.
    """
    log = get_annotated_logger(self.log, event=None, build=uuid)

    zone_root = self._getZoneRoot(zone)
    request_path = "/".join([zone_root, uuid])

    request = BuildRequest(
        uuid, self.initial_state, precedence, zone,
        tenant_name, pipeline_name, event_id,
    )
    log.debug("Submitting build request to ZooKeeper %s", request)

    self.kazoo_client.ensure_path(zone_root)

    # Write the parameters before the request node becomes visible.
    params_path = self._getParamsPath(uuid)
    with sharding.BufferedShardWriter(
            self.kazoo_client, params_path) as stream:
        stream.write(self._dictToBytes(params))

    serialized_request = self._dictToBytes(request.toDict())
    return self.kazoo_client.create(request_path, serialized_request)
|
|
|
|
# We use child nodes here so that we don't need to lock the build
|
|
# request node.
|
|
def requestResume(self, build_request):
    """Request a resume by ensuring the "resume" child node exists."""
    resume_path = f"{build_request.path}/resume"
    self.kazoo_client.ensure_path(resume_path)
|
|
|
|
def requestCancel(self, build_request):
    """Request a cancel by ensuring the "cancel" child node exists."""
    cancel_path = f"{build_request.path}/cancel"
    self.kazoo_client.ensure_path(cancel_path)
|
|
|
|
def fulfillResume(self, build_request):
    """Mark a resume request as handled by deleting its child node."""
    resume_path = f"{build_request.path}/resume"
    self.kazoo_client.delete(resume_path)
|
|
|
|
def fulfillCancel(self, build_request):
    """Mark a cancel request as handled by deleting its child node."""
    cancel_path = f"{build_request.path}/cancel"
    self.kazoo_client.delete(cancel_path)
|
|
|
|
def update(self, build_request):
    """Write the build request back to its znode (version-checked).

    The write uses the version of the stat stored on the build request;
    on success the stat on the object is replaced by the new one. If no
    stat is stored, the update is skipped with a debug message.

    :raises BuildRequestNotFound: If the znode no longer exists.
    """
    log = get_annotated_logger(
        self.log, event=None, build=build_request.uuid
    )
    log.debug("Updating build request %s", build_request)

    known_stat = build_request._zstat
    if known_stat is None:
        log.debug(
            "Cannot update build request %s: Missing version information.",
            build_request.uuid,
        )
        return

    try:
        new_stat = self.kazoo_client.set(
            build_request.path,
            self._dictToBytes(build_request.toDict()),
            version=known_stat.version,
        )
    except NoNodeError:
        raise BuildRequestNotFound(
            f"Could not update {build_request.path}"
        )
    else:
        # Keep the stat on the object in sync with the ZK node.
        build_request._zstat = new_stat
|
|
|
|
def get(self, path):
    """Fetch a build request from ZooKeeper, bypassing the cache.

    Note: do not mix get with iteration; iteration returns cached
    BuildRequests while get returns a newly created object each
    time. If you lock a BuildRequest, you must use the same
    object to unlock it.

    :param path: Path of the build request znode.
    :returns: A new BuildRequest with ``path`` and ``_zstat`` set, or
        None if the node does not exist or holds no data.
    """
    try:
        raw, zstat = self.kazoo_client.get(path)
    except NoNodeError:
        return None

    if not raw:
        return None

    fetched_request = BuildRequest.fromDict(self._bytesToDict(raw))
    fetched_request.path = path
    fetched_request._zstat = zstat
    return fetched_request
|
|
|
|
def remove(self, build_request):
    """Delete a build request, its parameters and its lock parent node.

    Nodes that are already gone are ignored.
    """
    log = get_annotated_logger(
        self.log, event=None, build=build_request.uuid
    )
    log.debug("Removing build request %s", build_request)

    # The build node might contain children (result, data, ...), so it
    # must be deleted recursively.
    with suppress(NoNodeError):
        self.kazoo_client.delete(build_request.path, recursive=True)

    self.clearBuildParams(build_request)

    # Delete the lock parent node as well.
    with suppress(NoNodeError):
        lock_path = "/".join([self.LOCK_ROOT, build_request.uuid])
        self.kazoo_client.delete(lock_path, recursive=True)

    # NOTE(review): the result of this read is discarded; its purpose is
    # not evident from this file. Kept to preserve behavior - confirm
    # whether it can be dropped.
    with suppress(NoNodeError):
        self.kazoo_client.get(build_request.path)
|
|
|
|
def _watchBuildEvents(self, actions, event=None):
|
|
if event is None:
|
|
return
|
|
|
|
build_event = None
|
|
if "cancel" in actions:
|
|
build_event = BuildRequestEvent.CANCELED
|
|
elif "resume" in actions:
|
|
build_event = BuildRequestEvent.RESUMED
|
|
|
|
if build_event and self.build_event_callback:
|
|
build_request = self._cached_build_requests.get(event.path)
|
|
self.build_event_callback(build_request, build_event)
|
|
|
|
def lock(self, build_request, blocking=True, timeout=None):
    """Try to lock a build request.

    On success the lock object is stored on ``build_request.lock`` and
    a children watch for cancel/resume actions is installed on the
    build request node.

    :param blocking: Passed to ``Lock.acquire()``.
    :param timeout: Passed to ``Lock.acquire()``.
    :returns: True if the lock is held and the build request node
        exists, otherwise False.
    """
    # The lock nodes live in a separate tree to keep the build request
    # subnode structure clean; otherwise the lock node would sit in
    # between the cancel and resume requests.
    lock_path = "/".join([self.LOCK_ROOT, build_request.uuid])
    acquired = False
    zk_lock = None
    try:
        zk_lock = Lock(self.kazoo_client, lock_path)
        acquired = zk_lock.acquire(blocking, timeout)
    except LockTimeout:
        self.log.error(
            "Timeout trying to acquire lock: %s", build_request.uuid
        )

    # When not blocking, someone else may simply be holding the lock.
    if not acquired:
        return False

    if self.kazoo_client.exists(build_request.path):
        build_request.lock = zk_lock

        # Listen for cancel/resume actions on this build request.
        self.kazoo_client.ChildrenWatch(
            build_request.path, self._watchBuildEvents, send_event=True
        )
        return True

    zk_lock.release()
    self.log.error(
        "Build not found for locking: %s", build_request.uuid
    )

    # Acquiring the lock may have re-created the lock parent node just
    # after the scheduler deleted it; therefore (re-)delete it.
    with suppress(NoNodeError):
        self.kazoo_client.delete(lock_path, recursive=True)

    return False
|
|
|
|
def unlock(self, build_request):
    """Release the lock stored on the build request, if there is one.

    Logs a warning when ``build_request.lock`` is None; otherwise the
    lock is released and the attribute is reset to None.
    """
    held_lock = build_request.lock
    if held_lock is None:
        self.log.warning(
            "BuildRequest %s does not hold a lock", build_request
        )
        return

    held_lock.release()
    build_request.lock = None
|
|
|
|
def isLocked(self, build_request):
    """Return True if the build request's lock has any contenders."""
    lock_path = "/".join([self.LOCK_ROOT, build_request.uuid])
    contenders = Lock(self.kazoo_client, lock_path).contenders()
    return len(contenders) > 0
|
|
|
|
def lostBuildRequests(self):
    """Yield build requests that are RUNNING or PAUSED but not locked.

    Such builds are in progress according to their state, yet no
    executor holds their lock.
    """
    in_progress = self.inState(BuildRequest.RUNNING, BuildRequest.PAUSED)
    for build_request in in_progress:
        if not self.isLocked(build_request):
            yield build_request
|
|
|
|
def _getAllZones(self):
|
|
# Get a list of all zones without using the cache.
|
|
try:
|
|
# Get all available zones from ZooKeeper
|
|
zones = self.kazoo_client.get_children(
|
|
'/'.join([self.BUILD_REQUEST_ROOT, 'zones']))
|
|
zones.append(None)
|
|
except NoNodeError:
|
|
zones = [None]
|
|
return zones
|
|
|
|
def _getAllBuildIds(self, zones=None):
|
|
# Get a list of all build uuids without using the cache.
|
|
if zones is None:
|
|
zones = self._getAllZones()
|
|
|
|
all_builds = set()
|
|
for zone in zones:
|
|
try:
|
|
zone_path = self._getZoneRoot(zone)
|
|
all_builds.update(self.kazoo_client.get_children(zone_path))
|
|
except NoNodeError:
|
|
# Skip this zone as it doesn't have any builds
|
|
continue
|
|
return all_builds
|
|
|
|
def _findLostParams(self, age):
|
|
# Get data nodes which are older than the specified age (we
|
|
# don't want to delete nodes which are just being written
|
|
# slowly).
|
|
# Convert to MS
|
|
now = int(time.time() * 1000)
|
|
age = age * 1000
|
|
data_nodes = dict()
|
|
for data_id in self.kazoo_client.get_children(self.BUILD_PARAMS_ROOT):
|
|
data_path = self._getParamsPath(data_id)
|
|
data_zstat = self.kazoo_client.exists(data_path)
|
|
if now - data_zstat.mtime > age:
|
|
data_nodes[data_id] = data_path
|
|
|
|
# If there are no candidate data nodes, we don't need to
|
|
# filter them by known requests.
|
|
if not data_nodes:
|
|
return data_nodes.values()
|
|
|
|
# Remove current request uuids
|
|
for request_id in self._getAllBuildIds():
|
|
if request_id in data_nodes:
|
|
del data_nodes[request_id]
|
|
|
|
# Return the paths
|
|
return data_nodes.values()
|
|
|
|
def cleanup(self, age=300):
    """Delete build parameters without a corresponding build request.

    This does not clean up lost build requests themselves; the executor
    client takes care of that. All errors are logged instead of raised,
    and a failure to delete one node does not prevent the deletion of
    the remaining ones.

    :param age: Minimum age in seconds of a parameter node before it is
        considered for deletion (see _findLostParams()).
    """
    try:
        for path in self._findLostParams(age):
            try:
                self.log.error("Removing build request params: %s", path)
                self.kazoo_client.delete(path, recursive=True)
            except Exception:
                # Log and carry on with the next node.
                self.log.exception(
                    "Unable to delete build request params %s", path)
    except Exception:
        self.log.exception(
            "Error cleaning up build request queue %s", self)
|
|
|
|
@staticmethod
|
|
def _bytesToDict(data):
|
|
return json.loads(data.decode("utf-8"))
|
|
|
|
@staticmethod
def _dictToBytes(data):
    """Serialize ``data`` to UTF-8 encoded JSON bytes."""
    # The custom json_dumps() also serializes MappingProxyType objects.
    serialized = json_dumps(data)
    return serialized.encode("utf-8")
|
|
|
|
def _getParamsPath(self, build_uuid):
|
|
return '/'.join([self.BUILD_PARAMS_ROOT, build_uuid])
|
|
|
|
def clearBuildParams(self, build_request):
    """Erase the build parameters from ZK to save space"""
    params_path = self._getParamsPath(build_request.uuid)
    self.kazoo_client.delete(params_path, recursive=True)
|
|
|
|
def getBuildParams(self, build_request):
    """Return the parameters for a build request, if they exist.

    Once a build request is accepted by an executor, the params
    may be erased from ZK; this will return None in that case.

    :returns: The decoded parameters, or None if no data was read.
    """
    params_path = self._getParamsPath(build_request.uuid)
    with sharding.BufferedShardReader(
            self.kazoo_client, params_path) as stream:
        raw = stream.read()
        if not raw:
            return None
        return self._bytesToDict(raw)
|