# Copyright 2021 BMW Group
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import time
import json
import logging
from contextlib import suppress
from enum import Enum

from kazoo.exceptions import LockTimeout, NoNodeError
from kazoo.protocol.states import EventType
from kazoo.recipe.lock import Lock

from zuul.lib.jsonutil import json_dumps
from zuul.lib.logutil import get_annotated_logger
from zuul.model import BuildRequest
from zuul.zk import ZooKeeperSimpleBase
from zuul.zk.exceptions import BuildRequestNotFound
from zuul.zk import sharding
from zuul.zk.watchers import ExistingDataWatch


class BuildRequestEvent(Enum):
    """Kinds of events reported through ``build_event_callback``."""

    CREATED = 0
    UPDATED = 1
    RESUMED = 2
    CANCELED = 3
    DELETED = 4


class ExecutorApi(ZooKeeperSimpleBase):
    """ZooKeeper-backed queue of build requests.

    Build requests are stored as znodes below ``BUILD_REQUEST_ROOT``
    (either in ``unzoned`` or in ``zones/<zone>``), their parameters
    below ``BUILD_PARAMS_ROOT`` and their locks below ``LOCK_ROOT``.
    Children and data watches keep a local cache of build requests
    (keyed by znode path) up to date.
    """

    BUILD_REQUEST_ROOT = "/zuul/build-requests"
    BUILD_PARAMS_ROOT = "/zuul/build-params"
    LOCK_ROOT = "/zuul/build-request-locks"

    log = logging.getLogger("zuul.zk.executor.ExecutorApi")

    def __init__(self, client, zone_filter=None,
                 build_request_callback=None,
                 build_event_callback=None):
        """Set up the cache and register watches for the wanted zones.

        :param client: Passed through to ``ZooKeeperSimpleBase``.
        :param zone_filter: Iterable of zones to watch, or ``None`` to
            watch all zones (including the unzoned queue).
        :param build_request_callback: Optional callable without
            arguments, invoked when new build requests show up.
        :param build_event_callback: Optional callable taking
            ``(build_request, BuildRequestEvent)``.
        """
        super().__init__(client)

        self.zone_filter = zone_filter
        self._watched_zones = set()
        self.build_request_callback = build_request_callback
        self.build_event_callback = build_event_callback

        # path -> build request
        self._cached_build_requests = {}

        self.kazoo_client.ensure_path(self.BUILD_PARAMS_ROOT)
        if zone_filter is None:
            self.registerAllZones()
        else:
            for zone in zone_filter:
                self.registerZone(zone)

    @property
    def initial_state(self):
        # This supports holding build requests in tests
        return BuildRequest.REQUESTED

    def _getZoneRoot(self, zone):
        """Return the znode path holding the requests of ``zone``.

        ``None`` selects the ``unzoned`` queue.
        """
        if zone is None:
            return "/".join([self.BUILD_REQUEST_ROOT, 'unzoned'])
        else:
            return "/".join([self.BUILD_REQUEST_ROOT, 'zones', zone])

    def registerZone(self, zone):
        """Start watching the build requests of ``zone`` (idempotent)."""
        if zone in self._watched_zones:
            return
        zone_root = self._getZoneRoot(zone)
        self.log.debug("Registering for zone %s at %s", zone, zone_root)
        self.kazoo_client.ensure_path(zone_root)

        self.kazoo_client.ChildrenWatch(
            zone_root, self._makeBuildRequestWatcher(zone_root),
            send_event=True
        )
        self._watched_zones.add(zone)

    def registerAllZones(self):
        """Watch the unzoned queue and every current and future zone."""
        self.kazoo_client.ensure_path(self.BUILD_REQUEST_ROOT)

        # Register a child watch that listens to new zones and automatically
        # registers to them.
        def watch_zones(children):
            for zone in children:
                self.registerZone(zone)

        zones_root = "/".join([self.BUILD_REQUEST_ROOT, 'zones'])
        self.kazoo_client.ensure_path(zones_root)
        self.kazoo_client.ChildrenWatch(zones_root, watch_zones)
        self.registerZone(None)

    def _makeBuildStateWatcher(self, path):
        """Return a data watch callback bound to the request at ``path``."""
        def watch(data, stat, event=None):
            return self._watchBuildState(path, data, stat, event)
        return watch

    def _watchBuildState(self, path, data, stat, event=None):
        """Data watch handler keeping the cache entry for ``path`` current.

        Returns ``False`` once the znode got deleted so that the watch
        is stopped; returns ``None`` otherwise.
        """
        if not event or event.type == EventType.CHANGED:
            # Don't process change events w/o any data. This can happen when
            # a "slow" change watch tried to retrieve the data of a znode that
            # was deleted in the meantime.
            if data is None:
                return
            # As we already get the data and the stat value, we can directly
            # use it without asking ZooKeeper for the data again.
            content = self._bytesToDict(data)
            if not content:
                return

            # We need this one for the HOLD -> REQUESTED check further down
            old_build_request = self._cached_build_requests.get(path)

            build_request = BuildRequest.fromDict(content)
            build_request.path = path
            build_request._zstat = stat
            self._cached_build_requests[path] = build_request

            # NOTE (felix): This is a test-specific condition: For test cases
            # which are using hold_jobs_in_queue the state change on the build
            # request from HOLD to REQUESTED is done outside of the executor.
            # Thus, we must also set the wake event (the callback) so the
            # executor can pick up those builds after they are released. To not
            # cause a thundering herd problem in production for each cache
            # update, the callback is only called under this very specific
            # condition that can only occur in the tests.
            if (
                self.build_request_callback
                and old_build_request
                and old_build_request.state == BuildRequest.HOLD
                and build_request.state == BuildRequest.REQUESTED
            ):
                self.build_request_callback()

        elif event.type == EventType.DELETED:
            build_request = self._cached_build_requests.get(path)
            with suppress(KeyError):
                del self._cached_build_requests[path]

            if build_request and self.build_event_callback:
                self.build_event_callback(
                    build_request, BuildRequestEvent.DELETED
                )

            # Return False to stop the datawatch as the build got deleted.
            return False

    def _makeBuildRequestWatcher(self, path):
        """Return a children watch callback bound to zone root ``path``."""
        def watch(build_requests, event=None):
            return self._watchBuildRequests(path, build_requests, event)
        return watch

    def _watchBuildRequests(self, path, build_requests, event=None):
        """Children watch handler registering data watches for new requests.

        :param path: Zone root whose children are being watched.
        :param build_requests: Names of all current children of ``path``.
        :param event: Watch event as delivered by kazoo (unused).
        """
        # The build_requests list always contains all active children. Thus, we
        # first have to find the new ones by calculating the delta between the
        # build_requests list and our current cache entries.
        # NOTE (felix): We could also use this list to determine the deleted
        # build requests, but it's easier to do this in the DataWatch for the
        # single build request instead. Otherwise we have to deal with race
        # conditions between the children and the data watch as one watch might
        # update a cache entry while the other tries to remove it.

        build_request_paths = {
            f"{path}/{uuid}" for uuid in build_requests
        }

        new_build_requests = build_request_paths - set(
            self._cached_build_requests.keys()
        )

        for req_path in new_build_requests:
            ExistingDataWatch(self.kazoo_client,
                              req_path,
                              self._makeBuildStateWatcher(req_path))

        # Notify the user about new build requests if a callback is provided,
        # but only if there are new requests (we don't want to fire on the
        # initial callback from kazoo from registering the datawatch).
        if new_build_requests and self.build_request_callback:
            self.build_request_callback()

    def _iterBuildRequests(self):
        """Yield the cached build requests, tolerating concurrent changes."""
        # As the entries in the cache dictionary are added and removed via
        # data and children watches, we can't simply iterate over it in here,
        # as the values might change during iteration.
        for key in list(self._cached_build_requests.keys()):
            try:
                build_request = self._cached_build_requests[key]
            except KeyError:
                continue
            yield build_request

    def inState(self, *states):
        """Return an iterator over the cached requests in ``states``.

        Without arguments, requests in any state are returned. The
        requests are returned in sorted order.
        """
        if not states:
            # If no states are provided, build a tuple containing all available
            # ones to always match. We need a tuple to be compliant to the
            # type of *states above.
            states = BuildRequest.ALL_STATES

        build_requests = list(
            filter(lambda b: b.state in states, self._iterBuildRequests())
        )

        # Sort the list of builds by precedence and their creation time in
        # ZooKeeper in ascending order to prevent older builds from starving.
        return (b for b in sorted(build_requests))

    def next(self):
        """Yield the cached build requests in state ``REQUESTED``."""
        yield from self.inState(BuildRequest.REQUESTED)

    def submit(self, uuid, tenant_name, pipeline_name, params, zone,
               event_id, precedence=200):
        """Store a new build request and its parameters in ZooKeeper.

        The parameters are written (sharded) below ``BUILD_PARAMS_ROOT``
        before the request znode itself is created.

        :returns: The result of the kazoo ``create()`` call for the
            request znode.
        """
        log = get_annotated_logger(self.log, event=None, build=uuid)

        zone_root = self._getZoneRoot(zone)
        path = "/".join([zone_root, uuid])

        build_request = BuildRequest(
            uuid,
            self.initial_state,
            precedence,
            zone,
            tenant_name,
            pipeline_name,
            event_id,
        )

        log.debug("Submitting build request to ZooKeeper %s", build_request)

        self.kazoo_client.ensure_path(zone_root)

        params_path = self._getParamsPath(uuid)
        with sharding.BufferedShardWriter(
                self.kazoo_client, params_path) as stream:
            stream.write(self._dictToBytes(params))

        return self.kazoo_client.create(
            path, self._dictToBytes(build_request.toDict()))

    # We use child nodes here so that we don't need to lock the build
    # request node.
    def requestResume(self, build_request):
        """Create the ``resume`` child node of the build request."""
        self.kazoo_client.ensure_path(f"{build_request.path}/resume")

    def requestCancel(self, build_request):
        """Create the ``cancel`` child node of the build request."""
        self.kazoo_client.ensure_path(f"{build_request.path}/cancel")

    def fulfillResume(self, build_request):
        """Delete the ``resume`` child node of the build request."""
        self.kazoo_client.delete(f"{build_request.path}/resume")

    def fulfillCancel(self, build_request):
        """Delete the ``cancel`` child node of the build request."""
        self.kazoo_client.delete(f"{build_request.path}/cancel")

    def update(self, build_request):
        """Write the build request back to its znode.

        The write is conditional on the znode version stored in
        ``build_request._zstat``; requests without version information
        are not written.

        :raises BuildRequestNotFound: If the znode no longer exists.
        """
        log = get_annotated_logger(
            self.log, event=None, build=build_request.uuid
        )
        log.debug("Updating build request %s", build_request)

        if build_request._zstat is None:
            log.debug(
                "Cannot update build request %s: Missing version information.",
                build_request.uuid,
            )
            return
        try:
            zstat = self.kazoo_client.set(
                build_request.path,
                self._dictToBytes(build_request.toDict()),
                version=build_request._zstat.version,
            )
            # Update the zstat on the item after updating the ZK node
            build_request._zstat = zstat
        except NoNodeError:
            raise BuildRequestNotFound(
                f"Could not update {build_request.path}"
            )

    def get(self, path):
        """Get a build request

        Note: do not mix get with iteration; iteration returns cached
        BuildRequests while get returns a newly created object each
        time.  If you lock a BuildRequest, you must use the same
        object to unlock it.

        Returns ``None`` if the znode does not exist or holds no data.
        """
        try:
            data, zstat = self.kazoo_client.get(path)
        except NoNodeError:
            return None

        if not data:
            return None

        content = self._bytesToDict(data)

        build_request = BuildRequest.fromDict(content)
        build_request.path = path
        build_request._zstat = zstat

        return build_request

    def remove(self, build_request):
        """Delete the build request, its parameters and its lock node.

        Already missing request or lock nodes are ignored.
        """
        log = get_annotated_logger(
            self.log, event=None, build=build_request.uuid
        )
        log.debug("Removing build request %s", build_request)
        try:
            # As the build node might contain children (result, data, ...) we
            # must delete it recursively.
            self.kazoo_client.delete(build_request.path, recursive=True)
        except NoNodeError:
            # Nothing to do if the node is already deleted
            pass

        self.clearBuildParams(build_request)

        try:
            # Delete the lock parent node as well.
            path = "/".join([self.LOCK_ROOT, build_request.uuid])
            self.kazoo_client.delete(path, recursive=True)
        except NoNodeError:
            pass

        # NOTE(review): the result of this read is discarded; its purpose
        # is not evident from this file -- confirm before removing it.
        try:
            self.kazoo_client.get(build_request.path)
        except NoNodeError:
            pass

    def _watchBuildEvents(self, actions, event=None):
        """Children watch handler translating child nodes into events.

        A ``cancel`` child takes precedence over a ``resume`` child.
        The initial invocation (without an event) is ignored.
        """
        if event is None:
            return

        build_event = None
        if "cancel" in actions:
            build_event = BuildRequestEvent.CANCELED
        elif "resume" in actions:
            build_event = BuildRequestEvent.RESUMED

        if build_event and self.build_event_callback:
            # NOTE(review): this is None when the request is not in the
            # cache; the callback has to cope with that.
            build_request = self._cached_build_requests.get(event.path)
            self.build_event_callback(build_request, build_event)

    def lock(self, build_request, blocking=True, timeout=None):
        """Try to lock the build request.

        On success the lock object is stored on ``build_request.lock``
        and a children watch for cancel/resume requests is installed.

        :returns: ``True`` if the lock was acquired and the request
            still exists, ``False`` otherwise.
        """
        # Keep the lock nodes in a different path to keep the build request
        # subnode structure clean. Otherwise, the lock node will be in between
        # the cancel and resume requests.
        path = "/".join([self.LOCK_ROOT, build_request.uuid])
        have_lock = False
        lock = None
        try:
            lock = Lock(self.kazoo_client, path)
            have_lock = lock.acquire(blocking, timeout)
        except LockTimeout:
            have_lock = False
            self.log.error(
                "Timeout trying to acquire lock: %s", build_request.uuid
            )

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if not have_lock:
            return False

        if not self.kazoo_client.exists(build_request.path):
            lock.release()
            self.log.error(
                "Build not found for locking: %s", build_request.uuid
            )

            # We may have just re-created the lock parent node just
            # after the scheduler deleted it; therefore we should
            # (re-) delete it.
            try:
                # Delete the lock parent node as well.
                self.kazoo_client.delete(path, recursive=True)
            except NoNodeError:
                pass

            return False

        build_request.lock = lock

        # Create the children watch to listen for cancel/resume actions on this
        # build request.
        self.kazoo_client.ChildrenWatch(
            build_request.path, self._watchBuildEvents, send_event=True
        )
        return True

    def unlock(self, build_request):
        """Release the lock stored on the build request, if any."""
        if build_request.lock is None:
            self.log.warning(
                "BuildRequest %s does not hold a lock", build_request
            )
        else:
            build_request.lock.release()
            build_request.lock = None

    def isLocked(self, build_request):
        """Return whether the request's lock has at least one contender."""
        path = "/".join([self.LOCK_ROOT, build_request.uuid])
        lock = Lock(self.kazoo_client, path)
        is_locked = len(lock.contenders()) > 0
        return is_locked

    def lostBuildRequests(self):
        """Yield ``RUNNING``/``PAUSED`` requests that nobody has locked."""
        # Get a list of builds which are running but not locked by any executor
        yield from filter(
            lambda b: not self.isLocked(b),
            self.inState(BuildRequest.RUNNING, BuildRequest.PAUSED),
        )

    def _getAllZones(self):
        """Return all zones known to ZooKeeper plus ``None`` (unzoned)."""
        # Get a list of all zones without using the cache.
        try:
            # Get all available zones from ZooKeeper
            zones = self.kazoo_client.get_children(
                '/'.join([self.BUILD_REQUEST_ROOT, 'zones']))
            zones.append(None)
        except NoNodeError:
            zones = [None]
        return zones

    def _getAllBuildIds(self, zones=None):
        """Return the set of request uuids found in ``zones``.

        All zones are scanned when ``zones`` is ``None``.
        """
        # Get a list of all build uuids without using the cache.
        if zones is None:
            zones = self._getAllZones()

        all_builds = set()
        for zone in zones:
            try:
                zone_path = self._getZoneRoot(zone)
                all_builds.update(self.kazoo_client.get_children(zone_path))
            except NoNodeError:
                # Skip this zone as it doesn't have any builds
                continue
        return all_builds

    def _findLostParams(self, age):
        """Return paths of params nodes without a matching build request.

        :param age: Minimum age in seconds (based on the node's mtime)
            a params node must have to be considered.
        """
        # Get data nodes which are older than the specified age (we
        # don't want to delete nodes which are just being written
        # slowly).
        # Convert to MS
        now = int(time.time() * 1000)
        age = age * 1000
        data_nodes = dict()
        for data_id in self.kazoo_client.get_children(self.BUILD_PARAMS_ROOT):
            data_path = self._getParamsPath(data_id)
            data_zstat = self.kazoo_client.exists(data_path)
            if data_zstat is None:
                # The node was deleted after we listed the children, so
                # there is nothing left to clean up for it.
                continue
            if now - data_zstat.mtime > age:
                data_nodes[data_id] = data_path

        # If there are no candidate data nodes, we don't need to
        # filter them by known requests.
        if not data_nodes:
            return data_nodes.values()

        # Remove current request uuids
        for request_id in self._getAllBuildIds():
            if request_id in data_nodes:
                del data_nodes[request_id]

        # Return the paths
        return data_nodes.values()

    def cleanup(self, age=300):
        """Delete params nodes older than ``age`` seconds without request.

        Errors are logged and never propagated; a failure to delete one
        node does not prevent the remaining nodes from being processed.
        """
        # Delete build request params which are not associated with
        # any current build requests.  Note, this does not clean up
        # lost build requests themselves; the executor client takes
        # care of that.
        try:
            for path in self._findLostParams(age):
                try:
                    self.log.error("Removing build request params: %s", path)
                    self.kazoo_client.delete(path, recursive=True)
                except Exception:
                    self.log.exception(
                        "Unable to delete build request params %s", path)
        except Exception:
            self.log.exception(
                "Error cleaning up build request queue %s", self)

    @staticmethod
    def _bytesToDict(data):
        """Decode UTF-8 encoded JSON bytes."""
        return json.loads(data.decode("utf-8"))

    @staticmethod
    def _dictToBytes(data):
        """Serialize ``data`` to UTF-8 encoded JSON bytes."""
        # The custom json_dumps() will also serialize MappingProxyType objects
        return json_dumps(data).encode("utf-8")

    def _getParamsPath(self, build_uuid):
        """Return the znode path of the parameters for ``build_uuid``."""
        return '/'.join([self.BUILD_PARAMS_ROOT, build_uuid])

    def clearBuildParams(self, build_request):
        """Erase the build parameters from ZK to save space"""
        self.kazoo_client.delete(self._getParamsPath(build_request.uuid),
                                 recursive=True)

    def getBuildParams(self, build_request):
        """Return the parameters for a build request, if they exist.

        Once a build request is accepted by an executor, the params
        may be erased from ZK; this will return None in that case.

        """
        with sharding.BufferedShardReader(
            self.kazoo_client, self._getParamsPath(build_request.uuid)
        ) as stream:
            data = stream.read()
            if not data:
                return None
            return self._bytesToDict(data)