zuul/zuul/zk/semaphore.py
James E. Blair c355adf44e Add playbook semaphores
This adds the ability to specify that the Zuul executor should
acquire a semaphore before running an individual playbook.  This
is useful for long-running jobs which need exclusive access to
a resource for only a small amount of time.

Change-Id: I90f5e0f570ef6c4b0986b0143318a78ddc27bbde
2022-11-07 08:41:10 -08:00

261 lines
9.7 KiB
Python

# Copyright 2021 BMW Group
# Copyright 2021 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import json
import logging
from urllib.parse import quote_plus, unquote, unquote_plus

from kazoo.exceptions import BadVersionError, NoNodeError

from zuul.lib.logutil import get_annotated_logger
from zuul.model import SemaphoreReleaseEvent
from zuul.zk import ZooKeeperSimpleBase
def holdersFromData(data):
    """Decode the raw content of a semaphore znode.

    :param data: UTF-8 encoded JSON bytes, or a falsy value (``None`` or
        ``b""``) when nothing has been stored yet.
    :returns: The decoded JSON value, or an empty list when ``data`` is
        falsy.
    """
    return json.loads(data.decode("utf8")) if data else []
def holdersToData(holders):
    """Encode a holders value for storage in a semaphore znode.

    :param holders: A JSON-serializable value (the list of holders).
    :returns: UTF-8 encoded JSON bytes; dictionary keys are sorted so that
        the output is deterministic.
    """
    serialized = json.dumps(holders, sort_keys=True)
    return serialized.encode("utf8")
class SemaphoreHandler(ZooKeeperSimpleBase):
    """Acquire, release and inspect semaphores stored in ZooKeeper.

    Every semaphore is one znode whose data is the JSON-encoded list of
    its current holders (see ``holdersFromData`` / ``holdersToData``).
    Tenant-scoped semaphores live below ``semaphore_root/<tenant_name>``
    and global ones below ``global_semaphore_root``; the znode name is
    the ``quote_plus``-encoded semaphore name.

    Holder lists are written with a versioned ``set`` and the operation
    is retried when ZooKeeper reports ``BadVersionError``.
    """

    log = logging.getLogger("zuul.zk.SemaphoreHandler")

    semaphore_root = "/zuul/semaphores"
    global_semaphore_root = "/zuul/global-semaphores"

    def __init__(self, client, statsd, tenant_name, layout, abide,
                 read_only=False):
        """Create a handler bound to one tenant.

        :param client: Passed unchanged to ``ZooKeeperSimpleBase``.
        :param statsd: Object with a ``gauge`` method used to report the
            number of holders, or ``None`` to disable reporting.
        :param tenant_name: Name used for the tenant znode root and the
            statsd key.
        :param layout: Object providing ``getSemaphore(abide, name)``.
        :param abide: Passed to ``layout.getSemaphore``.
        :param read_only: When true, stats are disabled and ``acquire``,
            ``acquireFromInfo``, ``release`` and ``cleanupLeaks`` raise
            ``RuntimeError``.
        """
        super().__init__(client)
        if read_only:
            statsd = None
        self.read_only = read_only
        self.abide = abide
        self.layout = layout
        self.statsd = statsd
        self.tenant_name = tenant_name
        self.tenant_root = f"{self.semaphore_root}/{tenant_name}"

    def _makePath(self, semaphore):
        """Return the znode path of ``semaphore``.

        The semaphore name is encoded with ``quote_plus`` so it forms a
        single path component.
        """
        semaphore_key = quote_plus(semaphore.name)
        if semaphore.global_scope:
            return f"{self.global_semaphore_root}/{semaphore_key}"
        return f"{self.tenant_root}/{semaphore_key}"

    def _emitStats(self, semaphore_path, num_holders):
        """Report the holder count of a semaphore as a statsd gauge.

        Does nothing when no statsd client is configured.  Any failure is
        logged and swallowed so that statistics never break semaphore
        handling.
        """
        if self.statsd is None:
            return
        try:
            semaphore_quoted = semaphore_path.split('/')[-1]
            semaphore_name = unquote(semaphore_quoted)
            # statsd safe key:
            semaphore_key = semaphore_name.replace('.', '_').replace('/', '_')
            key = (f'zuul.tenant.{self.tenant_name}'
                   f'.semaphore.{semaphore_key}')
            self.statsd.gauge(f'{key}.holders', num_holders)
        except Exception:
            self.log.exception("Unable to send semaphore stats:")

    def getSemaphoreInfo(self, job_semaphore):
        """Build the frozen description of a job's semaphore.

        :param job_semaphore: Object with ``name`` and ``resources_first``
            attributes.
        :returns: A dict with the keys ``name``, ``path``,
            ``resources_first`` and ``max``; this is the ``info`` format
            consumed by ``acquireFromInfo`` and ``releaseFromInfo``.
        """
        semaphore = self.layout.getSemaphore(self.abide, job_semaphore.name)
        # NOTE(review): the ``max`` fallback below handles a missing
        # semaphore, but ``_makePath`` dereferences the semaphore first,
        # so the fallback is only meaningful if ``getSemaphore`` never
        # returns None here -- confirm against the layout implementation.
        return {
            'name': job_semaphore.name,
            'path': self._makePath(semaphore),
            'resources_first': job_semaphore.resources_first,
            'max': 1 if semaphore is None else semaphore.max,
        }

    def getSemaphoreHandle(self, item, job):
        """Return the holder entry identifying ``job`` of ``item``.

        The handle is the value stored in (and removed from) the holder
        list of a semaphore.
        """
        return {
            "buildset_path": item.current_build_set.getPath(),
            "job_name": job.name,
        }

    def acquire(self, item, job, request_resources):
        """Acquire every semaphore required by ``job``.

        This is the typical method for acquiring semaphores.  It runs on
        the scheduler and acquires all semaphores for a job.

        :returns: True if the job needs no semaphores or all of them were
            acquired, False otherwise.
        :raises RuntimeError: If the handler is read-only.
        """
        if self.read_only:
            raise RuntimeError("Read-only semaphore handler")

        if not job.semaphores:
            return True

        log = get_annotated_logger(self.log, item.event)
        handle = self.getSemaphoreHandle(item, job)
        infos = [self.getSemaphoreInfo(job_semaphore)
                 for job_semaphore in job.semaphores]
        return self.acquireFromInfo(log, infos, handle, request_resources)

    def acquireFromInfo(self, log, infos, handle, request_resources=False):
        """Acquire the semaphores described by ``infos`` for ``handle``.

        This method is used by the executor to acquire a playbook
        semaphore; it is similar to the acquire method but the semaphore
        info is frozen (this operates without an abide).

        Acquisition is all-or-nothing: as soon as one semaphore can not
        be acquired, a release is attempted for every entry of ``infos``.

        :returns: True if ``infos`` is empty or everything was acquired,
            False otherwise.
        :raises RuntimeError: If the handler is read-only.
        """
        if self.read_only:
            raise RuntimeError("Read-only semaphore handler")

        if not infos:
            return True

        # all() stops at the first semaphore that can not be acquired.
        if all(self._acquire_one(log, info, handle, request_resources)
               for info in infos):
            return True

        # Since we know we have less than all the required
        # semaphores, set quiet=True so we don't log an inability
        # to release them.
        self.releaseFromInfo(log, None, infos, handle, quiet=True)
        return False

    def _acquire_one(self, log, info, handle, request_resources):
        """Try to add ``handle`` to the holders of one semaphore.

        :returns: True if the handle holds the semaphore afterwards (or
            acquisition is deferred to after the resource request),
            False if the semaphore is already at its maximum.
        """
        if info['resources_first'] and request_resources:
            # We're currently in the resource request phase and want to
            # get the resources before locking.  So we don't need to do
            # anything here.
            return True

        # As a safety net we want to acquire the semaphore at least in
        # the run phase, so nothing else is filtered here: re-acquiring
        # a semaphore that has already been acquired is not a problem
        # because an existing handle is detected below.
        path = info['path']
        self.kazoo_client.ensure_path(path)
        semaphore_holders, zstat = self.getHolders(path)

        if handle in semaphore_holders:
            return True

        # semaphore is there, check max
        while len(semaphore_holders) < info['max']:
            semaphore_holders.append(handle)
            try:
                # The version check makes the update fail if another
                # client modified the holders since we read them.
                self.kazoo_client.set(path,
                                      holdersToData(semaphore_holders),
                                      version=zstat.version)
            except BadVersionError:
                log.debug(
                    "Retrying semaphore %s acquire due to concurrent update",
                    info['name'])
                semaphore_holders, zstat = self.getHolders(path)
                continue

            log.info("Semaphore %s acquired: handle %s",
                     info['name'], handle)
            self._emitStats(path, len(semaphore_holders))
            return True

        return False

    def release(self, event_queue, item, job, quiet=False):
        """Release every semaphore held by ``job`` of ``item``.

        :param event_queue: Queue that receives a
            ``SemaphoreReleaseEvent`` per semaphore (may be falsy to
            suppress events).
        :param quiet: When true, do not log an error for semaphores that
            are not held.
        :raises RuntimeError: If the handler is read-only.
        """
        if self.read_only:
            raise RuntimeError("Read-only semaphore handler")

        if not job.semaphores:
            return

        log = get_annotated_logger(self.log, item.event)
        handle = self.getSemaphoreHandle(item, job)
        infos = [self.getSemaphoreInfo(job_semaphore)
                 for job_semaphore in job.semaphores]
        # Forward the caller's choice; it used to be overridden with
        # a hard-coded False.
        return self.releaseFromInfo(log, event_queue, infos, handle,
                                    quiet=quiet)

    def releaseFromInfo(self, log, event_queue, infos, handle, quiet=False):
        """Release ``handle`` from every semaphore described by ``infos``.

        :param event_queue: If truthy, a ``SemaphoreReleaseEvent`` is put
            on it for each entry of ``infos``.
        :param quiet: When true, do not log an error for semaphores that
            are not held.
        """
        for info in infos:
            self._release_one(log, info, handle, quiet)
            if event_queue:
                # If a scheduler has been provided (which it is except
                # in the case of a rollback from acquire in this
                # class), broadcast an event to trigger pipeline runs.
                event = SemaphoreReleaseEvent(info['name'])
                event_queue.put(event)

    def _release_one(self, log, info, handle, quiet=False):
        """Remove ``handle`` from the holders of one semaphore.

        Retries until the versioned update succeeds.  If the znode does
        not exist or the handle is not a holder, nothing is written and
        an error is logged unless ``quiet`` is true.
        """
        path = info['path']
        while True:
            try:
                semaphore_holders, zstat = self.getHolders(path)
                semaphore_holders.remove(handle)
            except (ValueError, NoNodeError):
                if not quiet:
                    log.error("Semaphore %s can not be released for %s "
                              "because the semaphore is not held",
                              path, handle)
                break

            try:
                self.kazoo_client.set(path,
                                      holdersToData(semaphore_holders),
                                      zstat.version)
            except BadVersionError:
                log.debug(
                    "Retrying semaphore %s release due to concurrent update",
                    path)
                continue

            log.info("Semaphore %s released for %s", path, handle)
            self._emitStats(path, len(semaphore_holders))
            break

    def getHolders(self, semaphore_path):
        """Read a semaphore znode.

        :returns: A ``(holders, zstat)`` tuple; ``zstat`` is what the
            client's ``get`` returned alongside the data.
        :raises NoNodeError: Propagated from the client if the znode
            does not exist.
        """
        data, zstat = self.kazoo_client.get(semaphore_path)
        return holdersFromData(data), zstat

    def getSemaphores(self):
        """List the semaphore znodes visible to this tenant.

        :returns: The child znode names of the global root followed by
            those of the tenant root.  These are the ``quote_plus``
            encoded keys created by ``_makePath``, not decoded names.
            A missing root contributes nothing.
        """
        ret = []
        for root in (self.global_semaphore_root, self.tenant_root):
            try:
                ret.extend(self.kazoo_client.get_children(root))
            except NoNodeError:
                pass
        return ret

    def semaphoreHolders(self, semaphore_name):
        """Return the holders of the semaphore called ``semaphore_name``.

        :param semaphore_name: The (unencoded) semaphore name.
        :returns: The list of holders, empty if the znode does not exist.
        """
        semaphore = self.layout.getSemaphore(self.abide, semaphore_name)
        semaphore_path = self._makePath(semaphore)
        try:
            holders, _ = self.getHolders(semaphore_path)
        except NoNodeError:
            holders = []
        return holders

    def cleanupLeaks(self):
        """Release semaphores whose holder's buildset no longer exists.

        A holder is considered leaked when the znode named by its
        ``buildset_path`` is gone.

        :raises RuntimeError: If the handler is read-only.
        """
        if self.read_only:
            raise RuntimeError("Read-only semaphore handler")

        for semaphore_key in self.getSemaphores():
            # getSemaphores() yields znode names, which _makePath built
            # with quote_plus.  Decode them before the name lookup;
            # otherwise _makePath would encode the name a second time
            # and leaks of semaphores with escaped characters in their
            # name would never be found.
            semaphore_name = unquote_plus(semaphore_key)
            for holder in self.semaphoreHolders(semaphore_name):
                if (self.kazoo_client.exists(holder["buildset_path"])
                        is not None):
                    continue

                semaphore = self.layout.getSemaphore(
                    self.abide, semaphore_name)
                info = {
                    'name': semaphore.name,
                    'path': self._makePath(semaphore),
                }
                self.log.error("Releasing leaked semaphore %s held by %s",
                               info['path'], holder)
                self._release_one(self.log, info, holder, quiet=False)