zuul/zuul/zk/blob_store.py
James E. Blair 3a981b89a8 Parallelize some pipeline refresh ops
We may be able to speed up pipeline refreshes in cases where there
are large numbers of items or jobs/builds by parallelizing ZK reads.

Quick refresher: the ZK protocol is async, and kazoo uses a queue to
send operations to a single thread which manages IO.  We typically
call synchronous kazoo client methods which wait for the async result
before returning.  Since this is all thread-safe, we can attempt to
fill the kazoo pipe by having multiple threads call the synchronous
kazoo methods.  If kazoo is waiting on IO for an earlier call, it
will be able to start a later request simultaneously.

Quick aside: it would be difficult for us to use the async methods
directly since our overall code structure is still ordered and
effectively single-threaded (we need to load a QueueItem before we
can load the BuildSet and the Builds, etc.).

Thus it makes the most sense for us to retain our ordering by using
a ThreadPoolExecutor to run some operations in parallel, as in the
sketch below.

This change parallelizes loading QueueItems within a ChangeQueue,
and also Builds/Jobs within a BuildSet.  These are the points in
a pipeline refresh tree which potentially have the largest number
of children and could benefit the most from the change, especially
if the ZK server has some measurable latency.
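
As a minimal sketch of the pattern (illustrative only, not the code
in this change; "zk_client", "paths", and the worker count are
hypothetical stand-ins):

    from concurrent.futures import ThreadPoolExecutor

    def fetch(zk_client, path):
        # The synchronous kazoo get() blocks this worker thread on
        # the async result, while kazoo's single IO thread remains
        # free to service other outstanding requests.
        data, zstat = zk_client.get(path)
        return data

    def fetch_all(zk_client, paths):
        # Several workers keep kazoo's request queue full; map()
        # yields results in input order, so the caller's ordering
        # is preserved.
        with ThreadPoolExecutor(max_workers=4) as executor:
            return list(executor.map(
                lambda path: fetch(zk_client, path), paths))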

Change-Id: I0871cc05a2d13e4ddc4ac284bd67e5e3003200ad
2022-11-09 10:51:29 -08:00


# Copyright 2020 BMW Group
# Copyright 2022 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import hashlib

from kazoo.exceptions import NoNodeError
from kazoo.retry import KazooRetry

from zuul.zk.locks import locked, SessionAwareLock
from zuul.zk.zkobject import LocalZKContext, ZKContext
from zuul.zk import sharding


class BlobStore:
    # A content-addressable store in ZooKeeper: blobs are keyed by
    # the SHA-256 hexdigest of their contents, and a "complete" flag
    # znode marks entries whose data has been fully written.
    _retry_interval = 5
    data_root = "/zuul/cache/blob/data"
    lock_root = "/zuul/cache/blob/lock"

    def __init__(self, context):
        self.context = context

    def _getRootPath(self, key):
        # Entries are sharded by the first two hex digits of the key
        # so no single znode accumulates too many children.
        return f"{self.data_root}/{key[0:2]}/{key}"

    def _getPath(self, key):
        root = self._getRootPath(key)
        return f"{root}/data"

    def _getFlagPath(self, key):
        root = self._getRootPath(key)
        return f"{root}/complete"

    def _retry(self, context, func, *args, max_tries=-1, **kw):
        kazoo_retry = KazooRetry(max_tries=max_tries,
                                 interrupt=context.sessionIsInvalid,
                                 delay=self._retry_interval, backoff=0,
                                 ignore_expire=False)
        try:
            return kazoo_retry(func, *args, **kw)
        except InterruptedError:
            pass

    @staticmethod
    def _retryableLoad(context, key, path, flag):
        if not context.client.exists(flag):
            raise KeyError(key)
        with sharding.BufferedShardReader(context.client, path) as stream:
            data = stream.read()
            compressed_size = stream.compressed_bytes_read
            context.cumulative_read_time += stream.cumulative_read_time
            context.cumulative_read_objects += 1
            context.cumulative_read_znodes += stream.znodes_read
            context.cumulative_read_bytes += compressed_size
        return data, compressed_size

    def get(self, key):
        path = self._getPath(key)
        flag = self._getFlagPath(key)
        if self.context.sessionIsInvalid():
            raise Exception("ZooKeeper session or lock not valid")
        data, compressed_size = self._retry(self.context, self._retryableLoad,
                                            self.context, key, path, flag)
        return data

    def _checkKey(self, key):
        # This returns whether the key is in the store.  If it is in
        # the store, it also touches the flag file so that the cleanup
        # routine can know the last time an entry was used.
        flag = self._getFlagPath(key)
        if self.context.sessionIsInvalid():
            raise Exception("ZooKeeper session or lock not valid")
        ret = self._retry(self.context, self.context.client.exists,
                          flag)
        if not ret:
            return False
        self._retry(self.context, self.context.client.set,
                    flag, b'')
        return True

    @staticmethod
    def _retryableSave(context, path, flag, data):
        with sharding.BufferedShardWriter(context.client, path) as stream:
            stream.truncate(0)
            stream.write(data)
            stream.flush()
            context.client.ensure_path(flag)
            compressed_size = stream.compressed_bytes_written
            context.cumulative_write_time += stream.cumulative_write_time
            context.cumulative_write_objects += 1
            context.cumulative_write_znodes += stream.znodes_written
            context.cumulative_write_bytes += compressed_size
        return compressed_size

    def put(self, data):
        if isinstance(self.context, LocalZKContext):
            return None
        if self.context.sessionIsInvalid():
            raise Exception("ZooKeeper session or lock not valid")
        hasher = hashlib.sha256()
        hasher.update(data)
        key = hasher.hexdigest()
        path = self._getPath(key)
        flag = self._getFlagPath(key)

        # Check without the lock first; if the entry already exists
        # we can avoid taking the lock at all.
        if self._checkKey(key):
            return key

        with locked(
                SessionAwareLock(
                    self.context.client,
                    f"{self.lock_root}/{key}"),
                blocking=True
        ) as lock:
            # Re-check under the lock in case another writer
            # finished while we were waiting.
            if self._checkKey(key):
                return key

            # make a new context based on the old one
            with ZKContext(self.context.client, lock,
                           self.context.stop_event,
                           self.context.log) as locked_context:
                self._retry(
                    locked_context,
                    self._retryableSave,
                    locked_context, path, flag, data)
                self.context.updateStatsFromOtherContext(locked_context)
        return key

    def delete(self, key, ltime):
        path = self._getRootPath(key)
        flag = self._getFlagPath(key)
        if self.context.sessionIsInvalid():
            raise Exception("ZooKeeper session or lock not valid")
        try:
            with locked(
                    SessionAwareLock(
                        self.context.client,
                        f"{self.lock_root}/{key}"),
                    blocking=True
            ) as lock:
                # make a new context based on the old one
                with ZKContext(self.context.client, lock,
                               self.context.stop_event,
                               self.context.log) as locked_context:
                    # Double check that it hasn't been used since we
                    # decided to delete it
                    data, zstat = self._retry(locked_context,
                                              self.context.client.get,
                                              flag)
                    if zstat.last_modified_transaction_id < ltime:
                        self._retry(locked_context,
                                    self.context.client.delete,
                                    path, recursive=True)
        except NoNodeError:
            raise KeyError(key)

    def __iter__(self):
        try:
            hashdirs = self.context.client.get_children(self.data_root)
        except NoNodeError:
            return

        for hashdir in hashdirs:
            try:
                for key in self.context.client.get_children(
                        f'{self.data_root}/{hashdir}'):
                    yield key
            except NoNodeError:
                pass

    def __len__(self):
        return len([x for x in self])

    def getKeysLastUsedBefore(self, ltime):
        # "ltime" is a ZooKeeper transaction id; entries whose flag
        # znode was last modified before it are candidates for cleanup.
        ret = set()
        for key in self:
            flag = self._getFlagPath(key)
            data, zstat = self._retry(self.context, self.context.client.get,
                                      flag)
            if zstat.last_modified_transaction_id < ltime:
                ret.add(key)
        return ret
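
A brief usage sketch (illustrative only, not part of the file; it
assumes an already-constructed ZKContext named "context" with a live
kazoo connection, and "some_zxid" is a hypothetical ZooKeeper
transaction id):

    store = BlobStore(context)
    key = store.put(b"artifact data")   # key is the sha256 hexdigest
    data = store.get(key)               # raises KeyError if absent
    stale = store.getKeysLastUsedBefore(some_zxid)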