Stream API for sharded Zookeeper data

Provide 'RawShardIO' and a buffered reader/writer that allow us to
treat sharded content in Zookeeper as a simple byte stream.

Change-Id: Ifa0ea33cfda325367b0c222ae1100074401028dc
This commit is contained in:
Simon Westphahl
2020-11-25 11:23:42 +01:00
parent 3bb8684e93
commit 3061107fdc
2 changed files with 159 additions and 7 deletions

81
zuul/zk/sharding.py Normal file
View File

@@ -0,0 +1,81 @@
# Copyright 2020 BMW Group
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import io
from contextlib import suppress
from kazoo.exceptions import NoNodeError
# Zookeeper's default per-node size limit is ~1MiB, and that budget also
# has to cover the node's key, so not all of it is available for data.
# Capping the payload at 1,000,000 bytes leaves ~47 KiB for the key.
NODE_BYTE_SIZE_LIMIT = 10 ** 6
class RawShardIO(io.RawIOBase):
    """Raw, unbuffered stream over sharded content stored in Zookeeper.

    The content lives in child nodes ("shards") below ``shard_base``.
    Every non-empty ``write()`` creates one new sequence node holding at
    most ``NODE_BYTE_SIZE_LIMIT`` bytes; ``readall()`` returns the payload
    of all shards concatenated in sorted name order.

    :param client: Zookeeper client object. Only ``get_children``, ``get``,
        ``create`` and ``delete`` are used here (presumably a kazoo
        client -- confirm against callers).
    :param path: Base path below which the shards are stored.
    """

    def __init__(self, client, path):
        self.client = client
        self.shard_base = path

    def readable(self):
        """Report that the stream supports reading."""
        return True

    def writable(self):
        """Report that the stream supports writing."""
        return True

    def truncate(self, size=None):
        """Delete all shards; only truncating to zero bytes is supported.

        :param size: Must be ``0``.
        :returns: The new size of the stream, which is always ``0``.
        :raises ValueError: If ``size`` is anything but ``0`` (this
            includes the default of ``None``).
        """
        if size != 0:
            raise ValueError("Can only truncate to 0")
        # A missing base node just means there is nothing to delete.
        with suppress(NoNodeError):
            self.client.delete(self.shard_base, recursive=True)
        # The io API expects truncate() to return the new stream size.
        return 0

    @property
    def _shards(self):
        """Names of the current shard nodes, or ``[]`` if the base is missing."""
        try:
            return self.client.get_children(self.shard_base)
        except NoNodeError:
            return []

    def _getData(self, path):
        """Return the payload stored at ``path``, dropping the node stat."""
        data, _ = self.client.get(path)
        return data

    def readall(self):
        """Return the payload of all shards joined in sorted name order.

        :returns: The concatenated bytes; ``b""`` when there are no shards.
        """
        return b"".join(
            self._getData("/".join((self.shard_base, shard_name)))
            for shard_name in sorted(self._shards)
        )

    def write(self, shard_data):
        """Store the leading part of ``shard_data`` as one new shard.

        At most ``NODE_BYTE_SIZE_LIMIT`` items of ``shard_data`` are
        written per call; the caller is responsible for writing whatever
        is left over.

        :param shard_data: Bytes-like object to write.
        :returns: How much of ``shard_data`` was consumed (``0`` for an
            empty payload).
        """
        byte_count = len(shard_data)
        if not byte_count:
            # Nothing to store; do not create an empty shard node.
            return 0
        # Only write one key at a time and defer writing the rest to the caller
        shard_bytes = bytes(shard_data[:NODE_BYTE_SIZE_LIMIT])
        self.client.create(
            "{}/".format(self.shard_base),
            shard_bytes,
            sequence=True,
            makepath=True,
        )
        return min(byte_count, NODE_BYTE_SIZE_LIMIT)
class BufferedShardWriter(io.BufferedWriter):
    """Buffered writer layered on top of a ``RawShardIO`` stream.

    The buffer size is set to ``NODE_BYTE_SIZE_LIMIT``.

    :param client: Zookeeper client handed through to ``RawShardIO``.
    :param path: Base path of the shards, handed through to ``RawShardIO``.
    """

    def __init__(self, client, path):
        raw_stream = RawShardIO(client, path)
        super().__init__(raw_stream, buffer_size=NODE_BYTE_SIZE_LIMIT)
class BufferedShardReader(io.BufferedReader):
    """Buffered reader layered on top of a ``RawShardIO`` stream.

    The buffer size is set to ``NODE_BYTE_SIZE_LIMIT``.

    :param client: Zookeeper client handed through to ``RawShardIO``.
    :param path: Base path of the shards, handed through to ``RawShardIO``.
    """

    def __init__(self, client, path):
        raw_stream = RawShardIO(client, path)
        super().__init__(raw_stream, buffer_size=NODE_BYTE_SIZE_LIMIT)