Check zkobject size before writing

If we attempt to write a non-sharded zkobject that is larger than
1MiB, zk will drop our connection, and we will retry in an infinite
loop.  To avoid this, check the data size before writing and raise
an exception.  This will still likely cause errors in Zuul, but
it may limit the blast radius of the damage, and may be more
recoverable.

ZK has a 1MiB max data size for a znode but that includes some
other data like the key (it's really a packet buffer size).  To
avoid false negatives, we use the same value we use for sharding:
1MB, which gives 47KiB headroom.

This should be fine since we either expect objects to be much smaller
than 1MiB or sharded.  We're just trying to catch the cases where
we should have sharded something but didn't.

Change-Id: If9da8b3968a2db813895ca5aabb736da652d7f2b
This commit is contained in:
James E. Blair
2024-10-21 15:22:13 -07:00
parent 493c13cb77
commit ada502db49
2 changed files with 39 additions and 0 deletions
+37
View File
@@ -18,6 +18,8 @@ from collections import defaultdict
import json
import math
import queue
import random
import string
import threading
import time
import uuid
@@ -1946,6 +1948,37 @@ class TestZKObject(ZooKeeperBaseTestCase):
pipeline1.foo = 'five'
self.assertEqual(pipeline1.foo, 'five')
def _test_zk_object_too_large(self, zkobject_class):
# This produces a consistent string that compresses to > 1MiB
rnd = random.Random()
rnd.seed(42)
size = 1024 * 1200
foo = ''.join(rnd.choice(string.printable) for x in range(size))
stop_event = threading.Event()
self.zk_client.client.create('/zuul/pipeline', makepath=True)
# Create a new object
tenant_name = 'fake_tenant'
with tenant_write_lock(self.zk_client, tenant_name) as lock:
context = ZKContext(self.zk_client, lock, stop_event, self.log)
with testtools.ExpectedException(Exception,
'ZK data size too large'):
pipeline1 = zkobject_class.new(context,
name=tenant_name,
foo=foo)
pipeline1 = zkobject_class.new(context,
name=tenant_name,
foo='foo')
with testtools.ExpectedException(Exception,
'ZK data size too large'):
pipeline1.updateAttributes(context, foo=foo)
# Refresh an existing object
pipeline1.refresh(context)
self.assertEqual(pipeline1.foo, 'foo')
def test_zk_object(self):
self._test_zk_object(DummyZKObject)
@@ -1958,6 +1991,10 @@ class TestZKObject(ZooKeeperBaseTestCase):
def test_sharded_zk_object_exception(self):
self._test_zk_object_exception(DummyShardedZKObject)
def test_zk_object_too_large(self):
# This only makes sense for a regular zkobject
self._test_zk_object_too_large(DummyZKObject)
class TestBranchCache(ZooKeeperBaseTestCase):
def test_branch_cache_protected_then_all(self):
+2
View File
@@ -67,6 +67,8 @@ class RawZKIO(io.RawIOBase):
def write(self, data):
byte_count = len(data)
if byte_count > NODE_BYTE_SIZE_LIMIT:
raise Exception(f"ZK data size too large: {byte_count}")
start = time.perf_counter()
if self.create:
_, self.zstat = self.client.create(