From ada502db492f9ce0e9efc888fa63ed929376ce7f Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 21 Oct 2024 15:22:13 -0700 Subject: [PATCH] Check zkobject size before writing If we attempt to write a non-sharded zkobject that is larger than 1MiB, zk will drop our connection, and we will retry in an infinite loop. To avoid this, check the data size before writing and raise an exception. This will still likely cause errors in Zuul, but it may limit the blast radius of the damage, and may be more recoverable. ZK has a 1MiB max data size for a znode but that includes some other data like the key (it's really a packet buffer size). To avoid false negatives, we use the same value we use for sharding: 1MB, which gives 47KiB headroom. This should be fine since we either expect objects to be much smaller than 1MiB or sharded. We're just trying to catch the cases where we should have sharded something but didn't. Change-Id: If9da8b3968a2db813895ca5aabb736da652d7f2b --- tests/unit/test_zk.py | 37 +++++++++++++++++++++++++++++++++++++ zuul/zk/sharding.py | 2 ++ 2 files changed, 39 insertions(+) diff --git a/tests/unit/test_zk.py b/tests/unit/test_zk.py index a4092e19fb..924b81e1b1 100644 --- a/tests/unit/test_zk.py +++ b/tests/unit/test_zk.py @@ -18,6 +18,8 @@ from collections import defaultdict import json import math import queue +import random +import string import threading import time import uuid @@ -1946,6 +1948,37 @@ class TestZKObject(ZooKeeperBaseTestCase): pipeline1.foo = 'five' self.assertEqual(pipeline1.foo, 'five') + def _test_zk_object_too_large(self, zkobject_class): + # This produces a consistent string that compresses to > 1MiB + rnd = random.Random() + rnd.seed(42) + size = 1024 * 1200 + foo = ''.join(rnd.choice(string.printable) for x in range(size)) + + stop_event = threading.Event() + self.zk_client.client.create('/zuul/pipeline', makepath=True) + # Create a new object + tenant_name = 'fake_tenant' + with tenant_write_lock(self.zk_client, tenant_name) as lock: + context = ZKContext(self.zk_client, lock, stop_event, self.log) + with testtools.ExpectedException(Exception, + 'ZK data size too large'): + pipeline1 = zkobject_class.new(context, + name=tenant_name, + foo=foo) + + pipeline1 = zkobject_class.new(context, + name=tenant_name, + foo='foo') + + with testtools.ExpectedException(Exception, + 'ZK data size too large'): + pipeline1.updateAttributes(context, foo=foo) + + # Refresh an existing object + pipeline1.refresh(context) + self.assertEqual(pipeline1.foo, 'foo') + def test_zk_object(self): self._test_zk_object(DummyZKObject) @@ -1958,6 +1991,10 @@ class TestZKObject(ZooKeeperBaseTestCase): def test_sharded_zk_object_exception(self): self._test_zk_object_exception(DummyShardedZKObject) + def test_zk_object_too_large(self): + # This only makes sense for a regular zkobject + self._test_zk_object_too_large(DummyZKObject) + class TestBranchCache(ZooKeeperBaseTestCase): def test_branch_cache_protected_then_all(self): diff --git a/zuul/zk/sharding.py b/zuul/zk/sharding.py index 9f2bf70661..de960ebdfa 100644 --- a/zuul/zk/sharding.py +++ b/zuul/zk/sharding.py @@ -67,6 +67,8 @@ class RawZKIO(io.RawIOBase): def write(self, data): byte_count = len(data) + if byte_count > NODE_BYTE_SIZE_LIMIT: + raise Exception(f"ZK data size too large: {byte_count}") start = time.perf_counter() if self.create: _, self.zstat = self.client.create(