Use kazoo.retry in zkobject
It appears that the division of retryable/non-retryable exceptions is not exactly as asserted in the comments in the retry handlers in zkobject. The kazoo.retry module has a class that implements retry handling with the correct set of retryable exceptions, so let's use that instead. The main thing we lose here is the log messages indicating we are in a retry loop. There doesn't appear to be a good way to hook into KazooRetry for that (we could log in the interrupt method, but that is called every 0.1 seconds). The InvalidObjectError exception appears to be unused, so it is removed (since continuing to use it would make the exception handling more complex). Change-Id: I1278cd27873374b4efd90504d7166c74c6057b52
This commit is contained in:
@ -53,7 +53,7 @@ from tests.base import (
|
||||
from zuul.zk.zkobject import ShardedZKObject, ZKObject, ZKContext
|
||||
from zuul.zk.locks import tenant_write_lock
|
||||
|
||||
from kazoo.exceptions import KazooException, ZookeeperError
|
||||
from kazoo.exceptions import ZookeeperError, OperationTimeoutError
|
||||
|
||||
|
||||
class ZooKeeperBaseTestCase(BaseTestCase):
|
||||
@ -1603,13 +1603,13 @@ class TestZKObject(ZooKeeperBaseTestCase):
|
||||
def delete(self, *args, **kw):
|
||||
self.count += 1
|
||||
if self.count < 2:
|
||||
raise KazooException()
|
||||
raise OperationTimeoutError()
|
||||
return self._real_client.delete(*args, **kw)
|
||||
|
||||
def set(self, *args, **kw):
|
||||
self.count += 1
|
||||
if self.count < 2:
|
||||
raise KazooException()
|
||||
raise OperationTimeoutError()
|
||||
return self._real_client.set(*args, **kw)
|
||||
|
||||
# Fail an update
|
||||
|
@ -31,7 +31,7 @@ import textwrap
|
||||
import types
|
||||
import itertools
|
||||
|
||||
from kazoo.exceptions import NodeExistsError, NoNodeError, ZookeeperError
|
||||
from kazoo.exceptions import NodeExistsError, NoNodeError
|
||||
from cachetools.func import lru_cache
|
||||
|
||||
from zuul.lib import yamlutil as yaml
|
||||
@ -801,24 +801,8 @@ class PipelineChangeList(zkobject.ShardedZKObject):
|
||||
)
|
||||
|
||||
def refresh(self, context):
|
||||
# See comment above about reading without a lock.
|
||||
retry_count = 0
|
||||
max_retries = 5
|
||||
while context.sessionIsValid():
|
||||
try:
|
||||
super().refresh(context)
|
||||
return
|
||||
except ZookeeperError:
|
||||
# These errors come from the server and are not
|
||||
# retryable. Connection errors are KazooExceptions so
|
||||
# they aren't caught here and we will retry.
|
||||
raise
|
||||
except Exception:
|
||||
context.log.error("Failed to refresh change list")
|
||||
if retry_count >= max_retries:
|
||||
raise
|
||||
retry_count += 1
|
||||
time.sleep(self._retry_interval)
|
||||
self._retry(context, super().refresh,
|
||||
context, max_tries=5)
|
||||
|
||||
def getPath(self):
|
||||
return self.getChangeListPath(self.pipeline)
|
||||
|
@ -29,7 +29,3 @@ class NoClientException(ZuulZooKeeperException):
|
||||
|
||||
class JobRequestNotFound(ZuulZooKeeperException):
|
||||
pass
|
||||
|
||||
|
||||
class InvalidObjectError(ZuulZooKeeperException):
|
||||
pass
|
||||
|
@ -18,11 +18,10 @@ import time
|
||||
import types
|
||||
import zlib
|
||||
|
||||
from kazoo.exceptions import (
|
||||
KazooException, NodeExistsError, NoNodeError, ZookeeperError)
|
||||
from kazoo.exceptions import NodeExistsError, NoNodeError
|
||||
from kazoo.retry import KazooRetry
|
||||
|
||||
from zuul.zk import sharding
|
||||
from zuul.zk.exceptions import InvalidObjectError
|
||||
|
||||
|
||||
class ZKContext:
|
||||
@ -44,6 +43,9 @@ class ZKContext:
|
||||
return ((not self.lock or self.lock.is_still_valid()) and
|
||||
(not self.stop_event or not self.stop_event.is_set()))
|
||||
|
||||
def sessionIsInvalid(self):
|
||||
return not self.sessionIsValid()
|
||||
|
||||
|
||||
class LocalZKContext:
|
||||
"""A Local ZKContext that means don't actually write anything to ZK"""
|
||||
@ -57,6 +59,9 @@ class LocalZKContext:
|
||||
def sessionIsValid(self):
|
||||
return True
|
||||
|
||||
def sessionIsInvalid(self):
|
||||
return False
|
||||
|
||||
|
||||
class ZKObject:
|
||||
_retry_interval = 5
|
||||
@ -175,22 +180,16 @@ class ZKObject:
|
||||
|
||||
def delete(self, context):
|
||||
path = self.getPath()
|
||||
while context.sessionIsValid():
|
||||
try:
|
||||
context.client.delete(path, recursive=True)
|
||||
return
|
||||
except ZookeeperError:
|
||||
# These errors come from the server and are not
|
||||
# retryable. Connection errors are KazooExceptions so
|
||||
# they aren't caught here and we will retry.
|
||||
context.log.error(
|
||||
"Exception deleting ZKObject %s at %s", self, path)
|
||||
raise
|
||||
except KazooException:
|
||||
context.log.exception(
|
||||
"Exception deleting ZKObject %s, will retry", self)
|
||||
time.sleep(self._retry_interval)
|
||||
raise Exception("ZooKeeper session or lock not valid")
|
||||
if context.sessionIsInvalid():
|
||||
raise Exception("ZooKeeper session or lock not valid")
|
||||
try:
|
||||
self._retry(context, context.client.delete,
|
||||
path, recursive=True)
|
||||
return
|
||||
except Exception:
|
||||
context.log.error(
|
||||
"Exception deleting ZKObject %s at %s", self, path)
|
||||
raise
|
||||
|
||||
def estimateDataSize(self, seen=None):
|
||||
"""Attempt to find all ZKObjects below this one and sum their
|
||||
@ -234,96 +233,98 @@ class ZKObject:
|
||||
|
||||
# Private methods below
|
||||
|
||||
def _retry(self, context, func, *args, max_tries=-1, **kw):
|
||||
kazoo_retry = KazooRetry(max_tries=max_tries,
|
||||
interrupt=context.sessionIsInvalid,
|
||||
delay=self._retry_interval, backoff=0,
|
||||
ignore_expire=False)
|
||||
try:
|
||||
return kazoo_retry(func, *args, **kw)
|
||||
except InterruptedError:
|
||||
pass
|
||||
|
||||
def __init__(self):
|
||||
# Don't support any arguments in constructor to force us to go
|
||||
# through a save or restore path.
|
||||
super().__init__()
|
||||
self._set(_active_context=None)
|
||||
|
||||
@staticmethod
|
||||
def _retryableLoad(context, path):
|
||||
start = time.perf_counter()
|
||||
compressed_data, zstat = context.client.get(path)
|
||||
context.cumulative_read_time += time.perf_counter() - start
|
||||
context.cumulative_read_objects += 1
|
||||
context.cumulative_read_znodes += 1
|
||||
context.cumulative_read_bytes += len(compressed_data)
|
||||
return compressed_data, zstat
|
||||
|
||||
def _load(self, context, path=None):
|
||||
if path is None:
|
||||
path = self.getPath()
|
||||
while context.sessionIsValid():
|
||||
try:
|
||||
start = time.perf_counter()
|
||||
compressed_data, zstat = context.client.get(path)
|
||||
context.cumulative_read_time += time.perf_counter() - start
|
||||
context.cumulative_read_objects += 1
|
||||
context.cumulative_read_znodes += 1
|
||||
context.cumulative_read_bytes += len(compressed_data)
|
||||
if context.sessionIsInvalid():
|
||||
raise Exception("ZooKeeper session or lock not valid")
|
||||
try:
|
||||
compressed_data, zstat = self._retry(context, self._retryableLoad,
|
||||
context, path)
|
||||
except Exception:
|
||||
context.log.error(
|
||||
"Exception loading ZKObject %s at %s", self, path)
|
||||
raise
|
||||
self._set(_zkobject_hash=None)
|
||||
try:
|
||||
data = zlib.decompress(compressed_data)
|
||||
except zlib.error:
|
||||
# Fallback for old, uncompressed data
|
||||
data = compressed_data
|
||||
self._set(**self.deserialize(data, context))
|
||||
self._set(_zstat=zstat,
|
||||
_zkobject_hash=hash(data),
|
||||
_zkobject_compressed_size=len(compressed_data),
|
||||
_zkobject_uncompressed_size=len(data),
|
||||
)
|
||||
|
||||
self._set(_zkobject_hash=None)
|
||||
try:
|
||||
data = zlib.decompress(compressed_data)
|
||||
except zlib.error:
|
||||
# Fallback for old, uncompressed data
|
||||
data = compressed_data
|
||||
self._set(**self.deserialize(data, context))
|
||||
self._set(_zstat=zstat,
|
||||
_zkobject_hash=hash(data),
|
||||
_zkobject_compressed_size=len(compressed_data),
|
||||
_zkobject_uncompressed_size=len(data),
|
||||
)
|
||||
return
|
||||
except ZookeeperError:
|
||||
# These errors come from the server and are not
|
||||
# retryable. Connection errors are KazooExceptions so
|
||||
# they aren't caught here and we will retry.
|
||||
context.log.error(
|
||||
"Exception loading ZKObject %s at %s", self, path)
|
||||
raise
|
||||
except KazooException:
|
||||
context.log.exception(
|
||||
"Exception loading ZKObject %s at %s, will retry",
|
||||
self, path)
|
||||
time.sleep(5)
|
||||
except Exception:
|
||||
# A higher level must handle this exception, but log
|
||||
# ourself here so we know what object triggered it.
|
||||
context.log.error(
|
||||
"Exception loading ZKObject %s at %s", self, path)
|
||||
raise
|
||||
raise Exception("ZooKeeper session or lock not valid")
|
||||
@staticmethod
|
||||
def _retryableSave(context, create, path, compressed_data, version):
|
||||
start = time.perf_counter()
|
||||
if create:
|
||||
real_path, zstat = context.client.create(
|
||||
path, compressed_data, makepath=True,
|
||||
include_data=True)
|
||||
else:
|
||||
zstat = context.client.set(path, compressed_data,
|
||||
version=version)
|
||||
context.cumulative_write_time += time.perf_counter() - start
|
||||
context.cumulative_write_objects += 1
|
||||
context.cumulative_write_znodes += 1
|
||||
context.cumulative_write_bytes += len(compressed_data)
|
||||
return zstat
|
||||
|
||||
def _save(self, context, data, create=False):
|
||||
if isinstance(context, LocalZKContext):
|
||||
return
|
||||
path = self.getPath()
|
||||
while context.sessionIsValid():
|
||||
try:
|
||||
compressed_data = zlib.compress(data)
|
||||
start = time.perf_counter()
|
||||
if create:
|
||||
real_path, zstat = context.client.create(
|
||||
path, compressed_data, makepath=True,
|
||||
include_data=True)
|
||||
else:
|
||||
zstat = context.client.set(path, compressed_data,
|
||||
version=self._zstat.version)
|
||||
context.cumulative_write_time += time.perf_counter() - start
|
||||
context.cumulative_write_objects += 1
|
||||
context.cumulative_write_znodes += 1
|
||||
context.cumulative_write_bytes += len(compressed_data)
|
||||
if context.sessionIsInvalid():
|
||||
raise Exception("ZooKeeper session or lock not valid")
|
||||
compressed_data = zlib.compress(data)
|
||||
|
||||
self._set(_zstat=zstat,
|
||||
_zkobject_hash=hash(data),
|
||||
_zkobject_compressed_size=len(compressed_data),
|
||||
_zkobject_uncompressed_size=len(data),
|
||||
)
|
||||
return
|
||||
except ZookeeperError:
|
||||
# These errors come from the server and are not
|
||||
# retryable. Connection errors are KazooExceptions so
|
||||
# they aren't caught here and we will retry.
|
||||
context.log.error(
|
||||
"Exception saving ZKObject %s at %s", self, path)
|
||||
raise
|
||||
except KazooException:
|
||||
context.log.exception(
|
||||
"Exception saving ZKObject %s at %s, will retry",
|
||||
self, path)
|
||||
time.sleep(self._retry_interval)
|
||||
raise Exception("ZooKeeper session or lock not valid")
|
||||
try:
|
||||
if hasattr(self, '_zstat'):
|
||||
version = self._zstat.version
|
||||
else:
|
||||
version = None
|
||||
zstat = self._retry(context, self._retryableSave,
|
||||
context, create, path, compressed_data,
|
||||
version)
|
||||
except Exception:
|
||||
context.log.error(
|
||||
"Exception saving ZKObject %s at %s", self, path)
|
||||
raise
|
||||
self._set(_zstat=zstat,
|
||||
_zkobject_hash=hash(data),
|
||||
_zkobject_compressed_size=len(compressed_data),
|
||||
_zkobject_uncompressed_size=len(data),
|
||||
)
|
||||
|
||||
def __setattr__(self, name, value):
|
||||
if self._active_context:
|
||||
@ -346,91 +347,73 @@ class ShardedZKObject(ZKObject):
|
||||
# expected. Don't delete them in that case.
|
||||
delete_on_error = True
|
||||
|
||||
@staticmethod
|
||||
def _retryableLoad(context, path):
|
||||
with sharding.BufferedShardReader(context.client, path) as stream:
|
||||
data = stream.read()
|
||||
compressed_size = stream.compressed_bytes_read
|
||||
context.cumulative_read_time += stream.cumulative_read_time
|
||||
context.cumulative_read_objects += 1
|
||||
context.cumulative_read_znodes += stream.znodes_read
|
||||
context.cumulative_read_bytes += compressed_size
|
||||
if not data and context.client.exists(path) is None:
|
||||
raise NoNodeError
|
||||
return data, compressed_size
|
||||
|
||||
def _load(self, context, path=None):
|
||||
if path is None:
|
||||
path = self.getPath()
|
||||
while context.sessionIsValid():
|
||||
try:
|
||||
self._set(_zkobject_hash=None)
|
||||
with sharding.BufferedShardReader(
|
||||
context.client, path) as stream:
|
||||
data = stream.read()
|
||||
compressed_size = stream.compressed_bytes_read
|
||||
context.cumulative_read_time += \
|
||||
stream.cumulative_read_time
|
||||
context.cumulative_read_objects += 1
|
||||
context.cumulative_read_znodes += \
|
||||
stream.znodes_read
|
||||
context.cumulative_read_bytes += compressed_size
|
||||
if context.sessionIsInvalid():
|
||||
raise Exception("ZooKeeper session or lock not valid")
|
||||
try:
|
||||
self._set(_zkobject_hash=None)
|
||||
data, compressed_size = self._retry(context, self._retryableLoad,
|
||||
context, path)
|
||||
self._set(**self.deserialize(data, context))
|
||||
self._set(_zkobject_hash=hash(data),
|
||||
_zkobject_compressed_size=compressed_size,
|
||||
_zkobject_uncompressed_size=len(data),
|
||||
)
|
||||
except Exception:
|
||||
# A higher level must handle this exception, but log
|
||||
# ourself here so we know what object triggered it.
|
||||
context.log.error(
|
||||
"Exception loading ZKObject %s at %s", self, path)
|
||||
if self.delete_on_error:
|
||||
self.delete(context)
|
||||
raise
|
||||
|
||||
if not data and context.client.exists(path) is None:
|
||||
raise NoNodeError
|
||||
self._set(**self.deserialize(data, context))
|
||||
self._set(_zkobject_hash=hash(data),
|
||||
_zkobject_compressed_size=compressed_size,
|
||||
_zkobject_uncompressed_size=len(data),
|
||||
)
|
||||
return
|
||||
except ZookeeperError:
|
||||
# These errors come from the server and are not
|
||||
# retryable. Connection errors are KazooExceptions so
|
||||
# they aren't caught here and we will retry.
|
||||
context.log.error(
|
||||
"Exception loading ZKObject %s at %s", self, path)
|
||||
raise
|
||||
except KazooException:
|
||||
context.log.exception(
|
||||
"Exception loading ZKObject %s at %s, will retry",
|
||||
self, path)
|
||||
time.sleep(5)
|
||||
except Exception as exc:
|
||||
# A higher level must handle this exception, but log
|
||||
# ourself here so we know what object triggered it.
|
||||
context.log.error(
|
||||
"Exception loading ZKObject %s at %s", self, path)
|
||||
if self.delete_on_error:
|
||||
self.delete(context)
|
||||
raise InvalidObjectError from exc
|
||||
raise Exception("ZooKeeper session or lock not valid")
|
||||
@staticmethod
|
||||
def _retryableSave(context, path, data):
|
||||
with sharding.BufferedShardWriter(context.client, path) as stream:
|
||||
stream.truncate(0)
|
||||
stream.write(data)
|
||||
stream.flush()
|
||||
compressed_size = stream.compressed_bytes_written
|
||||
context.cumulative_write_time += stream.cumulative_write_time
|
||||
context.cumulative_write_objects += 1
|
||||
context.cumulative_write_znodes += stream.znodes_written
|
||||
context.cumulative_write_bytes += compressed_size
|
||||
return compressed_size
|
||||
|
||||
def _save(self, context, data, create=False):
|
||||
if isinstance(context, LocalZKContext):
|
||||
return
|
||||
path = self.getPath()
|
||||
while context.sessionIsValid():
|
||||
try:
|
||||
if (create and
|
||||
not self.truncate_on_create and
|
||||
context.client.exists(path) is not None):
|
||||
if context.sessionIsInvalid():
|
||||
raise Exception("ZooKeeper session or lock not valid")
|
||||
try:
|
||||
if create and not self.truncate_on_create:
|
||||
exists = self._retry(context, context.client.exists, path)
|
||||
if exists is not None:
|
||||
raise NodeExistsError
|
||||
with sharding.BufferedShardWriter(
|
||||
context.client, path) as stream:
|
||||
stream.truncate(0)
|
||||
stream.write(data)
|
||||
stream.flush()
|
||||
compressed_size = stream.compressed_bytes_written
|
||||
context.cumulative_write_time += \
|
||||
stream.cumulative_write_time
|
||||
context.cumulative_write_objects += 1
|
||||
context.cumulative_write_znodes += \
|
||||
stream.znodes_written
|
||||
context.cumulative_write_bytes += compressed_size
|
||||
|
||||
self._set(_zkobject_hash=hash(data),
|
||||
_zkobject_compressed_size=compressed_size,
|
||||
_zkobject_uncompressed_size=len(data),
|
||||
)
|
||||
return
|
||||
except ZookeeperError:
|
||||
# These errors come from the server and are not
|
||||
# retryable. Connection errors are KazooExceptions so
|
||||
# they aren't caught here and we will retry.
|
||||
context.log.error(
|
||||
"Exception saving ZKObject %s at %s", self, path)
|
||||
raise
|
||||
except KazooException:
|
||||
context.log.exception(
|
||||
"Exception saving ZKObject %s at %s, will retry",
|
||||
self, path)
|
||||
time.sleep(self._retry_interval)
|
||||
raise Exception("ZooKeeper session or lock not valid")
|
||||
compressed_size = self._retry(context, self._retryableSave,
|
||||
context, path, data)
|
||||
self._set(_zkobject_hash=hash(data),
|
||||
_zkobject_compressed_size=compressed_size,
|
||||
_zkobject_uncompressed_size=len(data),
|
||||
)
|
||||
except Exception:
|
||||
context.log.error(
|
||||
"Exception saving ZKObject %s at %s", self, path)
|
||||
raise
|
||||
|
Reference in New Issue
Block a user