From 0dfb4752ee93da37b7a890d28261ab894ea9038f Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Wed, 26 Feb 2025 13:59:29 -0800 Subject: [PATCH] Retry lock cleanup If one actor (thread, scheduler, etc) is holding a lock, and another attempts a non-blocking acquisition (eg, a pipeline processing lock), the second actor can encounter the following sequence: 1) create lock contender 2) see that lock contender did not win the election 3) connection transitions to suspended 4) attempt to delete lock contender Steps 1 and 2 happen in a kazoo retry handler, so if the connection is suspended, the attempt will retry. Step 4 is not performed in a retry handler; in fact the implementation method is named "_best_effort_cleanup". Making a best effort at cleanup would be okay for a blocking acquisition with no timeout since we would always succeed, but it is not enough for a non-blocking acquisition. We must delete our lock contender if the connection is suspended (not expired) because if it resumes, our lock contender will later win the election, after it is orphaned. If the connection does expire, then it doesn't matter, the ephemeral lock would be released. The current best effort method also suppresses the KazooException, so when this does happen, there is no record of it in our logs, making this difficult to identify. To correct the behavior, we will retry the lock cleanup indefinitely, so we can be assured that even if the connection is suspended, we will delete our lock contender. See also https://github.com/python-zk/kazoo/issues/732 and https://github.com/python-zk/kazoo/pull/760 Change-Id: Ieb970373735557bfccdf20fee27102de660052de --- zuul/zk/locks.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/zuul/zk/locks.py b/zuul/zk/locks.py index 1a02ed4957..33522c1a03 100644 --- a/zuul/zk/locks.py +++ b/zuul/zk/locks.py @@ -101,6 +101,22 @@ class SessionAwareMixin: raise Exception("Watch not started") return name in self._zuul_seen_contender_names + # This is a kazoo recipe bugfix included here for convenience, but + # otherwise is not required for the main purpose of this class. + # https://github.com/python-zk/kazoo/issues/732 + def _best_effort_cleanup(self): + self._retry( + self._inner_best_effort_cleanup, + ) + + def _inner_best_effort_cleanup(self): + node = self.node or self._find_node() + if node: + try: + self._delete_node(node) + except NoNodeError: + pass + class SessionAwareLock(SessionAwareMixin, Lock): pass