From 975294b89d5a194dac58849441389dd2e1b992d0 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Thu, 28 Oct 2021 10:53:36 -0700 Subject: [PATCH] Use ZK transactions when deleting job request locks If the lostRequest method is running right as a request was being deleted, it's possible for the following to occur: 1] lostRequests gets a cached list of requests that are running 2] a request node is removed 2] the request lock node is removed, but the parent node is not yet removed 1] isLocked is called 1] isLocked sees the parent node 2] the child node is removed 1] isLocked creates a lock to check if there are children and therefore causes the parent node not to be deleted by thread #2 1] isLocked returns false This would not be possible if lock deletions were atomic, so attempt to make them so by using a ZK transaction to delete the lock. Since requests are deleted while holding the lock, making the lock deletion atomic should avoid the race when checking the lock. Change-Id: I1fc15870c55ca20a586a78c2e9ea3a3bfb58200b --- zuul/zk/job_request_queue.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/zuul/zk/job_request_queue.py b/zuul/zk/job_request_queue.py index de2893256c..058bdcc13a 100644 --- a/zuul/zk/job_request_queue.py +++ b/zuul/zk/job_request_queue.py @@ -329,12 +329,7 @@ class JobRequestQueue(ZooKeeperSimpleBase): self.clearParams(request) except NoNodeError: pass - try: - # Delete the lock parent node as well - path = "/".join([self.LOCK_ROOT, request.uuid]) - self.kazoo_client.delete(path, recursive=True) - except NoNodeError: - pass + self._deleteLock(request.uuid) # We use child nodes here so that we don't need to lock the # request node. @@ -414,12 +409,22 @@ class JobRequestQueue(ZooKeeperSimpleBase): # We may have just re-created the lock parent node just after the # scheduler deleted it; therefore we should (re-) delete it. + self._deleteLock(request.uuid) + + def _deleteLock(self, uuid): + # Recursively delete the children and the lock parent node. + path = "/".join([self.LOCK_ROOT, uuid]) try: - # Delete the lock parent node as well. - path = "/".join([self.LOCK_ROOT, request.uuid]) - self.kazoo_client.delete(path, recursive=True) + children = self.kazoo_client.get_children(path) except NoNodeError: - pass + # The lock is apparently already gone. + return + tr = self.kazoo_client.transaction() + for child in children: + tr.delete("/".join([path, child])) + tr.delete(path) + # We don't care about the results + tr.commit() def unlock(self, request): if request.lock is None: