From 4c00f6a72d2931b7060bd59101526d2239ca508e Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Tue, 13 Aug 2024 08:55:49 -0700 Subject: [PATCH] Avoid recursive delete in forceUnlockNode The forceUnlockNode method has exactly one call site: unlocking the backing node within the metastatic driver. This is because the metastatic driver acquires a non-ephemeral lock (so that the lock is held even if the launcher is restarted). The Kazoo lock recipe does not have a facility for unlocking a non-ephemeral lock once the original lock python object is lost. That means we need to remove the lock znode ourselves. That's why we have a forceUnlockNode method. This method previously performed a simple recursive delete on the lock path, which would delete all of the lock contenders including the one holding it (typically, this would be the only contender) and also the lock path znode itself. This is a simple, brute-force method of handling it. However, it is subject to a race condition where two actors might: A] call forceUnlockNode A] delete node/lock/contender0 B] create node/lock/contender1 A] delete node/lock And the last call would fail because the lock znode was no longer empty. To address this, instead of deleting the entire lock znode, we will delete only the node which actually has the lock. Assuming we're dealing with a standard exclusive lock, this is the first contender (the node with the lowest id). The updated method merely deletes this lock. This avoids the race condition above. It also makes things less disruptive for other actors which may be trying to acquire the lock (in that we will no longer delete their lock contender znode). Change-Id: I8c1c3a3fe7f14eb605b19f7ede4931c38c67393d --- nodepool/tests/unit/test_zk.py | 19 ++++++++++++++++++ nodepool/zk/zookeeper.py | 36 ++++++++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/nodepool/tests/unit/test_zk.py b/nodepool/tests/unit/test_zk.py index 9ec621fe0..c5dab9458 100644 --- a/nodepool/tests/unit/test_zk.py +++ b/nodepool/tests/unit/test_zk.py @@ -700,6 +700,25 @@ class TestZooKeeper(tests.DBTestCase): with testtools.ExpectedException(npe.ZKLockException): self.zk.unlockNode(node) + def test_forceUnlockNode(self): + node = zk.Node('100') + self.zk.lockNode(node, ephemeral=False) + self.assertIsNotNone( + self.zk.kazoo_client.exists(self.zk._nodeLockPath(node.id)) + ) + lock_path = self.zk._nodeLockPath(node.id) + children = self.zk.kazoo_client.get_children(lock_path) + self.assertEqual(1, len(children)) + + # Create a node that alpha sorts before the real lock + fake_path = f'{lock_path}/aaaa' + self.zk.kazoo_client.create(fake_path, sequence=True) + + self.zk.forceUnlockNode(node) + + children = self.zk.kazoo_client.get_children(lock_path) + self.assertEqual(['aaaa0000000001'], children) + def _create_node(self): node = zk.Node() node.state = zk.BUILDING diff --git a/nodepool/zk/zookeeper.py b/nodepool/zk/zookeeper.py index 3f4b00dbe..4a3339c48 100644 --- a/nodepool/zk/zookeeper.py +++ b/nodepool/zk/zookeeper.py @@ -19,6 +19,7 @@ import queue import threading import time import uuid +import re from kazoo import exceptions as kze from kazoo.recipe.lock import Lock @@ -2528,15 +2529,42 @@ class ZooKeeper(ZooKeeperBase): node.lock = None node._thread_lock.release() + contenders_re = re.compile(r'^.*?(\d{10})$') + def forceUnlockNode(self, node): - ''' - Forcibly unlock a node. + '''Forcibly unlock a node. + + This assumes that we are only using a plain exclusive kazoo + Lock recipe (no read/write locks). :param Node node: The node to unlock. + ''' - path = self._nodeLockPath(node.id) + + # getNodeLockContenders returns the identifiers but we need + # the path, so this simplified approach just gets the lowest + # sequence node, which is the lock holder (as long as this is + # a plain exclusive lock). + lock_path = self._nodeLockPath(node.id) + contenders = {} try: - self.kazoo_client.delete(path, recursive=True) + for child in self.kazoo_client.get_children(lock_path): + m = self.contenders_re.match(child) + if m: + contenders[m.group(1)] = child + except kze.NoNodeError: + pass + + if not contenders: + return + + key = sorted(contenders.keys())[0] + lock_id = contenders[key] + + lock_path = self._nodeLockPath(node.id) + path = f'{lock_path}/{lock_id}' + try: + self.kazoo_client.delete(path) except kze.NoNodeError: pass