From b3053779a676b2deb23eaf2df6832d3491932bf8 Mon Sep 17 00:00:00 2001
From: "James E. Blair"
Date: Tue, 4 Dec 2018 08:14:43 -0800
Subject: [PATCH] Fix race when deleting Node znodes

There is a race where:

  Thread A                          Thread B
  --------                          --------
  locks the node
  deletes the instance
  deletes the lock
                                    locks the node
  deletes the node data
                                    loads empty data from the node

Thread A only deletes the node data because, during its recursive
delete, a new child node (the lock from B) has appeared.  Thread B
proceeds with invalid data and errors out.

We can detect this condition because the node has no data.  It may be
theoretically possible to lock the node and load the data before the
node data are deleted as well, so to protect against this case, we
set a new node state, "deleted", before we start deleting anything.
If thread B encounters either of those two conditions (no data, or a
"deleted" state), we know we've hit this race and can safely attempt
to recursively delete the node again.

Change-Id: Iea5558f9eb471cf1096120b06c098f8f41ab59d9
---
 nodepool/launcher.py | 21 +++++++++++++++++++++
 nodepool/zk.py       | 12 +++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/nodepool/launcher.py b/nodepool/launcher.py
index 6f2c39c64..e881ec275 100755
--- a/nodepool/launcher.py
+++ b/nodepool/launcher.py
@@ -704,6 +704,27 @@ class DeletedNodeWorker(BaseCleanupWorker):
                 except exceptions.ZKLockException:
                     continue
 
+                if (node.state == zk.DELETED or
+                        node.provider is None):
+                    # The node has been deleted out from under us --
+                    # we only obtained the lock because in the
+                    # recursive delete, the lock is deleted first and
+                    # we locked the node between the time of the lock
+                    # delete and the node delete.  We need to clean up
+                    # the mess.
+                    try:
+                        # This should delete the lock as well
+                        zk_conn.deleteNode(node)
+                    except Exception:
+                        self.log.exception(
+                            "Error deleting already deleted znode:")
+                        try:
+                            zk_conn.unlockNode(node)
+                        except Exception:
+                            self.log.exception(
+                                "Error unlocking already deleted znode:")
+                    continue
+
                 # Double check the state now that we have a lock since it
                 # may have changed on us.
                 if node.state not in cleanup_states:
diff --git a/nodepool/zk.py b/nodepool/zk.py
index ca02502ae..cfdb6df45 100755
--- a/nodepool/zk.py
+++ b/nodepool/zk.py
@@ -57,6 +57,8 @@ INIT = 'init'
 # Aborted due to a transient error like overquota that should not count as a
 # failed launch attempt
 ABORTED = 'aborted'
+# The node has actually been deleted and the Znode should be deleted
+DELETED = 'deleted'
 
 
 # NOTE(Shrews): Importing this from nodepool.config causes an import error
@@ -503,7 +505,8 @@ class Node(BaseModel):
     Class representing a launched node.
     '''
     VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED,
-                        HOLD, DELETING, FAILED, INIT, ABORTED])
+                        HOLD, DELETING, FAILED, INIT, ABORTED,
+                        DELETED])
 
     def __init__(self, id=None):
         super(Node, self).__init__(id)
@@ -1889,6 +1892,13 @@ class ZooKeeper(object):
             return
 
         path = self._nodePath(node.id)
+
+        # Set the node state to deleted before we start deleting
+        # anything so that we can detect a race condition where the
+        # lock is removed before the node deletion occurs.
+        node.state = DELETED
+        self.client.set(path, node.serialize())
+
         try:
             self.client.delete(path, recursive=True)
         except kze.NoNodeError:
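
Below is a minimal standalone sketch of the pattern this patch
implements, written against kazoo directly.  It is illustrative only:
the JSON serialization and the helper names (delete_node,
handle_locked_node) are assumptions made for the sketch, not the
nodepool API.

    import json

    from kazoo.client import KazooClient
    from kazoo.exceptions import NoNodeError

    DELETED = 'deleted'

    # Example wiring (assumes a local ZooKeeper):
    #   client = KazooClient(hosts='127.0.0.1:2181')
    #   client.start()

    def delete_node(client, path, data):
        # Mark the znode as deleted before removing anything, so a
        # competing thread that slips in and locks the node can tell
        # that a delete is already in progress.
        data['state'] = DELETED
        client.set(path, json.dumps(data).encode('utf8'))
        try:
            # The recursive delete removes the lock child first, then
            # the node itself -- the window in which the race occurs.
            client.delete(path, recursive=True)
        except NoNodeError:
            pass

    def handle_locked_node(client, path):
        # Called after acquiring the node's lock.  Empty data or a
        # 'deleted' state means we locked a node mid-delete: finish
        # the recursive delete rather than acting on stale data.
        raw, _stat = client.get(path)
        if not raw or json.loads(raw).get('state') == DELETED:
            try:
                client.delete(path, recursive=True)
            except NoNodeError:
                pass
            return None
        return json.loads(raw)

Writing the "deleted" state before any children are removed is what
closes the second, narrower window described in the commit message: a
thread that locks the node and reads the data before the data is
cleared still sees an unambiguous signal that the node is being torn
down.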