From 7e71dc74817fc93c98a6edee500411d3919ff31d Mon Sep 17 00:00:00 2001 From: David Shrewsbury Date: Mon, 16 Oct 2017 16:42:53 -0400 Subject: [PATCH] Harden exception handling around instance deletes It's possible to leave a ZK node in a locked state if deleting an instance fails for some reason. This hardens the logic in the cleanup thread to make sure we always attempt to unlock the node if the delete fails. We have a similar situation in clearing the allocation of nodes whose request has disappeared. Also, s/unallocate/deallocate/ Change-Id: I82d9dcb0f86c3296f2427a315e14fce772f5369a --- nodepool/launcher.py | 49 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/nodepool/launcher.py b/nodepool/launcher.py index bb5234435..cdbd6323f 100755 --- a/nodepool/launcher.py +++ b/nodepool/launcher.py @@ -348,7 +348,7 @@ class CleanupWorker(BaseCleanupWorker): def _resetLostRequest(self, zk_conn, req): ''' - Reset the request state and unallocate nodes. + Reset the request state and deallocate nodes. :param ZooKeeper zk_conn: A ZooKeeper connection object. :param NodeRequest req: The lost NodeRequest object. @@ -364,14 +364,21 @@ class CleanupWorker(BaseCleanupWorker): zk_conn.lockNode(node) except exceptions.ZKLockException: self.log.warning( - "Unable to unallocate node %s from request %s", - node.id, req.id) + "Unable to grab lock to deallocate node %s from " + "request %s", node.id, req.id) return node.allocated_to = None - zk_conn.storeNode(node) + try: + zk_conn.storeNode(node) + self.log.debug("Deallocated node %s for lost request %s", + node.id, req.id) + except Exception: + self.log.exception( + "Unable to deallocate node %s from request %s:", + node.id, req.id) + zk_conn.unlockNode(node) - self.log.debug("Unallocated lost request node %s", node.id) req.state = zk.REQUESTED req.nodes = [] @@ -505,9 +512,18 @@ class CleanupWorker(BaseCleanupWorker): zk_conn.unlockNode(node) continue + self.log.debug("Node %s exceeds max ready age: %s >= %s", + node.id, now - node.state_time, + label.max_ready_age) + # The NodeDeleter thread will unlock and remove the # node from ZooKeeper if it succeeds. - self._deleteInstance(node) + try: + self._deleteInstance(node) + except Exception: + self.log.exception("Failure deleting aged node %s:", + node.id) + zk_conn.unlockNode(node) def _run(self): ''' @@ -566,11 +582,16 @@ class DeletedNodeWorker(BaseCleanupWorker): else: # Double check node conditions after lock if node.state == zk.READY and node.allocated_to: - self.log.debug( - "Unallocating node %s with missing request %s", - node.id, node.allocated_to) node.allocated_to = None - zk_conn.storeNode(node) + try: + zk_conn.storeNode(node) + self.log.debug( + "Deallocated node %s with missing request %s", + node.id, node.allocated_to) + except Exception: + self.log.exception( + "Failed to deallocate node %s for missing " + "request %s:", node.id, node.allocated_to) zk_conn.unlockNode(node) @@ -598,7 +619,13 @@ class DeletedNodeWorker(BaseCleanupWorker): # The NodeDeleter thread will unlock and remove the # node from ZooKeeper if it succeeds. - self._deleteInstance(node) + try: + self._deleteInstance(node) + except Exception: + self.log.exception( + "Failure deleting node %s in cleanup state %s:", + node.id, node.state) + zk_conn.unlockNode(node) def _run(self): try: