From 31c276c23485971f6d1e024fe865d5f10ea72877 Mon Sep 17 00:00:00 2001
From: Tobias Henkel
Date: Tue, 23 Jan 2018 19:43:10 +0100
Subject: [PATCH] Fix relaunch attempts when hitting quota errors

The quota calculations in nodepool can never be perfectly accurate
because there can still be races with other launchers in the tenant or
with other workloads sharing the tenant. So we must be able to
gracefully handle the case when the cloud refuses a launch because of
quota.

Currently we just invalidate the quota cache and immediately try to
launch the node again. In most cases that fails again, which makes the
node request, and thus the zuul job, fail with NODE_FAILURE.

Instead we now mark the node as aborted, which is not a failure
indicator, and pause the handler just as we do when our own quota
calculation predicts a shortage. The handler then automatically
reschedules a new node launch as soon as the quota calculation allows
it.

Change-Id: I122268dc7723901c04a58fa6f5c1688a3fdab227
---
 nodepool/driver/__init__.py          | 28 ++++++++++++++++++++-
 nodepool/driver/openstack/handler.py | 13 ++++++----
 nodepool/driver/utils.py             | 11 +++++++++
 nodepool/exceptions.py               |  4 +++
 nodepool/launcher.py                 |  4 ++-
 nodepool/tests/test_launcher.py      | 37 +++++++++++++++-------------
 nodepool/zk.py                       |  5 +++-
 7 files changed, 77 insertions(+), 25 deletions(-)

diff --git a/nodepool/driver/__init__.py b/nodepool/driver/__init__.py
index 7383b2534..5ae55c83e 100644
--- a/nodepool/driver/__init__.py
+++ b/nodepool/driver/__init__.py
@@ -300,6 +300,15 @@ class LabelRecorder(object):
         self.data.remove({'label': label, 'node_id': node_id})
         return node_id
 
+    def removeNode(self, id):
+        '''
+        Remove the node with the specified ID.
+        '''
+        for d in self.data:
+            if d['node_id'] == id:
+                self.data.remove(d)
+                return
+
 
 class NodeRequestHandlerNotifications(object):
     """
@@ -645,9 +654,18 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
                 return False
 
         # Launches are complete, so populate ready_nodes and failed_nodes.
-        for node in self.nodeset:
+        aborted_nodes = []
+        for node in self.nodeset[:]:
            if node.state == zk.READY:
                self.ready_nodes.append(node)
+            elif node.state == zk.ABORTED:
+                # ABORTED is a transient error triggered by overquota. In
+                # order to handle this gracefully don't count this as failed
+                # so the node is relaunched within this provider. Unlock the
+                # node so the DeletedNodeWorker cleans up the zNode.
+                aborted_nodes.append(node)
+                self.nodeset.remove(node)
+                self.zk.unlockNode(node)
            else:
                self.failed_nodes.append(node)
 
@@ -674,6 +692,14 @@ class NodeRequestHandler(NodeRequestHandlerNotifications,
            self.log.debug("Declining node request %s because nodes failed",
                           self.request.id)
            self.decline_request()
+        elif aborted_nodes:
+            # Because nodes are added to the satisfied types list before they
+            # are ready we need to remove the aborted nodes again so they can
+            # be created again.
+            for node in aborted_nodes:
+                self._satisfied_types.removeNode(node.id)
+            self.paused = True
+            return False
        else:
            # The assigned nodes must be added to the request in the order
            # in which they were requested.
diff --git a/nodepool/driver/openstack/handler.py b/nodepool/driver/openstack/handler.py
index 946e93141..1f5e64b98 100644
--- a/nodepool/driver/openstack/handler.py
+++ b/nodepool/driver/openstack/handler.py
@@ -246,10 +246,12 @@ class OpenStackNodeLauncher(NodeLauncher):
                     self.zk.storeNode(self.node)
                if attempts == self._retries:
                    raise
-                # Invalidate the quota cache if we encountered a quota error.
                if 'quota exceeded' in str(e).lower():
+                    # A quota exception is not directly recoverable so bail
+                    # out immediately with a specific exception.
                    self.log.info("Quota exceeded, invalidating quota cache")
                    self.handler.manager.invalidateQuotaCache()
+                    raise exceptions.QuotaException("Quota exceeded")
                attempts += 1
 
        self.node.state = zk.READY
@@ -380,8 +382,8 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
        '''
        Check if all launch requests have completed.
 
-        When all of the Node objects have reached a final state (READY or
-        FAILED), we'll know all threads have finished the launch process.
+        When all of the Node objects have reached a final state (READY, FAILED
+        or ABORTED), we'll know all threads have finished the launch process.
        '''
        if not self._threads:
            return True
@@ -392,9 +394,10 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
 
        node_states = [node.state for node in self.nodeset]
 
-        # NOTE: It very important that NodeLauncher always sets one of
+        # NOTE: It's very important that NodeLauncher always sets one of
        # these states, no matter what.
-        if not all(s in (zk.READY, zk.FAILED) for s in node_states):
+        if not all(s in (zk.READY, zk.FAILED, zk.ABORTED)
+                   for s in node_states):
            return False
 
        return True
diff --git a/nodepool/driver/utils.py b/nodepool/driver/utils.py
index 32f77f6c4..0a760168f 100644
--- a/nodepool/driver/utils.py
+++ b/nodepool/driver/utils.py
@@ -22,6 +22,7 @@ import time
 
 from kazoo import exceptions as kze
 
+from nodepool import exceptions
 from nodepool import stats
 from nodepool import zk
 
@@ -69,6 +70,16 @@ class NodeLauncher(threading.Thread,
                           self.node.id)
            self.node.state = zk.FAILED
            statsd_key = 'error.zksession'
+        except exceptions.QuotaException:
+            # We encountered a quota error when trying to launch a
+            # node. In this case we need to abort the launch. The upper
+            # layers will take care of this and reschedule a new node once
+            # the quota is ok again.
+            self.log.info("Aborting node %s due to quota failure" %
+                          self.node.id)
+            self.node.state = zk.ABORTED
+            self.zk.storeNode(self.node)
+            statsd_key = 'error.quota'
        except Exception as e:
            self.log.exception("Launch failed for node %s:", self.node.id)
            self.node.state = zk.FAILED
diff --git a/nodepool/exceptions.py b/nodepool/exceptions.py
index 44cabfe59..f811a4ee7 100755
--- a/nodepool/exceptions.py
+++ b/nodepool/exceptions.py
@@ -45,6 +45,10 @@ class DibFailedError(BuilderError):
     pass
 
 
+class QuotaException(Exception):
+    pass
+
+
 class TimeoutException(Exception):
     pass
 
diff --git a/nodepool/launcher.py b/nodepool/launcher.py
index c62c8e84e..e81887e19 100755
--- a/nodepool/launcher.py
+++ b/nodepool/launcher.py
@@ -224,6 +224,8 @@ class PoolWorker(threading.Thread):
            try:
                if not r.poll():
                    active_handlers.append(r)
+                    if r.paused:
+                        self.paused_handler = r
                else:
                    self.log.debug("Removing handler for request %s",
                                   r.request.id)
@@ -662,7 +664,7 @@ class DeletedNodeWorker(BaseCleanupWorker):
        Delete instances from providers and nodes entries from ZooKeeper.
        '''
        cleanup_states = (zk.USED, zk.IN_USE, zk.BUILDING, zk.FAILED,
-                          zk.DELETING)
+                          zk.DELETING, zk.ABORTED)
 
        zk_conn = self._nodepool.getZK()
        for node in zk_conn.nodeIterator():
diff --git a/nodepool/tests/test_launcher.py b/nodepool/tests/test_launcher.py
index c9e088155..87f2732d3 100644
--- a/nodepool/tests/test_launcher.py
+++ b/nodepool/tests/test_launcher.py
@@ -297,32 +297,35 @@ class TestLauncher(tests.DBTestCase):
        # (according to nodepool's quota estimate) fails.
        client.max_instances = 1
 
-        # Request a second node; this request should fail.
+        # Request a second node; this request should pause the handler.
        req2 = zk.NodeRequest()
        req2.state = zk.REQUESTED
        req2.node_types.append('fake-label')
        self.log.debug("Adding second request")
        self.zk.storeNodeRequest(req2)
-        req2 = self.waitForNodeRequest(req2)
-        self.assertEqual(req2.state, zk.FAILED)
 
-        # After the second request failed, the internal quota estimate
-        # should be reset, so the next request should pause to wait
-        # for more quota to become available.
-        req3 = zk.NodeRequest()
-        req3.state = zk.REQUESTED
-        req3.node_types.append('fake-label')
-        self.log.debug("Adding third request")
-        self.zk.storeNodeRequest(req3)
-        req3 = self.waitForNodeRequest(req3, (zk.PENDING,))
-        self.assertEqual(req3.state, zk.PENDING)
-
-        # Wait until there is a paused request handler and verify that
-        # there is still only one server built (from the first
-        # request).
        pool_worker = pool.getPoolWorkers('fake-provider')
        while not pool_worker[0].paused_handler:
+            # self.log.debug("tick")
            time.sleep(0.1)
+        self.log.debug("finished waiting")
+
+        # The handler is paused now and the request should be in state PENDING
+        req2 = self.waitForNodeRequest(req2, zk.PENDING)
+        self.assertEqual(req2.state, zk.PENDING)
+
+        # Now free up the first node
+        self.log.debug("Marking first node as used %s", req1.id)
+        req1_node.state = zk.USED
+        self.zk.storeNode(req1_node)
+        self.zk.unlockNode(req1_node)
+        self.waitForNodeDeletion(req1_node)
+
+        # After the first node is cleaned up the second request should be
+        # able to be fulfilled now.
+        req2 = self.waitForNodeRequest(req2)
+        self.assertEqual(req2.state, zk.FULFILLED)
+
        self.assertEqual(len(client._server_list), 1)
 
    def test_fail_request_on_launch_failure(self):
diff --git a/nodepool/zk.py b/nodepool/zk.py
index 6ead9199e..bde1ab4e5 100755
--- a/nodepool/zk.py
+++ b/nodepool/zk.py
@@ -52,6 +52,9 @@ USED = 'used'
 HOLD = 'hold'
 # Initial node state
 INIT = 'init'
+# Aborted due to a transient error like overquota that should not count as a
+# failed launch attempt
+ABORTED = 'aborted'
 
 
 # NOTE(Shrews): Importing this from nodepool.config causes an import error
@@ -484,7 +487,7 @@ class Node(BaseModel):
    '''
    Class representing a launched node.
    '''
    VALID_STATES = set([BUILDING, TESTING, READY, IN_USE, USED,
-                        HOLD, DELETING, FAILED, INIT])
+                        HOLD, DELETING, FAILED, INIT, ABORTED])
 
    def __init__(self, id=None):
        super(Node, self).__init__(id)
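
Note (not part of the patch): for readers who want the intended control flow in
isolation, the following is a minimal, self-contained sketch of the behaviour
described in the commit message. The class and helper names below are
simplified stand-ins, not the real nodepool APIs: a driver-level launch
translates a cloud-side quota refusal into a dedicated exception, the generic
launcher marks the node ABORTED instead of FAILED, and the handler can then
pause and relaunch once quota frees up.

# Illustration only; simplified stand-ins, not the real nodepool classes.

class QuotaException(Exception):
    pass


BUILDING, READY, FAILED, ABORTED = 'building', 'ready', 'failed', 'aborted'


class FakeCloud:
    """Pretend provider that refuses the launch because of quota."""
    def create_server(self):
        raise Exception("Quota exceeded for instances: requested 1, used 1 of 1")


def launch(cloud):
    """Driver-level launch: translate a quota refusal into QuotaException."""
    try:
        return cloud.create_server()
    except Exception as e:
        if 'quota exceeded' in str(e).lower():
            # Not recoverable by retrying right away; let the upper layer
            # pause the handler and reschedule instead of failing the node.
            raise QuotaException("Quota exceeded")
        raise


def run_launcher(cloud, node):
    """Generic launcher: mark the node ABORTED (transient), not FAILED."""
    try:
        node['external_id'] = launch(cloud)
        node['state'] = READY
    except QuotaException:
        node['state'] = ABORTED


node = {'state': BUILDING}
run_launcher(FakeCloud(), node)
assert node['state'] == ABORTED  # the handler would now pause and relaunch later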