From 2c632af4264006e92352fa0ff8f35856c82a16ad Mon Sep 17 00:00:00 2001
From: Joshua Watt
Date: Mon, 25 Apr 2022 08:48:38 -0500
Subject: [PATCH] Do not reset quota cache timestamp when invalid

The quota cache may not be a valid dictionary when invalidateQuotaCache()
is called (e.g. when 'ignore-provider-quota' is used in OpenStack). In
that case, don't treat the None value as a dictionary, since doing so
raises a TypeError.

This bug prevented quota errors from OpenStack from causing nodepool to
retry the node request when ignore-provider-quota is True, because the
OpenStack handler calls invalidateQuotaCache() before raising the
QuotaException. Since invalidateQuotaCache() raised TypeError instead,
the QuotaException was never raised and the node allocation failed
outright.

A test has been added to verify that nodepool and OpenStack now retry
node allocations as intended. This fixes the bug, but it also changes the
behavior of the OpenStack driver when ignore-provider-quota is True and
the provider returns a quota error.

Change-Id: I1916c56c4f07c6a5d53ce82f4c1bb32bddbd7d63
Signed-off-by: Joshua Watt
---
 nodepool/driver/fake/provider.py              |  9 +++-
 nodepool/driver/utils.py                      | 17 +++----
 nodepool/tests/__init__.py                    |  9 ++++
 nodepool/tests/unit/test_launcher.py          | 50 ++++++++++++++++++-
 ...-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml |  8 +++
 5 files changed, 81 insertions(+), 12 deletions(-)
 create mode 100644 releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml

diff --git a/nodepool/driver/fake/provider.py b/nodepool/driver/fake/provider.py
index e38376a1d..6e2388b0c 100644
--- a/nodepool/driver/fake/provider.py
+++ b/nodepool/driver/fake/provider.py
@@ -105,8 +105,7 @@ class FakeOpenStackCloud(object):
         ]
         self._azs = ['az1', 'az2']
         self._server_list = []
-        self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
-            _get_quota()
+        self._update_quota()
         self._down_ports = [
             Dummy(Dummy.PORT, id=uuid.uuid4().hex, status='DOWN',
                   device_owner="compute:nova"),
@@ -114,6 +113,10 @@ class FakeOpenStackCloud(object):
                   device_owner=None),
         ]
 
+    def _update_quota(self):
+        self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
+            _get_quota()
+
     def _get(self, name_or_id, instance_list):
         self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
         for instance in instance_list:
@@ -162,6 +165,7 @@ class FakeOpenStackCloud(object):
         private_v4 = 'fake'
         host_id = 'fake'
         interface_ip = 'fake'
+        self._update_quota()
         over_quota = False
         if (instance_type == Dummy.INSTANCE and
             self.max_instances > -1 and
@@ -292,6 +296,7 @@ class FakeOpenStackCloud(object):
         return self._azs.copy()
 
     def get_compute_limits(self):
+        self._update_quota()
         return Dummy(
             'limits',
             max_total_cores=self.max_cores,
diff --git a/nodepool/driver/utils.py b/nodepool/driver/utils.py
index e017e569d..a2fe32267 100644
--- a/nodepool/driver/utils.py
+++ b/nodepool/driver/utils.py
@@ -259,7 +259,8 @@ class QuotaSupport:
 
     def __init__(self, *args, **kw):
         super().__init__(*args, **kw)
-        self._current_nodepool_quota = None
+        self._current_nodepool_quota_timestamp = 0
+        self._current_nodepool_quota = {}
 
     @abc.abstractmethod
     def quotaNeededByLabel(self, label, pool):
@@ -291,7 +292,7 @@ class QuotaSupport:
         pass
 
     def invalidateQuotaCache(self):
-        self._current_nodepool_quota['timestamp'] = 0
+        self._current_nodepool_quota_timestamp = 0
 
     def estimatedNodepoolQuota(self):
         '''
@@ -307,8 +308,8 @@ class QuotaSupport:
 
         if self._current_nodepool_quota:
             now = time.time()
-            if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE:
-                return copy.deepcopy(self._current_nodepool_quota['quota'])
+            if now < self._current_nodepool_quota_timestamp + MAX_QUOTA_AGE:
+                return copy.deepcopy(self._current_nodepool_quota)
 
         # This is initialized with the full tenant quota and later becomes
         # the quota available for nodepool.
@@ -318,7 +319,7 @@ class QuotaSupport:
             if self._current_nodepool_quota:
                 self.log.exception("Unable to get provider quota, "
                                    "using cached value")
-                return copy.deepcopy(self._current_nodepool_quota['quota'])
+                return copy.deepcopy(self._current_nodepool_quota)
             raise
 
         self.log.debug("Provider quota for %s: %s",
@@ -328,10 +329,8 @@ class QuotaSupport:
         # to get the quota available for us.
         nodepool_quota.subtract(self.unmanagedQuotaUsed())
 
-        self._current_nodepool_quota = {
-            'quota': nodepool_quota,
-            'timestamp': time.time()
-        }
+        self._current_nodepool_quota = nodepool_quota
+        self._current_nodepool_quota_timestamp = time.time()
 
         self.log.debug("Available quota for %s: %s",
                        self.provider.name, nodepool_quota)
diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py
index 56edb03db..1dc4eb663 100644
--- a/nodepool/tests/__init__.py
+++ b/nodepool/tests/__init__.py
@@ -583,6 +583,15 @@ class DBTestCase(BaseTestCase):
         self.wait_for_threads()
         return ready_nodes[label]
 
+    def waitForAnyNodeInState(self, state):
+        # Wait for any node to reach the given state
+        for _ in iterate_timeout(ONE_MINUTE, Exception,
+                                 f"Timeout waiting for node in {state} state",
+                                 interval=1):
+            for node in self.zk.nodeIterator(False):
+                if node.state == state:
+                    return node
+
     def waitForNodeRequest(self, req, states=None):
         '''
         Wait for a node request to transition to a final state.
diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py
index 9f8b1aa38..453d4843d 100644
--- a/nodepool/tests/unit/test_launcher.py
+++ b/nodepool/tests/unit/test_launcher.py
@@ -403,6 +403,7 @@ class TestLauncher(tests.DBTestCase):
 
         # patch the cloud with requested quota
         def fake_get_quota():
+            nonlocal max_cores, max_instances, max_ram
             return (max_cores, max_instances, max_ram)
         self.useFixture(fixtures.MockPatchObject(
             fakeprovider.FakeProvider.fake_cloud, '_get_quota',
@@ -435,7 +436,7 @@ class TestLauncher(tests.DBTestCase):
 
         # Now, reduce the quota so the next node unexpectedly
         # (according to nodepool's quota estimate) fails.
-        client.max_instances = 1
+        max_instances = 1
 
         # Request a second node; this request should pause the handler.
         req2 = zk.NodeRequest()
@@ -2280,6 +2281,53 @@ class TestLauncher(tests.DBTestCase):
         req3 = self.waitForNodeRequest(req3)
         self.assertEqual(req3.state, zk.FAILED)
 
+    def test_ignore_provider_quota_true_with_provider_error(self):
+        '''
+        Tests that quota errors returned from the provider when allocating a
+        node are correctly handled and retried. The node request should be
+        retried until there is sufficient quota to satisfy it.
+        '''
+        max_instances = 0
+
+        def fake_get_quota():
+            nonlocal max_instances
+            return (100, max_instances, 1000000)
+
+        self.useFixture(fixtures.MockPatchObject(
+            fakeprovider.FakeProvider.fake_cloud, '_get_quota',
+            fake_get_quota
+        ))
+
+        configfile = self.setup_config('ignore_provider_quota_true.yaml')
+        self.useBuilder(configfile)
+        self.waitForImage('fake-provider', 'fake-image')
+
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+
+        # Create a request with ignore-provider-quota set to true that should
+        # pass regardless of the lack of cloud/provider quota.
+        self.replace_config(configfile, 'ignore_provider_quota_true.yaml')
+
+        self.log.debug(
+            "Submitting an initial request with ignore-provider-quota True")
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req)
+
+        self.waitForAnyNodeInState(zk.ABORTED)
+
+        self.assertReportedStat(
+            'nodepool.launch.provider.fake-provider.error.quota')
+        self.assertReportedStat(
+            'nodepool.launch.error.quota')
+
+        # Bump up the quota to allow the provider to allocate a node
+        max_instances = 1
+        req = self.waitForNodeRequest(req)
+        self.assertEqual(req.state, zk.FULFILLED)
+
     def test_request_order(self):
         """Test that requests are handled in sorted order"""
         configfile = self.setup_config('node_no_min_ready.yaml')
diff --git a/releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml b/releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml
new file mode 100644
index 000000000..235e0f1d9
--- /dev/null
+++ b/releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixes an exception that was raised by the OpenStack driver when
+    attempting to reset the quota cache timestamp while
+    `ignore-provider-quota` is `true`. This exception prevented nodepool
+    from seeing quota errors from OpenStack, causing it to fail the node
+    request instead of retrying as it does for other providers.