Merge "Do not reset quota cache timestamp when invalid"

This commit is contained in:
Zuul 2022-05-19 15:46:57 +00:00 committed by Gerrit Code Review
commit 7129681181
5 changed files with 81 additions and 12 deletions

View File

@ -105,8 +105,7 @@ class FakeOpenStackCloud(object):
]
self._azs = ['az1', 'az2']
self._server_list = []
self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
_get_quota()
self._update_quota()
self._down_ports = [
Dummy(Dummy.PORT, id=uuid.uuid4().hex, status='DOWN',
device_owner="compute:nova"),
@ -114,6 +113,10 @@ class FakeOpenStackCloud(object):
device_owner=None),
]
def _update_quota(self):
    # Re-read the class-level quota source (which tests monkeypatch via
    # MockPatchObject) so subsequent checks observe the current limits.
    limits = FakeOpenStackCloud._get_quota()
    self.max_cores, self.max_instances, self.max_ram = limits
def _get(self, name_or_id, instance_list):
self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
for instance in instance_list:
@ -162,6 +165,7 @@ class FakeOpenStackCloud(object):
private_v4 = 'fake'
host_id = 'fake'
interface_ip = 'fake'
self._update_quota()
over_quota = False
if (instance_type == Dummy.INSTANCE and
self.max_instances > -1 and
@ -292,6 +296,7 @@ class FakeOpenStackCloud(object):
return self._azs.copy()
def get_compute_limits(self):
self._update_quota()
return Dummy(
'limits',
max_total_cores=self.max_cores,

View File

@ -259,7 +259,8 @@ class QuotaSupport:
def __init__(self, *args, **kw):
super().__init__(*args, **kw)
self._current_nodepool_quota = None
self._current_nodepool_quota_timestamp = 0
self._current_nodepool_quota = {}
@abc.abstractmethod
def quotaNeededByLabel(self, label, pool):
@ -291,7 +292,7 @@ class QuotaSupport:
pass
def invalidateQuotaCache(self):
self._current_nodepool_quota['timestamp'] = 0
self._current_nodepool_quota_timestamp = 0
def estimatedNodepoolQuota(self):
'''
@ -307,8 +308,8 @@ class QuotaSupport:
if self._current_nodepool_quota:
now = time.time()
if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE:
return copy.deepcopy(self._current_nodepool_quota['quota'])
if now < self._current_nodepool_quota_timestamp + MAX_QUOTA_AGE:
return copy.deepcopy(self._current_nodepool_quota)
# This is initialized with the full tenant quota and later becomes
# the quota available for nodepool.
@ -318,7 +319,7 @@ class QuotaSupport:
if self._current_nodepool_quota:
self.log.exception("Unable to get provider quota, "
"using cached value")
return copy.deepcopy(self._current_nodepool_quota['quota'])
return copy.deepcopy(self._current_nodepool_quota)
raise
self.log.debug("Provider quota for %s: %s",
@ -328,10 +329,8 @@ class QuotaSupport:
# to get the quota available for us.
nodepool_quota.subtract(self.unmanagedQuotaUsed())
self._current_nodepool_quota = {
'quota': nodepool_quota,
'timestamp': time.time()
}
self._current_nodepool_quota = nodepool_quota
self._current_nodepool_quota_timestamp = time.time()
self.log.debug("Available quota for %s: %s",
self.provider.name, nodepool_quota)

View File

@ -583,6 +583,15 @@ class DBTestCase(BaseTestCase):
self.wait_for_threads()
return ready_nodes[label]
def waitForAnyNodeInState(self, state):
    """Poll the ZooKeeper node list until any node reaches *state*.

    Returns the first matching node; iterate_timeout raises after one
    minute if no node ever reaches the requested state.
    """
    for _attempt in iterate_timeout(
            ONE_MINUTE, Exception,
            f"Timeout waiting for node in {state} state",
            interval=1):
        match = next(
            (node for node in self.zk.nodeIterator(False)
             if node.state == state),
            None)
        if match is not None:
            return match
def waitForNodeRequest(self, req, states=None):
'''
Wait for a node request to transition to a final state.

View File

@ -403,6 +403,7 @@ class TestLauncher(tests.DBTestCase):
# patch the cloud with requested quota
def fake_get_quota():
nonlocal max_cores, max_instances, max_ram
return (max_cores, max_instances, max_ram)
self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider.fake_cloud, '_get_quota',
@ -435,7 +436,7 @@ class TestLauncher(tests.DBTestCase):
# Now, reduce the quota so the next node unexpectedly
# (according to nodepool's quota estimate) fails.
client.max_instances = 1
max_instances = 1
# Request a second node; this request should pause the handler.
req2 = zk.NodeRequest()
@ -2280,6 +2281,53 @@ class TestLauncher(tests.DBTestCase):
req3 = self.waitForNodeRequest(req3)
self.assertEqual(req3.state, zk.FAILED)
def test_ignore_provider_quota_true_with_provider_error(self):
'''
Tests that quota errors returned from the provider when allocating a
node are correctly handled and retried. The node request should be
retried until there is sufficient quota to satisfy it.
'''
# Start with zero instance quota so the first launch attempt fails
# on the provider side; the closure reads max_instances via nonlocal
# so the test can raise the quota later without re-patching.
max_instances = 0
def fake_get_quota():
nonlocal max_instances
return (100, max_instances, 1000000)
# Patch the fake cloud so every quota query goes through the closure.
self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider.fake_cloud, '_get_quota',
fake_get_quota
))
configfile = self.setup_config('ignore_provider_quota_true.yaml')
self.useBuilder(configfile)
self.waitForImage('fake-provider', 'fake-image')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
# Create a request with ignore-provider-quota set to true that should
# pass nodepool's own quota estimate regardless of the lack of
# cloud/provider quota.
self.replace_config(configfile, 'ignore_provider_quota_true.yaml')
self.log.debug(
"Submitting an initial request with ignore-provider-quota True")
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('fake-label')
self.zk.storeNodeRequest(req)
# The provider rejects the launch (max_instances == 0), so the node
# is aborted and the quota error stats are emitted.
self.waitForAnyNodeInState(zk.ABORTED)
self.assertReportedStat(
'nodepool.launch.provider.fake-provider.error.quota')
self.assertReportedStat(
'nodepool.launch.error.quota')
# Bump up the quota to allow the provider to allocate a node; the
# paused/retried request should now be fulfilled.
max_instances = 1
req = self.waitForNodeRequest(req)
self.assertEqual(req.state, zk.FULFILLED)
def test_request_order(self):
"""Test that requests are handled in sorted order"""
configfile = self.setup_config('node_no_min_ready.yaml')

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an exception that was raised by the OpenStack driver when attempting
to reset the quota cache timestamp while `ignore-provider-quota` is `true`. This
exception prevented nodepool from seeing quota errors from OpenStack,
causing it to fail the node request instead of retrying as it does for
other providers.