From 2c632af4264006e92352fa0ff8f35856c82a16ad Mon Sep 17 00:00:00 2001
From: Joshua Watt
Date: Mon, 25 Apr 2022 08:48:38 -0500
Subject: [PATCH] Do not reset quota cache timestamp when invalid

The quota cache may not be a valid dictionary when invalidateQuotaCache()
is called (e.g. when 'ignore-provider-quota' is used in OpenStack). In
that case, don't treat the None value as a dictionary, since doing so
raises a TypeError.

This bug prevented quota errors from OpenStack from causing nodepool to
retry the node request when ignore-provider-quota is True, because the
OpenStack handler calls invalidateQuotaCache() before raising the
QuotaException. Since invalidateQuotaCache() raised TypeError instead,
the QuotaException was never raised and the node allocation failed
outright.

A test has been added to verify that nodepool and OpenStack now retry
node allocations as intended. This fixes the bug, but it also changes the
behavior of the OpenStack driver when ignore-provider-quota is True and
the provider returns a quota error.

Change-Id: I1916c56c4f07c6a5d53ce82f4c1bb32bddbd7d63
Signed-off-by: Joshua Watt
---
 nodepool/driver/fake/provider.py              |  9 +++-
 nodepool/driver/utils.py                      | 17 +++----
 nodepool/tests/__init__.py                    |  9 ++++
 nodepool/tests/unit/test_launcher.py          | 50 ++++++++++++++++++-
 ...-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml |  8 +++
 5 files changed, 81 insertions(+), 12 deletions(-)
 create mode 100644 releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml

diff --git a/nodepool/driver/fake/provider.py b/nodepool/driver/fake/provider.py
index e38376a1d..6e2388b0c 100644
--- a/nodepool/driver/fake/provider.py
+++ b/nodepool/driver/fake/provider.py
@@ -105,8 +105,7 @@ class FakeOpenStackCloud(object):
         ]
         self._azs = ['az1', 'az2']
         self._server_list = []
-        self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
-            _get_quota()
+        self._update_quota()
         self._down_ports = [
             Dummy(Dummy.PORT, id=uuid.uuid4().hex, status='DOWN',
                   device_owner="compute:nova"),
@@ -114,6 +113,10 @@ class FakeOpenStackCloud(object):
                   device_owner=None),
         ]
 
+    def _update_quota(self):
+        self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
+            _get_quota()
+
     def _get(self, name_or_id, instance_list):
         self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
         for instance in instance_list:
@@ -162,6 +165,7 @@ class FakeOpenStackCloud(object):
         private_v4 = 'fake'
         host_id = 'fake'
         interface_ip = 'fake'
+        self._update_quota()
         over_quota = False
         if (instance_type == Dummy.INSTANCE and
             self.max_instances > -1 and
@@ -292,6 +296,7 @@ class FakeOpenStackCloud(object):
         return self._azs.copy()
 
     def get_compute_limits(self):
+        self._update_quota()
         return Dummy(
             'limits',
             max_total_cores=self.max_cores,
diff --git a/nodepool/driver/utils.py b/nodepool/driver/utils.py
index e017e569d..a2fe32267 100644
--- a/nodepool/driver/utils.py
+++ b/nodepool/driver/utils.py
@@ -259,7 +259,8 @@ class QuotaSupport:
 
     def __init__(self, *args, **kw):
         super().__init__(*args, **kw)
-        self._current_nodepool_quota = None
+        self._current_nodepool_quota_timestamp = 0
+        self._current_nodepool_quota = {}
 
     @abc.abstractmethod
     def quotaNeededByLabel(self, label, pool):
@@ -291,7 +292,7 @@ class QuotaSupport:
         pass
 
     def invalidateQuotaCache(self):
-        self._current_nodepool_quota['timestamp'] = 0
+        self._current_nodepool_quota_timestamp = 0
 
     def estimatedNodepoolQuota(self):
         '''
@@ -307,8 +308,8 @@ class QuotaSupport:
 
         if self._current_nodepool_quota:
             now = time.time()
-            if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE:
-                return copy.deepcopy(self._current_nodepool_quota['quota'])
+            if now < self._current_nodepool_quota_timestamp + MAX_QUOTA_AGE:
+                return copy.deepcopy(self._current_nodepool_quota)
 
         # This is initialized with the full tenant quota and later becomes
         # the quota available for nodepool.
@@ -318,7 +319,7 @@ class QuotaSupport:
             if self._current_nodepool_quota:
                 self.log.exception("Unable to get provider quota, "
                                    "using cached value")
-                return copy.deepcopy(self._current_nodepool_quota['quota'])
+                return copy.deepcopy(self._current_nodepool_quota)
             raise
 
         self.log.debug("Provider quota for %s: %s",
@@ -328,10 +329,8 @@ class QuotaSupport:
         # to get the quota available for us.
         nodepool_quota.subtract(self.unmanagedQuotaUsed())
 
-        self._current_nodepool_quota = {
-            'quota': nodepool_quota,
-            'timestamp': time.time()
-        }
+        self._current_nodepool_quota = nodepool_quota
+        self._current_nodepool_quota_timestamp = time.time()
 
         self.log.debug("Available quota for %s: %s",
                        self.provider.name, nodepool_quota)
diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py
index 56edb03db..1dc4eb663 100644
--- a/nodepool/tests/__init__.py
+++ b/nodepool/tests/__init__.py
@@ -583,6 +583,15 @@ class DBTestCase(BaseTestCase):
         self.wait_for_threads()
         return ready_nodes[label]
 
+    def waitForAnyNodeInState(self, state):
+        # Wait for any node to reach the given state
+        for _ in iterate_timeout(ONE_MINUTE, Exception,
+                                 f"Timeout waiting for node in {state} state",
+                                 interval=1):
+            for node in self.zk.nodeIterator(False):
+                if node.state == state:
+                    return node
+
     def waitForNodeRequest(self, req, states=None):
         '''
         Wait for a node request to transition to a final state.
diff --git a/nodepool/tests/unit/test_launcher.py b/nodepool/tests/unit/test_launcher.py
index 9f8b1aa38..453d4843d 100644
--- a/nodepool/tests/unit/test_launcher.py
+++ b/nodepool/tests/unit/test_launcher.py
@@ -403,6 +403,7 @@ class TestLauncher(tests.DBTestCase):
 
         # patch the cloud with requested quota
         def fake_get_quota():
+            nonlocal max_cores, max_instances, max_ram
             return (max_cores, max_instances, max_ram)
         self.useFixture(fixtures.MockPatchObject(
             fakeprovider.FakeProvider.fake_cloud, '_get_quota',
@@ -435,7 +436,7 @@ class TestLauncher(tests.DBTestCase):
 
         # Now, reduce the quota so the next node unexpectedly
         # (according to nodepool's quota estimate) fails.
-        client.max_instances = 1
+        max_instances = 1
 
         # Request a second node; this request should pause the handler.
         req2 = zk.NodeRequest()
@@ -2280,6 +2281,53 @@ class TestLauncher(tests.DBTestCase):
         req3 = self.waitForNodeRequest(req3)
         self.assertEqual(req3.state, zk.FAILED)
 
+    def test_ignore_provider_quota_true_with_provider_error(self):
+        '''
+        Tests that quota errors returned from the provider when allocating a
+        node are correctly handled and retried. The node request should be
+        retried until there is sufficient quota to satisfy it.
+        '''
+        max_instances = 0
+
+        def fake_get_quota():
+            nonlocal max_instances
+            return (100, max_instances, 1000000)
+
+        self.useFixture(fixtures.MockPatchObject(
+            fakeprovider.FakeProvider.fake_cloud, '_get_quota',
+            fake_get_quota
+        ))
+
+        configfile = self.setup_config('ignore_provider_quota_true.yaml')
+        self.useBuilder(configfile)
+        self.waitForImage('fake-provider', 'fake-image')
+
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+
+        # Create a request with ignore-provider-quota set to true that should
+        # pass regardless of the lack of cloud/provider quota.
+        self.replace_config(configfile, 'ignore_provider_quota_true.yaml')
+
+        self.log.debug(
+            "Submitting an initial request with ignore-provider-quota True")
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req)
+
+        self.waitForAnyNodeInState(zk.ABORTED)
+
+        self.assertReportedStat(
+            'nodepool.launch.provider.fake-provider.error.quota')
+        self.assertReportedStat(
+            'nodepool.launch.error.quota')
+
+        # Bump up the quota to allow the provider to allocate a node
+        max_instances = 1
+        req = self.waitForNodeRequest(req)
+        self.assertEqual(req.state, zk.FULFILLED)
+
     def test_request_order(self):
         """Test that requests are handled in sorted order"""
         configfile = self.setup_config('node_no_min_ready.yaml')
diff --git a/releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml b/releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml
new file mode 100644
index 000000000..235e0f1d9
--- /dev/null
+++ b/releasenotes/notes/quota-cache-timestamp-fix-c4edf9dc08e0eb8b.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixes an exception that was raised by the OpenStack driver when
+    attempting to reset the quota cache timestamp while
+    `ignore-provider-quota` is `true`. This exception prevented nodepool
+    from seeing quota errors from OpenStack, causing it to fail the node
+    request instead of retrying as it does for other providers.