Do not reset quota cache timestamp when invalid

The quota cache may not be a valid dictionary when
invalidateQuotaCache() is called (e.g. when 'ignore-provider-quota' is
used in OpenStack). In that case, don't attempt to treat the None as a
dictionary as this raises a TypeError exception.

This bug was preventing Quota errors from OpenStack from causing
nodepool to retry the node request when ignore-provider-quota is True,
because the OpenStack handler calles invalidateQuotaCache() before
raising the QuotaException. Since invalidateQuotaCache() was raising
TypeError, it prevented the QuotaException from being raised and the
node allocation was outright failed.

A test has been added to verify that nodepool and OpenStack will now
retry node allocations as intended.

This fixes that bug, but does change the behavior of OpenStack when
ignore-provider-quota is True and it returns a Quota error.

Change-Id: I1916c56c4f07c6a5d53ce82f4c1bb32bddbd7d63
Signed-off-by: Joshua Watt <JPEWhacker@gmail.com>
This commit is contained in:
Joshua Watt 2022-04-25 08:48:38 -05:00
parent 46c9e01ab8
commit 2c632af426
5 changed files with 81 additions and 12 deletions

View File

@ -105,8 +105,7 @@ class FakeOpenStackCloud(object):
]
self._azs = ['az1', 'az2']
self._server_list = []
self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
_get_quota()
self._update_quota()
self._down_ports = [
Dummy(Dummy.PORT, id=uuid.uuid4().hex, status='DOWN',
device_owner="compute:nova"),
@ -114,6 +113,10 @@ class FakeOpenStackCloud(object):
device_owner=None),
]
def _update_quota(self):
self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
_get_quota()
def _get(self, name_or_id, instance_list):
self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
for instance in instance_list:
@ -162,6 +165,7 @@ class FakeOpenStackCloud(object):
private_v4 = 'fake'
host_id = 'fake'
interface_ip = 'fake'
self._update_quota()
over_quota = False
if (instance_type == Dummy.INSTANCE and
self.max_instances > -1 and
@ -292,6 +296,7 @@ class FakeOpenStackCloud(object):
return self._azs.copy()
def get_compute_limits(self):
self._update_quota()
return Dummy(
'limits',
max_total_cores=self.max_cores,

View File

@ -259,7 +259,8 @@ class QuotaSupport:
def __init__(self, *args, **kw):
super().__init__(*args, **kw)
self._current_nodepool_quota = None
self._current_nodepool_quota_timestamp = 0
self._current_nodepool_quota = {}
@abc.abstractmethod
def quotaNeededByLabel(self, label, pool):
@ -291,7 +292,7 @@ class QuotaSupport:
pass
def invalidateQuotaCache(self):
self._current_nodepool_quota['timestamp'] = 0
self._current_nodepool_quota_timestamp = 0
def estimatedNodepoolQuota(self):
'''
@ -307,8 +308,8 @@ class QuotaSupport:
if self._current_nodepool_quota:
now = time.time()
if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE:
return copy.deepcopy(self._current_nodepool_quota['quota'])
if now < self._current_nodepool_quota_timestamp + MAX_QUOTA_AGE:
return copy.deepcopy(self._current_nodepool_quota)
# This is initialized with the full tenant quota and later becomes
# the quota available for nodepool.
@ -318,7 +319,7 @@ class QuotaSupport:
if self._current_nodepool_quota:
self.log.exception("Unable to get provider quota, "
"using cached value")
return copy.deepcopy(self._current_nodepool_quota['quota'])
return copy.deepcopy(self._current_nodepool_quota)
raise
self.log.debug("Provider quota for %s: %s",
@ -328,10 +329,8 @@ class QuotaSupport:
# to get the quota available for us.
nodepool_quota.subtract(self.unmanagedQuotaUsed())
self._current_nodepool_quota = {
'quota': nodepool_quota,
'timestamp': time.time()
}
self._current_nodepool_quota = nodepool_quota
self._current_nodepool_quota_timestamp = time.time()
self.log.debug("Available quota for %s: %s",
self.provider.name, nodepool_quota)

View File

@ -583,6 +583,15 @@ class DBTestCase(BaseTestCase):
self.wait_for_threads()
return ready_nodes[label]
def waitForAnyNodeInState(self, state):
# Wait for a node to be in the aborted state
for _ in iterate_timeout(ONE_MINUTE, Exception,
f"Timeout waiting for node in {state} state",
interval=1):
for node in self.zk.nodeIterator(False):
if node.state == state:
return node
def waitForNodeRequest(self, req, states=None):
'''
Wait for a node request to transition to a final state.

View File

@ -403,6 +403,7 @@ class TestLauncher(tests.DBTestCase):
# patch the cloud with requested quota
def fake_get_quota():
nonlocal max_cores, max_instances, max_ram
return (max_cores, max_instances, max_ram)
self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider.fake_cloud, '_get_quota',
@ -435,7 +436,7 @@ class TestLauncher(tests.DBTestCase):
# Now, reduce the quota so the next node unexpectedly
# (according to nodepool's quota estimate) fails.
client.max_instances = 1
max_instances = 1
# Request a second node; this request should pause the handler.
req2 = zk.NodeRequest()
@ -2280,6 +2281,53 @@ class TestLauncher(tests.DBTestCase):
req3 = self.waitForNodeRequest(req3)
self.assertEqual(req3.state, zk.FAILED)
def test_ignore_provider_quota_true_with_provider_error(self):
'''
Tests that quota errors returned from the provider when allocating a
node are correctly handled and retried. The node request should be
retried until there is sufficient quota to satisfy it.
'''
max_instances = 0
def fake_get_quota():
nonlocal max_instances
return (100, max_instances, 1000000)
self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider.fake_cloud, '_get_quota',
fake_get_quota
))
configfile = self.setup_config('ignore_provider_quota_true.yaml')
self.useBuilder(configfile)
self.waitForImage('fake-provider', 'fake-image')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
# Create a request with ignore-provider-quota set to true that should
# pass regardless of the lack of cloud/provider quota.
self.replace_config(configfile, 'ignore_provider_quota_true.yaml')
self.log.debug(
"Submitting an initial request with ignore-provider-quota True")
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('fake-label')
self.zk.storeNodeRequest(req)
self.waitForAnyNodeInState(zk.ABORTED)
self.assertReportedStat(
'nodepool.launch.provider.fake-provider.error.quota')
self.assertReportedStat(
'nodepool.launch.error.quota')
# Bump up the quota to allow the provider to allocate a node
max_instances = 1
req = self.waitForNodeRequest(req)
self.assertEqual(req.state, zk.FULFILLED)
def test_request_order(self):
"""Test that requests are handled in sorted order"""
configfile = self.setup_config('node_no_min_ready.yaml')

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an exception that was raised by the OpenStack driver when attempting
to reset the quota timestamp and `ignore-provider-quota` is `true`. This
exception prevented nodepool from seeing quota errors from OpenStack,
causing it to fail the node request instead of retrying as it does for
other providers.