Do not reset quota cache timestamp when invalid

The quota cache may not be a valid dictionary when
invalidateQuotaCache() is called (e.g. when 'ignore-provider-quota' is
used in OpenStack). In that case, don't attempt to treat the None as a
dictionary as this raises a TypeError exception.

This bug was preventing Quota errors from OpenStack from causing
nodepool to retry the node request when ignore-provider-quota is True,
because the OpenStack handler calles invalidateQuotaCache() before
raising the QuotaException. Since invalidateQuotaCache() was raising
TypeError, it prevented the QuotaException from being raised and the
node allocation was outright failed.

A test has been added to verify that nodepool and OpenStack will now
retry node allocations as intended.

This fixes that bug, but does change the behavior of OpenStack when
ignore-provider-quota is True and it returns a Quota error.

Change-Id: I1916c56c4f07c6a5d53ce82f4c1bb32bddbd7d63
Signed-off-by: Joshua Watt <JPEWhacker@gmail.com>
This commit is contained in:
Joshua Watt 2022-04-25 08:48:38 -05:00
parent 46c9e01ab8
commit 2c632af426
5 changed files with 81 additions and 12 deletions

View File

@ -105,8 +105,7 @@ class FakeOpenStackCloud(object):
] ]
self._azs = ['az1', 'az2'] self._azs = ['az1', 'az2']
self._server_list = [] self._server_list = []
self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\ self._update_quota()
_get_quota()
self._down_ports = [ self._down_ports = [
Dummy(Dummy.PORT, id=uuid.uuid4().hex, status='DOWN', Dummy(Dummy.PORT, id=uuid.uuid4().hex, status='DOWN',
device_owner="compute:nova"), device_owner="compute:nova"),
@ -114,6 +113,10 @@ class FakeOpenStackCloud(object):
device_owner=None), device_owner=None),
] ]
def _update_quota(self):
self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
_get_quota()
def _get(self, name_or_id, instance_list): def _get(self, name_or_id, instance_list):
self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list))) self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
for instance in instance_list: for instance in instance_list:
@ -162,6 +165,7 @@ class FakeOpenStackCloud(object):
private_v4 = 'fake' private_v4 = 'fake'
host_id = 'fake' host_id = 'fake'
interface_ip = 'fake' interface_ip = 'fake'
self._update_quota()
over_quota = False over_quota = False
if (instance_type == Dummy.INSTANCE and if (instance_type == Dummy.INSTANCE and
self.max_instances > -1 and self.max_instances > -1 and
@ -292,6 +296,7 @@ class FakeOpenStackCloud(object):
return self._azs.copy() return self._azs.copy()
def get_compute_limits(self): def get_compute_limits(self):
self._update_quota()
return Dummy( return Dummy(
'limits', 'limits',
max_total_cores=self.max_cores, max_total_cores=self.max_cores,

View File

@ -259,7 +259,8 @@ class QuotaSupport:
def __init__(self, *args, **kw): def __init__(self, *args, **kw):
super().__init__(*args, **kw) super().__init__(*args, **kw)
self._current_nodepool_quota = None self._current_nodepool_quota_timestamp = 0
self._current_nodepool_quota = {}
@abc.abstractmethod @abc.abstractmethod
def quotaNeededByLabel(self, label, pool): def quotaNeededByLabel(self, label, pool):
@ -291,7 +292,7 @@ class QuotaSupport:
pass pass
def invalidateQuotaCache(self): def invalidateQuotaCache(self):
self._current_nodepool_quota['timestamp'] = 0 self._current_nodepool_quota_timestamp = 0
def estimatedNodepoolQuota(self): def estimatedNodepoolQuota(self):
''' '''
@ -307,8 +308,8 @@ class QuotaSupport:
if self._current_nodepool_quota: if self._current_nodepool_quota:
now = time.time() now = time.time()
if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE: if now < self._current_nodepool_quota_timestamp + MAX_QUOTA_AGE:
return copy.deepcopy(self._current_nodepool_quota['quota']) return copy.deepcopy(self._current_nodepool_quota)
# This is initialized with the full tenant quota and later becomes # This is initialized with the full tenant quota and later becomes
# the quota available for nodepool. # the quota available for nodepool.
@ -318,7 +319,7 @@ class QuotaSupport:
if self._current_nodepool_quota: if self._current_nodepool_quota:
self.log.exception("Unable to get provider quota, " self.log.exception("Unable to get provider quota, "
"using cached value") "using cached value")
return copy.deepcopy(self._current_nodepool_quota['quota']) return copy.deepcopy(self._current_nodepool_quota)
raise raise
self.log.debug("Provider quota for %s: %s", self.log.debug("Provider quota for %s: %s",
@ -328,10 +329,8 @@ class QuotaSupport:
# to get the quota available for us. # to get the quota available for us.
nodepool_quota.subtract(self.unmanagedQuotaUsed()) nodepool_quota.subtract(self.unmanagedQuotaUsed())
self._current_nodepool_quota = { self._current_nodepool_quota = nodepool_quota
'quota': nodepool_quota, self._current_nodepool_quota_timestamp = time.time()
'timestamp': time.time()
}
self.log.debug("Available quota for %s: %s", self.log.debug("Available quota for %s: %s",
self.provider.name, nodepool_quota) self.provider.name, nodepool_quota)

View File

@ -583,6 +583,15 @@ class DBTestCase(BaseTestCase):
self.wait_for_threads() self.wait_for_threads()
return ready_nodes[label] return ready_nodes[label]
def waitForAnyNodeInState(self, state):
# Wait for a node to be in the aborted state
for _ in iterate_timeout(ONE_MINUTE, Exception,
f"Timeout waiting for node in {state} state",
interval=1):
for node in self.zk.nodeIterator(False):
if node.state == state:
return node
def waitForNodeRequest(self, req, states=None): def waitForNodeRequest(self, req, states=None):
''' '''
Wait for a node request to transition to a final state. Wait for a node request to transition to a final state.

View File

@ -403,6 +403,7 @@ class TestLauncher(tests.DBTestCase):
# patch the cloud with requested quota # patch the cloud with requested quota
def fake_get_quota(): def fake_get_quota():
nonlocal max_cores, max_instances, max_ram
return (max_cores, max_instances, max_ram) return (max_cores, max_instances, max_ram)
self.useFixture(fixtures.MockPatchObject( self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider.fake_cloud, '_get_quota', fakeprovider.FakeProvider.fake_cloud, '_get_quota',
@ -435,7 +436,7 @@ class TestLauncher(tests.DBTestCase):
# Now, reduce the quota so the next node unexpectedly # Now, reduce the quota so the next node unexpectedly
# (according to nodepool's quota estimate) fails. # (according to nodepool's quota estimate) fails.
client.max_instances = 1 max_instances = 1
# Request a second node; this request should pause the handler. # Request a second node; this request should pause the handler.
req2 = zk.NodeRequest() req2 = zk.NodeRequest()
@ -2280,6 +2281,53 @@ class TestLauncher(tests.DBTestCase):
req3 = self.waitForNodeRequest(req3) req3 = self.waitForNodeRequest(req3)
self.assertEqual(req3.state, zk.FAILED) self.assertEqual(req3.state, zk.FAILED)
def test_ignore_provider_quota_true_with_provider_error(self):
'''
Tests that quota errors returned from the provider when allocating a
node are correctly handled and retried. The node request should be
retried until there is sufficient quota to satisfy it.
'''
max_instances = 0
def fake_get_quota():
nonlocal max_instances
return (100, max_instances, 1000000)
self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider.fake_cloud, '_get_quota',
fake_get_quota
))
configfile = self.setup_config('ignore_provider_quota_true.yaml')
self.useBuilder(configfile)
self.waitForImage('fake-provider', 'fake-image')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
# Create a request with ignore-provider-quota set to true that should
# pass regardless of the lack of cloud/provider quota.
self.replace_config(configfile, 'ignore_provider_quota_true.yaml')
self.log.debug(
"Submitting an initial request with ignore-provider-quota True")
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('fake-label')
self.zk.storeNodeRequest(req)
self.waitForAnyNodeInState(zk.ABORTED)
self.assertReportedStat(
'nodepool.launch.provider.fake-provider.error.quota')
self.assertReportedStat(
'nodepool.launch.error.quota')
# Bump up the quota to allow the provider to allocate a node
max_instances = 1
req = self.waitForNodeRequest(req)
self.assertEqual(req.state, zk.FULFILLED)
def test_request_order(self): def test_request_order(self):
"""Test that requests are handled in sorted order""" """Test that requests are handled in sorted order"""
configfile = self.setup_config('node_no_min_ready.yaml') configfile = self.setup_config('node_no_min_ready.yaml')

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an exception that was raised by the OpenStack driver when attempting
to reset the quota timestamp and `ignore-provider-quota` is `true`. This
exception prevented nodepool from seeing quota errors from OpenStack,
causing it to fail the node request instead of retrying as it does for
other providers.