Merge "Do not reset quota cache timestamp when invalid"

This commit is contained in:
Zuul 2022-05-19 15:46:57 +00:00 committed by Gerrit Code Review
commit 7129681181
5 changed files with 81 additions and 12 deletions

View File

@ -105,8 +105,7 @@ class FakeOpenStackCloud(object):
]
self._azs = ['az1', 'az2']
self._server_list = []
self.max_cores, self.max_instances, self.max_ram = FakeOpenStackCloud.\
_get_quota()
self._update_quota()
self._down_ports = [
Dummy(Dummy.PORT, id=uuid.uuid4().hex, status='DOWN',
device_owner="compute:nova"),
@ -114,6 +113,10 @@ class FakeOpenStackCloud(object):
device_owner=None),
]
def _update_quota(self):
    # Re-read the class-level quota source (which tests monkeypatch via
    # MockPatchObject) so subsequent checks observe the current limits.
    limits = FakeOpenStackCloud._get_quota()
    self.max_cores, self.max_instances, self.max_ram = limits
def _get(self, name_or_id, instance_list):
self.log.debug("Get %s in %s" % (name_or_id, repr(instance_list)))
for instance in instance_list:
@ -162,6 +165,7 @@ class FakeOpenStackCloud(object):
private_v4 = 'fake'
host_id = 'fake'
interface_ip = 'fake'
self._update_quota()
over_quota = False
if (instance_type == Dummy.INSTANCE and
self.max_instances > -1 and
@ -292,6 +296,7 @@ class FakeOpenStackCloud(object):
return self._azs.copy()
def get_compute_limits(self):
self._update_quota()
return Dummy(
'limits',
max_total_cores=self.max_cores,

View File

@ -259,7 +259,8 @@ class QuotaSupport:
def __init__(self, *args, **kw):
super().__init__(*args, **kw)
self._current_nodepool_quota = None
self._current_nodepool_quota_timestamp = 0
self._current_nodepool_quota = {}
@abc.abstractmethod
def quotaNeededByLabel(self, label, pool):
@ -291,7 +292,7 @@ class QuotaSupport:
pass
def invalidateQuotaCache(self):
self._current_nodepool_quota['timestamp'] = 0
self._current_nodepool_quota_timestamp = 0
def estimatedNodepoolQuota(self):
'''
@ -307,8 +308,8 @@ class QuotaSupport:
if self._current_nodepool_quota:
now = time.time()
if now < self._current_nodepool_quota['timestamp'] + MAX_QUOTA_AGE:
return copy.deepcopy(self._current_nodepool_quota['quota'])
if now < self._current_nodepool_quota_timestamp + MAX_QUOTA_AGE:
return copy.deepcopy(self._current_nodepool_quota)
# This is initialized with the full tenant quota and later becomes
# the quota available for nodepool.
@ -318,7 +319,7 @@ class QuotaSupport:
if self._current_nodepool_quota:
self.log.exception("Unable to get provider quota, "
"using cached value")
return copy.deepcopy(self._current_nodepool_quota['quota'])
return copy.deepcopy(self._current_nodepool_quota)
raise
self.log.debug("Provider quota for %s: %s",
@ -328,10 +329,8 @@ class QuotaSupport:
# to get the quota available for us.
nodepool_quota.subtract(self.unmanagedQuotaUsed())
self._current_nodepool_quota = {
'quota': nodepool_quota,
'timestamp': time.time()
}
self._current_nodepool_quota = nodepool_quota
self._current_nodepool_quota_timestamp = time.time()
self.log.debug("Available quota for %s: %s",
self.provider.name, nodepool_quota)

View File

@ -583,6 +583,15 @@ class DBTestCase(BaseTestCase):
self.wait_for_threads()
return ready_nodes[label]
def waitForAnyNodeInState(self, state):
    """Poll the ZooKeeper node list until any node reaches *state*.

    Returns the first matching node; iterate_timeout raises after one
    minute if no node ever reaches the requested state.
    """
    for _attempt in iterate_timeout(
            ONE_MINUTE, Exception,
            f"Timeout waiting for node in {state} state",
            interval=1):
        match = next(
            (node for node in self.zk.nodeIterator(False)
             if node.state == state),
            None)
        if match is not None:
            return match
def waitForNodeRequest(self, req, states=None):
'''
Wait for a node request to transition to a final state.

View File

@ -403,6 +403,7 @@ class TestLauncher(tests.DBTestCase):
# patch the cloud with requested quota
def fake_get_quota():
nonlocal max_cores, max_instances, max_ram
return (max_cores, max_instances, max_ram)
self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider.fake_cloud, '_get_quota',
@ -435,7 +436,7 @@ class TestLauncher(tests.DBTestCase):
# Now, reduce the quota so the next node unexpectedly
# (according to nodepool's quota estimate) fails.
client.max_instances = 1
max_instances = 1
# Request a second node; this request should pause the handler.
req2 = zk.NodeRequest()
@ -2280,6 +2281,53 @@ class TestLauncher(tests.DBTestCase):
req3 = self.waitForNodeRequest(req3)
self.assertEqual(req3.state, zk.FAILED)
def test_ignore_provider_quota_true_with_provider_error(self):
'''
Tests that quota errors returned from the provider when allocating a
node are correctly handled and retried. The node request should be
retried until there is sufficient quota to satisfy it.
'''
# Start with zero instance quota so the first launch attempt fails
# on the provider side; the closure reads max_instances via nonlocal
# so the test can raise the quota later without re-patching.
max_instances = 0
def fake_get_quota():
nonlocal max_instances
return (100, max_instances, 1000000)
# Patch the fake cloud so every quota query goes through the closure.
self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider.fake_cloud, '_get_quota',
fake_get_quota
))
configfile = self.setup_config('ignore_provider_quota_true.yaml')
self.useBuilder(configfile)
self.waitForImage('fake-provider', 'fake-image')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
# Create a request with ignore-provider-quota set to true that should
# pass nodepool's own quota estimate regardless of the lack of
# cloud/provider quota.
self.replace_config(configfile, 'ignore_provider_quota_true.yaml')
self.log.debug(
"Submitting an initial request with ignore-provider-quota True")
req = zk.NodeRequest()
req.state = zk.REQUESTED
req.node_types.append('fake-label')
self.zk.storeNodeRequest(req)
# The provider rejects the launch (max_instances == 0), so the node
# is aborted and the quota error stats are emitted.
self.waitForAnyNodeInState(zk.ABORTED)
self.assertReportedStat(
'nodepool.launch.provider.fake-provider.error.quota')
self.assertReportedStat(
'nodepool.launch.error.quota')
# Bump up the quota to allow the provider to allocate a node; the
# paused/retried request should now be fulfilled.
max_instances = 1
req = self.waitForNodeRequest(req)
self.assertEqual(req.state, zk.FULFILLED)
def test_request_order(self):
"""Test that requests are handled in sorted order"""
configfile = self.setup_config('node_no_min_ready.yaml')

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an exception that was raised by the OpenStack driver when attempting
to reset the quota cache timestamp while `ignore-provider-quota` is `true`. This
exception prevented nodepool from seeing quota errors from OpenStack,
causing it to fail the node request instead of retrying as it does for
other providers.