Launcher: consider provider capacity before assigning nodes

Do not assign nodes to a provider which does not have the capacity
for that node at all (ignoring Zuul usage).  In other words,
if the provider has a resource limit lower than that required by
a node (for example, 0), don't use it.

Change-Id: I918bdf6fdd3454be83ea1868dd2d0e6e454bbf61
This commit is contained in:
James E. Blair
2025-06-05 15:36:32 -07:00
parent defdcec21b
commit d17d4ff3aa
2 changed files with 59 additions and 8 deletions

View File

@ -1126,6 +1126,17 @@ class TestLauncher(LauncherBaseTestCase):
with testtools.ExpectedException(Exception):
self.requestNodes(["debian-normal"])
# Launcher test: the 'test_launcher' driver is configured with an
# 'instances' quota of 0, and a request for a single "debian-normal"
# node is expected to end up in the FAILED state.
@simple_layout('layouts/nodepool.yaml',
enable_nodepool=True)
@driver_config('test_launcher', quotas={
'instances': 0,
})
def test_quota_insufficient_capacity(self):
# Test that we fail requests which are impossible to satisfy
self.waitUntilSettled()
# NOTE(review): presumably requestNodes() returns only once the
# request has reached a final state -- confirm against the helper.
request = self.requestNodes(["debian-normal"])
# With zero instances allowed, the request must be marked FAILED.
self.assertEqual(request.state, model.NodesetRequest.State.FAILED)
@simple_layout('layouts/nodepool-nodescan.yaml', enable_nodepool=True)
@okay_tracebacks('_checkNodescanRequest')
@mock.patch('paramiko.transport.Transport')
@ -1278,6 +1289,8 @@ class TestLauncher(LauncherBaseTestCase):
# Make sure the next requests always have current quota info
self.launcher._provider_quota_cache = cachetools.TTLCache(
maxsize=8192, ttl=0)
self.launcher._provider_available_cache = cachetools.TTLCache(
maxsize=8192, ttl=0)
requests = []
ctx = self.createZKContext(None)

View File

@ -969,8 +969,12 @@ class Launcher:
else:
self.statsd_timer = nullcontext
# Raw provider quota
self._provider_quota_cache = cachetools.TTLCache(
maxsize=8192, ttl=self.MAX_QUOTA_AGE)
# Provider quota - unmanaged usage
self._provider_available_cache = cachetools.TTLCache(
maxsize=8192, ttl=self.MAX_QUOTA_AGE)
self.tracing = tracing.Tracing(self.config)
self.zk_client = ZooKeeperClient.fromConfig(self.config)
@ -1287,6 +1291,21 @@ class Launcher:
)
if not any(valid_uploads):
continue
# Check if the provider could possibly handle the
# request based on quota but not current zuul usage.
# TODO: consider the impact of a multi-node request
# for the same label where that single request is
# larger than the capacity.
try:
if not self.doesProviderHaveQuotaForLabel(
provider, label, log, include_usage=False):
continue
except Exception:
self.log.exception(
"Error checking quota for label %s "
"in provider %s", label, provider)
raise NodesetRequestError(
"Unable to determine quota")
providers_for_label[i].append(provider)
providers_for_all_labels &= set(providers_for_label[i])
@ -2284,6 +2303,17 @@ class Launcher:
self.log.debug("Provider quota for %s: %s",
provider.name, quota)
self._provider_quota_cache[provider.canonical_name] = quota
return quota
def getProviderQuotaAvailable(self, provider):
val = self._provider_available_cache.get(provider.canonical_name)
if val:
return val
# This is initialized with the full tenant quota and later becomes
# the quota available for nodepool.
quota = self.getProviderQuota(provider).copy()
unmanaged = self.getUnmanagedQuotaUsed(provider)
self.log.debug("Provider unmanaged quota used for %s: %s",
provider.name, unmanaged)
@ -2291,12 +2321,12 @@ class Launcher:
# Subtract the unmanaged quota usage from nodepool_max
# to get the quota available for us.
quota.subtract(unmanaged)
self._provider_quota_cache[provider.canonical_name] = quota
self._provider_available_cache[provider.canonical_name] = quota
return quota
def getQuotaPercentage(self, provider):
# This is cached and updated every 5 minutes
total = self.getProviderQuota(provider).copy()
total = self.getProviderQuotaAvailable(provider).copy()
# This is continuously updated in the background
used = self.api.nodes_cache.getQuota(provider)
pct = 0.0
@ -2319,18 +2349,26 @@ class Launcher:
pct = round(pct, 1)
return pct
def doesProviderHaveQuotaForLabel(self, provider, label, log):
total = self.getProviderQuota(provider).copy()
log.debug("Provider %s quota before Zuul: %s", provider, total)
total.subtract(self.getQuotaUsed(provider))
log.debug("Provider %s quota including Zuul: %s", provider, total)
# Return whether the provider's quota can accommodate the quota
# required by `label`.
#
# provider: the provider whose quota is examined.
# label: the label whose required quota comes from
#     provider.getQuotaForLabel().
# log: logger used for the debug messages emitted here.
# include_usage: when True (the default), start from
#     getProviderQuotaAvailable() and subtract getQuotaUsed(); when
#     False, start from getProviderQuota() and subtract no usage.
#
# Returns the result of nonNegative() on the remaining quota after the
# label's requirement has been subtracted.
def doesProviderHaveQuotaForLabel(self, provider, label, log,
include_usage=True):
if include_usage:
# copy() so the subtract() calls below do not alter the object
# handed back (and cached) by the getter.
total = self.getProviderQuotaAvailable(provider).copy()
log.debug("Provider %s quota available before Zuul: %s",
provider, total)
total.subtract(self.getQuotaUsed(provider))
log.debug("Provider %s quota available including Zuul: %s",
provider, total)
else:
# Capacity-only check: begin from the value of getProviderQuota()
# and do not subtract any usage.
total = self.getProviderQuota(provider).copy()
log.debug("Provider %s quota before Zuul: %s", provider, total)
label_quota = provider.getQuotaForLabel(label)
# Deduct what the label needs; the nonNegative() result below is the
# answer returned to the caller.
total.subtract(label_quota)
log.debug("Label %s required quota: %s", label, label_quota)
return total.nonNegative()
def doesProviderHaveQuotaForNode(self, provider, node, log):
total = self.getProviderQuota(provider).copy()
total = self.getProviderQuotaAvailable(provider).copy()
log.debug("Provider %s quota before Zuul: %s", provider, total)
total.subtract(self.getQuotaUsed(provider))
log.debug("Provider %s quota including Zuul: %s", provider, total)