Merge "Fix provider quota calculation"

This commit is contained in:
Zuul
2025-09-12 15:47:42 +00:00
committed by Gerrit Code Review
7 changed files with 304 additions and 58 deletions

View File

@@ -318,7 +318,7 @@ class AwsProvider(BaseProvider, subclass_id='aws'):
def listInstances(self):
return self.endpoint.listInstances()
def getQuotaLimits(self):
def getEndpointLimits(self):
# Get the instance and volume types that this provider handles
limits = self.endpoint.quota_cache.getLimits()
if limits is None:
@@ -395,10 +395,7 @@ class AwsProvider(BaseProvider, subclass_id='aws'):
value *= 1000
args[code] = value
cloud = QuotaInformation(**args)
zuul = QuotaInformation(default=math.inf, **self.resource_limits)
cloud.min(zuul)
return cloud
return QuotaInformation(**args)
def getQuotaForLabel(self, label):
flavor = self.flavors[label.flavor]

View File

@@ -245,16 +245,13 @@ class OpenstackProvider(BaseProvider, subclass_id='openstack'):
def listInstances(self):
return self.endpoint.listInstances()
def getQuotaLimits(self):
def getEndpointLimits(self):
limits = self.endpoint.quota_cache.getLimits()
if limits is None:
limits = {}
else:
limits = limits.quota
cloud = QuotaInformation(default=math.inf, **limits)
zuul = QuotaInformation(default=math.inf, **self.resource_limits)
cloud.min(zuul)
return cloud
return QuotaInformation(default=math.inf, **limits)
def getQuotaForLabel(self, label):
flavor = self.flavors[label.flavor]

View File

@@ -22,6 +22,7 @@ import errno
import fcntl
import hashlib
import logging
import math
import os
import random
import select
@@ -1046,11 +1047,11 @@ class Launcher:
else:
self.statsd_timer = nullcontext
# Raw provider quota
self._provider_quota_cache = cachetools.TTLCache(
# Raw provider quota limits
self._provider_limits_cache = cachetools.TTLCache(
maxsize=8192, ttl=self.MAX_QUOTA_AGE)
# Provider quota - unmanaged usage
self._provider_available_cache = cachetools.TTLCache(
self._provider_quota_cache = cachetools.TTLCache(
maxsize=8192, ttl=self.MAX_QUOTA_AGE)
self.tracing = tracing.Tracing(self.config)
@@ -2699,42 +2700,67 @@ class Launcher:
image_upload = valid_uploads[-1]
return image_upload.external_id
def getEndpointLimits(self, provider):
    """Return the raw endpoint quota limits for ``provider``, with caching.

    The result is looked up in ``self._provider_limits_cache`` under the
    provider's canonical name.  On a miss the limits are fetched via
    ``provider.getEndpointLimits()``, logged at debug level, stored in
    the cache and returned.

    The cached object itself is returned, so callers that intend to
    modify the result should work on a copy.

    :param provider: The provider whose endpoint limits are wanted; it
        must expose ``canonical_name``, ``name`` and
        ``getEndpointLimits()``.
    :returns: Whatever ``provider.getEndpointLimits()`` returned (from
        the cache when available).
    """
    cached = self._provider_limits_cache.get(provider.canonical_name)
    # Test for a cache miss explicitly instead of relying on truthiness:
    # a cached value that happens to evaluate as false must still be
    # served from the cache rather than re-fetched on every call.
    if cached is not None:
        return cached

    limits = provider.getEndpointLimits()
    self.log.debug("Provider endpoint quota limits for %s: %s",
                   provider.name, limits)
    self._provider_limits_cache[provider.canonical_name] = limits
    return limits
def getProviderQuota(self, provider):
val = self._provider_quota_cache.get(provider.canonical_name)
if val:
return val
# This is initialized with the full tenant quota and later becomes
# the quota available for nodepool.
quota = provider.getQuotaLimits()
self.log.debug("Provider quota for %s: %s",
provider.name, quota)
# This is initialized with the full endpoint quota and later
# becomes the quota available for Zuul.
quota = self.getEndpointLimits(provider).copy()
unmanaged = provider.getEndpoint().quota_cache.getUnmanagedUsage()
self.log.debug("Provider endpoint unmanaged quota used for %s: %s",
provider.name, unmanaged)
# Subtract the unmanaged quota usage from the endpoint limits
# to get the quota available for us.
quota.subtract(unmanaged)
self._provider_quota_cache[provider.canonical_name] = quota
return quota
def getProviderQuotaAvailable(self, provider):
val = self._provider_available_cache.get(provider.canonical_name)
if val:
return val
# This is initialized with the full tenant quota and later becomes
# the quota available for nodepool.
def getProviderQuotaAvailable(self, provider, include_requested=False):
# This is initialized with the full provider endpoint quota,
# which is cached and updated every 5 minutes.
quota = self.getProviderQuota(provider).copy()
unmanaged = provider.getEndpoint().quota_cache.getUnmanagedUsage()
self.log.debug("Provider unmanaged quota used for %s: %s",
provider.name, unmanaged)
# Subtract the unmanaged quota usage from nodepool_max
# to get the quota available for us.
quota.subtract(unmanaged)
self._provider_available_cache[provider.canonical_name] = quota
# Subtract the quota used by other providers on the same
# endpoint.
other = model.QuotaInformation()
for other_provider in self._getUniqueProviders():
if (other_provider.endpoint != provider.endpoint
or other_provider.canonical_name == provider.canonical_name):
continue
other.add(self.api.nodes_cache.getQuota(
other_provider, include_requested=include_requested))
quota.subtract(other)
self.log.debug("Provider endpoint other quota used for %s: %s",
provider.name, other)
# Restrict quota limits based on our provider limits
provider_limits = model.QuotaInformation(
default=math.inf, **provider.resource_limits
)
quota.min(provider_limits)
return quota
def getQuotaPercentage(self, provider, messages):
try:
# This is cached and updated every 5 minutes
total = self.getProviderQuotaAvailable(provider).copy()
total = self.getProviderQuotaAvailable(
provider, include_requested=True)
except Exception:
# This will emit an annotated log message, but no traceback
messages.append("Unable to get provider quota")
@@ -2772,22 +2798,27 @@ class Launcher:
def doesProviderHaveQuotaForLabel(self, provider, label, messages,
include_usage=True):
if include_usage:
total = self.getProviderQuotaAvailable(provider).copy()
# When include_usage is True, we include requested nodes here
# because this is called to decide whether to add a new request
# to the provider, so we should include other nodes we've
# already allocated to providers in the decision.
total = self.getProviderQuotaAvailable(
provider, include_requested=True)
messages.append(
f"Provider {provider} quota available before Zuul: {total}")
# We include requested nodes here because this is called
# to decide whether to add a new request to the provider,
# so we should include other nodes we've already allocated
# to this provider in the decision.
f"Provider quota available before {provider}: {total}")
used = self.api.nodes_cache.getQuota(
provider, include_requested=True)
total.subtract(used)
messages.append(
f"Provider {provider} quota available including Zuul: {total}")
f"Provider quota available including {provider}: {total}")
else:
total = self.getProviderQuota(provider).copy()
messages.append(
f"Provider {provider} quota before Zuul: {total}")
total = self.getEndpointLimits(provider).copy()
# Restrict quota limits based on our provider limits
provider_limits = model.QuotaInformation(
default=math.inf, **provider.resource_limits
)
total.min(provider_limits)
messages.append(f"Provider {provider} limits: {total}")
label_quota = provider.getQuotaForLabel(label)
total.subtract(label_quota)
@@ -2796,15 +2827,16 @@ class Launcher:
return total.nonNegative()
def doesProviderHaveQuotaForNode(self, provider, node, messages):
total = self.getProviderQuotaAvailable(provider).copy()
messages.append(f"Provider {provider} quota before Zuul: {total}")
# We do not include requested nodes here because this is
# called to decide whether to issue the create API call for a
# node already allocated to the provider. We only want to
# "pause" the provider if it really is at quota.
total = self.getProviderQuotaAvailable(
provider, include_requested=False)
messages.append(f"Provider quota before {provider}: {total}")
used = self.api.nodes_cache.getQuota(provider, include_requested=False)
total.subtract(used)
messages.append(f"Provider {provider} quota including Zuul: {total}")
messages.append(f"Provider quota including {provider}: {total}")
total.subtract(node.quota)
messages.append(f"Node {node} required quota: {node.quota}")
return total.nonNegative()
@@ -2873,7 +2905,13 @@ class Launcher:
providers[tenant_provider.canonical_name] = tenant_provider
for provider in providers.values():
safe_pname = normalize_statsd_name(provider.canonical_name)
limits = self.getProviderQuota(provider).getResources()
quota = self.getEndpointLimits(provider).copy()
# Restrict quota limits based on our provider limits
provider_limits = model.QuotaInformation(
default=math.inf, **provider.resource_limits
)
quota.min(provider_limits)
limits = quota.getResources()
# zuul.provider.<provider>.limit.<resource> gauge
for res, value in limits.items():
safe_res = normalize_statsd_name(res)

View File

@@ -611,8 +611,8 @@ class BaseProvider(zkobject.PolymorphicZKObjectMixin,
"""
raise NotImplementedError()
def getQuotaLimits(self):
"""Return the quota limits for this provider
def getEndpointLimits(self):
"""Return the endpoint quota limits for this provider
The default implementation returns a simple QuotaInformation
with no limits. Override this to provide accurate