Merge "Fix provider quota calculation"

2025-09-12 15:47:42 +00:00
parent 3b78b3666a e58ec9295d
commit 36b70d0cca
7 changed files with 304 additions and 58 deletions
--- a/zuul/driver/aws/awsprovider.py
+++ b/zuul/driver/aws/awsprovider.py
@@ -318,7 +318,7 @@ class AwsProvider(BaseProvider, subclass_id='aws'):
    def listInstances(self):
        return self.endpoint.listInstances()

-    def getQuotaLimits(self):
+    def getEndpointLimits(self):
        # Get the instance and volume types that this provider handles
        limits = self.endpoint.quota_cache.getLimits()
        if limits is None:
@@ -395,10 +395,7 @@ class AwsProvider(BaseProvider, subclass_id='aws'):
                    value *= 1000
                args[code] = value

-        cloud = QuotaInformation(**args)
-        zuul = QuotaInformation(default=math.inf, **self.resource_limits)
-        cloud.min(zuul)
-        return cloud
+        return QuotaInformation(**args)

    def getQuotaForLabel(self, label):
        flavor = self.flavors[label.flavor]
--- a/zuul/driver/openstack/openstackprovider.py
+++ b/zuul/driver/openstack/openstackprovider.py
@@ -245,16 +245,13 @@ class OpenstackProvider(BaseProvider, subclass_id='openstack'):
    def listInstances(self):
        return self.endpoint.listInstances()

-    def getQuotaLimits(self):
+    def getEndpointLimits(self):
        limits = self.endpoint.quota_cache.getLimits()
        if limits is None:
            limits = {}
        else:
            limits = limits.quota
-        cloud = QuotaInformation(default=math.inf, **limits)
-        zuul = QuotaInformation(default=math.inf, **self.resource_limits)
-        cloud.min(zuul)
-        return cloud
+        return QuotaInformation(default=math.inf, **limits)

    def getQuotaForLabel(self, label):
        flavor = self.flavors[label.flavor]
--- a/zuul/launcher/server.py
+++ b/zuul/launcher/server.py
@@ -22,6 +22,7 @@ import errno
 import fcntl
 import hashlib
 import logging
+import math
 import os
 import random
 import select
@@ -1046,11 +1047,11 @@ class Launcher:
        else:
            self.statsd_timer = nullcontext

-        # Raw provider quota
-        self._provider_quota_cache = cachetools.TTLCache(
+        # Raw provider quota limits
+        self._provider_limits_cache = cachetools.TTLCache(
            maxsize=8192, ttl=self.MAX_QUOTA_AGE)
        # Provider quota - unmanaged usage
-        self._provider_available_cache = cachetools.TTLCache(
+        self._provider_quota_cache = cachetools.TTLCache(
            maxsize=8192, ttl=self.MAX_QUOTA_AGE)

        self.tracing = tracing.Tracing(self.config)
@@ -2699,42 +2700,67 @@ class Launcher:
        image_upload = valid_uploads[-1]
        return image_upload.external_id

+    def getEndpointLimits(self, provider):
+        val = self._provider_limits_cache.get(provider.canonical_name)
+        if val:
+            return val
+
+        quota = provider.getEndpointLimits()
+        self.log.debug("Provider endpoint quota limits for %s: %s",
+                       provider.name, quota)
+
+        self._provider_limits_cache[provider.canonical_name] = quota
+        return quota
+
    def getProviderQuota(self, provider):
        val = self._provider_quota_cache.get(provider.canonical_name)
        if val:
            return val

-        # This is initialized with the full tenant quota and later becomes
-        # the quota available for nodepool.
-        quota = provider.getQuotaLimits()
-        self.log.debug("Provider quota for %s: %s",
-                       provider.name, quota)
+        # This is initialized with the full endpoint quota and later
+        # becomes the quota available for Zuul.
+        quota = self.getEndpointLimits(provider).copy()
+
+        unmanaged = provider.getEndpoint().quota_cache.getUnmanagedUsage()
+        self.log.debug("Provider endpoint unmanaged quota used for %s: %s",
+                       provider.name, unmanaged)
+        # Subtract the unmanaged quota usage from the endpoint limits
+        # to get the quota available for Zuul.
+        quota.subtract(unmanaged)

        self._provider_quota_cache[provider.canonical_name] = quota
        return quota

-    def getProviderQuotaAvailable(self, provider):
-        val = self._provider_available_cache.get(provider.canonical_name)
-        if val:
-            return val
-
-        # This is initialized with the full tenant quota and later becomes
-        # the quota available for nodepool.
+    def getProviderQuotaAvailable(self, provider, include_requested=False):
+        # This is initialized with the full provider endpoint quota,
+        # which is cached and updated every 5 minutes.
        quota = self.getProviderQuota(provider).copy()
-        unmanaged = provider.getEndpoint().quota_cache.getUnmanagedUsage()
-        self.log.debug("Provider unmanaged quota used for %s: %s",
-                       provider.name, unmanaged)

-        # Subtract the unmanaged quota usage from nodepool_max
-        # to get the quota available for us.
-        quota.subtract(unmanaged)
-        self._provider_available_cache[provider.canonical_name] = quota
+        # Subtract the quota used by other providers on the same
+        # endpoint.
+        other = model.QuotaInformation()
+        for other_provider in self._getUniqueProviders():
+            if (other_provider.endpoint != provider.endpoint
+                or other_provider.canonical_name == provider.canonical_name):
+                continue
+            other.add(self.api.nodes_cache.getQuota(
+                other_provider, include_requested=include_requested))
+
+        quota.subtract(other)
+        self.log.debug("Provider endpoint other quota used for %s: %s",
+                       provider.name, other)
+
+        # Restrict quota limits based on our provider limits
+        provider_limits = model.QuotaInformation(
+            default=math.inf, **provider.resource_limits
+        )
+        quota.min(provider_limits)
        return quota

    def getQuotaPercentage(self, provider, messages):
        try:
-            # This is cached and updated every 5 minutes
-            total = self.getProviderQuotaAvailable(provider).copy()
+            total = self.getProviderQuotaAvailable(
+                provider, include_requested=True)
        except Exception:
            # This will emit an annotated log message, but no traceback
            messages.append("Unable to get provider quota")
@@ -2772,22 +2798,27 @@ class Launcher:
    def doesProviderHaveQuotaForLabel(self, provider, label, messages,
                                      include_usage=True):
        if include_usage:
-            total = self.getProviderQuotaAvailable(provider).copy()
+            # When include_usage is True, we include requested nodes here
+            # because this is called to decide whether to add a new request
+            # to the provider, so we should include other nodes we've
+            # already allocated to providers in the decision.
+            total = self.getProviderQuotaAvailable(
+                provider, include_requested=True)
            messages.append(
-                f"Provider {provider} quota available before Zuul: {total}")
-            # We include requested nodes here because this is called
-            # to decide whether to add a new request to the provider,
-            # so we should include other nodes we've already allocated
-            # to this provider in the decision.
+                f"Provider quota available before {provider}: {total}")
            used = self.api.nodes_cache.getQuota(
                provider, include_requested=True)
            total.subtract(used)
            messages.append(
-                f"Provider {provider} quota available including Zuul: {total}")
+                f"Provider quota available including {provider}: {total}")
        else:
-            total = self.getProviderQuota(provider).copy()
-            messages.append(
-                f"Provider {provider} quota before Zuul: {total}")
+            total = self.getEndpointLimits(provider).copy()
+            # Restrict quota limits based on our provider limits
+            provider_limits = model.QuotaInformation(
+                default=math.inf, **provider.resource_limits
+            )
+            total.min(provider_limits)
+            messages.append(f"Provider {provider} limits: {total}")

        label_quota = provider.getQuotaForLabel(label)
        total.subtract(label_quota)
@@ -2796,15 +2827,16 @@ class Launcher:
        return total.nonNegative()

    def doesProviderHaveQuotaForNode(self, provider, node, messages):
-        total = self.getProviderQuotaAvailable(provider).copy()
-        messages.append(f"Provider {provider} quota before Zuul: {total}")
        # We do not include requested nodes here because this is
        # called to decide whether to issue the create API call for a
        # node already allocated to the provider.  We only want to
        # "pause" the provider if it really is at quota.
+        total = self.getProviderQuotaAvailable(
+            provider, include_requested=False)
+        messages.append(f"Provider quota before {provider}: {total}")
        used = self.api.nodes_cache.getQuota(provider, include_requested=False)
        total.subtract(used)
-        messages.append(f"Provider {provider} quota including Zuul: {total}")
+        messages.append(f"Provider quota including {provider}: {total}")
        total.subtract(node.quota)
        messages.append(f"Node {node} required quota: {node.quota}")
        return total.nonNegative()
@@ -2873,7 +2905,13 @@ class Launcher:
                providers[tenant_provider.canonical_name] = tenant_provider
        for provider in providers.values():
            safe_pname = normalize_statsd_name(provider.canonical_name)
-            limits = self.getProviderQuota(provider).getResources()
+            quota = self.getEndpointLimits(provider).copy()
+            # Restrict quota limits based on our provider limits
+            provider_limits = model.QuotaInformation(
+                default=math.inf, **provider.resource_limits
+            )
+            quota.min(provider_limits)
+            limits = quota.getResources()
            # zuul.provider.<provider>.limit.<resource> gauge
            for res, value in limits.items():
                safe_res = normalize_statsd_name(res)
--- a/zuul/provider/__init__.py
+++ b/zuul/provider/__init__.py
@@ -611,8 +611,8 @@ class BaseProvider(zkobject.PolymorphicZKObjectMixin,
        """
        raise NotImplementedError()

-    def getQuotaLimits(self):
-        """Return the quota limits for this provider
+    def getEndpointLimits(self):
+        """Return the endpoint quota limits for this provider

        The default implementation returns a simple QuotaInformation
        with no limits.  Override this to provide accurate