Add provider/pool priority support

This lets users configure providers which should fulfill requests before other providers. This facilitates using a less expensive cloud before using a more expensive one. The default priority is 100, to facilitate either raising above or lowering below the default (while using only positive integers in order to avoid confusion). Change-Id: I969ea821e10a7773a0a8d135a4f13407319362ee
2022-05-11 14:41:27 -07:00
parent a612aa603c
commit ea35fd5152
18 changed files with 349 additions and 17 deletions
--- a/doc/source/aws.rst
+++ b/doc/source/aws.rst
@@ -353,6 +353,20 @@ Selecting the ``aws`` driver adds the following options to the

         A unique name within the provider for this pool of resources.

+      .. attr:: priority
+         :type: int
+         :default: 100
+
+         The priority of this provider pool (a lesser number is a higher
+         priority).  Nodepool launchers will yield requests to other
+         provider pools with a higher priority as long as they are not
+         paused.  This means that in general, higher priority pools will
+         reach quota first before lower priority pools begin to be used.
+
+         This setting may be specified at the provider level in order
+         to apply to all pools within that provider, or it can be
+         overridden here for a specific pool.
+
      .. attr:: node-attributes
         :type: dict

--- a/doc/source/azure.rst
+++ b/doc/source/azure.rst
@@ -484,6 +484,20 @@ section of the configuration.

          A unique name within the provider for this pool of resources.

+       .. attr:: priority
+          :type: int
+          :default: 100
+
+          The priority of this provider pool (a lesser number is a higher
+          priority).  Nodepool launchers will yield requests to other
+          provider pools with a higher priority as long as they are not
+          paused.  This means that in general, higher priority pools will
+          reach quota first before lower priority pools begin to be used.
+
+          This setting may be specified at the provider level in order
+          to apply to all pools within that provider, or it can be
+          overridden here for a specific pool.
+
       .. attr:: node-attributes
          :type: dict

--- a/doc/source/configuration.rst
+++ b/doc/source/configuration.rst
@@ -459,6 +459,19 @@ Options
      thread, this can be useful for limiting the number of threads
      used by the nodepool-launcher daemon.

+   .. attr:: priority
+      :type: int
+      :default: 100
+
+      The priority of this provider (a lesser number is a higher
+      priority).  Nodepool launchers will yield requests to other
+      provider pools with a higher priority as long as they are not
+      paused.  This means that in general, higher priority pools will
+      reach quota first before lower priority pools begin to be used.
+
+      This setting provides the default for each provider pool, but
+      the value can be overidden in the pool configuration.
+
   .. attr:: driver
      :type: string
      :default: openstack
--- a/doc/source/gce.rst
+++ b/doc/source/gce.rst
@@ -195,6 +195,20 @@ section of the configuration.

         A unique name within the provider for this pool of resources.

+      .. attr:: priority
+         :type: int
+         :default: 100
+
+         The priority of this provider pool (a lesser number is a higher
+         priority).  Nodepool launchers will yield requests to other
+         provider pools with a higher priority as long as they are not
+         paused.  This means that in general, higher priority pools will
+         reach quota first before lower priority pools begin to be used.
+
+         This setting may be specified at the provider level in order
+         to apply to all pools within that provider, or it can be
+         overridden here for a specific pool.
+
      .. attr:: node-attributes
         :type: dict

--- a/doc/source/ibmvpc.rst
+++ b/doc/source/ibmvpc.rst
@@ -567,6 +567,20 @@ section of the configuration.
          A unique name within the provider for this pool of
          resources.  The name may not contain underscores.

+       .. attr:: priority
+          :type: int
+          :default: 100
+
+          The priority of this provider pool (a lesser number is a higher
+          priority).  Nodepool launchers will yield requests to other
+          provider pools with a higher priority as long as they are not
+          paused.  This means that in general, higher priority pools will
+          reach quota first before lower priority pools begin to be used.
+
+          This setting may be specified at the provider level in order
+          to apply to all pools within that provider, or it can be
+          overridden here for a specific pool.
+
       .. attr:: zone

          Name of the IBM Cloud region zone to interact with (e.g.,
--- a/doc/source/kubernetes.rst
+++ b/doc/source/kubernetes.rst
@@ -76,6 +76,20 @@ Selecting the kubernetes driver adds the following options to the

         Namespaces are prefixed with the pool's name.

+      .. attr:: priority
+         :type: int
+         :default: 100
+
+         The priority of this provider pool (a lesser number is a higher
+         priority).  Nodepool launchers will yield requests to other
+         provider pools with a higher priority as long as they are not
+         paused.  This means that in general, higher priority pools will
+         reach quota first before lower priority pools begin to be used.
+
+         This setting may be specified at the provider level in order
+         to apply to all pools within that provider, or it can be
+         overridden here for a specific pool.
+
      .. attr:: node-attributes
         :type: dict

--- a/doc/source/metastatic.rst
+++ b/doc/source/metastatic.rst
@@ -71,6 +71,20 @@ itself, which is "meta".

          A unique name within the provider for this pool of resources.

+       .. attr:: priority
+          :type: int
+          :default: 100
+
+          The priority of this provider pool (a lesser number is a higher
+          priority).  Nodepool launchers will yield requests to other
+          provider pools with a higher priority as long as they are not
+          paused.  This means that in general, higher priority pools will
+          reach quota first before lower priority pools begin to be used.
+
+          This setting may be specified at the provider level in order
+          to apply to all pools within that provider, or it can be
+          overridden here for a specific pool.
+
       .. attr:: max-servers
          :type: int

--- a/doc/source/openshift-pods.rst
+++ b/doc/source/openshift-pods.rst
@@ -61,6 +61,20 @@ Selecting the openshift pods driver adds the following options to the

         The project's (namespace) name that will be used to create the pods.

+      .. attr:: priority
+         :type: int
+         :default: 100
+
+         The priority of this provider pool (a lesser number is a higher
+         priority).  Nodepool launchers will yield requests to other
+         provider pools with a higher priority as long as they are not
+         paused.  This means that in general, higher priority pools will
+         reach quota first before lower priority pools begin to be used.
+
+         This setting may be specified at the provider level in order
+         to apply to all pools within that provider, or it can be
+         overridden here for a specific pool.
+
      .. attr:: node-attributes
         :type: dict

--- a/doc/source/openshift.rst
+++ b/doc/source/openshift.rst
@@ -78,6 +78,20 @@ Selecting the openshift driver adds the following options to the

         Project's name are prefixed with the pool's name.

+      .. attr:: priority
+         :type: int
+         :default: 100
+
+         The priority of this provider pool (a lesser number is a higher
+         priority).  Nodepool launchers will yield requests to other
+         provider pools with a higher priority as long as they are not
+         paused.  This means that in general, higher priority pools will
+         reach quota first before lower priority pools begin to be used.
+
+         This setting may be specified at the provider level in order
+         to apply to all pools within that provider, or it can be
+         overridden here for a specific pool.
+
      .. attr:: node-attributes
         :type: dict

--- a/doc/source/openstack.rst
+++ b/doc/source/openstack.rst
@@ -422,6 +422,20 @@ Selecting the OpenStack driver adds the following options to the

        Pool name

+     .. attr:: priority
+        :type: int
+        :default: 100
+
+        The priority of this provider pool (a lesser number is a higher
+        priority).  Nodepool launchers will yield requests to other
+        provider pools with a higher priority as long as they are not
+        paused.  This means that in general, higher priority pools will
+        reach quota first before lower priority pools begin to be used.
+
+        This setting may be specified at the provider level in order
+        to apply to all pools within that provider, or it can be
+        overridden here for a specific pool.
+
     .. attr:: node-attributes
        :type: dict

--- a/doc/source/static.rst
+++ b/doc/source/static.rst
@@ -60,6 +60,20 @@ Selecting the static driver adds the following options to the

         Pool name

+      .. attr:: priority
+         :type: int
+         :default: 100
+
+         The priority of this provider pool (a lesser number is a higher
+         priority).  Nodepool launchers will yield requests to other
+         provider pools with a higher priority as long as they are not
+         paused.  This means that in general, higher priority pools will
+         reach quota first before lower priority pools begin to be used.
+
+         This setting may be specified at the provider level in order
+         to apply to all pools within that provider, or it can be
+         overridden here for a specific pool.
+
      .. attr:: node-attributes
         :type: dict

--- a/nodepool/driver/init.py
+++ b/nodepool/driver/init.py
@@ -860,6 +860,7 @@ class ConfigPool(ConfigValue, metaclass=abc.ABCMeta):
        self.labels = {}
        self.max_servers = math.inf
        self.node_attributes = None
+        self.priority = None

    @classmethod
    def getCommonSchemaDict(self):
@@ -876,6 +877,7 @@ class ConfigPool(ConfigValue, metaclass=abc.ABCMeta):
        return {
            'max-servers': int,
            'node-attributes': dict,
+            'priority': int,
        }

    @abc.abstractmethod
@@ -894,6 +896,7 @@ class ConfigPool(ConfigValue, metaclass=abc.ABCMeta):
        '''
        self.max_servers = pool_config.get('max-servers', math.inf)
        self.node_attributes = pool_config.get('node-attributes')
+        self.priority = pool_config.get('priority', None)


 class DriverConfig(ConfigValue):
@@ -913,13 +916,15 @@ class ProviderConfig(ConfigValue, metaclass=abc.ABCMeta):
        self.driver = DriverConfig()
        self.driver.name = provider.get('driver', 'openstack')
        self.max_concurrency = provider.get('max-concurrency', -1)
+        self.priority = provider.get('priority', None)

    @classmethod
    def getCommonSchemaDict(self):
        return {
            v.Required('name'): str,
            'driver': str,
-            'max-concurrency': int
+            'max-concurrency': int,
+            'priority': int,
        }

    @property
--- a/nodepool/driver/ibmvpc/config.py
+++ b/nodepool/driver/ibmvpc/config.py
@@ -297,12 +297,11 @@ class IBMVPCProviderConfig(ProviderConfig):
            'ram': int,
        }

-        provider = {
+        provider = ProviderConfig.getCommonSchemaDict()
+        provider.update({
            # Normally provided by superclass; we override to add the
            # underscore restriction
            v.Required('name'): v.Match(r'^[^_]+$'),
-            'driver': str,
-            'max-concurrency': int,
            # For this class
            v.Required('pools'): [pool],
            v.Required('vpc'): str,
@@ -322,7 +321,7 @@ class IBMVPCProviderConfig(ProviderConfig):
            'launch-retries': int,
            'credentials-file': str,
            'object-storage': object_storage,
-        }
+        })
        return v.Schema(provider)

    def getSupportedLabels(self, pool_name=None):
--- a/nodepool/launcher.py
+++ b/nodepool/launcher.py
@@ -81,6 +81,17 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
                                         uuid.uuid4().hex)
        stats.StatsReporter.__init__(self)

+    def getPriority(self):
+        pool = self.getPoolConfig()
+        provider = self.getProviderConfig()
+        if pool.priority is not None:
+            priority = pool.priority
+        elif provider.priority is not None:
+            priority = provider.priority
+        else:
+            priority = 100
+        return priority
+
    # ---------------------------------------------------------------
    # Private methods
    # ---------------------------------------------------------------
@@ -168,21 +179,37 @@ class PoolWorker(threading.Thread, stats.StatsReporter):

            log = get_annotated_logger(self.log, event_id=req.event_id,
                                       node_request_id=req.id)
+            # Get the candidate launchers for these nodes
+            candidate_launchers = set(
+                x for x in launchers
+                if (set(x.supported_labels).issuperset(set(req.node_types)) and
+                    x.id not in req.declined_by)
+            )
            # Skip this request if it is requesting another provider
            # which is online
            if req.provider and req.provider != self.provider_name:
                # The request is asking for a specific provider
-                candidate_launchers = set(
-                    x.id for x in launchers
+                launcher_ids_for_provider = set(
+                    x.id for x in candidate_launchers
                    if x.provider_name == req.provider
-                    and set(x.supported_labels).issuperset(req.node_types))
-                if candidate_launchers:
-                    # There is a launcher online which can satisfy the request
-                    if not candidate_launchers.issubset(set(req.declined_by)):
-                        # It has not yet declined the request, so yield to it.
-                        log.debug("Yielding request to provider %s %s",
-                                  req.provider, candidate_launchers)
-                        continue
+                )
+                if launcher_ids_for_provider:
+                    # There is a launcher online which can satisfy the
+                    # request that has not yet declined the request,
+                    # so yield to it.
+                    log.debug("Yielding request to provider %s %s",
+                              req.provider, launcher_ids_for_provider)
+                    continue
+
+            priority = self.getPriority()
+            launcher_ids_with_higher_priority = set(
+                x.id for x in candidate_launchers
+                if x.priority < priority and not x.paused
+            )
+            if launcher_ids_with_higher_priority:
+                log.debug("Yielding request to higher priority providers %s",
+                          launcher_ids_with_higher_priority)
+                continue

            if has_quota_support and not all(label_quota.get(l, math.inf) > 0
                                             for l in req.node_types):
@@ -363,6 +390,7 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
                'provider_name': self.provider_name,
                'supported_labels': list(labels),
                'state': self.component_info.RUNNING,
+                'priority': self.getPriority(),
            })
        self.component_info.register()

@@ -380,8 +408,8 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
                labels = set()
                for prov_cfg in self.nodepool.config.providers.values():
                    labels.update(prov_cfg.getSupportedLabels())
-                if set(self.component_info.supported_labels) != labels:
-                    self.component_info.supported_labels = list(labels)
+                self.component_info.supported_labels = list(labels)
+                self.component_info.priority = self.getPriority()

                self.updateProviderLimits(
                    self.nodepool.config.providers.get(self.provider_name))
@@ -389,6 +417,7 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
                    self.nodepool.config.tenant_resource_limits)

                if not self.paused_handler:
+                    self.component_info.paused = False
                    while not self._assignHandlers():
                        # _assignHandlers can take quite some time on a busy
                        # system so sprinkle _removeCompletedHandlers in
@@ -396,6 +425,7 @@ class PoolWorker(threading.Thread, stats.StatsReporter):
                        # requests that already have all nodes.
                        self._removeCompletedHandlers()
                else:
+                    self.component_info.paused = True
                    # If we are paused, one request handler could not
                    # satisfy its assigned request, so give it
                    # another shot. Unpause ourselves if it completed.
--- a/nodepool/tests/fixtures/priority.yaml
+++ b/nodepool/tests/fixtures/priority.yaml
@@ -0,0 +1,87 @@
+elements-dir: .
+images-dir: '{images_dir}'
+build-log-dir: '{build_log_dir}'
+build-log-retention: 1
+
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+zookeeper-tls:
+  ca: {zookeeper_ca}
+  cert: {zookeeper_cert}
+  key: {zookeeper_key}
+
+zookeeper-timeout: 20.0
+
+labels:
+  - name: fake-label
+    min-ready: 0
+
+providers:
+  - name: low-provider
+    priority: 2
+    cloud: fake
+    driver: fake
+    region-name: fake-region
+    rate: 0.0001
+    diskimages:
+      - name: fake-image
+        meta:
+          key: value
+          key2: value
+    pools:
+      - name: main
+        max-servers: 2
+        node-attributes:
+          key1: value1
+          key2: value2
+        availability-zones:
+          - az1
+        networks:
+          - net-name
+        labels:
+          - name: fake-label
+            diskimage: fake-image
+            min-ram: 8192
+            flavor-name: 'Fake'
+  - name: high-provider
+    priority: 1
+    cloud: fake
+    driver: fake
+    region-name: fake-region
+    rate: 0.0001
+    diskimages:
+      - name: fake-image
+        meta:
+          key: value
+          key2: value
+    pools:
+      - name: main
+        max-servers: 1
+        node-attributes:
+          key1: value1
+          key2: value2
+        availability-zones:
+          - az1
+        networks:
+          - net-name
+        labels:
+          - name: fake-label
+            diskimage: fake-image
+            min-ram: 8192
+            flavor-name: 'Fake'
+
+diskimages:
+  - name: fake-image
+    elements:
+      - fedora
+      - vm
+    release: 21
+    dib-cmd: nodepool/tests/fake-image-create
+    env-vars:
+      TMPDIR: /opt/dib_tmp
+      DIB_IMAGE_CACHE: /opt/dib_cache
+      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
+      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
--- a/nodepool/tests/unit/test_launcher.py
+++ b/nodepool/tests/unit/test_launcher.py
@@ -2455,3 +2455,47 @@ class TestLauncher(tests.DBTestCase):
        # Ready for the real delete now
        zk.ZooKeeper.deleteRawNode = real_method
        self.waitForNodeDeletion(node)
+
+    def test_provider_priority(self):
+        """Test provider priorities"""
+        configfile = self.setup_config('priority.yaml')
+        self.useBuilder(configfile)
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+        self.waitForImage('low-provider', 'fake-image')
+        self.waitForImage('high-provider', 'fake-image')
+
+        # The first request should be handled by the highest priority
+        # provider (high-provider; priority 1)
+        req1 = zk.NodeRequest()
+        req1.state = zk.REQUESTED
+        req1.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req1)
+        req1 = self.waitForNodeRequest(req1)
+        self.assertEqual(req1.state, zk.FULFILLED)
+        self.assertEqual(len(req1.nodes), 1)
+        node1 = self.zk.getNode(req1.nodes[0])
+        self.assertEqual(node1.provider, 'high-provider')
+
+        # The second request should also be handled by the highest
+        # priority provider, but since it has max-servers=1, this
+        # request should be paused, which will cause the provider to
+        # be paused.
+        req2 = zk.NodeRequest()
+        req2.state = zk.REQUESTED
+        req2.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req2)
+        req2 = self.waitForNodeRequest(req2, (zk.PENDING,))
+        self.assertEqual(req2.state, zk.PENDING)
+
+        # The third request should be handled by the low priority
+        # provider now that the high provider is paused.
+        req3 = zk.NodeRequest()
+        req3.state = zk.REQUESTED
+        req3.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req3)
+        req3 = self.waitForNodeRequest(req3)
+        self.assertEqual(req3.state, zk.FULFILLED)
+        self.assertEqual(len(req3.nodes), 1)
+        node3 = self.zk.getNode(req3.nodes[0])
+        self.assertEqual(node3.provider, 'low-provider')
--- a/nodepool/zk/components.py
+++ b/nodepool/zk/components.py
@@ -104,6 +104,9 @@ class BaseComponent(ZooKeeperBase):
        if name not in self.content.keys():
            return super().__setattr__(name, value)

+        if self.content[name] == value:
+            return
+
        # Set the value in the local content dict
        self.content[name] = value

@@ -214,6 +217,8 @@ class PoolComponent(BaseComponent):
            "id": None,
            "provider_name": None,
            "supported_labels": [],
+            "priority": 100,
+            "paused": False,
        }
        self.content.update(self.initial_state)

--- a/releasenotes/notes/priority-cb2f0ad1829fbaaf.yaml
+++ b/releasenotes/notes/priority-cb2f0ad1829fbaaf.yaml
@@ -0,0 +1,9 @@
+---
+features:
+  - |
+    Priority may now be set at the provider or provider-pool level.
+    Providers or pools with the highest priority will fulfill requests
+    first, until they are at quota, at which point lower-priority
+    providers will begin to be used.
+
+    See :attr:`providers.priority` for details.