Add ability to ignore provider quota for a pool

In some circumstances it is useful to tell the launcher to
ignore the provider quota and just trust the max-* settings
for that pool instead.

This particular need arises when using the Rackspace public
cloud for both instances and OnMetal nodes. In this situation
the quotas for instances and for OnMetal nodes are different,
but nodepool only queries the quota for instances, so the
quota check fails when building OnMetal nodes even though it
should not.

In this circumstance, rather than complicating shade/nodepool
with logic to reconcile these disparate quota types, it makes
more sense to simply allow nodepool to ignore the provider
quota for a pool and attempt the build anyway.

While this is our use case, it may also be useful to others
for other reasons.

Change-Id: I232a1ab365795381ab180aceb48e8c87843ac713
Authored and committed by Jesse Pretorius (odyssey4me), 2018-07-10 12:30:44 +01:00
commit 4c8b5f4f99, parent 03b7b4baef
7 changed files with 264 additions and 12 deletions

View File

@@ -488,6 +488,12 @@ Example::
   ram allocated by nodepool. If not defined nodepool can use as much ram as
   the tenant allows.
 
+``ignore-provider-quota``
+  Ignore the provider quota for this pool. Instead, only check against the
+  configured max values for this pool and the current usage based on stored
+  data. This may be useful in circumstances where the provider is incorrectly
+  calculating quota.
+
 ``availability-zones`` (list)
   A list of availability zones to use.
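
For illustration only (not part of this diff), a pool enabling the new
option might look like the following, where the label and image names
are hypothetical::

  pools:
    - name: main
      max-servers: 10
      ignore-provider-quota: true
      labels:
        - name: my-label
          diskimage: my-image
          min-ram: 8192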

View File

@@ -113,6 +113,7 @@ class ProviderPool(ConfigPool):
         self.name = None
         self.max_cores = None
         self.max_ram = None
+        self.ignore_provider_quota = False
         self.azs = None
         self.networks = None
         self.security_groups = None
@@ -133,6 +134,8 @@ class ProviderPool(ConfigPool):
                 other.name == self.name and
                 other.max_cores == self.max_cores and
                 other.max_ram == self.max_ram and
+                other.ignore_provider_quota == (
+                    self.ignore_provider_quota) and
                 other.azs == self.azs and
                 other.networks == self.networks and
                 other.security_groups == self.security_groups and
@@ -264,6 +267,7 @@ class OpenStackProviderConfig(ProviderConfig):
             pp.max_cores = pool.get('max-cores', math.inf)
             pp.max_servers = pool.get('max-servers', math.inf)
             pp.max_ram = pool.get('max-ram', math.inf)
+            pp.ignore_provider_quota = pool.get('ignore-provider-quota', False)
             pp.azs = pool.get('availability-zones')
             pp.networks = pool.get('networks', [])
             pp.security_groups = pool.get('security-groups', [])
@@ -353,6 +357,7 @@ class OpenStackProviderConfig(ProviderConfig):
             'networks': [str],
             'auto-floating-ip': bool,
             'host-key-checking': bool,
+            'ignore-provider-quota': bool,
             'max-cores': int,
             'max-servers': int,
             'max-ram': int,

View File

@@ -298,15 +298,18 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
     def hasRemainingQuota(self, ntype):
         needed_quota = self.manager.quotaNeededByNodeType(ntype, self.pool)
 
-        # Calculate remaining quota which is calculated as:
-        # quota = <total nodepool quota> - <used quota> - <quota for node>
-        cloud_quota = self.manager.estimatedNodepoolQuota()
-        cloud_quota.subtract(self.manager.estimatedNodepoolQuotaUsed(self.zk))
-        cloud_quota.subtract(needed_quota)
-        self.log.debug("Predicted remaining tenant quota: %s", cloud_quota)
+        if not self.pool.ignore_provider_quota:
+            # Calculate remaining quota which is calculated as:
+            # quota = <total nodepool quota> - <used quota> - <quota for node>
+            cloud_quota = self.manager.estimatedNodepoolQuota()
+            cloud_quota.subtract(
+                self.manager.estimatedNodepoolQuotaUsed(self.zk))
+            cloud_quota.subtract(needed_quota)
+            self.log.debug("Predicted remaining provider quota: %s",
+                           cloud_quota)
 
-        if not cloud_quota.non_negative():
-            return False
+            if not cloud_quota.non_negative():
+                return False
 
         # Now calculate pool specific quota. Values indicating no quota default
         # to math.inf representing infinity that can be calculated with.
@@ -329,11 +332,12 @@ class OpenStackNodeRequestHandler(NodeRequestHandler):
             needed_quota.add(
                 self.manager.quotaNeededByNodeType(ntype, self.pool))
 
-        cloud_quota = self.manager.estimatedNodepoolQuota()
-        cloud_quota.subtract(needed_quota)
+        if not self.pool.ignore_provider_quota:
+            cloud_quota = self.manager.estimatedNodepoolQuota()
+            cloud_quota.subtract(needed_quota)
 
-        if not cloud_quota.non_negative():
-            return False
+            if not cloud_quota.non_negative():
+                return False
 
         # Now calculate pool specific quota. Values indicating no quota default
         # to math.inf representing infinity that can be calculated with.
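
For intuition, here is a minimal sketch (not nodepool code) of the
arithmetic described in the comment above, using plain dicts and invented
numbers in place of nodepool's QuotaInformation objects:

    # quota = <total nodepool quota> - <used quota> - <quota for node>
    total = {'cores': 200, 'instances': 20, 'ram': 524288}   # provider limits
    used = {'cores': 180, 'instances': 10, 'ram': 262144}    # estimated usage
    needed = {'cores': 32, 'instances': 1, 'ram': 8192}      # requested node

    remaining = {k: total[k] - used[k] - needed[k] for k in total}
    # remaining == {'cores': -12, 'instances': 9, 'ram': 253952}

    # A negative value means the request would exceed provider quota and is
    # normally declined; with ignore-provider-quota set, this check is
    # skipped and only the pool's max-* limits are enforced.
    has_quota = all(v >= 0 for v in remaining.values())  # False here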

View File

@@ -0,0 +1,48 @@
+elements-dir: .
+images-dir: '{images_dir}'
+build-log-dir: '{build_log_dir}'
+build-log-retention: 1
+
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+labels:
+  - name: fake-label
+
+providers:
+  - name: fake-provider
+    cloud: fake
+    driver: fake
+    region-name: fake-region
+    rate: 0.0001
+    diskimages:
+      - name: fake-image
+        meta:
+          key: value
+          key2: value
+    pools:
+      - name: main
+        ignore-provider-quota: false
+        availability-zones:
+          - az1
+        networks:
+          - net-name
+        labels:
+          - name: fake-label
+            diskimage: fake-image
+            min-ram: 8192
+            flavor-name: 'Fake'
+
+diskimages:
+  - name: fake-image
+    elements:
+      - fedora
+      - vm
+    release: 21
+    env-vars:
+      TMPDIR: /opt/dib_tmp
+      DIB_IMAGE_CACHE: /opt/dib_cache
+      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
+      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@@ -0,0 +1,49 @@
+elements-dir: .
+images-dir: '{images_dir}'
+build-log-dir: '{build_log_dir}'
+build-log-retention: 1
+
+zookeeper-servers:
+  - host: {zookeeper_host}
+    port: {zookeeper_port}
+    chroot: {zookeeper_chroot}
+
+labels:
+  - name: fake-label
+
+providers:
+  - name: fake-provider
+    cloud: fake
+    driver: fake
+    region-name: fake-region
+    rate: 0.0001
+    diskimages:
+      - name: fake-image
+        meta:
+          key: value
+          key2: value
+    pools:
+      - name: main
+        max-servers: 1
+        ignore-provider-quota: true
+        availability-zones:
+          - az1
+        networks:
+          - net-name
+        labels:
+          - name: fake-label
+            diskimage: fake-image
+            min-ram: 8192
+            flavor-name: 'Fake'
+
+diskimages:
+  - name: fake-image
+    elements:
+      - fedora
+      - vm
+    release: 21
+    env-vars:
+      TMPDIR: /opt/dib_tmp
+      DIB_IMAGE_CACHE: /opt/dib_cache
+      DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
+      BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@@ -1436,3 +1436,135 @@ class TestLauncher(tests.DBTestCase):
         while pool_worker[0].paused_handler:
             time.sleep(0.1)
         self.assertEqual(0, len(pool_worker[0].request_handlers))
+
+    def test_ignore_provider_quota_false(self):
+        '''
+        Test that a node request fails when ignore-provider-quota is
+        set to false and the provider is out of quota.
+        '''
+
+        # Set max-cores quota value to 0 to force "out of quota". Note that
+        # the fake provider checks the number of instances during server
+        # creation to decide if it should throw an over quota exception,
+        # but it doesn't check cores.
+        def fake_get_quota():
+            return (0, 20, 1000000)
+        self.useFixture(fixtures.MockPatchObject(
+            fakeprovider.FakeProvider.fake_cloud, '_get_quota',
+            fake_get_quota
+        ))
+
+        configfile = self.setup_config('ignore_provider_quota_false.yaml')
+        self.useBuilder(configfile)
+        self.waitForImage('fake-provider', 'fake-image')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+
+        # Create a request that should fail: with ignore-provider-quota set
+        # to false, the launcher declines it because it would exceed quota.
+        self.log.debug("Submitting request with ignore-provider-quota False")
+        req = zk.NodeRequest()
+        req.state = zk.REQUESTED
+        req.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req)
+
+        req = self.waitForNodeRequest(req)
+        self.assertEqual(req.state, zk.FAILED)
+
+    def test_ignore_provider_quota_true(self):
+        '''
+        Test that node requests are fulfilled with ignore-provider-quota
+        set to true, and that the pool's own max values still apply.
+        '''
+
+        # Set max-cores quota value to 0 to force "out of quota". Note that
+        # the fake provider checks the number of instances during server
+        # creation to decide if it should throw an over quota exception,
+        # but it doesn't check cores.
+        def fake_get_quota():
+            return (0, 20, 1000000)
+        self.useFixture(fixtures.MockPatchObject(
+            fakeprovider.FakeProvider.fake_cloud, '_get_quota',
+            fake_get_quota
+        ))
+
+        configfile = self.setup_config('ignore_provider_quota_true.yaml')
+        self.useBuilder(configfile)
+        self.waitForImage('fake-provider', 'fake-image')
+        pool = self.useNodepool(configfile, watermark_sleep=1)
+        pool.start()
+
+        # Create a request with ignore-provider-quota set to true that should
+        # succeed regardless of the lack of cloud/provider quota.
+        self.replace_config(configfile, 'ignore_provider_quota_true.yaml')
+        self.log.debug(
+            "Submitting an initial request with ignore-provider-quota True")
+        req1 = zk.NodeRequest()
+        req1.state = zk.REQUESTED
+        req1.node_types.append('fake-label')
+        self.zk.storeNodeRequest(req1)
+        req1 = self.waitForNodeRequest(req1)
+        self.assertEqual(req1.state, zk.FULFILLED)
+
+        # Lock this node so it appears as used and not deleted
+        req1_node = self.zk.getNode(req1.nodes[0])
+        self.zk.lockNode(req1_node, blocking=False)
+
+        # Request a second node; this request should pause the handler
+        # because the pool is configured with max-servers: 1
+        req2 = zk.NodeRequest()
+        req2.state = zk.REQUESTED
+        req2.node_types.append('fake-label')
+        self.log.debug(
+            "Submitting a second request with ignore-provider-quota True "
+            "but with a full max-servers quota.")
+        self.zk.storeNodeRequest(req2)
+
+        pool_worker = pool.getPoolWorkers('fake-provider')
+        while not pool_worker[0].paused_handler:
+            time.sleep(0.1)
+
+        # The handler is paused now and the request should be in state PENDING
+        req2 = self.waitForNodeRequest(req2, zk.PENDING)
+        self.assertEqual(req2.state, zk.PENDING)
+
+        # Now free up the first node
+        self.log.debug("Marking first node as used %s", req1.id)
+        req1_node.state = zk.USED
+        self.zk.storeNode(req1_node)
+        self.zk.unlockNode(req1_node)
+        self.waitForNodeDeletion(req1_node)
+
+        # After the first node is cleaned up, the second request can now
+        # be fulfilled.
+        req2 = self.waitForNodeRequest(req2)
+        self.assertEqual(req2.state, zk.FULFILLED)
+
+        # Lock this node so it appears as used and not deleted
+        req2_node = self.zk.getNode(req2.nodes[0])
+        self.zk.lockNode(req2_node, blocking=False)
+
+        # Now free up the second node
+        self.log.debug("Marking second node as used %s", req2.id)
+        req2_node.state = zk.USED
+        self.zk.storeNode(req2_node)
+        self.zk.unlockNode(req2_node)
+        self.waitForNodeDeletion(req2_node)
+
+        # Request a 2-node set; this request should fail because the
+        # pool can only hold a single node at a time.
+        req3 = zk.NodeRequest()
+        req3.state = zk.REQUESTED
+        req3.node_types.append('fake-label')
+        req3.node_types.append('fake-label')
+        self.log.debug(
+            "Submitting a third request with ignore-provider-quota True "
+            "for a 2-node set which the provider cannot fulfill.")
+        self.zk.storeNodeRequest(req3)
+
+        req3 = self.waitForNodeRequest(req3)
+        self.assertEqual(req3.state, zk.FAILED)

View File

@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    A new boolean pool variable ``ignore-provider-quota`` has been added to
+    allow the provider quota to be ignored for a pool. Instead, nodepool only
+    checks against the configured max values for the pool and the current usage
+    based on stored data. This may be useful in circumstances where the
+    provider is incorrectly calculating quota.