Merge "AWS multi quota support"

Zuul 2022-07-29 17:01:09 +00:00 committed by Gerrit Code Review
commit 123a32f922
13 changed files with 524 additions and 34 deletions

View File

@ -109,6 +109,42 @@ Selecting the ``aws`` driver adds the following options to the
until that instance is reported as "active". If the timeout is
exceeded, the node launch is aborted and the instance deleted.
.. attr:: max-cores
:type: int
:default: unlimited
Maximum number of cores usable from this provider's pools by default.
.. attr:: max-servers
:type: int
:default: unlimited
Maximum number of servers spawnable from this provider's pools by default.
.. attr:: max-ram
:type: int
:default: unlimited
Maximum RAM usable from this provider's pools by default.
.. attr:: max-resources
:type: dict
:default: unlimited
A dictionary of other quota resource limits. AWS has quotas
for certain instance types. These may be specified here to
limit Nodepool's usage.
The following example limits the number of high-memory
instance cores:
.. code-block:: yaml
max-resources:
'L-43DA4232': 224
See `instance quotas`_ for more information.
.. attr:: launch-retries
:default: 3
@ -379,6 +415,42 @@ Selecting the ``aws`` driver adds the following options to the
A dictionary of key-value pairs that will be stored with the node data
in ZooKeeper. The keys and values can be any arbitrary string.
.. attr:: max-cores
:type: int
Maximum number of cores usable from this pool. Defaults to
:attr:`providers.[aws].max-cores`.
.. attr:: max-servers
:type: int
Maximum number of servers spawnable from this pool. Defaults to
:attr:`providers.[aws].max-servers`.
.. attr:: max-ram
:type: int
Maximum RAM usable from this pool. Defaults to
:attr:`providers.[aws].max-ram`.
.. attr:: max-resources
:type: dict
A dictionary of other quota resource limits. AWS has quotas
for certain instance types. These may be specified here to
limit Nodepool's usage. Defaults to
:attr:`providers.[aws].max-resources`.
The following example limits the number of high-memory
instance cores:
.. code-block:: yaml
max-resources:
'L-43DA4232': 224
See `instance quotas`_ for more information.
.. attr:: subnet-id
If provided, specifies the subnet to assign to the primary network
@ -538,3 +610,4 @@ Selecting the ``aws`` driver adds the following options to the
.. _`Boto configuration`: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
.. _`Boto describe images`: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images
.. _`VM Import/Export service role`: https://docs.aws.amazon.com/vm-import/latest/userguide/vmie_prereqs.html#vmimport-role
.. _`instance quotas`: https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas

View File

@ -548,8 +548,13 @@ Options
max-servers: 10
max-cores: 200
max-ram: 16565
'L-43DA4232': 224
Each entry is a dictionary with the following keys.
Each entry is a dictionary with the following keys. Any other keys
are interpreted as driver-specific resource limits (these correspond
to the ``max-resources`` settings in the provider configuration). The
only driver that currently supports additional resource limits is
AWS.
.. attr:: tenant-name
:type: str

View File

@ -67,6 +67,7 @@ class ConfigValidator:
'max-cores': int,
'max-ram': int,
'max-servers': int,
str: int,
}
top_level = {
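
The new ``str: int`` entry lets the validator accept arbitrary string keys
alongside the named limits, which is how driver-specific quota codes pass
validation. A minimal standalone sketch of that behaviour (the reduced
schema and sample entries are illustrative, not the full validator):

import voluptuous as v

limit_schema = v.Schema({
    'max-cores': int,
    'max-ram': int,
    'max-servers': int,
    str: int,  # any other string key (e.g. an AWS quota code) must map to an int
})

limit_schema({'max-cores': 1024, 'L-43DA4232': 224})  # validates
# limit_schema({'L-43DA4232': 'lots'})                # would raise a voluptuous error
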

View File

@ -255,19 +255,14 @@ class Config(ConfigValue):
if not tenant_resource_limits_cfg:
return
for resource_limit in tenant_resource_limits_cfg:
tenant_name = resource_limit['tenant-name']
max_cores = resource_limit.get('max-cores')
max_ram = resource_limit.get('max-ram')
max_servers = resource_limit.get('max-servers')
resource_limit = resource_limit.copy()
tenant_name = resource_limit.pop('tenant-name')
limits = {}
if max_cores:
limits['cores'] = max_cores
if max_servers:
limits['instances'] = max_servers
if max_ram:
limits['ram'] = max_ram
limits['cores'] = resource_limit.pop('max-cores', math.inf)
limits['instances'] = resource_limit.pop('max-servers', math.inf)
limits['ram'] = resource_limit.pop('max-ram', math.inf)
for k, v in resource_limit.items():
limits[k] = v
self.tenant_resource_limits[tenant_name] = limits
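
For illustration, a standalone restatement of the loop body above, applied
to an entry shaped like the one in the test fixtures added later in this
change (the helper name exists only for this sketch):

import math

def parse_tenant_limit(entry):
    entry = dict(entry)
    tenant_name = entry.pop('tenant-name')
    limits = {
        'cores': entry.pop('max-cores', math.inf),
        'instances': entry.pop('max-servers', math.inf),
        'ram': entry.pop('max-ram', math.inf),
    }
    limits.update(entry)  # any remaining keys are driver-specific quota codes
    return tenant_name, limits

parse_tenant_limit({'tenant-name': 'tenant-1', 'max-cores': 1024,
                    'L-43DA4232': 224})
# -> ('tenant-1', {'cores': 1024, 'instances': inf, 'ram': inf, 'L-43DA4232': 224})
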

View File

@ -42,6 +42,41 @@ def tag_list_to_dict(taglist):
return {t["Key"]: t["Value"] for t in taglist}
# This is a map of instance type name prefixes to quota codes. There does not
# appear to be an automated way to determine what quota code to use
# for an instance type, therefore this list was manually created by
# visiting
# https://us-west-1.console.aws.amazon.com/servicequotas/home/services/ec2/quotas
# and filtering by "Instances". An example description is "Running
# On-Demand P instances" which we can infer means we should use that
# quota code for instance types starting with the letter "p". All
# instance type names follow the format "([a-z\-]+)\d", so we can
# match the first letters (up to the first number) of the instance
# type name with the letters in the quota name. The prefix "u-" for
# "Running On-Demand High Memory instances" was determined from
# https://aws.amazon.com/ec2/instance-types/high-memory/
QUOTA_CODES = {
'a': 'L-1216C47A',
'c': 'L-1216C47A',
'd': 'L-1216C47A',
'h': 'L-1216C47A',
'i': 'L-1216C47A',
'm': 'L-1216C47A',
'r': 'L-1216C47A',
't': 'L-1216C47A',
'z': 'L-1216C47A',
'dl': 'L-6E869C2A',
'f': 'L-74FC7D96',
'g': 'L-DB2E81BA',
'vt': 'L-DB2E81BA',
'u-': 'L-43DA4232', # 'high memory'
'inf': 'L-1945791B',
'p': 'L-417A185B',
'x': 'L-7295265B',
}
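
A short sketch of how these codes are looked up for a concrete instance
type: the regex below is the same ``instance_key_re`` pattern defined
further down in the adapter, and ``quota_code_for`` is an illustrative
helper, not part of the change.

import re

instance_key_re = re.compile(r'([a-z\-]+)\d.*')

def quota_code_for(instance_type):
    m = instance_key_re.match(instance_type)
    return QUOTA_CODES.get(m.group(1)) if m else None

quota_code_for('t3.medium')         # prefix 't'  -> 'L-1216C47A' (standard)
quota_code_for('u-6tb1.112xlarge')  # prefix 'u-' -> 'L-43DA4232' (high memory)
quota_code_for('p4d.24xlarge')      # prefix 'p'  -> 'L-417A185B'
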
class AwsInstance(statemachine.Instance):
def __init__(self, instance, quota):
super().__init__()
@ -293,15 +328,28 @@ class AwsAdapter(statemachine.Adapter):
yield AwsInstance(instance, quota)
def getQuotaLimits(self):
with self.non_mutating_rate_limiter:
self.log.debug("Getting quota limits")
response = self.aws_quotas.get_service_quota(
ServiceCode='ec2',
QuotaCode='L-1216C47A'
)
cores = response['Quota']['Value']
return QuotaInformation(cores=cores,
default=math.inf)
# Get the instance types that this provider handles
instance_types = set()
for pool in self.provider.pools.values():
for label in pool.labels.values():
instance_types.add(label.instance_type)
args = dict(default=math.inf)
for instance_type in instance_types:
code = self._getQuotaCodeForInstanceType(instance_type)
if code in args:
continue
if not code:
self.log.warning("Unknown quota code for instance type: %s",
instance_type)
continue
with self.non_mutating_rate_limiter:
self.log.debug("Getting quota limits for %s", code)
response = self.aws_quotas.get_service_quota(
ServiceCode='ec2',
QuotaCode=code,
)
args[code] = response['Quota']['Value']
return QuotaInformation(**args)
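
As a rough illustration of the result: for a provider whose labels use
t3.* and u-* instance types, the loop above makes one service-quotas call
per distinct code and ends up with something like the following
(hypothetical limit values):

limits = QuotaInformation(default=math.inf,
                          **{'L-1216C47A': 100.0,   # standard instance types
                             'L-43DA4232': 224.0})  # high-memory instance types
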
def getQuotaForLabel(self, label):
return self._getQuotaForInstanceType(label.instance_type)
@ -454,13 +502,27 @@ class AwsAdapter(statemachine.Adapter):
# Return the first and only task
return task
instance_key_re = re.compile(r'([a-z\-]+)\d.*')
def _getQuotaCodeForInstanceType(self, instance_type):
m = self.instance_key_re.match(instance_type)
if m:
key = m.group(1)
return QUOTA_CODES.get(key)
def _getQuotaForInstanceType(self, instance_type):
itype = self._getInstanceType(instance_type)
cores = itype['InstanceTypes'][0]['VCpuInfo']['DefaultCores']
ram = itype['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']
return QuotaInformation(cores=cores,
ram=ram,
instances=1)
code = self._getQuotaCodeForInstanceType(instance_type)
# We include cores twice: once to match the overall cores quota
# (which may be set as a tenant resource limit), and a second
# time under the specific AWS quota code for this instance type.
args = dict(cores=cores, ram=ram, instances=1)
if code:
args[code] = cores
return QuotaInformation(**args)
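
For example (hypothetical numbers), a standard instance type reporting
4 default cores and 16384 MiB of RAM would yield:

QuotaInformation(cores=4, ram=16384, instances=1,
                 **{'L-1216C47A': 4})  # same cores, counted against the type-specific quota
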
@cachetools.func.lru_cache(maxsize=None)
def _getInstanceType(self, instance_type):

View File

@ -15,6 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
import math
import voluptuous as v
from nodepool.driver import ConfigPool
@ -203,6 +206,13 @@ class AwsPool(ConfigPool):
'use-internal-ip', self.provider.use_internal_ip)
self.host_key_checking = pool_config.get(
'host-key-checking', self.provider.host_key_checking)
self.max_servers = pool_config.get(
'max-servers', self.provider.max_servers)
self.max_cores = pool_config.get('max-cores', self.provider.max_cores)
self.max_ram = pool_config.get('max-ram', self.provider.max_ram)
self.max_resources = self.provider.max_resources.copy()
for k, val in pool_config.get('max-resources', {}).items():
self.max_resources[k] = val
@staticmethod
def getSchema():
@ -218,6 +228,9 @@ class AwsPool(ConfigPool):
'public-ipv4': bool,
'public-ipv6': bool,
'host-key-checking': bool,
'max-cores': int,
'max-ram': int,
'max-resources': {str: int},
})
return pool
@ -263,6 +276,12 @@ class AwsProviderConfig(ProviderConfig):
self.image_type = self.provider.get('image-format', 'raw')
self.image_name_format = '{image_name}-{timestamp}'
self.post_upload_hook = self.provider.get('post-upload-hook')
self.max_servers = self.provider.get('max-servers', math.inf)
self.max_cores = self.provider.get('max-cores', math.inf)
self.max_ram = self.provider.get('max-ram', math.inf)
self.max_resources = defaultdict(lambda: math.inf)
for k, val in self.provider.get('max-resources', {}).items():
self.max_resources[k] = val
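
Because ``max_resources`` is a defaultdict with an infinite default, any
quota code that is not configured is treated as unlimited. A quick
standalone sketch:

from collections import defaultdict
import math

max_resources = defaultdict(lambda: math.inf, {'L-43DA4232': 224})
max_resources['L-43DA4232']   # 224 (configured limit)
max_resources['L-1216C47A']   # math.inf (not configured, so unlimited)
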
self.cloud_images = {}
for image in self.provider.get('cloud-images', []):
@ -305,6 +324,10 @@ class AwsProviderConfig(ProviderConfig):
'launch-retries': int,
'object-storage': object_storage,
'image-format': v.Any('ova', 'vhd', 'vhdx', 'vmdk', 'raw'),
'max-servers': int,
'max-cores': int,
'max-ram': int,
'max-resources': {str: int},
})
return v.Schema(provider)

View File

@ -123,6 +123,10 @@ class StateMachineNodeLauncher(stats.StatsReporter):
self.node.shell_type = image.shell_type
self.node.connection_port = image.connection_port
self.node.connection_type = image.connection_type
qi = self.manager.quotaNeededByLabel(label.name, self.handler.pool)
if qi:
self.node.resources = qi.get_resources()
self.zk.storeNode(self.node)
# Windows computer names can be no more than 15 chars long.
@ -386,11 +390,14 @@ class StateMachineHandler(NodeRequestHandler):
# Now calculate pool specific quota. Values indicating no quota default
# to math.inf representing infinity that can be calculated with.
pool_quota = QuotaInformation(
args = dict(
cores=getattr(self.pool, 'max_cores', None),
instances=self.pool.max_servers,
ram=getattr(self.pool, 'max_ram', None),
default=math.inf)
default=math.inf,
)
args.update(getattr(self.pool, 'max_resources', {}))
pool_quota = QuotaInformation(**args)
pool_quota.subtract(needed_quota)
return pool_quota.non_negative()
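
For example, with hypothetical pool settings of ``max-servers: 10``,
``max-cores: 200`` and ``max-resources: {'L-43DA4232': 224}``, the
arguments assembled above amount to:

args = dict(cores=200, instances=10, ram=None, default=math.inf)
args.update({'L-43DA4232': 224})
pool_quota = QuotaInformation(**args)  # ram is unset, so it falls back to math.inf
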
@ -403,6 +410,7 @@ class StateMachineHandler(NodeRequestHandler):
:return: True if there is enough quota, False otherwise
'''
needed_quota = self.manager.quotaNeededByLabel(ntype, self.pool)
self.log.debug("Needed quota: %s", needed_quota)
# Calculate remaining quota which is calculated as:
# quota = <total nodepool quota> - <used quota> - <quota for node>
@ -418,11 +426,14 @@ class StateMachineHandler(NodeRequestHandler):
# Now calculate pool specific quota. Values indicating no quota default
# to math.inf representing infinity that can be calculated with.
pool_quota = QuotaInformation(
args = dict(
cores=getattr(self.pool, 'max_cores', None),
instances=self.pool.max_servers,
ram=getattr(self.pool, 'max_ram', None),
default=math.inf)
default=math.inf,
)
args.update(getattr(self.pool, 'max_resources', {}))
pool_quota = QuotaInformation(**args)
pool_quota.subtract(
self.manager.estimatedNodepoolQuotaUsed(self.pool))
self.log.debug("Current pool quota: %s" % pool_quota)

View File

@ -1,4 +1,5 @@
# Copyright (C) 2018 Red Hat
# Copyright 2022 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -179,7 +180,7 @@ class NodeDeleter(threading.Thread):
class QuotaInformation:
def __init__(self, cores=None, instances=None, ram=None, default=0):
def __init__(self, cores=None, instances=None, ram=None, default=0, **kw):
'''
Initializes the quota information with some values. None values will
be initialized with default, which will typically be 0 or math.inf
@ -202,6 +203,9 @@ class QuotaInformation:
'ram': self._get_default(ram, default),
}
}
for k, v in kw.items():
self.quota['compute'][k] = v
self.default = default
@staticmethod
def construct_from_flavor(flavor):
@ -225,9 +229,14 @@ class QuotaInformation:
return value if value is not None else default
def _add_subtract(self, other, add=True):
for category in other.quota.keys():
self.quota.setdefault(category, {})
for resource in other.quota[category].keys():
self.quota[category].setdefault(resource, self.default)
for category in self.quota.keys():
for resource in self.quota[category].keys():
second_value = other.quota.get(category, {}).get(resource, 0)
second_value = other.quota.get(category, {}).get(
resource, other.default)
if add:
self.quota[category][resource] += second_value
else:

View File

@ -0,0 +1,46 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
tenant-resource-limits:
- tenant-name: tenant-1
max-cores: 1024
'L-43DA4232': 224 # high mem cores
labels:
- name: standard
- name: high
providers:
- name: ec2-us-west-2
driver: aws
region-name: us-west-2
cloud-images:
- name: ubuntu1404
image-id: ami-1e749f67
username: ubuntu
pools:
- name: main
max-servers: 10
subnet-id: {subnet_id}
security-group-id: {security_group_id}
node-attributes:
key1: value1
key2: value2
max-resources:
'L-1216C47A': 1 # standard cores
labels:
- name: standard
cloud-image: ubuntu1404
instance-type: t3.medium
key-name: zuul
- name: high
cloud-image: ubuntu1404
instance-type: u-6tb1.112xlarge
key-name: zuul

View File

@ -0,0 +1,43 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
tenant-resource-limits:
- tenant-name: tenant-1
max-cores: 1024
labels:
- name: standard
- name: high
providers:
- name: ec2-us-west-2
driver: aws
region-name: us-west-2
cloud-images:
- name: ubuntu1404
image-id: ami-1e749f67
username: ubuntu
pools:
- name: main
max-servers: 10
subnet-id: {subnet_id}
security-group-id: {security_group_id}
node-attributes:
key1: value1
key2: value2
labels:
- name: standard
cloud-image: ubuntu1404
instance-type: t3.medium
key-name: zuul
- name: high
cloud-image: ubuntu1404
instance-type: u-6tb1.112xlarge
key-name: zuul

View File

@ -114,7 +114,8 @@ class TestDriverAws(tests.DBTestCase):
kw['security_group_id'] = self.security_group_id
return super().setup_config(*args, **kw)
def patchProvider(self, nodepool, provider_name='ec2-us-west-2'):
def patchProvider(self, nodepool, provider_name='ec2-us-west-2',
quotas=None):
for _ in iterate_timeout(
30, Exception, 'wait for provider'):
try:
@ -138,10 +139,13 @@ class TestDriverAws(tests.DBTestCase):
_fake_create_instances
# moto does not mock service-quotas, so we do it ourselves:
def _fake_get_service_quota(*args, **kwargs):
def _fake_get_service_quota(ServiceCode, QuotaCode, *args, **kwargs):
# This is a simple fake that returns a fixed quota value, or a
# per-code value from the quotas dict when one is supplied.
return {'Quota': {'Value': 100}}
if quotas is None:
return {'Quota': {'Value': 100}}
else:
return {'Quota': {'Value': quotas.get(QuotaCode)}}
provider_manager.adapter.aws_quotas.get_service_quota =\
_fake_get_service_quota
@ -204,6 +208,149 @@ class TestDriverAws(tests.DBTestCase):
for node in nodes:
self.waitForNodeDeletion(node)
def test_aws_multi_quota(self):
# Test multiple instance type quotas (standard and high-mem)
configfile = self.setup_config('aws/aws-quota.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
self.patchProvider(pool, quotas={
'L-1216C47A': 1,
'L-43DA4232': 224,
})
# Create a high-memory node request.
req1 = zk.NodeRequest()
req1.state = zk.REQUESTED
req1.node_types.append('high')
self.zk.storeNodeRequest(req1)
self.log.debug("Waiting for request %s", req1.id)
req1 = self.waitForNodeRequest(req1)
node1 = self.assertSuccess(req1)
# Create a second high-memory node request; this should be
# over quota so it won't be fulfilled.
req2 = zk.NodeRequest()
req2.state = zk.REQUESTED
req2.node_types.append('high')
self.zk.storeNodeRequest(req2)
self.log.debug("Waiting for request %s", req2.id)
req2 = self.waitForNodeRequest(req2, (zk.PENDING,))
# Make sure we're paused while we attempt to fulfill the
# second request.
pool_worker = pool.getPoolWorkers('ec2-us-west-2')
for _ in iterate_timeout(30, Exception, 'paused handler'):
if pool_worker[0].paused_handler:
break
# Release the first node so that the second can be fulfilled.
node1.state = zk.USED
self.zk.storeNode(node1)
self.waitForNodeDeletion(node1)
# Make sure the second high node exists now.
req2 = self.waitForNodeRequest(req2)
self.assertSuccess(req2)
# Create a standard node request which should succeed even
# though we're at quota for high-mem (but not standard).
req3 = zk.NodeRequest()
req3.state = zk.REQUESTED
req3.node_types.append('standard')
self.zk.storeNodeRequest(req3)
self.log.debug("Waiting for request %s", req3.id)
req3 = self.waitForNodeRequest(req3)
self.assertSuccess(req3)
def test_aws_multi_pool_limits(self):
# Test multiple instance type quotas (standard and high-mem)
# with pool resource limits
configfile = self.setup_config('aws/aws-limits.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
self.patchProvider(pool, quotas={
'L-1216C47A': 1000,
'L-43DA4232': 1000,
})
# Create a standard node request.
req1 = zk.NodeRequest()
req1.state = zk.REQUESTED
req1.node_types.append('standard')
self.zk.storeNodeRequest(req1)
self.log.debug("Waiting for request %s", req1.id)
req1 = self.waitForNodeRequest(req1)
node1 = self.assertSuccess(req1)
# Create a second standard node request; this should be over
# the pool's resource limit so it won't be fulfilled.
req2 = zk.NodeRequest()
req2.state = zk.REQUESTED
req2.node_types.append('standard')
self.zk.storeNodeRequest(req2)
self.log.debug("Waiting for request %s", req2.id)
req2 = self.waitForNodeRequest(req2, (zk.PENDING,))
# Make sure we're paused while we attempt to fulfill the
# second request.
pool_worker = pool.getPoolWorkers('ec2-us-west-2')
for _ in iterate_timeout(30, Exception, 'paused handler'):
if pool_worker[0].paused_handler:
break
# Release the first node so that the second can be fulfilled.
node1.state = zk.USED
self.zk.storeNode(node1)
self.waitForNodeDeletion(node1)
# Make sure the second standard node exists now.
req2 = self.waitForNodeRequest(req2)
self.assertSuccess(req2)
def test_aws_multi_tenant_limits(self):
# Test multiple instance type quotas (standard and high-mem)
# with tenant resource limits
configfile = self.setup_config('aws/aws-limits.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
self.patchProvider(pool, quotas={
'L-1216C47A': 1000,
'L-43DA4232': 1000,
})
# Create a high node request.
req1 = zk.NodeRequest()
req1.state = zk.REQUESTED
req1.tenant_name = 'tenant-1'
req1.node_types.append('high')
self.zk.storeNodeRequest(req1)
self.log.debug("Waiting for request %s", req1.id)
req1 = self.waitForNodeRequest(req1)
self.assertSuccess(req1)
# Create a second high node request; this should be over the
# tenant's resource limit so it won't be fulfilled.
req2 = zk.NodeRequest()
req2.state = zk.REQUESTED
req2.tenant_name = 'tenant-1'
req2.node_types.append('high')
self.zk.storeNodeRequest(req2)
req2 = self.waitForNodeRequest(req2, (zk.REQUESTED,))
# Create a standard node request which should succeed even
# though we're at quota for high-mem (but not standard).
req3 = zk.NodeRequest()
req3.state = zk.REQUESTED
req3.tenant_name = 'tenant-1'
req3.node_types.append('standard')
self.zk.storeNodeRequest(req3)
self.log.debug("Waiting for request %s", req3.id)
req3 = self.waitForNodeRequest(req3)
self.assertSuccess(req3)
# Assert that the second request is still being deferred
req2 = self.waitForNodeRequest(req2, (zk.REQUESTED,))
def test_aws_node(self):
req = self.requestNode('aws/aws.yaml', 'ubuntu1404')
node = self.assertSuccess(req)

View File

@ -0,0 +1,68 @@
# Copyright 2022 Acme Gating, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import copy
import math
from nodepool import tests
from nodepool.driver.utils import QuotaInformation
class TestQuotaInformation(tests.BaseTestCase):
def test_subtract(self):
provider = QuotaInformation(cores=8, ram=8192, default=math.inf)
needed = QuotaInformation(cores=2, instances=1)
expected = QuotaInformation(cores=6, instances=math.inf, ram=8192)
remain = copy.deepcopy(provider)
remain.subtract(needed)
self.assertEqual(expected.quota, remain.quota)
def test_add(self):
label1 = QuotaInformation(cores=8, ram=8192)
label2 = QuotaInformation(cores=2, instances=1)
needed = copy.deepcopy(label1)
needed.add(label2)
expected = QuotaInformation(cores=10, instances=1, ram=8192)
self.assertEqual(expected.quota, needed.quota)
def test_extra(self):
# Test extra quota fields. We call them red_cores, blue_cores and
# green_cores here; the names are arbitrary, distinct from the
# standard cores, ram and instances.
label1 = QuotaInformation(cores=8, ram=8192,
red_cores=8, green_cores=8)
label2 = QuotaInformation(cores=2, instances=1, blue_cores=2)
needed = copy.deepcopy(label1)
needed.add(label2)
expected = QuotaInformation(cores=10, instances=1, ram=8192,
red_cores=8, blue_cores=2,
green_cores=8)
self.assertEqual(expected.quota, needed.quota)
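# Note for the subtraction below: resources missing from the provider
# (instances, red_cores, blue_cores) are seeded with the provider's own
# default (math.inf), so they stay infinite; green_cores is 16 - 8 = 8,
# cores is 8 - 10 = -2, and ram is 8192 - 8192 = 0.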
provider = QuotaInformation(cores=8, ram=8192, default=math.inf,
green_cores=16)
expected = QuotaInformation(cores=-2, instances=math.inf, ram=0,
red_cores=math.inf, blue_cores=math.inf,
green_cores=8)
remain = copy.deepcopy(provider)
remain.subtract(needed)
self.assertEqual(expected.quota, remain.quota)

View File

@ -0,0 +1,7 @@
---
features:
- |
The AWS driver now supports the additional quotas AWS imposes on
specific instance types. This support is automatic; corresponding
options have also been added to the provider, pool, and tenant
limit configuration in Nodepool.