From 7d31a8e79fd769af0b001739fa8a855bfe55e8a9 Mon Sep 17 00:00:00 2001 From: Benedikt Loeffler Date: Thu, 4 Jul 2024 14:50:03 +0200 Subject: [PATCH] Support EC2 Fleet API This change adds support for launching Amazon EC2 instances (On-Demand and Spot), using the EC2 fleet API. One key difference comparing to other type of labels is the quota information can not be determined by the label itself, it is only known after the instance is launched. Therefore, when EC2 fleet API is configured, quota is not checked before launching the instance, but the quota would be taken into account after the instance is launched. In the current change the `InstanceRequirements` is not supported. Change-Id: I1759d3539ef79d4a556661898553396d85aa69fa Co-Authored-By: James E. Blair Co-Authored-By: Dong Zhang --- doc/source/aws.rst | 24 +- nodepool/driver/aws/adapter.py | 227 +++++++++++++++++- nodepool/driver/aws/config.py | 11 +- nodepool/driver/statemachine.py | 4 + nodepool/driver/utils.py | 16 +- .../fixtures/aws/aws-fleet-old-template.yaml | 44 ++++ nodepool/tests/fixtures/aws/aws-fleet.yaml | 104 ++++++++ .../aws-fleet-allocation-strategy-error.yaml | 35 +++ .../aws-fleet-exclusive-error.yaml | 43 ++++ .../config_validate/aws-fleet-good.yaml | 35 +++ nodepool/tests/unit/test_config_validator.py | 27 +++ nodepool/tests/unit/test_driver_aws.py | 180 ++++++++++++++ .../notes/aws-fleet-api-ef99139931642c54.yaml | 6 + 13 files changed, 745 insertions(+), 11 deletions(-) create mode 100644 nodepool/tests/fixtures/aws/aws-fleet-old-template.yaml create mode 100644 nodepool/tests/fixtures/aws/aws-fleet.yaml create mode 100644 nodepool/tests/fixtures/config_validate/aws-fleet-allocation-strategy-error.yaml create mode 100644 nodepool/tests/fixtures/config_validate/aws-fleet-exclusive-error.yaml create mode 100644 nodepool/tests/fixtures/config_validate/aws-fleet-good.yaml create mode 100644 releasenotes/notes/aws-fleet-api-ef99139931642c54.yaml diff --git a/doc/source/aws.rst b/doc/source/aws.rst index 5f621d415..9bd7ab4b5 100644 --- a/doc/source/aws.rst +++ b/doc/source/aws.rst @@ -682,9 +682,31 @@ Selecting the ``aws`` driver adds the following options to the .. attr:: instance-type :type: str - :required: Name of the flavor to use. + Mutually exclusive with :attr:`providers.[aws].pools.labels.fleet` + + .. attr:: fleet + :type: dict + + If specified, EC2 fleet API would be used for launching + the instance. In this case, quota is not checked before + launching the instance, but is taken into account after + the instance is launched. Mutually exclusive with + :attr:`providers.[aws].pools.labels.instance-type` + + .. attr:: instance-types + :type: list + + Names of the flavors of the instance that to be launched. + + .. attr:: allocation-strategy + :type: str + :required: + + Allowed values for on On-Demand: ``lowest-price`` or ``prioritized``. + Allowed values for Spot: ``price-capacity-optimized``, ``capacity-optimized``, + ``diversified`` or ``lowest-price`` .. attr:: iam-instance-profile :type: dict diff --git a/nodepool/driver/aws/adapter.py b/nodepool/driver/aws/adapter.py index 057bfb708..fbba3aec7 100644 --- a/nodepool/driver/aws/adapter.py +++ b/nodepool/driver/aws/adapter.py @@ -17,6 +17,7 @@ from concurrent.futures import ThreadPoolExecutor import cachetools.func import copy import functools +import hashlib import json import logging import math @@ -396,7 +397,7 @@ class AwsCreateStateMachine(statemachine.StateMachine): return self.instance = instance self.external_id['instance'] = instance['InstanceId'] - self.quota = self.adapter.getQuotaForLabel(self.label) + self.quota = self.adapter._getQuotaForLabel(self.label) self.state = self.INSTANCE_CREATING if self.state == self.INSTANCE_CREATING: @@ -412,12 +413,15 @@ class AwsCreateStateMachine(statemachine.StateMachine): if self.state == self.COMPLETE: self.complete = True + self.quota = self.adapter._getQuotaForLabel( + self.label, self.instance['InstanceType']) return AwsInstance(self.adapter.provider, self.instance, self.host, self.quota) class AwsAdapter(statemachine.Adapter): IMAGE_UPLOAD_SLEEP = 30 + LAUNCH_TEMPLATE_PREFIX = 'nodepool-launch-template' def __init__(self, provider_config): # Wrap these instance methods with a per-instance LRU cache so @@ -532,6 +536,7 @@ class AwsAdapter(statemachine.Adapter): # time on that again. self.not_our_images = set() self.not_our_snapshots = set() + self._createLaunchTemplates() def stop(self): self.create_executor.shutdown() @@ -668,10 +673,18 @@ class AwsAdapter(statemachine.Adapter): if label.dedicated_host: host_types.add(label.instance_type) else: - if label.instance_type not in instance_types: - instance_types[label.instance_type] = set() - instance_types[label.instance_type].add( - SPOT if label.use_spot else ON_DEMAND) + label_instance_types = [] + if label.instance_type: + label_instance_types.append(label.instance_type) + elif label.fleet and label.fleet.get('instance-types'): + # Include instance-types from fleet config if available + label_instance_types.extend( + label.fleet.get('instance-types')) + for label_instance_type in label_instance_types: + if label_instance_type not in instance_types: + instance_types[label_instance_type] = set() + instance_types[label_instance_type].add( + SPOT if label.use_spot else ON_DEMAND) if label.volume_type: volume_types.add(label.volume_type) args = dict(default=math.inf) @@ -725,6 +738,13 @@ class AwsAdapter(statemachine.Adapter): return QuotaInformation(**args) def getQuotaForLabel(self, label): + return self._getQuotaForLabel(label) + + def _getQuotaForLabel(self, label, instance_type=None): + # When using the Fleet API, we may need to fill in quota + # information from the actual instance, so this internal + # method operates on the label alone or label+instance. + # For now, we are optimistically assuming that when an # instance is launched on a dedicated host, it is not counted # against instance quota. That may be overly optimistic. If @@ -733,9 +753,13 @@ class AwsAdapter(statemachine.Adapter): if label.dedicated_host: quota = self._getQuotaForHostType( label.instance_type) + elif label.fleet and instance_type is None: + # For fleet API, do not check quota before launch the instance + quota = QuotaInformation(instances=1) else: + check_instance_type = label.instance_type or instance_type quota = self._getQuotaForInstanceType( - label.instance_type, + check_instance_type, SPOT if label.use_spot else ON_DEMAND) if label.volume_type: quota.add(self._getQuotaForVolumeType( @@ -1486,6 +1510,197 @@ class AwsAdapter(statemachine.Adapter): else: image_id = self._getImageId(label.cloud_image) + if label.fleet: + return self._createFleet(label, image_id, tags, hostname, log) + else: + return self._runInstance(label, image_id, tags, + hostname, dedicated_host_id, log) + + def _createLaunchTemplates(self): + fleet_labels = [] + for pool_name, pool in self.provider.pools.items(): + for label_name, label in pool.labels.items(): + # Create launch templates only for labels which usage fleet + if not label.fleet: + continue + fleet_labels.append(label) + + if not fleet_labels: + return + + self.log.info("Creating launch templates") + tags = { + 'nodepool_managed': True, + 'nodepool_provider_name': self.provider.name, + } + existing_templates = dict() # for clean up and avoid creation attempt + created_templates = set() # for avoid creation attempt + configured_templates = set() # for clean up + + name_filter = { + 'Name': 'launch-template-name', + 'Values': [f'{self.LAUNCH_TEMPLATE_PREFIX}-*'], + } + paginator = self.ec2_client.get_paginator( + 'describe_launch_templates') + with self.non_mutating_rate_limiter: + for page in paginator.paginate(Filters=[name_filter]): + for template in page['LaunchTemplates']: + existing_templates[ + template['LaunchTemplateName']] = template + + for label in fleet_labels: + ebs_settings = { + 'DeleteOnTermination': True, + } + if label.volume_size: + ebs_settings['VolumeSize'] = label.volume_size + if label.volume_type: + ebs_settings['VolumeType'] = label.volume_type + if label.iops: + ebs_settings['Iops'] = label.iops + if label.throughput: + ebs_settings['Throughput'] = label.throughput + template_data = { + 'KeyName': label.key_name, + 'SecurityGroupIds': [label.pool.security_group_id], + 'BlockDeviceMappings': [ + { + 'DeviceName': '/dev/sda1', + 'Ebs': ebs_settings, + }, + ], + } + template_args = dict( + LaunchTemplateData=template_data, + TagSpecifications=[ + { + 'ResourceType': 'launch-template', + 'Tags': tag_dict_to_list(tags), + }, + ] + ) + + template_name = self._getLaunchTemplateName(template_args) + configured_templates.add(template_name) + + label._launch_template_name = template_name + if (template_name in existing_templates or + template_name in created_templates): + self.log.debug( + 'Launch template %s already exists', template_name) + continue + + template_args['LaunchTemplateName'] = template_name + self.log.debug('Creating launch template %s', template_name) + try: + self.ec2_client.create_launch_template(**template_args) + created_templates.add(template_name) + self.log.debug('Launch template %s created', template_name) + except botocore.exceptions.ClientError as e: + if (e.response['Error']['Code'] == + 'InvalidLaunchTemplateName.AlreadyExistsException'): + self.log.debug( + 'Launch template %s already created', + template_name) + else: + raise e + except Exception: + self.log.exception( + 'Could not create launch template %s', template_name) + + # remove unused templates + for template_name, template in existing_templates.items(): + if template_name not in configured_templates: + # check if the template was created by the current provider + tags = template.get('Tags', []) + for tag in tags: + if (tag['Key'] == 'nodepool_provider_name' and + tag['Value'] == self.provider.name): + self.ec2_client.delete_launch_template( + LaunchTemplateName=template_name) + self.log.debug("Deleted unused launch template: %s", + template_name) + + def _getLaunchTemplateName(self, args): + hasher = hashlib.sha256() + hasher.update(json.dumps(args, sort_keys=True).encode('utf8')) + sha = hasher.hexdigest() + return (f'{self.LAUNCH_TEMPLATE_PREFIX}-{sha}') + + def _createFleet(self, label, image_id, tags, hostname, log): + overrides = [] + + instance_types = label.fleet.get('instance-types', []) + for instance_type in instance_types: + overrides.append({ + 'ImageId': image_id, + 'InstanceType': instance_type, + 'SubnetId': label.pool.subnet_id, + }) + + if label.use_spot: + capacity_type_option = { + 'SpotOptions': { + 'AllocationStrategy': label.fleet['allocation-strategy'], + }, + 'TargetCapacitySpecification': { + 'TotalTargetCapacity': 1, + 'DefaultTargetCapacityType': 'spot', + }, + } + else: + capacity_type_option = { + 'OnDemandOptions': { + 'AllocationStrategy': label.fleet['allocation-strategy'], + }, + 'TargetCapacitySpecification': { + 'TotalTargetCapacity': 1, + 'DefaultTargetCapacityType': 'on-demand', + }, + } + + template_name = label._launch_template_name + + args = { + **capacity_type_option, + 'LaunchTemplateConfigs': [ + { + 'LaunchTemplateSpecification': { + 'LaunchTemplateName': template_name, + 'Version': '$Latest', + }, + 'Overrides': overrides, + }, + ], + 'Type': 'instant', + 'TagSpecifications': [ + { + 'ResourceType': 'instance', + 'Tags': tag_dict_to_list(tags), + }, + { + 'ResourceType': 'volume', + 'Tags': tag_dict_to_list(tags), + }, + ], + } + + with self.rate_limiter(log.debug, "Created fleet"): + resp = self.ec2_client.create_fleet(**args) + + instance_id = resp['Instances'][0]['InstanceIds'][0] + + describe_instances_result = self.ec2_client.describe_instances( + InstanceIds=[instance_id] + ) + log.debug("Created VM %s as instance %s using EC2 Fleet API", + hostname, instance_id) + + return describe_instances_result['Reservations'][0]['Instances'][0] + + def _runInstance(self, label, image_id, tags, hostname, + dedicated_host_id, log): args = dict( ImageId=image_id, MinCount=1, diff --git a/nodepool/driver/aws/config.py b/nodepool/driver/aws/config.py index a9c8ed916..2e135e675 100644 --- a/nodepool/driver/aws/config.py +++ b/nodepool/driver/aws/config.py @@ -173,7 +173,7 @@ class AwsLabel(ConfigValue): self.diskimage = None self.ebs_optimized = bool(label.get('ebs-optimized', False)) - self.instance_type = label['instance-type'] + self.instance_type = label.get('instance-type', None) self.key_name = label.get('key-name') self.volume_type = label.get('volume-type') self.volume_size = label.get('volume-size') @@ -194,6 +194,7 @@ class AwsLabel(ConfigValue): if not self.pool.az: raise Exception( "Availability-zone is required for dedicated hosts") + self.fleet = label.get('fleet', None) @staticmethod def getSchema(): @@ -201,7 +202,13 @@ class AwsLabel(ConfigValue): v.Required('name'): str, v.Exclusive('cloud-image', 'image'): str, v.Exclusive('diskimage', 'image'): str, - v.Required('instance-type'): str, + v.Exclusive('instance-type', 'instance'): str, + v.Exclusive('fleet', 'instance'): { + v.Required('instance-types'): list, + v.Required('allocation-strategy'): v.Any( + 'prioritized', 'price-capacity-optimized', + 'capacity-optimized', 'diversified', 'lowest-price') + }, v.Required('key-name'): str, 'ebs-optimized': bool, 'volume-type': str, diff --git a/nodepool/driver/statemachine.py b/nodepool/driver/statemachine.py index a90ea0bfe..c18509665 100644 --- a/nodepool/driver/statemachine.py +++ b/nodepool/driver/statemachine.py @@ -166,6 +166,10 @@ class StateMachineNodeLauncher(stats.StatsReporter): node.driver_data = instance.driver_data node.slot = instance.slot + # If we did not know the resource information before + # launching, update it now. + node.resources = instance.getQuotaInformation().get_resources() + # Optionally, if the node has updated values that we set from # the image attributes earlier, set those. for attr in ('username', 'python_path', 'shell_type', diff --git a/nodepool/driver/utils.py b/nodepool/driver/utils.py index a6e158ea0..1e7bab796 100644 --- a/nodepool/driver/utils.py +++ b/nodepool/driver/utils.py @@ -246,6 +246,12 @@ class QuotaInformation: '''Return resources value to register in ZK node''' return self.quota['compute'] + @staticmethod + def from_resources(resources): + qi = QuotaInformation() + qi.quota['compute'] = resources + return qi + def __str__(self): return str(self.quota) @@ -364,8 +370,14 @@ class QuotaSupport: # may have changed under it. It should settle out # eventually when it's deleted. continue - node_resources = self.quotaNeededByLabel( - node.type[0], provider_pool) + # If the node resources is valid, we can use that to + # construct the qi object for the node. + if node.resources['cores']: + node_resources = QuotaInformation.from_resources( + node.resources) + else: + node_resources = self.quotaNeededByLabel( + node.type[0], provider_pool) used_quota.add(node_resources) except Exception: self.log.exception("Couldn't consider invalid node %s " diff --git a/nodepool/tests/fixtures/aws/aws-fleet-old-template.yaml b/nodepool/tests/fixtures/aws/aws-fleet-old-template.yaml new file mode 100644 index 000000000..0f3b01fa5 --- /dev/null +++ b/nodepool/tests/fixtures/aws/aws-fleet-old-template.yaml @@ -0,0 +1,44 @@ +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +zookeeper-tls: + ca: {zookeeper_ca} + cert: {zookeeper_cert} + key: {zookeeper_key} + +tenant-resource-limits: + - tenant-name: tenant-1 + max-cores: 1024 + +labels: + - name: ubuntu1404-io2 + +providers: + - name: ec2-us-west-2 + driver: aws + region-name: us-west-2 + cloud-images: + - name: ubuntu1404 + image-id: ami-1e749f67 + username: ubuntu + pools: + - name: main + max-servers: 10 + subnet-id: {subnet_id} + security-group-id: {security_group_id} + node-attributes: + key1: value1 + key2: value2 + labels: + - name: ubuntu1404-io2 + cloud-image: ubuntu1404 + fleet: + instance-types: + - t3.medium + allocation-strategy: prioritized + key-name: zuul + volume-type: io2 + volume-size: 20 + iops: 1234 diff --git a/nodepool/tests/fixtures/aws/aws-fleet.yaml b/nodepool/tests/fixtures/aws/aws-fleet.yaml new file mode 100644 index 000000000..fa86a5404 --- /dev/null +++ b/nodepool/tests/fixtures/aws/aws-fleet.yaml @@ -0,0 +1,104 @@ +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +zookeeper-tls: + ca: {zookeeper_ca} + cert: {zookeeper_cert} + key: {zookeeper_key} + +tenant-resource-limits: + - tenant-name: tenant-1 + max-cores: 1024 + +labels: + - name: ubuntu1404-io2 + - name: ubuntu1404-gp3 + - name: ubuntu1404-on-demand + - name: ubuntu1404-spot + - name: ubuntu1404-4core + - name: ubuntu1404-fleet-4core + +providers: + - name: ec2-us-west-2 + driver: aws + region-name: us-west-2 + cloud-images: + - name: ubuntu1404 + image-id: ami-1e749f67 + username: ubuntu + pools: + - name: main + max-servers: 10 + subnet-id: {subnet_id} + security-group-id: {security_group_id} + node-attributes: + key1: value1 + key2: value2 + labels: + - name: ubuntu1404-io2 + cloud-image: ubuntu1404 + fleet: + instance-types: + - t3.medium + allocation-strategy: prioritized + key-name: zuul + volume-type: io2 + volume-size: 20 + iops: 2000 + - name: ubuntu1404-gp3 + cloud-image: ubuntu1404 + fleet: + instance-types: + - t3.medium + allocation-strategy: prioritized + key-name: zuul + volume-type: gp3 + volume-size: 40 + iops: 1000 + throughput: 200 + - name: ubuntu1404-on-demand + cloud-image: ubuntu1404 + fleet: + instance-types: + - t3.nano + - t3.micro + - t3.small + - t3.medium + allocation-strategy: prioritized + key-name: zuul + volume-type: gp3 + volume-size: 40 + iops: 1000 + throughput: 200 + - name: ubuntu1404-spot + cloud-image: ubuntu1404 + fleet: + instance-types: + - t3.nano + - t3.micro + - t3.small + - t3.medium + allocation-strategy: price-capacity-optimized + key-name: zuul + volume-type: gp3 + volume-size: 40 + iops: 1000 + throughput: 200 + use-spot: True + - name: ubuntu1404-4core + cloud-image: ubuntu1404 + instance-type: t3.xlarge + key-name: zuul + - name: ubuntu1404-fleet-4core + cloud-image: ubuntu1404 + fleet: + instance-types: + - t3.xlarge + allocation-strategy: prioritized + key-name: zuul + volume-type: gp3 + volume-size: 40 + iops: 1000 + throughput: 200 \ No newline at end of file diff --git a/nodepool/tests/fixtures/config_validate/aws-fleet-allocation-strategy-error.yaml b/nodepool/tests/fixtures/config_validate/aws-fleet-allocation-strategy-error.yaml new file mode 100644 index 000000000..5a1515118 --- /dev/null +++ b/nodepool/tests/fixtures/config_validate/aws-fleet-allocation-strategy-error.yaml @@ -0,0 +1,35 @@ +elements-dir: /etc/nodepool/elements +images-dir: /opt/nodepool_dib + +zookeeper-servers: + - host: zk1.openstack.org + port: 2181 + chroot: /test + +labels: + - name: ubuntu1404-on-demand + - name: ubuntu1404-spot + +providers: + - name: ec2-us-west-2 + driver: aws + region-name: us-west-2 + cloud-images: + - name: ubuntu1404 + image-id: ami-1e749f67 + username: ubuntu + pools: + - name: main + max-servers: 10 + node-attributes: + key1: value1 + key2: value2 + labels: + - name: ubuntu1404-on-demand + cloud-image: ubuntu1404 + key-name: zuul + fleet: + instance-types: + - t3.nano + - t3.micro + allocation-strategy: not-exist diff --git a/nodepool/tests/fixtures/config_validate/aws-fleet-exclusive-error.yaml b/nodepool/tests/fixtures/config_validate/aws-fleet-exclusive-error.yaml new file mode 100644 index 000000000..14f21ff28 --- /dev/null +++ b/nodepool/tests/fixtures/config_validate/aws-fleet-exclusive-error.yaml @@ -0,0 +1,43 @@ +elements-dir: /etc/nodepool/elements +images-dir: /opt/nodepool_dib + +zookeeper-servers: + - host: zk1.openstack.org + port: 2181 + chroot: /test + +labels: + - name: ubuntu1404-on-demand + - name: ubuntu1404-spot + +providers: + - name: ec2-us-west-2 + driver: aws + region-name: us-west-2 + cloud-images: + - name: ubuntu1404 + image-id: ami-1e749f67 + username: ubuntu + pools: + - name: main + max-servers: 10 + node-attributes: + key1: value1 + key2: value2 + labels: + - name: ubuntu1404-on-demand + cloud-image: ubuntu1404 + key-name: zuul + fleet: + # "instance-types" and "instance-requirements" are exclusive + instance-types: + - t3.nano + - t3.micro + instance-requirements: + vcpu-count: + min: 1 + max: 8 + memory-mib: + min: 1 + max: 16000 + allocation-strategy: lowest-price diff --git a/nodepool/tests/fixtures/config_validate/aws-fleet-good.yaml b/nodepool/tests/fixtures/config_validate/aws-fleet-good.yaml new file mode 100644 index 000000000..0efde683b --- /dev/null +++ b/nodepool/tests/fixtures/config_validate/aws-fleet-good.yaml @@ -0,0 +1,35 @@ +elements-dir: /etc/nodepool/elements +images-dir: /opt/nodepool_dib + +zookeeper-servers: + - host: zk1.openstack.org + port: 2181 + chroot: /test + +labels: + - name: ubuntu1404-on-demand + - name: ubuntu1404-spot + +providers: + - name: ec2-us-west-2 + driver: aws + region-name: us-west-2 + cloud-images: + - name: ubuntu1404 + image-id: ami-1e749f67 + username: ubuntu + pools: + - name: main + max-servers: 10 + node-attributes: + key1: value1 + key2: value2 + labels: + - name: ubuntu1404-on-demand + cloud-image: ubuntu1404 + key-name: zuul + fleet: + instance-types: + - t3.small + - t3.large + allocation-strategy: lowest-price diff --git a/nodepool/tests/unit/test_config_validator.py b/nodepool/tests/unit/test_config_validator.py index 4084e447d..219f0df10 100644 --- a/nodepool/tests/unit/test_config_validator.py +++ b/nodepool/tests/unit/test_config_validator.py @@ -102,3 +102,30 @@ class TestConfigValidation(tests.BaseTestCase): validator = ConfigValidator(config) ret = validator.validate() self.assertEqual(ret, 1) + + def test_aws_fleet_good(self): + config = os.path.join(os.path.dirname(tests.__file__), + 'fixtures', 'config_validate', + 'aws-fleet-good.yaml') + + validator = ConfigValidator(config) + ret = validator.validate() + self.assertEqual(ret, 0) + + def test_aws_fleet_exclusive_error(self): + config = os.path.join(os.path.dirname(tests.__file__), + 'fixtures', 'config_validate', + 'aws-fleet-exclusive-error.yaml') + + validator = ConfigValidator(config) + ret = validator.validate() + self.assertEqual(ret, 1) + + def test_aws_fleet_allocation_strategy_error(self): + config = os.path.join(os.path.dirname(tests.__file__), + 'fixtures', 'config_validate', + 'aws-fleet-allocation-strategy-error.yaml') + + validator = ConfigValidator(config) + ret = validator.validate() + self.assertEqual(ret, 1) diff --git a/nodepool/tests/unit/test_driver_aws.py b/nodepool/tests/unit/test_driver_aws.py index a48853839..c34a28c07 100644 --- a/nodepool/tests/unit/test_driver_aws.py +++ b/nodepool/tests/unit/test_driver_aws.py @@ -55,6 +55,19 @@ class FakeAwsAdapter(AwsAdapter): raise self.__testcase.run_instances_exception return self.ec2_client.run_instances_orig(*args, **kwargs) + # Note: boto3 doesn't handle all features correctly (e.g. + # instance-requirements, volume attributes) when creating + # fleet in fake mode, we need to intercept the create_fleet + # call and validate the args we supply. Results are also + # intercepted for validate instance attributes + def _fake_create_fleet(*args, **kwargs): + self.__testcase.create_fleet_calls.append(kwargs) + if self.__testcase.create_fleet_exception: + raise self.__testcase.create_fleet_exception + result = self.ec2_client.create_fleet_orig(*args, **kwargs) + self.__testcase.create_fleet_results.append(result) + return result + def _fake_allocate_hosts(*args, **kwargs): if self.__testcase.allocate_hosts_exception: raise self.__testcase.allocate_hosts_exception @@ -73,6 +86,8 @@ class FakeAwsAdapter(AwsAdapter): self.ec2_client.run_instances_orig = self.ec2_client.run_instances self.ec2_client.run_instances = _fake_run_instances + self.ec2_client.create_fleet_orig = self.ec2_client.create_fleet + self.ec2_client.create_fleet = _fake_create_fleet self.ec2_client.allocate_hosts_orig = self.ec2_client.allocate_hosts self.ec2_client.allocate_hosts = _fake_allocate_hosts self.ec2_client.register_image_orig = self.ec2_client.register_image @@ -174,6 +189,9 @@ class TestDriverAws(tests.DBTestCase): # A list of args to method calls for validation self.run_instances_calls = [] self.run_instances_exception = None + self.create_fleet_calls = [] + self.create_fleet_results = [] + self.create_fleet_exception = None self.allocate_hosts_exception = None self.register_image_calls = [] @@ -721,6 +739,7 @@ class TestDriverAws(tests.DBTestCase): instance['Placement'] = {'AvailabilityZone': 'us-west-2b'} iface = {'Ipv6Addresses': [{'Ipv6Address': 'fe80::dead:beef'}]} instance['NetworkInterfaces'] = [iface] + instance['InstanceType'] = 'test' provider = Dummy() provider.region_name = 'us-west-2' awsi = AwsInstance(provider, instance, None, None) @@ -1349,3 +1368,164 @@ class TestDriverAws(tests.DBTestCase): hosts = self.ec2_client.describe_hosts()['Hosts'] hosts = [h for h in hosts if h['State'] != 'released'] self.assertEqual(len(hosts), 0) + + def test_aws_create_launch_templates(self): + configfile = self.setup_config('aws/aws-fleet.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.startPool(pool) + + launch_tempaltes = self.ec2_client.\ + describe_launch_templates()['LaunchTemplates'] + self.assertEqual(len(launch_tempaltes), 2) + lt1 = launch_tempaltes[0] + lt2 = launch_tempaltes[1] + self.assertTrue(lt1['LaunchTemplateName'].startswith( + 'nodepool-launch-template')) + self.assertTrue(lt2['LaunchTemplateName'].startswith( + 'nodepool-launch-template')) + + lt_version = self.ec2_client.\ + describe_launch_template_versions( + LaunchTemplateId=lt2['LaunchTemplateId'])[ + 'LaunchTemplateVersions'][0] + lt_data = lt_version['LaunchTemplateData'] + self.assertIsNotNone(lt_data.get('SecurityGroupIds')) + ebs_settings = lt_data['BlockDeviceMappings'][0]['Ebs'] + self.assertTrue(ebs_settings['DeleteOnTermination']) + self.assertEqual(ebs_settings['Iops'], 1000) + self.assertEqual(ebs_settings['VolumeSize'], 40) + self.assertEqual(ebs_settings['VolumeType'], 'gp3') + self.assertEqual(ebs_settings['Throughput'], 200) + + # Restart pool, the launch templates must be the same and + # must not be recreated + pool.stop() + configfile = self.setup_config('aws/aws-fleet.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.startPool(pool) + + lt_2nd_run = self.ec2_client.\ + describe_launch_templates()['LaunchTemplates'] + self.assertEqual(len(lt_2nd_run), 2) + self.assertEqual(lt1['LaunchTemplateId'], + lt_2nd_run[0]['LaunchTemplateId']) + self.assertEqual(lt2['LaunchTemplateId'], + lt_2nd_run[1]['LaunchTemplateId']) + + def test_aws_cleanup_launch_templates(self): + # start nodepool with old templates config + configfile = self.setup_config('aws/aws-fleet-old-template.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.startPool(pool) + + launch_tempaltes = self.ec2_client.\ + describe_launch_templates()['LaunchTemplates'] + self.assertEqual(len(launch_tempaltes), 1) + + # Restart pool with the config that not include the old template, + # the old template should be deleted. + pool.stop() + configfile = self.setup_config('aws/aws-fleet.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.startPool(pool) + + lt_2nd_run = self.ec2_client.\ + describe_launch_templates()['LaunchTemplates'] + self.assertEqual(len(lt_2nd_run), 2) + + def test_aws_create_fleet_on_demand(self): + req = self.requestNode('aws/aws-fleet.yaml', 'ubuntu1404-on-demand') + node = self.assertSuccess(req) + + self.assertEqual( + self.create_fleet_calls[0]['OnDemandOptions'] + ['AllocationStrategy'], 'prioritized') + self.assertTrue( + self.create_fleet_calls[0]['LaunchTemplateConfigs'][0] + ['LaunchTemplateSpecification']['LaunchTemplateName'].startswith( + 'nodepool-launch-template')) + self.assertEqual(self.create_fleet_calls[0]['TagSpecifications'][0] + ['ResourceType'], 'instance') + self.assertEqual(self.create_fleet_calls[0]['TagSpecifications'][0] + ['Tags'][1]['Key'], 'nodepool_pool_name') + self.assertEqual(self.create_fleet_calls[0]['TagSpecifications'][0] + ['Tags'][1]['Value'], 'main') + self.assertEqual(self.create_fleet_calls[0]['TagSpecifications'][1] + ['ResourceType'], 'volume') + self.assertEqual(self.create_fleet_calls[0]['TagSpecifications'][1] + ['Tags'][1]['Key'], 'nodepool_pool_name') + self.assertEqual(self.create_fleet_calls[0]['TagSpecifications'][1] + ['Tags'][1]['Value'], 'main') + self.assertEqual( + self.create_fleet_results[0]['Instances'][0]['Lifecycle'], + 'on-demand') + self.assertIn(self.create_fleet_results[0]['Instances'][0] + ['InstanceType'], + ('t3.nano', 't3.micro', 't3.small', 't3.medium')) + + node.state = zk.USED + self.zk.storeNode(node) + self.waitForNodeDeletion(node) + + def test_aws_create_fleet_spot(self): + req = self.requestNode('aws/aws-fleet.yaml', 'ubuntu1404-spot') + node = self.assertSuccess(req) + + self.assertEqual( + self.create_fleet_calls[0]['SpotOptions'] + ['AllocationStrategy'], 'price-capacity-optimized') + self.assertEqual( + self.create_fleet_calls[0]['TargetCapacitySpecification'] + ['DefaultTargetCapacityType'], 'spot') + self.assertIn(self.create_fleet_results[0]['Instances'][0] + ['InstanceType'], + ('t3.nano', 't3.micro', 't3.small', 't3.medium')) + + node.state = zk.USED + self.zk.storeNode(node) + self.waitForNodeDeletion(node) + + @ec2_quotas({ + 'L-1216C47A': 6, + 'L-34B43A08': 2 + }) + def test_aws_fleet_quota(self): + # Test if the quota used by instances launched by fleet API + # are taken into account. + configfile = self.setup_config('aws/aws-fleet.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + self.startPool(pool) + + # Create a node request with fleet API. + req1 = zk.NodeRequest() + req1.state = zk.REQUESTED + req1.node_types.append('ubuntu1404-fleet-4core') + self.zk.storeNodeRequest(req1) + self.log.debug("Waiting for request %s", req1.id) + req1 = self.waitForNodeRequest(req1) + node1 = self.assertSuccess(req1) + + # Create a second node request with non-fleet API; this should be + # over quota so it won't be fulfilled. + req2 = zk.NodeRequest() + req2.state = zk.REQUESTED + req2.node_types.append('ubuntu1404-4core') + self.zk.storeNodeRequest(req2) + self.log.debug("Waiting for request %s", req2.id) + req2 = self.waitForNodeRequest(req2, (zk.PENDING,)) + + # Make sure we're paused while we attempt to fulfill the + # second request. + pool_worker = pool.getPoolWorkers('ec2-us-west-2') + for _ in iterate_timeout(30, Exception, 'paused handler'): + if pool_worker[0].paused_handlers: + break + + # Release the first node so that the second can be fulfilled. + node1.state = zk.USED + self.zk.storeNode(node1) + self.waitForNodeDeletion(node1) + + # Make sure the second high node exists now. + req2 = self.waitForNodeRequest(req2) + self.assertSuccess(req2) diff --git a/releasenotes/notes/aws-fleet-api-ef99139931642c54.yaml b/releasenotes/notes/aws-fleet-api-ef99139931642c54.yaml new file mode 100644 index 000000000..ce1565f2d --- /dev/null +++ b/releasenotes/notes/aws-fleet-api-ef99139931642c54.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + The AWS driver now support EC2 Fleet API to launch instances whose + types are selected from among a specified set based on certain + optimization criteria.