diff --git a/doc/source/aws.rst b/doc/source/aws.rst index 07f7a19b1..71a61c9da 100644 --- a/doc/source/aws.rst +++ b/doc/source/aws.rst @@ -66,6 +66,7 @@ Selecting the ``aws`` driver adds the following options to the cloud-image: debian9 instance-type: t3.large key-name: zuul + use-spot: True tags: key1: value1 key2: value2 @@ -741,6 +742,30 @@ Selecting the ``aws`` driver adds the following options to the dynamic-tags: request_info: "Created for request {request.id}" + .. attr:: use-spot + :type: bool + :default: False + + When set to True, Nodepool will try to launch an Amazon EC2 Spot + instance, instead of an On-Demand instance. Spot instances let + you take advantage of unused EC2 capacity at a discount. + + For example: + + .. code-block:: yaml + + labels: + - name: frugal + use-spot: True + + .. note:: As Amazon EC2 Spot instances take advantage of unused + EC2 capacity, you may not get an instance if demand + is high. In addition, Amazon EC2 may interrupt your + Spot instance and reclaim it with a two-minute warning + upfront. Therefore, you might want to set up alternative + nodesets as a fallback. + + .. _`EBS volume type`: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html .. _`AWS region`: https://docs.aws.amazon.com/general/latest/gr/rande.html .. 
_`Boto configuration`: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html diff --git a/nodepool/driver/aws/adapter.py b/nodepool/driver/aws/adapter.py index c5a9eb3b4..09cdf151c 100644 --- a/nodepool/driver/aws/adapter.py +++ b/nodepool/driver/aws/adapter.py @@ -60,26 +60,31 @@ def tag_list_to_dict(taglist): # https://aws.amazon.com/ec2/instance-types/high-memory/ QUOTA_CODES = { - 'a': 'L-1216C47A', - 'c': 'L-1216C47A', - 'd': 'L-1216C47A', - 'h': 'L-1216C47A', - 'i': 'L-1216C47A', - 'm': 'L-1216C47A', - 'r': 'L-1216C47A', - 't': 'L-1216C47A', - 'z': 'L-1216C47A', - 'dl': 'L-6E869C2A', - 'f': 'L-74FC7D96', - 'g': 'L-DB2E81BA', - 'vt': 'L-DB2E81BA', - 'u-': 'L-43DA4232', # 'high memory' - 'inf': 'L-1945791B', - 'p': 'L-417A185B', - 'x': 'L-7295265B', + # INSTANCE FAMILY: [ON-DEMAND, SPOT] + 'a': ['L-1216C47A', 'L-34B43A08'], + 'c': ['L-1216C47A', 'L-34B43A08'], + 'd': ['L-1216C47A', 'L-34B43A08'], + 'h': ['L-1216C47A', 'L-34B43A08'], + 'i': ['L-1216C47A', 'L-34B43A08'], + 'm': ['L-1216C47A', 'L-34B43A08'], + 'r': ['L-1216C47A', 'L-34B43A08'], + 't': ['L-1216C47A', 'L-34B43A08'], + 'z': ['L-1216C47A', 'L-34B43A08'], + 'dl': ['L-6E869C2A', 'L-85EED4F7'], + 'f': ['L-74FC7D96', 'L-88CF9481'], + 'g': ['L-DB2E81BA', 'L-3819A6DF'], + 'vt': ['L-DB2E81BA', 'L-3819A6DF'], + 'u-': ['L-43DA4232', ''], # 'high memory' + 'inf': ['L-1945791B', 'L-B5D1601B'], + 'p': ['L-417A185B', 'L-7212CCBC'], + 'x': ['L-7295265B', 'L-E3A00192'], + 'trn': ['L-2C3B7624', 'L-6B0D517C'], + 'hpc': ['L-F7808C92', ''] } CACHE_TTL = 10 +ON_DEMAND = 0 +SPOT = 1 class AwsInstance(statemachine.Instance): @@ -183,7 +188,8 @@ class AwsCreateStateMachine(statemachine.StateMachine): return self.instance = instance self.quota = self.adapter._getQuotaForInstanceType( - self.instance.instance_type) + self.instance.instance_type, + SPOT if self.label.use_spot else ON_DEMAND) self.state = self.INSTANCE_CREATING if self.state == self.INSTANCE_CREATING: @@ -360,35 +366,45 @@ class 
AwsAdapter(statemachine.Adapter): for instance in self._listInstances(): if instance.state["Name"].lower() == "terminated": continue - quota = self._getQuotaForInstanceType(instance.instance_type) + quota = self._getQuotaForInstanceType( + instance.instance_type, + SPOT if instance.instance_lifecycle == 'spot' else ON_DEMAND) yield AwsInstance(self.provider, instance, quota) def getQuotaLimits(self): # Get the instance types that this provider handles - instance_types = set() + instance_types = {} for pool in self.provider.pools.values(): for label in pool.labels.values(): - instance_types.add(label.instance_type) + if label.instance_type not in instance_types: + instance_types[label.instance_type] = set() + instance_types[label.instance_type].add( + SPOT if label.use_spot else ON_DEMAND) args = dict(default=math.inf) for instance_type in instance_types: - code = self._getQuotaCodeForInstanceType(instance_type) - if code in args: - continue - if not code: - self.log.warning("Unknown quota code for instance type: %s", - instance_type) - continue - with self.non_mutating_rate_limiter: - self.log.debug("Getting quota limits for %s", code) - response = self.aws_quotas.get_service_quota( - ServiceCode='ec2', - QuotaCode=code, - ) - args[code] = response['Quota']['Value'] + for market_type_option in instance_types[instance_type]: + code = self._getQuotaCodeForInstanceType(instance_type, + market_type_option) + if code in args: + continue + if not code: + self.log.warning( + "Unknown quota code for instance type: %s", + instance_type) + continue + with self.non_mutating_rate_limiter: + self.log.debug("Getting quota limits for %s", code) + response = self.aws_quotas.get_service_quota( + ServiceCode='ec2', + QuotaCode=code, + ) + args[code] = response['Quota']['Value'] return QuotaInformation(**args) def getQuotaForLabel(self, label): - return self._getQuotaForInstanceType(label.instance_type) + return self._getQuotaForInstanceType( + label.instance_type, + SPOT if 
label.use_spot else ON_DEMAND) def uploadImage(self, provider_image, image_name, filename, image_format, metadata, md5, sha256): @@ -753,18 +769,19 @@ class AwsAdapter(statemachine.Adapter): instance_key_re = re.compile(r'([a-z\-]+)\d.*') - def _getQuotaCodeForInstanceType(self, instance_type): + def _getQuotaCodeForInstanceType(self, instance_type, market_type_option): m = self.instance_key_re.match(instance_type) if m: key = m.group(1) - return QUOTA_CODES.get(key) + return QUOTA_CODES.get(key)[market_type_option] - def _getQuotaForInstanceType(self, instance_type): + def _getQuotaForInstanceType(self, instance_type, market_type_option): itype = self._getInstanceType(instance_type) cores = itype['InstanceTypes'][0]['VCpuInfo']['DefaultCores'] vcpus = itype['InstanceTypes'][0]['VCpuInfo']['DefaultVCpus'] ram = itype['InstanceTypes'][0]['MemoryInfo']['SizeInMiB'] - code = self._getQuotaCodeForInstanceType(instance_type) + code = self._getQuotaCodeForInstanceType(instance_type, + market_type_option) # We include cores to match the overall cores quota (which may # be set as a tenant resource limit), and include vCPUs for the # specific AWS quota code which in for a specific instance @@ -967,6 +984,16 @@ class AwsAdapter(statemachine.Adapter): del mapping['Ebs']['Encrypted'] args['BlockDeviceMappings'] = [mapping] + # enable EC2 Spot + if label.use_spot: + args['InstanceMarketOptions'] = { + 'MarketType': 'spot', + 'SpotOptions': { + 'SpotInstanceType': 'one-time', + 'InstanceInterruptionBehavior': 'terminate' + } + } + with self.rate_limiter(log.debug, "Created instance"): log.debug(f"Creating VM {hostname}") instances = self.ec2.create_instances(**args) diff --git a/nodepool/driver/aws/config.py b/nodepool/driver/aws/config.py index e1adeca01..997d3f283 100644 --- a/nodepool/driver/aws/config.py +++ b/nodepool/driver/aws/config.py @@ -179,6 +179,7 @@ class AwsLabel(ConfigValue): self.tags = label.get('tags', {}) self.dynamic_tags = label.get('dynamic-tags', {}) 
self.host_key_checking = self.pool.host_key_checking + self.use_spot = bool(label.get('use-spot', False)) @staticmethod def getSchema(): @@ -200,6 +201,7 @@ class AwsLabel(ConfigValue): }, 'tags': dict, 'dynamic-tags': dict, + 'use-spot': bool, } diff --git a/nodepool/tests/fixtures/aws/aws-quota.yaml b/nodepool/tests/fixtures/aws/aws-quota.yaml index 9dce1c959..bc040094e 100644 --- a/nodepool/tests/fixtures/aws/aws-quota.yaml +++ b/nodepool/tests/fixtures/aws/aws-quota.yaml @@ -15,6 +15,8 @@ tenant-resource-limits: labels: - name: standard - name: high + - name: spot + - name: on-demand providers: - name: ec2-us-west-2 @@ -41,3 +43,12 @@ providers: cloud-image: ubuntu1404 instance-type: u-6tb1.112xlarge key-name: zuul + - name: spot + cloud-image: ubuntu1404 + instance-type: m6i.32xlarge + key-name: zuul + use-spot: True + - name: on-demand + cloud-image: ubuntu1404 + instance-type: m6i.32xlarge + key-name: zuul diff --git a/nodepool/tests/fixtures/aws/aws-spot.yaml b/nodepool/tests/fixtures/aws/aws-spot.yaml new file mode 100644 index 000000000..1bdc53651 --- /dev/null +++ b/nodepool/tests/fixtures/aws/aws-spot.yaml @@ -0,0 +1,39 @@ +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +zookeeper-tls: + ca: {zookeeper_ca} + cert: {zookeeper_cert} + key: {zookeeper_key} + +tenant-resource-limits: + - tenant-name: tenant-1 + max-cores: 1024 + +labels: + - name: ubuntu1404-spot + +providers: + - name: ec2-us-west-2 + driver: aws + region-name: us-west-2 + cloud-images: + - name: ubuntu1404 + image-id: ami-1e749f67 + username: ubuntu + pools: + - name: main + max-servers: 10 + subnet-id: {subnet_id} + security-group-id: {security_group_id} + node-attributes: + key1: value1 + key2: value2 + labels: + - name: ubuntu1404-spot + cloud-image: ubuntu1404 + instance-type: t3.medium + key-name: zuul-spot + use-spot: True diff --git a/nodepool/tests/unit/test_driver_aws.py b/nodepool/tests/unit/test_driver_aws.py index 
698789bb2..41aebbc55 100644 --- a/nodepool/tests/unit/test_driver_aws.py +++ b/nodepool/tests/unit/test_driver_aws.py @@ -244,6 +244,7 @@ class TestDriverAws(tests.DBTestCase): @aws_quotas({ 'L-1216C47A': 2, 'L-43DA4232': 448, + 'L-34B43A08': 2 }) def test_aws_multi_quota(self): # Test multiple instance type quotas (standard and high-mem) @@ -295,6 +296,59 @@ class TestDriverAws(tests.DBTestCase): req3 = self.waitForNodeRequest(req3) self.assertSuccess(req3) + @aws_quotas({ + 'L-43DA4232': 448, + 'L-1216C47A': 200, + 'L-34B43A08': 200 + }) + def test_aws_multi_quota_spot(self): + # Test multiple instance type quotas (standard, high-mem and spot) + configfile = self.setup_config('aws/aws-quota.yaml') + pool = self.useNodepool(configfile, watermark_sleep=1) + pool.start() + + # Create a spot node request which should succeed. + req1 = zk.NodeRequest() + req1.state = zk.REQUESTED + req1.node_types.append('spot') + self.zk.storeNodeRequest(req1) + self.log.debug("Waiting for request %s", req1.id) + req1 = self.waitForNodeRequest(req1) + node1 = self.assertSuccess(req1) + + # Create an on-demand node request which should succeed. + req2 = zk.NodeRequest() + req2.state = zk.REQUESTED + req2.node_types.append('on-demand') + self.zk.storeNodeRequest(req2) + self.log.debug("Waiting for request %s", req2.id) + req2 = self.waitForNodeRequest(req2) + self.assertSuccess(req2) + + # Create another spot node request which should be paused. + req3 = zk.NodeRequest() + req3.state = zk.REQUESTED + req3.node_types.append('spot') + self.zk.storeNodeRequest(req3) + self.log.debug("Waiting for request %s", req3.id) + req3 = self.waitForNodeRequest(req3, (zk.PENDING,)) + + # Make sure we're paused while we attempt to fulfill the + # third request. + pool_worker = pool.getPoolWorkers('ec2-us-west-2') + for _ in iterate_timeout(30, Exception, 'paused handler'): + if pool_worker[0].paused_handlers: + break + + # Release the first spot node so that the third can be fulfilled. 
+ node1.state = zk.USED + self.zk.storeNode(node1) + self.waitForNodeDeletion(node1) + + # Make sure the second spot node exists now. + req3 = self.waitForNodeRequest(req3) + self.assertSuccess(req3) + @aws_quotas({ 'L-1216C47A': 1000, 'L-43DA4232': 1000, @@ -916,3 +970,12 @@ class TestDriverAws(tests.DBTestCase): except botocore.exceptions.ClientError: # Probably not found break + + def test_aws_provisioning_spot_instances(self): + # Test creating a spot instance instead of an on-demand one. + req = self.requestNode('aws/aws-spot.yaml', 'ubuntu1404-spot') + node = self.assertSuccess(req) + instance = self.ec2.Instance(node.external_id) + self.assertEqual(instance.instance_lifecycle, 'spot') + # moto doesn't provide the spot_instance_request_id + # self.assertIsNotNone(instance.spot_instance_request_id) diff --git a/releasenotes/notes/aws-spot-9ef3d8ee39fde2b2.yaml b/releasenotes/notes/aws-spot-9ef3d8ee39fde2b2.yaml new file mode 100644 index 000000000..5027cedd2 --- /dev/null +++ b/releasenotes/notes/aws-spot-9ef3d8ee39fde2b2.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + The AWS driver now supports launching Amazon EC2 Spot instances + (https://aws.amazon.com/ec2/spot/), when specifying + :attr:`providers.[aws].pools.labels.use-spot`.