Amazon EC2 Spot support

This adds support for launching Amazon EC2 Spot instances
(https://aws.amazon.com/ec2/spot/), which come with huge cost-saving
opportunities.

Amazon EC2 Spot instances are spare Amazon EC2 capacity that you can get
at a discount of up to 90% compared to on-demand pricing.
In contrast to on-demand instances, Spot instances can be reclaimed with a
2-minute notification in advance
(https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-interruptions.html).

When :attr:`providers.[aws].pools.labels.use-spot` is set to True, the AWS
driver will launch Spot instances. If an instance gets interrupted, it will be
terminated and no replacement instance will be launched.

Change-Id: I9868d014991d78e7b2421439403ae1371b33524c
This commit is contained in:
Christian Mueller 2022-11-24 12:32:37 +01:00
parent 0e7de19664
commit 36dbff84ba
7 changed files with 213 additions and 40 deletions

View File

@ -66,6 +66,7 @@ Selecting the ``aws`` driver adds the following options to the
cloud-image: debian9 cloud-image: debian9
instance-type: t3.large instance-type: t3.large
key-name: zuul key-name: zuul
use-spot: True
tags: tags:
key1: value1 key1: value1
key2: value2 key2: value2
@ -741,6 +742,30 @@ Selecting the ``aws`` driver adds the following options to the
dynamic-tags: dynamic-tags:
request_info: "Created for request {request.id}" request_info: "Created for request {request.id}"
.. attr:: use-spot
:type: bool
:default: False
When set to True, Nodepool will try to launch an Amazon EC2 Spot
instance, instead of an On-Demand instance. Spot instances let
you take advantage of unused EC2 capacity at a discount.
For example:
.. code-block:: yaml
labels:
- name: frugal
use-spot: True
.. note:: As Amazon EC2 Spot instances take advantage of unused
EC2 capacity, you may not get an instance if demand
is high. In addition, Amazon EC2 may interrupt your
Spot instance and reclaim it with a two-minute warning
in advance. Therefore, you might want to set up alternative
nodesets as a fallback.
.. _`EBS volume type`: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html .. _`EBS volume type`: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html
.. _`AWS region`: https://docs.aws.amazon.com/general/latest/gr/rande.html .. _`AWS region`: https://docs.aws.amazon.com/general/latest/gr/rande.html
.. _`Boto configuration`: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html .. _`Boto configuration`: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html

View File

@ -60,26 +60,31 @@ def tag_list_to_dict(taglist):
# https://aws.amazon.com/ec2/instance-types/high-memory/ # https://aws.amazon.com/ec2/instance-types/high-memory/
QUOTA_CODES = { QUOTA_CODES = {
'a': 'L-1216C47A', # INSTANCE FAMILY: [ON-DEMAND, SPOT]
'c': 'L-1216C47A', 'a': ['L-1216C47A', 'L-34B43A08'],
'd': 'L-1216C47A', 'c': ['L-1216C47A', 'L-34B43A08'],
'h': 'L-1216C47A', 'd': ['L-1216C47A', 'L-34B43A08'],
'i': 'L-1216C47A', 'h': ['L-1216C47A', 'L-34B43A08'],
'm': 'L-1216C47A', 'i': ['L-1216C47A', 'L-34B43A08'],
'r': 'L-1216C47A', 'm': ['L-1216C47A', 'L-34B43A08'],
't': 'L-1216C47A', 'r': ['L-1216C47A', 'L-34B43A08'],
'z': 'L-1216C47A', 't': ['L-1216C47A', 'L-34B43A08'],
'dl': 'L-6E869C2A', 'z': ['L-1216C47A', 'L-34B43A08'],
'f': 'L-74FC7D96', 'dl': ['L-6E869C2A', 'L-85EED4F7'],
'g': 'L-DB2E81BA', 'f': ['L-74FC7D96', 'L-88CF9481'],
'vt': 'L-DB2E81BA', 'g': ['L-DB2E81BA', 'L-3819A6DF'],
'u-': 'L-43DA4232', # 'high memory' 'vt': ['L-DB2E81BA', 'L-3819A6DF'],
'inf': 'L-1945791B', 'u-': ['L-43DA4232', ''], # 'high memory'
'p': 'L-417A185B', 'inf': ['L-1945791B', 'L-B5D1601B'],
'x': 'L-7295265B', 'p': ['L-417A185B', 'L-7212CCBC'],
'x': ['L-7295265B', 'L-E3A00192'],
'trn': ['L-2C3B7624', 'L-6B0D517C'],
'hpc': ['L-F7808C92', '']
} }
CACHE_TTL = 10 CACHE_TTL = 10
ON_DEMAND = 0
SPOT = 1
class AwsInstance(statemachine.Instance): class AwsInstance(statemachine.Instance):
@ -183,7 +188,8 @@ class AwsCreateStateMachine(statemachine.StateMachine):
return return
self.instance = instance self.instance = instance
self.quota = self.adapter._getQuotaForInstanceType( self.quota = self.adapter._getQuotaForInstanceType(
self.instance.instance_type) self.instance.instance_type,
SPOT if self.label.use_spot else ON_DEMAND)
self.state = self.INSTANCE_CREATING self.state = self.INSTANCE_CREATING
if self.state == self.INSTANCE_CREATING: if self.state == self.INSTANCE_CREATING:
@ -360,35 +366,45 @@ class AwsAdapter(statemachine.Adapter):
for instance in self._listInstances(): for instance in self._listInstances():
if instance.state["Name"].lower() == "terminated": if instance.state["Name"].lower() == "terminated":
continue continue
quota = self._getQuotaForInstanceType(instance.instance_type) quota = self._getQuotaForInstanceType(
instance.instance_type,
SPOT if instance.instance_lifecycle == 'spot' else ON_DEMAND)
yield AwsInstance(self.provider, instance, quota) yield AwsInstance(self.provider, instance, quota)
def getQuotaLimits(self): def getQuotaLimits(self):
# Get the instance types that this provider handles # Get the instance types that this provider handles
instance_types = set() instance_types = {}
for pool in self.provider.pools.values(): for pool in self.provider.pools.values():
for label in pool.labels.values(): for label in pool.labels.values():
instance_types.add(label.instance_type) if label.instance_type not in instance_types:
instance_types[label.instance_type] = set()
instance_types[label.instance_type].add(
SPOT if label.use_spot else ON_DEMAND)
args = dict(default=math.inf) args = dict(default=math.inf)
for instance_type in instance_types: for instance_type in instance_types:
code = self._getQuotaCodeForInstanceType(instance_type) for market_type_option in instance_types[instance_type]:
if code in args: code = self._getQuotaCodeForInstanceType(instance_type,
continue market_type_option)
if not code: if code in args:
self.log.warning("Unknown quota code for instance type: %s", continue
instance_type) if not code:
continue self.log.warning(
with self.non_mutating_rate_limiter: "Unknown quota code for instance type: %s",
self.log.debug("Getting quota limits for %s", code) instance_type)
response = self.aws_quotas.get_service_quota( continue
ServiceCode='ec2', with self.non_mutating_rate_limiter:
QuotaCode=code, self.log.debug("Getting quota limits for %s", code)
) response = self.aws_quotas.get_service_quota(
args[code] = response['Quota']['Value'] ServiceCode='ec2',
QuotaCode=code,
)
args[code] = response['Quota']['Value']
return QuotaInformation(**args) return QuotaInformation(**args)
def getQuotaForLabel(self, label): def getQuotaForLabel(self, label):
return self._getQuotaForInstanceType(label.instance_type) return self._getQuotaForInstanceType(
label.instance_type,
SPOT if label.use_spot else ON_DEMAND)
def uploadImage(self, provider_image, image_name, filename, def uploadImage(self, provider_image, image_name, filename,
image_format, metadata, md5, sha256): image_format, metadata, md5, sha256):
@ -753,18 +769,19 @@ class AwsAdapter(statemachine.Adapter):
instance_key_re = re.compile(r'([a-z\-]+)\d.*') instance_key_re = re.compile(r'([a-z\-]+)\d.*')
def _getQuotaCodeForInstanceType(self, instance_type): def _getQuotaCodeForInstanceType(self, instance_type, market_type_option):
m = self.instance_key_re.match(instance_type) m = self.instance_key_re.match(instance_type)
if m: if m:
key = m.group(1) key = m.group(1)
return QUOTA_CODES.get(key) return QUOTA_CODES.get(key)[market_type_option]
def _getQuotaForInstanceType(self, instance_type): def _getQuotaForInstanceType(self, instance_type, market_type_option):
itype = self._getInstanceType(instance_type) itype = self._getInstanceType(instance_type)
cores = itype['InstanceTypes'][0]['VCpuInfo']['DefaultCores'] cores = itype['InstanceTypes'][0]['VCpuInfo']['DefaultCores']
vcpus = itype['InstanceTypes'][0]['VCpuInfo']['DefaultVCpus'] vcpus = itype['InstanceTypes'][0]['VCpuInfo']['DefaultVCpus']
ram = itype['InstanceTypes'][0]['MemoryInfo']['SizeInMiB'] ram = itype['InstanceTypes'][0]['MemoryInfo']['SizeInMiB']
code = self._getQuotaCodeForInstanceType(instance_type) code = self._getQuotaCodeForInstanceType(instance_type,
market_type_option)
# We include cores to match the overall cores quota (which may # We include cores to match the overall cores quota (which may
# be set as a tenant resource limit), and include vCPUs for the # be set as a tenant resource limit), and include vCPUs for the
# specific AWS quota code which in for a specific instance # specific AWS quota code which in for a specific instance
@ -967,6 +984,16 @@ class AwsAdapter(statemachine.Adapter):
del mapping['Ebs']['Encrypted'] del mapping['Ebs']['Encrypted']
args['BlockDeviceMappings'] = [mapping] args['BlockDeviceMappings'] = [mapping]
# enable EC2 Spot
if label.use_spot:
args['InstanceMarketOptions'] = {
'MarketType': 'spot',
'SpotOptions': {
'SpotInstanceType': 'one-time',
'InstanceInterruptionBehavior': 'terminate'
}
}
with self.rate_limiter(log.debug, "Created instance"): with self.rate_limiter(log.debug, "Created instance"):
log.debug(f"Creating VM {hostname}") log.debug(f"Creating VM {hostname}")
instances = self.ec2.create_instances(**args) instances = self.ec2.create_instances(**args)

View File

@ -179,6 +179,7 @@ class AwsLabel(ConfigValue):
self.tags = label.get('tags', {}) self.tags = label.get('tags', {})
self.dynamic_tags = label.get('dynamic-tags', {}) self.dynamic_tags = label.get('dynamic-tags', {})
self.host_key_checking = self.pool.host_key_checking self.host_key_checking = self.pool.host_key_checking
self.use_spot = bool(label.get('use-spot', False))
@staticmethod @staticmethod
def getSchema(): def getSchema():
@ -200,6 +201,7 @@ class AwsLabel(ConfigValue):
}, },
'tags': dict, 'tags': dict,
'dynamic-tags': dict, 'dynamic-tags': dict,
'use-spot': bool,
} }

View File

@ -15,6 +15,8 @@ tenant-resource-limits:
labels: labels:
- name: standard - name: standard
- name: high - name: high
- name: spot
- name: on-demand
providers: providers:
- name: ec2-us-west-2 - name: ec2-us-west-2
@ -41,3 +43,12 @@ providers:
cloud-image: ubuntu1404 cloud-image: ubuntu1404
instance-type: u-6tb1.112xlarge instance-type: u-6tb1.112xlarge
key-name: zuul key-name: zuul
- name: spot
cloud-image: ubuntu1404
instance-type: m6i.32xlarge
key-name: zuul
use-spot: True
- name: on-demand
cloud-image: ubuntu1404
instance-type: m6i.32xlarge
key-name: zuul

View File

@ -0,0 +1,39 @@
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
tenant-resource-limits:
- tenant-name: tenant-1
max-cores: 1024
labels:
- name: ubuntu1404-spot
providers:
- name: ec2-us-west-2
driver: aws
region-name: us-west-2
cloud-images:
- name: ubuntu1404
image-id: ami-1e749f67
username: ubuntu
pools:
- name: main
max-servers: 10
subnet-id: {subnet_id}
security-group-id: {security_group_id}
node-attributes:
key1: value1
key2: value2
labels:
- name: ubuntu1404-spot
cloud-image: ubuntu1404
instance-type: t3.medium
key-name: zuul-spot
use-spot: True

View File

@ -244,6 +244,7 @@ class TestDriverAws(tests.DBTestCase):
@aws_quotas({ @aws_quotas({
'L-1216C47A': 2, 'L-1216C47A': 2,
'L-43DA4232': 448, 'L-43DA4232': 448,
'L-34B43A08': 2
}) })
def test_aws_multi_quota(self): def test_aws_multi_quota(self):
# Test multiple instance type quotas (standard and high-mem) # Test multiple instance type quotas (standard and high-mem)
@ -295,6 +296,59 @@ class TestDriverAws(tests.DBTestCase):
req3 = self.waitForNodeRequest(req3) req3 = self.waitForNodeRequest(req3)
self.assertSuccess(req3) self.assertSuccess(req3)
@aws_quotas({
'L-43DA4232': 448,
'L-1216C47A': 200,
'L-34B43A08': 200
})
def test_aws_multi_quota_spot(self):
# Test multiple instance type quotas (standard, high-mem and spot)
configfile = self.setup_config('aws/aws-quota.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)
pool.start()
# Create a spot node request which should succeed.
req1 = zk.NodeRequest()
req1.state = zk.REQUESTED
req1.node_types.append('spot')
self.zk.storeNodeRequest(req1)
self.log.debug("Waiting for request %s", req1.id)
req1 = self.waitForNodeRequest(req1)
node1 = self.assertSuccess(req1)
# Create an on-demand node request which should succeed.
req2 = zk.NodeRequest()
req2.state = zk.REQUESTED
req2.node_types.append('on-demand')
self.zk.storeNodeRequest(req2)
self.log.debug("Waiting for request %s", req2.id)
req2 = self.waitForNodeRequest(req2)
self.assertSuccess(req2)
# Create another spot node request which should be paused.
req3 = zk.NodeRequest()
req3.state = zk.REQUESTED
req3.node_types.append('spot')
self.zk.storeNodeRequest(req3)
self.log.debug("Waiting for request %s", req3.id)
req3 = self.waitForNodeRequest(req3, (zk.PENDING,))
# Make sure we're paused while we attempt to fulfill the
# third request.
pool_worker = pool.getPoolWorkers('ec2-us-west-2')
for _ in iterate_timeout(30, Exception, 'paused handler'):
if pool_worker[0].paused_handlers:
break
# Release the first spot node so that the third can be fulfilled.
node1.state = zk.USED
self.zk.storeNode(node1)
self.waitForNodeDeletion(node1)
# Make sure the fourth spot node exists now.
req3 = self.waitForNodeRequest(req3)
self.assertSuccess(req3)
@aws_quotas({ @aws_quotas({
'L-1216C47A': 1000, 'L-1216C47A': 1000,
'L-43DA4232': 1000, 'L-43DA4232': 1000,
@ -916,3 +970,12 @@ class TestDriverAws(tests.DBTestCase):
except botocore.exceptions.ClientError: except botocore.exceptions.ClientError:
# Probably not found # Probably not found
break break
def test_aws_provisioning_spot_instances(self):
# Test creating a spot instance instead of an on-demand one.
req = self.requestNode('aws/aws-spot.yaml', 'ubuntu1404-spot')
node = self.assertSuccess(req)
instance = self.ec2.Instance(node.external_id)
self.assertEqual(instance.instance_lifecycle, 'spot')
# moto doesn't provide the spot_instance_request_id
# self.assertIsNotNone(instance.spot_instance_request_id)

View File

@ -0,0 +1,6 @@
---
features:
- |
The AWS driver now supports launching Amazon EC2 Spot instances
(https://aws.amazon.com/ec2/spot/) when
:attr:`providers.[aws].pools.labels.use-spot` is set to True.