Merge "AWS: Add support for retrying image imports"
commit 785f7dcbc9
@@ -186,6 +186,17 @@ Selecting the ``aws`` driver adds the following options to the
       ``ova``, ``vhd``, ``vhdx``, ``vmdk``, ``raw`` (not all of which
       are supported by diskimage-builder).
 
+   .. attr:: image-import-timeout
+      :type: int
+
+      Generally there is no limit on the amount of time a successful
+      image import can take. However, some import tasks may encounter
+      temporary resource limitations from AWS. In these cases, if
+      this value is set, Nodepool will retry the import tasks until
+      the timeout is reached. If this is unset (the default), then
+      the first resource limitation detected will result in an error.
+      The value is in seconds.
+
    .. attr:: cloud-images
       :type: list
 
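For illustration, a provider configured to retry throttled import tasks for up to 30 minutes might look like the following sketch; the provider and image names are placeholders, not taken from this change::

    providers:
      - name: ec2-us-west-2            # placeholder name
        driver: aws
        region-name: us-west-2
        object-storage:
          bucket-name: nodepool
        # Retry imports that hit AWS resource limits for up to 1800 seconds.
        image-import-timeout: 1800
        diskimages:
          - name: example-image        # placeholder name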
@@ -478,22 +478,37 @@ class AwsAdapter(statemachine.Adapter):
                              bucket_name, object_filename):
         # Import snapshot
         self.log.debug(f"Importing {image_name} as snapshot")
-        with self.rate_limiter:
-            import_snapshot_task = self.ec2_client.import_snapshot(
-                DiskContainer={
-                    'Format': image_format,
-                    'UserBucket': {
-                        'S3Bucket': bucket_name,
-                        'S3Key': object_filename,
-                    },
-                },
-                TagSpecifications=[
-                    {
-                        'ResourceType': 'import-snapshot-task',
-                        'Tags': tag_dict_to_list(metadata),
-                    },
-                ]
-            )
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
+                with self.rate_limiter:
+                    import_snapshot_task = self.ec2_client.import_snapshot(
+                        DiskContainer={
+                            'Format': image_format,
+                            'UserBucket': {
+                                'S3Bucket': bucket_name,
+                                'S3Key': object_filename,
+                            },
+                        },
+                        TagSpecifications=[
+                            {
+                                'ResourceType': 'import-snapshot-task',
+                                'Tags': tag_dict_to_list(metadata),
+                            },
+                        ]
+                    )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                        'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
         task_id = import_snapshot_task['ImportTaskId']
 
         paginator = self.ec2_client.get_paginator(
@@ -571,23 +586,38 @@ class AwsAdapter(statemachine.Adapter):
                              bucket_name, object_filename):
         # Import image as AMI
         self.log.debug(f"Importing {image_name} as AMI")
-        with self.rate_limiter:
-            import_image_task = self.ec2_client.import_image(
-                Architecture=provider_image.architecture,
-                DiskContainers=[{
-                    'Format': image_format,
-                    'UserBucket': {
-                        'S3Bucket': bucket_name,
-                        'S3Key': object_filename,
-                    },
-                }],
-                TagSpecifications=[
-                    {
-                        'ResourceType': 'import-image-task',
-                        'Tags': tag_dict_to_list(metadata),
-                    },
-                ]
-            )
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
+                with self.rate_limiter:
+                    import_image_task = self.ec2_client.import_image(
+                        Architecture=provider_image.architecture,
+                        DiskContainers=[{
+                            'Format': image_format,
+                            'UserBucket': {
+                                'S3Bucket': bucket_name,
+                                'S3Key': object_filename,
+                            },
+                        }],
+                        TagSpecifications=[
+                            {
+                                'ResourceType': 'import-image-task',
+                                'Tags': tag_dict_to_list(metadata),
+                            },
+                        ]
+                    )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                        'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
         task_id = import_image_task['ImportTaskId']
 
         paginator = self.ec2_client.get_paginator(
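Both import paths above repeat the same retry-until-deadline pattern. As a rough sketch of that pattern in isolation (the helper name ``_retry_import`` and its arguments are hypothetical and not part of this change)::

    import time

    import botocore.exceptions

    def _retry_import(call, timeout_seconds, sleep_seconds, log):
        # Run `call` (a zero-argument callable wrapping an EC2 import request),
        # retrying only on ResourceCountLimitExceeded and only until the
        # optional deadline expires; any other error, or a missing timeout,
        # is raised immediately.
        deadline = time.time()
        if timeout_seconds:
            deadline += timeout_seconds
        while True:
            try:
                return call()
            except botocore.exceptions.ClientError as error:
                code = error.response['Error']['Code']
                if (code == 'ResourceCountLimitExceeded'
                        and time.time() < deadline):
                    log.warning("AWS error: '%s' will retry", str(error))
                    time.sleep(sleep_seconds)
                    continue
                raise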
@@ -298,6 +298,8 @@ class AwsProviderConfig(ProviderConfig):
         self.object_storage = self.provider.get('object-storage')
         self.image_type = self.provider.get('image-format', 'raw')
         self.image_name_format = '{image_name}-{timestamp}'
+        self.image_import_timeout = self.provider.get(
+            'image-import-timeout', None)
         self.post_upload_hook = self.provider.get('post-upload-hook')
         self.max_servers = self.provider.get('max-servers', math.inf)
         self.max_cores = self.provider.get('max-cores', math.inf)
@@ -347,6 +349,7 @@ class AwsProviderConfig(ProviderConfig):
             'launch-retries': int,
             'object-storage': object_storage,
             'image-format': v.Any('ova', 'vhd', 'vhdx', 'vmdk', 'raw'),
+            'image-import-timeout': int,
             'max-servers': int,
             'max-cores': int,
             'max-ram': int,
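The schema change above only constrains the option's type. A minimal stand-alone sketch of how such a voluptuous entry behaves (this toy schema is illustrative, not Nodepool's own)::

    import voluptuous as v

    schema = v.Schema({'image-import-timeout': int}, extra=v.ALLOW_EXTRA)

    schema({'image-import-timeout': 1800})  # accepted: value is an int
    schema({})                              # accepted: the key is optional
    try:
        schema({'image-import-timeout': '30m'})
    except v.Invalid as error:
        print(error)                        # rejected: expected int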
@@ -27,6 +27,7 @@ providers:
     region-name: us-west-2
     object-storage:
       bucket-name: nodepool
+    image-import-timeout: 60
     diskimages:
       - name: fake-image
         tags:
nodepool/tests/fixtures/aws/diskimage.yaml
@@ -27,6 +27,7 @@ providers:
     region-name: us-west-2
     object-storage:
       bucket-name: nodepool
+    image-import-timeout: 60
     diskimages:
      - name: fake-image
         tags:
@@ -16,6 +16,7 @@
 import logging
 import uuid
 
+import botocore
 import boto3
 
 
@@ -136,8 +137,14 @@ class FakeAws:
         self.tasks = {}
         self.ec2 = boto3.resource('ec2', region_name='us-west-2')
         self.ec2_client = boto3.client('ec2', region_name='us-west-2')
+        self.fail_import_count = 0
 
     def import_snapshot(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportSnapshot')
         task_id = uuid.uuid4().hex
         task = make_import_snapshot_stage_1(
             task_id,
@@ -162,6 +169,11 @@ class FakeAws:
         return snap_id
 
     def import_image(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportImage')
         task_id = uuid.uuid4().hex
         task = make_import_image_stage_1(
             task_id,
@@ -710,6 +710,7 @@ class TestDriverAws(tests.DBTestCase):
         self.assertTrue(response['EbsOptimized']['Value'])
 
     def test_aws_diskimage_snapshot(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage.yaml')
 
         self.useBuilder(configfile)
@@ -753,6 +754,7 @@
             ['Throughput'], 200)
 
     def test_aws_diskimage_image(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage-import-image.yaml')
 
         self.useBuilder(configfile)
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    The AWS driver now supports an
+    :attr:`providers.[aws].image-import-timeout` option to control
+    automatic retries and timeouts when AWS import task resource
+    limits are reached.