Retry on checksum failures
HTTP is a fun protocol. Size is basically optional. And clients implicitly trust the server and socket has transferred all the bytes. Which *really* means you should always checksum. But... previously we didn't checksum as part of retrying. So if anything happened with python-requests, or lower level library code or the system itself causing bytes to be lost off the buffer, creating an incomplete transfer situation, then we wouldn't know until the checksum. So now, we checksum and re-trigger the download if there is a failure of the checksum. This involved a minor shift in the download logic, and resulted in a needful minor fix to an image checksum test as it would loop for 90 seconds as well. Closes-Bug: 2038934 Change-Id: I543a60555a2621b49dd7b6564bd0654a46db2e9a
This commit is contained in:
parent
89be7bd420
commit
cb61a8d6c0
@ -554,7 +554,9 @@ def _download_image(image_info):
|
||||
msg = 'Unable to write image to {}. Error: {}'.format(
|
||||
image_location, str(e))
|
||||
raise errors.ImageDownloadError(image_info['id'], msg)
|
||||
except errors.ImageDownloadError as e:
|
||||
image_download.verify_image(image_location)
|
||||
except (errors.ImageDownloadError,
|
||||
errors.ImageChecksumError) as e:
|
||||
if attempt == CONF.image_download_connection_retries:
|
||||
raise
|
||||
else:
|
||||
@ -575,7 +577,6 @@ def _download_image(image_info):
|
||||
'totaltime': totaltime,
|
||||
'size': image_download.bytes_transferred,
|
||||
'reported': image_download.content_length})
|
||||
image_download.verify_image(image_location)
|
||||
|
||||
|
||||
def _validate_image_info(ext, image_info=None, **kwargs):
|
||||
@ -729,7 +730,12 @@ class StandbyExtension(base.BaseAgentExtension):
|
||||
msg = ('Unable to write image to device {}. '
|
||||
'Error: {}').format(device, str(e))
|
||||
raise errors.ImageDownloadError(image_info['id'], msg)
|
||||
except errors.ImageDownloadError as e:
|
||||
# Verify the checksum of the streamed image is correct while
|
||||
# still in the retry loop, so we can retry should a checksum
|
||||
# failure be detected.
|
||||
image_download.verify_image(device)
|
||||
except (errors.ImageDownloadError,
|
||||
errors.ImageChecksumError) as e:
|
||||
if attempt == CONF.image_download_connection_retries:
|
||||
raise
|
||||
else:
|
||||
@ -749,8 +755,6 @@ class StandbyExtension(base.BaseAgentExtension):
|
||||
{'device': device, 'totaltime': totaltime,
|
||||
'size': image_download.bytes_transferred,
|
||||
'reported': image_download.content_length})
|
||||
# Verify if the checksum of the streamed image is correct
|
||||
image_download.verify_image(device)
|
||||
# Fix any gpt partition
|
||||
try:
|
||||
disk_utils.fix_gpt_partition(device, node_uuid=None)
|
||||
|
@ -441,6 +441,11 @@ class TestStandbyExtension(base.IronicAgentTest):
|
||||
log_mock_calls = [
|
||||
mock.call.info('Attempting to download image from %s',
|
||||
'http://example.org'),
|
||||
mock.call.debug('Verifying image at %(image_location)s against '
|
||||
'%(algo_name)s checksum %(checksum)s',
|
||||
{'image_location': mock.ANY,
|
||||
'algo_name': mock.ANY,
|
||||
'checksum': 'fake-checksum'}),
|
||||
mock.call.info('Image downloaded from %(image_location)s in '
|
||||
'%(totaltime)s seconds. Transferred %(size)s '
|
||||
'bytes. Server originaly reported: %(reported)s.',
|
||||
@ -448,11 +453,6 @@ class TestStandbyExtension(base.IronicAgentTest):
|
||||
'totaltime': mock.ANY,
|
||||
'size': 11,
|
||||
'reported': None}),
|
||||
mock.call.debug('Verifying image at %(image_location)s against '
|
||||
'%(algo_name)s checksum %(checksum)s',
|
||||
{'image_location': mock.ANY,
|
||||
'algo_name': mock.ANY,
|
||||
'checksum': 'fake-checksum'})
|
||||
]
|
||||
log_mock.assert_has_calls(log_mock_calls)
|
||||
|
||||
@ -509,6 +509,9 @@ class TestStandbyExtension(base.IronicAgentTest):
|
||||
@mock.patch('requests.get', autospec=True)
|
||||
def test_download_image_verify_fails(self, requests_mock, open_mock,
|
||||
hash_mock):
|
||||
# Set the config to 0 retries, so we don't retry in this case
|
||||
# and cause the test download to loop multiple times.
|
||||
self.config(image_download_connection_retries=0)
|
||||
image_info = _build_fake_image_info()
|
||||
response = requests_mock.return_value
|
||||
response.status_code = 200
|
||||
@ -1334,6 +1337,11 @@ class TestStandbyExtension(base.IronicAgentTest):
|
||||
mock_log_calls = [
|
||||
mock.call.info('Attempting to download image from %s',
|
||||
'http://example.org'),
|
||||
mock.call.debug('Verifying image at %(image_location)s '
|
||||
'against %(algo_name)s checksum %(checksum)s',
|
||||
{'image_location': '/dev/foo',
|
||||
'algo_name': mock.ANY,
|
||||
'checksum': 'fake-checksum'}),
|
||||
mock.call.info('Image streamed onto device %(device)s in '
|
||||
'%(totaltime)s seconds for %(size)s bytes. '
|
||||
'Server originaly reported %(reported)s.',
|
||||
@ -1341,11 +1349,6 @@ class TestStandbyExtension(base.IronicAgentTest):
|
||||
'totaltime': mock.ANY,
|
||||
'size': 11,
|
||||
'reported': 11}),
|
||||
mock.call.debug('Verifying image at %(image_location)s '
|
||||
'against %(algo_name)s checksum %(checksum)s',
|
||||
{'image_location': '/dev/foo',
|
||||
'algo_name': mock.ANY,
|
||||
'checksum': 'fake-checksum'}),
|
||||
mock.call.info('%(device)s UUID is now %(root_uuid)s',
|
||||
{'device': '/dev/foo', 'root_uuid': 'aaaabbbb'})
|
||||
]
|
||||
|
@ -0,0 +1,11 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes a failure case where downloads would not be retried when the
|
||||
checksum fails verification. the agent now includes the checksum
|
||||
activity as part of the file download operation, and will
|
||||
automatically retry downloads when the checksum fails in
|
||||
accordance with the existing download retry logic.
|
||||
This is largely in response to what appears to be intermittent
|
||||
transport failures at lower levels which we cannot otherwise
|
||||
detect.
|
Loading…
Reference in New Issue
Block a user