Retry on checksum failures

HTTP is a fun protocol.

Size is basically optional. And clients implicitly trust the server
and socket has transferred all the bytes. Which *really* means you
should always checksum.

But... previously we didn't checksum as part of retrying.

So if anything happened with python-requests, or lower level
library code or the system itself causing bytes to be lost off the
buffer, creating an incomplete transfer situation, then we wouldn't
know until the checksum.

So now, we checksum and re-trigger the download if there is a
failure of the checksum.

This involved a minor shift in the download logic, and resulted in
a needful minor fix to an image checksum test as it would loop for
90 seconds as well.

Closes-Bug: 2038934
Change-Id: I543a60555a2621b49dd7b6564bd0654a46db2e9a
This commit is contained in:
Julia Kreger 2023-10-10 08:56:05 -07:00
parent 89be7bd420
commit cb61a8d6c0
3 changed files with 33 additions and 15 deletions

View File

@ -554,7 +554,9 @@ def _download_image(image_info):
msg = 'Unable to write image to {}. Error: {}'.format(
image_location, str(e))
raise errors.ImageDownloadError(image_info['id'], msg)
except errors.ImageDownloadError as e:
image_download.verify_image(image_location)
except (errors.ImageDownloadError,
errors.ImageChecksumError) as e:
if attempt == CONF.image_download_connection_retries:
raise
else:
@ -575,7 +577,6 @@ def _download_image(image_info):
'totaltime': totaltime,
'size': image_download.bytes_transferred,
'reported': image_download.content_length})
image_download.verify_image(image_location)
def _validate_image_info(ext, image_info=None, **kwargs):
@ -729,7 +730,12 @@ class StandbyExtension(base.BaseAgentExtension):
msg = ('Unable to write image to device {}. '
'Error: {}').format(device, str(e))
raise errors.ImageDownloadError(image_info['id'], msg)
except errors.ImageDownloadError as e:
# Verify the checksum of the streamed image is correct while
# still in the retry loop, so we can retry should a checksum
# failure be detected.
image_download.verify_image(device)
except (errors.ImageDownloadError,
errors.ImageChecksumError) as e:
if attempt == CONF.image_download_connection_retries:
raise
else:
@ -749,8 +755,6 @@ class StandbyExtension(base.BaseAgentExtension):
{'device': device, 'totaltime': totaltime,
'size': image_download.bytes_transferred,
'reported': image_download.content_length})
# Verify if the checksum of the streamed image is correct
image_download.verify_image(device)
# Fix any gpt partition
try:
disk_utils.fix_gpt_partition(device, node_uuid=None)

View File

@ -441,6 +441,11 @@ class TestStandbyExtension(base.IronicAgentTest):
log_mock_calls = [
mock.call.info('Attempting to download image from %s',
'http://example.org'),
mock.call.debug('Verifying image at %(image_location)s against '
'%(algo_name)s checksum %(checksum)s',
{'image_location': mock.ANY,
'algo_name': mock.ANY,
'checksum': 'fake-checksum'}),
mock.call.info('Image downloaded from %(image_location)s in '
'%(totaltime)s seconds. Transferred %(size)s '
'bytes. Server originaly reported: %(reported)s.',
@ -448,11 +453,6 @@ class TestStandbyExtension(base.IronicAgentTest):
'totaltime': mock.ANY,
'size': 11,
'reported': None}),
mock.call.debug('Verifying image at %(image_location)s against '
'%(algo_name)s checksum %(checksum)s',
{'image_location': mock.ANY,
'algo_name': mock.ANY,
'checksum': 'fake-checksum'})
]
log_mock.assert_has_calls(log_mock_calls)
@ -509,6 +509,9 @@ class TestStandbyExtension(base.IronicAgentTest):
@mock.patch('requests.get', autospec=True)
def test_download_image_verify_fails(self, requests_mock, open_mock,
hash_mock):
# Set the config to 0 retries, so we don't retry in this case
# and cause the test download to loop multiple times.
self.config(image_download_connection_retries=0)
image_info = _build_fake_image_info()
response = requests_mock.return_value
response.status_code = 200
@ -1334,6 +1337,11 @@ class TestStandbyExtension(base.IronicAgentTest):
mock_log_calls = [
mock.call.info('Attempting to download image from %s',
'http://example.org'),
mock.call.debug('Verifying image at %(image_location)s '
'against %(algo_name)s checksum %(checksum)s',
{'image_location': '/dev/foo',
'algo_name': mock.ANY,
'checksum': 'fake-checksum'}),
mock.call.info('Image streamed onto device %(device)s in '
'%(totaltime)s seconds for %(size)s bytes. '
'Server originaly reported %(reported)s.',
@ -1341,11 +1349,6 @@ class TestStandbyExtension(base.IronicAgentTest):
'totaltime': mock.ANY,
'size': 11,
'reported': 11}),
mock.call.debug('Verifying image at %(image_location)s '
'against %(algo_name)s checksum %(checksum)s',
{'image_location': '/dev/foo',
'algo_name': mock.ANY,
'checksum': 'fake-checksum'}),
mock.call.info('%(device)s UUID is now %(root_uuid)s',
{'device': '/dev/foo', 'root_uuid': 'aaaabbbb'})
]

View File

@ -0,0 +1,11 @@
---
fixes:
- |
Fixes a failure case where downloads would not be retried when the
checksum fails verification. the agent now includes the checksum
activity as part of the file download operation, and will
automatically retry downloads when the checksum fails in
accordance with the existing download retry logic.
This is largely in response to what appears to be intermittent
transport failures at lower levels which we cannot otherwise
detect.