Retry on checksum failures

HTTP is a fun protocol.

Size is basically optional. And clients implicitly trust the server
and socket has transferred all the bytes. Which *really* means you
should always checksum.

But... previously we didn't checksum as part of retrying.

So if anything happened with python-requests, or lower level
library code or the system itself causing bytes to be lost off the
buffer, creating an incomplete transfer situation, then we wouldn't
know until the checksum.

So now, we checksum and re-trigger the download if there is a
failure of the checksum.

This involved a minor shift in the download logic, and resulted in
a needful minor fix to an image checksum test as it would loop for
90 seconds as well.

Closes-Bug: 2038934
Change-Id: I543a60555a2621b49dd7b6564bd0654a46db2e9a
(cherry picked from commit cb61a8d6c0)
This commit is contained in:
Julia Kreger 2023-10-10 08:56:05 -07:00
parent 30744024d4
commit 0b31c4cb75
3 changed files with 33 additions and 15 deletions

View File

@ -554,7 +554,9 @@ def _download_image(image_info):
msg = 'Unable to write image to {}. Error: {}'.format( msg = 'Unable to write image to {}. Error: {}'.format(
image_location, str(e)) image_location, str(e))
raise errors.ImageDownloadError(image_info['id'], msg) raise errors.ImageDownloadError(image_info['id'], msg)
except errors.ImageDownloadError as e: image_download.verify_image(image_location)
except (errors.ImageDownloadError,
errors.ImageChecksumError) as e:
if attempt == CONF.image_download_connection_retries: if attempt == CONF.image_download_connection_retries:
raise raise
else: else:
@ -575,7 +577,6 @@ def _download_image(image_info):
'totaltime': totaltime, 'totaltime': totaltime,
'size': image_download.bytes_transferred, 'size': image_download.bytes_transferred,
'reported': image_download.content_length}) 'reported': image_download.content_length})
image_download.verify_image(image_location)
def _validate_image_info(ext, image_info=None, **kwargs): def _validate_image_info(ext, image_info=None, **kwargs):
@ -729,7 +730,12 @@ class StandbyExtension(base.BaseAgentExtension):
msg = ('Unable to write image to device {}. ' msg = ('Unable to write image to device {}. '
'Error: {}').format(device, str(e)) 'Error: {}').format(device, str(e))
raise errors.ImageDownloadError(image_info['id'], msg) raise errors.ImageDownloadError(image_info['id'], msg)
except errors.ImageDownloadError as e: # Verify the checksum of the streamed image is correct while
# still in the retry loop, so we can retry should a checksum
# failure be detected.
image_download.verify_image(device)
except (errors.ImageDownloadError,
errors.ImageChecksumError) as e:
if attempt == CONF.image_download_connection_retries: if attempt == CONF.image_download_connection_retries:
raise raise
else: else:
@ -749,8 +755,6 @@ class StandbyExtension(base.BaseAgentExtension):
{'device': device, 'totaltime': totaltime, {'device': device, 'totaltime': totaltime,
'size': image_download.bytes_transferred, 'size': image_download.bytes_transferred,
'reported': image_download.content_length}) 'reported': image_download.content_length})
# Verify if the checksum of the streamed image is correct
image_download.verify_image(device)
# Fix any gpt partition # Fix any gpt partition
try: try:
disk_utils.fix_gpt_partition(device, node_uuid=None) disk_utils.fix_gpt_partition(device, node_uuid=None)

View File

@ -441,6 +441,11 @@ class TestStandbyExtension(base.IronicAgentTest):
log_mock_calls = [ log_mock_calls = [
mock.call.info('Attempting to download image from %s', mock.call.info('Attempting to download image from %s',
'http://example.org'), 'http://example.org'),
mock.call.debug('Verifying image at %(image_location)s against '
'%(algo_name)s checksum %(checksum)s',
{'image_location': mock.ANY,
'algo_name': mock.ANY,
'checksum': 'fake-checksum'}),
mock.call.info('Image downloaded from %(image_location)s in ' mock.call.info('Image downloaded from %(image_location)s in '
'%(totaltime)s seconds. Transferred %(size)s ' '%(totaltime)s seconds. Transferred %(size)s '
'bytes. Server originaly reported: %(reported)s.', 'bytes. Server originaly reported: %(reported)s.',
@ -448,11 +453,6 @@ class TestStandbyExtension(base.IronicAgentTest):
'totaltime': mock.ANY, 'totaltime': mock.ANY,
'size': 11, 'size': 11,
'reported': None}), 'reported': None}),
mock.call.debug('Verifying image at %(image_location)s against '
'%(algo_name)s checksum %(checksum)s',
{'image_location': mock.ANY,
'algo_name': mock.ANY,
'checksum': 'fake-checksum'})
] ]
log_mock.assert_has_calls(log_mock_calls) log_mock.assert_has_calls(log_mock_calls)
@ -509,6 +509,9 @@ class TestStandbyExtension(base.IronicAgentTest):
@mock.patch('requests.get', autospec=True) @mock.patch('requests.get', autospec=True)
def test_download_image_verify_fails(self, requests_mock, open_mock, def test_download_image_verify_fails(self, requests_mock, open_mock,
hash_mock): hash_mock):
# Set the config to 0 retries, so we don't retry in this case
# and cause the test download to loop multiple times.
self.config(image_download_connection_retries=0)
image_info = _build_fake_image_info() image_info = _build_fake_image_info()
response = requests_mock.return_value response = requests_mock.return_value
response.status_code = 200 response.status_code = 200
@ -1334,6 +1337,11 @@ class TestStandbyExtension(base.IronicAgentTest):
mock_log_calls = [ mock_log_calls = [
mock.call.info('Attempting to download image from %s', mock.call.info('Attempting to download image from %s',
'http://example.org'), 'http://example.org'),
mock.call.debug('Verifying image at %(image_location)s '
'against %(algo_name)s checksum %(checksum)s',
{'image_location': '/dev/foo',
'algo_name': mock.ANY,
'checksum': 'fake-checksum'}),
mock.call.info('Image streamed onto device %(device)s in ' mock.call.info('Image streamed onto device %(device)s in '
'%(totaltime)s seconds for %(size)s bytes. ' '%(totaltime)s seconds for %(size)s bytes. '
'Server originaly reported %(reported)s.', 'Server originaly reported %(reported)s.',
@ -1341,11 +1349,6 @@ class TestStandbyExtension(base.IronicAgentTest):
'totaltime': mock.ANY, 'totaltime': mock.ANY,
'size': 11, 'size': 11,
'reported': 11}), 'reported': 11}),
mock.call.debug('Verifying image at %(image_location)s '
'against %(algo_name)s checksum %(checksum)s',
{'image_location': '/dev/foo',
'algo_name': mock.ANY,
'checksum': 'fake-checksum'}),
mock.call.info('%(device)s UUID is now %(root_uuid)s', mock.call.info('%(device)s UUID is now %(root_uuid)s',
{'device': '/dev/foo', 'root_uuid': 'aaaabbbb'}) {'device': '/dev/foo', 'root_uuid': 'aaaabbbb'})
] ]

View File

@ -0,0 +1,11 @@
---
fixes:
- |
Fixes a failure case where downloads would not be retried when the
checksum fails verification. the agent now includes the checksum
activity as part of the file download operation, and will
automatically retry downloads when the checksum fails in
accordance with the existing download retry logic.
This is largely in response to what appears to be intermittent
transport failures at lower levels which we cannot otherwise
detect.