From 0b31c4cb753fd5ea8f9f17f9e0612d6a3a668971 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Tue, 10 Oct 2023 08:56:05 -0700 Subject: [PATCH] Retry on checksum failures HTTP is a fun protocol. Size is basically optional. And clients implicitly trust the server and socket has transferred all the bytes. Which *really* means you should always checksum. But... previously we didn't checksum as part of retrying. So if anything happened with python-requests, or lower level library code or the system itself causing bytes to be lost off the buffer, creating an incomplete transfer situation, then we wouldn't know until the checksum. So now, we checksum and re-trigger the download if there is a failure of the checksum. This involved a minor shift in the download logic, and resulted in a needful minor fix to an image checksum test as it would loop for 90 seconds as well. Closes-Bug: 2038934 Change-Id: I543a60555a2621b49dd7b6564bd0654a46db2e9a (cherry picked from commit cb61a8d6c052f4401d482ca45b43bc53935a6b28) --- ironic_python_agent/extensions/standby.py | 14 +++++++---- .../tests/unit/extensions/test_standby.py | 23 +++++++++++-------- ...g-download-completed-91cca9fef34d8cf5.yaml | 11 +++++++++ 3 files changed, 33 insertions(+), 15 deletions(-) create mode 100644 releasenotes/notes/checksum-before-considering-download-completed-91cca9fef34d8cf5.yaml diff --git a/ironic_python_agent/extensions/standby.py b/ironic_python_agent/extensions/standby.py index 53b1903c8..90e0d1350 100644 --- a/ironic_python_agent/extensions/standby.py +++ b/ironic_python_agent/extensions/standby.py @@ -554,7 +554,9 @@ def _download_image(image_info): msg = 'Unable to write image to {}. Error: {}'.format( image_location, str(e)) raise errors.ImageDownloadError(image_info['id'], msg) - except errors.ImageDownloadError as e: + image_download.verify_image(image_location) + except (errors.ImageDownloadError, + errors.ImageChecksumError) as e: if attempt == CONF.image_download_connection_retries: raise else: @@ -575,7 +577,6 @@ def _download_image(image_info): 'totaltime': totaltime, 'size': image_download.bytes_transferred, 'reported': image_download.content_length}) - image_download.verify_image(image_location) def _validate_image_info(ext, image_info=None, **kwargs): @@ -729,7 +730,12 @@ class StandbyExtension(base.BaseAgentExtension): msg = ('Unable to write image to device {}. ' 'Error: {}').format(device, str(e)) raise errors.ImageDownloadError(image_info['id'], msg) - except errors.ImageDownloadError as e: + # Verify the checksum of the streamed image is correct while + # still in the retry loop, so we can retry should a checksum + # failure be detected. + image_download.verify_image(device) + except (errors.ImageDownloadError, + errors.ImageChecksumError) as e: if attempt == CONF.image_download_connection_retries: raise else: @@ -749,8 +755,6 @@ class StandbyExtension(base.BaseAgentExtension): {'device': device, 'totaltime': totaltime, 'size': image_download.bytes_transferred, 'reported': image_download.content_length}) - # Verify if the checksum of the streamed image is correct - image_download.verify_image(device) # Fix any gpt partition try: disk_utils.fix_gpt_partition(device, node_uuid=None) diff --git a/ironic_python_agent/tests/unit/extensions/test_standby.py b/ironic_python_agent/tests/unit/extensions/test_standby.py index c015c036b..e0284ea1d 100644 --- a/ironic_python_agent/tests/unit/extensions/test_standby.py +++ b/ironic_python_agent/tests/unit/extensions/test_standby.py @@ -441,6 +441,11 @@ class TestStandbyExtension(base.IronicAgentTest): log_mock_calls = [ mock.call.info('Attempting to download image from %s', 'http://example.org'), + mock.call.debug('Verifying image at %(image_location)s against ' + '%(algo_name)s checksum %(checksum)s', + {'image_location': mock.ANY, + 'algo_name': mock.ANY, + 'checksum': 'fake-checksum'}), mock.call.info('Image downloaded from %(image_location)s in ' '%(totaltime)s seconds. Transferred %(size)s ' 'bytes. Server originaly reported: %(reported)s.', @@ -448,11 +453,6 @@ class TestStandbyExtension(base.IronicAgentTest): 'totaltime': mock.ANY, 'size': 11, 'reported': None}), - mock.call.debug('Verifying image at %(image_location)s against ' - '%(algo_name)s checksum %(checksum)s', - {'image_location': mock.ANY, - 'algo_name': mock.ANY, - 'checksum': 'fake-checksum'}) ] log_mock.assert_has_calls(log_mock_calls) @@ -509,6 +509,9 @@ class TestStandbyExtension(base.IronicAgentTest): @mock.patch('requests.get', autospec=True) def test_download_image_verify_fails(self, requests_mock, open_mock, hash_mock): + # Set the config to 0 retries, so we don't retry in this case + # and cause the test download to loop multiple times. + self.config(image_download_connection_retries=0) image_info = _build_fake_image_info() response = requests_mock.return_value response.status_code = 200 @@ -1334,6 +1337,11 @@ class TestStandbyExtension(base.IronicAgentTest): mock_log_calls = [ mock.call.info('Attempting to download image from %s', 'http://example.org'), + mock.call.debug('Verifying image at %(image_location)s ' + 'against %(algo_name)s checksum %(checksum)s', + {'image_location': '/dev/foo', + 'algo_name': mock.ANY, + 'checksum': 'fake-checksum'}), mock.call.info('Image streamed onto device %(device)s in ' '%(totaltime)s seconds for %(size)s bytes. ' 'Server originaly reported %(reported)s.', @@ -1341,11 +1349,6 @@ class TestStandbyExtension(base.IronicAgentTest): 'totaltime': mock.ANY, 'size': 11, 'reported': 11}), - mock.call.debug('Verifying image at %(image_location)s ' - 'against %(algo_name)s checksum %(checksum)s', - {'image_location': '/dev/foo', - 'algo_name': mock.ANY, - 'checksum': 'fake-checksum'}), mock.call.info('%(device)s UUID is now %(root_uuid)s', {'device': '/dev/foo', 'root_uuid': 'aaaabbbb'}) ] diff --git a/releasenotes/notes/checksum-before-considering-download-completed-91cca9fef34d8cf5.yaml b/releasenotes/notes/checksum-before-considering-download-completed-91cca9fef34d8cf5.yaml new file mode 100644 index 000000000..07cf237f5 --- /dev/null +++ b/releasenotes/notes/checksum-before-considering-download-completed-91cca9fef34d8cf5.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Fixes a failure case where downloads would not be retried when the + checksum fails verification. the agent now includes the checksum + activity as part of the file download operation, and will + automatically retry downloads when the checksum fails in + accordance with the existing download retry logic. + This is largely in response to what appears to be intermittent + transport failures at lower levels which we cannot otherwise + detect.