diff --git a/ironic_python_agent/extensions/standby.py b/ironic_python_agent/extensions/standby.py index dd7aaa889..909aded2c 100644 --- a/ironic_python_agent/extensions/standby.py +++ b/ironic_python_agent/extensions/standby.py @@ -408,16 +408,30 @@ def _download_image(image_info): """ starttime = time.time() image_location = _image_location(image_info) - image_download = ImageDownload(image_info, time_obj=starttime) - - with open(image_location, 'wb') as f: + for attempt in range(CONF.image_download_connection_retries + 1): try: - for chunk in image_download: - f.write(chunk) - except Exception as e: - msg = 'Unable to write image to {}. Error: {}'.format( - image_location, str(e)) - raise errors.ImageDownloadError(image_info['id'], msg) + image_download = ImageDownload(image_info, time_obj=starttime) + + with open(image_location, 'wb') as f: + try: + for chunk in image_download: + f.write(chunk) + except Exception as e: + msg = 'Unable to write image to {}. Error: {}'.format( + image_location, str(e)) + raise errors.ImageDownloadError(image_info['id'], msg) + except errors.ImageDownloadError as e: + if attempt == CONF.image_download_connection_retries: + raise + else: + LOG.warning('Image download failed, %(attempt)s of %(total)s: ' + '%(error)s', + {'attempt': attempt, + 'total': CONF.image_download_connection_retries, + 'error': e}) + time.sleep(CONF.image_download_connection_retry_interval) + else: + break totaltime = time.time() - starttime LOG.info("Image downloaded from {} in {} seconds".format(image_location, @@ -553,16 +567,31 @@ class StandbyExtension(base.BaseAgentExtension): match the checksum as reported by glance in image_info. """ starttime = time.time() - image_download = ImageDownload(image_info, time_obj=starttime) - - with open(device, 'wb+') as f: + total_retries = CONF.image_download_connection_retries + for attempt in range(total_retries + 1): try: - for chunk in image_download: - f.write(chunk) - except Exception as e: - msg = 'Unable to write image to device {}. Error: {}'.format( - device, str(e)) - raise errors.ImageDownloadError(image_info['id'], msg) + image_download = ImageDownload(image_info, time_obj=starttime) + + with open(device, 'wb+') as f: + try: + for chunk in image_download: + f.write(chunk) + except Exception as e: + msg = ('Unable to write image to device {}. ' + 'Error: {}').format(device, str(e)) + raise errors.ImageDownloadError(image_info['id'], msg) + except errors.ImageDownloadError as e: + if attempt == CONF.image_download_connection_retries: + raise + else: + LOG.warning('Image download failed, %(attempt)s of ' + '%(total)s: %(error)s', + {'attempt': attempt, + 'total': total_retries, + 'error': e}) + time.sleep(CONF.image_download_connection_retry_interval) + else: + break totaltime = time.time() - starttime LOG.info("Image streamed onto device {} in {} " diff --git a/ironic_python_agent/tests/unit/extensions/test_standby.py b/ironic_python_agent/tests/unit/extensions/test_standby.py index eaff2b272..fe212a9a2 100644 --- a/ironic_python_agent/tests/unit/extensions/test_standby.py +++ b/ironic_python_agent/tests/unit/extensions/test_standby.py @@ -1126,6 +1126,8 @@ class TestStandbyExtension(base.IronicAgentTest): @mock.patch('requests.get', autospec=True) def test_stream_raw_image_onto_device_write_error(self, requests_mock, open_mock, md5_mock): + self.config(image_download_connection_timeout=1) + self.config(image_download_connection_retry_interval=0) image_info = _build_fake_image_info() response = requests_mock.return_value response.status_code = 200 @@ -1139,12 +1141,20 @@ class TestStandbyExtension(base.IronicAgentTest): self.assertRaises(errors.ImageDownloadError, self.agent_extension._stream_raw_image_onto_device, image_info, '/dev/foo') - requests_mock.assert_called_once_with(image_info['urls'][0], - cert=None, verify=True, - stream=True, proxies={}, - timeout=60) - # Assert write was only called once and failed! - file_mock.write.assert_called_once_with('some') + calls = [mock.call('http://example.org', cert=None, proxies={}, + stream=True, timeout=1, verify=True), + mock.call().iter_content(mock.ANY), + mock.call('http://example.org', cert=None, proxies={}, + stream=True, timeout=1, verify=True), + mock.call().iter_content(mock.ANY), + mock.call('http://example.org', cert=None, proxies={}, + stream=True, timeout=1, verify=True), + mock.call().iter_content(mock.ANY)] + requests_mock.assert_has_calls(calls) + write_calls = [mock.call('some'), + mock.call('some'), + mock.call('some')] + file_mock.write.assert_has_calls(write_calls) @mock.patch('ironic_lib.disk_utils.get_disk_identifier', lambda dev: 'ROOT') @@ -1191,6 +1201,7 @@ class TestStandbyExtension(base.IronicAgentTest): return self self.config(image_download_connection_timeout=1) + self.config(image_download_connection_retry_interval=0) image_info = _build_fake_image_info() file_mock = mock.Mock() open_mock.return_value.__enter__.return_value = file_mock @@ -1203,11 +1214,19 @@ class TestStandbyExtension(base.IronicAgentTest): self.agent_extension._stream_raw_image_onto_device, image_info, '/dev/foo') - requests_mock.assert_called_once_with(image_info['urls'][0], - cert=None, verify=True, - stream=True, proxies={}, - timeout=1) - file_mock.write.assert_called_once_with('meow') + + calls = [mock.call(image_info['urls'][0], cert=None, verify=True, + stream=True, proxies={}, timeout=1), + mock.call(image_info['urls'][0], cert=None, verify=True, + stream=True, proxies={}, timeout=1), + mock.call(image_info['urls'][0], cert=None, verify=True, + stream=True, proxies={}, timeout=1)] + requests_mock.assert_has_calls(calls) + + write_calls = [mock.call('meow'), + mock.call('meow'), + mock.call('meow')] + file_mock.write.assert_has_calls(write_calls) fix_gpt_mock.assert_not_called() def test__message_format_partition_bios(self): diff --git a/releasenotes/notes/agent-fully-retries-image-downloads-67409a493c6d08ae.yaml b/releasenotes/notes/agent-fully-retries-image-downloads-67409a493c6d08ae.yaml new file mode 100644 index 000000000..660ec4c81 --- /dev/null +++ b/releasenotes/notes/agent-fully-retries-image-downloads-67409a493c6d08ae.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixes deployment failures when the image download is interrupted + mid-stream while the contents are being downloaded. Previously retries + were limited to only opening the initial connection.