Add full download retries

Instead of just trying to get the connection and handler
for the download, lets try to retry the whole action of
of downloading.

Change-Id: I9217792d32e6f33c70f146a9b7d3ef58c5644d8a
(cherry picked from commit 159ab9f0ce)
This commit is contained in:
Julia Kreger 2020-06-16 16:45:43 -07:00
parent 33c96d0066
commit 1b15271a7d
3 changed files with 83 additions and 29 deletions

View File

@ -408,16 +408,30 @@ def _download_image(image_info):
"""
starttime = time.time()
image_location = _image_location(image_info)
image_download = ImageDownload(image_info, time_obj=starttime)
with open(image_location, 'wb') as f:
for attempt in range(CONF.image_download_connection_retries + 1):
try:
for chunk in image_download:
f.write(chunk)
except Exception as e:
msg = 'Unable to write image to {}. Error: {}'.format(
image_location, str(e))
raise errors.ImageDownloadError(image_info['id'], msg)
image_download = ImageDownload(image_info, time_obj=starttime)
with open(image_location, 'wb') as f:
try:
for chunk in image_download:
f.write(chunk)
except Exception as e:
msg = 'Unable to write image to {}. Error: {}'.format(
image_location, str(e))
raise errors.ImageDownloadError(image_info['id'], msg)
except errors.ImageDownloadError as e:
if attempt == CONF.image_download_connection_retries:
raise
else:
LOG.warning('Image download failed, %(attempt)s of %(total)s: '
'%(error)s',
{'attempt': attempt,
'total': CONF.image_download_connection_retries,
'error': e})
time.sleep(CONF.image_download_connection_retry_interval)
else:
break
totaltime = time.time() - starttime
LOG.info("Image downloaded from {} in {} seconds".format(image_location,
@ -553,16 +567,31 @@ class StandbyExtension(base.BaseAgentExtension):
match the checksum as reported by glance in image_info.
"""
starttime = time.time()
image_download = ImageDownload(image_info, time_obj=starttime)
with open(device, 'wb+') as f:
total_retries = CONF.image_download_connection_retries
for attempt in range(total_retries + 1):
try:
for chunk in image_download:
f.write(chunk)
except Exception as e:
msg = 'Unable to write image to device {}. Error: {}'.format(
device, str(e))
raise errors.ImageDownloadError(image_info['id'], msg)
image_download = ImageDownload(image_info, time_obj=starttime)
with open(device, 'wb+') as f:
try:
for chunk in image_download:
f.write(chunk)
except Exception as e:
msg = ('Unable to write image to device {}. '
'Error: {}').format(device, str(e))
raise errors.ImageDownloadError(image_info['id'], msg)
except errors.ImageDownloadError as e:
if attempt == CONF.image_download_connection_retries:
raise
else:
LOG.warning('Image download failed, %(attempt)s of '
'%(total)s: %(error)s',
{'attempt': attempt,
'total': total_retries,
'error': e})
time.sleep(CONF.image_download_connection_retry_interval)
else:
break
totaltime = time.time() - starttime
LOG.info("Image streamed onto device {} in {} "

View File

@ -1126,6 +1126,8 @@ class TestStandbyExtension(base.IronicAgentTest):
@mock.patch('requests.get', autospec=True)
def test_stream_raw_image_onto_device_write_error(self, requests_mock,
open_mock, md5_mock):
self.config(image_download_connection_timeout=1)
self.config(image_download_connection_retry_interval=0)
image_info = _build_fake_image_info()
response = requests_mock.return_value
response.status_code = 200
@ -1139,12 +1141,20 @@ class TestStandbyExtension(base.IronicAgentTest):
self.assertRaises(errors.ImageDownloadError,
self.agent_extension._stream_raw_image_onto_device,
image_info, '/dev/foo')
requests_mock.assert_called_once_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies={},
timeout=60)
# Assert write was only called once and failed!
file_mock.write.assert_called_once_with('some')
calls = [mock.call('http://example.org', cert=None, proxies={},
stream=True, timeout=1, verify=True),
mock.call().iter_content(mock.ANY),
mock.call('http://example.org', cert=None, proxies={},
stream=True, timeout=1, verify=True),
mock.call().iter_content(mock.ANY),
mock.call('http://example.org', cert=None, proxies={},
stream=True, timeout=1, verify=True),
mock.call().iter_content(mock.ANY)]
requests_mock.assert_has_calls(calls)
write_calls = [mock.call('some'),
mock.call('some'),
mock.call('some')]
file_mock.write.assert_has_calls(write_calls)
@mock.patch('ironic_lib.disk_utils.get_disk_identifier',
lambda dev: 'ROOT')
@ -1191,6 +1201,7 @@ class TestStandbyExtension(base.IronicAgentTest):
return self
self.config(image_download_connection_timeout=1)
self.config(image_download_connection_retry_interval=0)
image_info = _build_fake_image_info()
file_mock = mock.Mock()
open_mock.return_value.__enter__.return_value = file_mock
@ -1203,11 +1214,19 @@ class TestStandbyExtension(base.IronicAgentTest):
self.agent_extension._stream_raw_image_onto_device,
image_info,
'/dev/foo')
requests_mock.assert_called_once_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies={},
timeout=1)
file_mock.write.assert_called_once_with('meow')
calls = [mock.call(image_info['urls'][0], cert=None, verify=True,
stream=True, proxies={}, timeout=1),
mock.call(image_info['urls'][0], cert=None, verify=True,
stream=True, proxies={}, timeout=1),
mock.call(image_info['urls'][0], cert=None, verify=True,
stream=True, proxies={}, timeout=1)]
requests_mock.assert_has_calls(calls)
write_calls = [mock.call('meow'),
mock.call('meow'),
mock.call('meow')]
file_mock.write.assert_has_calls(write_calls)
fix_gpt_mock.assert_not_called()
def test__message_format_partition_bios(self):

View File

@ -0,0 +1,6 @@
---
fixes:
- |
Fixes deployment failures when the image download is interrupted
mid-stream while the contents are being downloaded. Previously retries
were limited to only opening the initial connection.