Add timeout and retries when connection to an image server

If the server is stuck for any reason, the download will hang for
a potentially long time. Provide a timeout (defaults to 60 seconds)
and 2 retries on failure.

Change-Id: Ie53519266edd914fdbfa82fe52b4a55151e5ec5f
This commit is contained in:
Dmitry Tantsur 2020-04-23 19:23:53 +02:00
parent 8a9df0b485
commit 8adb7e1a04
4 changed files with 118 additions and 18 deletions

View File

@ -239,6 +239,21 @@ cli_opts = [
'enforce token validation. The configuration provided '
'by the conductor MAY override this and force this '
'setting to be changed to True in memory.'),
cfg.IntOpt('image_download_connection_timeout', min=1,
default=APARAMS.get(
'ipa-image-download-connection-timeout', 60),
help='The connection timeout (in seconds) when downloading '
'an image. Does not affect the whole download.'),
cfg.IntOpt('image_download_connection_retries', min=0,
default=APARAMS.get('ipa-image-download-connection-retries', 2),
help='How many times to retry the connection when downloading '
'an image. Also retries on failure HTTP statuses.'),
cfg.IntOpt('image_download_connection_retry_interval', min=0,
default=APARAMS.get(
'ipa-image-download-connection-retry-interval', 5),
help='Interval (in seconds) between two attempts to establish '
'connection when downloading an image.'),
]
CONF.register_cli_opts(cli_opts)

View File

@ -70,12 +70,28 @@ def _download_with_proxy(image_info, url, image_id):
os.environ['no_proxy'] = no_proxy
proxies = image_info.get('proxies', {})
verify, cert = utils.get_ssl_client_options(CONF)
resp = requests.get(url, stream=True, proxies=proxies,
verify=verify, cert=cert)
if resp.status_code != 200:
msg = ('Received status code {} from {}, expected 200. Response '
'body: {}').format(resp.status_code, url, resp.text)
raise errors.ImageDownloadError(image_id, msg)
resp = None
for attempt in range(CONF.image_download_connection_retries + 1):
try:
resp = requests.get(url, stream=True, proxies=proxies,
verify=verify, cert=cert,
timeout=CONF.image_download_connection_timeout)
if resp.status_code != 200:
msg = ('Received status code {} from {}, expected 200. '
'Response body: {}').format(resp.status_code, url,
resp.text)
raise errors.ImageDownloadError(image_id, msg)
except (errors.ImageDownloadError, requests.RequestException) as e:
if (attempt == CONF.image_download_connection_retries
# NOTE(dtantsur): do not retry 4xx status codes
or (resp and resp.status_code < 500)):
raise
else:
LOG.warning('Unable to connect to %s, retrying. Error: %s',
url, e)
time.sleep(CONF.image_download_connection_retry_interval)
else:
break
return resp

View File

@ -18,6 +18,7 @@ from unittest import mock
from ironic_lib import exception
from oslo_concurrency import processutils
import requests
from ironic_python_agent import errors
from ironic_python_agent.extensions import standby
@ -369,7 +370,8 @@ class TestStandbyExtension(base.IronicAgentTest):
standby._download_image(image_info)
requests_mock.assert_called_once_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies={})
stream=True, proxies={},
timeout=60)
write = file_mock.write
write.assert_any_call('some')
write.assert_any_call('content')
@ -400,7 +402,8 @@ class TestStandbyExtension(base.IronicAgentTest):
self.assertEqual(no_proxy, os.environ['no_proxy'])
requests_mock.assert_called_once_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies=proxies)
stream=True, proxies=proxies,
timeout=60)
write = file_mock.write
write.assert_any_call('some')
write.assert_any_call('content')
@ -1111,7 +1114,8 @@ class TestStandbyExtension(base.IronicAgentTest):
'/dev/foo')
requests_mock.assert_called_once_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies={})
stream=True, proxies={},
timeout=60)
expected_calls = [mock.call('some'), mock.call('content')]
file_mock.write.assert_has_calls(expected_calls)
fix_gpt_mock.assert_called_once_with('/dev/foo', node_uuid=None)
@ -1136,7 +1140,8 @@ class TestStandbyExtension(base.IronicAgentTest):
image_info, '/dev/foo')
requests_mock.assert_called_once_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies={})
stream=True, proxies={},
timeout=60)
# Assert write was only called once and failed!
file_mock.write.assert_called_once_with('some')
@ -1254,11 +1259,13 @@ class TestImageDownload(base.IronicAgentTest):
self.assertEqual(content, list(image_download))
requests_mock.assert_called_once_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies={})
stream=True, proxies={},
timeout=60)
self.assertEqual(image_info['checksum'],
image_download._hash_algo.hexdigest())
def test_download_image_fail(self, requests_mock, time_mock):
@mock.patch('time.sleep', autospec=True)
def test_download_image_fail(self, sleep_mock, requests_mock, time_mock):
response = requests_mock.return_value
response.status_code = 401
response.text = 'Unauthorized'
@ -1272,7 +1279,56 @@ class TestImageDownload(base.IronicAgentTest):
standby.ImageDownload, image_info)
requests_mock.assert_called_once_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies={})
stream=True, proxies={},
timeout=60)
self.assertFalse(sleep_mock.called)
@mock.patch('time.sleep', autospec=True)
def test_download_image_retries(self, sleep_mock, requests_mock,
time_mock):
response = requests_mock.return_value
response.status_code = 500
response.text = 'Oops'
time_mock.return_value = 0.0
image_info = _build_fake_image_info()
msg = ('Error downloading image: Download of image fake_id failed: '
'URL: http://example.org; time: .* seconds. Error: '
'Received status code 500 from http://example.org, expected '
'200. Response body: Oops')
self.assertRaisesRegex(errors.ImageDownloadError, msg,
standby.ImageDownload, image_info)
requests_mock.assert_called_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies={},
timeout=60)
self.assertEqual(3, requests_mock.call_count)
sleep_mock.assert_called_with(5)
self.assertEqual(2, sleep_mock.call_count)
@mock.patch('time.sleep', autospec=True)
def test_download_image_retries_success(self, sleep_mock, requests_mock,
md5_mock):
content = ['SpongeBob', 'SquarePants']
fail_response = mock.Mock()
fail_response.status_code = 500
fail_response.text = " "
response = mock.Mock()
response.status_code = 200
response.iter_content.return_value = content
requests_mock.side_effect = [requests.Timeout, fail_response, response]
image_info = _build_fake_image_info()
md5_mock.return_value.hexdigest.return_value = image_info['checksum']
image_download = standby.ImageDownload(image_info)
self.assertEqual(content, list(image_download))
requests_mock.assert_called_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies={},
timeout=60)
self.assertEqual(3, requests_mock.call_count)
sleep_mock.assert_called_with(5)
self.assertEqual(2, sleep_mock.call_count)
def test_download_image_and_checksum(self, requests_mock, md5_mock):
content = ['SpongeBob', 'SquarePants']
@ -1293,9 +1349,9 @@ class TestImageDownload(base.IronicAgentTest):
self.assertEqual(content, list(image_download))
requests_mock.assert_has_calls([
mock.call('http://example.com/checksum', cert=None, verify=True,
stream=True, proxies={}),
stream=True, proxies={}, timeout=60),
mock.call(image_info['urls'][0], cert=None, verify=True,
stream=True, proxies={}),
stream=True, proxies={}, timeout=60),
])
self.assertEqual(fake_cs, image_download._hash_algo.hexdigest())
@ -1323,9 +1379,9 @@ foobar irrelevant file.img
self.assertEqual(content, list(image_download))
requests_mock.assert_has_calls([
mock.call('http://example.com/checksum', cert=None, verify=True,
stream=True, proxies={}),
stream=True, proxies={}, timeout=60),
mock.call(image_info['urls'][0], cert=None, verify=True,
stream=True, proxies={}),
stream=True, proxies={}, timeout=60),
])
self.assertEqual(fake_cs, image_download._hash_algo.hexdigest())
@ -1378,7 +1434,9 @@ foobar irrelevant file.img
response = mock.Mock()
response.status_code = 200
response.iter_content.return_value = content
requests_mock.side_effect = [cs_response, response]
# 3 retries on status code
requests_mock.side_effect = [cs_response, cs_response, cs_response,
response]
image_info = _build_fake_image_info(
'http://example.com/path/image.img')

View File

@ -0,0 +1,11 @@
---
fixes:
- |
Provides timeout and retries when establishing a connection to download
an image in the ``standby`` extension. Reduces probability of an image
download getting stuck in the event of network problems.
The default timeout is 60 seconds and can be set via the
``ipa-image-download-connection-timeout`` kernel parameter. The default
number of retries is 2 and can be set via the
``ipa-image-download-connection-retries`` parameter.