From 8adb7e1a045c667dd97456d06bd74d74a4da2787 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Thu, 23 Apr 2020 19:23:53 +0200 Subject: [PATCH] Add timeout and retries when connection to an image server If the server is stuck for any reason, the download will hang for a potentially long time. Provide a timeout (defaults to 60 seconds) and 2 retries on failure. Change-Id: Ie53519266edd914fdbfa82fe52b4a55151e5ec5f --- ironic_python_agent/config.py | 15 ++++ ironic_python_agent/extensions/standby.py | 28 +++++-- .../tests/unit/extensions/test_standby.py | 82 ++++++++++++++++--- ...age-download-retries-65ac31fe4328e438.yaml | 11 +++ 4 files changed, 118 insertions(+), 18 deletions(-) create mode 100644 releasenotes/notes/image-download-retries-65ac31fe4328e438.yaml diff --git a/ironic_python_agent/config.py b/ironic_python_agent/config.py index 5724a675e..c73e6fc96 100644 --- a/ironic_python_agent/config.py +++ b/ironic_python_agent/config.py @@ -239,6 +239,21 @@ cli_opts = [ 'enforce token validation. The configuration provided ' 'by the conductor MAY override this and force this ' 'setting to be changed to True in memory.'), + cfg.IntOpt('image_download_connection_timeout', min=1, + default=APARAMS.get( + 'ipa-image-download-connection-timeout', 60), + help='The connection timeout (in seconds) when downloading ' + 'an image. Does not affect the whole download.'), + cfg.IntOpt('image_download_connection_retries', min=0, + default=APARAMS.get('ipa-image-download-connection-retries', 2), + help='How many times to retry the connection when downloading ' + 'an image. Also retries on failure HTTP statuses.'), + cfg.IntOpt('image_download_connection_retry_interval', min=0, + default=APARAMS.get( + 'ipa-image-download-connection-retry-interval', 5), + help='Interval (in seconds) between two attempts to establish ' + 'connection when downloading an image.'), + ] CONF.register_cli_opts(cli_opts) diff --git a/ironic_python_agent/extensions/standby.py b/ironic_python_agent/extensions/standby.py index 289e1cc00..bd0ade60d 100644 --- a/ironic_python_agent/extensions/standby.py +++ b/ironic_python_agent/extensions/standby.py @@ -70,12 +70,28 @@ def _download_with_proxy(image_info, url, image_id): os.environ['no_proxy'] = no_proxy proxies = image_info.get('proxies', {}) verify, cert = utils.get_ssl_client_options(CONF) - resp = requests.get(url, stream=True, proxies=proxies, - verify=verify, cert=cert) - if resp.status_code != 200: - msg = ('Received status code {} from {}, expected 200. Response ' - 'body: {}').format(resp.status_code, url, resp.text) - raise errors.ImageDownloadError(image_id, msg) + resp = None + for attempt in range(CONF.image_download_connection_retries + 1): + try: + resp = requests.get(url, stream=True, proxies=proxies, + verify=verify, cert=cert, + timeout=CONF.image_download_connection_timeout) + if resp.status_code != 200: + msg = ('Received status code {} from {}, expected 200. ' + 'Response body: {}').format(resp.status_code, url, + resp.text) + raise errors.ImageDownloadError(image_id, msg) + except (errors.ImageDownloadError, requests.RequestException) as e: + if (attempt == CONF.image_download_connection_retries + # NOTE(dtantsur): do not retry 4xx status codes + or (resp and resp.status_code < 500)): + raise + else: + LOG.warning('Unable to connect to %s, retrying. Error: %s', + url, e) + time.sleep(CONF.image_download_connection_retry_interval) + else: + break return resp diff --git a/ironic_python_agent/tests/unit/extensions/test_standby.py b/ironic_python_agent/tests/unit/extensions/test_standby.py index d4030f092..b0cc081b1 100644 --- a/ironic_python_agent/tests/unit/extensions/test_standby.py +++ b/ironic_python_agent/tests/unit/extensions/test_standby.py @@ -18,6 +18,7 @@ from unittest import mock from ironic_lib import exception from oslo_concurrency import processutils +import requests from ironic_python_agent import errors from ironic_python_agent.extensions import standby @@ -369,7 +370,8 @@ class TestStandbyExtension(base.IronicAgentTest): standby._download_image(image_info) requests_mock.assert_called_once_with(image_info['urls'][0], cert=None, verify=True, - stream=True, proxies={}) + stream=True, proxies={}, + timeout=60) write = file_mock.write write.assert_any_call('some') write.assert_any_call('content') @@ -400,7 +402,8 @@ class TestStandbyExtension(base.IronicAgentTest): self.assertEqual(no_proxy, os.environ['no_proxy']) requests_mock.assert_called_once_with(image_info['urls'][0], cert=None, verify=True, - stream=True, proxies=proxies) + stream=True, proxies=proxies, + timeout=60) write = file_mock.write write.assert_any_call('some') write.assert_any_call('content') @@ -1111,7 +1114,8 @@ class TestStandbyExtension(base.IronicAgentTest): '/dev/foo') requests_mock.assert_called_once_with(image_info['urls'][0], cert=None, verify=True, - stream=True, proxies={}) + stream=True, proxies={}, + timeout=60) expected_calls = [mock.call('some'), mock.call('content')] file_mock.write.assert_has_calls(expected_calls) fix_gpt_mock.assert_called_once_with('/dev/foo', node_uuid=None) @@ -1136,7 +1140,8 @@ class TestStandbyExtension(base.IronicAgentTest): image_info, '/dev/foo') requests_mock.assert_called_once_with(image_info['urls'][0], cert=None, verify=True, - stream=True, proxies={}) + stream=True, proxies={}, + timeout=60) # Assert write was only called once and failed! file_mock.write.assert_called_once_with('some') @@ -1254,11 +1259,13 @@ class TestImageDownload(base.IronicAgentTest): self.assertEqual(content, list(image_download)) requests_mock.assert_called_once_with(image_info['urls'][0], cert=None, verify=True, - stream=True, proxies={}) + stream=True, proxies={}, + timeout=60) self.assertEqual(image_info['checksum'], image_download._hash_algo.hexdigest()) - def test_download_image_fail(self, requests_mock, time_mock): + @mock.patch('time.sleep', autospec=True) + def test_download_image_fail(self, sleep_mock, requests_mock, time_mock): response = requests_mock.return_value response.status_code = 401 response.text = 'Unauthorized' @@ -1272,7 +1279,56 @@ class TestImageDownload(base.IronicAgentTest): standby.ImageDownload, image_info) requests_mock.assert_called_once_with(image_info['urls'][0], cert=None, verify=True, - stream=True, proxies={}) + stream=True, proxies={}, + timeout=60) + self.assertFalse(sleep_mock.called) + + @mock.patch('time.sleep', autospec=True) + def test_download_image_retries(self, sleep_mock, requests_mock, + time_mock): + response = requests_mock.return_value + response.status_code = 500 + response.text = 'Oops' + time_mock.return_value = 0.0 + image_info = _build_fake_image_info() + msg = ('Error downloading image: Download of image fake_id failed: ' + 'URL: http://example.org; time: .* seconds. Error: ' + 'Received status code 500 from http://example.org, expected ' + '200. Response body: Oops') + self.assertRaisesRegex(errors.ImageDownloadError, msg, + standby.ImageDownload, image_info) + requests_mock.assert_called_with(image_info['urls'][0], + cert=None, verify=True, + stream=True, proxies={}, + timeout=60) + self.assertEqual(3, requests_mock.call_count) + sleep_mock.assert_called_with(5) + self.assertEqual(2, sleep_mock.call_count) + + @mock.patch('time.sleep', autospec=True) + def test_download_image_retries_success(self, sleep_mock, requests_mock, + md5_mock): + content = ['SpongeBob', 'SquarePants'] + fail_response = mock.Mock() + fail_response.status_code = 500 + fail_response.text = " " + response = mock.Mock() + response.status_code = 200 + response.iter_content.return_value = content + requests_mock.side_effect = [requests.Timeout, fail_response, response] + + image_info = _build_fake_image_info() + md5_mock.return_value.hexdigest.return_value = image_info['checksum'] + image_download = standby.ImageDownload(image_info) + + self.assertEqual(content, list(image_download)) + requests_mock.assert_called_with(image_info['urls'][0], + cert=None, verify=True, + stream=True, proxies={}, + timeout=60) + self.assertEqual(3, requests_mock.call_count) + sleep_mock.assert_called_with(5) + self.assertEqual(2, sleep_mock.call_count) def test_download_image_and_checksum(self, requests_mock, md5_mock): content = ['SpongeBob', 'SquarePants'] @@ -1293,9 +1349,9 @@ class TestImageDownload(base.IronicAgentTest): self.assertEqual(content, list(image_download)) requests_mock.assert_has_calls([ mock.call('http://example.com/checksum', cert=None, verify=True, - stream=True, proxies={}), + stream=True, proxies={}, timeout=60), mock.call(image_info['urls'][0], cert=None, verify=True, - stream=True, proxies={}), + stream=True, proxies={}, timeout=60), ]) self.assertEqual(fake_cs, image_download._hash_algo.hexdigest()) @@ -1323,9 +1379,9 @@ foobar irrelevant file.img self.assertEqual(content, list(image_download)) requests_mock.assert_has_calls([ mock.call('http://example.com/checksum', cert=None, verify=True, - stream=True, proxies={}), + stream=True, proxies={}, timeout=60), mock.call(image_info['urls'][0], cert=None, verify=True, - stream=True, proxies={}), + stream=True, proxies={}, timeout=60), ]) self.assertEqual(fake_cs, image_download._hash_algo.hexdigest()) @@ -1378,7 +1434,9 @@ foobar irrelevant file.img response = mock.Mock() response.status_code = 200 response.iter_content.return_value = content - requests_mock.side_effect = [cs_response, response] + # 3 retries on status code + requests_mock.side_effect = [cs_response, cs_response, cs_response, + response] image_info = _build_fake_image_info( 'http://example.com/path/image.img') diff --git a/releasenotes/notes/image-download-retries-65ac31fe4328e438.yaml b/releasenotes/notes/image-download-retries-65ac31fe4328e438.yaml new file mode 100644 index 000000000..7f787a567 --- /dev/null +++ b/releasenotes/notes/image-download-retries-65ac31fe4328e438.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Provides timeout and retries when establishing a connection to download + an image in the ``standby`` extension. Reduces probability of an image + download getting stuck in the event of network problems. + + The default timeout is 60 seconds and can be set via the + ``ipa-image-download-connection-timeout`` kernel parameter. The default + number of retries is 2 and can be set via the + ``ipa-image-download-connection-retries`` parameter.