diff --git a/ironic_python_agent/extensions/standby.py b/ironic_python_agent/extensions/standby.py index 1add183df..3a496ab11 100644 --- a/ironic_python_agent/extensions/standby.py +++ b/ironic_python_agent/extensions/standby.py @@ -74,6 +74,19 @@ def _download_with_proxy(image_info, url, image_id): resp = None for attempt in range(CONF.image_download_connection_retries + 1): try: + # NOTE(TheJulia) The get request below does the following: + # * Performs dns lookups, if necessary + # * Opens the TCP socket to the remote host + # * Negotiates TLS, if applicable + # * Checks cert validity, if necessary, which may be + # more tcp socket connections. + # * Issues the get request and then returns back to the caller the + # handler which is used to stream the data into the agent. + # While this all may be at risk of transitory interrupts, most of + # these socket will have timeouts applied to them, although not + # exactly just as the timeout value exists. The risk in transitory + # failure is more so once we've started the download and we are + # processing the incoming data. resp = requests.get(url, stream=True, proxies=proxies, verify=verify, cert=cert, timeout=CONF.image_download_connection_timeout) @@ -275,6 +288,7 @@ class ImageDownload(object): any reason. """ self._time = time_obj or time.time() + self._last_chunk_time = None self._image_info = image_info self._request = None @@ -337,8 +351,25 @@ class ImageDownload(object): which is a constant in this module. """ for chunk in self._request.iter_content(IMAGE_CHUNK_SIZE): - self._hash_algo.update(chunk) - yield chunk + # Per requests forum posts/discussions, iter_content should + # periodically yield to the caller for the client to do things + # like stopwatch and potentially interrupt the download. + # While this seems weird and doesn't exactly seem to match the + # patterns in requests and urllib3, it does appear to be the + # case. Field testing in environments where TCP sockets were + # discovered in a read hanged state were navigated with + # this code. + if chunk: + self._last_chunk_time = time.time() + self._hash_algo.update(chunk) + yield chunk + elif (time.time() - self._last_chunk_time + > CONF.image_download_connection_timeout): + LOG.error('Timeout reached waiting for a chunk of data from ' + 'a remote server.') + raise errors.ImageDownloadError( + self._image_info['id'], + 'Timed out reading next chunk from webserver') def verify_image(self, image_location): """Verifies the checksum of the local images matches expectations. diff --git a/ironic_python_agent/tests/unit/extensions/test_standby.py b/ironic_python_agent/tests/unit/extensions/test_standby.py index e9bd9c130..adb2e0838 100644 --- a/ironic_python_agent/tests/unit/extensions/test_standby.py +++ b/ironic_python_agent/tests/unit/extensions/test_standby.py @@ -14,6 +14,7 @@ import os import tempfile +import time import mock from oslo_concurrency import processutils @@ -1111,6 +1112,61 @@ class TestStandbyExtension(base.IronicAgentTest): '/dev/fake root_uuid=ROOT') self.assertEqual(expected_msg, result_msg) + @mock.patch('ironic_lib.disk_utils.fix_gpt_partition', autospec=True) + @mock.patch('hashlib.md5', autospec=True) + @mock.patch('six.moves.builtins.open', autospec=True) + @mock.patch('requests.get', autospec=True) + def test_stream_raw_image_onto_device_socket_read_timeout( + self, requests_mock, open_mock, md5_mock, fix_gpt_mock): + + class create_timeout(object): + status_code = 200 + + def __init__(self, url, stream, proxies, verify, cert, timeout): + time.sleep(1) + self.count = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.count: + time.sleep(0.1) + return None + if self.count < 3: + self.count += 1 + return "meow" + else: + time.sleep(30) + raise StopIteration + + # Python 2 + next = __next__ + + def iter_content(self, chunk_size): + return self + + self.config(image_download_connection_timeout=1) + image_info = _build_fake_image_info() + file_mock = mock.Mock() + open_mock.return_value.__enter__.return_value = file_mock + file_mock.read.return_value = None + hexdigest_mock = md5_mock.return_value.hexdigest + hexdigest_mock.return_value = image_info['checksum'] + requests_mock.side_effect = create_timeout + self.assertRaisesRegex( + errors.ImageDownloadError, + 'Timed out reading next chunk', + self.agent_extension._stream_raw_image_onto_device, + image_info, + '/dev/foo') + requests_mock.assert_called_once_with(image_info['urls'][0], + cert=None, verify=True, + stream=True, proxies={}, + timeout=1) + file_mock.write.assert_called_once_with('meow') + self.assertFalse(fix_gpt_mock.called) + def test__message_format_partition_bios(self): image_info = _build_fake_partition_image_info() msg = ('image ({}) already present on device {} ') diff --git a/releasenotes/notes/timeout_on_file_download-ed77918318316075.yaml b/releasenotes/notes/timeout_on_file_download-ed77918318316075.yaml new file mode 100644 index 000000000..c90073e3a --- /dev/null +++ b/releasenotes/notes/timeout_on_file_download-ed77918318316075.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fixes failure to detect a hung file download connection in the + event that the kernel has not rapidly detected that the remote + server has hung up the socket. This can happen when there is + intermittent and transient connectivity issues such as those + that can occur due to LACP failure response hold-downs timers + in switching fabrics.