Merge "Add timeout operations to try and prevent hang on read()" into stable/train

This commit is contained in:
Zuul
2020-07-02 12:12:35 +00:00
committed by Gerrit Code Review
3 changed files with 98 additions and 2 deletions

View File

@@ -74,6 +74,19 @@ def _download_with_proxy(image_info, url, image_id):
resp = None
for attempt in range(CONF.image_download_connection_retries + 1):
try:
# NOTE(TheJulia) The get request below does the following:
# * Performs dns lookups, if necessary
# * Opens the TCP socket to the remote host
# * Negotiates TLS, if applicable
# * Checks cert validity, if necessary, which may be
# more tcp socket connections.
# * Issues the get request and then returns back to the caller the
# handler which is used to stream the data into the agent.
# While this all may be at risk of transitory interrupts, most of
# these socket will have timeouts applied to them, although not
# exactly just as the timeout value exists. The risk in transitory
# failure is more so once we've started the download and we are
# processing the incoming data.
resp = requests.get(url, stream=True, proxies=proxies,
verify=verify, cert=cert,
timeout=CONF.image_download_connection_timeout)
@@ -275,6 +288,7 @@ class ImageDownload(object):
any reason.
"""
self._time = time_obj or time.time()
self._last_chunk_time = None
self._image_info = image_info
self._request = None
@@ -337,8 +351,25 @@ class ImageDownload(object):
which is a constant in this module.
"""
for chunk in self._request.iter_content(IMAGE_CHUNK_SIZE):
self._hash_algo.update(chunk)
yield chunk
# Per requests forum posts/discussions, iter_content should
# periodically yield to the caller for the client to do things
# like stopwatch and potentially interrupt the download.
# While this seems weird and doesn't exactly seem to match the
# patterns in requests and urllib3, it does appear to be the
# case. Field testing in environments where TCP sockets were
# discovered in a read hanged state were navigated with
# this code.
if chunk:
self._last_chunk_time = time.time()
self._hash_algo.update(chunk)
yield chunk
elif (time.time() - self._last_chunk_time
> CONF.image_download_connection_timeout):
LOG.error('Timeout reached waiting for a chunk of data from '
'a remote server.')
raise errors.ImageDownloadError(
self._image_info['id'],
'Timed out reading next chunk from webserver')
def verify_image(self, image_location):
"""Verifies the checksum of the local images matches expectations.

View File

@@ -14,6 +14,7 @@
import os
import tempfile
import time
import mock
from oslo_concurrency import processutils
@@ -1111,6 +1112,61 @@ class TestStandbyExtension(base.IronicAgentTest):
'/dev/fake root_uuid=ROOT')
self.assertEqual(expected_msg, result_msg)
@mock.patch('ironic_lib.disk_utils.fix_gpt_partition', autospec=True)
@mock.patch('hashlib.md5', autospec=True)
@mock.patch('six.moves.builtins.open', autospec=True)
@mock.patch('requests.get', autospec=True)
def test_stream_raw_image_onto_device_socket_read_timeout(
self, requests_mock, open_mock, md5_mock, fix_gpt_mock):
class create_timeout(object):
status_code = 200
def __init__(self, url, stream, proxies, verify, cert, timeout):
time.sleep(1)
self.count = 0
def __iter__(self):
return self
def __next__(self):
if self.count:
time.sleep(0.1)
return None
if self.count < 3:
self.count += 1
return "meow"
else:
time.sleep(30)
raise StopIteration
# Python 2
next = __next__
def iter_content(self, chunk_size):
return self
self.config(image_download_connection_timeout=1)
image_info = _build_fake_image_info()
file_mock = mock.Mock()
open_mock.return_value.__enter__.return_value = file_mock
file_mock.read.return_value = None
hexdigest_mock = md5_mock.return_value.hexdigest
hexdigest_mock.return_value = image_info['checksum']
requests_mock.side_effect = create_timeout
self.assertRaisesRegex(
errors.ImageDownloadError,
'Timed out reading next chunk',
self.agent_extension._stream_raw_image_onto_device,
image_info,
'/dev/foo')
requests_mock.assert_called_once_with(image_info['urls'][0],
cert=None, verify=True,
stream=True, proxies={},
timeout=1)
file_mock.write.assert_called_once_with('meow')
self.assertFalse(fix_gpt_mock.called)
def test__message_format_partition_bios(self):
image_info = _build_fake_partition_image_info()
msg = ('image ({}) already present on device {} ')

View File

@@ -0,0 +1,9 @@
---
fixes:
- |
Fixes failure to detect a hung file download connection in the
event that the kernel has not rapidly detected that the remote
server has hung up the socket. This can happen when there is
intermittent and transient connectivity issues such as those
that can occur due to LACP failure response hold-downs timers
in switching fabrics.