Merge "Add timeout operations to try and prevent hang on read()" into stable/train
This commit is contained in:
@@ -74,6 +74,19 @@ def _download_with_proxy(image_info, url, image_id):
|
||||
resp = None
|
||||
for attempt in range(CONF.image_download_connection_retries + 1):
|
||||
try:
|
||||
# NOTE(TheJulia) The get request below does the following:
|
||||
# * Performs dns lookups, if necessary
|
||||
# * Opens the TCP socket to the remote host
|
||||
# * Negotiates TLS, if applicable
|
||||
# * Checks cert validity, if necessary, which may be
|
||||
# more tcp socket connections.
|
||||
# * Issues the get request and then returns back to the caller the
|
||||
# handler which is used to stream the data into the agent.
|
||||
# While this all may be at risk of transitory interrupts, most of
|
||||
# these socket will have timeouts applied to them, although not
|
||||
# exactly just as the timeout value exists. The risk in transitory
|
||||
# failure is more so once we've started the download and we are
|
||||
# processing the incoming data.
|
||||
resp = requests.get(url, stream=True, proxies=proxies,
|
||||
verify=verify, cert=cert,
|
||||
timeout=CONF.image_download_connection_timeout)
|
||||
@@ -275,6 +288,7 @@ class ImageDownload(object):
|
||||
any reason.
|
||||
"""
|
||||
self._time = time_obj or time.time()
|
||||
self._last_chunk_time = None
|
||||
self._image_info = image_info
|
||||
self._request = None
|
||||
|
||||
@@ -337,8 +351,25 @@ class ImageDownload(object):
|
||||
which is a constant in this module.
|
||||
"""
|
||||
for chunk in self._request.iter_content(IMAGE_CHUNK_SIZE):
|
||||
self._hash_algo.update(chunk)
|
||||
yield chunk
|
||||
# Per requests forum posts/discussions, iter_content should
|
||||
# periodically yield to the caller for the client to do things
|
||||
# like stopwatch and potentially interrupt the download.
|
||||
# While this seems weird and doesn't exactly seem to match the
|
||||
# patterns in requests and urllib3, it does appear to be the
|
||||
# case. Field testing in environments where TCP sockets were
|
||||
# discovered in a read hanged state were navigated with
|
||||
# this code.
|
||||
if chunk:
|
||||
self._last_chunk_time = time.time()
|
||||
self._hash_algo.update(chunk)
|
||||
yield chunk
|
||||
elif (time.time() - self._last_chunk_time
|
||||
> CONF.image_download_connection_timeout):
|
||||
LOG.error('Timeout reached waiting for a chunk of data from '
|
||||
'a remote server.')
|
||||
raise errors.ImageDownloadError(
|
||||
self._image_info['id'],
|
||||
'Timed out reading next chunk from webserver')
|
||||
|
||||
def verify_image(self, image_location):
|
||||
"""Verifies the checksum of the local images matches expectations.
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import mock
|
||||
from oslo_concurrency import processutils
|
||||
@@ -1111,6 +1112,61 @@ class TestStandbyExtension(base.IronicAgentTest):
|
||||
'/dev/fake root_uuid=ROOT')
|
||||
self.assertEqual(expected_msg, result_msg)
|
||||
|
||||
@mock.patch('ironic_lib.disk_utils.fix_gpt_partition', autospec=True)
|
||||
@mock.patch('hashlib.md5', autospec=True)
|
||||
@mock.patch('six.moves.builtins.open', autospec=True)
|
||||
@mock.patch('requests.get', autospec=True)
|
||||
def test_stream_raw_image_onto_device_socket_read_timeout(
|
||||
self, requests_mock, open_mock, md5_mock, fix_gpt_mock):
|
||||
|
||||
class create_timeout(object):
|
||||
status_code = 200
|
||||
|
||||
def __init__(self, url, stream, proxies, verify, cert, timeout):
|
||||
time.sleep(1)
|
||||
self.count = 0
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
if self.count:
|
||||
time.sleep(0.1)
|
||||
return None
|
||||
if self.count < 3:
|
||||
self.count += 1
|
||||
return "meow"
|
||||
else:
|
||||
time.sleep(30)
|
||||
raise StopIteration
|
||||
|
||||
# Python 2
|
||||
next = __next__
|
||||
|
||||
def iter_content(self, chunk_size):
|
||||
return self
|
||||
|
||||
self.config(image_download_connection_timeout=1)
|
||||
image_info = _build_fake_image_info()
|
||||
file_mock = mock.Mock()
|
||||
open_mock.return_value.__enter__.return_value = file_mock
|
||||
file_mock.read.return_value = None
|
||||
hexdigest_mock = md5_mock.return_value.hexdigest
|
||||
hexdigest_mock.return_value = image_info['checksum']
|
||||
requests_mock.side_effect = create_timeout
|
||||
self.assertRaisesRegex(
|
||||
errors.ImageDownloadError,
|
||||
'Timed out reading next chunk',
|
||||
self.agent_extension._stream_raw_image_onto_device,
|
||||
image_info,
|
||||
'/dev/foo')
|
||||
requests_mock.assert_called_once_with(image_info['urls'][0],
|
||||
cert=None, verify=True,
|
||||
stream=True, proxies={},
|
||||
timeout=1)
|
||||
file_mock.write.assert_called_once_with('meow')
|
||||
self.assertFalse(fix_gpt_mock.called)
|
||||
|
||||
def test__message_format_partition_bios(self):
|
||||
image_info = _build_fake_partition_image_info()
|
||||
msg = ('image ({}) already present on device {} ')
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes failure to detect a hung file download connection in the
|
||||
event that the kernel has not rapidly detected that the remote
|
||||
server has hung up the socket. This can happen when there is
|
||||
intermittent and transient connectivity issues such as those
|
||||
that can occur due to LACP failure response hold-downs timers
|
||||
in switching fabrics.
|
||||
Reference in New Issue
Block a user