Add timeout operations to try and prevent hang on read()
Socket read operations can be blocking and may not timeout as
expected when thinking of timeouts at the beginning of a
socket request. This can occur when streaming file contents
down to the agent and there is a hard connectivity break.
In other words, we could be in a situation like:
- read(fd, len) - Gets data
- Select returns context to the program, we do things with data.
** hard connectivity break for next 90 seconds**
- read(fd, len) - We drain the in-memory buffer side of the socket.
- Select returns context, we do things with our remaining data
** Server retransmits **
** Server times out due to no ack **
** Server closes socket and issues a FIN,RST packet to the client **
** Connectivity restored, Client never got FIN,RST **
** Client socket still waiting for more data **
- read(fd, len) - No data returned
- Select returns, yet we have no data to act on as the buffer is
empty OR the buffered data doesn't meet our required read len value.
tl;dr noop
- read(fd, len) <-- We continue to try and read until the socket is
recognized as dead, which could be a long time.
NOTE: The above read()s are python's read() on the contents being
streamed. Lower level reads exist, but brains will hurt
if we try to cover the dynamics at that level.
As such, we need to keep an eye on when the last time we
received a packet, and treat that as if we have timed out
or not. Requests periodically yields back even when no data
has been received, in order to allow the caller to wall
clock the progress/status and take appropriate action.
When we exceed the timeout time value with our wall clock,
we will fail the download.
Change-Id: I7214fc9dbd903789c9e39ee809f05454aeb5a240
(cherry picked from commit c5b97eb781
)
This commit is contained in:
parent
ed3f00f07a
commit
33c96d0066
|
@ -73,6 +73,19 @@ def _download_with_proxy(image_info, url, image_id):
|
||||||
resp = None
|
resp = None
|
||||||
for attempt in range(CONF.image_download_connection_retries + 1):
|
for attempt in range(CONF.image_download_connection_retries + 1):
|
||||||
try:
|
try:
|
||||||
|
# NOTE(TheJulia) The get request below does the following:
|
||||||
|
# * Performs dns lookups, if necessary
|
||||||
|
# * Opens the TCP socket to the remote host
|
||||||
|
# * Negotiates TLS, if applicable
|
||||||
|
# * Checks cert validity, if necessary, which may be
|
||||||
|
# more tcp socket connections.
|
||||||
|
# * Issues the get request and then returns back to the caller the
|
||||||
|
# handler which is used to stream the data into the agent.
|
||||||
|
# While this all may be at risk of transitory interrupts, most of
|
||||||
|
# these socket will have timeouts applied to them, although not
|
||||||
|
# exactly just as the timeout value exists. The risk in transitory
|
||||||
|
# failure is more so once we've started the download and we are
|
||||||
|
# processing the incoming data.
|
||||||
resp = requests.get(url, stream=True, proxies=proxies,
|
resp = requests.get(url, stream=True, proxies=proxies,
|
||||||
verify=verify, cert=cert,
|
verify=verify, cert=cert,
|
||||||
timeout=CONF.image_download_connection_timeout)
|
timeout=CONF.image_download_connection_timeout)
|
||||||
|
@ -279,6 +292,7 @@ class ImageDownload(object):
|
||||||
any reason.
|
any reason.
|
||||||
"""
|
"""
|
||||||
self._time = time_obj or time.time()
|
self._time = time_obj or time.time()
|
||||||
|
self._last_chunk_time = None
|
||||||
self._image_info = image_info
|
self._image_info = image_info
|
||||||
self._request = None
|
self._request = None
|
||||||
|
|
||||||
|
@ -341,7 +355,23 @@ class ImageDownload(object):
|
||||||
which is a constant in this module.
|
which is a constant in this module.
|
||||||
"""
|
"""
|
||||||
for chunk in self._request.iter_content(IMAGE_CHUNK_SIZE):
|
for chunk in self._request.iter_content(IMAGE_CHUNK_SIZE):
|
||||||
self._hash_algo.update(chunk)
|
# Per requests forum posts/discussions, iter_content should
|
||||||
|
# periodically yield to the caller for the client to do things
|
||||||
|
# like stopwatch and potentially interrupt the download.
|
||||||
|
# While this seems weird and doesn't exactly seem to match the
|
||||||
|
# patterns in requests and urllib3, it does appear to be the
|
||||||
|
# case. Field testing in environments where TCP sockets were
|
||||||
|
# discovered in a read hanged state were navigated with
|
||||||
|
# this code.
|
||||||
|
if chunk:
|
||||||
|
self._last_chunk_time = time.time()
|
||||||
|
self._hash_algo.update(chunk)
|
||||||
|
elif (time.time() - self._last_chunk_time
|
||||||
|
> CONF.image_download_connection_timeout):
|
||||||
|
LOG.error('Timeout reached waiting for a chunk of data from '
|
||||||
|
'a remote server.')
|
||||||
|
raise errors.ImageDownloadError(
|
||||||
|
'Timed out reading next chunk from webserver')
|
||||||
yield chunk
|
yield chunk
|
||||||
|
|
||||||
def verify_image(self, image_location):
|
def verify_image(self, image_location):
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import time
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
from ironic_lib import exception
|
from ironic_lib import exception
|
||||||
|
@ -1158,6 +1159,57 @@ class TestStandbyExtension(base.IronicAgentTest):
|
||||||
'/dev/fake root_uuid=ROOT')
|
'/dev/fake root_uuid=ROOT')
|
||||||
self.assertEqual(expected_msg, result_msg)
|
self.assertEqual(expected_msg, result_msg)
|
||||||
|
|
||||||
|
@mock.patch('ironic_lib.disk_utils.fix_gpt_partition', autospec=True)
|
||||||
|
@mock.patch('hashlib.md5', autospec=True)
|
||||||
|
@mock.patch('builtins.open', autospec=True)
|
||||||
|
@mock.patch('requests.get', autospec=True)
|
||||||
|
def test_stream_raw_image_onto_device_socket_read_timeout(
|
||||||
|
self, requests_mock, open_mock, md5_mock, fix_gpt_mock):
|
||||||
|
|
||||||
|
class create_timeout(object):
|
||||||
|
status_code = 200
|
||||||
|
|
||||||
|
def __init__(self, url, stream, proxies, verify, cert, timeout):
|
||||||
|
time.sleep(1)
|
||||||
|
self.count = 0
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __next__(self):
|
||||||
|
if self.count == 1:
|
||||||
|
time.sleep(4)
|
||||||
|
return None
|
||||||
|
if self.count < 3:
|
||||||
|
self.count += 1
|
||||||
|
return "meow"
|
||||||
|
else:
|
||||||
|
time.sleep(30)
|
||||||
|
raise StopIteration
|
||||||
|
|
||||||
|
def iter_content(self, chunk_size):
|
||||||
|
return self
|
||||||
|
|
||||||
|
self.config(image_download_connection_timeout=1)
|
||||||
|
image_info = _build_fake_image_info()
|
||||||
|
file_mock = mock.Mock()
|
||||||
|
open_mock.return_value.__enter__.return_value = file_mock
|
||||||
|
file_mock.read.return_value = None
|
||||||
|
hexdigest_mock = md5_mock.return_value.hexdigest
|
||||||
|
hexdigest_mock.return_value = image_info['checksum']
|
||||||
|
requests_mock.side_effect = create_timeout
|
||||||
|
self.assertRaises(
|
||||||
|
errors.ImageDownloadError,
|
||||||
|
self.agent_extension._stream_raw_image_onto_device,
|
||||||
|
image_info,
|
||||||
|
'/dev/foo')
|
||||||
|
requests_mock.assert_called_once_with(image_info['urls'][0],
|
||||||
|
cert=None, verify=True,
|
||||||
|
stream=True, proxies={},
|
||||||
|
timeout=1)
|
||||||
|
file_mock.write.assert_called_once_with('meow')
|
||||||
|
fix_gpt_mock.assert_not_called()
|
||||||
|
|
||||||
def test__message_format_partition_bios(self):
|
def test__message_format_partition_bios(self):
|
||||||
image_info = _build_fake_partition_image_info()
|
image_info = _build_fake_partition_image_info()
|
||||||
msg = ('image ({}) already present on device {} ')
|
msg = ('image ({}) already present on device {} ')
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Fixes failure to detect a hung file download connection in the
|
||||||
|
event that the kernel has not rapidly detected that the remote
|
||||||
|
server has hung up the socket. This can happen when there is
|
||||||
|
intermittent and transient connectivity issues such as those
|
||||||
|
that can occur due to LACP failure response hold-down timers
|
||||||
|
in switching fabrics.
|
Loading…
Reference in New Issue