Merge "Fix TypeError on agent lookup failure" into stable/rocky

This commit is contained in:
Zuul 2020-08-24 14:57:07 +00:00 committed by Gerrit Code Review
commit 353dfe0915
3 changed files with 84 additions and 4 deletions

View File

@ -148,12 +148,41 @@ class APIClient(object):
'GET', self.lookup_api,
headers=self._get_ironic_api_version_header(),
params=params)
except Exception as err:
LOG.exception(
'Unhandled error looking up node with addresses %r at %s: %s',
params['addresses'], self.api_url, err,
except (requests.exceptions.Timeout,
requests.exceptions.ConnectTimeout,
requests.exceptions.ConnectionError,
requests.exceptions.ReadTimeout,
requests.exceptions.HTTPError) as err:
LOG.warning(
'Error detected while attempting to perform lookup '
'with %s, retrying. Error: %s', self.api_url, err
)
return False
except Exception as err:
# NOTE(TheJulia): If you're looking here, and you're wondering
# why the retry logic is not working or your investigating a weird
# error or even IPA just exiting,
# See https://storyboard.openstack.org/#!/story/2007968
# To be clear, we're going to try to provide as much detail as
# possible in the exit handling
msg = ('Unhandled error looking up node with addresses {} at '
'{}: {}'.format(params['addresses'], self.api_url, err))
# No matter what we do at this point, IPA is going to exit.
# This is because we don't know why the exception occured and
# we likely should not try to retry as such.
# We will attempt to provide as much detail to the logs as
# possible as to what occured, although depending on the logging
# subsystem, additional errors can occur, thus the additional
# handling below.
try:
LOG.exception(msg)
return False
except Exception as exc_err:
LOG.error(msg)
exc_msg = ('Unexpected exception occured while trying to '
'log additional detail. Error: {}'.format(exc_err))
LOG.error(exc_msg)
raise errors.LookupNodeError(msg)
if response.status_code != requests.codes.OK:
LOG.warning(

View File

@ -15,6 +15,7 @@
import mock
from oslo_serialization import jsonutils
from oslo_service import loopingcall
import requests
from ironic_python_agent import errors
from ironic_python_agent import hardware
@ -286,6 +287,47 @@ class TestBaseIronicPythonAgent(base.IronicAgentTest):
'node_uuid': 'someuuid'},
params)
@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
def test_do_lookup_transient_exceptions(self, mock_log):
exc_list = [requests.exceptions.ConnectionError,
requests.exceptions.ReadTimeout,
requests.exceptions.HTTPError,
requests.exceptions.Timeout,
requests.exceptions.ConnectTimeout]
self.api_client.session.request = mock.Mock()
for exc in exc_list:
self.api_client.session.request.reset_mock()
mock_log.reset_mock()
self.api_client.session.request.side_effect = exc
error = self.api_client._do_lookup(self.hardware_info,
node_uuid=None)
self.assertFalse(error)
mock_log.error.assert_has_calls([])
self.assertEqual(1, mock_log.warning.call_count)
@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
def test_do_lookup_unknown_exception(self, mock_log):
self.api_client.session.request = mock.Mock()
self.api_client.session.request.side_effect = \
requests.exceptions.RequestException('meow')
self.assertFalse(
self.api_client._do_lookup(self.hardware_info,
node_uuid=None))
self.assertEqual(1, mock_log.exception.call_count)
@mock.patch.object(ironic_api_client, 'LOG', autospec=True)
def test_do_lookup_unknown_exception_fallback(self, mock_log):
mock_log.exception.side_effect = TypeError
self.api_client.session.request = mock.Mock()
self.api_client.session.request.side_effect = \
requests.exceptions.RequestException('meow')
self.assertRaises(errors.LookupNodeError,
self.api_client._do_lookup,
self.hardware_info,
node_uuid=None)
self.assertEqual(1, mock_log.exception.call_count)
self.assertEqual(2, mock_log.error.call_count)
def test_do_lookup_bad_response_code(self):
response = FakeResponse(status_code=400, content={
'node': {

View File

@ -0,0 +1,9 @@
---
fixes:
- |
Fixes retry logic issues with the Agent Lookup which can result in
the lookup failing prematurely before being completed, typically
resulting in an abrupt end to the agent logging and potentially
weird errors like TypeError being reported on the agent process
standard error output. For more information see
`bug 2007968 <https://storyboard.openstack.org/#!/story/2007968>`_.