From 6befbd6c45c2590da95d7920b19d882f19d22957 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Thu, 30 Jul 2020 16:16:57 -0700 Subject: [PATCH] Fix TypeError on agent lookup failure Agent lookups can fail as we presently use logging.exception, better known in our code as LOG.exception, which can also generate other fun issues on journald based systems where additional errors could be raised resulting in us being unable to troubleshoot the the actual issue. Because of the mis-use of LOG.exception and the default behavior of the backoff retry handler, the retry logic was also not functional as any error no matter how small caused IPA to just exit. Change-Id: Ic4608b7c6ff9773d1403926efb3d59869c71343b Story: 2007968 Task: 40465 (cherry picked from commit 5eab9bced63b2b9a6753cbbf594dda7ef9d03a3a) (cherry picked from commit e45ee3f64221962ad243f7df8bc123510c632935) (cherry picked from commit 9fe5cb92be3c3b0342612a0317e34254fc551cda) (cherry picked from commit 47a8ae6744fb80e3eb51fac4021d637e4379984a) --- ironic_python_agent/ironic_api_client.py | 37 ++++++++++++++-- .../tests/unit/test_ironic_api_client.py | 42 +++++++++++++++++++ ...agent-lookup-retries-1b4bb90b8e783aca.yaml | 9 ++++ 3 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/fixes-agent-lookup-retries-1b4bb90b8e783aca.yaml diff --git a/ironic_python_agent/ironic_api_client.py b/ironic_python_agent/ironic_api_client.py index cadd6876a..e64664cf6 100644 --- a/ironic_python_agent/ironic_api_client.py +++ b/ironic_python_agent/ironic_api_client.py @@ -148,12 +148,41 @@ class APIClient(object): 'GET', self.lookup_api, headers=self._get_ironic_api_version_header(), params=params) - except Exception as err: - LOG.exception( - 'Unhandled error looking up node with addresses %r at %s: %s', - params['addresses'], self.api_url, err, + except (requests.exceptions.Timeout, + requests.exceptions.ConnectTimeout, + requests.exceptions.ConnectionError, + requests.exceptions.ReadTimeout, + requests.exceptions.HTTPError) as err: + LOG.warning( + 'Error detected while attempting to perform lookup ' + 'with %s, retrying. Error: %s', self.api_url, err ) return False + except Exception as err: + # NOTE(TheJulia): If you're looking here, and you're wondering + # why the retry logic is not working or your investigating a weird + # error or even IPA just exiting, + # See https://storyboard.openstack.org/#!/story/2007968 + # To be clear, we're going to try to provide as much detail as + # possible in the exit handling + msg = ('Unhandled error looking up node with addresses {} at ' + '{}: {}'.format(params['addresses'], self.api_url, err)) + # No matter what we do at this point, IPA is going to exit. + # This is because we don't know why the exception occured and + # we likely should not try to retry as such. + # We will attempt to provide as much detail to the logs as + # possible as to what occured, although depending on the logging + # subsystem, additional errors can occur, thus the additional + # handling below. + try: + LOG.exception(msg) + return False + except Exception as exc_err: + LOG.error(msg) + exc_msg = ('Unexpected exception occured while trying to ' + 'log additional detail. Error: {}'.format(exc_err)) + LOG.error(exc_msg) + raise errors.LookupNodeError(msg) if response.status_code != requests.codes.OK: LOG.warning( diff --git a/ironic_python_agent/tests/unit/test_ironic_api_client.py b/ironic_python_agent/tests/unit/test_ironic_api_client.py index bcf36b337..a45970de2 100644 --- a/ironic_python_agent/tests/unit/test_ironic_api_client.py +++ b/ironic_python_agent/tests/unit/test_ironic_api_client.py @@ -15,6 +15,7 @@ import mock from oslo_serialization import jsonutils from oslo_service import loopingcall +import requests from ironic_python_agent import errors from ironic_python_agent import hardware @@ -286,6 +287,47 @@ class TestBaseIronicPythonAgent(base.IronicAgentTest): 'node_uuid': 'someuuid'}, params) + @mock.patch.object(ironic_api_client, 'LOG', autospec=True) + def test_do_lookup_transient_exceptions(self, mock_log): + exc_list = [requests.exceptions.ConnectionError, + requests.exceptions.ReadTimeout, + requests.exceptions.HTTPError, + requests.exceptions.Timeout, + requests.exceptions.ConnectTimeout] + self.api_client.session.request = mock.Mock() + for exc in exc_list: + self.api_client.session.request.reset_mock() + mock_log.reset_mock() + self.api_client.session.request.side_effect = exc + error = self.api_client._do_lookup(self.hardware_info, + node_uuid=None) + self.assertFalse(error) + mock_log.error.assert_has_calls([]) + self.assertEqual(1, mock_log.warning.call_count) + + @mock.patch.object(ironic_api_client, 'LOG', autospec=True) + def test_do_lookup_unknown_exception(self, mock_log): + self.api_client.session.request = mock.Mock() + self.api_client.session.request.side_effect = \ + requests.exceptions.RequestException('meow') + self.assertFalse( + self.api_client._do_lookup(self.hardware_info, + node_uuid=None)) + self.assertEqual(1, mock_log.exception.call_count) + + @mock.patch.object(ironic_api_client, 'LOG', autospec=True) + def test_do_lookup_unknown_exception_fallback(self, mock_log): + mock_log.exception.side_effect = TypeError + self.api_client.session.request = mock.Mock() + self.api_client.session.request.side_effect = \ + requests.exceptions.RequestException('meow') + self.assertRaises(errors.LookupNodeError, + self.api_client._do_lookup, + self.hardware_info, + node_uuid=None) + self.assertEqual(1, mock_log.exception.call_count) + self.assertEqual(2, mock_log.error.call_count) + def test_do_lookup_bad_response_code(self): response = FakeResponse(status_code=400, content={ 'node': { diff --git a/releasenotes/notes/fixes-agent-lookup-retries-1b4bb90b8e783aca.yaml b/releasenotes/notes/fixes-agent-lookup-retries-1b4bb90b8e783aca.yaml new file mode 100644 index 000000000..057765013 --- /dev/null +++ b/releasenotes/notes/fixes-agent-lookup-retries-1b4bb90b8e783aca.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fixes retry logic issues with the Agent Lookup which can result in + the lookup failing prematurely before being completed, typically + resulting in an abrupt end to the agent logging and potentially + weird errors like TypeError being reported on the agent process + standard error output. For more information see + `bug 2007968 `_.