diff --git a/ironic_python_agent/ironic_api_client.py b/ironic_python_agent/ironic_api_client.py index fda420d99..67f9e6b91 100644 --- a/ironic_python_agent/ironic_api_client.py +++ b/ironic_python_agent/ironic_api_client.py @@ -167,12 +167,41 @@ class APIClient(object): 'GET', self.lookup_api, headers=self._get_ironic_api_version_header(), params=params) - except Exception as err: - LOG.exception( - 'Unhandled error looking up node with addresses %r at %s: %s', - params['addresses'], self.api_url, err, + except (requests.exceptions.Timeout, + requests.exceptions.ConnectTimeout, + requests.exceptions.ConnectionError, + requests.exceptions.ReadTimeout, + requests.exceptions.HTTPError) as err: + LOG.warning( + 'Error detected while attempting to perform lookup ' + 'with %s, retrying. Error: %s', self.api_url, err ) return False + except Exception as err: + # NOTE(TheJulia): If you're looking here, and you're wondering + # why the retry logic is not working or your investigating a weird + # error or even IPA just exiting, + # See https://storyboard.openstack.org/#!/story/2007968 + # To be clear, we're going to try to provide as much detail as + # possible in the exit handling + msg = ('Unhandled error looking up node with addresses {} at ' + '{}: {}'.format(params['addresses'], self.api_url, err)) + # No matter what we do at this point, IPA is going to exit. + # This is because we don't know why the exception occured and + # we likely should not try to retry as such. + # We will attempt to provide as much detail to the logs as + # possible as to what occured, although depending on the logging + # subsystem, additional errors can occur, thus the additional + # handling below. + try: + LOG.exception(msg) + return False + except Exception as exc_err: + LOG.error(msg) + exc_msg = ('Unexpected exception occured while trying to ' + 'log additional detail. Error: {}'.format(exc_err)) + LOG.error(exc_msg) + raise errors.LookupNodeError(msg) if response.status_code != requests.codes.OK: LOG.warning( diff --git a/ironic_python_agent/tests/unit/test_ironic_api_client.py b/ironic_python_agent/tests/unit/test_ironic_api_client.py index c0f8bb740..63e100fff 100644 --- a/ironic_python_agent/tests/unit/test_ironic_api_client.py +++ b/ironic_python_agent/tests/unit/test_ironic_api_client.py @@ -16,6 +16,7 @@ from unittest import mock from oslo_serialization import jsonutils from oslo_service import loopingcall +import requests from ironic_python_agent import errors from ironic_python_agent import hardware @@ -312,6 +313,47 @@ class TestBaseIronicPythonAgent(base.IronicAgentTest): 'node_uuid': 'someuuid'}, params) + @mock.patch.object(ironic_api_client, 'LOG', autospec=True) + def test_do_lookup_transient_exceptions(self, mock_log): + exc_list = [requests.exceptions.ConnectionError, + requests.exceptions.ReadTimeout, + requests.exceptions.HTTPError, + requests.exceptions.Timeout, + requests.exceptions.ConnectTimeout] + self.api_client.session.request = mock.Mock() + for exc in exc_list: + self.api_client.session.request.reset_mock() + mock_log.reset_mock() + self.api_client.session.request.side_effect = exc + error = self.api_client._do_lookup(self.hardware_info, + node_uuid=None) + self.assertFalse(error) + mock_log.error.assert_has_calls([]) + self.assertEqual(1, mock_log.warning.call_count) + + @mock.patch.object(ironic_api_client, 'LOG', autospec=True) + def test_do_lookup_unknown_exception(self, mock_log): + self.api_client.session.request = mock.Mock() + self.api_client.session.request.side_effect = \ + requests.exceptions.RequestException('meow') + self.assertFalse( + self.api_client._do_lookup(self.hardware_info, + node_uuid=None)) + self.assertEqual(1, mock_log.exception.call_count) + + @mock.patch.object(ironic_api_client, 'LOG', autospec=True) + def test_do_lookup_unknown_exception_fallback(self, mock_log): + mock_log.exception.side_effect = TypeError + self.api_client.session.request = mock.Mock() + self.api_client.session.request.side_effect = \ + requests.exceptions.RequestException('meow') + self.assertRaises(errors.LookupNodeError, + self.api_client._do_lookup, + self.hardware_info, + node_uuid=None) + self.assertEqual(1, mock_log.exception.call_count) + self.assertEqual(2, mock_log.error.call_count) + def test_do_lookup_bad_response_code(self): response = FakeResponse(status_code=400, content={ 'node': { diff --git a/releasenotes/notes/fixes-agent-lookup-retries-1b4bb90b8e783aca.yaml b/releasenotes/notes/fixes-agent-lookup-retries-1b4bb90b8e783aca.yaml new file mode 100644 index 000000000..057765013 --- /dev/null +++ b/releasenotes/notes/fixes-agent-lookup-retries-1b4bb90b8e783aca.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fixes retry logic issues with the Agent Lookup which can result in + the lookup failing prematurely before being completed, typically + resulting in an abrupt end to the agent logging and potentially + weird errors like TypeError being reported on the agent process + standard error output. For more information see + `bug 2007968 `_.