diff --git a/ironic_python_agent/ironic_api_client.py b/ironic_python_agent/ironic_api_client.py index a2e7e1d00..8ea7029e7 100644 --- a/ironic_python_agent/ironic_api_client.py +++ b/ironic_python_agent/ironic_api_client.py @@ -13,6 +13,7 @@ # limitations under the License. import json +import time from oslo_config import cfg from oslo_log import log @@ -45,6 +46,7 @@ class APIClient(object): heartbeat_api = '/%s/heartbeat/{uuid}' % api_version _ironic_api_version = None agent_token = None + lookup_lock_pause = 0 def __init__(self, api_url): self.api_url = api_url.rstrip('/') @@ -176,7 +178,7 @@ class APIClient(object): raise errors.HeartbeatError(error) def lookup_node(self, hardware_info, timeout, starting_interval, - node_uuid=None, max_interval=30): + node_uuid=None, max_interval=60): retry = tenacity.retry( retry=tenacity.retry_if_result(lambda r: r is False), stop=tenacity.stop_after_delay(timeout), @@ -244,10 +246,31 @@ class APIClient(object): LOG.error(exc_msg) raise errors.LookupNodeError(msg) + if response.status_code == requests.codes.CONFLICT: + if self.lookup_lock_pause == 0: + self.lookup_lock_pause = 5 + elif self.lookup_lock_pause == 5: + self.lookup_lock_pause = 10 + elif self.lookup_lock_pause == 10: + # If we're reaching this point, we've got a long held + # persistent lock, which means things can go very sideways + # or the ironic deployment is downright grumpy. Either way, + # we need to slow things down. + self.lookup_lock_pause = 30 + LOG.warning( + 'Ironic has responded with a conflict, signaling the ' + 'node is locked. We will wait %(time)s seconds before trying ' + 'again. %(err)s', + {'time': self.lookup_lock_pause, + 'error': self._error_from_response(response)} + ) + time.sleep(self.lookup_lock_pause) + return False + if response.status_code != requests.codes.OK: LOG.warning( 'Failed looking up node with addresses %r at %s. ' - '%s. Check if inspection has completed.', + 'Check if inspection has completed? %s', params['addresses'], self.api_url, self._error_from_response(response) ) diff --git a/ironic_python_agent/tests/unit/test_ironic_api_client.py b/ironic_python_agent/tests/unit/test_ironic_api_client.py index 8c05652ce..6c101acc7 100644 --- a/ironic_python_agent/tests/unit/test_ironic_api_client.py +++ b/ironic_python_agent/tests/unit/test_ironic_api_client.py @@ -13,6 +13,7 @@ # limitations under the License. import json +import time from unittest import mock from oslo_config import cfg @@ -461,6 +462,32 @@ class TestBaseIronicPythonAgent(base.IronicAgentTest): mock_log.error.assert_has_calls([]) self.assertEqual(1, mock_log.warning.call_count) + @mock.patch.object(time, 'sleep', autospec=True) + @mock.patch.object(ironic_api_client, 'LOG', autospec=True) + def test_do_lookup_node_locked(self, mock_log, mock_sleep): + response = FakeResponse(status_code=409, content={}) + self.api_client.session.request = mock.Mock() + self.api_client.session.request.return_value = response + self.assertEqual(0, self.api_client.lookup_lock_pause) + error = self.api_client._do_lookup(self.hardware_info, + node_uuid=None) + self.assertFalse(error) + mock_log.error.assert_has_calls([]) + self.assertEqual(1, mock_log.warning.call_count) + self.assertEqual(1, mock_sleep.call_count) + self.assertEqual(5, self.api_client.lookup_lock_pause) + error = self.api_client._do_lookup(self.hardware_info, + node_uuid=None) + self.assertEqual(10, self.api_client.lookup_lock_pause) + error = self.api_client._do_lookup(self.hardware_info, + node_uuid=None) + self.assertFalse(error) + self.assertEqual(30, self.api_client.lookup_lock_pause) + error = self.api_client._do_lookup(self.hardware_info, + node_uuid=None) + self.assertFalse(error) + self.assertEqual(30, self.api_client.lookup_lock_pause) + @mock.patch.object(ironic_api_client, 'LOG', autospec=True) def test_do_lookup_unknown_exception(self, mock_log): self.api_client.session.request = mock.Mock() diff --git a/releasenotes/notes/understand-node-conflict-is-locked-2ea21dd45abfe4f7.yaml b/releasenotes/notes/understand-node-conflict-is-locked-2ea21dd45abfe4f7.yaml new file mode 100644 index 000000000..c0e77964e --- /dev/null +++ b/releasenotes/notes/understand-node-conflict-is-locked-2ea21dd45abfe4f7.yaml @@ -0,0 +1,13 @@ +--- +fixes: + - | + Fixes, or at least lessens the case where a running Ironic agent can stack + up numerous lookup requests against an Ironic deployment when a node is + locked. In particular, this is beause the lookup also drives generation of + the agent token, which requires the conductor to allocate a worker, and + generate the token, and return the result to the API client. + Ironic's retry logic will now wait up to ``60`` seconds, and if an HTTP + Conflict (409) message is received, the agent will automatically pause + lookup operations for thirty seconds as opposed continue to attempt + lookups which could create more work for the Ironic deployment + needlessly.