From 6787142617e93dabdbdc73eecbf9fdc0b6ec2e29 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Fri, 7 May 2021 10:38:16 +0200 Subject: [PATCH] Fix deployment when executing a command fails after the command starts If the agent accepts a command, but is unable to reply to Ironic (which sporadically happens before of the eventlet's TLS implementation), we currently retry the request and fail because the command is already executing. Ironic now detects this situation by checking the list of executing commands after receiving a connection error. If the requested command is last one, we assume that the command request succeeded. Ideally, we should pass a request ID to IPA and then compare it. Such a change would affect the API contract between the agent and Ironic and thus would not be backportable. Change-Id: I2ea21c9ec440fa7ddf8578cf7b34d6d0ebbb5dc8 (cherry picked from commit abfe383cc32d63dd3df2b90f0feace08d73b8364) --- ironic/drivers/modules/agent_client.py | 98 +++++++++++---- .../unit/drivers/modules/test_agent_client.py | 117 ++++++++++++++++++ .../agent-last-command-4ec6967c995ba84a.yaml | 9 ++ 3 files changed, 203 insertions(+), 21 deletions(-) create mode 100644 releasenotes/notes/agent-last-command-4ec6967c995ba84a.yaml diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index f0c411c76b..59975777f4 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -177,11 +177,9 @@ class AgentClient(object): verify=self._get_verify(node), timeout=CONF.agent.command_timeout) except (requests.ConnectionError, requests.Timeout) as e: - msg = (_('Failed to connect to the agent running on node %(node)s ' - 'for invoking command %(method)s. Error: %(error)s') % - {'node': node.uuid, 'method': method, 'error': e}) - LOG.error(msg) - raise exception.AgentConnectionFailed(reason=msg) + result = self._handle_timeout_on_command_execution(node, method, + params, e) + response = None except requests.RequestException as e: msg = (_('Error invoking agent command %(method)s for node ' '%(node)s. Error: %(error)s') % @@ -189,28 +187,31 @@ class AgentClient(object): LOG.error(msg) raise exception.IronicException(msg) - # TODO(russellhaering): real error handling - try: - result = response.json() - except ValueError: - msg = _( - 'Unable to decode response as JSON.\n' - 'Request URL: %(url)s\nRequest body: "%(body)s"\n' - 'Response status code: %(code)s\n' - 'Response: "%(response)s"' - ) % ({'response': response.text, 'body': body, 'url': url, - 'code': response.status_code}) - LOG.error(msg) - raise exception.IronicException(msg) + if response is not None: + # TODO(russellhaering): real error handling + try: + result = response.json() + except ValueError: + msg = _( + 'Unable to decode response as JSON.\n' + 'Request URL: %(url)s\nRequest body: "%(body)s"\n' + 'Response status code: %(code)s\n' + 'Response: "%(response)s"' + ) % ({'response': response.text, 'body': body, 'url': url, + 'code': response.status_code}) + LOG.error(msg) + raise exception.IronicException(msg) error = result.get('command_error') LOG.debug('Agent command %(method)s for node %(node)s returned ' - 'result %(res)s, error %(error)s, HTTP status code %(code)d', + 'result %(res)s, error %(error)s, HTTP status code %(code)s', {'node': node.uuid, 'method': method, 'res': result.get('command_result'), 'error': error, - 'code': response.status_code}) - if response.status_code >= http_client.BAD_REQUEST: + 'code': response.status_code if response is not None + else 'unknown'}) + if (response is not None + and response.status_code >= http_client.BAD_REQUEST): faultstring = result.get('faultstring') if 'agent_token' in faultstring: LOG.error('Agent command %(method)s for node %(node)s ' @@ -315,6 +316,61 @@ class AgentClient(object): {'node': node.uuid, 'status': status}) return result + def _status_if_last_command_matches(self, node, method, params): + """Return the status of the given command if it's the last running.""" + try: + method = method.split('.', 1)[1] + except IndexError: + pass + + commands = self.get_commands_status(node) + if not commands: + return None + + # TODO(dtantsur): a more reliable way to detect repeated execution + # would be to pass a sort of require ID to the agent. + + command = commands[-1] + if command['command_name'] != method: + LOG.debug('Command %(cmd)s is not currently executing, the last ' + 'command is %(curr)s', + {'cmd': method, 'curr': command['command_name']}) + return None + + if command['command_status'] != 'RUNNING': + LOG.debug('Command %(cmd)s is not currently executing, its status ' + 'is %(curr)s', + {'cmd': method, 'curr': command['command_status']}) + return None + + return command + + def _handle_timeout_on_command_execution(self, node, method, params, + error): + result = None + # NOTE(dtantsur): it is possible, especially with eventlet+TLS, that + # agent receives a command but fails to return the result to Ironic. + # To avoid a failure, check if the last command is the one we're trying + # to execute. + try: + result = self._status_if_last_command_matches(node, method, params) + except Exception as e: + msg = (_('Failed to connect to the agent running on node ' + '%(node)s for checking the last command status ' + 'after failing to invoke command %(method)s. ' + 'Error: %(error)s') % + {'node': node.uuid, 'method': method, 'error': e}) + LOG.error(msg) + + if result is None: + msg = (_('Failed to connect to the agent running on node %(node)s ' + 'for invoking command %(method)s. Error: %(error)s') % + {'node': node.uuid, 'method': method, 'error': error}) + LOG.error(msg) + raise exception.AgentConnectionFailed(reason=msg) + + return result + def get_last_command_status(self, node, method): """Get the last status for the given command. diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py index 0823d75a81..4afb8f7370 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_client.py +++ b/ironic/tests/unit/drivers/modules/test_agent_client.py @@ -194,6 +194,123 @@ class TestAgentClient(base.TestCase): timeout=60, verify=True) self.assertEqual(3, self.client.session.post.call_count) + self.assertTrue(self.client.session.get.called) + + def test__command_fail_connect_no_command_running(self): + error = 'Boom' + self.client.session.post.side_effect = requests.ConnectionError(error) + self.client.session.get.return_value.json.return_value = { + 'commands': [] + } + method = 'foo.bar' + params = {} + + url = self.client._get_command_url(self.node) + self.client._get_command_body(method, params) + + e = self.assertRaises(exception.AgentConnectionFailed, + self.client._command, + self.node, method, params) + self.assertEqual('Connection to agent failed: Failed to connect to ' + 'the agent running on node %(node)s for invoking ' + 'command %(method)s. Error: %(error)s' % + {'method': method, 'node': self.node.uuid, + 'error': error}, str(e)) + self.client.session.post.assert_called_with( + url, + data=mock.ANY, + params={'wait': 'false'}, + timeout=60, + verify=True) + self.assertEqual(3, self.client.session.post.call_count) + self.assertTrue(self.client.session.get.called) + + def test__command_fail_connect_wrong_command_running(self): + error = 'Boom' + self.client.session.post.side_effect = requests.ConnectionError(error) + self.client.session.get.return_value.json.return_value = { + 'commands': [ + {'command_name': 'meow', 'command_status': 'RUNNING'}, + ] + } + method = 'foo.bar' + params = {} + + url = self.client._get_command_url(self.node) + self.client._get_command_body(method, params) + + e = self.assertRaises(exception.AgentConnectionFailed, + self.client._command, + self.node, method, params) + self.assertEqual('Connection to agent failed: Failed to connect to ' + 'the agent running on node %(node)s for invoking ' + 'command %(method)s. Error: %(error)s' % + {'method': method, 'node': self.node.uuid, + 'error': error}, str(e)) + self.client.session.post.assert_called_with( + url, + data=mock.ANY, + params={'wait': 'false'}, + timeout=60, + verify=True) + self.assertEqual(3, self.client.session.post.call_count) + self.assertTrue(self.client.session.get.called) + + def test__command_fail_connect_command_not_running(self): + error = 'Boom' + self.client.session.post.side_effect = requests.ConnectionError(error) + self.client.session.get.return_value.json.return_value = { + 'commands': [ + {'command_name': 'bar', 'command_status': 'FINISHED'}, + ] + } + method = 'foo.bar' + params = {} + + url = self.client._get_command_url(self.node) + self.client._get_command_body(method, params) + + e = self.assertRaises(exception.AgentConnectionFailed, + self.client._command, + self.node, method, params) + self.assertEqual('Connection to agent failed: Failed to connect to ' + 'the agent running on node %(node)s for invoking ' + 'command %(method)s. Error: %(error)s' % + {'method': method, 'node': self.node.uuid, + 'error': error}, str(e)) + self.client.session.post.assert_called_with( + url, + data=mock.ANY, + params={'wait': 'false'}, + timeout=60, + verify=True) + self.assertEqual(3, self.client.session.post.call_count) + self.assertTrue(self.client.session.get.called) + + def test__command_fail_connect_command_is_running(self): + error = 'Boom' + self.client.session.post.side_effect = requests.ConnectionError(error) + self.client.session.get.return_value.json.return_value = { + 'commands': [ + {'command_name': 'bar', 'command_status': 'RUNNING'}, + ] + } + method = 'foo.bar' + params = {} + + url = self.client._get_command_url(self.node) + self.client._get_command_body(method, params) + + result = self.client._command(self.node, method, params) + self.assertEqual({'command_name': 'bar', 'command_status': 'RUNNING'}, + result) + self.client.session.post.assert_called_once_with( + url, + data=mock.ANY, + params={'wait': 'false'}, + timeout=60, + verify=True) + self.assertTrue(self.client.session.get.called) def test__command_error_code(self): response_text = {"faultstring": "you dun goofd"} diff --git a/releasenotes/notes/agent-last-command-4ec6967c995ba84a.yaml b/releasenotes/notes/agent-last-command-4ec6967c995ba84a.yaml new file mode 100644 index 0000000000..1ab3a4846c --- /dev/null +++ b/releasenotes/notes/agent-last-command-4ec6967c995ba84a.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + If the agent accepts a command, but is unable to reply to Ironic (which + sporadically happens before of the eventlet's TLS implementation), we + currently retry the request and fail because the command is already + executing. Ironic now detects this situation by checking the list of + executing commands after receiving a connection error. If the requested + command is the last one, we assume that the command request succeeded.