diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index 6de28e785a..8d1318b8e4 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -179,11 +179,9 @@ class AgentClient(object): verify=self._get_verify(node), timeout=CONF.agent.command_timeout) except (requests.ConnectionError, requests.Timeout) as e: - msg = (_('Failed to connect to the agent running on node %(node)s ' - 'for invoking command %(method)s. Error: %(error)s') % - {'node': node.uuid, 'method': method, 'error': e}) - LOG.error(msg) - raise exception.AgentConnectionFailed(reason=msg) + result = self._handle_timeout_on_command_execution(node, method, + params, e) + response = None except requests.RequestException as e: msg = (_('Error invoking agent command %(method)s for node ' '%(node)s. Error: %(error)s') % @@ -191,28 +189,31 @@ class AgentClient(object): LOG.error(msg) raise exception.IronicException(msg) - # TODO(russellhaering): real error handling - try: - result = response.json() - except ValueError: - msg = _( - 'Unable to decode response as JSON.\n' - 'Request URL: %(url)s\nRequest body: "%(body)s"\n' - 'Response status code: %(code)s\n' - 'Response: "%(response)s"' - ) % ({'response': response.text, 'body': body, 'url': url, - 'code': response.status_code}) - LOG.error(msg) - raise exception.IronicException(msg) + if response is not None: + # TODO(russellhaering): real error handling + try: + result = response.json() + except ValueError: + msg = _( + 'Unable to decode response as JSON.\n' + 'Request URL: %(url)s\nRequest body: "%(body)s"\n' + 'Response status code: %(code)s\n' + 'Response: "%(response)s"' + ) % ({'response': response.text, 'body': body, 'url': url, + 'code': response.status_code}) + LOG.error(msg) + raise exception.IronicException(msg) error = result.get('command_error') LOG.debug('Agent command %(method)s for node %(node)s returned ' - 'result %(res)s, error %(error)s, HTTP status code %(code)d', + 'result %(res)s, error %(error)s, HTTP status code %(code)s', {'node': node.uuid, 'method': method, 'res': result.get('command_result'), 'error': error, - 'code': response.status_code}) - if response.status_code >= http_client.BAD_REQUEST: + 'code': response.status_code if response is not None + else 'unknown'}) + if (response is not None + and response.status_code >= http_client.BAD_REQUEST): faultstring = result.get('faultstring') if 'agent_token' in faultstring: LOG.error('Agent command %(method)s for node %(node)s ' @@ -319,6 +320,61 @@ class AgentClient(object): {'node': node.uuid, 'status': status}) return result + def _status_if_last_command_matches(self, node, method, params): + """Return the status of the given command if it's the last running.""" + try: + method = method.split('.', 1)[1] + except IndexError: + pass + + commands = self.get_commands_status(node) + if not commands: + return None + + # TODO(dtantsur): a more reliable way to detect repeated execution + # would be to pass a sort of require ID to the agent. + + command = commands[-1] + if command['command_name'] != method: + LOG.debug('Command %(cmd)s is not currently executing, the last ' + 'command is %(curr)s', + {'cmd': method, 'curr': command['command_name']}) + return None + + if command['command_status'] != 'RUNNING': + LOG.debug('Command %(cmd)s is not currently executing, its status ' + 'is %(curr)s', + {'cmd': method, 'curr': command['command_status']}) + return None + + return command + + def _handle_timeout_on_command_execution(self, node, method, params, + error): + result = None + # NOTE(dtantsur): it is possible, especially with eventlet+TLS, that + # agent receives a command but fails to return the result to Ironic. + # To avoid a failure, check if the last command is the one we're trying + # to execute. + try: + result = self._status_if_last_command_matches(node, method, params) + except Exception as e: + msg = (_('Failed to connect to the agent running on node ' + '%(node)s for checking the last command status ' + 'after failing to invoke command %(method)s. ' + 'Error: %(error)s') % + {'node': node.uuid, 'method': method, 'error': e}) + LOG.error(msg) + + if result is None: + msg = (_('Failed to connect to the agent running on node %(node)s ' + 'for invoking command %(method)s. Error: %(error)s') % + {'node': node.uuid, 'method': method, 'error': error}) + LOG.error(msg) + raise exception.AgentConnectionFailed(reason=msg) + + return result + def get_last_command_status(self, node, method): """Get the last status for the given command. diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py index fcff80577d..b3542e011c 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_client.py +++ b/ironic/tests/unit/drivers/modules/test_agent_client.py @@ -193,6 +193,123 @@ class TestAgentClient(base.TestCase): timeout=60, verify=True) self.assertEqual(3, self.client.session.post.call_count) + self.assertTrue(self.client.session.get.called) + + def test__command_fail_connect_no_command_running(self): + error = 'Boom' + self.client.session.post.side_effect = requests.ConnectionError(error) + self.client.session.get.return_value.json.return_value = { + 'commands': [] + } + method = 'foo.bar' + params = {} + + url = self.client._get_command_url(self.node) + self.client._get_command_body(method, params) + + e = self.assertRaises(exception.AgentConnectionFailed, + self.client._command, + self.node, method, params) + self.assertEqual('Connection to agent failed: Failed to connect to ' + 'the agent running on node %(node)s for invoking ' + 'command %(method)s. Error: %(error)s' % + {'method': method, 'node': self.node.uuid, + 'error': error}, str(e)) + self.client.session.post.assert_called_with( + url, + data=mock.ANY, + params={'wait': 'false'}, + timeout=60, + verify=True) + self.assertEqual(3, self.client.session.post.call_count) + self.assertTrue(self.client.session.get.called) + + def test__command_fail_connect_wrong_command_running(self): + error = 'Boom' + self.client.session.post.side_effect = requests.ConnectionError(error) + self.client.session.get.return_value.json.return_value = { + 'commands': [ + {'command_name': 'meow', 'command_status': 'RUNNING'}, + ] + } + method = 'foo.bar' + params = {} + + url = self.client._get_command_url(self.node) + self.client._get_command_body(method, params) + + e = self.assertRaises(exception.AgentConnectionFailed, + self.client._command, + self.node, method, params) + self.assertEqual('Connection to agent failed: Failed to connect to ' + 'the agent running on node %(node)s for invoking ' + 'command %(method)s. Error: %(error)s' % + {'method': method, 'node': self.node.uuid, + 'error': error}, str(e)) + self.client.session.post.assert_called_with( + url, + data=mock.ANY, + params={'wait': 'false'}, + timeout=60, + verify=True) + self.assertEqual(3, self.client.session.post.call_count) + self.assertTrue(self.client.session.get.called) + + def test__command_fail_connect_command_not_running(self): + error = 'Boom' + self.client.session.post.side_effect = requests.ConnectionError(error) + self.client.session.get.return_value.json.return_value = { + 'commands': [ + {'command_name': 'bar', 'command_status': 'FINISHED'}, + ] + } + method = 'foo.bar' + params = {} + + url = self.client._get_command_url(self.node) + self.client._get_command_body(method, params) + + e = self.assertRaises(exception.AgentConnectionFailed, + self.client._command, + self.node, method, params) + self.assertEqual('Connection to agent failed: Failed to connect to ' + 'the agent running on node %(node)s for invoking ' + 'command %(method)s. Error: %(error)s' % + {'method': method, 'node': self.node.uuid, + 'error': error}, str(e)) + self.client.session.post.assert_called_with( + url, + data=mock.ANY, + params={'wait': 'false'}, + timeout=60, + verify=True) + self.assertEqual(3, self.client.session.post.call_count) + self.assertTrue(self.client.session.get.called) + + def test__command_fail_connect_command_is_running(self): + error = 'Boom' + self.client.session.post.side_effect = requests.ConnectionError(error) + self.client.session.get.return_value.json.return_value = { + 'commands': [ + {'command_name': 'bar', 'command_status': 'RUNNING'}, + ] + } + method = 'foo.bar' + params = {} + + url = self.client._get_command_url(self.node) + self.client._get_command_body(method, params) + + result = self.client._command(self.node, method, params) + self.assertEqual({'command_name': 'bar', 'command_status': 'RUNNING'}, + result) + self.client.session.post.assert_called_once_with( + url, + data=mock.ANY, + params={'wait': 'false'}, + timeout=60, + verify=True) + self.assertTrue(self.client.session.get.called) def test__command_error_code(self): response_text = {"faultstring": "you dun goofd"} diff --git a/releasenotes/notes/agent-last-command-4ec6967c995ba84a.yaml b/releasenotes/notes/agent-last-command-4ec6967c995ba84a.yaml new file mode 100644 index 0000000000..1ab3a4846c --- /dev/null +++ b/releasenotes/notes/agent-last-command-4ec6967c995ba84a.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + If the agent accepts a command, but is unable to reply to Ironic (which + sporadically happens before of the eventlet's TLS implementation), we + currently retry the request and fail because the command is already + executing. Ironic now detects this situation by checking the list of + executing commands after receiving a connection error. If the requested + command is the last one, we assume that the command request succeeded.