Fix deployment when executing a command fails after the command starts

If the agent accepts a command, but is unable to reply to Ironic (which
sporadically happens before of the eventlet's TLS implementation), we
currently retry the request and fail because the command is already
executing. Ironic now detects this situation by checking the list of
executing commands after receiving a connection error. If the requested
command is last one, we assume that the command request succeeded.

Ideally, we should pass a request ID to IPA and then compare it. Such
a change would affect the API contract between the agent and Ironic
and thus would not be backportable.

Change-Id: I2ea21c9ec440fa7ddf8578cf7b34d6d0ebbb5dc8
(cherry picked from commit abfe383cc3)
This commit is contained in:
Dmitry Tantsur 2021-05-07 10:38:16 +02:00
parent e88436688d
commit 6787142617
3 changed files with 203 additions and 21 deletions

View File

@ -177,11 +177,9 @@ class AgentClient(object):
verify=self._get_verify(node),
timeout=CONF.agent.command_timeout)
except (requests.ConnectionError, requests.Timeout) as e:
msg = (_('Failed to connect to the agent running on node %(node)s '
'for invoking command %(method)s. Error: %(error)s') %
{'node': node.uuid, 'method': method, 'error': e})
LOG.error(msg)
raise exception.AgentConnectionFailed(reason=msg)
result = self._handle_timeout_on_command_execution(node, method,
params, e)
response = None
except requests.RequestException as e:
msg = (_('Error invoking agent command %(method)s for node '
'%(node)s. Error: %(error)s') %
@ -189,28 +187,31 @@ class AgentClient(object):
LOG.error(msg)
raise exception.IronicException(msg)
# TODO(russellhaering): real error handling
try:
result = response.json()
except ValueError:
msg = _(
'Unable to decode response as JSON.\n'
'Request URL: %(url)s\nRequest body: "%(body)s"\n'
'Response status code: %(code)s\n'
'Response: "%(response)s"'
) % ({'response': response.text, 'body': body, 'url': url,
'code': response.status_code})
LOG.error(msg)
raise exception.IronicException(msg)
if response is not None:
# TODO(russellhaering): real error handling
try:
result = response.json()
except ValueError:
msg = _(
'Unable to decode response as JSON.\n'
'Request URL: %(url)s\nRequest body: "%(body)s"\n'
'Response status code: %(code)s\n'
'Response: "%(response)s"'
) % ({'response': response.text, 'body': body, 'url': url,
'code': response.status_code})
LOG.error(msg)
raise exception.IronicException(msg)
error = result.get('command_error')
LOG.debug('Agent command %(method)s for node %(node)s returned '
'result %(res)s, error %(error)s, HTTP status code %(code)d',
'result %(res)s, error %(error)s, HTTP status code %(code)s',
{'node': node.uuid, 'method': method,
'res': result.get('command_result'),
'error': error,
'code': response.status_code})
if response.status_code >= http_client.BAD_REQUEST:
'code': response.status_code if response is not None
else 'unknown'})
if (response is not None
and response.status_code >= http_client.BAD_REQUEST):
faultstring = result.get('faultstring')
if 'agent_token' in faultstring:
LOG.error('Agent command %(method)s for node %(node)s '
@ -315,6 +316,61 @@ class AgentClient(object):
{'node': node.uuid, 'status': status})
return result
def _status_if_last_command_matches(self, node, method, params):
"""Return the status of the given command if it's the last running."""
try:
method = method.split('.', 1)[1]
except IndexError:
pass
commands = self.get_commands_status(node)
if not commands:
return None
# TODO(dtantsur): a more reliable way to detect repeated execution
# would be to pass a sort of require ID to the agent.
command = commands[-1]
if command['command_name'] != method:
LOG.debug('Command %(cmd)s is not currently executing, the last '
'command is %(curr)s',
{'cmd': method, 'curr': command['command_name']})
return None
if command['command_status'] != 'RUNNING':
LOG.debug('Command %(cmd)s is not currently executing, its status '
'is %(curr)s',
{'cmd': method, 'curr': command['command_status']})
return None
return command
def _handle_timeout_on_command_execution(self, node, method, params,
error):
result = None
# NOTE(dtantsur): it is possible, especially with eventlet+TLS, that
# agent receives a command but fails to return the result to Ironic.
# To avoid a failure, check if the last command is the one we're trying
# to execute.
try:
result = self._status_if_last_command_matches(node, method, params)
except Exception as e:
msg = (_('Failed to connect to the agent running on node '
'%(node)s for checking the last command status '
'after failing to invoke command %(method)s. '
'Error: %(error)s') %
{'node': node.uuid, 'method': method, 'error': e})
LOG.error(msg)
if result is None:
msg = (_('Failed to connect to the agent running on node %(node)s '
'for invoking command %(method)s. Error: %(error)s') %
{'node': node.uuid, 'method': method, 'error': error})
LOG.error(msg)
raise exception.AgentConnectionFailed(reason=msg)
return result
def get_last_command_status(self, node, method):
"""Get the last status for the given command.

View File

@ -194,6 +194,123 @@ class TestAgentClient(base.TestCase):
timeout=60,
verify=True)
self.assertEqual(3, self.client.session.post.call_count)
self.assertTrue(self.client.session.get.called)
def test__command_fail_connect_no_command_running(self):
error = 'Boom'
self.client.session.post.side_effect = requests.ConnectionError(error)
self.client.session.get.return_value.json.return_value = {
'commands': []
}
method = 'foo.bar'
params = {}
url = self.client._get_command_url(self.node)
self.client._get_command_body(method, params)
e = self.assertRaises(exception.AgentConnectionFailed,
self.client._command,
self.node, method, params)
self.assertEqual('Connection to agent failed: Failed to connect to '
'the agent running on node %(node)s for invoking '
'command %(method)s. Error: %(error)s' %
{'method': method, 'node': self.node.uuid,
'error': error}, str(e))
self.client.session.post.assert_called_with(
url,
data=mock.ANY,
params={'wait': 'false'},
timeout=60,
verify=True)
self.assertEqual(3, self.client.session.post.call_count)
self.assertTrue(self.client.session.get.called)
def test__command_fail_connect_wrong_command_running(self):
error = 'Boom'
self.client.session.post.side_effect = requests.ConnectionError(error)
self.client.session.get.return_value.json.return_value = {
'commands': [
{'command_name': 'meow', 'command_status': 'RUNNING'},
]
}
method = 'foo.bar'
params = {}
url = self.client._get_command_url(self.node)
self.client._get_command_body(method, params)
e = self.assertRaises(exception.AgentConnectionFailed,
self.client._command,
self.node, method, params)
self.assertEqual('Connection to agent failed: Failed to connect to '
'the agent running on node %(node)s for invoking '
'command %(method)s. Error: %(error)s' %
{'method': method, 'node': self.node.uuid,
'error': error}, str(e))
self.client.session.post.assert_called_with(
url,
data=mock.ANY,
params={'wait': 'false'},
timeout=60,
verify=True)
self.assertEqual(3, self.client.session.post.call_count)
self.assertTrue(self.client.session.get.called)
def test__command_fail_connect_command_not_running(self):
error = 'Boom'
self.client.session.post.side_effect = requests.ConnectionError(error)
self.client.session.get.return_value.json.return_value = {
'commands': [
{'command_name': 'bar', 'command_status': 'FINISHED'},
]
}
method = 'foo.bar'
params = {}
url = self.client._get_command_url(self.node)
self.client._get_command_body(method, params)
e = self.assertRaises(exception.AgentConnectionFailed,
self.client._command,
self.node, method, params)
self.assertEqual('Connection to agent failed: Failed to connect to '
'the agent running on node %(node)s for invoking '
'command %(method)s. Error: %(error)s' %
{'method': method, 'node': self.node.uuid,
'error': error}, str(e))
self.client.session.post.assert_called_with(
url,
data=mock.ANY,
params={'wait': 'false'},
timeout=60,
verify=True)
self.assertEqual(3, self.client.session.post.call_count)
self.assertTrue(self.client.session.get.called)
def test__command_fail_connect_command_is_running(self):
error = 'Boom'
self.client.session.post.side_effect = requests.ConnectionError(error)
self.client.session.get.return_value.json.return_value = {
'commands': [
{'command_name': 'bar', 'command_status': 'RUNNING'},
]
}
method = 'foo.bar'
params = {}
url = self.client._get_command_url(self.node)
self.client._get_command_body(method, params)
result = self.client._command(self.node, method, params)
self.assertEqual({'command_name': 'bar', 'command_status': 'RUNNING'},
result)
self.client.session.post.assert_called_once_with(
url,
data=mock.ANY,
params={'wait': 'false'},
timeout=60,
verify=True)
self.assertTrue(self.client.session.get.called)
def test__command_error_code(self):
response_text = {"faultstring": "you dun goofd"}

View File

@ -0,0 +1,9 @@
---
fixes:
- |
If the agent accepts a command, but is unable to reply to Ironic (which
sporadically happens before of the eventlet's TLS implementation), we
currently retry the request and fail because the command is already
executing. Ironic now detects this situation by checking the list of
executing commands after receiving a connection error. If the requested
command is the last one, we assume that the command request succeeded.