agent: poll long-running commands till completion
Currently for install_bootloader we use wait=True with a longer timeout. As a more robust alternative, poll the agent until the command completes. This avoids trying to guess how long the command will actually take. Change-Id: I62e9086441fa2b164aee42f7489d12aed4076f49 Story: #2006963
This commit is contained in:
parent
a2ad31ddef
commit
7828fe8b64
@ -664,6 +664,10 @@ class AgentConnectionFailed(IronicException):
|
||||
_msg_fmt = _("Connection to agent failed: %(reason)s")
|
||||
|
||||
|
||||
class AgentCommandTimeout(IronicException):
    """Error carrying a timeout message for an agent command on a node."""

    _msg_fmt = _("Timeout executing command %(command)s on node %(node)s")
|
||||
|
||||
|
||||
class NodeProtected(HTTPForbidden):
|
||||
_msg_fmt = _("Node %(node)s is protected and cannot be undeployed, "
|
||||
"rebuilt or deleted")
|
||||
|
@ -113,16 +113,20 @@ opts = [
|
||||
cfg.IntOpt('command_timeout',
|
||||
default=60,
|
||||
mutable=True,
|
||||
help=_('Timeout (in seconds) for IPA commands. '
|
||||
'Please note, the bootloader installation command '
|
||||
'to the agent is permitted a timeout of twice the '
|
||||
'value set here as these are IO heavy operations '
|
||||
'depending on the configuration of the instance.')),
|
||||
help=_('Timeout (in seconds) for IPA commands.')),
|
||||
cfg.IntOpt('max_command_attempts',
|
||||
default=3,
|
||||
help=_('This is the maximum number of attempts that will be '
|
||||
'done for IPA commands that fails due to network '
|
||||
'problems.')),
|
||||
cfg.IntOpt('command_wait_attempts',
|
||||
default=100,
|
||||
help=_('Number of attempts to check for asynchronous commands '
|
||||
'completion before timing out.')),
|
||||
cfg.IntOpt('command_wait_interval',
|
||||
default=6,
|
||||
help=_('Number of seconds to wait for between checks for '
|
||||
'asynchronous commands completion.')),
|
||||
cfg.IntOpt('neutron_agent_poll_interval',
|
||||
default=2,
|
||||
mutable=True,
|
||||
|
@ -56,13 +56,60 @@ class AgentClient(object):
|
||||
'params': params,
|
||||
})
|
||||
|
||||
def _raise_if_typeerror(self, result, node, method):
    """Raise AgentAPIError if the agent reported an internal TypeError.

    :param result: A dict with the command result returned by the agent;
        its ``command_error`` entry, when set, is inspected.
    :param node: A Node object.
    :param method: A string naming the command that was executed.
    :raises: AgentAPIError when ``command_error`` has the type
        ``TypeError``.
    """
    command_error = result.get('command_error')
    # Nothing to do unless the agent attached an error of type TypeError.
    if not command_error or command_error.get('type') != 'TypeError':
        return

    LOG.error('Agent command %(method)s for node %(node)s failed. '
              'Internal TypeError detected: Error %(error)s',
              {'method': method, 'node': node.uuid, 'error': command_error})
    raise exception.AgentAPIError(node=node.uuid,
                                  status=command_error.get('code'),
                                  error=result.get('faultstring'))
|
||||
|
||||
@METRICS.timer('AgentClient._wait_for_command')
def _wait_for_command(self, node, method):
    """Poll the agent until the given command is no longer running.

    The retry policy is built on every call so that the current values of
    ``[agent]command_wait_attempts`` and ``[agent]command_wait_interval``
    are used, rather than the values they had when this module was
    imported.

    :param node: A Node object.
    :param method: A string naming the command executed by the agent,
        optionally prefixed with the extension name (for example
        ``standby.run_image``).
    :raises: AgentCommandTimeout if the command is still running, or is
        missing from the agent's command list, once the configured
        number of attempts is exhausted.
    :raises: AgentAPIError if the finished command reports an internal
        TypeError.
    :returns: A dict describing the finished command, as found in the
        list returned by get_commands_status.
    """
    # Command names are matched without the extension prefix, so drop
    # everything up to and including the first dot when there is one.
    _extension, dot, short_name = method.partition('.')
    if dot:
        method = short_name

    # NOTE: AgentCommandTimeout is raised on every unsuccessful check; the
    # retry wrapper swallows it until the attempts run out.
    @retrying.retry(
        retry_on_exception=(
            lambda e: isinstance(e, exception.AgentCommandTimeout)),
        stop_max_attempt_number=CONF.agent.command_wait_attempts,
        wait_fixed=CONF.agent.command_wait_interval * 1000)
    def _poll():
        commands = self.get_commands_status(node)
        # Search from the end so that the last listed entry with this
        # name is the one that gets inspected.
        result = next((c for c in reversed(commands)
                       if c.get('command_name') == method), None)
        if result is None:
            LOG.debug('Command %(cmd)s is not in the executing commands '
                      'list for node %(node)s',
                      {'cmd': method, 'node': node.uuid})
            raise exception.AgentCommandTimeout(command=method,
                                                node=node.uuid)

        if result.get('command_status') == 'RUNNING':
            LOG.debug('Command %(cmd)s has not finished yet for node '
                      '%(node)s',
                      {'cmd': method, 'node': node.uuid})
            raise exception.AgentCommandTimeout(command=method,
                                                node=node.uuid)

        LOG.debug('Command %(cmd)s has finished for node %(node)s with '
                  'result %(result)s',
                  {'cmd': method, 'node': node.uuid, 'result': result})
        self._raise_if_typeerror(result, node, method)
        return result

    return _poll()
|
||||
|
||||
@METRICS.timer('AgentClient._command')
|
||||
@retrying.retry(
|
||||
retry_on_exception=(
|
||||
lambda e: isinstance(e, exception.AgentConnectionFailed)),
|
||||
stop_max_attempt_number=CONF.agent.max_command_attempts)
|
||||
def _command(self, node, method, params, wait=False,
|
||||
command_timeout_factor=1):
|
||||
def _command(self, node, method, params, wait=False, poll=False):
|
||||
"""Sends command to agent.
|
||||
|
||||
:param node: A Node object.
|
||||
@ -72,19 +119,16 @@ class AgentClient(object):
|
||||
body.
|
||||
:param wait: True to wait for the command to finish executing, False
|
||||
otherwise.
|
||||
:param command_timeout_factor: An integer, default 1, by which to
|
||||
multiply the [agent]command_timeout
|
||||
value. This is intended for use with
|
||||
extremely long running commands to
|
||||
the agent ramdisk where a general
|
||||
timeout value should not be extended
|
||||
in all cases.
|
||||
:param poll: Whether to poll the command until completion. Provides
|
||||
a better alternative to `wait` for long-running commands.
|
||||
:raises: IronicException when failed to issue the request or there was
|
||||
a malformed response from the agent.
|
||||
:raises: AgentAPIError when agent failed to execute specified command.
|
||||
:returns: A dict containing command result from agent, see
|
||||
get_commands_status for a sample.
|
||||
"""
|
||||
assert not (wait and poll)
|
||||
|
||||
url = self._get_command_url(node)
|
||||
body = self._get_command_body(method, params)
|
||||
request_params = {
|
||||
@ -99,7 +143,7 @@ class AgentClient(object):
|
||||
try:
|
||||
response = self.session.post(
|
||||
url, params=request_params, data=body,
|
||||
timeout=CONF.agent.command_timeout * command_timeout_factor)
|
||||
timeout=CONF.agent.command_timeout)
|
||||
except (requests.ConnectionError, requests.Timeout) as e:
|
||||
msg = (_('Failed to connect to the agent running on node %(node)s '
|
||||
'for invoking command %(method)s. Error: %(error)s') %
|
||||
@ -128,12 +172,6 @@ class AgentClient(object):
|
||||
raise exception.IronicException(msg)
|
||||
|
||||
error = result.get('command_error')
|
||||
exc_type = None
|
||||
if error:
|
||||
# if an error, we should see if a type field exists. This type
|
||||
# field may signal an exception that is compatability based.
|
||||
exc_type = error.get('type')
|
||||
|
||||
LOG.debug('Agent command %(method)s for node %(node)s returned '
|
||||
'result %(res)s, error %(error)s, HTTP status code %(code)d',
|
||||
{'node': node.uuid, 'method': method,
|
||||
@ -149,14 +187,11 @@ class AgentClient(object):
|
||||
raise exception.AgentAPIError(node=node.uuid,
|
||||
status=response.status_code,
|
||||
error=result.get('faultstring'))
|
||||
if exc_type == 'TypeError':
|
||||
LOG.error('Agent command %(method)s for node %(node)s failed. '
|
||||
'Internal %(exc_type)s error detected: Error %(error)s',
|
||||
{'method': method, 'node': node.uuid,
|
||||
'exc_type': exc_type, 'error': error})
|
||||
raise exception.AgentAPIError(node=node.uuid,
|
||||
status=error.get('code'),
|
||||
error=result.get('faultstring'))
|
||||
|
||||
self._raise_if_typeerror(result, node, method)
|
||||
|
||||
if poll:
|
||||
result = self._wait_for_command(node, method)
|
||||
|
||||
return result
|
||||
|
||||
@ -245,7 +280,7 @@ class AgentClient(object):
|
||||
return self._command(node=node,
|
||||
method='standby.prepare_image',
|
||||
params=params,
|
||||
wait=wait)
|
||||
poll=wait)
|
||||
|
||||
@METRICS.timer('AgentClient.start_iscsi_target')
|
||||
def start_iscsi_target(self, node, iqn,
|
||||
@ -313,8 +348,7 @@ class AgentClient(object):
|
||||
return self._command(node=node,
|
||||
method='image.install_bootloader',
|
||||
params=params,
|
||||
wait=True,
|
||||
command_timeout_factor=2)
|
||||
poll=True)
|
||||
except exception.AgentAPIError:
|
||||
# NOTE(arne_wiebalck): If for software RAID and 'uefi' as the boot
|
||||
# mode, we find that the IPA does not yet support the additional
|
||||
@ -338,8 +372,7 @@ class AgentClient(object):
|
||||
return self._command(node=node,
|
||||
method='image.install_bootloader',
|
||||
params=params,
|
||||
wait=True,
|
||||
command_timeout_factor=2)
|
||||
poll=True)
|
||||
|
||||
@METRICS.timer('AgentClient.get_clean_steps')
|
||||
def get_clean_steps(self, node, ports):
|
||||
|
@ -29,13 +29,29 @@ CONF = conf.CONF
|
||||
|
||||
|
||||
class MockResponse(object):
|
||||
def __init__(self, text, status_code=http_client.OK):
|
||||
assert isinstance(text, str)
|
||||
def __init__(self, data=None, status_code=http_client.OK, text=None):
|
||||
assert not (data and text)
|
||||
self.text = text
|
||||
self.data = data
|
||||
self.status_code = status_code
|
||||
|
||||
def json(self):
|
||||
if self.text:
|
||||
return json.loads(self.text)
|
||||
else:
|
||||
return self.data
|
||||
|
||||
|
||||
class MockCommandStatus(MockResponse):
    """Mock response whose data is a ``commands`` list with one entry."""

    def __init__(self, status, name='fake', error=None):
        # The single command entry placed in the ``commands`` list.
        command = {
            'command_name': name,
            'command_status': status,
            'command_result': 'I did something',
            'command_error': error,
        }
        super().__init__({'commands': [command]})
|
||||
|
||||
|
||||
class MockNode(object):
|
||||
@ -87,8 +103,7 @@ class TestAgentClient(base.TestCase):
|
||||
|
||||
def test__command(self):
|
||||
response_data = {'status': 'ok'}
|
||||
response_text = json.dumps(response_data)
|
||||
self.client.session.post.return_value = MockResponse(response_text)
|
||||
self.client.session.post.return_value = MockResponse(response_data)
|
||||
method = 'standby.run_image'
|
||||
image_info = {'image_id': 'test_image'}
|
||||
params = {'image_info': image_info}
|
||||
@ -106,7 +121,8 @@ class TestAgentClient(base.TestCase):
|
||||
|
||||
def test__command_fail_json(self):
|
||||
response_text = 'this be not json matey!'
|
||||
self.client.session.post.return_value = MockResponse(response_text)
|
||||
self.client.session.post.return_value = MockResponse(
|
||||
text=response_text)
|
||||
method = 'standby.run_image'
|
||||
image_info = {'image_id': 'test_image'}
|
||||
params = {'image_info': image_info}
|
||||
@ -159,7 +175,7 @@ class TestAgentClient(base.TestCase):
|
||||
'error': error}, str(e))
|
||||
|
||||
def test__command_error_code(self):
|
||||
response_text = '{"faultstring": "you dun goofd"}'
|
||||
response_text = {"faultstring": "you dun goofd"}
|
||||
self.client.session.post.return_value = MockResponse(
|
||||
response_text, status_code=http_client.BAD_REQUEST)
|
||||
method = 'standby.run_image'
|
||||
@ -179,10 +195,9 @@ class TestAgentClient(base.TestCase):
|
||||
timeout=60)
|
||||
|
||||
def test__command_error_code_okay_error_typeerror_embedded(self):
|
||||
response_text = ('{"faultstring": "you dun goofd", '
|
||||
'"command_error": {"type": "TypeError"}}')
|
||||
self.client.session.post.return_value = MockResponse(
|
||||
response_text)
|
||||
response_data = {"faultstring": "you dun goofd",
|
||||
"command_error": {"type": "TypeError"}}
|
||||
self.client.session.post.return_value = MockResponse(response_data)
|
||||
method = 'standby.run_image'
|
||||
image_info = {'image_id': 'test_image'}
|
||||
params = {'image_info': image_info}
|
||||
@ -199,6 +214,36 @@ class TestAgentClient(base.TestCase):
|
||||
params={'wait': 'false'},
|
||||
timeout=60)
|
||||
|
||||
@mock.patch('time.sleep', lambda seconds: None)
def test__command_poll(self):
    """_command(poll=True) keeps checking status until the command ends."""
    session = self.client.session
    session.post.return_value = MockResponse({'status': 'ok'})
    # The first status check reports RUNNING, the second one SUCCEEDED.
    session.get.side_effect = [
        MockCommandStatus('RUNNING', name='run_image'),
        MockCommandStatus('SUCCEEDED', name='run_image'),
    ]

    method = 'standby.run_image'
    params = {'image_info': {'image_id': 'test_image'}}
    url = self.client._get_command_url(self.node)
    body = self.client._get_command_body(method, params)

    response = self.client._command(self.node, method, params, poll=True)

    self.assertEqual(
        {'command_error': None,
         'command_name': 'run_image',
         'command_result': 'I did something',
         'command_status': 'SUCCEEDED'},
        response)
    session.post.assert_called_once_with(
        url, data=body, params={'wait': 'false'}, timeout=60)
    session.get.assert_called_with(url, timeout=60)
|
||||
|
||||
def test_get_commands_status(self):
|
||||
with mock.patch.object(self.client.session, 'get',
|
||||
autospec=True) as mock_get:
|
||||
@ -234,7 +279,7 @@ class TestAgentClient(base.TestCase):
|
||||
wait=False)
|
||||
self.client._command.assert_called_once_with(
|
||||
node=self.node, method='standby.prepare_image',
|
||||
params=params, wait=False)
|
||||
params=params, poll=False)
|
||||
|
||||
def test_prepare_image_with_configdrive(self):
|
||||
self.client._command = mock.MagicMock(spec_set=[])
|
||||
@ -251,7 +296,19 @@ class TestAgentClient(base.TestCase):
|
||||
wait=False)
|
||||
self.client._command.assert_called_once_with(
|
||||
node=self.node, method='standby.prepare_image',
|
||||
params=params, wait=False)
|
||||
params=params, poll=False)
|
||||
|
||||
def test_prepare_image_with_wait(self):
    """prepare_image(wait=True) calls _command with poll=True."""
    image_info = {'image_id': 'image'}
    self.client._command = mock.MagicMock(spec_set=[])

    self.client.prepare_image(self.node, image_info, wait=True)

    self.client._command.assert_called_once_with(
        node=self.node,
        method='standby.prepare_image',
        params={'image_info': image_info},
        poll=True)
|
||||
|
||||
def test_start_iscsi_target(self):
|
||||
self.client._command = mock.MagicMock(spec_set=[])
|
||||
@ -305,9 +362,8 @@ class TestAgentClient(base.TestCase):
|
||||
self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid,
|
||||
prep_boot_part_uuid=prep_boot_part_uuid, target_boot_mode='hello')
|
||||
self.client._command.assert_called_once_with(
|
||||
command_timeout_factor=2, node=self.node,
|
||||
method='image.install_bootloader', params=params,
|
||||
wait=True)
|
||||
node=self.node, method='image.install_bootloader', params=params,
|
||||
poll=True)
|
||||
|
||||
def test_install_bootloader(self):
|
||||
self._test_install_bootloader(root_uuid='fake-root-uuid',
|
||||
@ -415,8 +471,7 @@ class TestAgentClient(base.TestCase):
|
||||
|
||||
def test__command_agent_client(self):
|
||||
response_data = {'status': 'ok'}
|
||||
response_text = json.dumps(response_data)
|
||||
self.client.session.post.return_value = MockResponse(response_text)
|
||||
self.client.session.post.return_value = MockResponse(response_data)
|
||||
method = 'standby.run_image'
|
||||
image_info = {'image_id': 'test_image'}
|
||||
params = {'image_info': image_info}
|
||||
@ -472,13 +527,12 @@ class TestAgentClientAttempts(base.TestCase):
|
||||
mock_sleep.return_value = None
|
||||
error = 'Connection Timeout'
|
||||
response_data = {'status': 'ok'}
|
||||
response_text = json.dumps(response_data)
|
||||
method = 'standby.run_image'
|
||||
image_info = {'image_id': 'test_image'}
|
||||
params = {'image_info': image_info}
|
||||
self.client.session.post.side_effect = [requests.Timeout(error),
|
||||
requests.Timeout(error),
|
||||
MockResponse(response_text)]
|
||||
MockResponse(response_data)]
|
||||
|
||||
response = self.client._command(self.node, method, params)
|
||||
self.assertEqual(3, self.client.session.post.call_count)
|
||||
@ -494,12 +548,11 @@ class TestAgentClientAttempts(base.TestCase):
|
||||
mock_sleep.return_value = None
|
||||
error = 'Connection Timeout'
|
||||
response_data = {'status': 'ok'}
|
||||
response_text = json.dumps(response_data)
|
||||
method = 'standby.run_image'
|
||||
image_info = {'image_id': 'test_image'}
|
||||
params = {'image_info': image_info}
|
||||
self.client.session.post.side_effect = [requests.Timeout(error),
|
||||
MockResponse(response_text),
|
||||
MockResponse(response_data),
|
||||
requests.Timeout(error)]
|
||||
|
||||
response = self.client._command(self.node, method, params)
|
||||
|
@ -0,0 +1,7 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Instead of increasing timeout when running long synchronous tasks on
|
||||
ironic-python-agent, ironic now runs them asynchronously and polls
|
||||
the agent until completion. It is no longer necessary to account for
|
||||
long-running tasks when setting ``[agent]command_timeout``.
|
Loading…
Reference in New Issue
Block a user