From 7828fe8b64c0681727c8b09bfd0af8452ad93645 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Sun, 7 Jun 2020 13:05:13 +0200 Subject: [PATCH] agent: poll long-running commands till completion Currently for install_bootloader we use wait=True with a longer timeout. As a more robust alternative, poll the agent until the command completes. This avoids trying to guess how long the command will actually take. Change-Id: I62e9086441fa2b164aee42f7489d12aed4076f49 Story: #2006963 --- ironic/common/exception.py | 4 + ironic/conf/agent.py | 14 ++- ironic/drivers/modules/agent_client.py | 91 +++++++++++------ .../unit/drivers/modules/test_agent_client.py | 97 ++++++++++++++----- .../agent-client-poll-ce16fd589e88c95a.yaml | 7 ++ 5 files changed, 157 insertions(+), 56 deletions(-) create mode 100644 releasenotes/notes/agent-client-poll-ce16fd589e88c95a.yaml diff --git a/ironic/common/exception.py b/ironic/common/exception.py index 221a21ca43..a314e75e01 100644 --- a/ironic/common/exception.py +++ b/ironic/common/exception.py @@ -664,6 +664,10 @@ class AgentConnectionFailed(IronicException): _msg_fmt = _("Connection to agent failed: %(reason)s") +class AgentCommandTimeout(IronicException): + _msg_fmt = _("Timeout executing command %(command)s on node %(node)s") + + class NodeProtected(HTTPForbidden): _msg_fmt = _("Node %(node)s is protected and cannot be undeployed, " "rebuilt or deleted") diff --git a/ironic/conf/agent.py b/ironic/conf/agent.py index 64299742c4..8572a25bd7 100644 --- a/ironic/conf/agent.py +++ b/ironic/conf/agent.py @@ -113,16 +113,20 @@ opts = [ cfg.IntOpt('command_timeout', default=60, mutable=True, - help=_('Timeout (in seconds) for IPA commands. ' - 'Please note, the bootloader installation command ' - 'to the agent is permitted a timeout of twice the ' - 'value set here as these are IO heavy operations ' - 'depending on the configuration of the instance.')), + help=_('Timeout (in seconds) for IPA commands.')), cfg.IntOpt('max_command_attempts', default=3, help=_('This is the maximum number of attempts that will be ' 'done for IPA commands that fails due to network ' 'problems.')), + cfg.IntOpt('command_wait_attempts', + default=100, + help=_('Number of attempts to check for asynchronous commands ' + 'completion before timing out.')), + cfg.IntOpt('command_wait_interval', + default=6, + help=_('Number of seconds to wait for between checks for ' + 'asynchronous commands completion.')), cfg.IntOpt('neutron_agent_poll_interval', default=2, mutable=True, diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index 3edd891969..74f03cc2f6 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -56,13 +56,60 @@ class AgentClient(object): 'params': params, }) + def _raise_if_typeerror(self, result, node, method): + error = result.get('command_error') + if error and error.get('type') == 'TypeError': + LOG.error('Agent command %(method)s for node %(node)s failed. ' + 'Internal TypeError detected: Error %(error)s', + {'method': method, 'node': node.uuid, 'error': error}) + raise exception.AgentAPIError(node=node.uuid, + status=error.get('code'), + error=result.get('faultstring')) + + @METRICS.timer('AgentClient._wait_for_command') + @retrying.retry( + retry_on_exception=( + lambda e: isinstance(e, exception.AgentCommandTimeout)), + stop_max_attempt_number=CONF.agent.command_wait_attempts, + wait_fixed=CONF.agent.command_wait_interval * 1000) + def _wait_for_command(self, node, method): + """Wait for a command to complete. + + :param node: A Node object. + :param method: A string represents the command executed by agent. + """ + try: + method = method.split('.', 1)[1] + except IndexError: + pass + + commands = self.get_commands_status(node) + try: + result = next(c for c in reversed(commands) + if c.get('command_name') == method) + except StopIteration: + LOG.debug('Command %(cmd)s is not in the executing commands list ' + 'for node %(node)s', + {'cmd': method, 'node': node.uuid}) + raise exception.AgentCommandTimeout(command=method, node=node.uuid) + + if result.get('command_status') == 'RUNNING': + LOG.debug('Command %(cmd)s has not finished yet for node %(node)s', + {'cmd': method, 'node': node.uuid}) + raise exception.AgentCommandTimeout(command=method, node=node.uuid) + else: + LOG.debug('Command %(cmd)s has finished for node %(node)s with ' + 'result %(result)s', + {'cmd': method, 'node': node.uuid, 'result': result}) + self._raise_if_typeerror(result, node, method) + return result + @METRICS.timer('AgentClient._command') @retrying.retry( retry_on_exception=( lambda e: isinstance(e, exception.AgentConnectionFailed)), stop_max_attempt_number=CONF.agent.max_command_attempts) - def _command(self, node, method, params, wait=False, - command_timeout_factor=1): + def _command(self, node, method, params, wait=False, poll=False): """Sends command to agent. :param node: A Node object. @@ -72,19 +119,16 @@ class AgentClient(object): body. :param wait: True to wait for the command to finish executing, False otherwise. - :param command_timeout_factor: An integer, default 1, by which to - multiply the [agent]command_timeout - value. This is intended for use with - extremely long running commands to - the agent ramdisk where a general - timeout value should not be extended - in all cases. + :param poll: Whether to poll the command until completion. Provides + a better alternative to `wait` for long-running commands. :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. :returns: A dict containing command result from agent, see get_commands_status for a sample. """ + assert not (wait and poll) + url = self._get_command_url(node) body = self._get_command_body(method, params) request_params = { @@ -99,7 +143,7 @@ class AgentClient(object): try: response = self.session.post( url, params=request_params, data=body, - timeout=CONF.agent.command_timeout * command_timeout_factor) + timeout=CONF.agent.command_timeout) except (requests.ConnectionError, requests.Timeout) as e: msg = (_('Failed to connect to the agent running on node %(node)s ' 'for invoking command %(method)s. Error: %(error)s') % @@ -128,12 +172,6 @@ class AgentClient(object): raise exception.IronicException(msg) error = result.get('command_error') - exc_type = None - if error: - # if an error, we should see if a type field exists. This type - # field may signal an exception that is compatability based. - exc_type = error.get('type') - LOG.debug('Agent command %(method)s for node %(node)s returned ' 'result %(res)s, error %(error)s, HTTP status code %(code)d', {'node': node.uuid, 'method': method, @@ -149,14 +187,11 @@ class AgentClient(object): raise exception.AgentAPIError(node=node.uuid, status=response.status_code, error=result.get('faultstring')) - if exc_type == 'TypeError': - LOG.error('Agent command %(method)s for node %(node)s failed. ' - 'Internal %(exc_type)s error detected: Error %(error)s', - {'method': method, 'node': node.uuid, - 'exc_type': exc_type, 'error': error}) - raise exception.AgentAPIError(node=node.uuid, - status=error.get('code'), - error=result.get('faultstring')) + + self._raise_if_typeerror(result, node, method) + + if poll: + result = self._wait_for_command(node, method) return result @@ -245,7 +280,7 @@ class AgentClient(object): return self._command(node=node, method='standby.prepare_image', params=params, - wait=wait) + poll=wait) @METRICS.timer('AgentClient.start_iscsi_target') def start_iscsi_target(self, node, iqn, @@ -313,8 +348,7 @@ class AgentClient(object): return self._command(node=node, method='image.install_bootloader', params=params, - wait=True, - command_timeout_factor=2) + poll=True) except exception.AgentAPIError: # NOTE(arne_wiebalck): If for software RAID and 'uefi' as the boot # mode, we find that the IPA does not yet support the additional @@ -338,8 +372,7 @@ class AgentClient(object): return self._command(node=node, method='image.install_bootloader', params=params, - wait=True, - command_timeout_factor=2) + poll=True) @METRICS.timer('AgentClient.get_clean_steps') def get_clean_steps(self, node, ports): diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py index 4fdf20a888..22c8575555 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_client.py +++ b/ironic/tests/unit/drivers/modules/test_agent_client.py @@ -29,13 +29,29 @@ CONF = conf.CONF class MockResponse(object): - def __init__(self, text, status_code=http_client.OK): - assert isinstance(text, str) + def __init__(self, data=None, status_code=http_client.OK, text=None): + assert not (data and text) self.text = text + self.data = data self.status_code = status_code def json(self): - return json.loads(self.text) + if self.text: + return json.loads(self.text) + else: + return self.data + + +class MockCommandStatus(MockResponse): + def __init__(self, status, name='fake', error=None): + super().__init__({ + 'commands': [ + {'command_name': name, + 'command_status': status, + 'command_result': 'I did something', + 'command_error': error} + ] + }) class MockNode(object): @@ -87,8 +103,7 @@ class TestAgentClient(base.TestCase): def test__command(self): response_data = {'status': 'ok'} - response_text = json.dumps(response_data) - self.client.session.post.return_value = MockResponse(response_text) + self.client.session.post.return_value = MockResponse(response_data) method = 'standby.run_image' image_info = {'image_id': 'test_image'} params = {'image_info': image_info} @@ -106,7 +121,8 @@ class TestAgentClient(base.TestCase): def test__command_fail_json(self): response_text = 'this be not json matey!' - self.client.session.post.return_value = MockResponse(response_text) + self.client.session.post.return_value = MockResponse( + text=response_text) method = 'standby.run_image' image_info = {'image_id': 'test_image'} params = {'image_info': image_info} @@ -159,7 +175,7 @@ class TestAgentClient(base.TestCase): 'error': error}, str(e)) def test__command_error_code(self): - response_text = '{"faultstring": "you dun goofd"}' + response_text = {"faultstring": "you dun goofd"} self.client.session.post.return_value = MockResponse( response_text, status_code=http_client.BAD_REQUEST) method = 'standby.run_image' @@ -179,10 +195,9 @@ class TestAgentClient(base.TestCase): timeout=60) def test__command_error_code_okay_error_typeerror_embedded(self): - response_text = ('{"faultstring": "you dun goofd", ' - '"command_error": {"type": "TypeError"}}') - self.client.session.post.return_value = MockResponse( - response_text) + response_data = {"faultstring": "you dun goofd", + "command_error": {"type": "TypeError"}} + self.client.session.post.return_value = MockResponse(response_data) method = 'standby.run_image' image_info = {'image_id': 'test_image'} params = {'image_info': image_info} @@ -199,6 +214,36 @@ class TestAgentClient(base.TestCase): params={'wait': 'false'}, timeout=60) + @mock.patch('time.sleep', lambda seconds: None) + def test__command_poll(self): + response_data = {'status': 'ok'} + final_status = MockCommandStatus('SUCCEEDED', name='run_image') + self.client.session.post.return_value = MockResponse(response_data) + self.client.session.get.side_effect = [ + MockCommandStatus('RUNNING', name='run_image'), + final_status, + ] + + method = 'standby.run_image' + image_info = {'image_id': 'test_image'} + params = {'image_info': image_info} + expected = {'command_error': None, + 'command_name': 'run_image', + 'command_result': 'I did something', + 'command_status': 'SUCCEEDED'} + + url = self.client._get_command_url(self.node) + body = self.client._get_command_body(method, params) + + response = self.client._command(self.node, method, params, poll=True) + self.assertEqual(expected, response) + self.client.session.post.assert_called_once_with( + url, + data=body, + params={'wait': 'false'}, + timeout=60) + self.client.session.get.assert_called_with(url, timeout=60) + def test_get_commands_status(self): with mock.patch.object(self.client.session, 'get', autospec=True) as mock_get: @@ -234,7 +279,7 @@ class TestAgentClient(base.TestCase): wait=False) self.client._command.assert_called_once_with( node=self.node, method='standby.prepare_image', - params=params, wait=False) + params=params, poll=False) def test_prepare_image_with_configdrive(self): self.client._command = mock.MagicMock(spec_set=[]) @@ -251,7 +296,19 @@ class TestAgentClient(base.TestCase): wait=False) self.client._command.assert_called_once_with( node=self.node, method='standby.prepare_image', - params=params, wait=False) + params=params, poll=False) + + def test_prepare_image_with_wait(self): + self.client._command = mock.MagicMock(spec_set=[]) + image_info = {'image_id': 'image'} + params = {'image_info': image_info} + + self.client.prepare_image(self.node, + image_info, + wait=True) + self.client._command.assert_called_once_with( + node=self.node, method='standby.prepare_image', + params=params, poll=True) def test_start_iscsi_target(self): self.client._command = mock.MagicMock(spec_set=[]) @@ -305,9 +362,8 @@ class TestAgentClient(base.TestCase): self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid, prep_boot_part_uuid=prep_boot_part_uuid, target_boot_mode='hello') self.client._command.assert_called_once_with( - command_timeout_factor=2, node=self.node, - method='image.install_bootloader', params=params, - wait=True) + node=self.node, method='image.install_bootloader', params=params, + poll=True) def test_install_bootloader(self): self._test_install_bootloader(root_uuid='fake-root-uuid', @@ -415,8 +471,7 @@ class TestAgentClient(base.TestCase): def test__command_agent_client(self): response_data = {'status': 'ok'} - response_text = json.dumps(response_data) - self.client.session.post.return_value = MockResponse(response_text) + self.client.session.post.return_value = MockResponse(response_data) method = 'standby.run_image' image_info = {'image_id': 'test_image'} params = {'image_info': image_info} @@ -472,13 +527,12 @@ class TestAgentClientAttempts(base.TestCase): mock_sleep.return_value = None error = 'Connection Timeout' response_data = {'status': 'ok'} - response_text = json.dumps(response_data) method = 'standby.run_image' image_info = {'image_id': 'test_image'} params = {'image_info': image_info} self.client.session.post.side_effect = [requests.Timeout(error), requests.Timeout(error), - MockResponse(response_text)] + MockResponse(response_data)] response = self.client._command(self.node, method, params) self.assertEqual(3, self.client.session.post.call_count) @@ -494,12 +548,11 @@ class TestAgentClientAttempts(base.TestCase): mock_sleep.return_value = None error = 'Connection Timeout' response_data = {'status': 'ok'} - response_text = json.dumps(response_data) method = 'standby.run_image' image_info = {'image_id': 'test_image'} params = {'image_info': image_info} self.client.session.post.side_effect = [requests.Timeout(error), - MockResponse(response_text), + MockResponse(response_data), requests.Timeout(error)] response = self.client._command(self.node, method, params) diff --git a/releasenotes/notes/agent-client-poll-ce16fd589e88c95a.yaml b/releasenotes/notes/agent-client-poll-ce16fd589e88c95a.yaml new file mode 100644 index 0000000000..693adebe8d --- /dev/null +++ b/releasenotes/notes/agent-client-poll-ce16fd589e88c95a.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Instead of increasing timeout when running long synchronous tasks on + ironic-python-agent, ironic now runs them asynchronously and polls + the agent until completion. It is no longer necessary to account for + long-running tasks when setting ``[agent]command_timeout``.