From 7828fe8b64c0681727c8b09bfd0af8452ad93645 Mon Sep 17 00:00:00 2001
From: Dmitry Tantsur <dtantsur@protonmail.com>
Date: Sun, 7 Jun 2020 13:05:13 +0200
Subject: [PATCH] agent: poll long-running commands till completion

Currently for install_bootloader we use wait=True with a longer
timeout. As a more robust alternative, poll the agent until
the command completes. This avoids trying to guess how long
the command will actually take.

Change-Id: I62e9086441fa2b164aee42f7489d12aed4076f49
Story: #2006963
---
 ironic/common/exception.py                    |  4 +
 ironic/conf/agent.py                          | 14 ++-
 ironic/drivers/modules/agent_client.py        | 91 +++++++++++------
 .../unit/drivers/modules/test_agent_client.py | 97 ++++++++++++++-----
 .../agent-client-poll-ce16fd589e88c95a.yaml   |  7 ++
 5 files changed, 157 insertions(+), 56 deletions(-)
 create mode 100644 releasenotes/notes/agent-client-poll-ce16fd589e88c95a.yaml

diff --git a/ironic/common/exception.py b/ironic/common/exception.py
index 221a21ca43..a314e75e01 100644
--- a/ironic/common/exception.py
+++ b/ironic/common/exception.py
@@ -664,6 +664,10 @@ class AgentConnectionFailed(IronicException):
     _msg_fmt = _("Connection to agent failed: %(reason)s")
 
 
+class AgentCommandTimeout(IronicException):
+    _msg_fmt = _("Timeout executing command %(command)s on node %(node)s")
+
+
 class NodeProtected(HTTPForbidden):
     _msg_fmt = _("Node %(node)s is protected and cannot be undeployed, "
                  "rebuilt or deleted")
diff --git a/ironic/conf/agent.py b/ironic/conf/agent.py
index 64299742c4..8572a25bd7 100644
--- a/ironic/conf/agent.py
+++ b/ironic/conf/agent.py
@@ -113,16 +113,20 @@ opts = [
     cfg.IntOpt('command_timeout',
                default=60,
                mutable=True,
-               help=_('Timeout (in seconds) for IPA commands. '
-                      'Please note, the bootloader installation command '
-                      'to the agent is permitted a timeout of twice the '
-                      'value set here as these are IO heavy operations '
-                      'depending on the configuration of the instance.')),
+               help=_('Timeout (in seconds) for IPA commands.')),
     cfg.IntOpt('max_command_attempts',
                default=3,
                help=_('This is the maximum number of attempts that will be '
                       'done for IPA commands that fails due to network '
                       'problems.')),
+    cfg.IntOpt('command_wait_attempts',
+               default=100,
+               help=_('Number of attempts to check for asynchronous commands '
+                      'completion before timing out.')),
+    cfg.IntOpt('command_wait_interval',
+               default=6,
+               help=_('Number of seconds to wait for between checks for '
+                      'asynchronous commands completion.')),
     cfg.IntOpt('neutron_agent_poll_interval',
                default=2,
                mutable=True,
diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py
index 3edd891969..74f03cc2f6 100644
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@@ -56,13 +56,60 @@ class AgentClient(object):
             'params': params,
         })
 
+    def _raise_if_typeerror(self, result, node, method):
+        error = result.get('command_error')
+        if error and error.get('type') == 'TypeError':
+            LOG.error('Agent command %(method)s for node %(node)s failed. '
+                      'Internal TypeError detected: Error %(error)s',
+                      {'method': method, 'node': node.uuid, 'error': error})
+            raise exception.AgentAPIError(node=node.uuid,
+                                          status=error.get('code'),
+                                          error=result.get('faultstring'))
+
+    @METRICS.timer('AgentClient._wait_for_command')
+    @retrying.retry(
+        retry_on_exception=(
+            lambda e: isinstance(e, exception.AgentCommandTimeout)),
+        stop_max_attempt_number=CONF.agent.command_wait_attempts,
+        wait_fixed=CONF.agent.command_wait_interval * 1000)
+    def _wait_for_command(self, node, method):
+        """Wait for a command to complete.
+
+        :param node: A Node object.
+        :param method: A string represents the command executed by agent.
+        """
+        try:
+            method = method.split('.', 1)[1]
+        except IndexError:
+            pass
+
+        commands = self.get_commands_status(node)
+        try:
+            result = next(c for c in reversed(commands)
+                          if c.get('command_name') == method)
+        except StopIteration:
+            LOG.debug('Command %(cmd)s is not in the executing commands list '
+                      'for node %(node)s',
+                      {'cmd': method, 'node': node.uuid})
+            raise exception.AgentCommandTimeout(command=method, node=node.uuid)
+
+        if result.get('command_status') == 'RUNNING':
+            LOG.debug('Command %(cmd)s has not finished yet for node %(node)s',
+                      {'cmd': method, 'node': node.uuid})
+            raise exception.AgentCommandTimeout(command=method, node=node.uuid)
+        else:
+            LOG.debug('Command %(cmd)s has finished for node %(node)s with '
+                      'result %(result)s',
+                      {'cmd': method, 'node': node.uuid, 'result': result})
+            self._raise_if_typeerror(result, node, method)
+            return result
+
     @METRICS.timer('AgentClient._command')
     @retrying.retry(
         retry_on_exception=(
             lambda e: isinstance(e, exception.AgentConnectionFailed)),
         stop_max_attempt_number=CONF.agent.max_command_attempts)
-    def _command(self, node, method, params, wait=False,
-                 command_timeout_factor=1):
+    def _command(self, node, method, params, wait=False, poll=False):
         """Sends command to agent.
 
         :param node: A Node object.
@@ -72,19 +119,16 @@ class AgentClient(object):
                        body.
         :param wait: True to wait for the command to finish executing, False
                      otherwise.
-        :param command_timeout_factor: An integer, default 1, by which to
-                                       multiply the [agent]command_timeout
-                                       value. This is intended for use with
-                                       extremely long running commands to
-                                       the agent ramdisk where a general
-                                       timeout value should not be extended
-                                       in all cases.
+        :param poll: Whether to poll the command until completion. Provides
+                     a better alternative to `wait` for long-running commands.
         :raises: IronicException when failed to issue the request or there was
                  a malformed response from the agent.
         :raises: AgentAPIError when agent failed to execute specified command.
         :returns: A dict containing command result from agent, see
                   get_commands_status for a sample.
         """
+        assert not (wait and poll)
+
         url = self._get_command_url(node)
         body = self._get_command_body(method, params)
         request_params = {
@@ -99,7 +143,7 @@ class AgentClient(object):
         try:
             response = self.session.post(
                 url, params=request_params, data=body,
-                timeout=CONF.agent.command_timeout * command_timeout_factor)
+                timeout=CONF.agent.command_timeout)
         except (requests.ConnectionError, requests.Timeout) as e:
             msg = (_('Failed to connect to the agent running on node %(node)s '
                      'for invoking command %(method)s. Error: %(error)s') %
@@ -128,12 +172,6 @@ class AgentClient(object):
             raise exception.IronicException(msg)
 
         error = result.get('command_error')
-        exc_type = None
-        if error:
-            # if an error, we should see if a type field exists. This type
-            # field may signal an exception that is compatability based.
-            exc_type = error.get('type')
-
         LOG.debug('Agent command %(method)s for node %(node)s returned '
                   'result %(res)s, error %(error)s, HTTP status code %(code)d',
                   {'node': node.uuid, 'method': method,
@@ -149,14 +187,11 @@ class AgentClient(object):
             raise exception.AgentAPIError(node=node.uuid,
                                           status=response.status_code,
                                           error=result.get('faultstring'))
-        if exc_type == 'TypeError':
-            LOG.error('Agent command %(method)s for node %(node)s failed. '
-                      'Internal %(exc_type)s error detected: Error %(error)s',
-                      {'method': method, 'node': node.uuid,
-                       'exc_type': exc_type, 'error': error})
-            raise exception.AgentAPIError(node=node.uuid,
-                                          status=error.get('code'),
-                                          error=result.get('faultstring'))
+
+        self._raise_if_typeerror(result, node, method)
+
+        if poll:
+            result = self._wait_for_command(node, method)
 
         return result
 
@@ -245,7 +280,7 @@ class AgentClient(object):
         return self._command(node=node,
                              method='standby.prepare_image',
                              params=params,
-                             wait=wait)
+                             poll=wait)
 
     @METRICS.timer('AgentClient.start_iscsi_target')
     def start_iscsi_target(self, node, iqn,
@@ -313,8 +348,7 @@ class AgentClient(object):
             return self._command(node=node,
                                  method='image.install_bootloader',
                                  params=params,
-                                 wait=True,
-                                 command_timeout_factor=2)
+                                 poll=True)
         except exception.AgentAPIError:
             # NOTE(arne_wiebalck): If for software RAID and 'uefi' as the boot
             # mode, we find that the IPA does not yet support the additional
@@ -338,8 +372,7 @@ class AgentClient(object):
                 return self._command(node=node,
                                      method='image.install_bootloader',
                                      params=params,
-                                     wait=True,
-                                     command_timeout_factor=2)
+                                     poll=True)
 
     @METRICS.timer('AgentClient.get_clean_steps')
     def get_clean_steps(self, node, ports):
diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py
index 4fdf20a888..22c8575555 100644
--- a/ironic/tests/unit/drivers/modules/test_agent_client.py
+++ b/ironic/tests/unit/drivers/modules/test_agent_client.py
@@ -29,13 +29,29 @@ CONF = conf.CONF
 
 
 class MockResponse(object):
-    def __init__(self, text, status_code=http_client.OK):
-        assert isinstance(text, str)
+    def __init__(self, data=None, status_code=http_client.OK, text=None):
+        assert not (data and text)
         self.text = text
+        self.data = data
         self.status_code = status_code
 
     def json(self):
-        return json.loads(self.text)
+        if self.text:
+            return json.loads(self.text)
+        else:
+            return self.data
+
+
+class MockCommandStatus(MockResponse):
+    def __init__(self, status, name='fake', error=None):
+        super().__init__({
+            'commands': [
+                {'command_name': name,
+                 'command_status': status,
+                 'command_result': 'I did something',
+                 'command_error': error}
+            ]
+        })
 
 
 class MockNode(object):
@@ -87,8 +103,7 @@ class TestAgentClient(base.TestCase):
 
     def test__command(self):
         response_data = {'status': 'ok'}
-        response_text = json.dumps(response_data)
-        self.client.session.post.return_value = MockResponse(response_text)
+        self.client.session.post.return_value = MockResponse(response_data)
         method = 'standby.run_image'
         image_info = {'image_id': 'test_image'}
         params = {'image_info': image_info}
@@ -106,7 +121,8 @@ class TestAgentClient(base.TestCase):
 
     def test__command_fail_json(self):
         response_text = 'this be not json matey!'
-        self.client.session.post.return_value = MockResponse(response_text)
+        self.client.session.post.return_value = MockResponse(
+            text=response_text)
         method = 'standby.run_image'
         image_info = {'image_id': 'test_image'}
         params = {'image_info': image_info}
@@ -159,7 +175,7 @@ class TestAgentClient(base.TestCase):
                           'error': error}, str(e))
 
     def test__command_error_code(self):
-        response_text = '{"faultstring": "you dun goofd"}'
+        response_text = {"faultstring": "you dun goofd"}
         self.client.session.post.return_value = MockResponse(
             response_text, status_code=http_client.BAD_REQUEST)
         method = 'standby.run_image'
@@ -179,10 +195,9 @@ class TestAgentClient(base.TestCase):
             timeout=60)
 
     def test__command_error_code_okay_error_typeerror_embedded(self):
-        response_text = ('{"faultstring": "you dun goofd", '
-                         '"command_error": {"type": "TypeError"}}')
-        self.client.session.post.return_value = MockResponse(
-            response_text)
+        response_data = {"faultstring": "you dun goofd",
+                         "command_error": {"type": "TypeError"}}
+        self.client.session.post.return_value = MockResponse(response_data)
         method = 'standby.run_image'
         image_info = {'image_id': 'test_image'}
         params = {'image_info': image_info}
@@ -199,6 +214,36 @@ class TestAgentClient(base.TestCase):
             params={'wait': 'false'},
             timeout=60)
 
+    @mock.patch('time.sleep', lambda seconds: None)
+    def test__command_poll(self):
+        response_data = {'status': 'ok'}
+        final_status = MockCommandStatus('SUCCEEDED', name='run_image')
+        self.client.session.post.return_value = MockResponse(response_data)
+        self.client.session.get.side_effect = [
+            MockCommandStatus('RUNNING', name='run_image'),
+            final_status,
+        ]
+
+        method = 'standby.run_image'
+        image_info = {'image_id': 'test_image'}
+        params = {'image_info': image_info}
+        expected = {'command_error': None,
+                    'command_name': 'run_image',
+                    'command_result': 'I did something',
+                    'command_status': 'SUCCEEDED'}
+
+        url = self.client._get_command_url(self.node)
+        body = self.client._get_command_body(method, params)
+
+        response = self.client._command(self.node, method, params, poll=True)
+        self.assertEqual(expected, response)
+        self.client.session.post.assert_called_once_with(
+            url,
+            data=body,
+            params={'wait': 'false'},
+            timeout=60)
+        self.client.session.get.assert_called_with(url, timeout=60)
+
     def test_get_commands_status(self):
         with mock.patch.object(self.client.session, 'get',
                                autospec=True) as mock_get:
@@ -234,7 +279,7 @@ class TestAgentClient(base.TestCase):
                                   wait=False)
         self.client._command.assert_called_once_with(
             node=self.node, method='standby.prepare_image',
-            params=params, wait=False)
+            params=params, poll=False)
 
     def test_prepare_image_with_configdrive(self):
         self.client._command = mock.MagicMock(spec_set=[])
@@ -251,7 +296,19 @@ class TestAgentClient(base.TestCase):
                                   wait=False)
         self.client._command.assert_called_once_with(
             node=self.node, method='standby.prepare_image',
-            params=params, wait=False)
+            params=params, poll=False)
+
+    def test_prepare_image_with_wait(self):
+        self.client._command = mock.MagicMock(spec_set=[])
+        image_info = {'image_id': 'image'}
+        params = {'image_info': image_info}
+
+        self.client.prepare_image(self.node,
+                                  image_info,
+                                  wait=True)
+        self.client._command.assert_called_once_with(
+            node=self.node, method='standby.prepare_image',
+            params=params, poll=True)
 
     def test_start_iscsi_target(self):
         self.client._command = mock.MagicMock(spec_set=[])
@@ -305,9 +362,8 @@ class TestAgentClient(base.TestCase):
             self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid,
             prep_boot_part_uuid=prep_boot_part_uuid, target_boot_mode='hello')
         self.client._command.assert_called_once_with(
-            command_timeout_factor=2, node=self.node,
-            method='image.install_bootloader', params=params,
-            wait=True)
+            node=self.node, method='image.install_bootloader', params=params,
+            poll=True)
 
     def test_install_bootloader(self):
         self._test_install_bootloader(root_uuid='fake-root-uuid',
@@ -415,8 +471,7 @@ class TestAgentClient(base.TestCase):
 
     def test__command_agent_client(self):
         response_data = {'status': 'ok'}
-        response_text = json.dumps(response_data)
-        self.client.session.post.return_value = MockResponse(response_text)
+        self.client.session.post.return_value = MockResponse(response_data)
         method = 'standby.run_image'
         image_info = {'image_id': 'test_image'}
         params = {'image_info': image_info}
@@ -472,13 +527,12 @@ class TestAgentClientAttempts(base.TestCase):
         mock_sleep.return_value = None
         error = 'Connection Timeout'
         response_data = {'status': 'ok'}
-        response_text = json.dumps(response_data)
         method = 'standby.run_image'
         image_info = {'image_id': 'test_image'}
         params = {'image_info': image_info}
         self.client.session.post.side_effect = [requests.Timeout(error),
                                                 requests.Timeout(error),
-                                                MockResponse(response_text)]
+                                                MockResponse(response_data)]
 
         response = self.client._command(self.node, method, params)
         self.assertEqual(3, self.client.session.post.call_count)
@@ -494,12 +548,11 @@ class TestAgentClientAttempts(base.TestCase):
         mock_sleep.return_value = None
         error = 'Connection Timeout'
         response_data = {'status': 'ok'}
-        response_text = json.dumps(response_data)
         method = 'standby.run_image'
         image_info = {'image_id': 'test_image'}
         params = {'image_info': image_info}
         self.client.session.post.side_effect = [requests.Timeout(error),
-                                                MockResponse(response_text),
+                                                MockResponse(response_data),
                                                 requests.Timeout(error)]
 
         response = self.client._command(self.node, method, params)
diff --git a/releasenotes/notes/agent-client-poll-ce16fd589e88c95a.yaml b/releasenotes/notes/agent-client-poll-ce16fd589e88c95a.yaml
new file mode 100644
index 0000000000..693adebe8d
--- /dev/null
+++ b/releasenotes/notes/agent-client-poll-ce16fd589e88c95a.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    Instead of increasing timeout when running long synchronous tasks on
+    ironic-python-agent, ironic now runs them asynchronously and polls
+    the agent until completion. It is no longer necessary to account for
+    long-running tasks when setting ``[agent]command_timeout``.