Do not assume that prepare_image is the last command to run

The get_deploy_steps command can run after it, which breaks deployment because the last entry in the agent's command list is then no longer prepare_image. Change-Id: I8e641a521a574462010a95a19e8a64ac36d4e52d
2020-09-04 11:33:31 +02:00 · 2020-09-04 11:33:31 +02:00 · ce46cc461d
commit ce46cc461d
parent b605ab585a
4 changed files with 84 additions and 32 deletions
--- a/ironic/drivers/modules/agent.py
+++ b/ironic/drivers/modules/agent.py
@ -292,25 +292,33 @@ class AgentDeployMixin(agent_base.AgentDeployMixin):

    # TODO(dtantsur): remove in W
    def _get_uuid_from_result(self, task, type_uuid):
-        command = self._client.get_commands_status(task.node)[-1]
+        command = self._client.get_last_command_status(task.node,
+                                                       'prepare_image')
+        if (not command
+                or not command.get('command_result', {}).get('result')):
+            msg = _('Unexpected response from the agent for node %s: the '
+                    'running command list does not include prepare_image '
+                    'or its result is malformed') % task.node.uuid
+            LOG.error(msg)
+            deploy_utils.set_failed_state(task, msg)
+            return

-        if command['command_result'] is not None:
-            words = command['command_result']['result'].split()
-            for word in words:
-                if type_uuid in word:
-                    result = word.split('=')[1]
-                    if not result:
-                        msg = (_('Command result did not return %(type_uuid)s '
-                                 'for node %(node)s. The version of the IPA '
-                                 'ramdisk used in the deployment might not '
-                                 'have support for provisioning of '
-                                 'partition images.') %
-                               {'type_uuid': type_uuid,
-                                'node': task.node.uuid})
-                        LOG.error(msg)
-                        deploy_utils.set_failed_state(task, msg)
-                        return
-                    return result
+        words = command['command_result']['result'].split()
+        for word in words:
+            if type_uuid in word:
+                result = word.split('=')[1]
+                if not result:
+                    msg = (_('Command result did not return %(type_uuid)s '
+                             'for node %(node)s. The version of the IPA '
+                             'ramdisk used in the deployment might not '
+                             'have support for provisioning of '
+                             'partition images.') %
+                           {'type_uuid': type_uuid,
+                            'node': task.node.uuid})
+                    LOG.error(msg)
+                    deploy_utils.set_failed_state(task, msg)
+                    return
+                return result

    @METRICS.timer('AgentDeployMixin.prepare_instance_boot')
    @base.deploy_step(priority=60)
--- a/ironic/drivers/modules/agent_client.py
+++ b/ironic/drivers/modules/agent_client.py
@ -102,23 +102,11 @@ class AgentClient(object):
        :param method: A string represents the command executed by agent.
        :raises: AgentCommandTimeout if timeout is reached.
        """
-        try:
-            method = method.split('.', 1)[1]
-        except IndexError:
-            pass
-
        # NOTE(dtantsur): this function uses AgentCommandTimeout on every
        # failure, but unless the timeout is reached, the exception is caught
        # and retried by the @retry decorator above.
-
-        commands = self.get_commands_status(node)
-        try:
-            result = next(c for c in reversed(commands)
-                          if c.get('command_name') == method)
-        except StopIteration:
-            LOG.debug('Command %(cmd)s is not in the executing commands list '
-                      'for node %(node)s',
-                      {'cmd': method, 'node': node.uuid})
+        result = self.get_last_command_status(node, method)
+        if result is None:
            raise exception.AgentCommandTimeout(command=method, node=node.uuid)

        if result.get('command_status') == 'RUNNING':
@ -312,6 +300,29 @@ class AgentClient(object):
                  {'node': node.uuid, 'status': status})
        return result

+    def get_last_command_status(self, node, method):
+        """Get the last status for the given command.
+
+        :param node: A Node object.
+        :param method: Command name.
+        :returns: A dict containing command status from agent or None
+            if the command was not found.
+        """
+        try:
+            method = method.split('.', 1)[1]
+        except IndexError:
+            pass
+
+        commands = self.get_commands_status(node)
+        try:
+            return next(c for c in reversed(commands)
+                        if c.get('command_name') == method)
+        except StopIteration:
+            LOG.debug('Command %(cmd)s is not in the executing commands list '
+                      'for node %(node)s',
+                      {'cmd': method, 'node': node.uuid})
+            return None
+
    @METRICS.timer('AgentClient.prepare_image')
    def prepare_image(self, node, image_info, wait=False):
        """Call the `prepare_image` method on the node.
--- a/ironic/tests/unit/drivers/modules/test_agent.py
+++ b/ironic/tests/unit/drivers/modules/test_agent.py
@ -1649,6 +1649,34 @@ class TestAgentDeploy(db_base.DbTestCase):
        self.node.refresh()
        self.assertEqual('bar', self.node.instance_info['foo'])

+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_get_uuid_from_result(self, mock_statuses):
+        mock_statuses.return_value = [
+            {'command_name': 'banana', 'command_result': None},
+            {'command_name': 'prepare_image',
+             'command_result': {'result': 'okay root_uuid=abcd'}},
+            {'command_name': 'get_deploy_steps',
+             'command_result': {'deploy_steps': []}}
+        ]
+        with task_manager.acquire(
+                self.context, self.node['uuid'], shared=False) as task:
+            result = self.driver._get_uuid_from_result(task, 'root_uuid')
+            self.assertEqual('abcd', result)
+
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_get_uuid_from_result_fails(self, mock_statuses):
+        mock_statuses.return_value = [
+            {'command_name': 'banana', 'command_result': None},
+            {'command_name': 'get_deploy_steps',
+             'command_result': {'deploy_steps': []}}
+        ]
+        with task_manager.acquire(
+                self.context, self.node['uuid'], shared=False) as task:
+            result = self.driver._get_uuid_from_result(task, 'root_uuid')
+            self.assertIsNone(result)
+
    @mock.patch.object(manager_utils, 'restore_power_state_if_needed',
                       autospec=True)
    @mock.patch.object(manager_utils, 'power_on_node_if_needed',
--- a/releasenotes/notes/agent-uuid-5d86bc18849acda3.yaml
+++ b/releasenotes/notes/agent-uuid-5d86bc18849acda3.yaml
@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixes the deployment failure with Ussuri (and older) ramdisks that happens
+    when another IPA command runs after ``prepare_image``.