From b5b5cab61da98c8bcf2c4e52a6d8ce0108dcfc64 Mon Sep 17 00:00:00 2001
From: Steve Baker <sbaker@redhat.com>
Date: Wed, 4 Sep 2019 10:09:04 +1200
Subject: [PATCH] Fix race in execution finishing

An execution state can go from RUNNING to SUCCESS between fetching the
last message from the websocket and polling the execution state. This
means the SUCCESS payload is never returned and the overcloud
deployment fails at the end with no indication as to why.

This change turns the output of the execution into the last payload,
allowing the calling SUCCESS logic to run.

Change-Id: Ic22021ba9a2717de199629e361c656e2f562fb38
Closes-Bug: #1842987
---
 tripleoclient/tests/workflows/test_base.py | 31 ++++++++++++++++++++++
 tripleoclient/workflows/base.py            |  8 ++++--
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/tripleoclient/tests/workflows/test_base.py b/tripleoclient/tests/workflows/test_base.py
index dbcd67c51..7dcb88d15 100644
--- a/tripleoclient/tests/workflows/test_base.py
+++ b/tripleoclient/tests/workflows/test_base.py
@@ -12,6 +12,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import json
 import mock
 
 from osc_lib.tests import utils
@@ -89,6 +90,7 @@ class TestBaseWorkflows(utils.TestCommand):
         }
 
         mistral = mock.Mock()
+        mistral.executions.get.return_value.state = 'RUNNING'
         websocket = mock.Mock()
         websocket.wait_for_messages.return_value = iter([payload_a, payload_b])
         execution = mock.Mock()
@@ -139,6 +141,7 @@ class TestBaseWorkflows(utils.TestCommand):
         }
 
         mistral = mock.Mock()
+        mistral.executions.get.return_value.state = 'RUNNING'
         websocket = mock.Mock()
         websocket.wait_for_messages.return_value = iter([payload_a, payload_b])
         execution = mock.Mock()
@@ -172,3 +175,31 @@ class TestBaseWorkflows(utils.TestCommand):
 
         self.assertRaises(ex.WorkflowActionError,
                           base.call_action, mistral, action)
+
+    def test_wait_for_messages_execution_complete(self):
+        payload_a = {
+            'status': 'RUNNING',
+            'execution_id': 'aaaa',
+            'root_execution_id': 'aaaa'
+        }
+        payload_b = {
+            'status': 'SUCCESS',
+            'execution_id': 'aaaa',
+            'root_execution_id': 'aaaa'
+        }
+
+        mistral = mock.Mock()
+        mistral.executions.get.return_value.state = 'SUCCESS'
+        mistral.executions.get.return_value.output = json.dumps(payload_b)
+        websocket = mock.Mock()
+        websocket.wait_for_messages.return_value = iter([payload_a])
+        execution = mock.Mock()
+        execution.id = 'aaaa'
+
+        messages = list(base.wait_for_messages(mistral, websocket, execution))
+
+        # Assert only payload_b was returned
+        self.assertEqual([payload_a, payload_b], messages)
+        mistral.executions.get.assert_called_with('aaaa')
+
+        websocket.wait_for_messages.assert_called_with(timeout=None)
diff --git a/tripleoclient/workflows/base.py b/tripleoclient/workflows/base.py
index 565aa27fb..b8afd22ed 100644
--- a/tripleoclient/workflows/base.py
+++ b/tripleoclient/workflows/base.py
@@ -91,8 +91,12 @@ def wait_for_messages(mistral, websocket, execution, timeout=None):
             # default to running and assume it is just an "in progress"
             # message from the workflow.
             # Workflows should end with SUCCESS or ERROR statuses.
-            if payload.get('status', 'RUNNING') != "RUNNING" or \
-                    mistral.executions.get(execution.id).state != "RUNNING":
+            if payload.get('status', 'RUNNING') != "RUNNING":
+                return
+            execution = mistral.executions.get(execution.id)
+            if execution.state != "RUNNING":
+                # yield the output as the last payload which was missed
+                yield json.loads(execution.output)
                 return
     except (exceptions.WebSocketTimeout, exceptions.WebSocketConnectionClosed):
         check_execution_status(mistral, execution.id)