From a85e6e07cfa34725922d5d31dbe6956797e9b3f2 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Thu, 2 Apr 2020 17:47:20 +0200
Subject: [PATCH] Stabilize functional tests

If a driver.spawn fails without triggering a re-schedule then the
placement cleanup and the compute build failure counter update happen
after every externally observable change (instance state change,
instance fault recording, notification sending) in the finally block of
the _locked_do_build_and_run_instance() call.

We have a couple of functional tests that assert either the placement
cleanup or the state of the build counter. These tests are unstable as
the test races with the compute manager code.

As there are no externally visible changes to wait for, this patch
introduces some retries to these tests to stabilize them.

Closes-Bug: #1870385
Change-Id: I68369e7fa7630a212f4b20fc09f4c40796934bb9
---
 nova/tests/functional/integrated_helpers.py | 15 ++++++
 nova/tests/functional/test_servers.py       | 59 ++++++++++++++++-----
 2 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py
index 007984ae2318..29239104cbde 100644
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@@ -195,6 +195,21 @@ class InstanceHelperMixin(object):
 
         self.fail('The line "%(log_line)s" did not appear in the log')
 
+    def _wait_for_assert(self, assert_func, max_retries=10, sleep=0.5):
+        """Waits and retries the assert_func either until it does not raise
+        AssertionError any more or until the max_retries run out.
+        """
+        last_error = None
+        for i in range(max_retries):
+            try:
+                return assert_func()
+            except AssertionError as e:
+                last_error = e
+
+            time.sleep(sleep)
+
+        raise last_error
+
     def _create_aggregate(self, name, availability_zone=None):
         """Creates a host aggregate with the given name and optional AZ
 
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index c2651f59f00b..e502cfb141b6 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -143,8 +143,18 @@ class ServersTest(ServersTestBase):
         # We should have no (persisted) build failures until we update
         # resources, after which we should have one
         self.assertEqual([0], list(self._get_node_build_failures().values()))
-        self._run_periodics()
-        self.assertEqual([1], list(self._get_node_build_failures().values()))
+
+        # BuildAbortException will not trigger a reschedule and the build
+        # failure update is the last step in the compute manager after
+        # instance state setting, fault recording and notification sending. So
+        # we have no other way than simply wait to ensure the node build
+        # failure counter updated before we assert it.
+        def failed_counter_updated():
+            self._run_periodics()
+            self.assertEqual(
+                [1], list(self._get_node_build_failures().values()))
+
+        self._wait_for_assert(failed_counter_updated)
 
     def test_create_server_with_image_type_filter(self):
         self.flags(query_placement_for_image_type_support=True,
@@ -207,9 +217,18 @@ class ServersTest(ServersTestBase):
         self.flags(max_attempts=1, group='scheduler')
         fails = self._test_create_server_with_error_with_retries()
         self.assertEqual(1, fails)
-        self._run_periodics()
-        self.assertEqual(
-            [0, 1], list(sorted(self._get_node_build_failures().values())))
+
+        # The build failure update is the last step in build_and_run_instance
+        # in the compute manager after instance state setting, fault
+        # recording and notification sending. So we have no other way than
+        # simply wait to ensure the node build failure counter updated
+        # before we assert it.
+        def failed_counter_updated():
+            self._run_periodics()
+            self.assertEqual(
+                [0, 1], list(sorted(self._get_node_build_failures().values())))
+
+        self._wait_for_assert(failed_counter_updated)
 
     def test_create_and_delete_server(self):
         # Creates and deletes a server.
@@ -3660,11 +3679,18 @@ class ServerBuildAbortTests(integrated_helpers.ProviderUsageBaseTestCase):
 
         failed_hostname = self.compute1.manager.host
 
-        failed_rp_uuid = self._get_provider_uuid_by_host(failed_hostname)
-        # Expects no allocation records on the failed host.
-        self.assertRequestMatchesUsage({'VCPU': 0,
-                                        'MEMORY_MB': 0,
-                                        'DISK_GB': 0}, failed_rp_uuid)
+        # BuildAbortException coming from the FakeBuildAbortDriver will not
+        # trigger a reschedule and the placement cleanup is the last step in
+        # the compute manager after instance state setting, fault recording
+        # and notification sending. So we have no other way than simply wait
+        # to ensure the placement cleanup happens before we assert it.
+        def placement_cleanup():
+            failed_rp_uuid = self._get_provider_uuid_by_host(failed_hostname)
+            # Expects no allocation records on the failed host.
+            self.assertRequestMatchesUsage({'VCPU': 0,
+                                            'MEMORY_MB': 0,
+                                            'DISK_GB': 0}, failed_rp_uuid)
+        self._wait_for_assert(placement_cleanup)
 
 
 class ServerDeleteBuildTests(integrated_helpers.ProviderUsageBaseTestCase):
@@ -8001,8 +8027,17 @@ class AcceleratorServerTest(AcceleratorServerBase):
         server_uuid = server['id']
         # Check that Cyborg was called to delete ARQs
         self.cyborg.mock_del_arqs.assert_called_once_with(server_uuid)
-        # An instance in error state should consume no resources
-        self._check_no_allocs_usage(server_uuid)
+
+        # BuildAbortException will not trigger a reschedule and the placement
+        # cleanup is the last step in the compute manager after instance state
+        # setting, fault recording and notification sending. So we have no
+        # other way than simply wait to ensure the placement cleanup happens
+        # before we assert it.
+        def placement_cleanup():
+            # An instance in error state should consume no resources
+            self._check_no_allocs_usage(server_uuid)
+
+        self._wait_for_assert(placement_cleanup)
 
         self.api.delete_server(server_uuid)
         self._wait_until_deleted(server)