Stabilize functional tests

If a driver.spawn fails without triggering a re-schedule then the placement cleanup and the compute build failure counter update happens after every externally observable change (instance state change, instance fault recording, notification sending) in the finally block of _locked_do_build_and_run_instance() call. We have couple of functional tests that assert either the placement cleanup or the state of the build counter. These tests are unstable as the test races with the compute manager code. As there are no externally visible change to wait for this patch introduces some retry to these tests to stabilize them. Closes-Bug: #1870385 Change-Id: I68369e7fa7630a212f4b20fc09f4c40796934bb9
2020-04-02 17:47:20 +02:00 · 2020-04-02 17:47:20 +02:00 · a85e6e07cf
parent 0639f96cdb
commit a85e6e07cf
2 changed files with 62 additions and 12 deletions
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@ -195,6 +195,21 @@ class InstanceHelperMixin(object):

        self.fail('The line "%(log_line)s" did not appear in the log')

+    def _wait_for_assert(self, assert_func, max_retries=10, sleep=0.5):
+        """Waits and retries the assert_func either until it does not raise
+        AssertionError any more or until the max_retries run out.
+        """
+        last_error = None
+        for i in range(max_retries):
+            try:
+                return assert_func()
+            except AssertionError as e:
+                last_error = e
+
+            time.sleep(sleep)
+
+        raise last_error
+
    def _create_aggregate(self, name, availability_zone=None):
        """Creates a host aggregate with the given name and optional AZ

--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@ -143,8 +143,18 @@ class ServersTest(ServersTestBase):
        # We should have no (persisted) build failures until we update
        # resources, after which we should have one
        self.assertEqual([0], list(self._get_node_build_failures().values()))
-        self._run_periodics()
-        self.assertEqual([1], list(self._get_node_build_failures().values()))
+
+        # BuildAbortException will not trigger a reschedule and the build
+        # failure update is the last step in the compute manager after
+        # instance state setting, fault recording and notification sending. So
+        # we have no other way than simply wait to ensure the node build
+        # failure counter updated before we assert it.
+        def failed_counter_updated():
+            self._run_periodics()
+            self.assertEqual(
+                [1], list(self._get_node_build_failures().values()))
+
+        self._wait_for_assert(failed_counter_updated)

    def test_create_server_with_image_type_filter(self):
        self.flags(query_placement_for_image_type_support=True,
@ -207,9 +217,18 @@ class ServersTest(ServersTestBase):
        self.flags(max_attempts=1, group='scheduler')
        fails = self._test_create_server_with_error_with_retries()
        self.assertEqual(1, fails)
-        self._run_periodics()
-        self.assertEqual(
-            [0, 1], list(sorted(self._get_node_build_failures().values())))
+
+        # The build failure update is the last step in build_and_run_instance
+        # in the compute manager after instance state setting, fault
+        # recording and notification sending. So we have no other way than
+        # simply wait to ensure the node build failure counter updated
+        # before we assert it.
+        def failed_counter_updated():
+            self._run_periodics()
+            self.assertEqual(
+                [0, 1], list(sorted(self._get_node_build_failures().values())))
+
+        self._wait_for_assert(failed_counter_updated)

    def test_create_and_delete_server(self):
        # Creates and deletes a server.
@ -3660,11 +3679,18 @@ class ServerBuildAbortTests(integrated_helpers.ProviderUsageBaseTestCase):

        failed_hostname = self.compute1.manager.host

-        failed_rp_uuid = self._get_provider_uuid_by_host(failed_hostname)
-        # Expects no allocation records on the failed host.
-        self.assertRequestMatchesUsage({'VCPU': 0,
-                                        'MEMORY_MB': 0,
-                                        'DISK_GB': 0}, failed_rp_uuid)
+        # BuildAbortException coming from the FakeBuildAbortDriver will not
+        # trigger a reschedule and the placement cleanup is the last step in
+        # the compute manager after instance state setting, fault recording
+        # and notification sending. So we have no other way than simply wait
+        # to ensure the placement cleanup happens before we assert it.
+        def placement_cleanup():
+            failed_rp_uuid = self._get_provider_uuid_by_host(failed_hostname)
+            # Expects no allocation records on the failed host.
+            self.assertRequestMatchesUsage({'VCPU': 0,
+                                            'MEMORY_MB': 0,
+                                            'DISK_GB': 0}, failed_rp_uuid)
+        self._wait_for_assert(placement_cleanup)


 class ServerDeleteBuildTests(integrated_helpers.ProviderUsageBaseTestCase):
@ -8001,8 +8027,17 @@ class AcceleratorServerTest(AcceleratorServerBase):
        server_uuid = server['id']
        # Check that Cyborg was called to delete ARQs
        self.cyborg.mock_del_arqs.assert_called_once_with(server_uuid)
-        # An instance in error state should consume no resources
-        self._check_no_allocs_usage(server_uuid)
+
+        # BuildAbortException will not trigger a reschedule and the placement
+        # cleanup is the last step in the compute manager after instance state
+        # setting, fault recording and notification sending. So we have no
+        # other way than simply wait to ensure the placement cleanup happens
+        # before we assert it.
+        def placement_cleanup():
+            # An instance in error state should consume no resources
+            self._check_no_allocs_usage(server_uuid)
+
+        self._wait_for_assert(placement_cleanup)

        self.api.delete_server(server_uuid)
        self._wait_until_deleted(server)