Stabilize functional tests

If a driver.spawn fails without triggering a re-schedule then the
placement cleanup and the compute build failure counter update
happens after every externally observable change (instance
state change, instance fault recording, notification sending) in
the finally block of _locked_do_build_and_run_instance() call.

We have couple of functional tests that assert either the placement
cleanup or the state of the build counter. These tests are unstable as
the test races with the compute manager code. As there are no externally
visible change to wait for this patch introduces some retry to
these tests to stabilize them.

Closes-Bug: #1870385

Change-Id: I68369e7fa7630a212f4b20fc09f4c40796934bb9
This commit is contained in:
Balazs Gibizer 2020-04-02 17:47:20 +02:00
parent 0639f96cdb
commit a85e6e07cf
2 changed files with 62 additions and 12 deletions

View File

@ -195,6 +195,21 @@ class InstanceHelperMixin(object):
self.fail('The line "%(log_line)s" did not appear in the log')
def _wait_for_assert(self, assert_func, max_retries=10, sleep=0.5):
"""Waits and retries the assert_func either until it does not raise
AssertionError any more or until the max_retries run out.
"""
last_error = None
for i in range(max_retries):
try:
return assert_func()
except AssertionError as e:
last_error = e
time.sleep(sleep)
raise last_error
def _create_aggregate(self, name, availability_zone=None):
"""Creates a host aggregate with the given name and optional AZ

View File

@ -143,8 +143,18 @@ class ServersTest(ServersTestBase):
# We should have no (persisted) build failures until we update
# resources, after which we should have one
self.assertEqual([0], list(self._get_node_build_failures().values()))
self._run_periodics()
self.assertEqual([1], list(self._get_node_build_failures().values()))
# BuildAbortException will not trigger a reschedule and the build
# failure update is the last step in the compute manager after
# instance state setting, fault recording and notification sending. So
# we have no other way than simply wait to ensure the node build
# failure counter updated before we assert it.
def failed_counter_updated():
self._run_periodics()
self.assertEqual(
[1], list(self._get_node_build_failures().values()))
self._wait_for_assert(failed_counter_updated)
def test_create_server_with_image_type_filter(self):
self.flags(query_placement_for_image_type_support=True,
@ -207,9 +217,18 @@ class ServersTest(ServersTestBase):
self.flags(max_attempts=1, group='scheduler')
fails = self._test_create_server_with_error_with_retries()
self.assertEqual(1, fails)
self._run_periodics()
self.assertEqual(
[0, 1], list(sorted(self._get_node_build_failures().values())))
# The build failure update is the last step in build_and_run_instance
# in the compute manager after instance state setting, fault
# recording and notification sending. So we have no other way than
# simply wait to ensure the node build failure counter updated
# before we assert it.
def failed_counter_updated():
self._run_periodics()
self.assertEqual(
[0, 1], list(sorted(self._get_node_build_failures().values())))
self._wait_for_assert(failed_counter_updated)
def test_create_and_delete_server(self):
# Creates and deletes a server.
@ -3660,11 +3679,18 @@ class ServerBuildAbortTests(integrated_helpers.ProviderUsageBaseTestCase):
failed_hostname = self.compute1.manager.host
failed_rp_uuid = self._get_provider_uuid_by_host(failed_hostname)
# Expects no allocation records on the failed host.
self.assertRequestMatchesUsage({'VCPU': 0,
'MEMORY_MB': 0,
'DISK_GB': 0}, failed_rp_uuid)
# BuildAbortException coming from the FakeBuildAbortDriver will not
# trigger a reschedule and the placement cleanup is the last step in
# the compute manager after instance state setting, fault recording
# and notification sending. So we have no other way than simply wait
# to ensure the placement cleanup happens before we assert it.
def placement_cleanup():
failed_rp_uuid = self._get_provider_uuid_by_host(failed_hostname)
# Expects no allocation records on the failed host.
self.assertRequestMatchesUsage({'VCPU': 0,
'MEMORY_MB': 0,
'DISK_GB': 0}, failed_rp_uuid)
self._wait_for_assert(placement_cleanup)
class ServerDeleteBuildTests(integrated_helpers.ProviderUsageBaseTestCase):
@ -8001,8 +8027,17 @@ class AcceleratorServerTest(AcceleratorServerBase):
server_uuid = server['id']
# Check that Cyborg was called to delete ARQs
self.cyborg.mock_del_arqs.assert_called_once_with(server_uuid)
# An instance in error state should consume no resources
self._check_no_allocs_usage(server_uuid)
# BuildAbortException will not trigger a reschedule and the placement
# cleanup is the last step in the compute manager after instance state
# setting, fault recording and notification sending. So we have no
# other way than simply wait to ensure the placement cleanup happens
# before we assert it.
def placement_cleanup():
# An instance in error state should consume no resources
self._check_no_allocs_usage(server_uuid)
self._wait_for_assert(placement_cleanup)
self.api.delete_server(server_uuid)
self._wait_until_deleted(server)