Reproduce bug #1819460 in functional test

There are two calls during ComputeTaskManager.build_instances, used during re-schedule, that could potentially raise exceptions, which leaves the instance stuck in BUILD state instead of going to ERROR state. This patch adds two functional test cases to reproduce the problems. Conflicts: nova/tests/functional/test_servers.py Change-Id: If80c4e4776b81cc06293989ee41d39b53735352b Related-Bug: #1819460 (cherry picked from commit b63c42a0d4)
2019-03-11 14:39:10 +01:00 · 2019-03-11 14:39:10 +01:00 · 07a1a8ff7d
parent 6755034e10
commit 07a1a8ff7d
2 changed files with 89 additions and 6 deletions
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@ -245,7 +245,8 @@ class _IntegratedTestBase(test.TestCase):

 class InstanceHelperMixin(object):
    def _wait_for_server_parameter(self, admin_api, server, expected_params,
-                                   max_retries=10):
+                                   max_retries=10,
+                                   fail_when_run_out_of_retries=True):
        retry_count = 0
        while True:
            server = admin_api.get_server(server['id'])
@ -254,17 +255,22 @@ class InstanceHelperMixin(object):
                break
            retry_count += 1
            if retry_count == max_retries:
-                self.fail('Wait for state change failed, '
-                          'expected_params=%s, server=%s'
-                          % (expected_params, server))
+                if fail_when_run_out_of_retries:
+                    self.fail('Wait for state change failed, '
+                              'expected_params=%s, server=%s'
+                              % (expected_params, server))
+                else:
+                    break
            time.sleep(0.5)

        return server

    def _wait_for_state_change(self, admin_api, server, expected_status,
-                               max_retries=10):
+                               max_retries=10,
+                               fail_when_run_out_of_retries=True):
        return self._wait_for_server_parameter(
-            admin_api, server, {'status': expected_status}, max_retries)
+            admin_api, server, {'status': expected_status}, max_retries,
+            fail_when_run_out_of_retries=fail_when_run_out_of_retries)

    def _build_minimal_create_server_request(self, api, name, image_uuid=None,
                                             flavor_id=None, networks=None,
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@ -35,11 +35,13 @@ from nova.compute import api as compute_api
 from nova.compute import instance_actions
 from nova.compute import manager as compute_manager
 from nova.compute import rpcapi
+from nova.conductor import manager
 from nova import context
 from nova import exception
 from nova import objects
 from nova.objects import block_device as block_device_obj
 from nova import rc_fields
+from nova.scheduler import utils
 from nova.scheduler import weights
 from nova import test
 from nova.tests import fixtures as nova_fixtures
@ -4035,6 +4037,37 @@ class ServerRescheduleTests(integrated_helpers.ProviderUsageBaseTestCase):
        # Ensure the allocation records on the destination host.
        self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor1)

+    def test_allocation_fails_during_reschedule(self):
+        """Verify that if nova fails to allocate resources during re-schedule
+        then the server is put into ERROR state properly.
+        """
+
+        server_req = self._build_minimal_create_server_request(
+            self.api, 'some-server', flavor_id=self.flavor1['id'],
+            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
+            networks='none')
+
+        orig_claim = utils.claim_resources
+        # First call is during boot, we want that to succeed normally. Then the
+        # fake virt driver triggers a re-schedule. During that re-schedule we
+        # simulate that the placement call fails.
+        with mock.patch('nova.scheduler.utils.claim_resources',
+                        side_effect=[
+                            orig_claim,
+                            exception.AllocationUpdateFailed(
+                                consumer_uuid=uuids.inst1, error='testing')]):
+
+            server = self.api.post_server({'server': server_req})
+            # NOTE(gibi): Due to bug 1819460 the server is stuck in BUILD
+            # state instead of going to ERROR state
+            server = self._wait_for_state_change(
+                self.admin_api, server, 'ERROR',
+                fail_when_run_out_of_retries=False)
+
+            self.assertEqual('BUILD', server['status'])
+
+        self._delete_and_check_allocations(server)
+

 class ServerRescheduleTestsWithNestedResourcesRequest(ServerRescheduleTests):
    compute_driver = 'fake.FakeRescheduleDriverWithNestedCustomResources'
@ -6754,3 +6787,47 @@ class PortResourceRequestReSchedulingTest(
        updated_port = self.neutron.show_port(port['id'])['port']
        binding_profile = updated_port['binding:profile']
        self.assertNotIn('allocation', binding_profile)
+
+    def test_boot_reschedule_fill_provider_mapping_raises(self):
+        """Verify that if _fill_provider_mapping raises during re-schedule
+        then the instance is properly put into ERROR state.
+        """
+
+        port = self.neutron.port_with_resource_request
+
+        # First call is during boot, we want that to succeed normally. Then the
+        # fake virt driver triggers a re-schedule. During that re-schedule the
+        # fill is called again, and we simulate that call raises.
+        fill = manager.ComputeTaskManager._fill_provider_mapping
+
+        with mock.patch(
+                'nova.conductor.manager.ComputeTaskManager.'
+                '_fill_provider_mapping',
+                side_effect=[
+                    fill,
+                    exception.ConsumerAllocationRetrievalFailed(
+                        consumer_uuid=uuids.inst1, error='testing')],
+                autospec=True):
+            server = self._create_server(
+                flavor=self.flavor,
+                networks=[{'port': port['id']}])
+            # NOTE(gibi): Due to bug 1819460 the server is stuck in BUILD state
+            server = self._wait_for_state_change(
+                self.admin_api, server, 'ERROR',
+                fail_when_run_out_of_retries=False)
+
+            self.assertEqual('BUILD', server['status'])
+
+        # NOTE(gibi): Due to bug 1819460 the server is stuck in BUILD state and
+        # no error is presented to the user
+        # self.assertIn(
+        #     'Failed to retrieve allocations for consumer',
+        #     server['fault']['message'])
+        #
+        # NOTE(gibi): even after delete the allocation of such server is leaked
+        # self._delete_and_check_allocations(server)
+        #
+        # # assert that unbind removes the allocation from the binding
+        # updated_port = self.neutron.show_port(port['id'])['port']
+        # binding_profile = updated_port['binding:profile']
+        # self.assertNotIn('allocation', binding_profile)