From d0375c2f9e5cc4dac7f8805382c09dbe6224dcd9 Mon Sep 17 00:00:00 2001
From: Matt Riedemann
Date: Thu, 31 Aug 2017 22:47:07 -0400
Subject: [PATCH] Add recreate test for evacuate claim failure

This adds a functional recreate test for when the MoveClaim fails
on the destination node and the allocation on the destination node
is not cleaned up. This is because the MoveClaim fails in its
constructor so it never exits the Claim context manager to call
drop_move_claim, which would remove the destination node
allocation.

Eventually we'll have to manually remove the destination node
allocation in the ComputeManager.rebuild_instance method.

Change-Id: I8900ace4436c4837beb8b4eb1e1d05905efc6dce
Related-Bug: #1713786
(cherry picked from commit 6ed80ddcdd3f9e40f0ada581ac7bd524ec48135b)
---
 nova/tests/functional/test_servers.py | 89 +++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index a718db4661a0..30f25d2b2299 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -1740,6 +1740,95 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         self._delete_and_check_allocations(
             server, source_rp_uuid, dest_rp_uuid)
 
+    def test_evacuate_claim_on_dest_fails(self):
+        """Tests that the allocations on the destination node are cleaned up
+        when the rebuild move claim fails due to insufficient resources.
+        """
+        source_hostname = self.compute1.host
+        dest_hostname = self.compute2.host
+        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
+
+        server = self._boot_and_check_allocations(
+            self.flavor1, source_hostname)
+
+        source_compute_id = self.admin_api.get_services(
+            host=source_hostname, binary='nova-compute')[0]['id']
+
+        self.compute1.stop()
+        # force it down to avoid waiting for the service group to time out
+        self.admin_api.put_service(
+            source_compute_id, {'forced_down': 'true'})
+
+        # NOTE(mriedem): This isn't great, and I'd like to fake out the driver
+        # to make the claim fail, by doing something like returning a too high
+        # memory_mb overhead, but the limits dict passed to the claim is empty
+        # so the claim test is considering it as unlimited and never actually
+        # performs a claim test. Configuring the scheduler to use the RamFilter
+        # to get the memory_mb limit at least seems like it should work but
+        # it doesn't appear to for some reason...
+        def fake_move_claim(*args, **kwargs):
+            # Assert the destination node allocation exists.
+            dest_usages = self._get_provider_usages(dest_rp_uuid)
+            self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
+            raise exception.ComputeResourcesUnavailable(
+                reason='test_evacuate_claim_on_dest_fails')
+
+        with mock.patch('nova.compute.claims.MoveClaim', fake_move_claim):
+            # evacuate the server
+            self.api.post_server_action(server['id'], {'evacuate': {}})
+            # the migration will fail on the dest node and the instance will
+            # go into error state
+            server = self._wait_for_state_change(self.api, server, 'ERROR')
+
+        # Run the periodics to show those don't modify allocations.
+        self._run_periodics()
+
+        # The allocation should still exist on the source node since it's
+        # still down, and the allocation on the destination node should be
+        # cleaned up.
+        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
+
+        source_usages = self._get_provider_usages(source_rp_uuid)
+        self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
+
+        dest_usages = self._get_provider_usages(dest_rp_uuid)
+        # FIXME(mriedem): This is bug 1713786 where the claim fails and the
+        # dest node allocation isn't cleaned up. Uncomment when fixed.
+        # self.assertFlavorMatchesAllocation(
+        #     {'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)
+        self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
+
+        allocations = self._get_allocations_by_server_uuid(server['id'])
+        # FIXME(mriedem): Uncomment when bug 1713786 is fixed.
+        # self.assertEqual(1, len(allocations))
+        self.assertEqual(2, len(allocations))
+        source_allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor1, source_allocation)
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
+
+        # start up the source compute
+        self.compute1.start()
+        self.admin_api.put_service(
+            source_compute_id, {'forced_down': 'false'})
+
+        # Run the periodics again to show they don't change anything.
+        self._run_periodics()
+
+        # The source compute shouldn't have cleaned up the allocation for
+        # itself since the instance didn't move.
+        source_usages = self._get_provider_usages(source_rp_uuid)
+        self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
+
+        allocations = self._get_allocations_by_server_uuid(server['id'])
+        # FIXME(mriedem): Uncomment when bug 1713786 is fixed.
+        # self.assertEqual(1, len(allocations))
+        self.assertEqual(2, len(allocations))
+        source_allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor1, source_allocation)
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
+
     def _boot_then_shelve_and_check_allocations(self, hostname, rp_uuid):
         # avoid automatic shelve offloading
         self.flags(shelved_offload_time=-1)
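
For context, and not part of the patch itself: a minimal standalone sketch of
the failure mode the commit message describes, assuming nothing about Nova's
real classes beyond what is stated above. FakeMoveClaim and its cleanup print
are hypothetical stand-ins for nova.compute.claims.MoveClaim and
drop_move_claim; the point is purely the Python semantics that if a context
manager's constructor raises, the with block is never entered, so __exit__
(where the cleanup lives) never runs.

# Minimal sketch (not Nova code) of why a constructor failure in a
# context manager skips the cleanup that lives in __exit__.


class FakeMoveClaim(object):
    """Hypothetical stand-in for nova.compute.claims.MoveClaim."""

    def __init__(self, enough_resources):
        if not enough_resources:
            # Mirrors ComputeResourcesUnavailable being raised
            # from within MoveClaim.__init__.
            raise RuntimeError('Insufficient resources on destination node')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # In Nova this is roughly where drop_move_claim() would remove
        # the destination node allocation; it never runs if __init__
        # already raised.
        print('drop_move_claim ran; destination allocation removed')


try:
    with FakeMoveClaim(enough_resources=False):
        pass
except RuntimeError:
    # The exception escaped __init__, so __enter__/__exit__ never
    # executed and no cleanup message is printed: the destination
    # allocation would be leaked, as the FIXMEs in the test assert.
    print('claim failed in constructor; no cleanup ran')

Running this prints only the 'no cleanup ran' branch, which mirrors why the
test above still sees allocations against both providers until the manual
cleanup lands in ComputeManager.rebuild_instance.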