Drop source node allocations if finish_resize fails

By the time finish_resize runs on the dest host, the instance
host/node values are already pointing at the dest (they are
set by resize_instance on the source compute before casting to
finish_resize on the dest). If finish_resize fails, the instance
is essentially stuck on the dest host so rather than revert the
allocations (which will drop the new flavor allocations against
the dest host where the instance now lives) we should just drop
the old flavor allocations on the source node resource provider,
which is what this change does.

The functional regression recreate test is updated to show this
working.

Conflicts:
      nova/tests/functional/regressions/test_bug_1825537.py
      nova/tests/functional/test_servers.py

NOTE(mriedem): The functional test conflicts are due to not having
change If6aa37d9b6b48791e070799ab026c816fda4441c in Rocky.
The code fix is also different because we don't have change
I0851e2d54a1fdc82fe3291fb7e286e790f121e92 in Rocky and cannot use
the _delete_allocation_after_move method as documented inline.
This also means we have to deal with migration-based and legacy
tracked allocations (where the source and dest node allocations
are both on the instance consumer). As such, similar logic to
_delete_allocation_after_move is used to try and delete the
migration-based allocations first and failing that, attempt
to cleanup the source node allocations the legacy way by removing
them from the "doubled up" allocations created by the scheduler.
Since this is new logic in the backport, a functional test is
added to cover the legacy cleanup flow.

Change-Id: I52c8d038118c858004e17e71b2fba9e9e2714815
Closes-Bug: #1825537
(cherry picked from commit ea297d6ffb)
(cherry picked from commit e6c6178d22)
This commit is contained in:
Matt Riedemann 2019-04-19 12:28:34 -04:00
parent 9a977cb28c
commit cbf6a46d8f
4 changed files with 82 additions and 18 deletions

View File

@@ -4693,12 +4693,50 @@ class ComputeManager(manager.Manager):
new host machine.
"""
# _finish_resize sets instance.old_flavor to instance.flavor and
# changes instance.flavor to instance.new_flavor (if doing a resize
# rather than a cold migration). We save off the old_flavor here in
# case we need it for error handling below.
old_flavor = instance.flavor
try:
self._finish_resize_helper(context, disk_info, image, instance,
migration)
except Exception:
with excutils.save_and_reraise_exception():
self._revert_allocation(context, instance, migration)
# At this point, resize_instance (which runs on the source) has
# already updated the instance host/node values to point to
# this (the dest) compute, so we need to leave the allocations
# against the dest node resource provider intact and drop the
# allocations against the source node resource provider. If the
# user tries to recover the server by hard rebooting it, it
# will happen on this host so that's where the allocations
# should go.
LOG.info('Deleting allocations for old flavor on source node '
'%s after finish_resize failure. You may be able to '
'recover the instance by hard rebooting it.',
migration.source_compute, instance=instance)
# NOTE(mriedem): We can't use _delete_allocation_after_move
# because it relies on the resource tracker to look up the
# node uuid and since we are on the dest host, passing the
# source nodename won't work since the RT isn't tracking that
# node here. So we just try to remove the migration-based
# allocations directly and handle the case they don't exist.
if not self.reportclient.delete_allocation_for_instance(
context, migration.uuid):
# No migration-based allocation. Try to cleanup directly.
cn = objects.ComputeNode.get_by_host_and_nodename(
context, migration.source_compute,
migration.source_node)
if not scheduler_utils.remove_allocation_from_compute(
context, instance, cn.uuid, self.reportclient,
flavor=old_flavor):
LOG.error('Failed to delete allocations for old '
'flavor %s against source node %s. The '
'instance is now on the dest node %s. The '
'allocations against the source node need '
'to be manually cleaned up in Placement.',
old_flavor.flavorid, migration.source_node,
migration.dest_node, instance=instance)
def _finish_resize_helper(self, context, disk_info, image, instance,
migration):

View File

@@ -24,6 +24,13 @@ class FinishResizeErrorAllocationCleanupTestCase(
compute_driver = 'fake.FakeFinishMigrationFailDriver'
# ProviderUsageBaseTestCase uses the AllServicesCurrent fixture which
# means we'll use migration-based allocations by default. This flag allows
# us to control the logic in conductor to handle legacy allocations where
# the source (old flavor) and dest (new flavor) node allocations are
# doubled up on the instance.
migration_based_allocations = True
def setUp(self):
super(FinishResizeErrorAllocationCleanupTestCase, self).setUp()
# Get the flavors we're going to use.
@@ -31,6 +38,10 @@ class FinishResizeErrorAllocationCleanupTestCase(
self.flavor1 = flavors[0]
self.flavor2 = flavors[1]
self.stub_out('nova.conductor.tasks.migrate.'
'should_do_migration_allocation',
lambda *args, **kwargs: self.migration_based_allocations)
def _resize_and_assert_error(self, server, dest_host):
# Now resize the server and wait for it to go to ERROR status because
# the finish_migration virt driver method in host2 should fail.
@@ -67,16 +78,20 @@ class FinishResizeErrorAllocationCleanupTestCase(
# allocations should still exist with the new flavor.
source_rp_uuid = self._get_provider_uuid_by_host('host1')
dest_rp_uuid = self._get_provider_uuid_by_host('host2')
# FIXME(mriedem): This is bug 1825537 where the allocations are
# reverted when finish_resize fails so the dest node resource provider
# does not have any allocations and the instance allocations are for
# the old flavor on the source node resource provider even though the
# instance is not running on the source host nor pointed at the source
# host in the DB.
# self.assertFlavorMatchesAllocation(
# self.flavor2, server['id'], dest_rp_uuid)
dest_rp_usages = self._get_provider_usages(dest_rp_uuid)
self.assertFlavorMatchesAllocation(self.flavor2, dest_rp_usages)
# And the source node provider should not have any usage.
source_rp_usages = self._get_provider_usages(source_rp_uuid)
no_usage = {'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}
self.assertEqual(no_usage, dest_rp_usages)
source_usages = self._get_provider_usages(source_rp_uuid)
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
self.assertEqual(no_usage, source_rp_usages)
class FinishResizeErrorAllocationCleanupLegacyTestCase(
        FinishResizeErrorAllocationCleanupTestCase):
    """Legacy-allocation variant of the finish_resize cleanup test.

    Runs the same scenarios as the parent test case but with
    migration-based allocations disabled, i.e. it simulates a deployment
    that still has older computes so the source (old flavor) and dest
    (new flavor) node allocations are "doubled up" on the instance
    consumer record rather than being tracked by the migration record.
    """
    # Flips the stubbed should_do_migration_allocation result in the
    # parent's setUp so conductor uses the legacy doubled-up allocations.
    migration_based_allocations = False

View File

@@ -3088,10 +3088,13 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
# Ensure the allocation records still exist on the host.
source_rp_uuid = self._get_provider_uuid_by_host(hostname)
source_usages = self._get_provider_usages(source_rp_uuid)
# FIXME(mriedem): This is wrong for the _finish_resize case.
# The new_flavor should have been subtracted from the doubled
# allocation which just leaves us with the original flavor.
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
if failing_method == '_finish_resize':
# finish_resize will drop the old flavor allocations.
self.assertFlavorMatchesAllocation(self.flavor2, source_usages)
else:
# The new_flavor should have been subtracted from the doubled
# allocation which just leaves us with the original flavor.
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
def test_resize_to_same_host_prep_resize_fails(self):
self._test_resize_to_same_host_instance_fails(

View File

@@ -6675,7 +6675,9 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
test_instance_fault.fake_faults['fake-uuid'][0])
yield _finish_resize
def test_finish_resize_failure(self):
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'delete_allocation_for_instance')
def test_finish_resize_failure(self, mock_del_allocs):
self.migration.status = 'post-migrating'
with self._mock_finish_resize() as _finish_resize:
@@ -6689,10 +6691,14 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
# Assert that we set the migration to an error state
self.assertEqual("error", self.migration.status)
mock_del_allocs.assert_called_once_with(
self.context, self.migration.uuid)
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'delete_allocation_for_instance')
@mock.patch('nova.compute.manager.ComputeManager.'
'_notify_about_instance_usage')
def test_finish_resize_notify_failure(self, notify):
def test_finish_resize_notify_failure(self, notify, mock_del_allocs):
self.migration.status = 'post-migrating'
with self._mock_finish_resize():
@@ -6706,6 +6712,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
# Assert that we did not set the migration to an error state
self.assertEqual('post-migrating', self.migration.status)
mock_del_allocs.assert_called_once_with(
self.context, self.migration.uuid)
@contextlib.contextmanager
def _mock_resize_instance(self):