Browse Source

Merge "Drop source node allocations if finish_resize fails" into stable/rocky

tags/18.2.2
Zuul 1 month ago
parent
commit
069bda35c1

+ 39
- 1
nova/compute/manager.py View File

@@ -4727,12 +4727,50 @@ class ComputeManager(manager.Manager):
4727 4727
         new host machine.
4728 4728
 
4729 4729
         """
4730
+        # _finish_resize sets instance.old_flavor to instance.flavor and
4731
+        # changes instance.flavor to instance.new_flavor (if doing a resize
4732
+        # rather than a cold migration). We save off the old_flavor here in
4733
+        # case we need it for error handling below.
4734
+        old_flavor = instance.flavor
4730 4735
         try:
4731 4736
             self._finish_resize_helper(context, disk_info, image, instance,
4732 4737
                                        migration)
4733 4738
         except Exception:
4734 4739
             with excutils.save_and_reraise_exception():
4735
-                self._revert_allocation(context, instance, migration)
4740
+                # At this point, resize_instance (which runs on the source) has
4741
+                # already updated the instance host/node values to point to
4742
+                # this (the dest) compute, so we need to leave the allocations
4743
+                # against the dest node resource provider intact and drop the
4744
+                # allocations against the source node resource provider. If the
4745
+                # user tries to recover the server by hard rebooting it, it
4746
+                # will happen on this host so that's where the allocations
4747
+                # should go.
4748
+                LOG.info('Deleting allocations for old flavor on source node '
4749
+                         '%s after finish_resize failure. You may be able to '
4750
+                         'recover the instance by hard rebooting it.',
4751
+                         migration.source_compute, instance=instance)
4752
+                # NOTE(mriedem): We can't use _delete_allocation_after_move
4753
+                # because it relies on the resource tracker to look up the
4754
+                # node uuid and since we are on the dest host, passing the
4755
+                # source nodename won't work since the RT isn't tracking that
4756
+                # node here. So we just try to remove the migration-based
4757
+                # allocations directly and handle the case where none exist.
4758
+                if not self.reportclient.delete_allocation_for_instance(
4759
+                        context, migration.uuid):
4760
+                    # No migration-based allocation. Try to clean up directly.
4761
+                    cn = objects.ComputeNode.get_by_host_and_nodename(
4762
+                        context, migration.source_compute,
4763
+                        migration.source_node)
4764
+                    if not scheduler_utils.remove_allocation_from_compute(
4765
+                            context, instance, cn.uuid, self.reportclient,
4766
+                            flavor=old_flavor):
4767
+                        LOG.error('Failed to delete allocations for old '
4768
+                                  'flavor %s against source node %s. The '
4769
+                                  'instance is now on the dest node %s. The '
4770
+                                  'allocations against the source node need '
4771
+                                  'to be manually cleaned up in Placement.',
4772
+                                  old_flavor.flavorid, migration.source_node,
4773
+                                  migration.dest_node, instance=instance)
4736 4774
 
4737 4775
     def _finish_resize_helper(self, context, disk_info, image, instance,
4738 4776
                               migration):

+ 26
- 11
nova/tests/functional/regressions/test_bug_1825537.py View File

@@ -24,6 +24,13 @@ class FinishResizeErrorAllocationCleanupTestCase(
24 24
 
25 25
     compute_driver = 'fake.FakeFinishMigrationFailDriver'
26 26
 
27
+    # ProviderUsageBaseTestCase uses the AllServicesCurrent fixture which
28
+    # means we'll use migration-based allocations by default. This flag allows
29
+    # us to control the logic in conductor to handle legacy allocations where
30
+    # the source (old flavor) and dest (new flavor) node allocations are
31
+    # doubled up on the instance.
32
+    migration_based_allocations = True
33
+
27 34
     def setUp(self):
28 35
         super(FinishResizeErrorAllocationCleanupTestCase, self).setUp()
29 36
         # Get the flavors we're going to use.
@@ -31,6 +38,10 @@ class FinishResizeErrorAllocationCleanupTestCase(
31 38
         self.flavor1 = flavors[0]
32 39
         self.flavor2 = flavors[1]
33 40
 
41
+        self.stub_out('nova.conductor.tasks.migrate.'
42
+                      'should_do_migration_allocation',
43
+                      lambda *args, **kwargs: self.migration_based_allocations)
44
+
34 45
     def _resize_and_assert_error(self, server, dest_host):
35 46
         # Now resize the server and wait for it to go to ERROR status because
36 47
         # the finish_migration virt driver method in host2 should fail.
@@ -67,16 +78,20 @@ class FinishResizeErrorAllocationCleanupTestCase(
67 78
         # allocations should still exist with the new flavor.
68 79
         source_rp_uuid = self._get_provider_uuid_by_host('host1')
69 80
         dest_rp_uuid = self._get_provider_uuid_by_host('host2')
70
-        # FIXME(mriedem): This is bug 1825537 where the allocations are
71
-        # reverted when finish_resize fails so the dest node resource provider
72
-        # does not have any allocations and the instance allocations are for
73
-        # the old flavor on the source node resource provider even though the
74
-        # instance is not running on the source host nor pointed at the source
75
-        # host in the DB.
76
-        # self.assertFlavorMatchesAllocation(
77
-        #     self.flavor2, server['id'], dest_rp_uuid)
78 81
         dest_rp_usages = self._get_provider_usages(dest_rp_uuid)
82
+        self.assertFlavorMatchesAllocation(self.flavor2, dest_rp_usages)
83
+        # And the source node provider should not have any usage.
84
+        source_rp_usages = self._get_provider_usages(source_rp_uuid)
79 85
         no_usage = {'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}
80
-        self.assertEqual(no_usage, dest_rp_usages)
81
-        source_usages = self._get_provider_usages(source_rp_uuid)
82
-        self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
86
+        self.assertEqual(no_usage, source_rp_usages)
87
+
88
+
89
+class FinishResizeErrorAllocationCleanupLegacyTestCase(
90
+        FinishResizeErrorAllocationCleanupTestCase):
91
+    """Variant of FinishResizeErrorAllocationCleanupTestCase which does not
92
+    use migration-based allocations, e.g. tests the scenario that there are
93
+    older computes in the deployment so the source and dest node allocations
94
+    are doubled up on the instance consumer record rather than the migration
95
+    record.
96
+    """
97
+    migration_based_allocations = False

+ 7
- 4
nova/tests/functional/test_servers.py View File

@@ -3088,10 +3088,13 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
3088 3088
         # Ensure the allocation records still exist on the host.
3089 3089
         source_rp_uuid = self._get_provider_uuid_by_host(hostname)
3090 3090
         source_usages = self._get_provider_usages(source_rp_uuid)
3091
-        # FIXME(mriedem): This is wrong for the _finish_resize case.
3092
-        # The new_flavor should have been subtracted from the doubled
3093
-        # allocation which just leaves us with the original flavor.
3094
-        self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
3091
+        if failing_method == '_finish_resize':
3092
+            # finish_resize will drop the old flavor allocations.
3093
+            self.assertFlavorMatchesAllocation(self.flavor2, source_usages)
3094
+        else:
3095
+            # The new_flavor should have been subtracted from the doubled
3096
+            # allocation which just leaves us with the original flavor.
3097
+            self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
3095 3098
 
3096 3099
     def test_resize_to_same_host_prep_resize_fails(self):
3097 3100
         self._test_resize_to_same_host_instance_fails(

+ 10
- 2
nova/tests/unit/compute/test_compute_mgr.py View File

@@ -6766,7 +6766,9 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
6766 6766
                 test_instance_fault.fake_faults['fake-uuid'][0])
6767 6767
             yield _finish_resize
6768 6768
 
6769
-    def test_finish_resize_failure(self):
6769
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
6770
+                'delete_allocation_for_instance')
6771
+    def test_finish_resize_failure(self, mock_del_allocs):
6770 6772
         self.migration.status = 'post-migrating'
6771 6773
 
6772 6774
         with self._mock_finish_resize() as _finish_resize:
@@ -6780,10 +6782,14 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
6780 6782
 
6781 6783
         # Assert that we set the migration to an error state
6782 6784
         self.assertEqual("error", self.migration.status)
6785
+        mock_del_allocs.assert_called_once_with(
6786
+            self.context, self.migration.uuid)
6783 6787
 
6788
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
6789
+                'delete_allocation_for_instance')
6784 6790
     @mock.patch('nova.compute.manager.ComputeManager.'
6785 6791
                 '_notify_about_instance_usage')
6786
-    def test_finish_resize_notify_failure(self, notify):
6792
+    def test_finish_resize_notify_failure(self, notify, mock_del_allocs):
6787 6793
         self.migration.status = 'post-migrating'
6788 6794
 
6789 6795
         with self._mock_finish_resize():
@@ -6797,6 +6803,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
6797 6803
 
6798 6804
         # Assert that we did not set the migration to an error state
6799 6805
         self.assertEqual('post-migrating', self.migration.status)
6806
+        mock_del_allocs.assert_called_once_with(
6807
+            self.context, self.migration.uuid)
6800 6808
 
6801 6809
     @contextlib.contextmanager
6802 6810
     def _mock_resize_instance(self):

Loading…
Cancel
Save