
Delete allocations even if _confirm_resize raises

When we are confirming a resize, the guest is on the dest
host and the instance host/node values in the database
are pointing at the dest host, so the _confirm_resize method
on the source is really best effort. If something fails, we
should not leak allocations in placement for the source compute
node resource provider since the instance is not actually
consuming the source node provider resources.

This change refactors the error handling around the _confirm_resize
call: the big nested block for _error_out_instance_on_exception moves
up into confirm_resize, and a try/finally is added around
_confirm_resize so that we always try to clean up the allocations,
even if _confirm_resize fails in some obscure way. If _confirm_resize
does fail, the error is re-raised along with a logged traceback and a
hint about how to correct the instance state in the DB by hard
rebooting the server on the dest host.
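
In shape, the new error handling amounts to the following minimal
sketch (illustrative only, not the nova code itself; cleanup_source_host
and delete_source_allocations are hypothetical stand-ins for
_confirm_resize and _delete_allocation_after_move):

    import logging

    LOG = logging.getLogger(__name__)

    def confirm_resize(cleanup_source_host, delete_source_allocations):
        try:
            cleanup_source_host()
        except Exception:
            # Log a traceback plus a hint for the operator, then let the
            # error propagate (nova wraps this in
            # excutils.save_and_reraise_exception).
            LOG.exception('Confirm resize failed on the source host; '
                          'allocations will be removed anyway. Try hard '
                          'rebooting the instance to correct its state.')
            raise
        finally:
            # Runs whether or not the source-host cleanup raised, so the
            # source node allocations are never leaked in placement.
            delete_source_allocations()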

Conflicts:
      nova/compute/manager.py
      nova/tests/unit/compute/test_compute_mgr.py

NOTE(mriedem): The manager.py conflict is due to not having change
Ibb8c12fb2799bb5ceb9e3d72a2b86dbb4f14451e in Rocky. In addition,
since change I0851e2d54a1fdc82fe3291fb7e286e790f121e92 is not in
Rocky, the _delete_allocation_after_move signature needs to be
handled a bit differently in this backport. The test_compute_mgr.py
conflict and the source_node addition in setUp are due to the same
two changes.

Change-Id: I29c5f491ec20a71283190a1599e7732541de736f
Closes-Bug: #1821594
(cherry picked from commit 03a6d26691)
(cherry picked from commit 5f515060f6)
Tag: 18.2.1
Author: Matt Riedemann
Parent commit: 37ac54a42e

2 changed files with 89 additions and 57 deletions:
  nova/compute/manager.py                      +72  -52
  nova/tests/unit/compute/test_compute_mgr.py  +17   -5

nova/compute/manager.py  (+72, -52)

@@ -3875,7 +3875,31 @@ class ComputeManager(manager.Manager):
                          instance=instance)
                 return
 
-            self._confirm_resize(context, instance, migration=migration)
+            with self._error_out_instance_on_exception(context, instance):
+                old_instance_type = instance.old_flavor
+                try:
+                    self._confirm_resize(
+                        context, instance, migration=migration)
+                except Exception:
+                    # Something failed when cleaning up the source host so
+                    # log a traceback and leave a hint about hard rebooting
+                    # the server to correct its state in the DB.
+                    with excutils.save_and_reraise_exception(logger=LOG):
+                        LOG.exception(
+                            'Confirm resize failed on source host %s. '
+                            'Resource allocations in the placement service '
+                            'will be removed regardless because the instance '
+                            'is now on the destination host %s. You can try '
+                            'hard rebooting the instance to correct its '
+                            'state.', self.host, migration.dest_compute,
+                            instance=instance)
+                finally:
+                    # Whether an error occurred or not, at this point the
+                    # instance is on the dest host so to avoid leaking
+                    # allocations in placement, delete them here.
+                    self._delete_allocation_after_move(
+                        context, instance, migration, old_instance_type,
+                        migration.source_node)
 
         do_confirm_resize(context, instance, migration.id)
 
@@ -3887,63 +3911,59 @@ class ComputeManager(manager.Manager):
                self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
                phase=fields.NotificationPhase.START)
 
-        with self._error_out_instance_on_exception(context, instance):
-            # NOTE(danms): delete stashed migration information
-            old_instance_type = instance.old_flavor
-            instance.old_flavor = None
-            instance.new_flavor = None
-            instance.system_metadata.pop('old_vm_state', None)
-            instance.save()
-
-            # NOTE(tr3buchet): tear down networks on source host
-            self.network_api.setup_networks_on_host(context, instance,
-                               migration.source_compute, teardown=True)
+        # NOTE(danms): delete stashed migration information
+        old_instance_type = instance.old_flavor
+        instance.old_flavor = None
+        instance.new_flavor = None
+        instance.system_metadata.pop('old_vm_state', None)
+        instance.save()
 
-            network_info = self.network_api.get_instance_nw_info(context,
-                                                                 instance)
-            # TODO(mriedem): Get BDMs here and pass them to the driver.
-            self.driver.confirm_migration(context, migration, instance,
-                                          network_info)
+        # NOTE(tr3buchet): tear down networks on source host
+        self.network_api.setup_networks_on_host(context, instance,
+                           migration.source_compute, teardown=True)
 
-            migration.status = 'confirmed'
-            with migration.obj_as_admin():
-                migration.save()
+        network_info = self.network_api.get_instance_nw_info(context,
+                                                             instance)
+        # TODO(mriedem): Get BDMs here and pass them to the driver.
+        self.driver.confirm_migration(context, migration, instance,
+                                      network_info)
 
-            rt = self._get_resource_tracker()
-            rt.drop_move_claim(context, instance, migration.source_node,
-                               old_instance_type, prefix='old_')
-            self._delete_allocation_after_move(context, instance, migration,
-                                               old_instance_type,
-                                               migration.source_node)
-            instance.drop_migration_context()
+        migration.status = 'confirmed'
+        with migration.obj_as_admin():
+            migration.save()
 
-            # NOTE(mriedem): The old_vm_state could be STOPPED but the user
-            # might have manually powered up the instance to confirm the
-            # resize/migrate, so we need to check the current power state
-            # on the instance and set the vm_state appropriately. We default
-            # to ACTIVE because if the power state is not SHUTDOWN, we
-            # assume _sync_instance_power_state will clean it up.
-            p_state = instance.power_state
-            vm_state = None
-            if p_state == power_state.SHUTDOWN:
-                vm_state = vm_states.STOPPED
-                LOG.debug("Resized/migrated instance is powered off. "
-                          "Setting vm_state to '%s'.", vm_state,
-                          instance=instance)
-            else:
-                vm_state = vm_states.ACTIVE
+        rt = self._get_resource_tracker()
+        rt.drop_move_claim(context, instance, migration.source_node,
+                           old_instance_type, prefix='old_')
+        instance.drop_migration_context()
+
+        # NOTE(mriedem): The old_vm_state could be STOPPED but the user
+        # might have manually powered up the instance to confirm the
+        # resize/migrate, so we need to check the current power state
+        # on the instance and set the vm_state appropriately. We default
+        # to ACTIVE because if the power state is not SHUTDOWN, we
+        # assume _sync_instance_power_state will clean it up.
+        p_state = instance.power_state
+        vm_state = None
+        if p_state == power_state.SHUTDOWN:
+            vm_state = vm_states.STOPPED
+            LOG.debug("Resized/migrated instance is powered off. "
+                      "Setting vm_state to '%s'.", vm_state,
+                      instance=instance)
+        else:
+            vm_state = vm_states.ACTIVE
 
-            instance.vm_state = vm_state
-            instance.task_state = None
-            instance.save(expected_task_state=[None, task_states.DELETING,
-                                               task_states.SOFT_DELETING])
+        instance.vm_state = vm_state
+        instance.task_state = None
+        instance.save(expected_task_state=[None, task_states.DELETING,
+                                           task_states.SOFT_DELETING])
 
-            self._notify_about_instance_usage(
-                context, instance, "resize.confirm.end",
-                network_info=network_info)
-            compute_utils.notify_about_instance_action(context, instance,
-                   self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
-                   phase=fields.NotificationPhase.END)
+        self._notify_about_instance_usage(
+            context, instance, "resize.confirm.end",
+            network_info=network_info)
+        compute_utils.notify_about_instance_action(context, instance,
+               self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
+               phase=fields.NotificationPhase.END)
 
     def _delete_allocation_after_move(self, context, instance, migration,
                                       flavor, nodename):
nova/tests/unit/compute/test_compute_mgr.py  (+17, -5)

@@ -6636,6 +6636,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
             dest_node=None,
             dest_host=None,
             source_compute='source_compute',
+            source_node='source_node',
             status='migrating')
         self.migration.save = mock.MagicMock()
         self.useFixture(fixtures.SpawnIsSynchronousFixture())
@@ -6995,6 +6996,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
         do_finish_revert_resize()
 
     def test_confirm_resize_deletes_allocations(self):
+        @mock.patch('nova.objects.Instance.get_by_uuid')
+        @mock.patch('nova.objects.Migration.get_by_id')
         @mock.patch.object(self.migration, 'save')
         @mock.patch.object(self.compute, '_notify_about_instance_usage')
         @mock.patch.object(self.compute, 'network_api')
@@ -7005,12 +7008,15 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
         @mock.patch.object(self.instance, 'save')
         def do_confirm_resize(mock_save, mock_drop, mock_delete, mock_get_rt,
                               mock_confirm, mock_nwapi, mock_notify,
-                              mock_mig_save):
+                              mock_mig_save, mock_mig_get, mock_inst_get):
             self.instance.migration_context = objects.MigrationContext()
             self.migration.source_compute = self.instance['host']
             self.migration.source_node = self.instance['node']
-            self.compute._confirm_resize(self.context, self.instance,
-                                         self.migration)
+            self.migration.status = 'confirming'
+            mock_mig_get.return_value = self.migration
+            mock_inst_get.return_value = self.instance
+            self.compute.confirm_resize(self.context, self.instance,
+                                        self.migration)
             mock_delete.assert_called_once_with(self.context, self.instance,
                                                 self.migration,
                                                 self.instance.old_flavor,
@@ -7066,9 +7072,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
         with test.nested(
             mock.patch.object(self.compute, 'network_api'),
             mock.patch.object(self.compute.driver, 'confirm_migration',
-                              side_effect=error)
+                              side_effect=error),
+            mock.patch.object(self.compute, '_delete_allocation_after_move')
         ) as (
-            network_api, confirm_migration
+            network_api, confirm_migration, delete_allocation
         ):
             self.assertRaises(exception.HypervisorUnavailable,
                               self.compute.confirm_resize,
@@ -7082,6 +7089,11 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
         self.assertEqual(2, instance_save.call_count)
         # The migration.status should have been saved.
         self.migration.save.assert_called_once_with()
+        # Allocations should always be cleaned up even if cleaning up the
+        # source host fails.
+        delete_allocation.assert_called_once_with(
+            self.context, self.instance, self.migration,
+            self.instance.old_flavor, self.migration.source_node)
         # Assert other mocks we care less about.
         notify_usage.assert_called_once()
         notify_action.assert_called_once()
