
Delete allocations even if _confirm_resize raises

When we are confirming a resize, the guest is on the dest
host and the instance host/node values in the database
are pointing at the dest host, so the _confirm_resize method
on the source is really best effort. If something fails, we
should not leak allocations in placement for the source compute
node resource provider since the instance is not actually
consuming the source node provider resources.

This change refactors the error handling around the _confirm_resize
call: the big block nested under _error_out_instance_on_exception is
moved into confirm_resize, and a try/finally is added around
_confirm_resize so we always try to clean up the allocations even if
_confirm_resize fails in some obscure way. If _confirm_resize does
fail, the error is re-raised along with a logged traceback and a hint
that hard rebooting the server on the dest host will correct the
instance state in the DB.
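
The shape of that refactor, in isolation, looks roughly like the
sketch below. It is a condensed illustration only: cleanup_source_host
and delete_allocations are hypothetical stand-ins for _confirm_resize
and _delete_allocation_after_move; the real code is in the diff that
follows.

    import logging

    from oslo_utils import excutils

    LOG = logging.getLogger(__name__)

    def confirm(context, instance, migration):
        # cleanup_source_host / delete_allocations are hypothetical
        # stand-ins, not the real ComputeManager methods.
        try:
            # Best-effort cleanup on the source host; may raise.
            cleanup_source_host(context, instance, migration)
        except Exception:
            # Log the traceback now; save_and_reraise_exception re-raises
            # the original error when this with-block exits, and the
            # finally clause below still runs while it propagates.
            with excutils.save_and_reraise_exception(logger=LOG):
                LOG.exception('Confirm resize failed on the source host; '
                              'allocations will be removed regardless.')
        finally:
            # Runs on success and on failure, so allocations against the
            # source node never leak in placement.
            delete_allocations(context, instance, migration)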

Conflicts:
      nova/compute/manager.py

NOTE(mriedem): The conflict is due to not having change
I49034f87e240775cca7a4d769819dd788b86832a in Stein.

Change-Id: I29c5f491ec20a71283190a1599e7732541de736f
Closes-Bug: #1821594
(cherry picked from commit 03a6d26691)
Tag: 19.0.1
Author: Matt Riedemann
Commit: 5f515060f6
2 changed files with 84 additions and 54 deletions:
  nova/compute/manager.py                      +69  -49
  nova/tests/unit/compute/test_compute_mgr.py  +15  -5

--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py

@@ -3979,7 +3979,29 @@ class ComputeManager(manager.Manager):
                          instance=instance)
                 return
 
-            self._confirm_resize(context, instance, migration=migration)
+            with self._error_out_instance_on_exception(context, instance):
+                try:
+                    self._confirm_resize(
+                        context, instance, migration=migration)
+                except Exception:
+                    # Something failed when cleaning up the source host so
+                    # log a traceback and leave a hint about hard rebooting
+                    # the server to correct its state in the DB.
+                    with excutils.save_and_reraise_exception(logger=LOG):
+                        LOG.exception(
+                            'Confirm resize failed on source host %s. '
+                            'Resource allocations in the placement service '
+                            'will be removed regardless because the instance '
+                            'is now on the destination host %s. You can try '
+                            'hard rebooting the instance to correct its '
+                            'state.', self.host, migration.dest_compute,
+                            instance=instance)
+                finally:
+                    # Whether an error occurred or not, at this point the
+                    # instance is on the dest host so to avoid leaking
+                    # allocations in placement, delete them here.
+                    self._delete_allocation_after_move(
+                        context, instance, migration)
 
         do_confirm_resize(context, instance, migration.id)
 
@@ -3991,60 +4013,58 @@
             self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
             phase=fields.NotificationPhase.START)
 
-        with self._error_out_instance_on_exception(context, instance):
-            # NOTE(danms): delete stashed migration information
-            old_instance_type = instance.old_flavor
-            instance.old_flavor = None
-            instance.new_flavor = None
-            instance.system_metadata.pop('old_vm_state', None)
-            instance.save()
-
-            # NOTE(tr3buchet): tear down networks on source host
-            self.network_api.setup_networks_on_host(context, instance,
-                               migration.source_compute, teardown=True)
+        # NOTE(danms): delete stashed migration information
+        old_instance_type = instance.old_flavor
+        instance.old_flavor = None
+        instance.new_flavor = None
+        instance.system_metadata.pop('old_vm_state', None)
+        instance.save()
 
-            network_info = self.network_api.get_instance_nw_info(context,
-                                                                 instance)
-            # TODO(mriedem): Get BDMs here and pass them to the driver.
-            self.driver.confirm_migration(context, migration, instance,
-                                          network_info)
+        # NOTE(tr3buchet): tear down networks on source host
+        self.network_api.setup_networks_on_host(context, instance,
+                           migration.source_compute, teardown=True)
 
-            migration.status = 'confirmed'
-            with migration.obj_as_admin():
-                migration.save()
+        network_info = self.network_api.get_instance_nw_info(context,
+                                                             instance)
+        # TODO(mriedem): Get BDMs here and pass them to the driver.
+        self.driver.confirm_migration(context, migration, instance,
+                                      network_info)
 
-            self.rt.drop_move_claim(context, instance, migration.source_node,
-                                    old_instance_type, prefix='old_')
-            self._delete_allocation_after_move(context, instance, migration)
-            instance.drop_migration_context()
+        migration.status = 'confirmed'
+        with migration.obj_as_admin():
+            migration.save()
 
-            # NOTE(mriedem): The old_vm_state could be STOPPED but the user
-            # might have manually powered up the instance to confirm the
-            # resize/migrate, so we need to check the current power state
-            # on the instance and set the vm_state appropriately. We default
-            # to ACTIVE because if the power state is not SHUTDOWN, we
-            # assume _sync_instance_power_state will clean it up.
-            p_state = instance.power_state
-            vm_state = None
-            if p_state == power_state.SHUTDOWN:
-                vm_state = vm_states.STOPPED
-                LOG.debug("Resized/migrated instance is powered off. "
-                          "Setting vm_state to '%s'.", vm_state,
-                          instance=instance)
-            else:
-                vm_state = vm_states.ACTIVE
+        self.rt.drop_move_claim(context, instance, migration.source_node,
+                                old_instance_type, prefix='old_')
+        instance.drop_migration_context()
+
+        # NOTE(mriedem): The old_vm_state could be STOPPED but the user
+        # might have manually powered up the instance to confirm the
+        # resize/migrate, so we need to check the current power state
+        # on the instance and set the vm_state appropriately. We default
+        # to ACTIVE because if the power state is not SHUTDOWN, we
+        # assume _sync_instance_power_state will clean it up.
+        p_state = instance.power_state
+        vm_state = None
+        if p_state == power_state.SHUTDOWN:
+            vm_state = vm_states.STOPPED
+            LOG.debug("Resized/migrated instance is powered off. "
+                      "Setting vm_state to '%s'.", vm_state,
+                      instance=instance)
+        else:
+            vm_state = vm_states.ACTIVE
 
-            instance.vm_state = vm_state
-            instance.task_state = None
-            instance.save(expected_task_state=[None, task_states.DELETING,
-                                               task_states.SOFT_DELETING])
+        instance.vm_state = vm_state
+        instance.task_state = None
+        instance.save(expected_task_state=[None, task_states.DELETING,
+                                           task_states.SOFT_DELETING])
 
-            self._notify_about_instance_usage(
-                context, instance, "resize.confirm.end",
-                network_info=network_info)
-            compute_utils.notify_about_instance_action(context, instance,
-                   self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
-                   phase=fields.NotificationPhase.END)
+        self._notify_about_instance_usage(
+            context, instance, "resize.confirm.end",
+            network_info=network_info)
+        compute_utils.notify_about_instance_action(context, instance,
+               self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
+               phase=fields.NotificationPhase.END)
 
     def _delete_allocation_after_move(self, context, instance, migration):
         """Deletes resource allocations held by the migration record against

--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py

@@ -7192,6 +7192,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         do_finish_revert_resize()
 
     def test_confirm_resize_deletes_allocations(self):
+        @mock.patch('nova.objects.Instance.get_by_uuid')
+        @mock.patch('nova.objects.Migration.get_by_id')
         @mock.patch.object(self.migration, 'save')
         @mock.patch.object(self.compute, '_notify_about_instance_usage')
         @mock.patch.object(self.compute, 'network_api')
@@ -7201,13 +7203,16 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         @mock.patch.object(self.instance, 'save')
         def do_confirm_resize(mock_save, mock_drop, mock_delete,
                               mock_confirm, mock_nwapi, mock_notify,
-                              mock_mig_save):
+                              mock_mig_save, mock_mig_get, mock_inst_get):
             self._mock_rt()
             self.instance.migration_context = objects.MigrationContext()
             self.migration.source_compute = self.instance['host']
             self.migration.source_node = self.instance['node']
-            self.compute._confirm_resize(self.context, self.instance,
-                                         self.migration)
+            self.migration.status = 'confirming'
+            mock_mig_get.return_value = self.migration
+            mock_inst_get.return_value = self.instance
+            self.compute.confirm_resize(self.context, self.instance,
+                                        self.migration)
             mock_delete.assert_called_once_with(self.context, self.instance,
                                                 self.migration)
             mock_save.assert_called_with(expected_task_state=
@@ -7238,9 +7243,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         with test.nested(
             mock.patch.object(self.compute, 'network_api'),
             mock.patch.object(self.compute.driver, 'confirm_migration',
-                              side_effect=error)
+                              side_effect=error),
+            mock.patch.object(self.compute, '_delete_allocation_after_move')
         ) as (
-            network_api, confirm_migration
+            network_api, confirm_migration, delete_allocation
         ):
             self.assertRaises(exception.HypervisorUnavailable,
                               self.compute.confirm_resize,
@@ -7254,6 +7260,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         self.assertEqual(2, instance_save.call_count)
         # The migration.status should have been saved.
         self.migration.save.assert_called_once_with()
+        # Allocations should always be cleaned up even if cleaning up the
+        # source host fails.
+        delete_allocation.assert_called_once_with(
+            self.context, self.instance, self.migration)
         # Assert other mocks we care less about.
         notify_usage.assert_called_once()
         notify_action.assert_called_once()
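
A side note on the test change above: stacked mock.patch decorators
inject their mocks bottom-up, which is why the two new patches for
Migration.get_by_id and Instance.get_by_uuid sit at the top of the
stack while mock_mig_get and mock_inst_get are appended at the end of
the do_confirm_resize signature. A minimal standalone illustration
(not part of this change, using arbitrary patch targets):

    from unittest import mock

    @mock.patch('os.remove')        # outermost patch -> last extra argument
    @mock.patch('os.path.exists')   # innermost patch -> first extra argument
    def check(mock_exists, mock_remove):
        # Each patch passes in a MagicMock; argument order follows the
        # decorator stack from the bottom up.
        assert mock_exists is not mock_remove

    check()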
