
Delete allocations even if _confirm_resize raises

When we are confirming a resize, the guest is on the dest
host and the instance host/node values in the database
are pointing at the dest host, so the _confirm_resize method
on the source is really best effort. If something fails, we
should not leak allocations in placement for the source compute
node resource provider since the instance is not actually
consuming the source node provider resources.

This change refactors the error handling around the _confirm_resize
call: the outer _error_out_instance_on_exception context manager is
moved up into confirm_resize, and a try/finally is added around
_confirm_resize so we always attempt to clean up the allocations
even if _confirm_resize fails in some obscure way. If _confirm_resize
does fail, the error is re-raised after logging a traceback and a
hint about how to correct the instance state in the DB by hard
rebooting the server on the dest host.
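
The guarantee this leans on is plain Python control flow: re-raising
from an except handler (which is what oslo.utils'
excutils.save_and_reraise_exception does when its context exits) still
runs the associated finally block before the exception propagates, so
the allocation cleanup cannot be skipped. A minimal, self-contained
sketch of that flow (illustrative only, not nova code):

    import logging

    logging.basicConfig(level=logging.INFO)
    LOG = logging.getLogger(__name__)

    def confirm(fail=False):
        cleaned_up = False
        try:
            try:
                if fail:
                    raise RuntimeError('source host cleanup failed')
            except Exception:
                # Stands in for excutils.save_and_reraise_exception:
                # log the traceback, then let the error propagate.
                LOG.exception('confirm failed; allocations still removed')
                raise
            finally:
                # Runs whether or not the block above raised.
                cleaned_up = True
                LOG.info('deleted source allocations')
        except RuntimeError:
            pass
        return cleaned_up

    assert confirm(fail=False)
    assert confirm(fail=True)  # cleanup ran despite the failure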

Change-Id: I29c5f491ec20a71283190a1599e7732541de736f
Closes-Bug: #1821594
changes/66/647566/4
Matt Riedemann committed 5 months ago
commit 03a6d26691
2 changed files with 83 additions and 53 deletions:
  nova/compute/manager.py                       +68 -48
  nova/tests/unit/compute/test_compute_mgr.py   +15 -5

nova/compute/manager.py (+68, -48)

@@ -3973,7 +3973,29 @@ class ComputeManager(manager.Manager):
                          instance=instance)
                 return
 
-            self._confirm_resize(context, instance, migration=migration)
+            with self._error_out_instance_on_exception(context, instance):
+                try:
+                    self._confirm_resize(
+                        context, instance, migration=migration)
+                except Exception:
+                    # Something failed when cleaning up the source host so
+                    # log a traceback and leave a hint about hard rebooting
+                    # the server to correct its state in the DB.
+                    with excutils.save_and_reraise_exception(logger=LOG):
+                        LOG.exception(
+                            'Confirm resize failed on source host %s. '
+                            'Resource allocations in the placement service '
+                            'will be removed regardless because the instance '
+                            'is now on the destination host %s. You can try '
+                            'hard rebooting the instance to correct its '
+                            'state.', self.host, migration.dest_compute,
+                            instance=instance)
+                finally:
+                    # Whether an error occurred or not, at this point the
+                    # instance is on the dest host so to avoid leaking
+                    # allocations in placement, delete them here.
+                    self._delete_allocation_after_move(
+                        context, instance, migration)
 
         do_confirm_resize(context, instance, migration.id)
 
@@ -3985,59 +4007,57 @@ class ComputeManager(manager.Manager):
             self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
             phase=fields.NotificationPhase.START)
 
-        with self._error_out_instance_on_exception(context, instance):
-            # NOTE(danms): delete stashed migration information
-            old_instance_type = instance.old_flavor
-            instance.old_flavor = None
-            instance.new_flavor = None
-            instance.system_metadata.pop('old_vm_state', None)
-            instance.save()
-
-            # NOTE(tr3buchet): tear down networks on source host
-            self.network_api.setup_networks_on_host(context, instance,
-                               migration.source_compute, teardown=True)
+        # NOTE(danms): delete stashed migration information
+        old_instance_type = instance.old_flavor
+        instance.old_flavor = None
+        instance.new_flavor = None
+        instance.system_metadata.pop('old_vm_state', None)
+        instance.save()
 
-            network_info = self.network_api.get_instance_nw_info(context,
-                                                                 instance)
-            # TODO(mriedem): Get BDMs here and pass them to the driver.
-            self.driver.confirm_migration(context, migration, instance,
-                                          network_info)
+        # NOTE(tr3buchet): tear down networks on source host
+        self.network_api.setup_networks_on_host(context, instance,
+                           migration.source_compute, teardown=True)
 
-            migration.status = 'confirmed'
-            migration.save()
+        network_info = self.network_api.get_instance_nw_info(context,
+                                                             instance)
+        # TODO(mriedem): Get BDMs here and pass them to the driver.
+        self.driver.confirm_migration(context, migration, instance,
+                                      network_info)
 
-            self.rt.drop_move_claim(context, instance, migration.source_node,
-                                    old_instance_type, prefix='old_')
-            self._delete_allocation_after_move(context, instance, migration)
-            instance.drop_migration_context()
+        migration.status = 'confirmed'
+        migration.save()
 
-            # NOTE(mriedem): The old_vm_state could be STOPPED but the user
-            # might have manually powered up the instance to confirm the
-            # resize/migrate, so we need to check the current power state
-            # on the instance and set the vm_state appropriately. We default
-            # to ACTIVE because if the power state is not SHUTDOWN, we
-            # assume _sync_instance_power_state will clean it up.
-            p_state = instance.power_state
-            vm_state = None
-            if p_state == power_state.SHUTDOWN:
-                vm_state = vm_states.STOPPED
-                LOG.debug("Resized/migrated instance is powered off. "
-                          "Setting vm_state to '%s'.", vm_state,
-                          instance=instance)
-            else:
-                vm_state = vm_states.ACTIVE
+        self.rt.drop_move_claim(context, instance, migration.source_node,
+                                old_instance_type, prefix='old_')
+        instance.drop_migration_context()
+
+        # NOTE(mriedem): The old_vm_state could be STOPPED but the user
+        # might have manually powered up the instance to confirm the
+        # resize/migrate, so we need to check the current power state
+        # on the instance and set the vm_state appropriately. We default
+        # to ACTIVE because if the power state is not SHUTDOWN, we
+        # assume _sync_instance_power_state will clean it up.
+        p_state = instance.power_state
+        vm_state = None
+        if p_state == power_state.SHUTDOWN:
+            vm_state = vm_states.STOPPED
+            LOG.debug("Resized/migrated instance is powered off. "
+                      "Setting vm_state to '%s'.", vm_state,
+                      instance=instance)
+        else:
+            vm_state = vm_states.ACTIVE
 
-            instance.vm_state = vm_state
-            instance.task_state = None
-            instance.save(expected_task_state=[None, task_states.DELETING,
-                                               task_states.SOFT_DELETING])
+        instance.vm_state = vm_state
+        instance.task_state = None
+        instance.save(expected_task_state=[None, task_states.DELETING,
+                                           task_states.SOFT_DELETING])
 
-            self._notify_about_instance_usage(
-                context, instance, "resize.confirm.end",
-                network_info=network_info)
-            compute_utils.notify_about_instance_action(context, instance,
-                   self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
-                   phase=fields.NotificationPhase.END)
+        self._notify_about_instance_usage(
+            context, instance, "resize.confirm.end",
+            network_info=network_info)
+        compute_utils.notify_about_instance_action(context, instance,
+               self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
+               phase=fields.NotificationPhase.END)
 
     def _delete_allocation_after_move(self, context, instance, migration):
         """Deletes resource allocations held by the migration record against

nova/tests/unit/compute/test_compute_mgr.py (+15, -5)

@@ -7183,6 +7183,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         do_finish_revert_resize()
 
     def test_confirm_resize_deletes_allocations(self):
+        @mock.patch('nova.objects.Instance.get_by_uuid')
+        @mock.patch('nova.objects.Migration.get_by_id')
         @mock.patch.object(self.migration, 'save')
         @mock.patch.object(self.compute, '_notify_about_instance_usage')
         @mock.patch.object(self.compute, 'network_api')
@@ -7192,13 +7194,16 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         @mock.patch.object(self.instance, 'save')
         def do_confirm_resize(mock_save, mock_drop, mock_delete,
                               mock_confirm, mock_nwapi, mock_notify,
-                              mock_mig_save):
+                              mock_mig_save, mock_mig_get, mock_inst_get):
             self._mock_rt()
             self.instance.migration_context = objects.MigrationContext()
             self.migration.source_compute = self.instance['host']
             self.migration.source_node = self.instance['node']
-            self.compute._confirm_resize(self.context, self.instance,
-                                         self.migration)
+            self.migration.status = 'confirming'
+            mock_mig_get.return_value = self.migration
+            mock_inst_get.return_value = self.instance
+            self.compute.confirm_resize(self.context, self.instance,
+                                        self.migration)
             mock_delete.assert_called_once_with(self.context, self.instance,
                                                 self.migration)
             mock_save.assert_called_with(expected_task_state=
@@ -7229,9 +7234,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         with test.nested(
             mock.patch.object(self.compute, 'network_api'),
             mock.patch.object(self.compute.driver, 'confirm_migration',
-                              side_effect=error)
+                              side_effect=error),
+            mock.patch.object(self.compute, '_delete_allocation_after_move')
         ) as (
-            network_api, confirm_migration
+            network_api, confirm_migration, delete_allocation
         ):
             self.assertRaises(exception.HypervisorUnavailable,
                               self.compute.confirm_resize,
@@ -7245,6 +7251,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         self.assertEqual(2, instance_save.call_count)
         # The migration.status should have been saved.
         self.migration.save.assert_called_once_with()
+        # Allocations should always be cleaned up even if cleaning up the
+        # source host fails.
+        delete_allocation.assert_called_once_with(
+            self.context, self.instance, self.migration)
         # Assert other mocks we care less about.
         notify_usage.assert_called_once()
         notify_action.assert_called_once()
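
The failure-path test above follows a common mock pattern: force an
intermediate call to raise, assert the expected exception escapes
confirm_resize, then assert the cleanup mock was still called. A
stripped-down, self-contained version of that pattern (hypothetical
names, not the nova test) looks like:

    from unittest import TestCase, mock

    class Manager:
        def _confirm(self):
            raise NotImplementedError

        def _delete_allocations(self):
            raise NotImplementedError

        def confirm_resize(self):
            try:
                self._confirm()
            finally:
                # Cleanup must run even when _confirm raises.
                self._delete_allocations()

    class ConfirmResizeTest(TestCase):
        def test_allocations_deleted_on_failure(self):
            manager = Manager()
            with mock.patch.object(manager, '_confirm',
                                   side_effect=RuntimeError('boom')), \
                 mock.patch.object(manager, '_delete_allocations') as delete:
                self.assertRaises(RuntimeError, manager.confirm_resize)
            delete.assert_called_once_with()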
