Delete allocations even if _confirm_resize raises

When we are confirming a resize, the guest is on the dest
host and the instance host/node values in the database
are pointing at the dest host, so the _confirm_resize method
on the source is really best effort. If something fails, we
should not leak allocations in placement for the source compute
node resource provider since the instance is not actually
consuming the source node provider resources.

This change refactors the error handling around the _confirm_resize
call: the big nesting under _error_out_instance_on_exception is moved
up into confirm_resize, and a try/finally is added around
_confirm_resize so we are sure to try to clean up the allocations
even if _confirm_resize fails in some obscure way. If _confirm_resize
does fail, the error is re-raised along with a logged traceback and a
hint about how to correct the instance state in the DB by hard
rebooting the server on the dest host.

Change-Id: I29c5f491ec20a71283190a1599e7732541de736f
Closes-Bug: #1821594
(cherry picked from commit 03a6d26691)
(cherry picked from commit 5f515060f6)
(cherry picked from commit 37ac54a42e)
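
For readers skimming the change, here is a minimal standalone sketch of the
try/except/finally shape described above. It is plain Python with hypothetical
helper names (confirm_on_source, delete_source_allocations); the real code
lives in ComputeManager.confirm_resize below and uses oslo's
excutils.save_and_reraise_exception rather than a bare re-raise.

    import logging

    LOG = logging.getLogger(__name__)


    def confirm_resize(confirm_on_source, delete_source_allocations):
        """Confirm a resize, always cleaning up source-node allocations."""
        try:
            # Best-effort cleanup of the source host; the guest is already
            # running on the destination host at this point.
            confirm_on_source()
        except Exception:
            # Log a traceback and a hint, then re-raise so the failure is
            # still surfaced to the caller.
            LOG.exception('Confirm resize failed on the source host; '
                          'allocations will be removed anyway. Try hard '
                          'rebooting the server on the dest host to correct '
                          'its state.')
            raise
        finally:
            # Runs whether or not confirm_on_source() raised, so placement
            # allocations against the source node are never leaked.
            delete_source_allocations()
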
Matt Riedemann, 5 months ago · parent commit b8435d6001 · tag 17.0.11
2 changed files with 89 additions and 57 deletions:
  1. nova/compute/manager.py (+72, -52)
  2. nova/tests/unit/compute/test_compute_mgr.py (+17, -5)

nova/compute/manager.py (+72, -52)

@@ -3746,7 +3746,31 @@ class ComputeManager(manager.Manager):
                          instance=instance)
                 return

-            self._confirm_resize(context, instance, migration=migration)
+            with self._error_out_instance_on_exception(context, instance):
+                old_instance_type = instance.old_flavor
+                try:
+                    self._confirm_resize(
+                        context, instance, migration=migration)
+                except Exception:
+                    # Something failed when cleaning up the source host so
+                    # log a traceback and leave a hint about hard rebooting
+                    # the server to correct its state in the DB.
+                    with excutils.save_and_reraise_exception(logger=LOG):
+                        LOG.exception(
+                            'Confirm resize failed on source host %s. '
+                            'Resource allocations in the placement service '
+                            'will be removed regardless because the instance '
+                            'is now on the destination host %s. You can try '
+                            'hard rebooting the instance to correct its '
+                            'state.', self.host, migration.dest_compute,
+                            instance=instance)
+                finally:
+                    # Whether an error occurred or not, at this point the
+                    # instance is on the dest host so to avoid leaking
+                    # allocations in placement, delete them here.
+                    self._delete_allocation_after_move(
+                        context, instance, migration, old_instance_type,
+                        migration.source_node)

         do_confirm_resize(context, instance, migration.id)

@@ -3758,63 +3782,59 @@ class ComputeManager(manager.Manager):
             self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
             phase=fields.NotificationPhase.START)

-        with self._error_out_instance_on_exception(context, instance):
-            # NOTE(danms): delete stashed migration information
-            old_instance_type = instance.old_flavor
-            instance.old_flavor = None
-            instance.new_flavor = None
-            instance.system_metadata.pop('old_vm_state', None)
-            instance.save()
-
-            # NOTE(tr3buchet): tear down networks on source host
-            self.network_api.setup_networks_on_host(context, instance,
-                               migration.source_compute, teardown=True)
+        # NOTE(danms): delete stashed migration information
+        old_instance_type = instance.old_flavor
+        instance.old_flavor = None
+        instance.new_flavor = None
+        instance.system_metadata.pop('old_vm_state', None)
+        instance.save()

-            network_info = self.network_api.get_instance_nw_info(context,
-                                                                 instance)
-            # TODO(mriedem): Get BDMs here and pass them to the driver.
-            self.driver.confirm_migration(context, migration, instance,
-                                          network_info)
+        # NOTE(tr3buchet): tear down networks on source host
+        self.network_api.setup_networks_on_host(context, instance,
+                           migration.source_compute, teardown=True)

-            migration.status = 'confirmed'
-            with migration.obj_as_admin():
-                migration.save()
+        network_info = self.network_api.get_instance_nw_info(context,
+                                                             instance)
+        # TODO(mriedem): Get BDMs here and pass them to the driver.
+        self.driver.confirm_migration(context, migration, instance,
+                                      network_info)

-            rt = self._get_resource_tracker()
-            rt.drop_move_claim(context, instance, migration.source_node,
-                               old_instance_type, prefix='old_')
-            self._delete_allocation_after_move(context, instance, migration,
-                                               old_instance_type,
-                                               migration.source_node)
-            instance.drop_migration_context()
+        migration.status = 'confirmed'
+        with migration.obj_as_admin():
+            migration.save()

-            # NOTE(mriedem): The old_vm_state could be STOPPED but the user
-            # might have manually powered up the instance to confirm the
-            # resize/migrate, so we need to check the current power state
-            # on the instance and set the vm_state appropriately. We default
-            # to ACTIVE because if the power state is not SHUTDOWN, we
-            # assume _sync_instance_power_state will clean it up.
-            p_state = instance.power_state
-            vm_state = None
-            if p_state == power_state.SHUTDOWN:
-                vm_state = vm_states.STOPPED
-                LOG.debug("Resized/migrated instance is powered off. "
-                          "Setting vm_state to '%s'.", vm_state,
-                          instance=instance)
-            else:
-                vm_state = vm_states.ACTIVE
+        rt = self._get_resource_tracker()
+        rt.drop_move_claim(context, instance, migration.source_node,
+                           old_instance_type, prefix='old_')
+        instance.drop_migration_context()
+
+        # NOTE(mriedem): The old_vm_state could be STOPPED but the user
+        # might have manually powered up the instance to confirm the
+        # resize/migrate, so we need to check the current power state
+        # on the instance and set the vm_state appropriately. We default
+        # to ACTIVE because if the power state is not SHUTDOWN, we
+        # assume _sync_instance_power_state will clean it up.
+        p_state = instance.power_state
+        vm_state = None
+        if p_state == power_state.SHUTDOWN:
+            vm_state = vm_states.STOPPED
+            LOG.debug("Resized/migrated instance is powered off. "
+                      "Setting vm_state to '%s'.", vm_state,
+                      instance=instance)
+        else:
+            vm_state = vm_states.ACTIVE

-            instance.vm_state = vm_state
-            instance.task_state = None
-            instance.save(expected_task_state=[None, task_states.DELETING,
-                                               task_states.SOFT_DELETING])
+        instance.vm_state = vm_state
+        instance.task_state = None
+        instance.save(expected_task_state=[None, task_states.DELETING,
+                                           task_states.SOFT_DELETING])

-            self._notify_about_instance_usage(
-                context, instance, "resize.confirm.end",
-                network_info=network_info)
-            compute_utils.notify_about_instance_action(context, instance,
-                   self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
-                   phase=fields.NotificationPhase.END)
+        self._notify_about_instance_usage(
+            context, instance, "resize.confirm.end",
+            network_info=network_info)
+        compute_utils.notify_about_instance_action(context, instance,
+               self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
+               phase=fields.NotificationPhase.END)

     def _delete_allocation_after_move(self, context, instance, migration,
                                       flavor, nodename):

nova/tests/unit/compute/test_compute_mgr.py (+17, -5)

@@ -6264,6 +6264,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
             dest_node=None,
             dest_host=None,
             source_compute='source_compute',
+            source_node='source_node',
             status='migrating')
         self.migration.save = mock.MagicMock()
         self.useFixture(fixtures.SpawnIsSynchronousFixture())
@@ -6617,6 +6618,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
         do_finish_revert_resize()

     def test_confirm_resize_deletes_allocations(self):
+        @mock.patch('nova.objects.Instance.get_by_uuid')
+        @mock.patch('nova.objects.Migration.get_by_id')
         @mock.patch.object(self.migration, 'save')
         @mock.patch.object(self.compute, '_notify_about_instance_usage')
         @mock.patch.object(self.compute, 'network_api')
@@ -6627,12 +6630,15 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
         @mock.patch.object(self.instance, 'save')
         def do_confirm_resize(mock_save, mock_drop, mock_delete, mock_get_rt,
                               mock_confirm, mock_nwapi, mock_notify,
-                              mock_mig_save):
+                              mock_mig_save, mock_mig_get, mock_inst_get):
             self.instance.migration_context = objects.MigrationContext()
             self.migration.source_compute = self.instance['host']
             self.migration.source_node = self.instance['node']
-            self.compute._confirm_resize(self.context, self.instance,
-                                         self.migration)
+            self.migration.status = 'confirming'
+            mock_mig_get.return_value = self.migration
+            mock_inst_get.return_value = self.instance
+            self.compute.confirm_resize(self.context, self.instance,
+                                        self.migration)
             mock_delete.assert_called_once_with(self.context, self.instance,
                                                 self.migration,
                                                 self.instance.old_flavor,
@@ -6688,9 +6694,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
         with test.nested(
             mock.patch.object(self.compute, 'network_api'),
             mock.patch.object(self.compute.driver, 'confirm_migration',
-                              side_effect=error)
+                              side_effect=error),
+            mock.patch.object(self.compute, '_delete_allocation_after_move')
         ) as (
-            network_api, confirm_migration
+            network_api, confirm_migration, delete_allocation
         ):
             self.assertRaises(exception.HypervisorUnavailable,
                               self.compute.confirm_resize,
@@ -6704,6 +6711,11 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
         self.assertEqual(2, instance_save.call_count)
         # The migration.status should have been saved.
         self.migration.save.assert_called_once_with()
+        # Allocations should always be cleaned up even if cleaning up the
+        # source host fails.
+        delete_allocation.assert_called_once_with(
+            self.context, self.instance, self.migration,
+            self.instance.old_flavor, self.migration.source_node)
         # Assert other mocks we care less about.
         notify_usage.assert_called_once()
         notify_action.assert_called_once()
