Browse Source

Error out migration when confirm_resize fails

If anything raises an exception during confirm_resize,
the migration is left stuck in "confirming" status even
though the instance status may be "ERROR".

This change adds the errors_out_migration decorator
to the confirm_resize method to make sure the migration
status is "error" if an error is raised.

In bug 1821594 it was the driver.confirm_migration
method that raised some exception, so a unit test is
added here which simulates a similar scenario.

This only partially closes the bug: allocations are still
leaked on the source node resource provider because
_delete_allocation_after_move is not called. That will be
dealt with in a separate patch.

Change-Id: Ic7d78ad43a2bad7f932c22c98944accbbed9e9e2
Partial-Bug: #1821594
(cherry picked from commit 408ef8f84a)
tags/19.0.1
Matt Riedemann 5 months ago
parent
commit
972d4e0eb3

+ 1
- 0
nova/compute/manager.py View File

@@ -3929,6 +3929,7 @@ class ComputeManager(manager.Manager):
3929 3929
 
3930 3930
     @wrap_exception()
3931 3931
     @wrap_instance_event(prefix='compute')
3932
+    @errors_out_migration
3932 3933
     @wrap_instance_fault
3933 3934
     def confirm_resize(self, context, instance, migration):
3934 3935
         """Confirms a migration/resize and deletes the 'old' instance.

+ 1
- 2
nova/tests/functional/test_servers.py View File

@@ -5147,8 +5147,7 @@ class ConsumerGenerationConflictTest(
5147 5147
         self.assertEqual('migration', migrations[0]['migration_type'])
5148 5148
         self.assertEqual(server['id'], migrations[0]['instance_uuid'])
5149 5149
         self.assertEqual(source_hostname, migrations[0]['source_compute'])
5150
-        # NOTE(gibi): it might be better to mark the migration as error
5151
-        self.assertEqual('confirmed', migrations[0]['status'])
5150
+        self.assertEqual('error', migrations[0]['status'])
5152 5151
 
5153 5152
         # NOTE(gibi): Nova leaks the allocation held by the migration_uuid even
5154 5153
         # after the instance is deleted. At least nova logs a fat ERROR.

+ 48
- 0
nova/tests/unit/compute/test_compute_mgr.py View File

@@ -6833,6 +6833,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
6833 6833
                 expected_attrs=['metadata', 'system_metadata', 'info_cache'])
6834 6834
         self.migration = objects.Migration(
6835 6835
             context=self.context.elevated(),
6836
+            id=1,
6836 6837
             uuid=uuids.migration_uuid,
6837 6838
             instance_uuid=self.instance.uuid,
6838 6839
             new_instance_type_id=7,
@@ -7215,6 +7216,53 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
7215 7216
 
7216 7217
         do_confirm_resize()
7217 7218
 
7219
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
7220
+    @mock.patch('nova.objects.Migration.get_by_id')
7221
+    @mock.patch('nova.objects.Instance.get_by_uuid')
7222
+    @mock.patch('nova.compute.utils.notify_about_instance_usage')
7223
+    @mock.patch('nova.compute.utils.notify_about_instance_action')
7224
+    @mock.patch('nova.objects.Instance.save')
7225
+    def test_confirm_resize_driver_confirm_migration_fails(
7226
+            self, instance_save, notify_action, notify_usage,
7227
+            instance_get_by_uuid, migration_get_by_id, add_fault):
7228
+        """Tests the scenario that driver.confirm_migration raises some error
7229
+        to make sure the error is properly handled, like the instance and
7230
+        migration status is set to 'error'.
7231
+        """
7232
+        self.migration.status = 'confirming'
7233
+        migration_get_by_id.return_value = self.migration
7234
+        instance_get_by_uuid.return_value = self.instance
7235
+
7236
+        error = exception.HypervisorUnavailable(
7237
+            host=self.migration.source_compute)
7238
+        with test.nested(
7239
+            mock.patch.object(self.compute, 'network_api'),
7240
+            mock.patch.object(self.compute.driver, 'confirm_migration',
7241
+                              side_effect=error)
7242
+        ) as (
7243
+            network_api, confirm_migration
7244
+        ):
7245
+            self.assertRaises(exception.HypervisorUnavailable,
7246
+                              self.compute.confirm_resize,
7247
+                              self.context, self.instance, self.migration)
7248
+        # Make sure the instance is in ERROR status.
7249
+        self.assertEqual(vm_states.ERROR, self.instance.vm_state)
7250
+        # Make sure the migration is in error status.
7251
+        self.assertEqual('error', self.migration.status)
7252
+        # Instance.save is called twice, once to clear the resize metadata
7253
+        # and once to set the instance to ERROR status.
7254
+        self.assertEqual(2, instance_save.call_count)
7255
+        # The migration.status should have been saved.
7256
+        self.migration.save.assert_called_once_with()
7257
+        # Assert other mocks we care less about.
7258
+        notify_usage.assert_called_once()
7259
+        notify_action.assert_called_once()
7260
+        add_fault.assert_called_once()
7261
+        confirm_migration.assert_called_once()
7262
+        network_api.setup_networks_on_host.assert_called_once()
7263
+        instance_get_by_uuid.assert_called_once()
7264
+        migration_get_by_id.assert_called_once()
7265
+
7218 7266
     def test_delete_allocation_after_move_confirm_by_migration(self):
7219 7267
         with mock.patch.object(self.compute, 'reportclient') as mock_report:
7220 7268
             mock_report.delete_allocation_for_instance.return_value = True

Loading…
Cancel
Save