Reset vm_state to original value if rebuild claim fails

If the rebuild resource claim or the group affinity policy check
fails while evacuating an active or stopped server, the guest
itself has not actually changed, yet the vm_state is set to ERROR
by the _error_out_instance_on_exception context manager.

This builds on Ie4f9177f4d54cbc7dbcf58bd107fd5f24c60d8bb by
wrapping the BuildAbortException in InstanceFaultRollback for the
claim/group policy failures so that the vm_state remains unchanged.
Note that the overall instance action record is still marked as a
failure, since the BuildAbortException is re-raised and the
wrap_instance_event decorator fails the action; this is how the
user can tell the operation failed.

Change-Id: I07fa46690d8f7b846665bc59c5e361873154382b
Closes-Bug: #1784983
Matt Riedemann 2019-10-30 12:11:43 -04:00
parent 8f341eb4a4
commit 26e1d9c723
3 changed files with 44 additions and 6 deletions
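
To make the mechanism concrete, here is a minimal sketch of the
pattern, assuming simplified stand-ins (a plain dict for the
instance and a trimmed-down context manager) rather than Nova's
actual objects: the caller captures vm_state before starting the
operation, and the context manager restores it when the error
arrives wrapped in InstanceFaultRollback.

    import contextlib

    class InstanceFaultRollback(Exception):
        """Carries a real error while signaling 'keep the instance state'."""
        def __init__(self, inner_exception):
            super().__init__(str(inner_exception))
            self.inner_exception = inner_exception

    @contextlib.contextmanager
    def error_out_instance_on_exception(instance, instance_state='active'):
        try:
            yield
        except InstanceFaultRollback as error:
            # The operation failed before the guest was touched, so put
            # back the vm_state the caller captured up front.
            instance['vm_state'] = instance_state
            instance['task_state'] = None
            # Re-raise the wrapped error so callers (and the instance
            # action record) still see the failure.
            raise error.inner_exception
        except Exception:
            # Anything else may have disturbed the guest: error it out.
            instance['vm_state'] = 'error'
            raise

Under this shape a failed claim raises InstanceFaultRollback
wrapping a BuildAbortException; the caller still sees the
BuildAbortException, but vm_state stays at its captured value
instead of flipping to ERROR.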

nova/compute/manager.py

@@ -3335,7 +3335,12 @@ class ComputeManager(manager.Manager):
         allocs = self.reportclient.get_allocations_for_consumer(
             context, instance.uuid)
 
-        with self._error_out_instance_on_exception(context, instance):
+        # If the resource claim or group policy validation fails before we
+        # do anything to the guest or its networking/volumes we want to keep
+        # the current status rather than put the instance into ERROR status.
+        instance_state = instance.vm_state
+        with self._error_out_instance_on_exception(
+                context, instance, instance_state=instance_state):
             try:
                 self._do_rebuild_instance_with_claim(
                     context, instance, orig_image_ref,
@@ -3369,8 +3374,13 @@ class ComputeManager(manager.Manager):
                 self.rt.delete_allocation_for_evacuated_instance(
                     context, instance, scheduled_node, node_type='destination')
                 self._notify_instance_rebuild_error(context, instance, e, bdms)
-                raise exception.BuildAbortException(
-                    instance_uuid=instance.uuid, reason=e.format_message())
+                # Wrap this in InstanceFaultRollback so that the
+                # _error_out_instance_on_exception context manager keeps the
+                # vm_state unchanged.
+                raise exception.InstanceFaultRollback(
+                    inner_exception=exception.BuildAbortException(
+                        instance_uuid=instance.uuid,
+                        reason=e.format_message()))
             except (exception.InstanceNotFound,
                     exception.UnexpectedDeletingTaskStateError) as e:
                 LOG.debug('Instance was deleted while rebuilding',

nova/tests/functional/test_servers.py

@@ -2503,8 +2503,10 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
         # evacuate the server
         self.api.post_server_action(server['id'], {'evacuate': {}})
         # the migration will fail on the dest node and the instance will
-        # go into error state
-        server = self._wait_for_state_change(self.api, server, 'ERROR')
+        # stay ACTIVE and task_state will be set to None.
+        server = self._wait_for_server_parameter(
+            self.api, server, {'status': 'ACTIVE',
+                               'OS-EXT-STS:task_state': None})
 
         # Run the periodics to show those don't modify allocations.
         self._run_periodics()
@@ -7204,7 +7206,8 @@ class ServerMoveWithPortResourceRequestTest(
         server = self._wait_for_server_parameter(
             self.api, server,
             {'OS-EXT-SRV-ATTR:host': 'host1',
-             'status': 'ERROR'})
+             'status': 'ACTIVE',
+             'OS-EXT-STS:task_state': None})
 
         self._wait_for_migration_status(server, ['failed'])
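
Both functional tests now expect the server to settle back to
ACTIVE with its task_state cleared instead of going to ERROR. They
do so via a polling helper; as a rough sketch of the idea (not
Nova's actual test helper), assuming a hypothetical get_server
callable that returns the API's JSON view of the server:

    import time

    def wait_for_server_parameter(get_server, server_id, expected,
                                  timeout=30.0, interval=1.0):
        """Poll a server until every expected field matches, or time out."""
        deadline = time.time() + timeout
        while True:
            server = get_server(server_id)
            if all(server.get(key) == value
                   for key, value in expected.items()):
                return server
            if time.time() > deadline:
                raise AssertionError(
                    'server %s never reached %s; last seen: %s'
                    % (server_id, expected, server))
            time.sleep(interval)

The tests above would then pass {'status': 'ACTIVE',
'OS-EXT-STS:task_state': None}, which only matches once the compute
manager has restored the original vm_state and cleared the task
state.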

nova/tests/unit/compute/test_compute_mgr.py

@@ -5007,6 +5007,28 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         delete_alloc.assert_called_once_with(self.context, instance, 'foo',
                                              node_type='destination')
 
+    @mock.patch('nova.objects.Instance.save')
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
+                'delete_allocation_for_evacuated_instance')
+    def test_rebuild_compute_resources_unavailable(self, mock_delete_alloc,
+                                                   mock_add_fault,
+                                                   mock_save):
+        """Tests that when the rebuild_claim fails with
+        ComputeResourcesUnavailable the vm_state on the instance remains
+        unchanged.
+        """
+        instance = fake_instance.fake_instance_obj(self.context)
+        instance.vm_state = vm_states.ACTIVE
+        ex = exception.ComputeResourcesUnavailable(reason='out of foo')
+        self.assertRaises(exception.BuildAbortException,
+                          self._test_rebuild_ex, instance, ex)
+        # Make sure the instance vm_state did not change.
+        self.assertEqual(vm_states.ACTIVE, instance.vm_state)
+        mock_delete_alloc.assert_called_once()
+        mock_save.assert_called()
+        mock_add_fault.assert_called_once()
+
     @mock.patch('nova.context.RequestContext.elevated')
     @mock.patch('nova.objects.instance.Instance.drop_migration_context')
     @mock.patch('nova.objects.instance.Instance.apply_migration_context')
@@ -5071,6 +5093,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         instance = fake_instance.fake_instance_obj(self.context)
         instance.info_cache = None
         instance.system_metadata = {}
+        instance.vm_state = vm_states.ACTIVE
         elevated_context = mock.Mock()
         mock_context_elevated.return_value = elevated_context
         request_spec = objects.RequestSpec()
@@ -5094,6 +5117,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         mock_notify.assert_called_once_with(
             elevated_context, instance, 'fake-mini', bdms=None, exception=exc,
             phase='error', tb=mock.ANY)
+        # Make sure the instance vm_state did not change.
+        self.assertEqual(vm_states.ACTIVE, instance.vm_state)
 
     def test_rebuild_node_not_updated_if_not_recreate(self):
         node = uuidutils.generate_uuid()  # ironic node uuid