From 26e1d9c7237f7bd97ec5f1fd3e572b3927eea725 Mon Sep 17 00:00:00 2001
From: Matt Riedemann
Date: Wed, 30 Oct 2019 12:11:43 -0400
Subject: [PATCH] Reset vm_state to original value if rebuild claim fails

If while evacuating an active or stopped server the rebuild resource
claim or group affinity policy check fails, the state of the server
has not actually changed but the vm_state is changed to ERROR because
of the _error_out_instance_on_exception context manager.

This builds on Ie4f9177f4d54cbc7dbcf58bd107fd5f24c60d8bb by wrapping
the BuildAbortException in InstanceFaultRollback for the claim/group
policy failures so the vm_state remains unchanged.

Note that the overall instance action record will still be marked as a
failure since the BuildAbortException is re-raised and the
wrap_instance_event decorator will fail the action (this is how the
user can know the operation failed).

Change-Id: I07fa46690d8f7b846665bc59c5e361873154382b
Closes-Bug: #1784983
---
 nova/compute/manager.py                     | 16 ++++++++++---
 nova/tests/functional/test_servers.py       |  9 +++++---
 nova/tests/unit/compute/test_compute_mgr.py | 25 +++++++++++++++++++++
 3 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 81c7ee51b0ae..a7b2f483cd34 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -3335,7 +3335,12 @@ class ComputeManager(manager.Manager):
         allocs = self.reportclient.get_allocations_for_consumer(
             context, instance.uuid)
 
-        with self._error_out_instance_on_exception(context, instance):
+        # If the resource claim or group policy validation fails before we
+        # do anything to the guest or its networking/volumes we want to keep
+        # the current status rather than put the instance into ERROR status.
+        instance_state = instance.vm_state
+        with self._error_out_instance_on_exception(
+                context, instance, instance_state=instance_state):
             try:
                 self._do_rebuild_instance_with_claim(
                     context, instance, orig_image_ref,
@@ -3369,8 +3374,13 @@
                 self.rt.delete_allocation_for_evacuated_instance(
                     context, instance, scheduled_node, node_type='destination')
                 self._notify_instance_rebuild_error(context, instance, e, bdms)
-                raise exception.BuildAbortException(
-                    instance_uuid=instance.uuid, reason=e.format_message())
+                # Wrap this in InstanceFaultRollback so that the
+                # _error_out_instance_on_exception context manager keeps the
+                # vm_state unchanged.
+                raise exception.InstanceFaultRollback(
+                    inner_exception=exception.BuildAbortException(
+                        instance_uuid=instance.uuid,
+                        reason=e.format_message()))
             except (exception.InstanceNotFound,
                     exception.UnexpectedDeletingTaskStateError) as e:
                 LOG.debug('Instance was deleted while rebuilding',
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index c6a9fdb59ece..54f7df2bd995 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -2503,8 +2503,10 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
 
         # evacuate the server
         self.api.post_server_action(server['id'], {'evacuate': {}})
         # the migration will fail on the dest node and the instance will
-        # go into error state
-        server = self._wait_for_state_change(self.api, server, 'ERROR')
+        # stay ACTIVE and task_state will be set to None.
+        server = self._wait_for_server_parameter(
+            self.api, server, {'status': 'ACTIVE',
+                               'OS-EXT-STS:task_state': None})
         # Run the periodics to show those don't modify allocations.
         self._run_periodics()
@@ -7204,7 +7206,8 @@ class ServerMoveWithPortResourceRequestTest(
 
         server = self._wait_for_server_parameter(
             self.api, server,
             {'OS-EXT-SRV-ATTR:host': 'host1',
-             'status': 'ERROR'})
+             'status': 'ACTIVE',
+             'OS-EXT-STS:task_state': None})
 
         self._wait_for_migration_status(server, ['failed'])
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 348169e7bd43..18b2f736090d 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -5007,6 +5007,28 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         delete_alloc.assert_called_once_with(self.context, instance, 'foo',
                                              node_type='destination')
 
+    @mock.patch('nova.objects.Instance.save')
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
+                'delete_allocation_for_evacuated_instance')
+    def test_rebuild_compute_resources_unavailable(self, mock_delete_alloc,
+                                                   mock_add_fault,
+                                                   mock_save):
+        """Tests that when the rebuild_claim fails with
+        ComputeResourcesUnavailable the vm_state on the instance remains
+        unchanged.
+        """
+        instance = fake_instance.fake_instance_obj(self.context)
+        instance.vm_state = vm_states.ACTIVE
+        ex = exception.ComputeResourcesUnavailable(reason='out of foo')
+        self.assertRaises(exception.BuildAbortException,
+                          self._test_rebuild_ex, instance, ex)
+        # Make sure the instance vm_state did not change.
+        self.assertEqual(vm_states.ACTIVE, instance.vm_state)
+        mock_delete_alloc.assert_called_once()
+        mock_save.assert_called()
+        mock_add_fault.assert_called_once()
+
     @mock.patch('nova.context.RequestContext.elevated')
     @mock.patch('nova.objects.instance.Instance.drop_migration_context')
     @mock.patch('nova.objects.instance.Instance.apply_migration_context')
@@ -5071,6 +5093,7 @@
         instance = fake_instance.fake_instance_obj(self.context)
         instance.info_cache = None
         instance.system_metadata = {}
+        instance.vm_state = vm_states.ACTIVE
         elevated_context = mock.Mock()
         mock_context_elevated.return_value = elevated_context
         request_spec = objects.RequestSpec()
@@ -5094,6 +5117,8 @@
         mock_notify.assert_called_once_with(
             elevated_context, instance, 'fake-mini', bdms=None,
             exception=exc, phase='error', tb=mock.ANY)
+        # Make sure the instance vm_state did not change.
+        self.assertEqual(vm_states.ACTIVE, instance.vm_state)
 
     def test_rebuild_node_not_updated_if_not_recreate(self):
         node = uuidutils.generate_uuid()  # ironic node uuid
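
Note for readers unfamiliar with the mechanism this patch leans on:
_error_out_instance_on_exception special-cases InstanceFaultRollback by
resetting the instance to the vm_state passed in as instance_state and
then re-raising the wrapped inner exception, while any other exception
still sends the instance to ERROR. The snippet below is a minimal,
self-contained sketch of that pattern; the names mirror Nova's, but the
bodies are simplified stand-ins, not Nova's actual implementation.

    import contextlib

    class BuildAbortException(Exception):
        """Simplified stand-in for nova.exception.BuildAbortException."""

    class InstanceFaultRollback(Exception):
        """Stand-in for nova.exception.InstanceFaultRollback: carries an
        inner exception and signals 'record the fault, but roll the
        instance back to its pre-operation vm_state'."""
        def __init__(self, inner_exception):
            super().__init__(str(inner_exception))
            self.inner_exception = inner_exception

    @contextlib.contextmanager
    def error_out_instance_on_exception(instance, instance_state='active'):
        """Sketch of ComputeManager._error_out_instance_on_exception."""
        try:
            yield
        except InstanceFaultRollback as error:
            # Restore the saved state and surface the real failure.
            instance['vm_state'] = instance_state
            raise error.inner_exception
        except Exception:
            # Any unwrapped failure still puts the instance into ERROR.
            instance['vm_state'] = 'error'
            raise

    # The evacuate flow of this patch in miniature: the rebuild claim
    # fails, BuildAbortException is wrapped, and vm_state stays 'active'.
    instance = {'vm_state': 'active'}
    try:
        with error_out_instance_on_exception(
                instance, instance_state=instance['vm_state']):
            raise InstanceFaultRollback(
                inner_exception=BuildAbortException('rebuild claim failed'))
    except BuildAbortException:
        pass
    assert instance['vm_state'] == 'active'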