Add functional test for resize crash compute restart revert

During review of change I51673e58fc8d5f051df911630f6d7a928d123a5b there was discussion about the RESIZE_MIGRATING crashed resize cleanup on restart of the compute service and how it may or may not work but is likely missing some things to cleanup like fields set on the instance during prep_resize and resource allocations in placement. This adds a functional test to hit that code and make assertions about what it does and does not cleanup after the crashed resize. Conflicts: nova/tests/functional/integrated_helpers.py Note: file conflict is due to not having patch Iea283322124cb35fc0bc6d25f35548621e8c8c2f in queens, and this is why the test_init_host change is needed as well. Related-Bug: #1836369 Change-Id: I107d842520c088b4859a3b36621ce6bd8e970475 (cherry picked from commit 8db712fe04) (cherry picked from commit c565cab479) (cherry picked from commit f04fc63ffc)
2019-07-11 17:11:32 -04:00 · 2019-07-11 17:11:32 -04:00 · dae6fbfce6
parent 52e68f121e
commit dae6fbfce6
1 changed files with 123 additions and 0 deletions
--- a/nova/tests/functional/compute/test_init_host.py
+++ b/nova/tests/functional/compute/test_init_host.py
@ -0,0 +1,123 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import mock
+import time
+
+from nova import context as nova_context
+from nova import objects
+from nova.tests.functional import test_servers
+from nova.tests.unit.image import fake as fake_image
+
+
+class ComputeManagerInitHostTestCase(test_servers.ProviderUsageBaseTestCase):
+    """Tests various actions performed when the nova-compute service starts."""
+
+    compute_driver = 'fake.MediumFakeDriver'
+
+    def _get_allocations_by_provider_uuid(self, rp_uuid):
+        return self.placement_api.get(
+            '/resource_providers/%s/allocations' % rp_uuid).body['allocations']
+
+    def test_migrate_disk_and_power_off_crash_finish_revert_migration(self):
+        """Tests the scenario that the compute service crashes while the
+        driver's migrate_disk_and_power_off method is running (we could be
+        slow transferring disks or something when it crashed) and on restart
+        of the compute service the driver's finish_revert_migration method
+        is called to cleanup the source host and reset the instnace task_state.
+        """
+        # Start two compute service so we migrate across hosts.
+        for x in range(2):
+            self._start_compute('host%d' % x)
+        # Create a server, it does not matter on which host it lands.
+        name = 'test_migrate_disk_and_power_off_crash_finish_revert_migration'
+        server = self._build_minimal_create_server_request(
+            self.api, name, image_uuid=fake_image.get_valid_image_id(),
+            networks='auto')
+        server = self.api.post_server({'server': server})
+        server = self._wait_for_state_change(self.admin_api, server, 'ACTIVE')
+        # Save the source hostname for assertions later.
+        source_host = server['OS-EXT-SRV-ATTR:host']
+
+        def fake_migrate_disk_and_power_off(*args, **kwargs):
+            # Simulate the source compute service crashing by restarting it.
+            self.restart_compute_service(self.computes[source_host])
+            # We have to keep the method from returning before asserting the
+            # _init_instance restart behavior otherwise resize_instance will
+            # fail and set the instance to ERROR status, revert allocations,
+            # etc which is not realistic if the service actually crashed while
+            # migrate_disk_and_power_off was running.
+            time.sleep(30)
+
+        source_driver = self.computes[source_host].manager.driver
+        with mock.patch.object(source_driver, 'migrate_disk_and_power_off',
+                               side_effect=fake_migrate_disk_and_power_off):
+            # Initiate a cold migration from the source host.
+            self.admin_api.post_server_action(server['id'], {'migrate': None})
+            # Now wait for the task_state to be reset to None during
+            # _init_instance.
+            server = self._wait_for_server_parameter(
+                self.admin_api, server, {
+                    'status': 'ACTIVE',
+                    'OS-EXT-STS:task_state': None,
+                    'OS-EXT-SRV-ATTR:host': source_host
+                }
+            )
+
+        # Assert we went through the _init_instance processing we expect.
+        log_out = self.stdlog.logger.output
+        self.assertIn('Instance found in migrating state during startup. '
+                      'Resetting task_state', log_out)
+        # Assert that driver.finish_revert_migration did not raise an error.
+        self.assertNotIn('Failed to revert crashed migration', log_out)
+
+        # The migration status should be "error" rather than stuck as
+        # "migrating".
+        context = nova_context.get_admin_context()
+        # FIXME(mriedem): This is bug 1836369 because we would normally expect
+        # Migration.get_by_instance_and_status to raise
+        # MigrationNotFoundByStatus since the status should be "error".
+        objects.Migration.get_by_instance_and_status(
+            context, server['id'], 'migrating')
+
+        # Assert things related to the resize get cleaned up:
+        # - things set on the instance during prep_resize like:
+        #   - migration_context
+        #   - new_flavor
+        #   - stashed old_vm_state in system_metadata
+        # - migration-based allocations from conductor/scheduler, i.e. that the
+        #   allocations created by the scheduler for the instance and dest host
+        #   are gone and the source host allocations are back on the instance
+        #   rather than the migration record
+        instance = objects.Instance.get_by_uuid(
+            context, server['id'], expected_attrs=[
+                'migration_context', 'flavor', 'system_metadata'
+            ])
+        # FIXME(mriedem): Leaving these fields set on the instance is
+        # bug 1836369.
+        self.assertIsNotNone(instance.migration_context)
+        self.assertIsNotNone(instance.new_flavor)
+        self.assertEqual('active', instance.system_metadata['old_vm_state'])
+
+        dest_host = 'host0' if source_host == 'host1' else 'host1'
+        dest_rp_uuid = self._get_provider_uuid_by_host(dest_host)
+        dest_allocations = self._get_allocations_by_provider_uuid(dest_rp_uuid)
+        # FIXME(mriedem): This is bug 1836369 because we orphaned the
+        # allocations created by the scheduler for the server on the dest host.
+        self.assertIn(server['id'], dest_allocations)
+        source_rp_uuid = self._get_provider_uuid_by_host(source_host)
+        source_allocations = self._get_allocations_by_provider_uuid(
+            source_rp_uuid)
+        # FIXME(mriedem): This is bug 1836369 because the server is running on
+        # the source host but is not tracking allocations against the source
+        # host.
+        self.assertNotIn(server['id'], source_allocations)