Add functional test for resize crash compute restart revert

During review of change I51673e58fc8d5f051df911630f6d7a928d123a5b
there was discussion about the cleanup of a resize that crashed in
RESIZE_MIGRATING task_state, performed when the compute service
restarts, and how that cleanup may or may not work; it is likely
missing some things to clean up, like fields set on the instance
during prep_resize and resource allocations in placement.

This adds a functional test to hit that code and make assertions
about what it does and does not clean up after the crashed resize.
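
For context, the startup path under test is the RESIZE_MIGRATING
handling in ComputeManager._init_instance. This is a rough,
paraphrased sketch, not the verbatim nova code (names like net_info
and block_dev_info are simplified here), reconstructed from the log
messages the test asserts on:

    if instance.task_state == task_states.RESIZE_MIGRATING:
        # We crashed during resize/migration, so roll back for safety.
        try:
            # Only power the instance back on if it was not stopped
            # before the crashed resize.
            power_on = (instance.system_metadata.get('old_vm_state') !=
                        vm_states.STOPPED)
            self.driver.finish_revert_migration(
                context, instance, net_info, block_dev_info, power_on)
        except Exception:
            LOG.exception('Failed to revert crashed migration',
                          instance=instance)
        finally:
            LOG.info('Instance found in migrating state during startup. '
                     'Resetting task_state', instance=instance)
            instance.task_state = None
            instance.save()

Note that nothing in this path updates the Migration record status or
adjusts placement allocations, which is what the FIXMEs in the test
below point at.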

Related-Bug: #1836369

Change-Id: I107d842520c088b4859a3b36621ce6bd8e970475
Matt Riedemann 2019-07-11 17:11:32 -04:00
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import time

import mock

from nova import context as nova_context
from nova import objects
from nova.tests.functional import integrated_helpers
from nova.tests.unit.image import fake as fake_image


class ComputeManagerInitHostTestCase(
        integrated_helpers.ProviderUsageBaseTestCase):
    """Tests various actions performed when the nova-compute service starts."""

    compute_driver = 'fake.MediumFakeDriver'

    def test_migrate_disk_and_power_off_crash_finish_revert_migration(self):
        """Tests the scenario that the compute service crashes while the
        driver's migrate_disk_and_power_off method is running (we could be
        slow transferring disks or something when it crashed) and on restart
        of the compute service the driver's finish_revert_migration method
        is called to clean up the source host and reset the instance
        task_state.
        """
        # Start two compute services so we can migrate across hosts.
        for x in range(2):
            self._start_compute('host%d' % x)

        # Create a server; it does not matter on which host it lands.
        name = 'test_migrate_disk_and_power_off_crash_finish_revert_migration'
        server = self._build_minimal_create_server_request(
            self.api, name, image_uuid=fake_image.get_valid_image_id(),
            networks='auto')
        server = self.api.post_server({'server': server})
        server = self._wait_for_state_change(self.admin_api, server, 'ACTIVE')
        # Save the source hostname for assertions later.
        source_host = server['OS-EXT-SRV-ATTR:host']

        def fake_migrate_disk_and_power_off(*args, **kwargs):
            # Simulate the source compute service crashing by restarting it.
            self.restart_compute_service(self.computes[source_host])
            # We have to keep this method from returning before asserting
            # the _init_instance restart behavior, otherwise resize_instance
            # will fail and set the instance to ERROR status, revert
            # allocations, etc., which is not realistic if the service
            # actually crashed while migrate_disk_and_power_off was running.
            time.sleep(30)

        source_driver = self.computes[source_host].manager.driver
        with mock.patch.object(source_driver, 'migrate_disk_and_power_off',
                               side_effect=fake_migrate_disk_and_power_off):
            # Initiate a cold migration from the source host.
            self.admin_api.post_server_action(server['id'], {'migrate': None})
            # Now wait for the task_state to be reset to None during
            # _init_instance.
            server = self._wait_for_server_parameter(
                self.admin_api, server, {
                    'status': 'ACTIVE',
                    'OS-EXT-STS:task_state': None,
                    'OS-EXT-SRV-ATTR:host': source_host
                })

        # Assert we went through the _init_instance processing we expect.
        log_out = self.stdlog.logger.output
        self.assertIn('Instance found in migrating state during startup. '
                      'Resetting task_state', log_out)
        # Assert that driver.finish_revert_migration did not raise an error.
        self.assertNotIn('Failed to revert crashed migration', log_out)

        # The migration status should be "error" rather than stuck as
        # "migrating".
        context = nova_context.get_admin_context()
        # FIXME(mriedem): This is bug 1836369 because we would normally
        # expect Migration.get_by_instance_and_status to raise
        # MigrationNotFoundByStatus since the status should be "error".
        objects.Migration.get_by_instance_and_status(
            context, server['id'], 'migrating')
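        # Hypothetical post-fix expectation (an assumption, not asserted
        # here): once bug 1836369 is addressed, the lookup above would be
        # expected to fail instead, along the lines of:
        #   self.assertRaises(
        #       exception.MigrationNotFoundByStatus,
        #       objects.Migration.get_by_instance_and_status,
        #       context, server['id'], 'migrating')
        # (nova.exception would need to be imported for that.)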

        # Assert things related to the resize get cleaned up:
        # - things set on the instance during prep_resize like:
        #   - migration_context
        #   - new_flavor
        #   - stashed old_vm_state in system_metadata
        # - migration-based allocations from conductor/scheduler, i.e. that
        #   the allocations created by the scheduler for the instance and
        #   dest host are gone and the source host allocations are back on
        #   the instance rather than the migration record
        instance = objects.Instance.get_by_uuid(
            context, server['id'], expected_attrs=[
                'migration_context', 'flavor', 'system_metadata'])
        # FIXME(mriedem): Leaving these fields set on the instance is
        # bug 1836369.
        self.assertIsNotNone(instance.migration_context)
        self.assertIsNotNone(instance.new_flavor)
        self.assertEqual('active', instance.system_metadata['old_vm_state'])
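        # Hypothetical post-fix expectations (assumptions, not asserted
        # here): after bug 1836369 is fixed these fields would be expected
        # to be cleaned up, i.e. migration_context and new_flavor reset to
        # None and old_vm_state popped from system_metadata.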

        dest_host = 'host0' if source_host == 'host1' else 'host1'
        dest_rp_uuid = self._get_provider_uuid_by_host(dest_host)
        dest_allocations = self._get_allocations_by_provider_uuid(
            dest_rp_uuid)
        # FIXME(mriedem): This is bug 1836369 because we orphaned the
        # allocations created by the scheduler for the server on the dest
        # host.
        self.assertIn(server['id'], dest_allocations)
        source_rp_uuid = self._get_provider_uuid_by_host(source_host)
        source_allocations = self._get_allocations_by_provider_uuid(
            source_rp_uuid)
        # FIXME(mriedem): This is bug 1836369 because the server is running
        # on the source host but is not tracking allocations against the
        # source host.
        self.assertNotIn(server['id'], source_allocations)
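        # Hypothetical post-fix expectations (assumptions, not asserted
        # here): once bug 1836369 is fixed, the two allocation assertions
        # above would be expected to invert, i.e.:
        #   self.assertNotIn(server['id'], dest_allocations)
        #   self.assertIn(server['id'], source_allocations)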