Browse Source

Add functional test for resize crash compute restart revert

During review of change I51673e58fc8d5f051df911630f6d7a928d123a5b
there was discussion about the RESIZE_MIGRATING crashed resize
cleanup on restart of the compute service and how it may or may
not work but is likely missing some things to clean up, like fields
set on the instance during prep_resize and resource allocations
in placement.

This adds a functional test to hit that code and make assertions
about what it does and does not clean up after the crashed resize.

Related-Bug: #1836369

Change-Id: I107d842520c088b4859a3b36621ce6bd8e970475
tags/20.0.0.0rc1
Matt Riedemann 3 months ago
parent
commit
8db712fe04
1 changed file with 120 additions and 0 deletions
  1. 120
    0
      nova/tests/functional/compute/test_init_host.py

+ 120
- 0
nova/tests/functional/compute/test_init_host.py View File

@@ -0,0 +1,120 @@
1
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
2
+# not use this file except in compliance with the License. You may obtain
3
+# a copy of the License at
4
+#
5
+#      http://www.apache.org/licenses/LICENSE-2.0
6
+#
7
+# Unless required by applicable law or agreed to in writing, software
8
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
9
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
10
+# License for the specific language governing permissions and limitations
11
+# under the License.
12
+
13
+import mock
14
+import time
15
+
16
+from nova import context as nova_context
17
+from nova import objects
18
+from nova.tests.functional import integrated_helpers
19
+from nova.tests.unit.image import fake as fake_image
20
+
21
+
22
+class ComputeManagerInitHostTestCase(
23
+        integrated_helpers.ProviderUsageBaseTestCase):
24
+    """Tests various actions performed when the nova-compute service starts."""
25
+
26
+    compute_driver = 'fake.MediumFakeDriver'
27
+
28
+    def test_migrate_disk_and_power_off_crash_finish_revert_migration(self):
29
+        """Tests the scenario that the compute service crashes while the
30
+        driver's migrate_disk_and_power_off method is running (we could be
31
+        slow transferring disks or something when it crashed) and on restart
32
+        of the compute service the driver's finish_revert_migration method
33
+        is called to cleanup the source host and reset the instnace task_state.
34
+        """
35
+        # Start two compute service so we migrate across hosts.
36
+        for x in range(2):
37
+            self._start_compute('host%d' % x)
38
+        # Create a server, it does not matter on which host it lands.
39
+        name = 'test_migrate_disk_and_power_off_crash_finish_revert_migration'
40
+        server = self._build_minimal_create_server_request(
41
+            self.api, name, image_uuid=fake_image.get_valid_image_id(),
42
+            networks='auto')
43
+        server = self.api.post_server({'server': server})
44
+        server = self._wait_for_state_change(self.admin_api, server, 'ACTIVE')
45
+        # Save the source hostname for assertions later.
46
+        source_host = server['OS-EXT-SRV-ATTR:host']
47
+
48
+        def fake_migrate_disk_and_power_off(*args, **kwargs):
49
+            # Simulate the source compute service crashing by restarting it.
50
+            self.restart_compute_service(self.computes[source_host])
51
+            # We have to keep the method from returning before asserting the
52
+            # _init_instance restart behavior otherwise resize_instance will
53
+            # fail and set the instance to ERROR status, revert allocations,
54
+            # etc which is not realistic if the service actually crashed while
55
+            # migrate_disk_and_power_off was running.
56
+            time.sleep(30)
57
+
58
+        source_driver = self.computes[source_host].manager.driver
59
+        with mock.patch.object(source_driver, 'migrate_disk_and_power_off',
60
+                               side_effect=fake_migrate_disk_and_power_off):
61
+            # Initiate a cold migration from the source host.
62
+            self.admin_api.post_server_action(server['id'], {'migrate': None})
63
+            # Now wait for the task_state to be reset to None during
64
+            # _init_instance.
65
+            server = self._wait_for_server_parameter(
66
+                self.admin_api, server, {
67
+                    'status': 'ACTIVE',
68
+                    'OS-EXT-STS:task_state': None,
69
+                    'OS-EXT-SRV-ATTR:host': source_host
70
+                }
71
+            )
72
+
73
+        # Assert we went through the _init_instance processing we expect.
74
+        log_out = self.stdlog.logger.output
75
+        self.assertIn('Instance found in migrating state during startup. '
76
+                      'Resetting task_state', log_out)
77
+        # Assert that driver.finish_revert_migration did not raise an error.
78
+        self.assertNotIn('Failed to revert crashed migration', log_out)
79
+
80
+        # The migration status should be "error" rather than stuck as
81
+        # "migrating".
82
+        context = nova_context.get_admin_context()
83
+        # FIXME(mriedem): This is bug 1836369 because we would normally expect
84
+        # Migration.get_by_instance_and_status to raise
85
+        # MigrationNotFoundByStatus since the status should be "error".
86
+        objects.Migration.get_by_instance_and_status(
87
+            context, server['id'], 'migrating')
88
+
89
+        # Assert things related to the resize get cleaned up:
90
+        # - things set on the instance during prep_resize like:
91
+        #   - migration_context
92
+        #   - new_flavor
93
+        #   - stashed old_vm_state in system_metadata
94
+        # - migration-based allocations from conductor/scheduler, i.e. that the
95
+        #   allocations created by the scheduler for the instance and dest host
96
+        #   are gone and the source host allocations are back on the instance
97
+        #   rather than the migration record
98
+        instance = objects.Instance.get_by_uuid(
99
+            context, server['id'], expected_attrs=[
100
+                'migration_context', 'flavor', 'system_metadata'
101
+            ])
102
+        # FIXME(mriedem): Leaving these fields set on the instance is
103
+        # bug 1836369.
104
+        self.assertIsNotNone(instance.migration_context)
105
+        self.assertIsNotNone(instance.new_flavor)
106
+        self.assertEqual('active', instance.system_metadata['old_vm_state'])
107
+
108
+        dest_host = 'host0' if source_host == 'host1' else 'host1'
109
+        dest_rp_uuid = self._get_provider_uuid_by_host(dest_host)
110
+        dest_allocations = self._get_allocations_by_provider_uuid(dest_rp_uuid)
111
+        # FIXME(mriedem): This is bug 1836369 because we orphaned the
112
+        # allocations created by the scheduler for the server on the dest host.
113
+        self.assertIn(server['id'], dest_allocations)
114
+        source_rp_uuid = self._get_provider_uuid_by_host(source_host)
115
+        source_allocations = self._get_allocations_by_provider_uuid(
116
+            source_rp_uuid)
117
+        # FIXME(mriedem): This is bug 1836369 because the server is running on
118
+        # the source host but is not tracking allocations against the source
119
+        # host.
120
+        self.assertNotIn(server['id'], source_allocations)

Loading…
Cancel
Save