Merge "Add functional regression test for bug 1837955" into stable/rocky

tags/18.2.2
Zuul · 1 month ago · commit 979ec661de
1 changed file with 115 additions and 0 deletions
nova/tests/functional/regressions/test_bug_1837955.py  +115 −0

@@ -0,0 +1,115 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import time
+
+from nova import exception
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional import integrated_helpers
+from nova.tests.unit import fake_notifier
+from nova.tests.unit.image import fake as fake_image
+
+
+class BuildRescheduleClaimFailsTestCase(
+        integrated_helpers.ProviderUsageBaseTestCase):
+    """Regression test case for bug 1837955, where a server build fails on
+    the primary host and, during the reschedule, the allocation claim in
+    placement against the alternate host fails because that host is full,
+    so the build fails with MaxRetriesExceeded and the server goes to
+    ERROR status.
+    """
+    compute_driver = 'fake.SmallFakeDriver'
+
+    def _wait_for_unversioned_notification(self, event_type):
+        for x in range(20):  # wait up to 10 seconds
+            for notification in fake_notifier.NOTIFICATIONS:
+                if notification.event_type == event_type:
+                    return notification
+            time.sleep(.5)
+        self.fail('Timed out waiting for unversioned notification %s. Got: %s'
+                  % (event_type, fake_notifier.NOTIFICATIONS))
+
+    def test_build_reschedule_alt_host_alloc_fails(self):
+        # Start two compute services so we have one alternate host.
+        # Set cpu_allocation_ratio=1.0 to make placement inventory
+        # and allocations for VCPU easier to manage.
+        self.flags(cpu_allocation_ratio=1.0)
+        for x in range(2):
+            self._start_compute('host%i' % x)
+
+        def fake_instance_claim(_self, _context, _inst, nodename, *a, **kw):
+            # Before triggering the reschedule to the other host, max out the
+            # capacity on the alternate host.
+            alt_nodename = 'host0' if nodename == 'host1' else 'host1'
+            rp_uuid = self._get_provider_uuid_by_host(alt_nodename)
+            inventories = self._get_provider_inventory(rp_uuid)
+            # Fake some other consumer taking all of the VCPU on the alt host.
+            # Since we set cpu_allocation_ratio=1.0 the total is the total
+            # capacity for VCPU on the host.
+            total_vcpu = inventories['VCPU']['total']
+            alt_consumer = '7d32d0bc-af16-44b2-8019-a24925d76152'
+            allocs = {
+                'allocations': {
+                    rp_uuid: {
+                        'resources': {
+                            'VCPU': total_vcpu
+                        }
+                    }
+                },
+                'project_id': self.api.project_id,
+                'user_id': self.api.project_id
+            }
+            resp = self.placement_api.put(
+                '/allocations/%s' % alt_consumer, allocs, version='1.12')
+            self.assertEqual(204, resp.status, resp.content)
+            raise exception.ComputeResourcesUnavailable(reason='overhead!')
+
+        # Stub out the instance claim (regardless of which host the scheduler
+        # picks as the primary) to trigger a reschedule.
+        self.stub_out('nova.compute.manager.resource_tracker.ResourceTracker.'
+                      'instance_claim', fake_instance_claim)
+
+        # Now that our stub is in place, try to create a server and wait for
+        # it to go to ERROR status.
+        server = self._build_minimal_create_server_request(
+            self.api, 'test_build_reschedule_alt_host_alloc_fails',
+            image_uuid=fake_image.get_valid_image_id(),
+            networks=[{'port': nova_fixtures.NeutronFixture.port_1['id']}])
+        server = self.api.post_server({'server': server})
+        # FIXME(mriedem): This is bug 1837955 where the status is stuck in
+        # BUILD rather than the vm_state being set to error and the task_state
+        # being set to None. Uncomment this when the bug is fixed.
+        # server = self._wait_for_state_change(self.api, server, 'ERROR')
+
+        # Wait for the MaxRetriesExceeded fault to be recorded.
+        # set_vm_state_and_notify sets the vm_state to ERROR before the fault
+        # is recorded but after the notification is sent. So wait for the
+        # unversioned notification to show up and then get the fault.
+        # FIXME(mriedem): Uncomment this when bug 1837955 is fixed.
+        # self._wait_for_unversioned_notification(
+        #     'compute_task.build_instances')
+        # server = self.api.get_server(server['id'])
+        # self.assertIn('fault', server)
+        # self.assertIn('Exceeded maximum number of retries',
+        #               server['fault']['message'])
+
+        # TODO(mriedem): Remove this when the bug is fixed. We need to assert
+        # something before the bug is fixed to show the failure so check the
+        # logs.
+        for x in range(20):
+            logs = self.stdlog.logger.output
+            if 'MaxRetriesExceeded' in logs:
+                break
+            time.sleep(.5)
+        else:
+            self.fail('Timed out waiting for MaxRetriesExceeded to show up '
+                      'in the logs.')
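
For reference, the placement write that fake_instance_claim performs uses the PUT /allocations/{consumer_uuid} request format introduced in placement microversion 1.12, where allocations are keyed by resource provider UUID and the consumer's project_id and user_id travel in the body. Below is a minimal standalone sketch of the same call; the endpoint, token, provider UUID, and project/user IDs are illustrative assumptions (the test itself goes through the in-process placement_api fixture rather than a real HTTP client):

    import json

    import requests

    PLACEMENT_URL = 'http://placement.example.com'  # hypothetical endpoint
    TOKEN = 'admin-token'  # hypothetical auth token
    RP_UUID = 'a0b1c2d3-e4f5-4678-9abc-def012345678'  # hypothetical provider
    CONSUMER = '7d32d0bc-af16-44b2-8019-a24925d76152'  # consumer from the test

    payload = {
        # As of microversion 1.12, allocations are keyed by provider UUID.
        'allocations': {
            RP_UUID: {
                'resources': {
                    'VCPU': 8,  # consume the provider's entire VCPU total
                }
            }
        },
        # 1.12 also carries the consumer's project/user IDs in the body.
        'project_id': 'fake-project',
        'user_id': 'fake-user',
    }

    resp = requests.put(
        '%s/allocations/%s' % (PLACEMENT_URL, CONSUMER),
        headers={
            'X-Auth-Token': TOKEN,
            'OpenStack-API-Version': 'placement 1.12',
            'Content-Type': 'application/json',
        },
        data=json.dumps(payload))
    assert resp.status_code == 204, resp.text

Exhausting the alternate provider this way is what makes the conductor's subsequent allocation claim fail during the reschedule, which drives the MaxRetriesExceeded path the test exercises.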
