From c96c7c5e13bde39944a9dde7da7fe418b311ca2d Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Tue, 28 May 2019 13:59:20 -0400 Subject: [PATCH] Add regression recreate test for bug 1830747 Before change I4244f7dd8fe74565180f73684678027067b4506e in Stein, when a cold migration would reschedule to conductor it would not send the RequestSpec, only the filter_properties. The filter_properties contain a primitive version of the instance group information from the RequestSpec for things like the group members, hosts and policies, but not the uuid. When conductor is trying to reschedule the cold migration without a RequestSpec, it builds a RequestSpec from the components it has, like the filter_properties. This results in a RequestSpec with an instance_group field set but with no uuid field in the RequestSpec.instance_group. That RequestSpec gets persisted and then because of change Ie70c77db753711e1449e99534d3b83669871943f, later attempts to load the RequestSpec from the database will fail because of the missing RequestSpec.instance_group.uuid. The test added here recreates the pre-Stein scenario which could still be a problem (on master) for any corrupted RequestSpecs for older instances. Change-Id: I05700c97f756edb7470be7273d5c9c3d76d63e29 Related-Bug: #1830747 --- .../regressions/test_bug_1830747.py | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 nova/tests/functional/regressions/test_bug_1830747.py diff --git a/nova/tests/functional/regressions/test_bug_1830747.py b/nova/tests/functional/regressions/test_bug_1830747.py new file mode 100644 index 000000000000..96c76e588acd --- /dev/null +++ b/nova/tests/functional/regressions/test_bug_1830747.py @@ -0,0 +1,158 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import mock +import six + +from nova.conductor import api as conductor_api +from nova import context as nova_context +from nova import exception +from nova import objects +from nova.scheduler import weights +from nova import test +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional import fixtures as func_fixtures +from nova.tests.functional import integrated_helpers +from nova.tests.unit.image import fake as fake_image +from nova.virt import fake as fake_virt + + +class HostNameWeigher(weights.BaseHostWeigher): + # Prefer host1 over host2. + weights = {'host1': 100, 'host2': 50} + + def _weigh_object(self, host_state, weight_properties): + return self.weights.get(host_state.host, 0) + + +class MissingReqSpecInstanceGroupUUIDTestCase( + test.TestCase, integrated_helpers.InstanceHelperMixin): + """Regression recreate test for bug 1830747 + + Before change I4244f7dd8fe74565180f73684678027067b4506e in Stein, when + a cold migration would reschedule to conductor it would not send the + RequestSpec, only the filter_properties. The filter_properties contain + a primitive version of the instance group information from the RequestSpec + for things like the group members, hosts and policies, but not the uuid. + When conductor is trying to reschedule the cold migration without a + RequestSpec, it builds a RequestSpec from the components it has, like the + filter_properties. This results in a RequestSpec with an instance_group + field set but with no uuid field in the RequestSpec.instance_group. + That RequestSpec gets persisted and then because of change + Ie70c77db753711e1449e99534d3b83669871943f, later attempts to load the + RequestSpec from the database will fail because of the missing + RequestSpec.instance_group.uuid. + + This test recreates the regression scenario by cold migrating a server + to a host which fails and triggers a reschedule but without the RequestSpec + so a RequestSpec is created/updated for the instance without the + instance_group.uuid set which will lead to a failure loading the + RequestSpec from the DB later. + """ + + def setUp(self): + super(MissingReqSpecInstanceGroupUUIDTestCase, self).setUp() + # Stub out external dependencies. + self.useFixture(nova_fixtures.NeutronFixture(self)) + self.useFixture(func_fixtures.PlacementFixture()) + fake_image.stub_out_image_service(self) + self.addCleanup(fake_image.FakeImageService_reset) + # Configure the API to allow resizing to the same host so we can keep + # the number of computes down to two in the test. + self.flags(allow_resize_to_same_host=True) + # Start nova controller services. + api_fixture = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')) + self.api = api_fixture.admin_api + self.start_service('conductor') + # Use our custom weigher defined above to make sure that we have + # a predictable scheduling sort order. + self.flags(weight_classes=[__name__ + '.HostNameWeigher'], + group='filter_scheduler') + self.start_service('scheduler') + # Start two computes, one where the server will be created and another + # where we'll cold migrate it. + self.addCleanup(fake_virt.restore_nodes) + self.computes = {} # keep track of the compute services per host name + for host in ('host1', 'host2'): + fake_virt.set_nodes([host]) + compute_service = self.start_service('compute', host=host) + self.computes[host] = compute_service + + def test_cold_migrate_reschedule(self): + # Create an anti-affinity group for the server. + body = { + 'server_group': { + 'name': 'test-group', + 'policies': ['anti-affinity'] + } + } + group_id = self.api.api_post( + '/os-server-groups', body).body['server_group']['id'] + + # Create a server in the group which should land on host1 due to our + # custom weigher. + server = self._build_minimal_create_server_request( + self.api, 'test_cold_migrate_reschedule') + body = dict(server=server) + body['os:scheduler_hints'] = {'group': group_id} + server = self.api.post_server(body) + server = self._wait_for_state_change(self.api, server, 'ACTIVE') + self.assertEqual('host1', server['OS-EXT-SRV-ATTR:host']) + + # Verify the group uuid is set in the request spec. + ctxt = nova_context.get_admin_context() + reqspec = objects.RequestSpec.get_by_instance_uuid(ctxt, server['id']) + self.assertEqual(group_id, reqspec.instance_group.uuid) + + # Stub out the ComputeTaskAPI.resize_instance method to not pass the + # request spec from compute to conductor during a reschedule. + # This simulates the pre-Stein reschedule behavior. + original_resize_instance = conductor_api.ComputeTaskAPI.resize_instance + + def stub_resize_instance(_self, context, instance, + extra_instance_updates, scheduler_hint, + *args, **kwargs): + # Only remove the request spec if we know we're rescheduling + # which we can determine from the filter_properties retry dict. + filter_properties = scheduler_hint['filter_properties'] + if filter_properties.get('retry', {}).get('exc'): + kwargs.pop('request_spec', None) + return original_resize_instance( + _self, context, instance, extra_instance_updates, + scheduler_hint, *args, **kwargs) + self.stub_out('nova.conductor.api.ComputeTaskAPI.resize_instance', + stub_resize_instance) + + # Now cold migrate the server. Because of allow_resize_to_same_host and + # the weigher, the scheduler will pick host1 first. The FakeDriver + # actually allows migrating to the same host so we need to stub that + # out so the compute will raise UnableToMigrateToSelf like when using + # the libvirt driver. + host1_driver = self.computes['host1'].driver + with mock.patch.dict(host1_driver.capabilities, + supports_migrate_to_same_host=False): + self.api.post_server_action(server['id'], {'migrate': None}) + # FIXME(mriedem): Due to bug 1830747 we don't go to VERIFY_RESIZE + # because the reschedule fails and the instance is put into + # ERROR status. When the bug is fixed the status should be + # VERIFY_RESIZE and the server should be on host2. + server = self._wait_for_state_change( + self.api, server, 'ERROR') + self.assertEqual('host1', server['OS-EXT-SRV-ATTR:host']) + + # And the RequestSpec.instance_group.uuid should be missing which + # leads to us failing to load the RequestSpec. + ex = self.assertRaises(exception.ObjectActionError, + objects.RequestSpec.get_by_instance_uuid, + ctxt, server['id']) + self.assertIn('unable to load uuid', six.text_type(ex))