Recalculate request group - RP mapping during re-schedule

Re-scheduling uses an alternative allocation candidate from placement to
get resources for the server. This also means that the previously
calculated request group - resource provider mapping is no longer valid
and needs to be recalculated based on the new allocation of the server.

blueprint bandwidth-resource-provider

Change-Id: If8a13f74d2b3c99f05365eb49dcfa01d9042fefa
Author: Balazs Gibizer
Date: 2018-11-21 17:06:24 +01:00
parent bea316d479
commit bc57a916c4
3 changed files with 100 additions and 9 deletions
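The mapping the title refers to is the dict built in the first hunk below:
request group requester_id keys mapped to lists of resource provider UUIDs.
A minimal, self-contained sketch of that shape (the RequestGroup here is a
toy stand-in for nova.objects.RequestGroup, not the real class):

    from dataclasses import dataclass, field
    from typing import Dict, List

    @dataclass
    class RequestGroup:
        # Toy stand-in; only the two fields used by the diff below.
        requester_id: str
        provider_uuids: List[str] = field(default_factory=list)

    def group_to_provider_mapping(
            requested_resources: List[RequestGroup]) -> Dict[str, List[str]]:
        # The same dict comprehension as in the compute manager hunk below.
        return {group.requester_id: group.provider_uuids
                for group in requested_resources}

    groups = [RequestGroup('port-1', ['rp-uuid-a']),
              RequestGroup('port-2', ['rp-uuid-b'])]
    assert group_to_provider_mapping(groups) == {'port-1': ['rp-uuid-a'],
                                                 'port-2': ['rp-uuid-b']}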

nova/compute/manager.py

@@ -2125,7 +2125,9 @@ class ComputeManager(manager.Manager):
             scheduler_hints)
         image_meta = objects.ImageMeta.from_dict(image)
 
-        if request_spec:
+        if (request_spec
+                and 'requested_resources' in request_spec
+                and request_spec.requested_resources is not None):
             request_group_resource_providers_mapping = {
                 group.requester_id: group.provider_uuids
                 for group in request_spec.requested_resources

nova/conductor/manager.py

@@ -567,7 +567,7 @@ class ComputeTaskManager(base.Base):
         # will the API send the request_spec if using cells v1, so we need
         # to check and build our own if one is not provided.
         if request_spec is None:
-            request_spec = scheduler_utils.build_request_spec(
+            legacy_request_spec = scheduler_utils.build_request_spec(
                 image, instances)
         else:
             # TODO(mriedem): This is annoying but to populate the local
@@ -576,7 +576,10 @@ class ComputeTaskManager(base.Base):
             # and we can remove it once the populate_retry and
             # populate_filter_properties utility methods are converted to
             # work on a RequestSpec object rather than filter_properties.
-            request_spec = request_spec.to_legacy_request_spec_dict()
+            # NOTE(gibi): we have to keep a reference to the original
+            # RequestSpec object passed to this function as we lose
+            # information during the below legacy conversion
+            legacy_request_spec = request_spec.to_legacy_request_spec_dict()
 
         # 'host_lists' will be None in one of two cases: when running cellsv1,
         # or during a reschedule from a pre-Queens compute. In all other cases,
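The NOTE(gibi) comment above is the heart of the conductor change:
to_legacy_request_spec_dict() is lossy, so a reference to the original
object must be kept. A hypothetical sketch of the pattern (plain dicts and
a made-up field list; only the dropping of requested_resources mirrors the
real conversion):

    # Hypothetical converter: the legacy format predates
    # requested_resources, so the field has no slot and is simply dropped.
    LEGACY_FIELDS = ('image', 'instance_properties', 'num_instances')

    def to_legacy_dict(spec):
        return {k: v for k, v in spec.items() if k in LEGACY_FIELDS}

    request_spec = {'image': {}, 'num_instances': 1,
                    'requested_resources': ['rg1']}
    legacy_request_spec = to_legacy_dict(request_spec)
    assert 'requested_resources' not in legacy_request_spec
    # The original request_spec reference is still intact, which is what
    # the last hunk of this file uses to re-populate the rebuilt spec.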
@@ -593,7 +596,7 @@ class ComputeTaskManager(base.Base):
                 filter_properties, instances[0].uuid)
         instance_uuids = [instance.uuid for instance in instances]
         spec_obj = objects.RequestSpec.from_primitives(
-            context, request_spec, filter_properties)
+            context, legacy_request_spec, filter_properties)
         LOG.debug("Rescheduling: %s", is_reschedule)
         if is_reschedule:
             # Make sure that we have a host, as we may have exhausted all
@@ -629,7 +632,7 @@ class ComputeTaskManager(base.Base):
             for instance in instances:
                 self._set_vm_state_and_notify(
                     context, instance.uuid, 'build_instances', updates,
-                    exc, request_spec)
+                    exc, legacy_request_spec)
             # If num_attempts > 1, we're in a reschedule and probably
             # either hit NoValidHost or MaxRetriesExceeded. Either way,
             # the build request should already be gone and we probably
@@ -668,6 +671,12 @@ class ComputeTaskManager(base.Base):
                             elevated, self.report_client, spec_obj,
                             instance.uuid, alloc_req,
                             host.allocation_request_version)
+                        if request_spec:
+                            # NOTE(gibi): redo the request group - resource
+                            # provider mapping as the above claim call moves
+                            # the allocation of the instance to another host
+                            self._fill_provider_mapping(
+                                context, instance.uuid, request_spec)
                     else:
                         # Some deployments use different schedulers that do not
                         # use Placement, so they will not have an
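_fill_provider_mapping itself is not part of this diff; conceptually it
re-derives the group-to-provider mapping from the instance's freshly
claimed allocation. A rough sketch of that idea (hypothetical helper; the
real implementation consults Placement and handles many more cases):

    def recalculate_mapping(request_groups, allocations):
        # 'allocations' mimics a Placement allocation dict:
        #     {rp_uuid: {'resources': {resource_class: amount}}}
        # Map each group to the providers whose new allocation covers
        # every resource class the group asked for.
        mapping = {}
        for group in request_groups:
            mapping[group.requester_id] = [
                rp_uuid for rp_uuid, alloc in allocations.items()
                if set(group.resources) <= set(alloc['resources'])]
        return mapping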
@@ -704,11 +713,16 @@ class ComputeTaskManager(base.Base):
             # could have come from a compute via reschedule and it would
             # already have some things set, like scheduler_hints.
             local_reqspec = objects.RequestSpec.from_primitives(
-                context, request_spec, local_filter_props)
-            # TODO(gibi): make sure that the request group - provider mapping
-            # is updated in the request spec during reschedule
-            local_reqspec.requested_resources = []
+                context, legacy_request_spec, local_filter_props)
+            # NOTE(gibi): at this point the request spec already got converted
+            # to a legacy dict and then back to an object so we lost the non
+            # legacy part of the spec. Re-populate the requested_resources
+            # field based on the original request spec object passed to this
+            # function.
+            if request_spec:
+                local_reqspec.requested_resources = (
+                    request_spec.requested_resources)
 
             # The block_device_mapping passed from the api doesn't contain
             # instance specific information

nova/tests/unit/conductor/test_conductor.py

@@ -951,6 +951,81 @@ class _BaseTaskTestCase(object):
         do_test()
 
+    @mock.patch.object(objects.Instance, 'save', new=mock.MagicMock())
+    @mock.patch.object(query.SchedulerQueryClient, 'select_destinations')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_set_vm_state_and_notify', new=mock.MagicMock())
+    def test_build_instances_reschedule_recalculates_provider_mapping(self,
+            mock_select_dests):
+        rg1 = objects.RequestGroup(resources={"CUSTOM_FOO": 1})
+        request_spec = objects.RequestSpec(requested_resources=[rg1])
+        mock_select_dests.return_value = [[fake_selection1]]
+        instance = fake_instance.fake_instance_obj(self.context)
+        image = {'fake-data': 'should_pass_silently'}
+
+        # build_instances() is a cast, we need to wait for it to complete
+        self.useFixture(cast_as_call.CastAsCall(self))
+
+        @mock.patch('nova.conductor.manager.ComputeTaskManager.'
+                    '_fill_provider_mapping')
+        @mock.patch('nova.scheduler.utils.claim_resources')
+        @mock.patch('nova.objects.request_spec.RequestSpec.from_primitives',
+                    return_value=request_spec)
+        @mock.patch.object(self.conductor_manager.compute_rpcapi,
+                           'build_and_run_instance')
+        @mock.patch.object(self.conductor_manager,
+                           '_populate_instance_mapping')
+        @mock.patch.object(self.conductor_manager, '_destroy_build_request')
+        def do_test(mock_destroy_build_req, mock_pop_inst_map,
+                    mock_build_and_run, mock_request_spec_from_primitives,
+                    mock_claim, mock_rp_mapping):
+            self.conductor.build_instances(
+                context=self.context,
+                instances=[instance],
+                image=image,
+                filter_properties={'retry': {'num_attempts': 1, 'hosts': []}},
+                admin_password='admin_password',
+                injected_files='injected_files',
+                requested_networks=None,
+                security_groups='security_groups',
+                block_device_mapping='block_device_mapping',
+                legacy_bdm=False,
+                host_lists=fake_host_lists1,
+                request_spec=request_spec)
+
+            expected_build_run_host_list = copy.copy(fake_host_lists1[0])
+            if expected_build_run_host_list:
+                expected_build_run_host_list.pop(0)
+            mock_build_and_run.assert_called_once_with(
+                self.context,
+                instance=mock.ANY,
+                host='host1',
+                image=image,
+                request_spec=request_spec,
+                filter_properties={'retry': {'num_attempts': 2,
+                                             'hosts': [['host1', 'node1']]},
+                                   'limits': {}},
+                admin_password='admin_password',
+                injected_files='injected_files',
+                requested_networks=None,
+                security_groups='security_groups',
+                block_device_mapping=test.MatchType(
+                    objects.BlockDeviceMappingList),
+                node='node1',
+                limits=None,
+                host_list=expected_build_run_host_list)
+
+            mock_rp_mapping.assert_called_once_with(
+                self.context, instance.uuid, mock.ANY)
+            actual_request_spec = mock_rp_mapping.mock_calls[0][1][2]
+            self.assertEqual(
+                rg1.resources,
+                actual_request_spec.requested_resources[0].resources)
+
+        do_test()
+
     @mock.patch.object(cinder.API, 'attachment_get')
     @mock.patch.object(cinder.API, 'attachment_create')
     @mock.patch.object(block_device_obj.BlockDeviceMapping, 'save')
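A note on the assertion style at the end of the new test:
assert_called_once_with is given mock.ANY for the spec argument, and
mock_rp_mapping.mock_calls[0][1][2] then digs the actual third positional
argument back out of the recorded call. The indexing unpacks as follows:

    from unittest import mock

    m = mock.Mock()
    m('ctxt', 'instance-uuid', 'request-spec')

    # Each mock_calls entry is a (name, args, kwargs) triple, so [0][1][2]
    # is the third positional argument of the first recorded call.
    name, args, kwargs = m.mock_calls[0]
    assert args[2] == 'request-spec'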