[nova-manage] Support extended resource request

The nova-manage placement heal_allocations CLI is capable of healing
placement allocations that are missing due to port resource requests. To
support the new extended port resource request format, this code needs to
be adapted as well.

When the heal_allocations command gained port resource request support
in Train, the only way to figure out the missing allocations was to dig
into the placement RP tree directly. Since then nova has gained support
for interface attach with such ports, and to support that, placement
gained support for in_tree filtering in allocation candidate queries. So
now the healing logic can be generalized to the following:

For a given instance
1) Find the ports that have a resource request but no allocation key in
   the binding profile. These are the ports we need to heal.
2) Gather the RequestGroups from these ports and run an
   allocation_candidates query restricted to the current compute of the
   instance with in_tree filtering.
3) Extend the existing instance allocation with a returned allocation
   candidate and update the instance allocation in placement.
4) Update the binding profile of these ports in neutron.

The main change compared to the existing implementation is in step 2);
the rest is mostly the same. A condensed sketch of the overall flow is
included below.
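
To make the steps above concrete, here is a condensed, self-contained
Python sketch of the generalized flow. It uses plain dicts and
hypothetical helper names (heal_port_allocations_for_instance, a
get_allocation_candidates callable, a candidate modelled as an
(allocations, mappings) pair); it illustrates the idea and is not the
code added by this change:

    # Condensed, illustrative sketch of the flow above. All data shapes and
    # helper names are simplified and hypothetical; this is not nova's code.

    def merge_allocations(alloc1, alloc2):
        """Merge two placement-style allocation dicts (RP UUID -> resources)."""
        merged = {
            rp: {'resources': dict(alloc['resources'])}
            for rp, alloc in alloc1.items()
        }
        for rp, alloc in alloc2.items():
            resources = merged.setdefault(rp, {'resources': {}})['resources']
            for rc, amount in alloc['resources'].items():
                resources[rc] = resources.get(rc, 0) + amount
        return merged


    def heal_port_allocations_for_instance(
            instance_allocations, ports, compute_rp_uuid,
            get_allocation_candidates):
        # 1) ports with a resource request but no allocation key in the
        #    binding profile are the ones that need healing
        to_heal = [
            port for port in ports
            if port.get('resource_request')
            and not (port.get('binding:profile') or {}).get('allocation')
        ]
        if not to_heal:
            return instance_allocations, []

        # 2) gather the request groups of every port (the extended format
        #    is assumed here) and run a single allocation candidate query
        #    restricted to the instance's compute RP tree
        groups_per_port = {
            port['id']: port['resource_request']['request_groups']
            for port in to_heal
        }
        all_groups = [g for groups in groups_per_port.values() for g in groups]
        candidates = get_allocation_candidates(
            all_groups, in_tree=compute_rp_uuid)
        if len(candidates) != 1:
            raise RuntimeError('no candidate, or ambiguous candidates')
        # a candidate is assumed to be an (allocations, mappings) pair where
        # mappings maps each group id to a list of RP UUIDs
        allocations, mappings = candidates[0]

        # 3) extend the existing instance allocation with the candidate
        healed = merge_allocations(instance_allocations, allocations)

        # 4) record the request group -> RP mapping in the binding profile;
        #    this is sent back to neutron before placement is updated
        for port in to_heal:
            profile = port.get('binding:profile') or {}
            profile['allocation'] = {
                group['id']: mappings[group['id']][0]
                for group in groups_per_port[port['id']]
            }
            port['binding:profile'] = profile
        return healed, to_heal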

Note that support for the old resource request format is kept alongside
the new resource request format until Neutron makes the new format
mandatory.
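
For reference, the two port resource request formats, and the
corresponding value of the 'allocation' key in the port binding profile,
look roughly as follows. The shapes mirror the test fixtures further down
in this change; the UUID strings and amounts are made-up placeholders:

    # Illustrative examples only; UUID strings and amounts are placeholders.

    # Legacy format: one implicit resource group per port; the healed
    # binding profile records a single resource provider UUID.
    legacy_port = {
        'id': 'port-1',
        'resource_request': {
            'resources': {
                'NET_BW_IGR_KILOBIT_PER_SEC': 1000,
                'NET_BW_EGR_KILOBIT_PER_SEC': 1000,
            },
            'required': ['CUSTOM_PHYSNET2', 'CUSTOM_VNIC_TYPE_NORMAL'],
        },
        'binding:profile': {'allocation': 'ovs-bridge-rp-uuid'},
    }

    # Extended format (port-resource-request-groups extension): multiple
    # groups per port; the healed binding profile maps each group id to
    # the resource provider fulfilling that group.
    extended_port = {
        'id': 'port-2',
        'resource_request': {
            'request_groups': [
                {
                    'id': 'group-1',
                    'resources': {'NET_BW_EGR_KILOBIT_PER_SEC': 1000},
                    'required': ['CUSTOM_PHYSNET2', 'CUSTOM_VNIC_TYPE_NORMAL'],
                },
                {
                    'id': 'group-2',
                    'resources': {'NET_PACKET_RATE_KILOPACKET_PER_SEC': 1000},
                    'required': ['CUSTOM_VNIC_TYPE_NORMAL'],
                },
            ],
            'same_subtree': ['group-1', 'group-2'],
        },
        'binding:profile': {
            'allocation': {
                'group-1': 'ovs-bridge-rp-uuid',
                'group-2': 'ovs-agent-rp-uuid',
            },
        },
    }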

blueprint: qos-minimum-guaranteed-packet-rate

Change-Id: I58869d2a5a4ed988fc786a6f1824be441dd48484
Balazs Gibizer 2021-07-23 18:13:37 +02:00
parent e14eef0719
commit 90ed7e574d
5 changed files with 550 additions and 304 deletions

View File

@@ -1224,6 +1224,13 @@ This command requires that the
Added :option:`--force` option.
.. versionchanged:: 25.0.0 (Yoga)
Added support for healing port allocations if port-resource-request-groups
neutron API extension is enabled and therefore ports can request multiple
groups of resources, e.g. by using both guaranteed minimum bandwidth and
guaranteed minimum packet rate QoS policy rules.
.. rubric:: Options
.. option:: --max-count <max_count>

View File

@@ -28,6 +28,7 @@ import re
import sys
import time
import traceback
import typing as ty
from urllib import parse as urlparse
from dateutil import parser as dateutil_parser
@@ -1458,48 +1459,13 @@ class PlacementCommands(object):
instance_uuid=instance.uuid, error=str(e))
@staticmethod
def _has_request_but_no_allocation(port):
request = port.get(constants.RESOURCE_REQUEST)
def _has_request_but_no_allocation(port, neutron):
has_res_req = neutron_api.API()._has_resource_request(
context.get_admin_context(), port, neutron)
binding_profile = neutron_api.get_binding_profile(port)
allocation = binding_profile.get(constants.ALLOCATION)
# We are defensive here about 'resources' and 'required' in the
# 'resource_request' as neutron API is not clear about those fields
# being optional.
return (request and request.get('resources') and
request.get('required') and
not allocation)
@staticmethod
def _get_rps_in_tree_with_required_traits(
ctxt, rp_uuid, required_traits, placement):
"""Find the RPs that have all the required traits in the given rp tree.
:param ctxt: nova.context.RequestContext
:param rp_uuid: the RP uuid that will be used to query the tree.
:param required_traits: the traits that need to be supported by
the returned resource providers.
:param placement: nova.scheduler.client.report.SchedulerReportClient
to communicate with the Placement service API.
:raise PlacementAPIConnectFailure: if placement API cannot be reached
:raise ResourceProviderRetrievalFailed: if the resource provider does
not exist.
:raise ResourceProviderTraitRetrievalFailed: if resource provider
trait information cannot be read from placement.
:return: A list of RP UUIDs that supports every required traits and
in the tree for the provider rp_uuid.
"""
try:
rps = placement.get_providers_in_tree(ctxt, rp_uuid)
matching_rps = [
rp['uuid']
for rp in rps
if set(required_traits).issubset(
placement.get_provider_traits(ctxt, rp['uuid']).traits)
]
except ks_exc.ClientException:
raise exception.PlacementAPIConnectFailure()
return matching_rps
return has_res_req and not allocation
@staticmethod
def _merge_allocations(alloc1, alloc2):
@@ -1525,67 +1491,105 @@ class PlacementCommands(object):
allocations[rp_uuid]['resources'][rc] += amount
return allocations
def _get_port_allocation(
self, ctxt, node_uuid, port, instance_uuid, placement):
"""Return the extra allocation the instance needs due to the given
port.
@staticmethod
def _get_resource_request_from_ports(
ctxt: context.RequestContext,
ports: ty.List[ty.Dict[str, ty.Any]]
) -> ty.Tuple[
ty.Dict[str, ty.List['objects.RequestGroup']],
'objects.RequestLevelParams']:
"""Collect RequestGroups and RequestLevelParams for all ports
:param ctxt: nova.context.RequestContext
:param node_uuid: the ComputeNode uuid the instance is running on.
:param port: the port dict returned from neutron
:param instance_uuid: The uuid of the instance the port is bound to
:param placement: nova.scheduler.client.report.SchedulerReportClient
to communicate with the Placement service API.
:raise PlacementAPIConnectFailure: if placement API cannot be reached
:raise ResourceProviderRetrievalFailed: compute node resource provider
does not exist.
:raise ResourceProviderTraitRetrievalFailed: if resource provider
trait information cannot be read from placement.
:raise MoreThanOneResourceProviderToHealFrom: if it cannot be decided
unambiguously which resource provider to heal from.
:raise NoResourceProviderToHealFrom: if there is no resource provider
found to heal from.
:return: A dict of resources keyed by RP uuid to be included in the
instance allocation dict.
:param ctxt: the request context
:param ports: a list of port dicts
:returns: A two tuple where the first item is a dict mapping port
uuids to a list of request groups coming from that port, and the
second item is a combined RequestLevelParams object from all ports.
"""
matching_rp_uuids = self._get_rps_in_tree_with_required_traits(
ctxt, node_uuid, port[constants.RESOURCE_REQUEST]['required'],
placement)
groups = {}
request_level_params = objects.RequestLevelParams()
extended_res_req = (
neutron_api.API().has_extended_resource_request_extension(
ctxt)
)
if len(matching_rp_uuids) > 1:
# If there is more than one such RP then it is an ambiguous
# situation that we cannot handle here efficiently because that
# would require the reimplementation of most of the allocation
# candidate query functionality of placement. Also if more
# than one such RP exists then selecting the right one might
# need extra information from the compute node. For example
# which PCI PF the VF is allocated from and which RP represents
# that PCI PF in placement. When migration is supported with such
# servers then we can ask the admin to migrate these servers
# instead to heal their allocation.
raise exception.MoreThanOneResourceProviderToHealFrom(
rp_uuids=','.join(matching_rp_uuids),
port_id=port['id'],
instance_uuid=instance_uuid)
for port in ports:
resource_request = port.get(constants.RESOURCE_REQUEST)
if extended_res_req:
groups[port['id']] = (
objects.RequestGroup.from_extended_port_request(
ctxt, resource_request
)
)
request_level_params.extend_with(
objects.RequestLevelParams.from_port_request(
resource_request
)
)
else:
# This is the legacy format, only one group per port and no
# request level param support
# TODO(gibi): remove this path once the extended resource
# request extension is mandatory in neutron
groups[port['id']] = [
objects.RequestGroup.from_port_request(
ctxt, port['id'], resource_request
)
]
if len(matching_rp_uuids) == 0:
raise exception.NoResourceProviderToHealFrom(
port_id=port['id'],
instance_uuid=instance_uuid,
traits=port[constants.RESOURCE_REQUEST]['required'],
node_uuid=node_uuid)
return groups, request_level_params
# We found one RP that matches the traits. Assume that we can allocate
# the resources from it. If there is not enough inventory left on the
# RP then the PUT /allocations placement call will detect that.
rp_uuid = matching_rp_uuids[0]
@staticmethod
def _get_port_binding_profile_allocation(
ctxt: context.RequestContext,
neutron: neutron_api.ClientWrapper,
port: ty.Dict[str, ty.Any],
request_groups: ty.List['objects.RequestGroup'],
resource_provider_mapping: ty.Dict[str, ty.List[str]],
) -> ty.Dict[str, str]:
"""Generate the value of the allocation key of the port binding profile
based on the provider mapping returned from placement
port_allocation = {
rp_uuid: {
'resources': port[constants.RESOURCE_REQUEST]['resources']
:param ctxt: the request context
:param neutron: the neutron client
:param port: the port dict from neutron
:param request_groups: the list of RequestGroups object generated from
the port resource request
:param resource_provider_mapping: The dict of request group to resource
provider mapping returned by the Placement allocation candidate
query
:returns: a dict mapping request group ids to resource provider uuids
in the form Neutron expects in the port binding profile.
"""
if neutron_api.API().has_extended_resource_request_extension(
ctxt, neutron
):
# The extended resource request format also means that a
# port can have more than one request group.
# Each request group id from the port needs to be mapped to
# a single provider id from the provider mappings. Each
# group from the port is mapped to a numbered request group
# in placement so we can assume that they are mapped to
# a single provider and therefore the provider mapping list
# has a single provider id.
allocation = {
group.requester_id: resource_provider_mapping[
group.requester_id][0]
for group in request_groups
}
}
return port_allocation
else:
# This is the legacy resource request format where a port
# is mapped to a single request group
# NOTE(gibi): In the resource provider mapping there can be
# more than one RP fulfilling a request group. But resource
# request of a Neutron port is always mapped to a
# numbered request group that is always fulfilled by one
# resource provider. So we only pass that single RP UUID
# here.
allocation = resource_provider_mapping[
port['id']][0]
return allocation
def _get_port_allocations_to_heal(
self, ctxt, instance, node_cache, placement, neutron, output):
@@ -1604,15 +1608,10 @@ class PlacementCommands(object):
:raise nova.exception.ComputeHostNotFound: if compute node of the
instance not found in the db.
:raise PlacementAPIConnectFailure: if placement API cannot be reached
:raise ResourceProviderRetrievalFailed: if the resource provider
representing the compute node the instance is running on does not
exist.
:raise ResourceProviderTraitRetrievalFailed: if resource provider
trait information cannot be read from placement.
:raise MoreThanOneResourceProviderToHealFrom: if it cannot be decided
unambiguously which resource provider to heal from.
:raise NoResourceProviderToHealFrom: if there is no resource provider
found to heal from.
:raise AllocationUpdateFailed: if there is either no allocation
candidate returned from placement for the missing port allocations
or there is more than one candidate, making the healing
ambiguous.
:return: A two tuple where the first item is a dict of resources keyed
by RP uuid to be included in the instance allocation dict. The
second item is a list of port dicts to be updated in Neutron.
@@ -1629,7 +1628,7 @@ class PlacementCommands(object):
# are not on any host.
ports_to_heal = [
port for port in self._get_ports(ctxt, instance, neutron)
if self._has_request_but_no_allocation(port)]
if self._has_request_but_no_allocation(port, neutron)]
if not ports_to_heal:
# nothing to do, return early
@@ -1638,26 +1637,107 @@ class PlacementCommands(object):
node_uuid = self._get_compute_node_uuid(
ctxt, instance, node_cache)
allocations = {}
# NOTE(gibi): We need to handle both legacy and extended resource
# request. So we need to handle ports with multiple request groups
# allocating from multiple providers.
# The logic what we follow here is pretty similar to the logic
# implemented in ComputeManager._allocate_port_resource_for_instance
# for the interface attach case. We just apply it to more than one
# port here.
request_groups_per_port, req_lvl_params = (
self._get_resource_request_from_ports(ctxt, ports_to_heal)
)
# flatten the list of list of groups
request_groups = [
group
for groups in request_groups_per_port.values()
for group in groups
]
# We can have multiple request groups. It would be enough to restrict
# only one of them to the compute tree, but for symmetry we restrict
# all of them.
for request_group in request_groups:
request_group.in_tree = node_uuid
# If there are multiple groups then the group_policy is mandatory in
# the allocation candidate query. We can assume that if this instance
# booted successfully then we have the policy in the flavor. If there
# is only one group and therefore no policy then the value of the
# policy in the allocation candidate query is ignored, so we simply
# default it here.
group_policy = instance.flavor.extra_specs.get("group_policy", "none")
rr = scheduler_utils.ResourceRequest.from_request_groups(
request_groups, req_lvl_params, group_policy)
res = placement.get_allocation_candidates(ctxt, rr)
# NOTE(gibi): the get_allocation_candidates method has the
# @safe_connect decorator applied. Such a decorator returns None
# if the connection to Placement fails, so we raise an exception
# here. When Placement successfully returns a response, even if it
# is a negative or empty response, the method returns a three
# tuple. That case is handled a couple of lines below.
if not res:
raise exception.PlacementAPIConnectFailure()
alloc_reqs, __, __ = res
if not alloc_reqs:
port_ids = [port['id'] for port in ports_to_heal]
raise exception.AllocationUpdateFailed(
consumer_uuid=instance.uuid,
error=f'Placement returned no allocation candidate to fulfill '
f'the resource request of the port(s) {port_ids}'
)
if len(alloc_reqs) > 1:
# If there is more than one candidate then it is an ambiguous
# situation that we cannot handle here because selecting the right
# one might need extra information from the compute node. For
# example which PCI PF the VF is allocated from and which RP
# represents that PCI PF in placement.
# TODO(gibi): One way to get that missing information to resolve
# ambiguity would be to load up the InstancePciRequest objects and
# try to use the parent_if_name in their spec to find the proper
# candidate that allocates for the same port from the PF RP that
# has the same name.
port_ids = [port['id'] for port in ports_to_heal]
raise exception.AllocationUpdateFailed(
consumer_uuid=instance.uuid,
error=f'Placement returned more than one possible allocation '
f'candidates to fulfill the resource request of the '
f'port(s) {port_ids}. This script does not have enough '
f'information to select the proper candidate to heal the '
f'missing allocations. A possible way to heal the '
f'allocation of this instance is to migrate it to '
f'another compute as the migration process re-creates '
f'the full allocation on the target host.'
)
# So we have one candidate, let's use that to get the needed allocations
# and the provider mapping for the ports' binding profile
alloc_req = alloc_reqs[0]
allocations = alloc_req["allocations"]
provider_mappings = alloc_req["mappings"]
for port in ports_to_heal:
port_allocation = self._get_port_allocation(
ctxt, node_uuid, port, instance.uuid, placement)
rp_uuid = list(port_allocation)[0]
allocations = self._merge_allocations(
allocations, port_allocation)
# We also need to record the RP we are allocated from in the
# We also need to record the RPs we are allocated from in the
# port. This will be sent back to Neutron before the allocation
# is updated in placement
profile_allocation = self._get_port_binding_profile_allocation(
ctxt, neutron, port, request_groups_per_port[port['id']],
provider_mappings
)
binding_profile = neutron_api.get_binding_profile(port)
binding_profile[constants.ALLOCATION] = rp_uuid
binding_profile[constants.ALLOCATION] = profile_allocation
port[constants.BINDING_PROFILE] = binding_profile
output(_("Found resource provider %(rp_uuid)s having matching "
"traits for port %(port_uuid)s with resource request "
"%(request)s attached to instance %(instance_uuid)s") %
{"rp_uuid": rp_uuid, "port_uuid": port["id"],
"request": port.get(constants.RESOURCE_REQUEST),
"instance_uuid": instance.uuid})
output(_(
"Found a request group : resource provider mapping "
"%(mapping)s for the port %(port_uuid)s with resource request "
"%(request)s attached to the instance %(instance_uuid)s") %
{"mapping": profile_allocation, "port_uuid": port['id'],
"request": port.get(constants.RESOURCE_REQUEST),
"instance_uuid": instance.uuid}
)
return allocations, ports_to_heal
@@ -1791,15 +1871,6 @@ class PlacementCommands(object):
a given instance with consumer project/user information
:raise UnableToQueryPorts: If the neutron list ports query fails.
:raise PlacementAPIConnectFailure: if placement API cannot be reached
:raise ResourceProviderRetrievalFailed: if the resource provider
representing the compute node the instance is running on does not
exist.
:raise ResourceProviderTraitRetrievalFailed: if resource provider
trait information cannot be read from placement.
:raise MoreThanOneResourceProviderToHealFrom: if it cannot be decided
unambiguously which resource provider to heal from.
:raise NoResourceProviderToHealFrom: if there is no resource provider
found to heal from.
:raise UnableToUpdatePorts: if a port update failed in neutron but any
partial update was rolled back successfully.
:raise UnableToRollbackPortUpdates: if a port update failed in neutron
@@ -1956,15 +2027,6 @@ class PlacementCommands(object):
a given instance with consumer project/user information
:raise UnableToQueryPorts: If the neutron list ports query fails.
:raise PlacementAPIConnectFailure: if placement API cannot be reached
:raise ResourceProviderRetrievalFailed: if the resource provider
representing the compute node the instance is running on does not
exist.
:raise ResourceProviderTraitRetrievalFailed: if resource provider
trait information cannot be read from placement.
:raise MoreThanOneResourceProviderToHealFrom: if it cannot be decided
unambiguously which resource provider to heal from.
:raise NoResourceProviderToHealFrom: if there is no resource provider
found to heal from.
:raise UnableToUpdatePorts: if a port update failed in neutron but any
partial update was rolled back successfully.
:raise UnableToRollbackPortUpdates: if a port update failed in neutron
@@ -2186,13 +2248,11 @@ class PlacementCommands(object):
except exception.ComputeHostNotFound as e:
print(e.format_message())
return 2
except (exception.AllocationCreateFailed,
exception.AllocationUpdateFailed,
exception.NoResourceProviderToHealFrom,
exception.MoreThanOneResourceProviderToHealFrom,
exception.PlacementAPIConnectFailure,
exception.ResourceProviderRetrievalFailed,
exception.ResourceProviderTraitRetrievalFailed) as e:
except (
exception.AllocationCreateFailed,
exception.AllocationUpdateFailed,
exception.PlacementAPIConnectFailure
) as e:
print(e.format_message())
return 3
except exception.UnableToQueryPorts as e:
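
As an illustration of the legacy/extended branching implemented above
(see _get_resource_request_from_ports and
_get_port_binding_profile_allocation), the following standalone sketch
shows the two shapes side by side. It uses plain dicts and hypothetical
helper names rather than nova's RequestGroup objects:

    # Standalone sketch of the legacy/extended branching; plain dicts only.

    def request_groups_from_port(port, extended_enabled):
        """Return a list of request-group dicts for one port."""
        resource_request = port.get('resource_request') or {}
        if extended_enabled:
            # extended format: the port already carries explicit groups
            return list(resource_request.get('request_groups', []))
        # legacy format: a single implicit group identified by the port id
        return [{
            'id': port['id'],
            'resources': resource_request.get('resources', {}),
            'required': resource_request.get('required', []),
        }]


    def binding_profile_allocation(port, groups, provider_mappings,
                                   extended_enabled):
        """Return the value for the 'allocation' key of the binding profile."""
        if extended_enabled:
            # one RP per numbered group, keyed by the group id
            return {g['id']: provider_mappings[g['id']][0] for g in groups}
        # legacy format: a single RP UUID for the whole port
        return provider_mappings[port['id']][0]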

View File

@@ -2291,28 +2291,6 @@ class HealPortAllocationException(NovaException):
msg_fmt = _("Healing port allocation failed.")
class MoreThanOneResourceProviderToHealFrom(HealPortAllocationException):
msg_fmt = _("More than one matching resource provider %(rp_uuids)s is "
"available for healing the port allocation for port "
"%(port_id)s for instance %(instance_uuid)s. This script "
"does not have enough information to select the proper "
"resource provider from which to heal.")
class NoResourceProviderToHealFrom(HealPortAllocationException):
msg_fmt = _("No matching resource provider is "
"available for healing the port allocation for port "
"%(port_id)s for instance %(instance_uuid)s. There are no "
"resource providers with matching traits %(traits)s in the "
"provider tree of the resource provider %(node_uuid)s ."
"This probably means that the neutron QoS configuration is "
"wrong. Consult with "
"https://docs.openstack.org/neutron/latest/admin/"
"config-qos-min-bw.html for information on how to configure "
"neutron. If the configuration is fixed the script can be run "
"again.")
class UnableToQueryPorts(HealPortAllocationException):
msg_fmt = _("Unable to query ports for instance %(instance_uuid)s: "
"%(error)s")

View File

@@ -1,4 +1,5 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
@@ -20,7 +21,7 @@ import mock
from neutronclient.common import exceptions as neutron_client_exc
import os_resource_classes as orc
from oslo_serialization import jsonutils
from oslo_utils.fixture import uuidsentinel
from oslo_utils.fixture import uuidsentinel as uuids
from oslo_utils import timeutils
from nova.cmd import manage
@@ -33,7 +34,7 @@ from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import fixtures as func_fixtures
from nova.tests.functional import integrated_helpers
from nova.tests.functional import test_servers_resource_request
from nova.tests.functional import test_servers_resource_request as test_res_req
from nova import utils as nova_utils
CONF = config.CONF
@@ -1063,7 +1064,8 @@ class TestNovaManagePlacementHealAllocations(
class TestNovaManagePlacementHealPortAllocations(
test_servers_resource_request.PortResourceRequestBasedSchedulingTestBase):
test_res_req.PortResourceRequestBasedSchedulingTestBase
):
def setUp(self):
super(TestNovaManagePlacementHealPortAllocations, self).setUp()
@@ -1087,15 +1089,17 @@ class TestNovaManagePlacementHealPortAllocations(
bound_port = self.neutron._ports[port_id]
bound_port[constants.RESOURCE_REQUEST] = resource_request
@staticmethod
def _get_default_resource_request(port_uuid):
return {
"resources": {
orc.NET_BW_IGR_KILOBIT_PER_SEC: 1000,
orc.NET_BW_EGR_KILOBIT_PER_SEC: 1000},
"required": ["CUSTOM_PHYSNET2", "CUSTOM_VNIC_TYPE_NORMAL"]
}
def _create_server_with_missing_port_alloc(
self, ports, resource_request=None):
if not resource_request:
resource_request = {
"resources": {
orc.NET_BW_IGR_KILOBIT_PER_SEC: 1000,
orc.NET_BW_EGR_KILOBIT_PER_SEC: 1000},
"required": ["CUSTOM_PHYSNET2", "CUSTOM_VNIC_TYPE_NORMAL"]
}
server = self._create_server(
flavor=self.flavor,
@@ -1105,14 +1109,22 @@ class TestNovaManagePlacementHealPortAllocations(
# This is a hack to simulate that we have a server that is missing
# allocation for its port
for port in ports:
self._add_resource_request_to_a_bound_port(
port['id'], resource_request)
if not resource_request:
rr = self._get_default_resource_request(port['id'])
else:
rr = resource_request
self._add_resource_request_to_a_bound_port(port['id'], rr)
updated_ports = [
self.neutron.show_port(port['id'])['port'] for port in ports]
return server, updated_ports
def _get_resource_request(self, port):
res_req = port[constants.RESOURCE_REQUEST]
return res_req
def _assert_placement_updated(self, server, ports):
rsp = self.placement.get(
'/allocations/%s' % server['id'],
@@ -1137,7 +1149,7 @@ class TestNovaManagePlacementHealPortAllocations(
# bridge RP
total_request = collections.defaultdict(int)
for port in ports:
port_request = port[constants.RESOURCE_REQUEST]['resources']
port_request = self._get_resource_request(port)['resources']
for rc, amount in port_request.items():
total_request[rc] += amount
self.assertEqual(total_request, network_allocations)
@@ -1362,83 +1374,64 @@ class TestNovaManagePlacementHealPortAllocations(
'Successfully created allocations for instance', output)
self.assertEqual(0, result)
def test_heal_port_allocation_not_enough_resources_for_port(self):
"""Test that a port needs allocation but not enough inventory
available.
"""
@staticmethod
def _get_too_big_resource_request():
# The port will request too much NET_BW_IGR_KILOBIT_PER_SEC so there is
# no RP on the host that can provide it.
resource_request = {
return {
"resources": {
orc.NET_BW_IGR_KILOBIT_PER_SEC: 100000000000,
orc.NET_BW_EGR_KILOBIT_PER_SEC: 1000},
"required": ["CUSTOM_PHYSNET2",
"CUSTOM_VNIC_TYPE_NORMAL"]
}
def test_heal_port_allocation_not_enough_resources_for_port(self):
"""Test that a port needs allocation but not enough inventory
available.
"""
server, ports = self._create_server_with_missing_port_alloc(
[self.neutron.port_1], resource_request)
[self.neutron.port_1], self._get_too_big_resource_request())
# let's trigger a heal
result = self.cli.heal_allocations(verbose=True, max_count=2)
self._assert_placement_not_updated(server)
# Actually the ports were updated but the update is rolled back when
# the placement update failed
self._assert_ports_not_updated(ports)
output = self.output.getvalue()
self.assertIn(
'Rolling back port update',
output)
self.assertIn(
'Failed to update allocations for consumer',
'Placement returned no allocation candidate',
output)
self.assertEqual(3, result)
def test_heal_port_allocation_no_rp_providing_required_traits(self):
"""Test that a port needs allocation but no rp is providing the
required traits.
"""
# The port will request a trait, CUSTOM_PHYSNET_NONEXISTENT that will
# not be provided by any RP on this host
resource_request = {
"resources": {
orc.NET_BW_IGR_KILOBIT_PER_SEC: 1000,
orc.NET_BW_EGR_KILOBIT_PER_SEC: 1000},
"required": ["CUSTOM_PHYSNET_NONEXISTENT",
"CUSTOM_VNIC_TYPE_NORMAL"]
}
server, ports = self._create_server_with_missing_port_alloc(
[self.neutron.port_1], resource_request)
# let's trigger a heal
result = self.cli.heal_allocations(verbose=True, max_count=2)
self._assert_placement_not_updated(server)
self._assert_ports_not_updated(ports)
self.assertIn(
'No matching resource provider is available for healing the port '
'allocation',
self.output.getvalue())
self.assertEqual(3, result)
def test_heal_port_allocation_ambiguous_rps(self):
"""Test that there are more than one matching RPs are available on the
compute.
"""
# The port will request CUSTOM_VNIC_TYPE_DIRECT trait and there are
# two RPs that supports such trait.
resource_request = {
@staticmethod
def _get_ambiguous_resource_request():
return {
"resources": {
orc.NET_BW_IGR_KILOBIT_PER_SEC: 1000,
orc.NET_BW_EGR_KILOBIT_PER_SEC: 1000},
"required": ["CUSTOM_PHYSNET2",
"CUSTOM_VNIC_TYPE_DIRECT"]
}
def test_heal_port_allocation_ambiguous_candidates(self):
"""Test that there are more than one matching set of RPs are available
on the compute.
"""
# Add bandwidth inventory for PF3 so that both PF2 and PF3 could
# support the port's request making the situation ambiguous
inventories = {
orc.NET_BW_IGR_KILOBIT_PER_SEC: {"total": 100000},
orc.NET_BW_EGR_KILOBIT_PER_SEC: {"total": 100000},
}
self._set_provider_inventories(
self.sriov_dev_rp_per_host[self.compute1_rp_uuid][self.PF3],
{"inventories": inventories}
)
server, ports = self._create_server_with_missing_port_alloc(
[self.neutron.port_1], resource_request)
[self.neutron.port_1], self._get_ambiguous_resource_request())
# let's trigger a heal
result = self.cli.heal_allocations(verbose=True, max_count=2)
@@ -1447,7 +1440,7 @@ class TestNovaManagePlacementHealPortAllocations(
self._assert_ports_not_updated(ports)
self.assertIn(
'More than one matching resource provider',
' Placement returned more than one possible allocation candidates',
self.output.getvalue())
self.assertEqual(3, result)
@@ -1595,12 +1588,18 @@ class TestNovaManagePlacementHealPortAllocations(
output)
self.assertEqual(7, result)
def _test_heal_port_allocation_placement_unavailable(
self, server, ports, error):
def test_heal_port_allocation_placement_unavailable_during_a_c(self):
server, ports = self._create_server_with_missing_port_alloc(
[self.neutron.port_1])
with mock.patch('nova.cmd.manage.PlacementCommands.'
'_get_rps_in_tree_with_required_traits',
side_effect=error):
# Simulate that placement is unavailable during the allocation
# candidate query. The safe_connect decorator signals that via
# returning None.
with mock.patch(
'nova.scheduler.client.report.SchedulerReportClient.'
'get_allocation_candidates',
return_value=None
):
result = self.cli.heal_allocations(verbose=True, max_count=2)
self._assert_placement_not_updated(server)
@@ -1608,18 +1607,194 @@ class TestNovaManagePlacementHealPortAllocations(
self.assertEqual(3, result)
def test_heal_port_allocation_placement_unavailable(self):
def test_heal_port_allocation_placement_unavailable_during_update(self):
server, ports = self._create_server_with_missing_port_alloc(
[self.neutron.port_1])
for error in [
exception.PlacementAPIConnectFailure(),
exception.ResourceProviderRetrievalFailed(uuid=uuidsentinel.rp1),
exception.ResourceProviderTraitRetrievalFailed(
uuid=uuidsentinel.rp1)]:
# Simulate that placement is unavailable during updating the
# allocation. The retry decorator signals that via returning False
with mock.patch(
'nova.scheduler.client.report.SchedulerReportClient.'
'put_allocations',
return_value=False
):
result = self.cli.heal_allocations(verbose=True, max_count=2)
self._test_heal_port_allocation_placement_unavailable(
server, ports, error)
self._assert_placement_not_updated(server)
# Actually there was a port update but that was rolled back when
# the allocation update failed
self._assert_ports_not_updated(ports)
output = self.output.getvalue()
self.assertIn(
'Rolling back port update',
output)
self.assertEqual(3, result)
class TestNovaManagePlacementHealPortAllocationsExtended(
TestNovaManagePlacementHealPortAllocations
):
"""Run the same tests as in TestNovaManagePlacementHealPortAllocations but
with the ExtendedResourceRequestNeutronFixture that automatically
translates the resource request in the ports used by these tests to the
extended format. Note that this will test the extended format handling but
only with a single request group per port.
"""
def setUp(self):
super().setUp()
self.neutron = self.useFixture(
test_res_req.ExtendedResourceRequestNeutronFixture(self))
def _get_resource_request(self, port):
# we assume a single resource request group in this test class
res_req = port[constants.RESOURCE_REQUEST][constants.REQUEST_GROUPS]
assert len(res_req) == 1
return res_req[0]
def _assert_port_updated(self, port_uuid):
updated_port = self.neutron.show_port(port_uuid)['port']
binding_profile = updated_port.get('binding:profile', {})
self.assertEqual(
{port_uuid: self.ovs_bridge_rp_per_host[self.compute1_rp_uuid]},
binding_profile['allocation'])
class TestNovaManagePlacementHealPortAllocationsMultiGroup(
TestNovaManagePlacementHealPortAllocations
):
"""Run the same tests as in TestNovaManagePlacementHealPortAllocations but
with the MultiGroupResourceRequestNeutronFixture to test with extended
resource request with multiple groups.
"""
def setUp(self):
super().setUp()
self.neutron = self.useFixture(
test_res_req.MultiGroupResourceRequestNeutronFixture(self))
@staticmethod
def _get_default_resource_request(port_uuid):
# we need unique uuids per port for multi port tests
g1 = getattr(uuids, port_uuid + "group1")
g2 = getattr(uuids, port_uuid + "group2")
return {
"request_groups": [
{
"id": g1,
"resources": {
orc.NET_BW_IGR_KILOBIT_PER_SEC: 1000,
orc.NET_BW_EGR_KILOBIT_PER_SEC: 1000},
"required": [
"CUSTOM_PHYSNET2", "CUSTOM_VNIC_TYPE_NORMAL"
]
},
{
"id": g2,
"resources": {
orc.NET_PACKET_RATE_KILOPACKET_PER_SEC: 1000
},
"required": ["CUSTOM_VNIC_TYPE_NORMAL"]
}
],
"same_subtree": [g1, g2],
}
@staticmethod
def _get_too_big_resource_request():
# The port will request too much NET_BW_IGR_KILOBIT_PER_SEC so there is
# no RP on the host that can provide it.
return {
"request_groups": [
{
"id": uuids.g1,
"resources": {
orc.NET_BW_IGR_KILOBIT_PER_SEC: 10000000000000,
orc.NET_BW_EGR_KILOBIT_PER_SEC: 1000},
"required": [
"CUSTOM_PHYSNET2", "CUSTOM_VNIC_TYPE_NORMAL"
]
},
{
"id": uuids.g2,
"resources": {
orc.NET_PACKET_RATE_KILOPACKET_PER_SEC: 1000
},
"required": ["CUSTOM_VNIC_TYPE_NORMAL"]
}
],
"same_subtree": [uuids.g1, uuids.g2],
}
@staticmethod
def _get_ambiguous_resource_request():
# ambiguity cannot really be simulated with multiple groups as that
# would require multiple OVS bridges instead of multiple PFs. So
# falling back to a single group in this specific test case.
return {
"request_groups": [
{
"id": uuids.g1,
"resources": {
orc.NET_BW_IGR_KILOBIT_PER_SEC: 1000,
orc.NET_BW_EGR_KILOBIT_PER_SEC: 1000},
"required": [
"CUSTOM_PHYSNET2", "CUSTOM_VNIC_TYPE_DIRECT"
]
},
],
"same_subtree": [uuids.g1],
}
def _assert_placement_updated(self, server, ports):
rsp = self.placement.get(
'/allocations/%s' % server['id'],
version=1.28).body
allocations = rsp['allocations']
# we expect one allocation for the compute resources, one on the OVS
# bridge RP due to bandwidth and one on the OVS agent RP due to packet
# rate request
self.assertEqual(3, len(allocations))
self.assertEqual(
self._resources_from_flavor(self.flavor),
allocations[self.compute1_rp_uuid]['resources'])
self.assertEqual(server['tenant_id'], rsp['project_id'])
self.assertEqual(server['user_id'], rsp['user_id'])
ovs_bridge_allocations = allocations[
self.ovs_bridge_rp_per_host[self.compute1_rp_uuid]]['resources']
ovs_agent_allocations = allocations[
self.ovs_agent_rp_per_host[self.compute1_rp_uuid]]['resources']
total_bandwidth_request = collections.defaultdict(int)
total_packet_rate_request = collections.defaultdict(int)
for port in ports:
res_req = (port.get(constants.RESOURCE_REQUEST) or {})
for group in res_req.get(constants.REQUEST_GROUPS):
port_request = group['resources']
for rc, amount in port_request.items():
if rc == orc.NET_PACKET_RATE_KILOPACKET_PER_SEC:
total_packet_rate_request[rc] += amount
else:
total_bandwidth_request[rc] += amount
self.assertEqual(total_bandwidth_request, ovs_bridge_allocations)
self.assertEqual(total_packet_rate_request, ovs_agent_allocations)
def _assert_port_updated(self, port_uuid):
updated_port = self.neutron.show_port(port_uuid)['port']
binding_profile = updated_port.get('binding:profile', {})
self.assertEqual(
{
getattr(uuids, port_uuid + "group1"):
self.ovs_bridge_rp_per_host[self.compute1_rp_uuid],
getattr(uuids, port_uuid + "group2"):
self.ovs_agent_rp_per_host[self.compute1_rp_uuid],
},
binding_profile['allocation']
)
class TestNovaManagePlacementSyncAggregates(

View File

@@ -2780,6 +2780,10 @@ class TestNovaManagePlacement(test.NoDBTestCase):
self.output.getvalue())
self.assertIn("Conflict!", self.output.getvalue())
@mock.patch(
'nova.network.neutron.API.has_extended_resource_request_extension',
new=mock.Mock(return_value=False),
)
def test_has_request_but_no_allocation(self):
# False because there is a full resource_request and allocation set.
self.assertFalse(
@@ -2795,7 +2799,10 @@ class TestNovaManagePlacement(test.NoDBTestCase):
]
},
'binding:profile': {'allocation': uuidsentinel.rp1}
}))
},
mock.sentinel.neutron,
)
)
# True because there is a full resource_request but no allocation set.
self.assertTrue(
self.cli._has_request_but_no_allocation(
@@ -2810,7 +2817,10 @@ class TestNovaManagePlacement(test.NoDBTestCase):
]
},
'binding:profile': {}
}))
},
mock.sentinel.neutron,
)
)
# True because there is a full resource_request but no allocation set.
self.assertTrue(
self.cli._has_request_but_no_allocation(
@@ -2825,73 +2835,89 @@ class TestNovaManagePlacement(test.NoDBTestCase):
]
},
'binding:profile': None,
}))
# False because there are no resources in the resource_request.
self.assertFalse(
self.cli._has_request_but_no_allocation(
{
'id': uuidsentinel.empty_resources,
'resource_request': {
'resources': {},
'required': [
'CUSTOM_VNIC_TYPE_NORMAL'
]
},
'binding:profile': {}
}))
# False because there are no resources in the resource_request.
self.assertFalse(
self.cli._has_request_but_no_allocation(
{
'id': uuidsentinel.missing_resources,
'resource_request': {
'required': [
'CUSTOM_VNIC_TYPE_NORMAL'
]
},
'binding:profile': {}
}))
# False because there are no required traits in the resource_request.
self.assertFalse(
self.cli._has_request_but_no_allocation(
{
'id': uuidsentinel.empty_required,
'resource_request': {
'resources': {
'NET_BW_EGR_KILOBIT_PER_SEC': 1000,
},
'required': []
},
'binding:profile': {}
}))
# False because there are no required traits in the resource_request.
self.assertFalse(
self.cli._has_request_but_no_allocation(
{
'id': uuidsentinel.missing_required,
'resource_request': {
'resources': {
'NET_BW_EGR_KILOBIT_PER_SEC': 1000,
},
},
'binding:profile': {}
}))
# False because there are no resources or required traits in the
# resource_request.
self.assertFalse(
self.cli._has_request_but_no_allocation(
{
'id': uuidsentinel.empty_resource_request,
'resource_request': {},
'binding:profile': {}
}))
},
mock.sentinel.neutron,
)
)
# False because there is no resource_request.
self.assertFalse(
self.cli._has_request_but_no_allocation(
{
'id': uuidsentinel.missing_resource_request,
'binding:profile': {}
}))
},
mock.sentinel.neutron,
)
)
@mock.patch(
'nova.network.neutron.API.has_extended_resource_request_extension',
new=mock.Mock(return_value=True),
)
def test_has_request_but_no_allocation_extended(self):
# False because there is resource_request and allocation set.
self.assertFalse(
self.cli._has_request_but_no_allocation(
{
'id': uuidsentinel.healed,
'resource_request': {
'request_groups': [
{
'id': uuidsentinel.group1,
'resources': {
'NET_BW_EGR_KILOBIT_PER_SEC': 1000,
},
'required': [
'CUSTOM_VNIC_TYPE_NORMAL'
]
},
],
},
'binding:profile': {
'allocation': {uuidsentinel.group1: uuidsentinel.rp1}
}
},
mock.sentinel.neutron,
)
)
# False because there is no resource_request
self.assertFalse(
self.cli._has_request_but_no_allocation(
{
'id': uuidsentinel.healed,
'resource_request': None,
'binding:profile': {
'allocation': {uuidsentinel.group1: uuidsentinel.rp1}
}
},
mock.sentinel.neutron,
)
)
# True because we have a request but no allocation set
self.assertTrue(
self.cli._has_request_but_no_allocation(
{
'id': uuidsentinel.healed,
'resource_request': {
'request_groups': [
{
'id': uuidsentinel.group1,
'resources': {
'NET_BW_EGR_KILOBIT_PER_SEC': 1000,
},
'required': [
'CUSTOM_VNIC_TYPE_NORMAL'
]
},
],
},
'binding:profile': {
'allocation': {}
}
},
mock.sentinel.neutron,
)
)
def test_update_ports_only_updates_binding_profile(self):
"""Simple test to make sure that only the port's binding:profile is