From 55a36f8f6a04388cfe5e845c7cd30d7c41cb4b66 Mon Sep 17 00:00:00 2001
From: Sylvain Bauza <sbauza@redhat.com>
Date: Tue, 14 Jun 2022 15:39:47 +0200
Subject: [PATCH] Support multiple allocations for vGPUs

Removing the TODO that only allows one VGPU allocation per instance. Now we no
longer need to support the very old VGPU usage for the root provider, this
is easy.

Change-Id: I48d2b700049c81071710e37c05579239255c3539
Related-Bug: #1758086
Signed-off-by: Sylvain Bauza <sbauza@redhat.com>
---
 doc/source/admin/virtual-gpu.rst              |  34 ++++++
 nova/tests/functional/libvirt/test_vgpu.py    |  14 ++-
 nova/tests/unit/virt/libvirt/test_driver.py   |  50 +++++++--
 nova/virt/libvirt/driver.py                   | 101 +++++++++---------
 .../notes/bug-1758086-e9d147380d149789.yaml   |   9 ++
 5 files changed, 145 insertions(+), 63 deletions(-)
 create mode 100644 releasenotes/notes/bug-1758086-e9d147380d149789.yaml

diff --git a/doc/source/admin/virtual-gpu.rst b/doc/source/admin/virtual-gpu.rst
index c724f1520d08..65bb47d0f972 100644
--- a/doc/source/admin/virtual-gpu.rst
+++ b/doc/source/admin/virtual-gpu.rst
@@ -172,6 +172,39 @@ provided by compute nodes.
    $ openstack server create --flavor vgpu_1 --image cirros-0.3.5-x86_64-uec --wait test-vgpu
 
 
+Ask for more than one vGPU per instance by the flavor
+-----------------------------------------------------
+
+.. versionchanged:: 33.0.0
+
+We have an open bug report `bug 1758086`_ explaining that the nvidia driver
+doesn't support more than one vGPU per instance (and per GPU resource - which
+can be a physical GPU or a virtual function, see nvidia docs for more details).
+In order to alleviate this problem, this is mandatory to require in the flavor
+to have all the vGPUs to be spread between multiple GPU resource providers.
+
+For example, you can request two groups of vGPUs this way :
+
+.. code-block:: console
+
+   $ openstack flavor set vgpu_2 --property "resources1:VGPU=1" \
+                                 --property "resources2:VGPU=1" \
+
+
+With SR-IOV GPUs (you may need to refer to nvidia documentation to know the
+distinction), this will work without requiring further attributes as every
+single VGPU Resource Provider only provides a single VGPU resource.
+
+For non-SRIOV GPUs, you may require other properties in order to request
+Placement to allocate you some host with two distinct GPUs.
+You may need to create distinct custom traits per GPU or custom resource
+classes for explicitly telling in your flavor that you would want resources
+from distinct entities, or you could use ``group_policy=isolate`` as a property
+but you would need to make sure that you don't ask for other resources but
+virtual GPUs in your flavor or Placement would shard all the allocations for
+*all* resource groups.
+
+
 How to discover a GPU type
 --------------------------
 
@@ -490,6 +523,7 @@ For nested vGPUs:
 .. _bug 1762688: https://bugs.launchpad.net/nova/+bug/1762688
 .. _bug 1948705: https://bugs.launchpad.net/nova/+bug/1948705
 .. _supports vGPU live-migrations: https://specs.openstack.org/openstack/nova-specs/specs/2024.1/approved/libvirt-mdev-live-migrate.html
+.. _bug 1758086: https://bugs.launchpad.net/nova/+bug/1758086
 
 .. Links
 .. _Intel GVT-g: https://01.org/igvt-g
diff --git a/nova/tests/functional/libvirt/test_vgpu.py b/nova/tests/functional/libvirt/test_vgpu.py
index 366b3afa1d09..16240c29717a 100644
--- a/nova/tests/functional/libvirt/test_vgpu.py
+++ b/nova/tests/functional/libvirt/test_vgpu.py
@@ -321,10 +321,16 @@ class VGPUTests(VGPUTestBase):
             image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
             flavor_id=flavor, networks='auto', host=self.compute1.host)
 
-        # FIXME(sbauza): Unfortunately, we only accept one allocation per
-        # instance by the libvirt driver as you can see in _allocate_mdevs().
-        # So, eventually, we only have one vGPU for this instance.
-        self.assert_mdev_usage(self.compute1, expected_amount=1)
+        # Eventually, we have two allocations and two mdevs
+        self.assert_mdev_usage(self.compute1, expected_amount=2)
+        # Let's verify those are spread between both GPU RPs
+        rp_uuid = self.compute_rp_uuids['host1']
+        rp_uuids = self._get_all_rp_uuids_in_a_tree(rp_uuid)
+        for rp in rp_uuids:
+            inventory = self._get_provider_inventory(rp)
+            if orc.VGPU in inventory:
+                usage = self._get_provider_usages(rp)
+                self.assertEqual(1, usage[orc.VGPU])
 
 
 class VGPUMultipleTypesTests(VGPUTestBase):
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index c25ea49e1440..a803e89b7f4c 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -28547,13 +28547,14 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
             }
         }
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
-        self.assertIsNone(drvr._allocate_mdevs(allocations=allocations))
+        self.assertEqual([], drvr._allocate_mdevs(allocations=allocations))
 
     def _get_fake_provider_tree_with_vgpu(self):
-        """Returns a fake ProviderTree with VGPU inventory on two children RPs
-        with one with a correct name and the other one wrong.
+        """Returns a fake ProviderTree with VGPU inventory on 3 children RPs
+        with the first two with a correct name and the third wrong.
 
-        The child provider is named rp1 and its UUID is uuids.rp1.
+        The child providers are named rp[1-3] and their UUIDs are uuids.rp1,
+        uuids.rp2 and uuids.rp3
         """
         cn_rp = dict(
             uuid=uuids.cn,
@@ -28573,10 +28574,14 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         pt.new_child(cn_rp['name'] + '_' + 'pci_0000_06_00_0', cn_rp['uuid'],
                      uuid=uuids.rp1, generation=0)
         pt.update_inventory(uuids.rp1, vgpu_rp_inv)
-        # Create a second child with a bad naming convention
-        pt.new_child('oops_I_did_it_again', cn_rp['uuid'],
+        # Create a second child also with a correct naming attribute
+        pt.new_child(cn_rp['name'] + '_' + 'pci_0000_07_00_0', cn_rp['uuid'],
                      uuid=uuids.rp2, generation=0)
         pt.update_inventory(uuids.rp2, vgpu_rp_inv)
+        # Create a third child with a bad naming convention
+        pt.new_child('oops_I_did_it_again', cn_rp['uuid'],
+                     uuid=uuids.rp3, generation=0)
+        pt.update_inventory(uuids.rp3, vgpu_rp_inv)
         return pt
 
     @mock.patch.object(libvirt_driver.LibvirtDriver,
@@ -28603,6 +28608,37 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         get_unassigned_mdevs.assert_called_once_with('pci_0000_06_00_0',
                                                      ['nvidia-11'])
 
+    @mock.patch.object(libvirt_driver.LibvirtDriver,
+                       '_get_existing_mdevs_not_assigned')
+    @mock.patch.object(libvirt_driver.LibvirtDriver,
+                       '_get_supported_mdev_resource_classes')
+    def test_allocate_mdevs_with_multiple_allocs(self, get_supported_mdev_rcs,
+                                                 get_unassigned_mdevs):
+        self.flags(enabled_mdev_types=['nvidia-11'], group='devices')
+        allocations = {
+            uuids.rp1: {
+                'resources': {
+                    orc.VGPU: 1,
+                }
+            },
+            uuids.rp2: {
+                'resources': {
+                    orc.VGPU: 1,
+                }
+            }
+        }
+        get_supported_mdev_rcs.return_value = set([orc.VGPU])
+        get_unassigned_mdevs.side_effect = (set([uuids.mdev1]),
+                                            set([uuids.mdev2]))
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        # Mock the fact update_provider_tree() should have run
+        drvr.provider_tree = self._get_fake_provider_tree_with_vgpu()
+        self.assertEqual([uuids.mdev1, uuids.mdev2],
+                         drvr._allocate_mdevs(allocations=allocations))
+        get_unassigned_mdevs.assert_has_calls(
+            [mock.call('pci_0000_06_00_0', ['nvidia-11']),
+             mock.call('pci_0000_07_00_0', ['nvidia-11'])])
+
     @mock.patch.object(libvirt_driver.LibvirtDriver,
                        '_get_mdev_capable_devices')
     @mock.patch.object(libvirt_driver.LibvirtDriver,
@@ -28663,7 +28699,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
 
         # Test that we were unable to guess the RP name
         allocations = {
-            uuids.rp2: {
+            uuids.rp3: {
                 'resources': {
                     orc.VGPU: 1,
                 }
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 4375dc754b60..ade622d8ad8c 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -9060,43 +9060,39 @@ class LibvirtDriver(driver.ComputeDriver):
                             That code is supporting Placement API version 1.12
         """
         vgpu_allocations = self._vgpu_allocations(allocations)
-        if not vgpu_allocations:
-            return
-        # TODO(sbauza): For the moment, we only support allocations for only
-        # one pGPU.
-        if len(vgpu_allocations) > 1:
-            LOG.warning('More than one allocation was passed over to libvirt '
-                        'while at the moment libvirt only supports one. Only '
-                        'the first allocation will be looked up.')
-        rp_uuid, alloc = next(iter(vgpu_allocations.items()))
-        # We only have one allocation with a supported resource class
-        vgpus_asked = list(alloc['resources'].values())[0]
 
-        # Find if we allocated against a specific pGPU (and then the allocation
-        # is made against a child RP) or any pGPU (in case the VGPU inventory
-        # is still on the root RP)
-        try:
-            allocated_rp = self.provider_tree.data(rp_uuid)
-        except ValueError:
-            # The provider doesn't exist, return a better understandable
-            # exception
-            raise exception.ComputeResourcesUnavailable(
-                reason='mdev-capable resource is not available')
-        # FIXME(sbauza): The functional reshape test assumes that we could
-        # run _allocate_mdevs() against non-nested RPs but this is impossible
-        # as all inventories have been reshaped *before now* since it's done
-        # on init_host() (when the compute restarts or whatever else calls it).
-        # That said, since fixing the functional test isn't easy yet, let's
-        # assume we still support a non-nested RP for now.
-        if allocated_rp.parent_uuid is None:
-            # We are on a root RP
-            parent_device = None
-        else:
+        chosen_mdevs = []
+        for rp_uuid, alloc in vgpu_allocations.items():
+            # We only have one allocation with a supported resource class
+            # FIXME(sbauza): If a new vfio-mdev usage supports more than one
+            # type per PCI device, we would need to modify this. For the
+            # moment, all of the vfio-mdev drivers that we know only support
+            # one type per mdev-supported device.
+            vgpus_asked = list(alloc['resources'].values())[0]
+
+            try:
+                allocated_rp = self.provider_tree.data(rp_uuid)
+            except ValueError:
+                # The provider doesn't exist, return a better understandable
+                # exception
+                raise exception.ComputeResourcesUnavailable(
+                    reason='Resource Provider %s is missing' % rp_uuid)
             rp_name = allocated_rp.name
             # There can be multiple roots, we need to find the root name
             # to guess the physical device name
             roots = list(self.provider_tree.roots)
             for root in roots:
+                # FIXME(sbauza): The functional reshape test assumes that we
+                # could run _allocate_mdevs() against non-nested RPs but this
+                # is impossible as all inventories have been reshaped *before
+                # now* since it's done on init_host() (when the compute
+                # restarts or whatever else calls it). That said, since fixing
+                # the functional test isn't easy yet, let's assume we still
+                # support a non-nested RP for now.
+                if allocated_rp.parent_uuid is None:
+                    # We are on a root RP
+                    parent_device = None
+                    break
                 if rp_name.startswith(root.name + '_'):
                     # The RP name convention is :
                     #    root_name + '_' + parent_device
@@ -9113,28 +9109,29 @@ class LibvirtDriver(driver.ComputeDriver):
                 raise exception.ComputeResourcesUnavailable(
                     reason='mdev-capable resource is not available')
 
-        supported_types = self.supported_vgpu_types
-        # Which mediated devices are created but not assigned to a guest ?
-        mdevs_available = self._get_existing_mdevs_not_assigned(
-            parent_device, supported_types)
+            supported_types = self.supported_vgpu_types
+            # Which mediated devices are created but not assigned to a guest ?
+            mdevs_available = self._get_existing_mdevs_not_assigned(
+                parent_device, supported_types)
 
-        chosen_mdevs = []
-        for c in range(vgpus_asked):
-            chosen_mdev = None
-            if mdevs_available:
-                # Take the first available mdev
-                chosen_mdev = mdevs_available.pop()
-            else:
-                LOG.debug('No available mdevs where found. '
-                          'Creating an new one...')
-                chosen_mdev = self._create_new_mediated_device(parent_device)
-            if not chosen_mdev:
-                # If we can't find devices having available VGPUs, just raise
-                raise exception.ComputeResourcesUnavailable(
-                    reason='mdev-capable resource is not available')
-            else:
-                chosen_mdevs.append(chosen_mdev)
-                LOG.info('Allocated mdev: %s.', chosen_mdev)
+            for c in range(vgpus_asked):
+                chosen_mdev = None
+                if mdevs_available:
+                    # Take the first available mdev
+                    chosen_mdev = mdevs_available.pop()
+                else:
+                    LOG.debug('No available mdevs where found. '
+                              'Creating a new one...')
+                    chosen_mdev = self._create_new_mediated_device(
+                        parent_device)
+                if not chosen_mdev:
+                    # If we can't find devices having available VGPUs, just
+                    # raise
+                    raise exception.ComputeResourcesUnavailable(
+                        reason='mdev-capable resource is not available')
+                else:
+                    chosen_mdevs.append(chosen_mdev)
+                    LOG.info('Allocated mdev: %s.', chosen_mdev)
         return chosen_mdevs
 
     def _detach_mediated_devices(self, guest):
diff --git a/releasenotes/notes/bug-1758086-e9d147380d149789.yaml b/releasenotes/notes/bug-1758086-e9d147380d149789.yaml
new file mode 100644
index 000000000000..22fe5a39d7e1
--- /dev/null
+++ b/releasenotes/notes/bug-1758086-e9d147380d149789.yaml
@@ -0,0 +1,9 @@
+---
+fixes:
+  - |
+    You can now request different resource groups in your flavor for VGPU or
+    generic mediated device custom resource classes. Previously, only the
+    first resource request group was honored. See `bug #1758086`_ for more
+    details.
+
+    .. _bug #1758086: https://bugs.launchpad.net/nova/+bug/1758086