libvirt: implement reshaper for vgpu

This implements the reshaper routine for the libvirt driver
to detect and move, if necessary, VGPU inventory and allocations
from the root compute node provider to a child provider of
VGPU resources. The reshape will be performed on first start
of nova-compute with this code.

For a fresh compute node deploy, no reshaping will be necessary
and the VGPU inventory will start on the child provider.
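
As a rough sketch (provider names and totals below are made up for
illustration, not taken from the code in this change), the reshape turns
the placement data from:

    # Before: VGPU inventory, and any VGPU allocations, sit on the root
    # compute node provider.
    {'compute1': {'VCPU': 48, 'MEMORY_MB': 65536, 'DISK_GB': 2048, 'VGPU': 8}}

into:

    # After: each physical GPU gets a child provider, named
    # <nodename>_<libvirt PCI device id>, holding the VGPU inventory, and
    # existing VGPU allocations are moved to the child provider backing the
    # mediated device each instance uses.
    {'compute1': {'VCPU': 48, 'MEMORY_MB': 65536, 'DISK_GB': 2048},
     'compute1_pci_0000_84_00_0': {'VGPU': 8}}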

Part of blueprint reshape-provider-tree
Part of blueprint vgpu-stein

Co-Authored-By: Sylvain Bauza <sbauza@free.fr>

Change-Id: I511d26dc6487fadfcf22ba747abd385068e975a4
Matt Riedemann 2018-09-01 18:58:17 -04:00 committed by Sylvain Bauza
parent 2d00da78d5
commit 054eb3a652
4 changed files with 760 additions and 76 deletions

@@ -18008,9 +18008,6 @@ class HostStateTestCase(test.NoDBTestCase):
def _get_vcpu_used(self):
return 0
def _get_vgpu_total(self):
return 0
def _get_cpu_info(self):
return HostStateTestCase.cpu_info
@@ -18139,7 +18136,7 @@ class TestUpdateProviderTree(test.NoDBTestCase):
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_cpu_traits',
new=mock.Mock(return_value=cpu_traits))
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_vgpu_total')
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_gpu_inventories')
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_local_gb_info',
return_value={'total': disk_gb})
@mock.patch('nova.virt.libvirt.host.Host.get_memory_mb_total',
@@ -18147,8 +18144,10 @@ class TestUpdateProviderTree(test.NoDBTestCase):
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_vcpu_total',
return_value=vcpus)
def _test_update_provider_tree(self, mock_vcpu, mock_mem, mock_disk,
mock_vgpus, total_vgpus=0):
mock_vgpus.return_value = total_vgpus
mock_gpu_invs, gpu_invs=None):
if gpu_invs:
self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
mock_gpu_invs.return_value = gpu_invs
self.driver.update_provider_tree(self.pt,
self.cn_rp['name'])
@@ -18160,18 +18159,58 @@ class TestUpdateProviderTree(test.NoDBTestCase):
self.pt.data(self.cn_rp['uuid']).traits)
def test_update_provider_tree_with_vgpus(self):
self._test_update_provider_tree(total_vgpus=8)
pci_devices = ['pci_0000_06_00_0', 'pci_0000_07_00_0']
gpu_inventory_dicts = {
pci_devices[0]: {'total': 16,
'max_unit': 16,
'min_unit': 1,
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
},
pci_devices[1]: {'total': 8,
'max_unit': 8,
'min_unit': 1,
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
},
}
self._test_update_provider_tree(gpu_invs=gpu_inventory_dicts)
inventory = self._get_inventory()
# Add VGPU to the expected inventory
inventory[orc.VGPU] = {'step_size': 1,
'min_unit': 1,
'max_unit': 8,
'total': 8}
# root compute node provider inventory is unchanged
self.assertEqual(inventory,
(self.pt.data(self.cn_rp['uuid'])).inventory)
# We should have two new pGPU child providers in the tree under the
# compute node root provider.
compute_node_tree_uuids = self.pt.get_provider_uuids(
self.cn_rp['name'])
self.assertEqual(3, len(compute_node_tree_uuids))
# Create a default GPU inventory with no total and max_unit amounts yet
default_gpu_inventory = {
orc.VGPU: {
'step_size': 1, 'min_unit': 1, 'reserved': 0,
'allocation_ratio': 1.0
}
}
# The pGPU child providers are all the items in the list except the first,
# which is the root provider UUID
for rp_uuid in compute_node_tree_uuids[1:]:
pgpu_provider_data = self.pt.data(rp_uuid)
# Identify which PCI device is related to this Resource Provider
pci_device = (pci_devices[0]
if pci_devices[0] in pgpu_provider_data.name
else pci_devices[1])
self.assertEqual('%s_%s' % (self.cn_rp['name'], pci_device),
pgpu_provider_data.name)
pgpu_inventory = default_gpu_inventory.copy()
inventory_dict = gpu_inventory_dicts[pci_device]
pgpu_inventory[orc.VGPU][
'total'] = inventory_dict['total']
pgpu_inventory[orc.VGPU][
'max_unit'] = inventory_dict['max_unit']
self.assertEqual(pgpu_inventory, pgpu_provider_data.inventory)
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_vgpu_total',
return_value=0)
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_local_gb_info',
return_value={'total': disk_gb})
@mock.patch('nova.virt.libvirt.host.Host.get_memory_mb_total',
@@ -18181,7 +18220,7 @@ class TestUpdateProviderTree(test.NoDBTestCase):
# TODO(efried): Bug #1784020
@unittest.expectedFailure
def test_update_provider_tree_for_shared_disk_gb_resource(
self, mock_vcpu, mock_mem, mock_disk, mock_vgpus):
self, mock_vcpu, mock_mem, mock_disk):
"""Test to check DISK_GB is reported from shared resource
provider.
"""
@@ -18229,6 +18268,207 @@ class TestUpdateProviderTree(test.NoDBTestCase):
self.assertEqual(set(['HW_CPU_X86_AVX512F', 'HW_CPU_X86_BMI']),
self.pt.data(self.cn_rp['uuid']).traits)
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_cpu_traits',
new=mock.Mock(return_value=cpu_traits))
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
'_get_mediated_device_information')
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
'_get_all_assigned_mediated_devices')
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_gpu_inventories')
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_local_gb_info',
return_value={'total': disk_gb})
@mock.patch('nova.virt.libvirt.host.Host.get_memory_mb_total',
return_value=memory_mb)
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_vcpu_total',
return_value=vcpus)
def test_update_provider_tree_for_vgpu_reshape(
self, mock_vcpu, mock_mem, mock_disk, mock_gpus, mock_get_devs,
mock_get_mdev_info):
"""Tests the VGPU reshape scenario."""
self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
# Let's assume we have two PCI devices, each providing 4 vGPUs of this
# type
pci_devices = ['pci_0000_06_00_0', 'pci_0000_07_00_0']
gpu_inventory_dicts = {
pci_devices[0]: {'total': 4,
'max_unit': 4,
'min_unit': 1,
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
},
pci_devices[1]: {'total': 4,
'max_unit': 4,
'min_unit': 1,
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
},
}
mock_gpus.return_value = gpu_inventory_dicts
# Fake the fact that we have one vGPU allocated to one instance and
# this vGPU is on the first PCI device
mock_get_devs.return_value = {uuids.mdev1: uuids.consumer1}
mock_get_mdev_info.side_effect = [
{"dev_id": "mdev_fake",
"uuid": uuids.mdev1,
"parent": pci_devices[0],
"type": "nvidia-11",
"iommu_group": 12
}]
# First create a provider tree with VGPU inventory on the root node
# provider. Since we have 2 devices with 4 vGPUs each, the total is 8,
# as we used to flatten all resources into one single inventory before
inventory = self._get_inventory()
vgpu_inventory = {
orc.VGPU: {
'step_size': 1, 'min_unit': 1, 'max_unit': 8, 'total': 8
}
}
inventory.update(vgpu_inventory)
self.pt.update_inventory(self.cn_rp['uuid'], inventory)
# Call update_provider_tree which will raise ReshapeNeeded because
# there is VGPU inventory on the root node provider.
self.assertRaises(exception.ReshapeNeeded,
self.driver.update_provider_tree,
self.pt, self.cn_rp['name'])
# Now make up some fake allocations to pass back to the upt method
# for the reshape.
allocations = {
uuids.consumer1: {
'allocations': {
# This consumer has ram and vgpu allocations on the root
# node provider and should be changed.
self.cn_rp['uuid']: {
'resources': {
orc.MEMORY_MB: 512,
orc.VGPU: 1
}
}
}
},
uuids.consumer2: {
'allocations': {
# This consumer has ram and vcpu allocations on the root
# node provider and should not be changed.
self.cn_rp['uuid']: {
'resources': {
orc.MEMORY_MB: 256,
orc.VCPU: 2
}
}
}
}
}
original_allocations = copy.deepcopy(allocations)
# Initiate the reshape.
self.driver.update_provider_tree(
self.pt, self.cn_rp['name'], allocations=allocations)
# We should have two new VGPU child providers in the tree under the
# compute node root provider.
compute_node_tree_uuids = self.pt.get_provider_uuids(
self.cn_rp['name'])
self.assertEqual(3, len(compute_node_tree_uuids))
rp_per_pci_device = {}
# The VGPU child providers should be the 2nd and 3rd UUIDs in that list
for rp_uuid in compute_node_tree_uuids[1:]:
# The VGPU inventory should be on the VGPU child provider
pgpu_provider_data = self.pt.data(rp_uuid)
# We want to map the PCI device with the RP UUID
if pci_devices[0] in pgpu_provider_data.name:
rp_per_pci_device[pci_devices[0]] = rp_uuid
elif pci_devices[1] in pgpu_provider_data.name:
rp_per_pci_device[pci_devices[1]] = rp_uuid
# Make sure we have two child resource providers
self.assertEqual(2, len(rp_per_pci_device))
# The compute node root provider should not have VGPU inventory.
del inventory[orc.VGPU]
self.assertEqual(inventory, self.pt.data(self.cn_rp['uuid']).inventory)
# consumer1 should now have allocations against two providers,
# MEMORY_MB on the root compute node provider and VGPU on the child
# provider.
consumer1_allocs = allocations[uuids.consumer1]['allocations']
self.assertEqual(2, len(consumer1_allocs))
self.assertEqual({orc.MEMORY_MB: 512},
consumer1_allocs[self.cn_rp['uuid']]['resources'])
# Make sure the VGPU allocation moved to the corresponding child RP
self.assertEqual(
{orc.VGPU: 1},
consumer1_allocs[rp_per_pci_device[pci_devices[0]]]['resources'])
# The allocations on consumer2 should be unchanged.
self.assertEqual(original_allocations[uuids.consumer2],
allocations[uuids.consumer2])
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_cpu_traits',
new=mock.Mock(return_value=cpu_traits))
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_gpu_inventories')
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_local_gb_info',
return_value={'total': disk_gb})
@mock.patch('nova.virt.libvirt.host.Host.get_memory_mb_total',
return_value=memory_mb)
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_vcpu_total',
return_value=vcpus)
def test_update_provider_tree_for_vgpu_reshape_fails(
self, mock_vcpu, mock_mem, mock_disk, mock_gpus):
"""Tests the VGPU reshape failure scenario where VGPU allocations
are not on the root compute node provider as expected.
"""
self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
# Let's assume we have two PCI devices, each providing 4 vGPUs of this
# type
pci_devices = ['pci_0000_06_00_0', 'pci_0000_07_00_0']
gpu_inventory_dicts = {
pci_devices[0]: {'total': 4,
'max_unit': 4,
'min_unit': 1,
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
},
pci_devices[1]: {'total': 4,
'max_unit': 4,
'min_unit': 1,
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
},
}
mock_gpus.return_value = gpu_inventory_dicts
# First create a provider tree with VGPU inventory on the root node
# provider.
inventory = self._get_inventory()
vgpu_inventory = {
orc.VGPU: {
'step_size': 1, 'min_unit': 1, 'max_unit': 8, 'total': 8
}
}
inventory.update(vgpu_inventory)
self.pt.update_inventory(self.cn_rp['uuid'], inventory)
# Now make up some fake allocations to pass back to the upt method
# for the reshape.
allocations = {
uuids.consumer1: {
'allocations': {
# This consumer has invalid VGPU allocations on a non-root
# compute node provider.
uuids.other_rp: {
'resources': {
orc.MEMORY_MB: 512,
orc.VGPU: 1
}
}
}
}
}
# Initiate the reshape.
ex = self.assertRaises(exception.ReshapeFailed,
self.driver.update_provider_tree,
self.pt, self.cn_rp['name'],
allocations=allocations)
self.assertIn('Unexpected VGPU resource allocation on provider %s'
% uuids.other_rp, six.text_type(ex))
class TraitsComparisonMixin(object):
@@ -20418,37 +20658,62 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
'._get_mediated_devices')
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver'
'._get_mdev_capable_devices')
def test_get_vgpu_total(self, get_mdev_devs, get_mdevs):
get_mdev_devs.return_value = [
{'dev_id': 'pci_0000_84_00_0',
'vendor_id': 0x10de,
'types': {'nvidia-11': {'availableInstances': 14,
def test_get_gpu_inventories(self, get_mdev_capable_devs,
get_mediated_devices):
get_mdev_capable_devs.return_value = [
{"dev_id": "pci_0000_06_00_0",
"vendor_id": 0x10de,
"types": {'nvidia-11': {'availableInstances': 15,
'name': 'GRID M60-0B',
'deviceAPI': 'vfio-pci'},
}}]
get_mdevs.return_value = [
{'dev_id': 'mdev_4b20d080_1b54_4048_85b3_a6a62d165c01',
'uuid': "4b20d080-1b54-4048-85b3-a6a62d165c01",
'parent': 'pci_0000_84_00_0',
'type': 'nvidia-11',
'iommuGroup': 1
},
{'dev_id': 'mdev_4b20d080_1b54_4048_85b3_a6a62d165c02',
'uuid': "4b20d080-1b54-4048-85b3-a6a62d165c02",
'parent': 'pci_0000_84_00_0',
'type': 'nvidia-11',
'iommuGroup': 1
},
}
},
{"dev_id": "pci_0000_07_00_0",
"vendor_id": 0x0000,
"types": {'nvidia-11': {'availableInstances': 7,
'name': 'GRID M60-0B',
'deviceAPI': 'vfio-pci'},
}
},
]
get_mediated_devices.return_value = [{'dev_id': 'mdev_some_uuid1',
'uuid': uuids.mdev1,
'parent': "pci_0000_06_00_0",
'type': 'nvidia-11',
'iommu_group': 1},
{'dev_id': 'mdev_some_uuid2',
'uuid': uuids.mdev2,
'parent': "pci_0000_07_00_0",
'type': 'nvidia-11',
'iommu_group': 1}]
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
# By default, no specific types are supported
self.assertEqual(0, self.drvr._get_vgpu_total())
# If the operator doesn't provide GPU types
self.assertEqual({}, drvr._get_gpu_inventories())
# Now, ask for only one
# Now, set a specific GPU type
self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
# We have 14 available for nvidia-11. We also have 2 mdevs of the type.
# So, as a total, we have 14+2, hence 16.
self.assertEqual(16, self.drvr._get_vgpu_total())
expected = {
# the first GPU also has one mdev allocated against it
'pci_0000_06_00_0': {'total': 15 + 1,
'max_unit': 15 + 1,
'min_unit': 1,
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
},
# the second GPU also has another mdev
'pci_0000_07_00_0': {'total': 7 + 1,
'max_unit': 7 + 1,
'min_unit': 1,
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
},
}
self.assertEqual(expected, drvr._get_gpu_inventories())
get_mdev_capable_devs.assert_called_once_with(types=['nvidia-11'])
get_mediated_devices.assert_called_once_with(types=['nvidia-11'])
@mock.patch.object(host.Host, 'device_lookup_by_name')
@mock.patch.object(host.Host, 'list_mdev_capable_devices')

@@ -985,6 +985,8 @@ class ComputeDriver(object):
:raises ReshapeNeeded: If allocations is None and any inventory needs
to be moved from one provider to another and/or to a different
resource class.
:raises: ReshapeFailed if the requested tree reshape fails for
whatever reason.
"""
raise NotImplementedError()
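As a minimal sketch of how a virt driver is expected to honour this contract
(only the exception classes and provider_tree.update_inventory() come from
the code in this change; the helper names _inventory_needs_to_move,
_move_vgpu_inventory and _build_inventory are hypothetical):

    def update_provider_tree(self, provider_tree, nodename, allocations=None):
        # Hypothetical helper deciding whether inventory must move between
        # providers or resource classes.
        if self._inventory_needs_to_move(provider_tree, nodename):
            if allocations is None:
                # Ask the compute manager to fetch allocations and call us
                # again so the reshape can actually be performed.
                raise exception.ReshapeNeeded()
            try:
                # Hypothetical helper moving inventory and allocations.
                self._move_vgpu_inventory(provider_tree, nodename, allocations)
            except Exception as exc:
                raise exception.ReshapeFailed(error=six.text_type(exc))
        # Hypothetical helper building the regular inventory dict.
        provider_tree.update_inventory(nodename, self._build_inventory())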

@@ -5818,26 +5818,91 @@ class LibvirtDriver(driver.ComputeDriver):
requested_types = CONF.devices.enabled_vgpu_types[:1]
return requested_types
def _get_vgpu_total(self):
"""Returns the number of total available vGPUs for any GPU type that is
enabled with the enabled_vgpu_types CONF option.
def _count_mediated_devices(self, enabled_vgpu_types):
"""Counts the sysfs objects (handles) that represent a mediated device,
filtered by $enabled_vgpu_types.
Those handles can be in use by a libvirt guest or not.
:param enabled_vgpu_types: list of enabled VGPU types on this host
:returns: dict, keyed by parent GPU libvirt PCI device ID, to the number
of mdev device handles for that GPU
"""
requested_types = self._get_supported_vgpu_types()
counts_per_parent = collections.defaultdict(int)
mediated_devices = self._get_mediated_devices(types=enabled_vgpu_types)
for mdev in mediated_devices:
counts_per_parent[mdev['parent']] += 1
return counts_per_parent
def _count_mdev_capable_devices(self, enabled_vgpu_types):
"""Counts the mdev-capable devices on this host filtered by
$enabled_vgpu_types.
:param enabled_vgpu_types: list of enabled VGPU types on this host
:returns: dict, keyed by device name, to the total number of available
instances across the enabled types for that device
"""
mdev_capable_devices = self._get_mdev_capable_devices(
types=enabled_vgpu_types)
counts_per_dev = collections.defaultdict(int)
for dev in mdev_capable_devices:
# dev_id is the libvirt name for the PCI device,
# eg. pci_0000_84_00_0 which matches a PCI address of 0000:84:00.0
dev_name = dev['dev_id']
for _type in dev['types']:
available = dev['types'][_type]['availableInstances']
# TODO(sbauza): Once we support multiple types, check which
# PCI devices are set for this type
# NOTE(sbauza): Even if we support multiple types, Nova will
# only use one per physical GPU.
counts_per_dev[dev_name] += available
return counts_per_dev
def _get_gpu_inventories(self):
"""Returns the inventories for each physical GPU for a specific type
supported by the enabled_vgpu_types CONF option.
:returns: dict, keyed by libvirt PCI name, of dicts like:
{'pci_0000_84_00_0':
{'total': $TOTAL,
'min_unit': 1,
'max_unit': $TOTAL,
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
}
}
"""
# Bail out early if operator doesn't care about providing vGPUs
if not requested_types:
return 0
enabled_vgpu_types = self._get_supported_vgpu_types()
if not enabled_vgpu_types:
return {}
inventories = {}
count_per_parent = self._count_mediated_devices(enabled_vgpu_types)
for dev_name, count in count_per_parent.items():
inventories[dev_name] = {'total': count}
# Filter how many available mdevs we can create for all the supported
# types.
mdev_capable_devices = self._get_mdev_capable_devices(requested_types)
vgpus = 0
for dev in mdev_capable_devices:
for _type in dev['types']:
vgpus += dev['types'][_type]['availableInstances']
# Count the already created (but possibly not assigned to a guest)
# mdevs for all the supported types
mediated_devices = self._get_mediated_devices(requested_types)
vgpus += len(mediated_devices)
return vgpus
count_per_dev = self._count_mdev_capable_devices(enabled_vgpu_types)
# Combine the counts into the dict that we return to the caller.
for dev_name, count in count_per_dev.items():
inv_per_parent = inventories.setdefault(
dev_name, {'total': 0})
inv_per_parent['total'] += count
inv_per_parent.update({
'min_unit': 1,
'step_size': 1,
'reserved': 0,
# NOTE(sbauza): A ratio other than 1.0 makes no sense, since we
# can't overallocate vGPU resources
'allocation_ratio': 1.0,
# FIXME(sbauza): Some vendors could support only one
'max_unit': inv_per_parent['total'],
})
return inventories
def _get_instance_capabilities(self):
"""Get hypervisor instance capabilities
@@ -6106,6 +6171,8 @@ class LibvirtDriver(driver.ComputeDriver):
:returns: A dictionary of keys being mediated device UUIDs and their
respective values the instance UUID of the guest using it.
Returns an empty dict if an instance is provided but not
found in the hypervisor.
"""
allocated_mdevs = {}
if instance:
@@ -6542,23 +6609,13 @@ class LibvirtDriver(driver.ComputeDriver):
:raises ReshapeNeeded: If allocations is None and any inventory needs
to be moved from one provider to another and/or to a different
resource class.
:raises: ReshapeFailed if the requested tree reshape fails for
whatever reason.
"""
disk_gb = int(self._get_local_gb_info()['total'])
memory_mb = int(self._host.get_memory_mb_total())
vcpus = self._get_vcpu_total()
# NOTE(sbauza): For the moment, the libvirt driver only supports
# providing the total number of virtual GPUs for a single GPU type. If
# you have multiple physical GPUs, each of them providing multiple GPU
# types, libvirt will return the total sum of virtual GPUs
# corresponding to the single type passed in enabled_vgpu_types
# configuration option. Eg. if you have 2 pGPUs supporting 'nvidia-35',
# each of them having 16 available instances, the total here will be
# 32.
# If one of the 2 pGPUs doesn't support 'nvidia-35', it won't be used.
# TODO(sbauza): Use traits to make a better world.
vgpus = self._get_vgpu_total()
# NOTE(yikun): If the inv record does not exists, the allocation_ratio
# will use the CONF.xxx_allocation_ratio value if xxx_allocation_ratio
# is set, and fallback to use the initial_xxx_allocation_ratio
@@ -6600,14 +6657,17 @@ class LibvirtDriver(driver.ComputeDriver):
'reserved': self._get_reserved_host_disk_gb_from_config(),
}
if vgpus > 0:
# Only provide VGPU resource classes if the driver supports it.
result[orc.VGPU] = {
'total': vgpus,
'min_unit': 1,
'max_unit': vgpus,
'step_size': 1,
}
# NOTE(sbauza): For the moment, the libvirt driver only supports
# providing the total number of virtual GPUs for a single GPU type. If
# you have multiple physical GPUs, each of them providing multiple GPU
# types, only one type will be used for each of the physical GPUs.
# If one of the pGPUs doesn't support this type, it won't be used.
# TODO(sbauza): Use traits to make a better world.
inventories_dict = self._get_gpu_inventories()
if inventories_dict:
self._update_provider_tree_for_vgpu(
inventories_dict, provider_tree, nodename,
allocations=allocations)
provider_tree.update_inventory(nodename, result)
@@ -6625,6 +6685,351 @@ class LibvirtDriver(driver.ComputeDriver):
# so that spawn() or other methods can access it thru a getter
self.provider_tree = copy.deepcopy(provider_tree)
@staticmethod
def _is_reshape_needed_vgpu_on_root(provider_tree, nodename):
"""Determine if root RP has VGPU inventories.
Check whether the root compute node provider in the tree for this host
already has VGPU inventory. If it does, we either need to signal for a
reshape (when _update_provider_tree_for_vgpu() is given no allocations)
or move the allocations within the ProviderTree when they are passed.
:param provider_tree: The ProviderTree object for this host.
:param nodename: The ComputeNode.hypervisor_hostname, also known as
the name of the root node provider in the tree for this host.
:returns: boolean, whether we have VGPU root inventory.
"""
root_node = provider_tree.data(nodename)
return orc.VGPU in root_node.inventory
@staticmethod
def _ensure_pgpu_providers(inventories_dict, provider_tree, nodename):
"""Ensures GPU inventory providers exist in the tree for $nodename.
GPU providers are named $nodename_$gpu-device-id, e.g.
``somehost.foo.bar.com_pci_0000_84_00_0``.
:param inventories_dict: Dictionary of inventories for VGPU class
directly provided by _get_gpu_inventories() and which looks like:
{'pci_0000_84_00_0':
{'total': $TOTAL,
'min_unit': 1,
'max_unit': $MAX_UNIT, # defaults to $TOTAL
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
}
}
:param provider_tree: The ProviderTree to update.
:param nodename: The ComputeNode.hypervisor_hostname, also known as
the name of the root node provider in the tree for this host.
:returns: dict, keyed by GPU device ID, to ProviderData object
representing that resource provider in the tree
"""
# Create the VGPU child providers if they do not already exist.
# TODO(mriedem): For the moment, _get_supported_vgpu_types() only
# returns one single type but that will be changed once we support
# multiple types.
# Note that we can't support multiple vgpu types until a reshape has
# been performed on the vgpu resources provided by the root provider,
# if any.
# Dict of PGPU RPs keyed by their libvirt PCI name
pgpu_rps = {}
for pgpu_dev_id, inventory in inventories_dict.items():
# For each physical GPU, we make sure to have a child provider
pgpu_rp_name = '%s_%s' % (nodename, pgpu_dev_id)
if not provider_tree.exists(pgpu_rp_name):
# This is the first time creating the child provider so add
# it to the tree under the root node provider.
provider_tree.new_child(pgpu_rp_name, nodename)
# We want to idempotently return the resource providers with VGPUs
pgpu_rp = provider_tree.data(pgpu_rp_name)
pgpu_rps[pgpu_dev_id] = pgpu_rp
# The VGPU inventory goes on a child provider of the given root
# node, identified by $nodename.
pgpu_inventory = {orc.VGPU: inventory}
provider_tree.update_inventory(pgpu_rp_name, pgpu_inventory)
return pgpu_rps
@staticmethod
def _assert_is_root_provider(
rp_uuid, root_node, consumer_uuid, alloc_data):
"""Asserts during a reshape that rp_uuid is for the root node provider.
When reshaping, inventory and allocations should be on the root node
provider and then moved to child providers.
:param rp_uuid: UUID of the provider that holds inventory/allocations.
:param root_node: ProviderData object representing the root node in a
provider tree.
:param consumer_uuid: UUID of the consumer (instance) holding resource
allocations against the given rp_uuid provider.
:param alloc_data: dict of allocation data for the consumer.
:raises: ReshapeFailed if rp_uuid is not the root node indicating a
reshape was needed but the inventory/allocation structure is not
expected.
"""
if rp_uuid != root_node.uuid:
# Something is wrong - VGPU inventory should
# only be on the root node provider if we are
# reshaping the tree.
msg = (_('Unexpected VGPU resource allocation '
'on provider %(rp_uuid)s for consumer '
'%(consumer_uuid)s: %(alloc_data)s. '
'Expected VGPU allocation to be on root '
'compute node provider %(root_uuid)s.')
% {'rp_uuid': rp_uuid,
'consumer_uuid': consumer_uuid,
'alloc_data': alloc_data,
'root_uuid': root_node.uuid})
raise exception.ReshapeFailed(error=msg)
def _get_assigned_mdevs_for_reshape(
self, instance_uuid, rp_uuid, alloc_data):
"""Gets the mediated devices assigned to the instance during a reshape.
:param instance_uuid: UUID of the instance consuming VGPU resources
on this host.
:param rp_uuid: UUID of the resource provider with VGPU inventory being
consumed by the instance.
:param alloc_data: dict of allocation data for the instance consumer.
:return: list of mediated device UUIDs assigned to the instance
:raises: ReshapeFailed if the instance is not found in the hypervisor
or no mediated devices were found to be assigned to the instance
indicating VGPU allocations are out of sync with the hypervisor
"""
# FIXME(sbauza): We don't really need an Instance
# object, but since some libvirt.host logging needs
# an instance name, just provide a fake one
Instance = collections.namedtuple('Instance', ['uuid', 'name'])
instance = Instance(uuid=instance_uuid, name=instance_uuid)
mdevs = self._get_all_assigned_mediated_devices(instance)
# _get_all_assigned_mediated_devices returns {} if the instance is
# not found in the hypervisor
if not mdevs:
# If we found a VGPU allocation against a consumer
# which is not an instance, the only remaining case for
# Nova would be a migration, but we don't support
# that at the moment.
msg = (_('Unexpected VGPU resource allocation on provider '
'%(rp_uuid)s for consumer %(consumer_uuid)s: '
'%(alloc_data)s. The allocation is made against a '
'non-existing instance or there are no devices assigned.')
% {'rp_uuid': rp_uuid, 'consumer_uuid': instance_uuid,
'alloc_data': alloc_data})
raise exception.ReshapeFailed(error=msg)
return mdevs
def _count_vgpus_per_pgpu(self, mdev_uuids):
"""Count the number of VGPUs per physical GPU for the given mediated
devices.
:param mdev_uuids: List of mediated device UUIDs.
:return: dict, keyed by PGPU device ID, to count of VGPUs on that
device
"""
vgpu_count_per_pgpu = collections.defaultdict(int)
for mdev_uuid in mdev_uuids:
# libvirt name is like mdev_00ead764_fdc0_46b6_8db9_2963f5c815b4
dev_name = "mdev_" + mdev_uuid.replace('-', '_')
# Count how many vGPUs are in use for this instance
dev_info = self._get_mediated_device_information(dev_name)
pgpu_dev_id = dev_info['parent']
vgpu_count_per_pgpu[pgpu_dev_id] += 1
return vgpu_count_per_pgpu
@staticmethod
def _check_vgpu_allocations_match_real_use(
vgpu_count_per_pgpu, expected_usage, rp_uuid, consumer_uuid,
alloc_data):
"""Checks that the number of GPU devices assigned to the consumer
matches what is expected from the allocations in the placement service
and logs a warning if there is a mismatch.
:param vgpu_count_per_pgpu: dict, keyed by PGPU device ID, to count of
VGPUs on that device where each device is assigned to the consumer
(guest instance on this hypervisor)
:param expected_usage: The expected usage from placement for the
given resource provider and consumer
:param rp_uuid: UUID of the resource provider with VGPU inventory being
consumed by the instance
:param consumer_uuid: UUID of the consumer (instance) holding resource
allocations against the given rp_uuid provider
:param alloc_data: dict of allocation data for the instance consumer
"""
actual_usage = sum(vgpu_count_per_pgpu.values())
if actual_usage != expected_usage:
# Don't make this a blocking error; just warn so the existing
# resources can be allocated correctly
LOG.warning(
'Unexpected VGPU resource allocation on provider %(rp_uuid)s '
'for consumer %(consumer_uuid)s: %(alloc_data)s. Allocations '
'(%(expected_usage)s) differ from actual use '
'(%(actual_usage)s).',
{'rp_uuid': rp_uuid, 'consumer_uuid': consumer_uuid,
'alloc_data': alloc_data, 'expected_usage': expected_usage,
'actual_usage': actual_usage})
def _reshape_vgpu_allocations(
self, rp_uuid, root_node, consumer_uuid, alloc_data, resources,
pgpu_rps):
"""Update existing VGPU allocations by moving them from the root node
provider to the child provider for the given VGPU provider.
:param rp_uuid: UUID of the VGPU resource provider with allocations
from consumer_uuid (should be the root node provider before
reshaping occurs)
:param root_node: ProviderData object for the root compute node
resource provider in the provider tree
:param consumer_uuid: UUID of the consumer (instance) with VGPU
allocations against the resource provider represented by rp_uuid
:param alloc_data: dict of allocation information for consumer_uuid
:param resources: dict, keyed by resource class, of resources allocated
to consumer_uuid from rp_uuid
:param pgpu_rps: dict, keyed by GPU device ID, to ProviderData object
representing that resource provider in the tree
:raises: ReshapeFailed if the reshape fails for whatever reason
"""
# We've found VGPU allocations on a provider. It should be the root
# node provider.
self._assert_is_root_provider(
rp_uuid, root_node, consumer_uuid, alloc_data)
# Find which physical GPU corresponds to this allocation.
mdev_uuids = self._get_assigned_mdevs_for_reshape(
consumer_uuid, rp_uuid, alloc_data)
vgpu_count_per_pgpu = self._count_vgpus_per_pgpu(mdev_uuids)
# We need to make sure we found all the mediated devices that
# correspond to an allocation.
self._check_vgpu_allocations_match_real_use(
vgpu_count_per_pgpu, resources[orc.VGPU],
rp_uuid, consumer_uuid, alloc_data)
# Add the VGPU allocation for each VGPU provider.
allocs = alloc_data['allocations']
for pgpu_dev_id, pgpu_rp in pgpu_rps.items():
vgpu_count = vgpu_count_per_pgpu[pgpu_dev_id]
if vgpu_count:
allocs[pgpu_rp.uuid] = {
'resources': {
orc.VGPU: vgpu_count
}
}
# And remove the VGPU allocation from the root node provider.
del resources[orc.VGPU]
def _reshape_gpu_resources(
self, allocations, root_node, pgpu_rps):
"""Reshapes the provider tree moving VGPU inventory from root to child
:param allocations:
Dict of allocation data of the form:
{ $CONSUMER_UUID: {
# The shape of each "allocations" dict below is identical
# to the return from GET /allocations/{consumer_uuid}
"allocations": {
$RP_UUID: {
"generation": $RP_GEN,
"resources": {
$RESOURCE_CLASS: $AMOUNT,
...
},
},
...
},
"project_id": $PROJ_ID,
"user_id": $USER_ID,
"consumer_generation": $CONSUMER_GEN,
},
...
}
:params root_node: The root node in the provider tree
:params pgpu_rps: dict, keyed by GPU device ID, to ProviderData object
representing that resource provider in the tree
"""
LOG.info('Reshaping tree; moving VGPU allocations from root '
'provider %s to child providers %s.', root_node.uuid,
pgpu_rps.values())
# For each consumer in the allocations dict, look for VGPU
# allocations and move them to the VGPU provider.
for consumer_uuid, alloc_data in allocations.items():
# Copy and iterate over the current set of providers to avoid
# modifying keys while iterating.
allocs = alloc_data['allocations']
for rp_uuid in list(allocs):
resources = allocs[rp_uuid]['resources']
if orc.VGPU in resources:
self._reshape_vgpu_allocations(
rp_uuid, root_node, consumer_uuid, alloc_data,
resources, pgpu_rps)
def _update_provider_tree_for_vgpu(self, inventories_dict, provider_tree,
nodename, allocations=None):
"""Updates the provider tree for VGPU inventory.
Before Stein, VGPU inventory and allocations were on the root compute
node provider in the tree. Starting in Stein, the VGPU inventory is
on a child provider in the tree. As a result, this method will
"reshape" the tree if necessary on first start of this compute service
in Stein.
:param inventories_dict: Dictionary of inventories for VGPU class
directly provided by _get_gpu_inventories() and which looks like:
{'pci_0000_84_00_0':
{'total': $TOTAL,
'min_unit': 1,
'max_unit': $MAX_UNIT, # defaults to $TOTAL
'step_size': 1,
'reserved': 0,
'allocation_ratio': 1.0,
}
}
:param provider_tree: The ProviderTree to update.
:param nodename: The ComputeNode.hypervisor_hostname, also known as
the name of the root node provider in the tree for this host.
:param allocations: If not None, indicates a reshape was requested and
should be performed.
:raises: nova.exception.ReshapeNeeded if ``allocations`` is None and
the method determines a reshape of the tree is needed, i.e. VGPU
inventory and allocations must be migrated from the root node
provider to a child provider of VGPU resources in the tree.
:raises: nova.exception.ReshapeFailed if the requested tree reshape
fails for whatever reason.
"""
# Check to see if the root compute node provider in the tree for
# this host already has VGPU inventory because if it does, and
# we're not currently reshaping (allocations is None), we need
# to indicate that a reshape is needed to move the VGPU inventory
# onto a child provider in the tree.
# Ensure GPU providers are in the ProviderTree for the given inventory.
pgpu_rps = self._ensure_pgpu_providers(
inventories_dict, provider_tree, nodename)
if self._is_reshape_needed_vgpu_on_root(provider_tree, nodename):
if allocations is None:
# We have old VGPU inventory on the root RP, but we don't have
# allocations yet. That means we need to ask for a reshape.
LOG.info('Requesting provider tree reshape in order to move '
'VGPU inventory from the root compute node provider '
'%s to a child provider.', nodename)
raise exception.ReshapeNeeded()
# We have allocations, which means we already asked for a reshape
# and the Placement API returned them to us. We now need to move
# them from the root RP to the relevant child RPs.
root_node = provider_tree.data(nodename)
# Reshape VGPU provider inventory and allocations, moving them
# from the root node provider to the child providers.
self._reshape_gpu_resources(allocations, root_node, pgpu_rps)
# Only delete the root inventory once the reshape is done
if orc.VGPU in root_node.inventory:
del root_node.inventory[orc.VGPU]
provider_tree.update_inventory(nodename, root_node.inventory)
def get_available_resource(self, nodename):
"""Retrieve resource information.

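To make the reshape of allocations concrete, here is a rough sketch (the
UUID placeholders and amounts are made up) of how a single consumer's entry
in the allocations dict is rewritten by _reshape_vgpu_allocations() above:

    # Before: the instance's VGPU allocation is held by the root compute
    # node provider.
    {'allocations': {
        ROOT_RP_UUID: {'resources': {'MEMORY_MB': 512, 'VGPU': 1}}}}

    # After: the VGPU amount is moved to the child provider backing the
    # physical GPU that hosts the instance's mediated device(s); the other
    # resource classes stay on the root provider.
    {'allocations': {
        ROOT_RP_UUID: {'resources': {'MEMORY_MB': 512}},
        PGPU_RP_UUID: {'resources': {'VGPU': 1}}}}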
@@ -0,0 +1,12 @@
---
upgrade:
- |
The libvirt compute driver will "reshape" VGPU inventories and allocations
on start of the ``nova-compute`` service. This will result in moving
VGPU inventory from the root compute node resource provider to a nested
(child) resource provider in the tree and moving any associated VGPU
allocations with it. This will be a one-time operation on startup in Stein.
There is no end-user visible impact for this; it is for internal resource
tracking purposes. See the `spec`__ for more details.
.. __: https://specs.openstack.org/openstack/nova-specs/specs/stein/approved/reshape-provider-tree.html
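
For example, with a hypothetical compute node named ``gpu-host-1`` exposing a
single physical GPU at PCI address 0000:84:00.0, the ``VGPU`` inventory that
previously lived on the ``gpu-host-1`` root provider ends up on a child
provider named ``gpu-host-1_pci_0000_84_00_0``, while ``VCPU``, ``MEMORY_MB``
and ``DISK_GB`` remain on the root provider.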