Functional test with pGPUs

We provide two test classes: one focuses on a single supported vGPU type, the other on multiple types. The idea is to check that inventories are created at compute startup with no allocations, and then to verify, when we boot a server, both that the allocations are correctly made against the right pGPU resource provider and that the mdev is created. Change-Id: Ib5dbcee295d8a78b9a55806054de6265c3663343
2020-04-07 10:11:33 +02:00 · 2020-04-07 10:11:33 +02:00 · 9dcc0941f1
parent 5b5cbc64f9
commit 9dcc0941f1
4 changed files with 382 additions and 31 deletions
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@ -429,7 +429,54 @@ class _IntegratedTestBase(test.TestCase, InstanceHelperMixin):
                         ("The expected wsgi middlewares %s are not "
                          "existed") % expected_middleware)
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _get_provider_uuid_by_name(self, name):
        return self.placement_api.get(
            '/resource_providers?name=%s' % name).body[
            'resource_providers'][0]['uuid']
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _get_all_rp_uuids_in_a_tree(self, in_tree_rp_uuid):
        rps = self.placement_api.get(
            '/resource_providers?in_tree=%s' % in_tree_rp_uuid,
            version='1.20').body['resource_providers']
        return [rp['uuid'] for rp in rps]
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _get_provider_inventory(self, rp_uuid):
        return self.placement_api.get(
            '/resource_providers/%s/inventories' % rp_uuid).body['inventories']
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _get_provider_usages(self, provider_uuid):
        return self.placement_api.get(
            '/resource_providers/%s/usages' % provider_uuid).body['usages']
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _create_trait(self, trait):
        return self.placement_api.put('/traits/%s' % trait, {}, version='1.6')
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _set_provider_traits(self, rp_uuid, traits):
        """This will overwrite any existing traits.
        :param rp_uuid: UUID of the resource provider to update
        :param traits: list of trait strings to set on the provider
        :returns: APIResponse object with the results
        """
        provider = self.placement_api.get(
            '/resource_providers/%s' % rp_uuid).body
        put_traits_req = {
            'resource_provider_generation': provider['generation'],
            'traits': traits
        }
        return self.placement_api.put(
            '/resource_providers/%s/traits' % rp_uuid,
            put_traits_req, version='1.6')
 # FIXME(sbauza): There is little value to have this be a whole base testclass
 # instead of a mixin only providing methods for accessing Placement endpoint.
 class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
    """Base test class for functional tests that check provider usage
    and consumer allocations in Placement during various operations.
--- a/nova/tests/functional/libvirt/test_reshape.py
+++ b/nova/tests/functional/libvirt/test_reshape.py
@ -59,10 +59,28 @@ class VGPUReshapeTests(base.ServersTestBase):
        # the old tree as that would be a bad time for reshape. Later when the
        # compute service is restarted the driver will do the reshape.
        mdevs = {
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c01':
                fakelibvirt.FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c01',
                    type_id=fakelibvirt.NVIDIA_11_VGPU_TYPE,
                    parent=fakelibvirt.PGPU1_PCI_ADDR),
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c02':
                fakelibvirt.FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c02',
                    type_id=fakelibvirt.NVIDIA_11_VGPU_TYPE,
                    parent=fakelibvirt.PGPU2_PCI_ADDR),
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c03':
                fakelibvirt.FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c03',
                    type_id=fakelibvirt.NVIDIA_11_VGPU_TYPE,
                    parent=fakelibvirt.PGPU3_PCI_ADDR),
        }
        fake_connection = self._get_connection(
            # We need more RAM or the 3rd server won't be created
            host_info=fakelibvirt.HostInfo(kB_mem=8192),
-            mdev_info=fakelibvirt.HostMdevDevicesInfo())
+            mdev_info=fakelibvirt.HostMdevDevicesInfo(devices=mdevs))
        self.mock_conn.return_value = fake_connection
        # start a compute with vgpu support disabled so the driver will
--- a/nova/tests/functional/libvirt/test_vgpu.py
+++ b/nova/tests/functional/libvirt/test_vgpu.py
@ -0,0 +1,234 @@
 #
 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
 #    not use this file except in compliance with the License. You may obtain
 #    a copy of the License at
 #
 #         http://www.apache.org/licenses/LICENSE-2.0
 #
 #    Unless required by applicable law or agreed to in writing, software
 #    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 #    License for the specific language governing permissions and limitations
 #    under the License.
 import fixtures
 import re
 import os_resource_classes as orc
 from oslo_config import cfg
 from oslo_log import log as logging
 from oslo_utils import uuidutils
 import nova.conf
 from nova import context
 from nova import objects
 from nova.tests.functional.libvirt import base
 from nova.tests.unit.virt.libvirt import fakelibvirt
 from nova.virt.libvirt import utils as libvirt_utils
 CONF = cfg.CONF
 LOG = logging.getLogger(__name__)
 class VGPUTestBase(base.ServersTestBase):
    FAKE_LIBVIRT_VERSION = 5000000
    FAKE_QEMU_VERSION = 3001000
    def setUp(self):
        """Install the patches shared by the vGPU functional tests."""
        super(VGPUTestBase, self).setUp()
        # Make the driver's local disk query return fixed figures.
        disk_info = {'total': 128, 'used': 44, 'free': 84}
        local_gb_patch = fixtures.MockPatch(
            'nova.virt.libvirt.LibvirtDriver._get_local_gb_info',
            return_value=disk_info)
        self.useFixture(local_gb_patch)
        # Route mdev creation to self._create_mdev instead of the real
        # privsep function.
        create_mdev_patch = fixtures.MockPatch(
            'nova.privsep.libvirt.create_mdev',
            side_effect=self._create_mdev)
        self.useFixture(create_mdev_patch)
        self.context = context.get_admin_context()
    def pci2libvirt_address(self, address):
        """Convert a PCI address into a libvirt node device name.

        For example ``0000:81:00.0`` becomes ``pci_0000_81_00_0``.

        :param address: PCI address whose fields are separated by ``:``
            and ``.``
        :returns: the ``pci_``-prefixed, underscore-separated device name
        """
        fields = re.split("[.:]", address)
        return "pci_{}_{}_{}_{}".format(*fields)
    def libvirt2pci_address(self, dev_name):
        """Convert a libvirt node device name into a PCI address.

        For example ``pci_0000_81_00_0`` becomes ``0000:81:00.0``.

        :param dev_name: device name; its first four characters (the
            ``pci_`` prefix in the example) are dropped before splitting
            on ``_``
        :returns: the address formatted as ``domain:bus:slot.function``
        """
        fields = dev_name[4:].split('_')
        return "{}:{}:{}.{}".format(*fields)
    def _create_mdev(self, physical_device, mdev_type, uuid=None):
        """Fake the creation of a mediated device.

        We need to fake the newly created sysfs object by adding a new
        FakeMdevDevice in the existing persisted Connection object so
        when asking to get the existing mdevs, we would see it.

        :param physical_device: PCI address of the parent device, converted
            with ``pci2libvirt_address`` before being stored
        :param mdev_type: type id given to the fake mediated device
        :param uuid: UUID of the mediated device; generated when not given
        :returns: the UUID of the mediated device
        """
        if not uuid:
            uuid = uuidutils.generate_uuid()
        mdev_name = libvirt_utils.mdev_uuid2name(uuid)
        new_mdev = fakelibvirt.FakeMdevDevice(
            dev_name=mdev_name,
            type_id=mdev_type,
            parent=self.pci2libvirt_address(physical_device))
        self.fake_connection.mdev_info.devices.update({mdev_name: new_mdev})
        return uuid
    def _start_compute_service(self, hostname):
        """Start a compute service on a fake host with two pGPUs.

        Once the service is started, every provider of the host's tree that
        reports VGPU inventory is checked to have a total of 16 and no
        usage, and the driver is checked to report no mediated devices.

        :param hostname: name of the compute host to start
        :returns: the started compute service
        """
        host_info = fakelibvirt.HostInfo(cpu_nodes=2, kB_mem=8192)
        # We want to create two pGPUs but no other PCI devices
        pci_info = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2)
        self.fake_connection = self._get_connection(
            host_info=host_info, pci_info=pci_info, hostname=hostname)
        self.mock_conn.return_value = self.fake_connection
        compute = self.start_service('compute', host=hostname)

        root_rp_uuid = self._get_provider_uuid_by_name(hostname)
        for rp_uuid in self._get_all_rp_uuids_in_a_tree(root_rp_uuid):
            inventory = self._get_provider_inventory(rp_uuid)
            if orc.VGPU not in inventory:
                # Only providers exposing VGPU inventory are checked
                continue
            usage = self._get_provider_usages(rp_uuid)
            self.assertEqual(16, inventory[orc.VGPU]['total'])
            self.assertEqual(0, usage[orc.VGPU])

        # Since we haven't created any mdevs yet, we shouldn't find them
        self.assertEqual([], compute.driver._get_mediated_devices())
        return compute
 class VGPUTests(VGPUTestBase):
    def setUp(self):
        """Create a one-VGPU flavor and start ``host1`` with nvidia-11."""
        super(VGPUTests, self).setUp()
        self.flavor = self._create_flavor(
            extra_spec={"resources:VGPU": "1"})
        # Start compute1 supporting only nvidia-11
        self.flags(group='devices',
                   enabled_vgpu_types=fakelibvirt.NVIDIA_11_VGPU_TYPE)
        self.compute1 = self._start_compute_service('host1')
    def test_create_servers_with_vgpu(self):
        """Boot a server asking for one VGPU; check the mdev and the usage."""
        self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, host=self.compute1.host,
            expected_state='ACTIVE')

        # Now we should find a new mdev
        created_mdevs = self.compute1.driver._get_mediated_devices()
        self.assertEqual(1, len(created_mdevs))

        # Checking also the allocations for the parent pGPU, whose provider
        # is looked up by the name "<host>_<parent device name>"
        pgpu_name = created_mdevs[0]['parent']
        pgpu_rp_uuid = self._get_provider_uuid_by_name(
            self.compute1.host + '_' + pgpu_name)
        pgpu_usage = self._get_provider_usages(pgpu_rp_uuid)
        self.assertEqual(1, pgpu_usage[orc.VGPU])
 class VGPUMultipleTypesTests(VGPUTestBase):
    def setUp(self):
        """Prepare ``host1`` with two enabled vGPU types.

        Creates a flavor asking for one VGPU, enables both the nvidia-11 and
        nvidia-12 types, restricts each type to one pGPU address, creates
        the two custom traits used by the tests and starts the compute
        service.
        """
        super(VGPUMultipleTypesTests, self).setUp()
        extra_spec = {"resources:VGPU": "1"}
        self.flavor = self._create_flavor(extra_spec=extra_spec)
        self.flags(
            enabled_vgpu_types=[fakelibvirt.NVIDIA_11_VGPU_TYPE,
                                fakelibvirt.NVIDIA_12_VGPU_TYPE],
            group='devices')
        # We need to call the below again so that the per-type option groups
        # (holding 'device_addresses') are created for the types enabled
        # above.
        # NOTE(review): presumably the value being re-read here is
        # 'enabled_vgpu_types' rather than 'device_addresses' - confirm.
        nova.conf.devices.register_dynamic_opts(CONF)
        # host1 will have 2 physical GPUs:
        #  - 0000:81:00.0 will only support nvidia-11
        #  - 0000:81:01.0 will only support nvidia-12
        pgpu1_pci_addr = self.libvirt2pci_address(fakelibvirt.PGPU1_PCI_ADDR)
        pgpu2_pci_addr = self.libvirt2pci_address(fakelibvirt.PGPU2_PCI_ADDR)
        self.flags(device_addresses=[pgpu1_pci_addr], group='vgpu_nvidia-11')
        self.flags(device_addresses=[pgpu2_pci_addr], group='vgpu_nvidia-12')
        # Create the traits now; they are attached to providers later on
        self._create_trait('CUSTOM_NVIDIA_11')
        self._create_trait('CUSTOM_NVIDIA_12')
        self.compute1 = self._start_compute_service('host1')
    def test_create_servers_with_vgpu(self):
        """Boot a server and check that only the first pGPU is usable."""
        self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor, host=self.compute1.host,
            expected_state='ACTIVE')
        created_mdevs = self.compute1.driver._get_mediated_devices()
        self.assertEqual(1, len(created_mdevs))

        # We can be deterministic: since 0000:81:01.0 is asked to only
        # support nvidia-12 *BUT* doesn't actually have this type as a PCI
        # capability, we are sure that only 0000:81:00.0 is used.
        self.assertEqual(fakelibvirt.PGPU1_PCI_ADDR,
                         created_mdevs[0]['parent'])

        # We are also sure that there is no RP for 0000:81:01.0 since there
        # is no inventory for nvidia-12
        host = self.compute1.host
        tree_rp_uuids = self._get_all_rp_uuids_in_a_tree(
            self._get_provider_uuid_by_name(host))
        # We only have 2 RPs: the root RP and the pGPU1 RP...
        self.assertEqual(2, len(tree_rp_uuids))

        # ... but we double-check by asking the RP by its expected name
        pgpu2_rp_name = host + '_' + fakelibvirt.PGPU2_PCI_ADDR
        response = self.placement_api.get(
            '/resource_providers?name=' + pgpu2_rp_name)
        # See, Placement API returned no RP for this name as it doesn't exist.
        self.assertEqual([], response.body['resource_providers'])
    def test_create_servers_with_specific_type(self):
        """Boot one server per vGPU trait and check which pGPU hosts it."""
        # Regenerate the PCI addresses so both pGPUs now support nvidia-12
        self.fake_connection.pci_info = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
            multiple_gpu_types=True)
        # Make a restart to update the Resource Providers
        self.compute1 = self.restart_compute_service(self.compute1)
        host = self.compute1.host
        driver = self.compute1.driver

        pgpu1_rp_uuid = self._get_provider_uuid_by_name(
            host + '_' + fakelibvirt.PGPU1_PCI_ADDR)
        pgpu2_rp_uuid = self._get_provider_uuid_by_name(
            host + '_' + fakelibvirt.PGPU2_PCI_ADDR)

        pgpu1_inventory = self._get_provider_inventory(pgpu1_rp_uuid)
        self.assertEqual(16, pgpu1_inventory[orc.VGPU]['total'])
        pgpu2_inventory = self._get_provider_inventory(pgpu2_rp_uuid)
        self.assertEqual(8, pgpu2_inventory[orc.VGPU]['total'])

        # Attach traits to the pGPU RPs
        self._set_provider_traits(pgpu1_rp_uuid, ['CUSTOM_NVIDIA_11'])
        self._set_provider_traits(pgpu2_rp_uuid, ['CUSTOM_NVIDIA_12'])

        # Trait required by the flavor -> pGPU expected to host the mdev
        parent_by_trait = {
            'CUSTOM_NVIDIA_11': fakelibvirt.PGPU1_PCI_ADDR,
            'CUSTOM_NVIDIA_12': fakelibvirt.PGPU2_PCI_ADDR,
        }
        for trait, expected_parent in parent_by_trait.items():
            # Build a flavor requiring the trait and use it for booting
            flavor = self._create_flavor(extra_spec={
                "resources:VGPU": "1",
                "trait:%s" % trait: "required",
            })
            server = self._create_server(
                image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
                flavor_id=flavor, host=host,
                expected_state='ACTIVE')
            # Get the instance we just created
            inst = objects.Instance.get_by_uuid(self.context, server['id'])
            # Get the mdevs that were allocated for this instance, we should
            # only have one
            assigned = driver._get_all_assigned_mediated_devices(inst)
            self.assertEqual(1, len(assigned))
            # It's a dict of mdev_uuid/instance_uuid pairs, we only care
            # about the single key
            mdev_uuid = list(assigned)[0]
            # Now get the detailed information about this single mdev
            mdev_info = driver._get_mediated_device_information(
                libvirt_utils.mdev_uuid2name(mdev_uuid))
            # We can be deterministic: since we asked for a specific type,
            # we know which pGPU we landed on.
            self.assertEqual(expected_parent, mdev_info['parent'])
--- a/nova/tests/unit/virt/libvirt/fakelibvirt.py
+++ b/nova/tests/unit/virt/libvirt/fakelibvirt.py
@ -190,10 +190,18 @@ VF_PROD_NAME = 'X540 Ethernet Controller Virtual Function'
 VF_DRIVER_NAME = 'ixgbevf'
 VF_CAP_TYPE = 'phys_function'
 MDEV_CAPABLE_VEND_ID = '10DE'
 MDEV_CAPABLE_VEND_NAME = 'Nvidia'
 MDEV_CAPABLE_PROD_ID = '0FFE'
 MDEV_CAPABLE_PROD_NAME = 'GRID M60-0B'
 MDEV_CAPABLE_DRIVER_NAME = 'nvidia'
 MDEV_CAPABLE_CAP_TYPE = 'mdev_types'
 NVIDIA_11_VGPU_TYPE = 'nvidia-11'
-PGPU1_PCI_ADDR = 'pci_0000_06_00_0'
+NVIDIA_12_VGPU_TYPE = 'nvidia-12'
-PGPU2_PCI_ADDR = 'pci_0000_07_00_0'
+PGPU1_PCI_ADDR = 'pci_0000_81_00_0'
-PGPU3_PCI_ADDR = 'pci_0000_08_00_0'
+PGPU2_PCI_ADDR = 'pci_0000_81_01_0'
 PGPU3_PCI_ADDR = 'pci_0000_81_02_0'
 class FakePCIDevice(object):
@ -235,9 +243,16 @@ class FakePCIDevice(object):
        </device>""".strip())  # noqa
    cap_templ = "<capability type='%(cap_type)s'>%(addresses)s</capability>"
    addr_templ = "<address domain='0x0000' bus='0x81' slot='%(slot)#02x' function='%(function)#02x'/>"  # noqa
    mdevtypes_templ = textwrap.dedent("""
        <type id='%(type_id)s'>
        <name>GRID M60-0B</name><deviceAPI>vfio-pci</deviceAPI>
        <availableInstances>%(instances)s</availableInstances>
        </type>""".strip())  # noqa
    is_capable_of_mdevs = False
    def __init__(self, dev_type, slot, function, iommu_group, numa_node,
-                 vf_ratio=None):
+                 vf_ratio=None, multiple_gpu_types=False):
        """Populate pci devices
        :param dev_type: (string) Indicates the type of the device (PCI, PF,
@ -248,8 +263,11 @@ class FakePCIDevice(object):
        :param numa_node: (int) NUMA node of the device.
        :param vf_ratio: (int) Ratio of Virtual Functions on Physical. Only
            applicable if ``dev_type`` is one of: ``PF``, ``VF``.
        :param multiple_gpu_types: (bool) Supports different vGPU types
        """
        vend_id = PCI_VEND_ID
        vend_name = PCI_VEND_NAME
        if dev_type == 'PCI':
            if vf_ratio:
                raise ValueError('vf_ratio does not apply for PCI devices')
@ -290,14 +308,34 @@ class FakePCIDevice(object):
                    'function': 0,
                }
            }
        elif dev_type == 'MDEV_TYPES':
            prod_id = MDEV_CAPABLE_PROD_ID
            prod_name = MDEV_CAPABLE_PROD_NAME
            driver = MDEV_CAPABLE_DRIVER_NAME
            vend_id = MDEV_CAPABLE_VEND_ID
            vend_name = MDEV_CAPABLE_VEND_NAME
            types = [self.mdevtypes_templ % {
                'type_id': NVIDIA_11_VGPU_TYPE,
                'instances': 16,
            }]
            if multiple_gpu_types:
                types.append(self.mdevtypes_templ % {
                    'type_id': NVIDIA_12_VGPU_TYPE,
                    'instances': 8,
                })
            capability = self.cap_templ % {
                'cap_type': MDEV_CAPABLE_CAP_TYPE,
                'addresses': '\n'.join(types)
            }
            self.is_capable_of_mdevs = True
        else:
            raise ValueError('Expected one of: PCI, VF, PCI')
        self.pci_device = self.pci_device_template % {
            'slot': slot,
            'function': function,
-            'vend_id': PCI_VEND_ID,
+            'vend_id': vend_id,
-            'vend_name': PCI_VEND_NAME,
+            'vend_name': vend_name,
            'prod_id': prod_id,
            'prod_name': prod_name,
            'driver': driver,
@ -321,26 +359,31 @@ class HostPCIDevicesInfo(object):
    TOTAL_NUMA_NODES = 2
    pci_devname_template = 'pci_0000_81_%(slot)02x_%(function)d'
-    def __init__(self, num_pci=0, num_pfs=2, num_vfs=8, numa_node=None):
+    def __init__(self, num_pci=0, num_pfs=2, num_vfs=8, num_mdevcap=0,
                 numa_node=None, multiple_gpu_types=False):
        """Create a new HostPCIDevicesInfo object.
-        :param num_pci: (int) The number of (non-SR-IOV) PCI devices.
+        :param num_pci: (int) The number of (non-SR-IOV) and (non-MDEV capable)
            PCI devices.
        :param num_pfs: (int) The number of PCI SR-IOV Physical Functions.
        :param num_vfs: (int) The number of PCI SR-IOV Virtual Functions.
        :param num_mdevcap: (int) The number of PCI devices capable of creating
            mediated devices.
        :param iommu_group: (int) Initial IOMMU group ID.
        :param numa_node: (int) NUMA node of the device; if set all of the
            devices will be assigned to the specified node else they will be
            split between ``$TOTAL_NUMA_NODES`` nodes.
        :param multiple_gpu_types: (bool) Supports different vGPU types
        """
        self.devices = {}
-        if not (num_vfs or num_pfs):
+        if not (num_vfs or num_pfs) and not num_mdevcap:
            return
        if num_vfs and not num_pfs:
            raise ValueError('Cannot create VFs without PFs')
-        if num_vfs % num_pfs:
+        if num_pfs and num_vfs % num_pfs:
            raise ValueError('num_vfs must be a factor of num_pfs')
        slot = 0
@ -364,6 +407,24 @@ class HostPCIDevicesInfo(object):
            slot += 1
            iommu_group += 1
        # Generate MDEV capable devs
        for dev in range(num_mdevcap):
            pci_dev_name = self.pci_devname_template % {
                'slot': slot, 'function': function}
            LOG.info('Generating MDEV capable device %r', pci_dev_name)
            self.devices[pci_dev_name] = FakePCIDevice(
                dev_type='MDEV_TYPES',
                slot=slot,
                function=function,
                iommu_group=iommu_group,
                numa_node=self._calc_numa_node(dev, numa_node),
                multiple_gpu_types=multiple_gpu_types)
            slot += 1
            iommu_group += 1
        vf_ratio = num_vfs // num_pfs if num_pfs else 0
        # Generate PFs
@ -420,6 +481,10 @@ class HostPCIDevicesInfo(object):
        pci_dev = self.devices.get(device_name)
        return pci_dev
    def get_all_mdev_capable_devices(self):
        """Return the names of the fake PCI devices that can host mdevs.

        :returns: list of the keys of ``self.devices`` whose device has a
            true ``is_capable_of_mdevs`` attribute
        """
        return [name for name, device in self.devices.items()
                if device.is_capable_of_mdevs]
 class FakeMdevDevice(object):
    template = """
@ -448,21 +513,11 @@ class FakeMdevDevice(object):
 class HostMdevDevicesInfo(object):
-    def __init__(self):
+    def __init__(self, devices=None):
-        self.devices = {
+        if devices is not None:
-            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c01':
+            self.devices = devices
-                FakeMdevDevice(
+        else:
-                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c01',
+            self.devices = {}
                    type_id=NVIDIA_11_VGPU_TYPE, parent=PGPU1_PCI_ADDR),
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c02':
                FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c02',
                    type_id=NVIDIA_11_VGPU_TYPE, parent=PGPU2_PCI_ADDR),
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c03':
                FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c03',
                    type_id=NVIDIA_11_VGPU_TYPE, parent=PGPU3_PCI_ADDR),
        }
    def get_all_devices(self):
        """Return the names of all the fake mediated devices.

        A new list is returned rather than a live ``dict.keys()`` view so
        that the result can be indexed and is not affected by devices that
        are added to ``self.devices`` afterwards.

        :returns: list of the keys of ``self.devices``
        """
        return list(self.devices)
@ -1266,7 +1321,7 @@ class Connection(object):
        self.pci_info = pci_info or HostPCIDevicesInfo(num_pci=0,
                                                       num_pfs=0,
                                                       num_vfs=0)
-        self.mdev_info = mdev_info or []
+        self.mdev_info = mdev_info or HostMdevDevicesInfo(devices={})
        self.hostname = hostname or 'compute1'
    def _add_filter(self, nwfilter):
@ -1571,10 +1626,7 @@ class Connection(object):
        if cap == 'mdev':
            return self.mdev_info.get_all_devices()
        if cap == 'mdev_types':
-            # TODO(gibi): We should return something like
+            return self.pci_info.get_all_mdev_capable_devices()
            # https://libvirt.org/drvnodedev.html#MDEVCap but I tried and it
            # did not work for me.
            return None
        else:
            raise ValueError('Capability "%s" is not supported' % cap)