Merge "Functional test with pGPUs"

2020-04-09 23:29:21 +00:00 · 2020-04-09 23:29:21 +00:00 · d695b7dfbc
parent 77482b05af 9dcc0941f1
commit d695b7dfbc
4 changed files with 382 additions and 31 deletions
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@ -449,7 +449,54 @@ class _IntegratedTestBase(test.TestCase, InstanceHelperMixin):
                         ("The expected wsgi middlewares %s are not "
                          "existed") % expected_middleware)
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _get_provider_uuid_by_name(self, name):
        return self.placement_api.get(
            '/resource_providers?name=%s' % name).body[
            'resource_providers'][0]['uuid']
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _get_all_rp_uuids_in_a_tree(self, in_tree_rp_uuid):
        rps = self.placement_api.get(
            '/resource_providers?in_tree=%s' % in_tree_rp_uuid,
            version='1.20').body['resource_providers']
        return [rp['uuid'] for rp in rps]
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _get_provider_inventory(self, rp_uuid):
        return self.placement_api.get(
            '/resource_providers/%s/inventories' % rp_uuid).body['inventories']
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _get_provider_usages(self, provider_uuid):
        return self.placement_api.get(
            '/resource_providers/%s/usages' % provider_uuid).body['usages']
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _create_trait(self, trait):
        return self.placement_api.put('/traits/%s' % trait, {}, version='1.6')
    # TODO(sbauza): Drop this method once test classes inherit from a mixin
    def _set_provider_traits(self, rp_uuid, traits):
        """This will overwrite any existing traits.
        :param rp_uuid: UUID of the resource provider to update
        :param traits: list of trait strings to set on the provider
        :returns: APIResponse object with the results
        """
        provider = self.placement_api.get(
            '/resource_providers/%s' % rp_uuid).body
        put_traits_req = {
            'resource_provider_generation': provider['generation'],
            'traits': traits
        }
        return self.placement_api.put(
            '/resource_providers/%s/traits' % rp_uuid,
            put_traits_req, version='1.6')
 # FIXME(sbauza): There is little value in having this be a whole base test
 # class instead of a mixin that only provides methods for accessing the
 # Placement endpoints.
 class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
    """Base test class for functional tests that check provider usage
    and consumer allocations in Placement during various operations.
--- a/nova/tests/functional/libvirt/test_reshape.py
+++ b/nova/tests/functional/libvirt/test_reshape.py
@ -59,10 +59,28 @@ class VGPUReshapeTests(base.ServersTestBase):
        # the old tree as that would be a bad time for reshape. Later when the
        # compute service is restarted the driver will do the reshape.
        mdevs = {
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c01':
                fakelibvirt.FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c01',
                    type_id=fakelibvirt.NVIDIA_11_VGPU_TYPE,
                    parent=fakelibvirt.PGPU1_PCI_ADDR),
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c02':
                fakelibvirt.FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c02',
                    type_id=fakelibvirt.NVIDIA_11_VGPU_TYPE,
                    parent=fakelibvirt.PGPU2_PCI_ADDR),
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c03':
                fakelibvirt.FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c03',
                    type_id=fakelibvirt.NVIDIA_11_VGPU_TYPE,
                    parent=fakelibvirt.PGPU3_PCI_ADDR),
        }
        fake_connection = self._get_connection(
            # We need more RAM or the 3rd server won't be created
            host_info=fakelibvirt.HostInfo(kB_mem=8192),
-            mdev_info=fakelibvirt.HostMdevDevicesInfo())
+            mdev_info=fakelibvirt.HostMdevDevicesInfo(devices=mdevs))
        self.mock_conn.return_value = fake_connection
        # start a compute with vgpu support disabled so the driver will
--- a/nova/tests/functional/libvirt/test_vgpu.py
+++ b/nova/tests/functional/libvirt/test_vgpu.py
@ -0,0 +1,234 @@
 #
 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
 #    not use this file except in compliance with the License. You may obtain
 #    a copy of the License at
 #
 #         http://www.apache.org/licenses/LICENSE-2.0
 #
 #    Unless required by applicable law or agreed to in writing, software
 #    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 #    License for the specific language governing permissions and limitations
 #    under the License.
 import fixtures
 import re
 import os_resource_classes as orc
 from oslo_config import cfg
 from oslo_log import log as logging
 from oslo_utils import uuidutils
 import nova.conf
 from nova import context
 from nova import objects
 from nova.tests.functional.libvirt import base
 from nova.tests.unit.virt.libvirt import fakelibvirt
 from nova.virt.libvirt import utils as libvirt_utils
 CONF = cfg.CONF
 LOG = logging.getLogger(__name__)
 class VGPUTestBase(base.ServersTestBase):
    """Shared fixtures and helpers for the vGPU functional tests.

    The libvirt driver's local disk reporting and the privsep mdev creation
    call are patched, so created mediated devices are only recorded in the
    fake libvirt connection.
    """

    FAKE_LIBVIRT_VERSION = 5000000
    FAKE_QEMU_VERSION = 3001000

    def setUp(self):
        """Install the disk-info and mdev-creation patches."""
        super(VGPUTestBase, self).setUp()
        fake_disk_info = {'total': 128, 'used': 44, 'free': 84}
        self.useFixture(fixtures.MockPatch(
            'nova.virt.libvirt.LibvirtDriver._get_local_gb_info',
            return_value=fake_disk_info))
        # Route mdev creation through _create_mdev, which records the new
        # device in the fake connection.
        self.useFixture(fixtures.MockPatch(
            'nova.privsep.libvirt.create_mdev',
            side_effect=self._create_mdev))
        self.context = context.get_admin_context()

    def pci2libvirt_address(self, address):
        """Convert a PCI address into a libvirt node device name.

        The address is split on ``.`` and ``:`` and the four resulting
        fields are joined with underscores behind a ``pci_`` prefix, e.g.
        ``0000:81:00.0`` becomes ``pci_0000_81_00_0``.
        """
        address_fields = re.split("[.:]", address)
        return "pci_{}_{}_{}_{}".format(*address_fields)

    def libvirt2pci_address(self, dev_name):
        """Convert a libvirt node device name back into a PCI address.

        The first four characters (the ``pci_`` prefix of names such as
        ``pci_0000_81_00_0``) are dropped and the remaining
        underscore-separated fields are formatted as ``0000:81:00.0``.
        """
        name_fields = dev_name[4:].split('_')
        return "{}:{}:{}.{}".format(*name_fields)

    def _create_mdev(self, physical_device, mdev_type, uuid=None):
        """Stand-in for ``nova.privsep.libvirt.create_mdev``.

        :param physical_device: PCI address of the parent device
        :param mdev_type: type ID of the mediated device to create
        :param uuid: UUID of the new mdev; one is generated when not given
        :returns: the UUID of the mediated device that was registered
        """
        # We need to fake the newly created sysfs object: a new
        # FakeMdevDevice is added to the persisted Connection object so that
        # later requests for the existing mdevs will see it.
        mdev_uuid = uuid or uuidutils.generate_uuid()
        mdev_name = libvirt_utils.mdev_uuid2name(mdev_uuid)
        new_mdev = fakelibvirt.FakeMdevDevice(
            dev_name=mdev_name,
            type_id=mdev_type,
            parent=self.pci2libvirt_address(physical_device))
        self.fake_connection.mdev_info.devices.update({mdev_name: new_mdev})
        return mdev_uuid

    def _start_compute_service(self, hostname):
        """Start a compute service on a fake host exposing two pGPUs.

        Once the service is started, every resource provider of the host's
        tree that reports VGPU inventory is checked to have a total of 16
        and no usage, and the driver must not report any mediated device.

        :param hostname: name of the compute host to start
        :returns: the started compute service
        """
        host_info = fakelibvirt.HostInfo(cpu_nodes=2, kB_mem=8192)
        # We want to create two pGPUs but no other PCI devices
        pgpus_only = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2)
        self.fake_connection = self._get_connection(
            host_info=host_info, pci_info=pgpus_only, hostname=hostname)
        self.mock_conn.return_value = self.fake_connection
        compute = self.start_service('compute', host=hostname)

        root_rp_uuid = self._get_provider_uuid_by_name(hostname)
        for rp_uuid in self._get_all_rp_uuids_in_a_tree(root_rp_uuid):
            inventory = self._get_provider_inventory(rp_uuid)
            if orc.VGPU not in inventory:
                # Providers without VGPU inventory are not checked.
                continue
            usage = self._get_provider_usages(rp_uuid)
            self.assertEqual(16, inventory[orc.VGPU]['total'])
            self.assertEqual(0, usage[orc.VGPU])
        # Since we haven't created any mdevs yet, we shouldn't find them
        self.assertEqual([], compute.driver._get_mediated_devices())
        return compute
 class VGPUTests(VGPUTestBase):
    """vGPU tests against a single compute host enabling only nvidia-11."""

    def setUp(self):
        """Create a one-VGPU flavor and start the ``host1`` compute."""
        super(VGPUTests, self).setUp()
        self.flavor = self._create_flavor(
            extra_spec={"resources:VGPU": "1"})
        # Start compute1 supporting only nvidia-11
        self.flags(
            group='devices',
            enabled_vgpu_types=fakelibvirt.NVIDIA_11_VGPU_TYPE)
        self.compute1 = self._start_compute_service('host1')

    def test_create_servers_with_vgpu(self):
        """Booting the VGPU flavor yields one mdev and one VGPU in use."""
        self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor,
            host=self.compute1.host,
            expected_state='ACTIVE')
        # Now we should find a new mdev
        created_mdevs = self.compute1.driver._get_mediated_devices()
        self.assertEqual(1, len(created_mdevs))
        # Checking also the allocations for the parent pGPU, whose resource
        # provider is looked up as "<host>_<parent device name>".
        pgpu_rp_name = '_'.join(
            (self.compute1.host, created_mdevs[0]['parent']))
        pgpu_rp_uuid = self._get_provider_uuid_by_name(pgpu_rp_name)
        pgpu_usage = self._get_provider_usages(pgpu_rp_uuid)
        self.assertEqual(1, pgpu_usage[orc.VGPU])
 class VGPUMultipleTypesTests(VGPUTestBase):
    """vGPU tests with two enabled vGPU types, each tied to one pGPU."""

    def setUp(self):
        """Enable nvidia-11 and nvidia-12, map them to pGPUs, start host1."""
        super(VGPUMultipleTypesTests, self).setUp()
        self.flavor = self._create_flavor(
            extra_spec={"resources:VGPU": "1"})
        both_types = [fakelibvirt.NVIDIA_11_VGPU_TYPE,
                      fakelibvirt.NVIDIA_12_VGPU_TYPE]
        self.flags(enabled_vgpu_types=both_types, group='devices')
        # we need to call the below again to ensure the updated
        # 'device_addresses' value is read and the new groups created
        nova.conf.devices.register_dynamic_opts(CONF)
        # host1 will have 2 physical GPUs :
        #  - 0000:81:00.0 will only support nvidia-11
        #  - 0000:81:01.0 will only support nvidia-12
        pgpu_per_group = (
            ('vgpu_nvidia-11', fakelibvirt.PGPU1_PCI_ADDR),
            ('vgpu_nvidia-12', fakelibvirt.PGPU2_PCI_ADDR),
        )
        for group_name, libvirt_dev_name in pgpu_per_group:
            pci_addr = self.libvirt2pci_address(libvirt_dev_name)
            self.flags(device_addresses=[pci_addr], group=group_name)

        # Prepare traits for later on
        for trait_name in ('CUSTOM_NVIDIA_11', 'CUSTOM_NVIDIA_12'):
            self._create_trait(trait_name)
        self.compute1 = self._start_compute_service('host1')

    def test_create_servers_with_vgpu(self):
        """The single mdev lands on pGPU1 and pGPU2 has no provider."""
        self._create_server(
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            flavor_id=self.flavor,
            host=self.compute1.host,
            expected_state='ACTIVE')
        created_mdevs = self.compute1.driver._get_mediated_devices()
        self.assertEqual(1, len(created_mdevs))

        # We can be deterministic : since 0000:81:01.0 is asked to only support
        # nvidia-12 *BUT* doesn't actually have this type as a PCI capability,
        # we are sure that only 0000:81:00.0 is used.
        self.assertEqual(fakelibvirt.PGPU1_PCI_ADDR,
                         created_mdevs[0]['parent'])

        # We are also sure that there is no RP for 0000:81:01.0 since there
        # is no inventory for nvidia-12
        root_rp_uuid = self._get_provider_uuid_by_name(self.compute1.host)
        tree_rp_uuids = self._get_all_rp_uuids_in_a_tree(root_rp_uuid)
        # We only have 2 RPs : the root RP and only the pGPU1 RP...
        self.assertEqual(2, len(tree_rp_uuids))
        # ... but we double-check by asking the RP by its expected name
        expected_pgpu2_rp_name = '_'.join(
            (self.compute1.host, fakelibvirt.PGPU2_PCI_ADDR))
        response = self.placement_api.get(
            '/resource_providers?name=' + expected_pgpu2_rp_name)
        # See, Placement API returned no RP for this name as it doesn't exist.
        self.assertEqual([], response.body['resource_providers'])

    def test_create_servers_with_specific_type(self):
        """A flavor requiring a trait gets its mdev from the matching pGPU."""
        # Regenerate the PCI addresses so both pGPUs now support nvidia-12
        self.fake_connection.pci_info = fakelibvirt.HostPCIDevicesInfo(
            num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
            multiple_gpu_types=True)
        # Make a restart to update the Resource Providers
        self.compute1 = self.restart_compute_service(self.compute1)

        host = self.compute1.host
        pgpu1_rp_uuid = self._get_provider_uuid_by_name(
            host + '_' + fakelibvirt.PGPU1_PCI_ADDR)
        pgpu2_rp_uuid = self._get_provider_uuid_by_name(
            host + '_' + fakelibvirt.PGPU2_PCI_ADDR)

        # pGPU1 is expected to report 16 VGPUs and pGPU2 to report 8.
        pgpu1_inventory = self._get_provider_inventory(pgpu1_rp_uuid)
        self.assertEqual(16, pgpu1_inventory[orc.VGPU]['total'])
        pgpu2_inventory = self._get_provider_inventory(pgpu2_rp_uuid)
        self.assertEqual(8, pgpu2_inventory[orc.VGPU]['total'])

        # Attach traits to the pGPU RPs
        self._set_provider_traits(pgpu1_rp_uuid, ['CUSTOM_NVIDIA_11'])
        self._set_provider_traits(pgpu2_rp_uuid, ['CUSTOM_NVIDIA_12'])

        parent_by_trait = {'CUSTOM_NVIDIA_11': fakelibvirt.PGPU1_PCI_ADDR,
                           'CUSTOM_NVIDIA_12': fakelibvirt.PGPU2_PCI_ADDR}
        driver = self.compute1.driver
        for trait, expected_parent in parent_by_trait.items():
            # Add a trait to the flavor
            trait_flavor = self._create_flavor(
                extra_spec={"resources:VGPU": "1",
                            "trait:%s" % trait: "required"})
            # Use the new flavor for booting
            server = self._create_server(
                image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
                flavor_id=trait_flavor,
                host=self.compute1.host,
                expected_state='ACTIVE')

            # Get the instance we just created
            instance = objects.Instance.get_by_uuid(
                self.context, server['id'])
            # Get the mdevs that were allocated for this instance, we should
            # only have one
            assigned = driver._get_all_assigned_mediated_devices(instance)
            self.assertEqual(1, len(assigned))
            # It's a dict of mdev_uuid/instance_uuid pairs, we only care about
            # the keys
            mdev_uuid = list(assigned.keys())[0]

            # Now get the detailed information about this single mdev
            mdev_info = driver._get_mediated_device_information(
                libvirt_utils.mdev_uuid2name(mdev_uuid))

            # We can be deterministic : since we asked for a specific type,
            # we know which pGPU we landed on.
            self.assertEqual(expected_parent, mdev_info['parent'])
--- a/nova/tests/unit/virt/libvirt/fakelibvirt.py
+++ b/nova/tests/unit/virt/libvirt/fakelibvirt.py
@ -195,10 +195,18 @@ VF_PROD_NAME = 'X540 Ethernet Controller Virtual Function'
 VF_DRIVER_NAME = 'ixgbevf'
 VF_CAP_TYPE = 'phys_function'
 MDEV_CAPABLE_VEND_ID = '10DE'
 MDEV_CAPABLE_VEND_NAME = 'Nvidia'
 MDEV_CAPABLE_PROD_ID = '0FFE'
 MDEV_CAPABLE_PROD_NAME = 'GRID M60-0B'
 MDEV_CAPABLE_DRIVER_NAME = 'nvidia'
 MDEV_CAPABLE_CAP_TYPE = 'mdev_types'
 NVIDIA_11_VGPU_TYPE = 'nvidia-11'
-PGPU1_PCI_ADDR = 'pci_0000_06_00_0'
+NVIDIA_12_VGPU_TYPE = 'nvidia-12'
-PGPU2_PCI_ADDR = 'pci_0000_07_00_0'
+PGPU1_PCI_ADDR = 'pci_0000_81_00_0'
-PGPU3_PCI_ADDR = 'pci_0000_08_00_0'
+PGPU2_PCI_ADDR = 'pci_0000_81_01_0'
 PGPU3_PCI_ADDR = 'pci_0000_81_02_0'
 class FakePCIDevice(object):
@ -240,9 +248,16 @@ class FakePCIDevice(object):
        </device>""".strip())  # noqa
    cap_templ = "<capability type='%(cap_type)s'>%(addresses)s</capability>"
    addr_templ = "<address domain='0x0000' bus='0x81' slot='%(slot)#02x' function='%(function)#02x'/>"  # noqa
    mdevtypes_templ = textwrap.dedent("""
        <type id='%(type_id)s'>
        <name>GRID M60-0B</name><deviceAPI>vfio-pci</deviceAPI>
        <availableInstances>%(instances)s</availableInstances>
        </type>""".strip())  # noqa
    is_capable_of_mdevs = False
    def __init__(self, dev_type, slot, function, iommu_group, numa_node,
-                 vf_ratio=None):
+                 vf_ratio=None, multiple_gpu_types=False):
        """Populate pci devices
        :param dev_type: (string) Indicates the type of the device (PCI, PF,
@ -253,8 +268,11 @@ class FakePCIDevice(object):
        :param numa_node: (int) NUMA node of the device.
        :param vf_ratio: (int) Ratio of Virtual Functions on Physical. Only
            applicable if ``dev_type`` is one of: ``PF``, ``VF``.
        :param multiple_gpu_types: (bool) Supports different vGPU types
        """
        vend_id = PCI_VEND_ID
        vend_name = PCI_VEND_NAME
        if dev_type == 'PCI':
            if vf_ratio:
                raise ValueError('vf_ratio does not apply for PCI devices')
@ -295,14 +313,34 @@ class FakePCIDevice(object):
                    'function': 0,
                }
            }
        elif dev_type == 'MDEV_TYPES':
            prod_id = MDEV_CAPABLE_PROD_ID
            prod_name = MDEV_CAPABLE_PROD_NAME
            driver = MDEV_CAPABLE_DRIVER_NAME
            vend_id = MDEV_CAPABLE_VEND_ID
            vend_name = MDEV_CAPABLE_VEND_NAME
            types = [self.mdevtypes_templ % {
                'type_id': NVIDIA_11_VGPU_TYPE,
                'instances': 16,
            }]
            if multiple_gpu_types:
                types.append(self.mdevtypes_templ % {
                    'type_id': NVIDIA_12_VGPU_TYPE,
                    'instances': 8,
                })
            capability = self.cap_templ % {
                'cap_type': MDEV_CAPABLE_CAP_TYPE,
                'addresses': '\n'.join(types)
            }
            self.is_capable_of_mdevs = True
        else:
            raise ValueError('Expected one of: PCI, VF, PCI')
        self.pci_device = self.pci_device_template % {
            'slot': slot,
            'function': function,
-            'vend_id': PCI_VEND_ID,
+            'vend_id': vend_id,
-            'vend_name': PCI_VEND_NAME,
+            'vend_name': vend_name,
            'prod_id': prod_id,
            'prod_name': prod_name,
            'driver': driver,
@ -326,26 +364,31 @@ class HostPCIDevicesInfo(object):
    TOTAL_NUMA_NODES = 2
    pci_devname_template = 'pci_0000_81_%(slot)02x_%(function)d'
-    def __init__(self, num_pci=0, num_pfs=2, num_vfs=8, numa_node=None):
+    def __init__(self, num_pci=0, num_pfs=2, num_vfs=8, num_mdevcap=0,
                 numa_node=None, multiple_gpu_types=False):
        """Create a new HostPCIDevicesInfo object.
-        :param num_pci: (int) The number of (non-SR-IOV) PCI devices.
+        :param num_pci: (int) The number of (non-SR-IOV) and (non-MDEV capable)
            PCI devices.
        :param num_pfs: (int) The number of PCI SR-IOV Physical Functions.
        :param num_vfs: (int) The number of PCI SR-IOV Virtual Functions.
        :param num_mdevcap: (int) The number of PCI devices capable of creating
            mediated devices.
        :param iommu_group: (int) Initial IOMMU group ID.
        :param numa_node: (int) NUMA node of the device; if set all of the
            devices will be assigned to the specified node else they will be
            split between ``$TOTAL_NUMA_NODES`` nodes.
        :param multiple_gpu_types: (bool) Supports different vGPU types
        """
        self.devices = {}
-        if not (num_vfs or num_pfs):
+        if not (num_vfs or num_pfs) and not num_mdevcap:
            return
        if num_vfs and not num_pfs:
            raise ValueError('Cannot create VFs without PFs')
-        if num_vfs % num_pfs:
+        if num_pfs and num_vfs % num_pfs:
            raise ValueError('num_vfs must be a factor of num_pfs')
        slot = 0
@ -369,6 +412,24 @@ class HostPCIDevicesInfo(object):
            slot += 1
            iommu_group += 1
        # Generate MDEV capable devs
        for dev in range(num_mdevcap):
            pci_dev_name = self.pci_devname_template % {
                'slot': slot, 'function': function}
            LOG.info('Generating MDEV capable device %r', pci_dev_name)
            self.devices[pci_dev_name] = FakePCIDevice(
                dev_type='MDEV_TYPES',
                slot=slot,
                function=function,
                iommu_group=iommu_group,
                numa_node=self._calc_numa_node(dev, numa_node),
                multiple_gpu_types=multiple_gpu_types)
            slot += 1
            iommu_group += 1
        vf_ratio = num_vfs // num_pfs if num_pfs else 0
        # Generate PFs
@ -425,6 +486,10 @@ class HostPCIDevicesInfo(object):
        pci_dev = self.devices.get(device_name)
        return pci_dev
    def get_all_mdev_capable_devices(self):
        """Return the names of the devices able to create mediated devices.

        :returns: list of the keys of ``self.devices`` whose device has a
            truthy ``is_capable_of_mdevs`` attribute
        """
        return [name for name, device in self.devices.items()
                if device.is_capable_of_mdevs]
 class FakeMdevDevice(object):
    template = """
@ -453,21 +518,11 @@ class FakeMdevDevice(object):
 class HostMdevDevicesInfo(object):
-    def __init__(self):
+    def __init__(self, devices=None):
-        self.devices = {
+        if devices is not None:
-            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c01':
+            self.devices = devices
-                FakeMdevDevice(
+        else:
-                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c01',
+            self.devices = {}
                    type_id=NVIDIA_11_VGPU_TYPE, parent=PGPU1_PCI_ADDR),
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c02':
                FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c02',
                    type_id=NVIDIA_11_VGPU_TYPE, parent=PGPU2_PCI_ADDR),
            'mdev_4b20d080_1b54_4048_85b3_a6a62d165c03':
                FakeMdevDevice(
                    dev_name='mdev_4b20d080_1b54_4048_85b3_a6a62d165c03',
                    type_id=NVIDIA_11_VGPU_TYPE, parent=PGPU3_PCI_ADDR),
        }
    def get_all_devices(self):
        """Return the names of all the known mediated devices.

        :returns: the keys of ``self.devices``
        """
        known_mdevs = self.devices
        return known_mdevs.keys()
@ -1268,7 +1323,7 @@ class Connection(object):
        self.pci_info = pci_info or HostPCIDevicesInfo(num_pci=0,
                                                       num_pfs=0,
                                                       num_vfs=0)
-        self.mdev_info = mdev_info or []
+        self.mdev_info = mdev_info or HostMdevDevicesInfo(devices={})
        self.hostname = hostname or 'compute1'
    def _add_filter(self, nwfilter):
@ -1573,10 +1628,7 @@ class Connection(object):
        if cap == 'mdev':
            return self.mdev_info.get_all_devices()
        if cap == 'mdev_types':
-            # TODO(gibi): We should return something like
+            return self.pci_info.get_all_mdev_capable_devices()
            # https://libvirt.org/drvnodedev.html#MDEVCap but I tried and it
            # did not work for me.
            return None
        else:
            raise ValueError('Capability "%s" is not supported' % cap)