Handle mdev devices in libvirt 7.7+

Libvirt 7.7 changed the mdev device naming to include the parent PCI
device when listing node devices. The domain, however, will still only
see the UUID and not see the parent PCI device. Changing the parsing to
simply drop the PCI identifier is not enough as the device cannot be
found when attempting to lookup the new ID.

Modify the Libvirt Driver's _get_mediated_device_information to tolerate
different formats of the mdev name. It first applies the legacy behavior
of looking up the device name that is passed in (typically in
mdev_<uuid> format); if that is not found, it iterates over the list of
mdev node devices until it finds the one matching the UUID and selects it.

Note that the lookup of the mdev device by UUID is needed in order
to keep the ability to recreate assigned mediated devices on a reboot of
the compute node.

Additionally, the libvirt utils parsing method mdev_name2uuid has
been updated to tolerate both mdev_<uuid> and mdev_<uuid>_<pciid>
formats.

Closes-Bug: 1951656

Change-Id: Ifed0fa16053228990a6a8df8d4c666521db7e329
This commit is contained in:
Billy Olsen
2022-04-21 19:42:27 -07:00
committed by Sylvain Bauza
parent 1852019747
commit a28b907c4f
6 changed files with 96 additions and 26 deletions

View File

@@ -63,21 +63,11 @@ class VGPUTestsLibvirt7_7(test_vgpu.VGPUTestBase):
flavor_id=self.flavor, host=self.compute1.host,
networks='auto', expected_state='ACTIVE')
# TODO(sbauza): Modify this once bug #1851656 is fixed.
# mdev_name2uuid() raises a badly formed hexadecimal UUID string error
self.assertRaises(ValueError,
self.assert_mdev_usage,
self.compute1, expected_amount=1)
self.assert_mdev_usage(self.compute1, expected_amount=1)
# Now, the problem is that we can't create new instances with VGPUs
# from this host.
server = self._create_server(
self._create_server(
image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
flavor_id=self.flavor, host=self.compute1.host,
networks='auto', expected_state='ERROR')
# The error is due to a bad mdev name parsing
self.assertIn('fault', server)
# since we only have one host, we have a RescheduledException as this
# service was creating an exception and we can't use another one.
self.assertIn('Exceeded maximum number of retries',
server['fault']['message'])
networks='auto', expected_state='ACTIVE')
self.assert_mdev_usage(self.compute1, expected_amount=2)

View File

@@ -3181,6 +3181,32 @@ class LibvirtConfigNodeDeviceTest(LibvirtConfigBaseTest):
config.LibvirtConfigNodeDeviceMdevInformation)
self.assertEqual("nvidia-11", obj.mdev_information.type)
self.assertEqual(12, obj.mdev_information.iommu_group)
self.assertIsNone(obj.mdev_information.uuid)
def test_config_mdev_device_uuid(self):
    """Parse a node device XML whose mdev capability carries a <uuid>.

    The device name uses the mdev_<uuid>_<pciaddress> form; the parsed
    mdev information must expose the type, the IOMMU group and the UUID
    taken from the capability element.
    """
    device_xml = """
<device>
<name>mdev_b2107403_110c_45b0_af87_32cc91597b8a_0000_41_00_0</name>
<path>/sys/devices/pci0000:40/0000:40:03.1/0000:41:00.0/b2107403-110c-45b0-af87-32cc91597b8a</path>
<parent>pci_0000_41_00_0</parent>
<driver>
<name>vfio_mdev</name>
</driver>
<capability type='mdev'>
<type id='nvidia-442'/>
<uuid>b2107403-110c-45b0-af87-32cc91597b8a</uuid>
<iommuGroup number='57'/>
</capability>
</device>"""
    node_device = config.LibvirtConfigNodeDevice()
    node_device.parse_str(device_xml)

    mdev_info = node_device.mdev_information
    self.assertIsInstance(
        mdev_info, config.LibvirtConfigNodeDeviceMdevInformation)
    self.assertEqual("nvidia-442", mdev_info.type)
    self.assertEqual(57, mdev_info.iommu_group)
    self.assertEqual(
        "b2107403-110c-45b0-af87-32cc91597b8a", mdev_info.uuid)
def test_config_vdpa_device(self):
xmlin = """

View File

@@ -3382,6 +3382,7 @@ class LibvirtConfigNodeDeviceMdevInformation(LibvirtConfigObject):
root_name="capability", **kwargs)
self.type = None
self.iommu_group = None
self.uuid = None
def parse_dom(self, xmldoc):
super(LibvirtConfigNodeDeviceMdevInformation,
@@ -3391,6 +3392,8 @@ class LibvirtConfigNodeDeviceMdevInformation(LibvirtConfigObject):
self.type = c.get('id')
if c.tag == "iommuGroup":
self.iommu_group = int(c.get('number'))
if c.tag == "uuid":
self.uuid = c.text
class LibvirtConfigNodeDeviceVpdCap(LibvirtConfigObject):

View File

@@ -8227,15 +8227,52 @@ class LibvirtDriver(driver.ComputeDriver):
def _get_mediated_device_information(self, devname):
"""Returns a dict of a mediated device."""
virtdev = self._host.device_lookup_by_name(devname)
# LP #1951656 - In Libvirt 7.7, the mdev name now includes the PCI
# address of the parent device (e.g. mdev_<uuid>_<pci_address>) due to
# the mdevctl allowing for multiple mediated devs having the same UUID
# defined (only one can be active at a time). Since the guest
# information doesn't have the parent ID, try to lookup which
# mediated device is available that matches the UUID. If multiple
# devices are found that match the UUID, then this is an error
# condition.
try:
virtdev = self._host.device_lookup_by_name(devname)
except libvirt.libvirtError as ex:
if ex.get_error_code() != libvirt.VIR_ERR_NO_NODE_DEVICE:
raise
mdevs = [dev for dev in self._host.list_mediated_devices()
if dev.startswith(devname)]
# If no matching devices are found, simply raise the original
# exception indicating that no devices are found.
if not mdevs:
raise
elif len(mdevs) > 1:
msg = ("The mediated device name %(devname)s refers to a UUID "
"that is present in multiple libvirt mediated devices. "
"Matching libvirt mediated devices are %(devices)s. "
"Mediated device UUIDs must be unique for Nova." %
{'devname': devname,
'devices': ', '.join(mdevs)})
raise exception.InvalidLibvirtMdevConfig(reason=msg)
LOG.debug('Found requested device %s as %s. Using that.',
devname, mdevs[0])
virtdev = self._host.device_lookup_by_name(mdevs[0])
xmlstr = virtdev.XMLDesc(0)
cfgdev = vconfig.LibvirtConfigNodeDevice()
cfgdev.parse_str(xmlstr)
# Starting with Libvirt 7.3, the uuid information is available in the
# node device information. If it's there, use that. Otherwise,
# fall back to the previous behavior of parsing the uuid from the
# devname.
if cfgdev.mdev_information.uuid:
mdev_uuid = cfgdev.mdev_information.uuid
else:
mdev_uuid = libvirt_utils.mdev_name2uuid(cfgdev.name)
device = {
"dev_id": cfgdev.name,
# name is like mdev_00ead764_fdc0_46b6_8db9_2963f5c815b4
"uuid": libvirt_utils.mdev_name2uuid(cfgdev.name),
"uuid": mdev_uuid,
# the physical GPU PCI device
"parent": cfgdev.parent,
"type": cfgdev.mdev_information.type,

View File

@@ -1566,7 +1566,7 @@ class Host(object):
def list_mediated_devices(self, flags=0):
    """Lookup mediated devices.

    :param flags: passed through unchanged to ``_list_devices``
    :returns: a list of strings with the name of each mediated device
    """
    return self._list_devices("mdev", flags=flags)

View File

@@ -575,17 +575,31 @@ def get_default_machine_type(arch: str) -> ty.Optional[str]:
def mdev_name2uuid(mdev_name: str) -> str:
    """Convert an mdev name (of the form mdev_<uuid_with_underscores> or
    mdev_<uuid_with_underscores>_<pciaddress>) to a uuid
    (of the form 8-4-4-4-12).

    :param mdev_name: the name of the mdev to parse the UUID from
    :returns: string containing the uuid
    :raises ValueError: if the characters following the 5-character prefix
                        do not form a valid UUID
    """
    # Skip the 5-character 'mdev_' prefix and restore the dashes.
    mdev_uuid = mdev_name[5:].replace('_', '-')
    # Unconditionally remove the PCI address from the name: a canonical
    # UUID is exactly 36 characters long, anything after it is the suffix.
    mdev_uuid = mdev_uuid[:36]
    return str(uuid.UUID(mdev_uuid))
def mdev_uuid2name(mdev_uuid: str, parent: ty.Optional[str] = None) -> str:
    """Convert an mdev uuid (of the form 8-4-4-4-12) and optionally its parent
    device to a name (of the form mdev_<uuid_with_underscores>[_<pciid>]).

    :param mdev_uuid: the uuid of the mediated device
    :param parent: the parent device id for the mediated device (for
                   example pci_0000_41_00_0); it is only appended when it
                   starts with 'pci_'
    :returns: name of the mdev to reference in libvirt
    """
    name = "mdev_" + mdev_uuid.replace('-', '_')
    if parent and parent.startswith('pci_'):
        # Strip only 'pci' and keep the following underscore so that the
        # PCI address stays separated from the UUID (mdev_<uuid>_<pciid>).
        name += parent[3:]
    return name
def get_flags_by_flavor_specs(flavor: 'objects.Flavor') -> ty.Set[str]: