libvirt: create vGPU for instance

If an allocation asks for a VGPU, the libvirt driver will look at the
available mediated devices and write to sysfs to create a new one if
needed.

Please note I commented in the relnote all the caveats we currently have
with mediated devices in libvirt, but I'll provide some workarounds for
those in the next changes.

Change-Id: Ibf210dd27972fed2651d6c9bd73a0bcf352c8bab
Partially-Implements: blueprint add-support-for-vgpu
This commit is contained in:
Sylvain Bauza 2017-12-18 15:57:47 +01:00
parent bf55576e0b
commit 8dc0f636f4
6 changed files with 442 additions and 7 deletions

View File

@ -24,6 +24,7 @@ import stat
from oslo_concurrency import processutils
from oslo_log import log as logging
from oslo_utils import units
from oslo_utils import uuidutils
from nova.i18n import _
import nova.privsep
@ -301,3 +302,15 @@ def readpty(path):
@nova.privsep.sys_admin_pctxt.entrypoint
def xend_probe():
processutils.execute('xend', 'status', check_exit_code=True)
@nova.privsep.sys_admin_pctxt.entrypoint
def create_mdev(physical_device, mdev_type, uuid=None):
    """Create a mediated device on a parent physical device via sysfs.

    :param physical_device: libvirt name of the parent device hosting the
        mediated device (e.g. 'pci_0000_84_00_0' style addressing is NOT
        used here; the caller passes the sysfs device name).
    :param mdev_type: the mdev type to instantiate (e.g. 'nvidia-11')
    :param uuid: UUID to assign to the new device; one is generated when
        not provided.
    :returns: the UUID of the newly created mediated device.
    """
    if uuid is None:
        uuid = uuidutils.generate_uuid()
    # Writing a UUID into the type's 'create' sysfs node asks the kernel
    # vfio-mdev framework to instantiate the device.
    sysfs_create = ('/sys/class/mdev_bus/%s/mdev_supported_types/%s/create'
                    % (physical_device, mdev_type))
    with open(sysfs_create, 'w') as sysfs_file:
        sysfs_file.write(uuid)
    return uuid

View File

@ -1409,6 +1409,34 @@ class LibvirtConfigGuestHostdevPCI(LibvirtConfigBaseTest):
self.assertEqual(obj.type, 'usb')
class LibvirtConfigGuestHostdevMDEV(LibvirtConfigBaseTest):
    """Round-trip tests for the mediated-device guest hostdev config.

    For mdevs, libvirt expects mode='subsystem', type='mdev',
    model='vfio-pci' and managed='no', with the device UUID carried in
    the <source><address> element.
    """

    # Reference XML used both as the expected serialization output and
    # as the parse_str() input.
    expected = """
    <hostdev mode='subsystem' type='mdev' model='vfio-pci'
    managed='no'>
    <source>
    <address uuid="b38a3f43-4be2-4046-897f-b67c2f5e0140" />
    </source>
    </hostdev>
    """

    def test_config_guest_hostdev_mdev(self):
        # to_xml() must emit the canonical mdev hostdev element for the
        # assigned UUID.
        hostdev = config.LibvirtConfigGuestHostdevMDEV()
        hostdev.uuid = "b38a3f43-4be2-4046-897f-b67c2f5e0140"
        xml = hostdev.to_xml()
        self.assertXmlEqual(self.expected, xml)

    def test_parse_guest_hostdev_mdev(self):
        # parse_str() must recover every attribute, including the UUID
        # nested in the <source><address> element.
        xmldoc = self.expected
        obj = config.LibvirtConfigGuestHostdevMDEV()
        obj.parse_str(xmldoc)
        self.assertEqual(obj.mode, 'subsystem')
        self.assertEqual(obj.type, 'mdev')
        self.assertEqual(obj.managed, 'no')
        self.assertEqual(obj.model, 'vfio-pci')
        self.assertEqual(obj.uuid, 'b38a3f43-4be2-4046-897f-b67c2f5e0140')
class LibvirtConfigGuestCharDeviceLog(LibvirtConfigBaseTest):
def test_config_log(self):

View File

@ -15874,6 +15874,24 @@ class LibvirtConnTestCase(test.NoDBTestCase,
self.assertEqual(info[0]['over_committed_disk_size'], 20647509226)
self.assertEqual(info[0]['virt_disk_size'], 21474836480)
def test_get_guest_config_with_mdevs(self):
    """Mediated device UUIDs passed to _get_guest_config must surface as
    LibvirtConfigGuestHostdevMDEV devices in the resulting guest config.
    """
    mdevs = [uuids.mdev1]
    drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
    instance_ref = objects.Instance(**self.test_instance)
    image_meta = objects.ImageMeta.from_dict(self.test_image_meta)
    cfg = drvr._get_guest_config(instance_ref,
                                 _fake_network_info(self, 1),
                                 image_meta, {'mapping': {}},
                                 mdevs=mdevs)
    # Loop over all devices to make sure we have at least one mediated one.
    for device in cfg.devices:
        if isinstance(device, vconfig.LibvirtConfigGuestHostdevMDEV):
            # Make sure we use the provided UUID
            self.assertEqual(uuids.mdev1, device.uuid)
            break
    else:
        # NOTE: use self.fail() rather than `assert False`: bare asserts
        # are stripped when Python runs with optimizations (-O), which
        # would silently disable this check.
        self.fail("Unable to find any mediated device for the guest.")
class HostStateTestCase(test.NoDBTestCase):
@ -18304,11 +18322,13 @@ class LibvirtDriverTestCase(test.NoDBTestCase):
'deviceAPI': 'vfio-pci'},
}}]
get_mdevs.return_value = [
{'dev_id': 'pci_0000_84_00_0',
{'dev_id': 'mdev_4b20d080_1b54_4048_85b3_a6a62d165c01',
'uuid': "4b20d080-1b54-4048-85b3-a6a62d165c01",
'type': 'nvidia-11',
'iommuGroup': 1
},
{'dev_id': 'pci_0000_84_00_0',
{'dev_id': 'mdev_4b20d080_1b54_4048_85b3_a6a62d165c02',
'uuid': "4b20d080-1b54-4048-85b3-a6a62d165c02",
'type': 'nvidia-11',
'iommuGroup': 1
},
@ -18385,6 +18405,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
expected = [{"dev_id": "mdev_4b20d080_1b54_4048_85b3_a6a62d165c01",
"uuid": "4b20d080-1b54-4048-85b3-a6a62d165c01",
"type": "nvidia-11",
"iommu_group": 12
}]
@ -18410,6 +18431,138 @@ class LibvirtDriverTestCase(test.NoDBTestCase):
# we don't get results.
self.assertEqual([], drvr._get_mediated_devices(types=['nvidia-12']))
@mock.patch.object(host.Host, 'list_guests')
def test_get_all_assigned_mediated_devices(self, list_guests):
    """_get_all_assigned_mediated_devices maps each mdev UUID found in a
    guest's XML to that guest's UUID, skipping guests without mdevs.
    """
    # Minimal domain XML carrying a single mdev hostdev device.
    dom_with_vgpu = """
    <domain type="kvm">
    <devices>
    <hostdev mode='subsystem' type='mdev' model='vfio-pci'>
    <source>
    <address uuid='%s'/>
    </source>
    </hostdev>
    </devices>
    </domain>
    """ % uuids.mdev
    # guest1 uses the default fake XML (no mdev); guest2 carries one.
    guest1 = libvirt_guest.Guest(FakeVirtDomain())
    guest2 = libvirt_guest.Guest(FakeVirtDomain(fake_xml=dom_with_vgpu))
    list_guests.return_value = [guest1, guest2]
    drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
    # Only guest2's mdev shows up, keyed by the mdev UUID.
    self.assertEqual({uuids.mdev: guest2.uuid},
                     drvr._get_all_assigned_mediated_devices())
def test_allocate_mdevs_with_no_vgpu_allocations(self):
    """_allocate_mdevs returns None when the allocations carry no VGPU."""
    # Just any resource class but VGPU
    non_vgpu_resources = {fields.ResourceClass.VCPU: 1}
    allocations = {'rp1': {'resources': non_vgpu_resources}}
    driver = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
    self.assertIsNone(driver._allocate_mdevs(allocations=allocations))
@mock.patch.object(libvirt_driver.LibvirtDriver,
                   '_get_existing_mdevs_not_assigned')
def test_allocate_mdevs_with_available_mdevs(self, get_unassigned_mdevs):
    """When an existing mediated device is unassigned, _allocate_mdevs
    reuses it instead of creating a new one.
    """
    allocations = {
        'rp1': {
            'resources': {
                fields.ResourceClass.VGPU: 1,
            }
        }
    }
    # One pre-created mdev is sitting unassigned; it must be picked up.
    get_unassigned_mdevs.return_value = set([uuids.mdev1])
    drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
    self.assertEqual([uuids.mdev1],
                     drvr._allocate_mdevs(allocations=allocations))
@mock.patch.object(nova.privsep.libvirt, 'create_mdev')
@mock.patch.object(libvirt_driver.LibvirtDriver,
                   '_get_mdev_capable_devices')
@mock.patch.object(libvirt_driver.LibvirtDriver,
                   '_get_existing_mdevs_not_assigned')
def test_allocate_mdevs_with_no_mdevs_but_capacity(self,
                                                   unallocated_mdevs,
                                                   get_mdev_capable_devs,
                                                   privsep_create_mdev):
    """With no free mdev but spare capacity on a parent GPU,
    _allocate_mdevs creates a brand new mediated device.
    """
    self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
    allocations = {
        'rp1': {
            'resources': {
                fields.ResourceClass.VGPU: 1,
            }
        }
    }
    # No existing mdev is available for reuse...
    unallocated_mdevs.return_value = set()
    # ...but this physical GPU can still host 16 more 'nvidia-11' mdevs.
    get_mdev_capable_devs.return_value = [
        {"dev_id": "pci_0000_06_00_0",
         "types": {'nvidia-11': {'availableInstances': 16,
                                 'name': 'GRID M60-0B',
                                 'deviceAPI': 'vfio-pci'},
                   }
         }]
    privsep_create_mdev.return_value = uuids.mdev1
    drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
    self.assertEqual([uuids.mdev1],
                     drvr._allocate_mdevs(allocations=allocations))
    # The libvirt device name 'pci_0000_06_00_0' must be translated to
    # the PCI address '0000:06:00.0' before the privsep create call.
    privsep_create_mdev.assert_called_once_with("0000:06:00.0",
                                                'nvidia-11')
@mock.patch.object(nova.privsep.libvirt, 'create_mdev')
@mock.patch.object(libvirt_driver.LibvirtDriver,
                   '_get_mdev_capable_devices')
@mock.patch.object(libvirt_driver.LibvirtDriver,
                   '_get_existing_mdevs_not_assigned')
def test_allocate_mdevs_with_no_gpu_capacity(self,
                                             unallocated_mdevs,
                                             get_mdev_capable_devs,
                                             privsep_create_mdev):
    """When no mdev is free and every parent GPU is exhausted,
    _allocate_mdevs raises ComputeResourcesUnavailable.
    """
    self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
    allocations = {
        'rp1': {
            'resources': {
                fields.ResourceClass.VGPU: 1,
            }
        }
    }
    unallocated_mdevs.return_value = set()
    # Mock the fact all possible mediated devices are created and all of
    # them being assigned
    get_mdev_capable_devs.return_value = [
        {"dev_id": "pci_0000_06_00_0",
         "types": {'nvidia-11': {'availableInstances': 0,
                                 'name': 'GRID M60-0B',
                                 'deviceAPI': 'vfio-pci'},
                   }
         }]
    drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
    self.assertRaises(exception.ComputeResourcesUnavailable,
                      drvr._allocate_mdevs, allocations=allocations)
@mock.patch.object(libvirt_driver.LibvirtDriver, '_get_mediated_devices')
@mock.patch.object(libvirt_driver.LibvirtDriver,
                   '_get_all_assigned_mediated_devices')
def test_get_existing_mdevs_not_assigned(self, get_all_assigned_mdevs,
                                         get_mediated_devices):
    """Only mediated devices not already assigned to a guest are
    returned by _get_existing_mdevs_not_assigned.
    """
    # mdev2 is assigned to instance1
    get_all_assigned_mdevs.return_value = {uuids.mdev2: uuids.inst1}
    # there is a total of 2 mdevs, mdev1 and mdev2
    get_mediated_devices.return_value = [{'dev_id': 'mdev_some_uuid1',
                                          'uuid': uuids.mdev1,
                                          'type': 'nvidia-11',
                                          'iommu_group': 1},
                                         {'dev_id': 'mdev_some_uuid2',
                                          'uuid': uuids.mdev2,
                                          'type': 'nvidia-11',
                                          'iommu_group': 1}]
    drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
    # Since mdev2 is assigned to inst1, only mdev1 is available
    self.assertEqual(set([uuids.mdev1]),
                     drvr._get_existing_mdevs_not_assigned())
class LibvirtVolumeUsageTestCase(test.NoDBTestCase):
"""Test for LibvirtDriver.get_all_volume_usage."""

View File

@ -1595,7 +1595,9 @@ class LibvirtConfigGuestHostdev(LibvirtConfigGuestDevice):
__init__(root_name="hostdev", **kwargs)
self.mode = kwargs.get('mode')
self.type = kwargs.get('type')
self.managed = 'yes'
# managed attribute is only used by PCI devices but mediated devices
# need to say managed=no
self.managed = kwargs.get('managed', 'yes')
def format_dom(self):
dev = super(LibvirtConfigGuestHostdev, self).format_dom()
@ -1647,6 +1649,37 @@ class LibvirtConfigGuestHostdevPCI(LibvirtConfigGuestHostdev):
self.function = sub.get('function')
class LibvirtConfigGuestHostdevMDEV(LibvirtConfigGuestHostdev):
    """Guest config for a mediated device passthrough.

    Serializes to/from a <hostdev mode='subsystem' type='mdev'
    managed='no' model='vfio-pci'> element whose <source><address>
    carries the mediated device UUID.
    """

    def __init__(self, **kwargs):
        super(LibvirtConfigGuestHostdevMDEV, self).__init__(
            mode='subsystem', type='mdev', managed='no', **kwargs)
        # model attribute is only supported by mediated devices
        self.model = kwargs.get('model', 'vfio-pci')
        # UUID of the mediated device; callers must set this before
        # calling format_dom().
        self.uuid = None

    def format_dom(self):
        """Serialize this config to an etree <hostdev> element."""
        dev = super(LibvirtConfigGuestHostdevMDEV, self).format_dom()
        if self.model:
            dev.set("model", self.model)
        # The mdev UUID lives in <source><address uuid='...'/>.
        address = etree.Element("address", uuid=self.uuid)
        source = etree.Element("source")
        source.append(address)
        dev.append(source)
        return dev

    def parse_dom(self, xmldoc):
        """Populate this config from an etree <hostdev> element."""
        children = super(LibvirtConfigGuestHostdevMDEV, self).parse_dom(xmldoc)
        if xmldoc.get('model'):
            self.model = xmldoc.get('model')
        for c in children:
            if c.tag == "source":
                for sub in c.getchildren():
                    if sub.tag == 'address':
                        self.uuid = sub.get('uuid')
                        # A hostdev only has one source address; done.
                        return
class LibvirtConfigGuestCharBase(LibvirtConfigGuestDevice):
def __init__(self, **kwargs):
@ -2281,6 +2314,7 @@ class LibvirtConfigGuest(LibvirtConfigObject):
# Note: This cover only for: LibvirtConfigGuestDisks
# LibvirtConfigGuestFilesys
# LibvirtConfigGuestHostdevPCI
# LibvirtConfigGuestHostdevMDEV
# LibvirtConfigGuestInterface
# LibvirtConfigGuestUidMap
# LibvirtConfigGuestGidMap
@ -2300,6 +2334,10 @@ class LibvirtConfigGuest(LibvirtConfigObject):
obj = LibvirtConfigGuestHostdevPCI()
obj.parse_dom(d)
self.devices.append(obj)
elif d.tag == 'hostdev' and d.get('type') == 'mdev':
obj = LibvirtConfigGuestHostdevMDEV()
obj.parse_dom(d)
self.devices.append(obj)
elif d.tag == 'interface':
obj = LibvirtConfigGuestInterface()
obj.parse_dom(d)

View File

@ -304,6 +304,9 @@ MIN_LIBVIRT_MDEV_SUPPORT = (3, 4, 0)
MIN_LIBVIRT_MULTIATTACH = (3, 10, 0)
VGPU_RESOURCE_SEMAPHORE = "vgpu_resources"
class LibvirtDriver(driver.ComputeDriver):
capabilities = {
"has_imagecache": True,
@ -2860,9 +2863,13 @@ class LibvirtDriver(driver.ComputeDriver):
# Required by Quobyte CI
self._ensure_console_log_for_instance(instance)
# Does the guest need to be assigned some vGPU mediated devices ?
mdevs = self._allocate_mdevs(allocations)
xml = self._get_guest_xml(context, instance, network_info,
disk_info, image_meta,
block_device_info=block_device_info)
block_device_info=block_device_info,
mdevs=mdevs)
self._create_domain_and_network(
context, xml, instance, network_info,
block_device_info=block_device_info,
@ -4805,12 +4812,14 @@ class LibvirtDriver(driver.ComputeDriver):
def _get_guest_config(self, instance, network_info, image_meta,
disk_info, rescue=None, block_device_info=None,
context=None):
context=None, mdevs=None):
"""Get config data for parameters.
:param rescue: optional dictionary that should contain the key
'ramdisk_id' if a ramdisk is needed for the rescue image and
'kernel_id' if a kernel is needed for the rescue image.
:param mdevs: optional list of mediated devices to assign to the guest.
"""
flavor = instance.flavor
inst_path = libvirt_utils.get_instance_path(instance)
@ -4914,8 +4923,17 @@ class LibvirtDriver(driver.ComputeDriver):
self._guest_add_memory_balloon(guest)
if mdevs:
self._guest_add_mdevs(guest, mdevs)
return guest
def _guest_add_mdevs(self, guest, chosen_mdevs):
    """Attach each chosen mediated device to the guest config.

    :param guest: LibvirtConfigGuest object being built
    :param chosen_mdevs: iterable of mediated device UUIDs to assign
    """
    for mdev_uuid in chosen_mdevs:
        hostdev = vconfig.LibvirtConfigGuestHostdevMDEV()
        hostdev.uuid = mdev_uuid
        guest.add_device(hostdev)
@staticmethod
def _guest_add_spice_channel(guest):
if (CONF.spice.enabled and CONF.spice.agent_enabled
@ -5057,7 +5075,8 @@ class LibvirtDriver(driver.ComputeDriver):
def _get_guest_xml(self, context, instance, network_info, disk_info,
image_meta, rescue=None,
block_device_info=None):
block_device_info=None,
mdevs=None):
# NOTE(danms): Stringifying a NetworkInfo will take a lock. Do
# this ahead of time so that we don't acquire it while also
# holding the logging lock.
@ -5075,7 +5094,7 @@ class LibvirtDriver(driver.ComputeDriver):
LOG.debug(strutils.mask_password(msg), instance=instance)
conf = self._get_guest_config(instance, network_info, image_meta,
disk_info, rescue, block_device_info,
context)
context, mdevs)
xml = conf.to_xml()
LOG.debug('End _get_guest_xml xml=%(xml)s',
@ -5686,6 +5705,8 @@ class LibvirtDriver(driver.ComputeDriver):
device = {
"dev_id": cfgdev.name,
# name is like mdev_00ead764_fdc0_46b6_8db9_2963f5c815b4
"uuid": str(uuid.UUID(cfgdev.name[5:].replace('_', '-'))),
"type": cfgdev.mdev_information.type,
"iommu_group": cfgdev.mdev_information.iommu_group,
}
@ -5709,6 +5730,132 @@ class LibvirtDriver(driver.ComputeDriver):
mediated_devices.append(device)
return mediated_devices
def _get_all_assigned_mediated_devices(self):
    """Lookup all instances from the host and return all the mediated
    devices that are assigned to a guest.

    :returns: A dictionary of keys being mediated device UUIDs and their
        respective values the instance UUID of the guest using it.
    """
    assigned = {}
    # Inspect every domain, running or not, since a stopped guest still
    # keeps its mdev assignment in its XML.
    for guest in self._host.list_guests(only_running=False):
        for dev in guest.get_config().devices:
            if isinstance(dev, vconfig.LibvirtConfigGuestHostdevMDEV):
                assigned[dev.uuid] = guest.uuid
    return assigned
@staticmethod
def _vgpu_allocations(allocations):
    """Filter only the VGPU allocations from a set of allocations.

    :param allocations: Information about resources allocated to the
        instance via placement, of the form returned by
        SchedulerReportClient.get_allocations_for_consumer.
    :returns: dict of resource provider -> {'resources': {VGPU: amount}}
        containing only providers with a positive VGPU allocation.
    """
    if not allocations:
        # If no allocations, there is no vGPU request.
        return {}
    vgpu_rc = fields.ResourceClass.VGPU
    return {rp: {'resources': {vgpu_rc: alloc['resources'][vgpu_rc]}}
            for rp, alloc in allocations.items()
            if alloc['resources'].get(vgpu_rc, 0) > 0}
def _get_existing_mdevs_not_assigned(self, requested_types=None):
    """Return the already created mediated devices that are not assigned
    to a guest yet.

    :param requested_types: Filter out the result for only mediated
        devices having those types.
    :returns: set of unassigned mediated device UUIDs
    """
    assigned = self._get_all_assigned_mediated_devices()
    all_mdev_uuids = {dev['uuid']
                      for dev in self._get_mediated_devices(requested_types)}
    # Set difference against the assigned UUIDs (dict keys).
    return all_mdev_uuids - set(assigned)
def _create_new_mediated_device(self, requested_types):
    """Find a physical device that can support a new mediated device and
    create it.

    :param requested_types: Filter only capable devices supporting those
        types.
    :returns: the newly created mdev UUID or None if not possible
    """
    # For the moment, the libvirt driver only supports one type per host.
    # TODO(sbauza): Once we support more than one type, make sure we look
    # at the flavor/trait for the asked type.
    asked_type = requested_types[0]
    # Try to see if we can still create a new mediated device
    for device in self._get_mdev_capable_devices(requested_types):
        if device['types'][asked_type]['availableInstances'] <= 0:
            continue
        # That physical GPU has enough room for a new mdev.
        # We need the PCI address, not the libvirt name which looks
        # like 'pci_0000_84_00_0'.
        libvirt_name = device['dev_id']
        pci_addr = "{}:{}:{}.{}".format(*libvirt_name[4:].split('_'))
        return nova.privsep.libvirt.create_mdev(pci_addr, asked_type)
@utils.synchronized(VGPU_RESOURCE_SEMAPHORE)
def _allocate_mdevs(self, allocations):
    """Returns a list of mediated device UUIDs corresponding to available
    resources we can assign to the guest(s) corresponding to the allocation
    requests passed as argument.

    That method can either find an existing but unassigned mediated device
    it can allocate, or create a new mediated device from a capable
    physical device if the latter has enough left capacity.

    :param allocations: Information about resources allocated to the
        instance via placement, of the form returned by
        SchedulerReportClient.get_allocations_for_consumer.
        That code is supporting Placement API version 1.12
    :returns: list of mediated device UUIDs, or None if no VGPU was asked
    :raises: exception.ComputeResourcesUnavailable if a vGPU was requested
        but neither an unassigned mdev nor spare GPU capacity exists.
    """
    vgpu_allocations = self._vgpu_allocations(allocations)
    if not vgpu_allocations:
        # Nothing asked for a vGPU; nothing to do.
        return
    # TODO(sbauza): Once we have nested resource providers, find which one
    # is having the related allocation for the specific VGPU type.
    # For the moment, we should only have one allocation for
    # ResourceProvider.
    # TODO(sbauza): Iterate over all the allocations once we have
    # nested Resource Providers. For the moment, just take the first.
    if len(vgpu_allocations) > 1:
        LOG.warning('More than one allocation was passed over to libvirt '
                    'while at the moment libvirt only supports one. Only '
                    'the first allocation will be looked up.')
    alloc = six.next(six.itervalues(vgpu_allocations))
    vgpus_asked = alloc['resources'][fields.ResourceClass.VGPU]

    requested_types = self._get_supported_vgpu_types()
    # Which mediated devices are created but not assigned to a guest ?
    mdevs_available = self._get_existing_mdevs_not_assigned(
        requested_types)

    chosen_mdevs = []
    for c in six.moves.range(vgpus_asked):
        chosen_mdev = None
        if mdevs_available:
            # Take the first available mdev
            chosen_mdev = mdevs_available.pop()
        else:
            # No free mdev left: try to carve a new one out of a capable
            # physical GPU.
            chosen_mdev = self._create_new_mediated_device(requested_types)
        if not chosen_mdev:
            # If we can't find devices having available VGPUs, just raise
            raise exception.ComputeResourcesUnavailable(
                reason='vGPU resource is not available')
        else:
            chosen_mdevs.append(chosen_mdev)
    return chosen_mdevs
def _has_numa_support(self):
# This means that the host can support LibvirtConfigGuestNUMATune
# and the nodeset field in LibvirtConfigGuestMemoryBackingPage

View File

@ -0,0 +1,56 @@
---
features:
- |
The libvirt driver now supports booting instances by asking for virtual
GPUs.
In order to support that, the operators should specify the enabled vGPU
types in the nova-compute configuration file by using the configuration
option ``[devices]/enabled_vgpu_types``. Only the enabled vGPU types can be
used by instances.
To find out which types the physical GPU driver supports for libvirt, the
operator can inspect sysfs by running
..
ls /sys/class/mdev_bus/<device>/mdev_supported_types
Operators can specify a VGPU resource in a flavor by adding in the flavor's
extra specs
..
nova flavor-key <flavor-id> set resources:VGPU=1
That said, Nova currently has some caveats for using vGPUs.
* For the moment, only a single type can be supported across one compute
node, which means that libvirt will create the vGPU by using that
specific type only. It's also possible to have two compute nodes having
different types but there is no possibility yet to specify in the flavor
which specific type we want to use for that instance.
* For the moment, please don't restart instances (or suspend/resume them)
or the VGPU related device will be removed from the guest.
* Mediated devices that are created by the libvirt driver are not
persisted upon reboot. Consequently, a guest startup would fail since the
virtual device would no longer exist. In order to prevent that issue,
operators rebooting a compute node have to, before restarting the
nova-compute service, look at the guest XML configurations and
recreate the mediated devices for existing guests by doing
..
echo <UUID> > /sys/class/mdev_bus/<device>/mdev_supported_types/<type>/create
* If you use Nvidia GRID cards, please know that there is a limitation in
the nvidia driver that prevents a single guest from having more than one
virtual GPU from the same physical card. A guest can still have two or
more virtual GPUs, but each vGPU must then be hosted by a separate
physical card.
We are working actively to remove or workaround those caveats, but please
understand that for the moment this feature is experimental given all the
above.