Merge "Use the correct mdev allocated from the pGPU"
This commit is contained in:
commit
1948ce59c0
@ -20603,11 +20603,42 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
self.assertIsNone(drvr._allocate_mdevs(allocations=allocations))
|
||||
|
||||
def _get_fake_provider_tree_with_vgpu(self):
|
||||
"""Returns a fake ProviderTree with VGPU inventory on two children RPs
|
||||
with one with a correct name and the other one wrong.
|
||||
|
||||
The child provider is named rp1 and its UUID is uuids.rp1.
|
||||
"""
|
||||
cn_rp = dict(
|
||||
uuid=uuids.cn,
|
||||
name='cn',
|
||||
)
|
||||
vgpu_rp_inv = {
|
||||
orc.VGPU: {
|
||||
'total': 1,
|
||||
'min_unit': 1,
|
||||
'max_unit': 1,
|
||||
'step_size': 1,
|
||||
}
|
||||
}
|
||||
pt = provider_tree.ProviderTree()
|
||||
pt.new_root(cn_rp['name'], cn_rp['uuid'], generation=0)
|
||||
# Create a first child with a correct naming attribute
|
||||
pt.new_child(cn_rp['name'] + '_' + 'pci_0000_06_00_0', cn_rp['uuid'],
|
||||
uuid=uuids.rp1, generation=0)
|
||||
pt.update_inventory(uuids.rp1, vgpu_rp_inv)
|
||||
# Create a second child with a bad naming convention
|
||||
pt.new_child('oops_I_did_it_again', cn_rp['uuid'],
|
||||
uuid=uuids.rp2, generation=0)
|
||||
pt.update_inventory(uuids.rp2, vgpu_rp_inv)
|
||||
return pt
|
||||
|
||||
@mock.patch.object(libvirt_driver.LibvirtDriver,
|
||||
'_get_existing_mdevs_not_assigned')
|
||||
def test_allocate_mdevs_with_available_mdevs(self, get_unassigned_mdevs):
|
||||
self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
|
||||
allocations = {
|
||||
'rp1': {
|
||||
uuids.rp1: {
|
||||
'resources': {
|
||||
orc.VGPU: 1,
|
||||
}
|
||||
@ -20615,8 +20646,12 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
}
|
||||
get_unassigned_mdevs.return_value = set([uuids.mdev1])
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
# Mock the fact update_provider_tree() should have run
|
||||
drvr.provider_tree = self._get_fake_provider_tree_with_vgpu()
|
||||
self.assertEqual([uuids.mdev1],
|
||||
drvr._allocate_mdevs(allocations=allocations))
|
||||
get_unassigned_mdevs.assert_called_once_with(['nvidia-11'],
|
||||
'pci_0000_06_00_0')
|
||||
|
||||
@mock.patch.object(nova.privsep.libvirt, 'create_mdev')
|
||||
@mock.patch.object(libvirt_driver.LibvirtDriver,
|
||||
@ -20629,7 +20664,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
privsep_create_mdev):
|
||||
self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
|
||||
allocations = {
|
||||
'rp1': {
|
||||
uuids.rp1: {
|
||||
'resources': {
|
||||
orc.VGPU: 1,
|
||||
}
|
||||
@ -20646,6 +20681,8 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
}]
|
||||
privsep_create_mdev.return_value = uuids.mdev1
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
# Mock the fact update_provider_tree() should have run
|
||||
drvr.provider_tree = self._get_fake_provider_tree_with_vgpu()
|
||||
self.assertEqual([uuids.mdev1],
|
||||
drvr._allocate_mdevs(allocations=allocations))
|
||||
privsep_create_mdev.assert_called_once_with("0000:06:00.0",
|
||||
@ -20663,7 +20700,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
privsep_create_mdev):
|
||||
self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
|
||||
allocations = {
|
||||
'rp1': {
|
||||
uuids.rp1: {
|
||||
'resources': {
|
||||
orc.VGPU: 1,
|
||||
}
|
||||
@ -20681,6 +20718,36 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
}
|
||||
}]
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
|
||||
# Mock the fact update_provider_tree() should have run
|
||||
drvr.provider_tree = self._get_fake_provider_tree_with_vgpu()
|
||||
self.assertRaises(exception.ComputeResourcesUnavailable,
|
||||
drvr._allocate_mdevs, allocations=allocations)
|
||||
|
||||
def test_allocate_mdevs_with_no_idea_of_the_provider(self):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
|
||||
# Mock the fact update_provider_tree() should have run
|
||||
drvr.provider_tree = self._get_fake_provider_tree_with_vgpu()
|
||||
|
||||
# Test that the allocated RP doesn't exist in the tree
|
||||
allocations = {
|
||||
uuids.wrong_rp: {
|
||||
'resources': {
|
||||
orc.VGPU: 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
self.assertRaises(exception.ComputeResourcesUnavailable,
|
||||
drvr._allocate_mdevs, allocations=allocations)
|
||||
|
||||
# Test that we were unable to guess the RP name
|
||||
allocations = {
|
||||
uuids.rp2: {
|
||||
'resources': {
|
||||
orc.VGPU: 1,
|
||||
}
|
||||
}
|
||||
}
|
||||
# Remember, rp2 has a wrong naming convention
|
||||
self.assertRaises(exception.ComputeResourcesUnavailable,
|
||||
drvr._allocate_mdevs, allocations=allocations)
|
||||
|
||||
|
@ -29,6 +29,7 @@ import binascii
|
||||
import collections
|
||||
from collections import deque
|
||||
import contextlib
|
||||
import copy
|
||||
import errno
|
||||
import functools
|
||||
import glob
|
||||
@ -384,6 +385,12 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
# avoid any re-calculation when computing resources.
|
||||
self._reserved_hugepages = hardware.numa_get_reserved_huge_pages()
|
||||
|
||||
# Copy of the compute service ProviderTree object that is updated
|
||||
# every time update_provider_tree() is called.
|
||||
# NOTE(sbauza): We only want a read-only cache, this attribute is not
|
||||
# intended to be updatable directly
|
||||
self.provider_tree = None
|
||||
|
||||
def _get_volume_drivers(self):
|
||||
driver_registry = dict()
|
||||
|
||||
@ -576,6 +583,10 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
"""Recreate assigned mdevs that could have disappeared if we reboot
|
||||
the host.
|
||||
"""
|
||||
# FIXME(sbauza): We blindly recreate mediated devices without checking
|
||||
# which ResourceProvider was allocated for the instance so it would use
|
||||
# another pGPU.
|
||||
# TODO(sbauza): Pass all instances' allocations here.
|
||||
mdevs = self._get_all_assigned_mediated_devices()
|
||||
requested_types = self._get_supported_vgpu_types()
|
||||
for (mdev_uuid, instance_uuid) in six.iteritems(mdevs):
|
||||
@ -6142,26 +6153,34 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
vgpu_allocations[rp] = {'resources': {RC_VGPU: res[RC_VGPU]}}
|
||||
return vgpu_allocations
|
||||
|
||||
def _get_existing_mdevs_not_assigned(self, requested_types=None):
|
||||
def _get_existing_mdevs_not_assigned(self, requested_types=None,
|
||||
parent=None):
|
||||
"""Returns the already created mediated devices that are not assigned
|
||||
to a guest yet.
|
||||
|
||||
:param requested_types: Filter out the result for only mediated devices
|
||||
having those types.
|
||||
:param parent: Filter out result for only mdevs from the parent device.
|
||||
"""
|
||||
allocated_mdevs = self._get_all_assigned_mediated_devices()
|
||||
mdevs = self._get_mediated_devices(requested_types)
|
||||
available_mdevs = set([mdev["uuid"]
|
||||
for mdev in mdevs]) - set(allocated_mdevs)
|
||||
available_mdevs = set()
|
||||
for mdev in mdevs:
|
||||
if parent is None or mdev['parent'] == parent:
|
||||
available_mdevs.add(mdev["uuid"])
|
||||
|
||||
available_mdevs -= set(allocated_mdevs)
|
||||
return available_mdevs
|
||||
|
||||
def _create_new_mediated_device(self, requested_types, uuid=None):
|
||||
def _create_new_mediated_device(self, requested_types, uuid=None,
|
||||
parent=None):
|
||||
"""Find a physical device that can support a new mediated device and
|
||||
create it.
|
||||
|
||||
:param requested_types: Filter only capable devices supporting those
|
||||
types.
|
||||
:param uuid: The possible mdev UUID we want to create again
|
||||
:param parent: Only create a mdev for this device
|
||||
|
||||
:returns: the newly created mdev UUID or None if not possible
|
||||
"""
|
||||
@ -6176,6 +6195,11 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
if device['types'][asked_type]['availableInstances'] > 0:
|
||||
# That physical GPU has enough room for a new mdev
|
||||
dev_name = device['dev_id']
|
||||
# the parent attribute can be None
|
||||
if parent is not None and dev_name != parent:
|
||||
# The device is not the one that was called, not creating
|
||||
# the mdev
|
||||
continue
|
||||
# We need the PCI address, not the libvirt name
|
||||
# The libvirt name is like 'pci_0000_84_00_0'
|
||||
pci_addr = "{}:{}:{}.{}".format(*dev_name[4:].split('_'))
|
||||
@ -6212,13 +6236,49 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
LOG.warning('More than one allocation was passed over to libvirt '
|
||||
'while at the moment libvirt only supports one. Only '
|
||||
'the first allocation will be looked up.')
|
||||
alloc = six.next(six.itervalues(vgpu_allocations))
|
||||
rp_uuid, alloc = six.next(six.iteritems(vgpu_allocations))
|
||||
vgpus_asked = alloc['resources'][orc.VGPU]
|
||||
|
||||
# Find if we allocated against a specific pGPU (and then the allocation
|
||||
# is made against a child RP) or any pGPU (in case the VGPU inventory
|
||||
# is still on the root RP)
|
||||
try:
|
||||
allocated_rp = self.provider_tree.data(rp_uuid)
|
||||
except ValueError:
|
||||
# The provider doesn't exist, return a better understandable
|
||||
# exception
|
||||
raise exception.ComputeResourcesUnavailable(
|
||||
reason='vGPU resource is not available')
|
||||
# TODO(sbauza): Remove this conditional in Train once all VGPU
|
||||
# inventories are related to a child RP
|
||||
if allocated_rp.parent_uuid is None:
|
||||
# We are on a root RP
|
||||
parent_device = None
|
||||
else:
|
||||
rp_name = allocated_rp.name
|
||||
# There can be multiple roots, we need to find the root name
|
||||
# to guess the physical device name
|
||||
roots = self.provider_tree.roots
|
||||
for root in roots:
|
||||
if rp_name.startswith(root.name + '_'):
|
||||
# The RP name convention is :
|
||||
# root_name + '_' + parent_device
|
||||
parent_device = rp_name[len(root.name) + 1:]
|
||||
break
|
||||
else:
|
||||
LOG.warning("pGPU device name %(name)s can't be guessed from "
|
||||
"the ProviderTree "
|
||||
"roots %(roots)s", {'name': rp_name,
|
||||
'roots': roots})
|
||||
# We f... have no idea what was the parent device
|
||||
# If we can't find devices having available VGPUs, just raise
|
||||
raise exception.ComputeResourcesUnavailable(
|
||||
reason='vGPU resource is not available')
|
||||
|
||||
requested_types = self._get_supported_vgpu_types()
|
||||
# Which mediated devices are created but not assigned to a guest ?
|
||||
mdevs_available = self._get_existing_mdevs_not_assigned(
|
||||
requested_types)
|
||||
requested_types, parent_device)
|
||||
|
||||
chosen_mdevs = []
|
||||
for c in six.moves.range(vgpus_asked):
|
||||
@ -6227,7 +6287,8 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
# Take the first available mdev
|
||||
chosen_mdev = mdevs_available.pop()
|
||||
else:
|
||||
chosen_mdev = self._create_new_mediated_device(requested_types)
|
||||
chosen_mdev = self._create_new_mediated_device(
|
||||
requested_types, parent=parent_device)
|
||||
if not chosen_mdev:
|
||||
# If we can't find devices having available VGPUs, just raise
|
||||
raise exception.ComputeResourcesUnavailable(
|
||||
@ -6563,6 +6624,10 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
provider_tree.add_traits(nodename, *traits_to_add)
|
||||
provider_tree.remove_traits(nodename, *traits_to_remove)
|
||||
|
||||
# Now that we updated the ProviderTree, we want to store it locally
|
||||
# so that spawn() or other methods can access it thru a getter
|
||||
self.provider_tree = copy.deepcopy(provider_tree)
|
||||
|
||||
def get_available_resource(self, nodename):
|
||||
"""Retrieve resource information.
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user