Merge "Use the correct mdev allocated from the pGPU"
This commit is contained in:
commit
1948ce59c0
@@ -20603,11 +20603,42 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         self.assertIsNone(drvr._allocate_mdevs(allocations=allocations))
 
+    def _get_fake_provider_tree_with_vgpu(self):
+        """Returns a fake ProviderTree with VGPU inventory on two child RPs,
+        one with a correct name and the other with a wrong one.
+
+        The child provider is named rp1 and its UUID is uuids.rp1.
+        """
+        cn_rp = dict(
+            uuid=uuids.cn,
+            name='cn',
+        )
+        vgpu_rp_inv = {
+            orc.VGPU: {
+                'total': 1,
+                'min_unit': 1,
+                'max_unit': 1,
+                'step_size': 1,
+            }
+        }
+        pt = provider_tree.ProviderTree()
+        pt.new_root(cn_rp['name'], cn_rp['uuid'], generation=0)
+        # Create a first child with a correct naming attribute
+        pt.new_child(cn_rp['name'] + '_' + 'pci_0000_06_00_0', cn_rp['uuid'],
+                     uuid=uuids.rp1, generation=0)
+        pt.update_inventory(uuids.rp1, vgpu_rp_inv)
+        # Create a second child with a bad naming convention
+        pt.new_child('oops_I_did_it_again', cn_rp['uuid'],
+                     uuid=uuids.rp2, generation=0)
+        pt.update_inventory(uuids.rp2, vgpu_rp_inv)
+        return pt
+
     @mock.patch.object(libvirt_driver.LibvirtDriver,
                        '_get_existing_mdevs_not_assigned')
     def test_allocate_mdevs_with_available_mdevs(self, get_unassigned_mdevs):
+        self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
         allocations = {
-            'rp1': {
+            uuids.rp1: {
                 'resources': {
                     orc.VGPU: 1,
                 }
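The helper above encodes the resource provider naming convention the rest of the change depends on: a child RP is named "<root RP name>_<libvirt pGPU name>", and 'oops_I_did_it_again' deliberately violates it. A minimal standalone sketch of that convention, reusing the test's names (everything here is illustrative, not driver API):

    # Child RPs are expected to be named "<root RP name>_<libvirt pGPU name>"
    root_name = 'cn'
    rp_names = ['cn_pci_0000_06_00_0', 'oops_I_did_it_again']

    for rp_name in rp_names:
        if rp_name.startswith(root_name + '_'):
            # Strip "<root name>_" to recover the libvirt pGPU name
            print(rp_name, '->', rp_name[len(root_name) + 1:])
        else:
            print(rp_name, '-> parent pGPU cannot be guessed')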
@@ -20615,8 +20646,12 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         }
         get_unassigned_mdevs.return_value = set([uuids.mdev1])
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        # Pretend update_provider_tree() has already run
+        drvr.provider_tree = self._get_fake_provider_tree_with_vgpu()
         self.assertEqual([uuids.mdev1],
                          drvr._allocate_mdevs(allocations=allocations))
+        get_unassigned_mdevs.assert_called_once_with(['nvidia-11'],
+                                                     'pci_0000_06_00_0')
 
     @mock.patch.object(nova.privsep.libvirt, 'create_mdev')
     @mock.patch.object(libvirt_driver.LibvirtDriver,
@@ -20629,7 +20664,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
                                                    privsep_create_mdev):
         self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
         allocations = {
-            'rp1': {
+            uuids.rp1: {
                 'resources': {
                     orc.VGPU: 1,
                 }
@@ -20646,6 +20681,8 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
         }]
         privsep_create_mdev.return_value = uuids.mdev1
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        # Pretend update_provider_tree() has already run
+        drvr.provider_tree = self._get_fake_provider_tree_with_vgpu()
         self.assertEqual([uuids.mdev1],
                          drvr._allocate_mdevs(allocations=allocations))
         privsep_create_mdev.assert_called_once_with("0000:06:00.0",
@@ -20663,7 +20700,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
                                                    privsep_create_mdev):
         self.flags(enabled_vgpu_types=['nvidia-11'], group='devices')
         allocations = {
-            'rp1': {
+            uuids.rp1: {
                 'resources': {
                     orc.VGPU: 1,
                 }
@@ -20681,6 +20718,36 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
             }
         }]
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
+        # Pretend update_provider_tree() has already run
+        drvr.provider_tree = self._get_fake_provider_tree_with_vgpu()
+        self.assertRaises(exception.ComputeResourcesUnavailable,
+                          drvr._allocate_mdevs, allocations=allocations)
+
+    def test_allocate_mdevs_with_no_idea_of_the_provider(self):
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
+        # Pretend update_provider_tree() has already run
+        drvr.provider_tree = self._get_fake_provider_tree_with_vgpu()
+
+        # Test that the allocated RP doesn't exist in the tree
+        allocations = {
+            uuids.wrong_rp: {
+                'resources': {
+                    orc.VGPU: 1,
+                }
+            }
+        }
+        self.assertRaises(exception.ComputeResourcesUnavailable,
+                          drvr._allocate_mdevs, allocations=allocations)
+
+        # Test that we were unable to guess the RP name
+        allocations = {
+            uuids.rp2: {
+                'resources': {
+                    orc.VGPU: 1,
+                }
+            }
+        }
+        # Remember, rp2 has a wrong naming convention
         self.assertRaises(exception.ComputeResourcesUnavailable,
                           drvr._allocate_mdevs, allocations=allocations)
 
@@ -29,6 +29,7 @@ import binascii
 import collections
 from collections import deque
 import contextlib
+import copy
 import errno
 import functools
 import glob
@@ -384,6 +385,12 @@ class LibvirtDriver(driver.ComputeDriver):
         # avoid any re-calculation when computing resources.
         self._reserved_hugepages = hardware.numa_get_reserved_huge_pages()
 
+        # Copy of the compute service ProviderTree object that is updated
+        # every time update_provider_tree() is called.
+        # NOTE(sbauza): We only want a read-only cache, this attribute is not
+        # intended to be updated directly.
+        self.provider_tree = None
+
     def _get_volume_drivers(self):
         driver_registry = dict()
 
@@ -576,6 +583,10 @@ class LibvirtDriver(driver.ComputeDriver):
         """Recreate assigned mdevs that could have disappeared if we reboot
         the host.
         """
+        # FIXME(sbauza): We blindly recreate mediated devices without checking
+        # which ResourceProvider was allocated for the instance, so it could
+        # use another pGPU.
+        # TODO(sbauza): Pass all instances' allocations here.
         mdevs = self._get_all_assigned_mediated_devices()
         requested_types = self._get_supported_vgpu_types()
         for (mdev_uuid, instance_uuid) in six.iteritems(mdevs):
@@ -6142,26 +6153,34 @@ class LibvirtDriver(driver.ComputeDriver):
             vgpu_allocations[rp] = {'resources': {RC_VGPU: res[RC_VGPU]}}
         return vgpu_allocations
 
-    def _get_existing_mdevs_not_assigned(self, requested_types=None):
+    def _get_existing_mdevs_not_assigned(self, requested_types=None,
+                                         parent=None):
         """Returns the already created mediated devices that are not assigned
         to a guest yet.
 
         :param requested_types: Filter out the result for only mediated devices
                                 having those types.
+        :param parent: Filter out the result for only mdevs from the parent device.
         """
         allocated_mdevs = self._get_all_assigned_mediated_devices()
         mdevs = self._get_mediated_devices(requested_types)
-        available_mdevs = set([mdev["uuid"]
-                               for mdev in mdevs]) - set(allocated_mdevs)
+        available_mdevs = set()
+        for mdev in mdevs:
+            if parent is None or mdev['parent'] == parent:
+                available_mdevs.add(mdev["uuid"])
+
+        available_mdevs -= set(allocated_mdevs)
         return available_mdevs
 
-    def _create_new_mediated_device(self, requested_types, uuid=None):
+    def _create_new_mediated_device(self, requested_types, uuid=None,
+                                    parent=None):
         """Find a physical device that can support a new mediated device and
         create it.
 
         :param requested_types: Filter only capable devices supporting those
                                 types.
         :param uuid: The possible mdev UUID we want to create again
+        :param parent: Only create a mdev for this device
+
         :returns: the newly created mdev UUID or None if not possible
         """
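A standalone sketch of the parent filter introduced above; the mdev dicts are made-up stand-ins shaped like what _get_mediated_devices() returns (a 'uuid' and a 'parent' key, per the code), and 'allocated' mimics the mdev-to-instance mapping returned by _get_all_assigned_mediated_devices():

    mdevs = [
        {'uuid': 'mdev1', 'parent': 'pci_0000_06_00_0'},
        {'uuid': 'mdev2', 'parent': 'pci_0000_06_00_0'},
        {'uuid': 'mdev3', 'parent': 'pci_0000_07_00_0'},
    ]
    allocated = {'mdev2': 'some-instance-uuid'}  # mdev UUID -> instance UUID
    parent = 'pci_0000_06_00_0'

    available_mdevs = set()
    for mdev in mdevs:
        # parent=None keeps the previous behaviour: accept mdevs from any pGPU
        if parent is None or mdev['parent'] == parent:
            available_mdevs.add(mdev['uuid'])
    available_mdevs -= set(allocated)
    print(available_mdevs)  # {'mdev1'}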
@@ -6176,6 +6195,11 @@ class LibvirtDriver(driver.ComputeDriver):
             if device['types'][asked_type]['availableInstances'] > 0:
                 # That physical GPU has enough room for a new mdev
                 dev_name = device['dev_id']
+                # the parent attribute can be None
+                if parent is not None and dev_name != parent:
+                    # The device is not the one that was asked for, so don't
+                    # create the mdev on it
+                    continue
                 # We need the PCI address, not the libvirt name
                 # The libvirt name is like 'pci_0000_84_00_0'
                 pci_addr = "{}:{}:{}.{}".format(*dev_name[4:].split('_'))
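The PCI-address conversion in the context lines above can be checked in isolation; the device name is the example from the code comment:

    dev_name = 'pci_0000_84_00_0'  # libvirt nodedev name, per the comment
    # Drop the 'pci_' prefix and re-join the pieces as DDDD:BB:SS.F
    pci_addr = "{}:{}:{}.{}".format(*dev_name[4:].split('_'))
    print(pci_addr)  # 0000:84:00.0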
@@ -6212,13 +6236,49 @@ class LibvirtDriver(driver.ComputeDriver):
             LOG.warning('More than one allocation was passed over to libvirt '
                         'while at the moment libvirt only supports one. Only '
                         'the first allocation will be looked up.')
-        alloc = six.next(six.itervalues(vgpu_allocations))
+        rp_uuid, alloc = six.next(six.iteritems(vgpu_allocations))
         vgpus_asked = alloc['resources'][orc.VGPU]
+
+        # Find if we allocated against a specific pGPU (and then the allocation
+        # is made against a child RP) or any pGPU (in case the VGPU inventory
+        # is still on the root RP)
+        try:
+            allocated_rp = self.provider_tree.data(rp_uuid)
+        except ValueError:
+            # The provider doesn't exist, so return a more understandable
+            # exception
+            raise exception.ComputeResourcesUnavailable(
+                reason='vGPU resource is not available')
+        # TODO(sbauza): Remove this conditional in Train once all VGPU
+        # inventories are related to a child RP
+        if allocated_rp.parent_uuid is None:
+            # We are on a root RP
+            parent_device = None
+        else:
+            rp_name = allocated_rp.name
+            # There can be multiple roots, we need to find the root name
+            # to guess the physical device name
+            roots = self.provider_tree.roots
+            for root in roots:
+                if rp_name.startswith(root.name + '_'):
+                    # The RP name convention is:
+                    # root_name + '_' + parent_device
+                    parent_device = rp_name[len(root.name) + 1:]
+                    break
+            else:
+                LOG.warning("pGPU device name %(name)s can't be guessed from "
+                            "the ProviderTree "
+                            "roots %(roots)s", {'name': rp_name,
+                                                'roots': roots})
+                # We have no idea what the parent device was
+                # If we can't find devices having available VGPUs, just raise
+                raise exception.ComputeResourcesUnavailable(
+                    reason='vGPU resource is not available')
+
         requested_types = self._get_supported_vgpu_types()
         # Which mediated devices are created but not assigned to a guest?
         mdevs_available = self._get_existing_mdevs_not_assigned(
-            requested_types)
+            requested_types, parent_device)
 
         chosen_mdevs = []
        for c in six.moves.range(vgpus_asked):
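For readers unfamiliar with six, the allocation lookup in the hunk above is Python 2/3 compatibility plumbing; on Python 3 alone it is equivalent to the sketch below. The payload is a made-up example keyed by a hypothetical child RP UUID, with orc.VGPU spelled as the literal 'VGPU' resource class name:

    allocations = {
        'a1b2c3d4-e5f6-0000-0000-000000000000': {  # hypothetical RP UUID
            'resources': {'VGPU': 1},
        },
    }
    # Equivalent of six.next(six.iteritems(vgpu_allocations)) on Python 3
    rp_uuid, alloc = next(iter(allocations.items()))
    vgpus_asked = alloc['resources']['VGPU']
    print(rp_uuid, vgpus_asked)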
@@ -6227,7 +6287,8 @@ class LibvirtDriver(driver.ComputeDriver):
                 # Take the first available mdev
                 chosen_mdev = mdevs_available.pop()
             else:
-                chosen_mdev = self._create_new_mediated_device(requested_types)
+                chosen_mdev = self._create_new_mediated_device(
+                    requested_types, parent=parent_device)
             if not chosen_mdev:
                 # If we can't find devices having available VGPUs, just raise
                 raise exception.ComputeResourcesUnavailable(
@@ -6563,6 +6624,10 @@ class LibvirtDriver(driver.ComputeDriver):
         provider_tree.add_traits(nodename, *traits_to_add)
         provider_tree.remove_traits(nodename, *traits_to_remove)
 
+        # Now that we updated the ProviderTree, we want to store it locally
+        # so that spawn() or other methods can access it through a getter
+        self.provider_tree = copy.deepcopy(provider_tree)
+
     def get_available_resource(self, nodename):
         """Retrieve resource information.
 
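The deepcopy matters here: update_provider_tree() hands the driver the resource tracker's live tree, which keeps changing afterwards, and the NOTE earlier in the change asks for a read-only cache. A small sketch with a stand-in class (not the real ProviderTree) shows why a plain reference would not do:

    import copy

    class FakeTree(object):  # stand-in for provider_tree.ProviderTree
        def __init__(self):
            self.roots = []

    live_tree = FakeTree()
    snapshot = copy.deepcopy(live_tree)
    live_tree.roots.append('new-root')  # later updates by the caller...
    print(snapshot.roots)               # [] ...don't leak into the snapshot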