vgpu: Allow device_addresses to not be set
Sometimes, some GPU may have a long list of PCI addresses (say a SRIOV GPU) or operators may have a long list of GPUs. In order to help their lifes, let's allow device_addresses to be optional. This means that a valid configuration could be : [devices] enabled_mdev_types = nvidia-35, nvidia-36 [mdev_nvidia-35] [mdev_nvidia-36] NOTE(sbauza): we have a slight coverage gap for testing what happens if the groups aren't set, but I'll add it in a next patch Related-Bug: #2041519 Change-Id: I73762a0295212ee003db2149d6a9cf701023464f
This commit is contained in:
parent
60851e4464
commit
d445eaf9dd
@ -29,15 +29,20 @@ guest instance.
|
||||
|
||||
If more than one single mdev type is provided, then for each *mdev type* an
|
||||
additional section, ``[mdev_$(MDEV_TYPE)]``, must be added to the configuration
|
||||
file. Each section then **must** be configured with a single configuration
|
||||
option, ``device_addresses``, which should be a list of PCI addresses
|
||||
corresponding to the physical GPU(s) or mdev-capable hardware to assign to this
|
||||
type.
|
||||
file. Each section then can be configured with a single configuration option,
|
||||
``device_addresses``, which should be a list of PCI addresses corresponding to
|
||||
the physical GPU(s) or mdev-capable hardware to assign to this type. If
|
||||
`device_addresses` is not provided, then the related GPU type will be the
|
||||
default for all the found GPUs that aren't used by other types.
|
||||
|
||||
|
||||
If one or more sections are missing (meaning that a specific type is not wanted
|
||||
to use for at least one physical device) or if no device addresses are provided
|
||||
, then Nova will only use the first type that was provided by
|
||||
``[devices]/enabled_mdev_types``.
|
||||
to use for at least one physical device), then Nova will only use the first
|
||||
type that was provided by ``[devices]/enabled_mdev_types``.
|
||||
|
||||
If two or more sections are not set with ``device_addresses`` values, then only
|
||||
the first one will be used for defaulting all the non-defined GPUs to use this
|
||||
type.
|
||||
|
||||
If the same PCI address is provided for two different types, nova-compute will
|
||||
return an InvalidLibvirtMdevConfig exception at restart.
|
||||
@ -54,6 +59,17 @@ will be accepted. A valid configuration could then be::
|
||||
[vgpu_nvidia-36]
|
||||
device_addresses = 0000:86:00.0
|
||||
|
||||
Another valid configuration could be::
|
||||
|
||||
[devices]
|
||||
enabled_mdev_types = nvidia-35, nvidia-36
|
||||
|
||||
[mdev_nvidia-35]
|
||||
|
||||
[mdev_nvidia-36]
|
||||
device_addresses = 0000:86:00.0
|
||||
|
||||
|
||||
""")
|
||||
]
|
||||
|
||||
|
@ -26870,7 +26870,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
'nvidia-12'])
|
||||
|
||||
@mock.patch.object(libvirt_driver.LOG, 'warning')
|
||||
def test_get_supported_vgpu_types(self, mock_warning):
|
||||
def test_get_supported_vgpu_types_fails(self, mock_warning):
|
||||
# Verify that by default we don't support vGPU types
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
self.assertEqual([], drvr._get_supported_vgpu_types())
|
||||
@ -26889,34 +26889,34 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
# Since the operator wanted to only support one type, it's fine to not
|
||||
# provide config groups
|
||||
mock_warning.assert_not_called()
|
||||
# For further checking
|
||||
mock_warning.reset_mock()
|
||||
|
||||
# Now two types without forgetting to provide the pGPU addresses
|
||||
@mock.patch.object(libvirt_driver.LOG, 'warning')
|
||||
def test_get_supported_vgpu_types_two_types_unset(self, mock_warning):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
# Now two types without providing the pGPU addresses
|
||||
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
|
||||
group='devices')
|
||||
# we need to call the below again to ensure the updated
|
||||
# 'device_addresses' value is read and the new groups created
|
||||
nova.conf.devices.register_dynamic_opts(CONF)
|
||||
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
|
||||
self.assertEqual(['nvidia-11'], drvr._get_supported_vgpu_types())
|
||||
self.assertEqual({}, drvr.pgpu_type_mapping)
|
||||
self.assertEqual({}, drvr.mdev_class_mapping)
|
||||
self.assertEqual({}, drvr.mdev_type_max_mapping)
|
||||
# Here we only support one vGPU type
|
||||
self.assertEqual({orc.VGPU}, drvr.mdev_classes)
|
||||
msg = ("The mdev type '%(type)s' was listed in '[devices] "
|
||||
"enabled_mdev_types' but no corresponding "
|
||||
"'[mdev_%(type)s]' group or "
|
||||
"'[mdev_%(type)s] device_addresses' "
|
||||
"option was defined. Only the first type '%(ftype)s' "
|
||||
"will be used." % {'type': 'nvidia-12',
|
||||
'ftype': 'nvidia-11'})
|
||||
msg = ("Mdev type default already set to "
|
||||
" %(default_type)s so %(this_type)s will not "
|
||||
"be used." % {
|
||||
'default_type': 'nvidia-11',
|
||||
'this_type': 'nvidia-12'})
|
||||
mock_warning.assert_called_once_with(msg)
|
||||
# For further checking
|
||||
mock_warning.reset_mock()
|
||||
|
||||
# And now do it correctly !
|
||||
@mock.patch.object(libvirt_driver.LOG, 'warning')
|
||||
def test_get_supported_vgpu_types(self, mock_warning):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
|
||||
group='devices')
|
||||
# we need to call the below again to ensure the updated
|
||||
# 'device_addresses' value is read and the new groups created
|
||||
nova.conf.devices.register_dynamic_opts(CONF)
|
||||
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
|
||||
self.flags(device_addresses=['0000:85:00.0'], group='mdev_nvidia-12')
|
||||
self.flags(mdev_class='CUSTOM_NOTVGPU', group='mdev_nvidia-12')
|
||||
@ -26957,20 +26957,6 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
libvirt_driver.LibvirtDriver,
|
||||
fake.FakeVirtAPI(), False)
|
||||
|
||||
@mock.patch.object(nova.conf.devices, 'register_dynamic_opts')
|
||||
def test_get_supported_vgpu_types_registering_dynamic_opts(self, rdo):
|
||||
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
|
||||
group='devices')
|
||||
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
drvr._get_supported_vgpu_types()
|
||||
|
||||
# Okay below is confusing, but remember, ._get_supported_vgpu_types()
|
||||
# is first called by the LibvirtDriver object creation, so when
|
||||
# calling the above drvr._get_supported_vgpu_types() method, it will
|
||||
# be the second time that register_dynamic_opts() will be called.
|
||||
rdo.assert_has_calls([mock.call(CONF), mock.call(CONF)])
|
||||
|
||||
@mock.patch.object(libvirt_driver.LOG, 'warning')
|
||||
def test_get_supported_vgpu_types_with_a_single_type(self, mock_warning):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
@ -26990,6 +26976,49 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
self.assertEqual({'CUSTOM_NOTVGPU'}, drvr.mdev_classes)
|
||||
mock_warning.assert_not_called()
|
||||
|
||||
def test_get_supported_vgpu_types_with_default_type(self):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
|
||||
group='devices')
|
||||
# we need to call the below again to ensure the updated
|
||||
# 'device_addresses' value is read and the new groups created
|
||||
nova.conf.devices.register_dynamic_opts(CONF)
|
||||
# Enable nvidia-11 as a the default type for all GPUs but 0000:84:00.0
|
||||
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-12')
|
||||
|
||||
self.assertEqual(['nvidia-11', 'nvidia-12'],
|
||||
drvr._get_supported_vgpu_types())
|
||||
self.assertEqual({'0000:84:00.0': 'nvidia-12'}, drvr.pgpu_type_mapping)
|
||||
self.assertEqual('nvidia-11', drvr.pgpu_type_default)
|
||||
|
||||
@mock.patch.object(libvirt_driver.LOG, 'warning')
|
||||
def test_get_supported_vgpu_types_with_duplicate_default_type(
|
||||
self, mock_warning):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12', 'nvidia-13'],
|
||||
group='devices')
|
||||
# we need to call the below again to ensure the updated
|
||||
# 'device_addresses' value is read and the new groups created
|
||||
nova.conf.devices.register_dynamic_opts(CONF)
|
||||
|
||||
# Add a specific GPU for a third type
|
||||
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-13')
|
||||
# As both nvidia-11 and nvidia-12 aren't set with device_addresses,
|
||||
# only one of them should be the default.
|
||||
|
||||
# nvidia-12 won't be supported since none of the GPUs will use it.
|
||||
self.assertEqual(['nvidia-11', 'nvidia-13'],
|
||||
drvr._get_supported_vgpu_types())
|
||||
self.assertEqual({'0000:84:00.0': 'nvidia-13'}, drvr.pgpu_type_mapping)
|
||||
# There can be only one :-)
|
||||
self.assertEqual('nvidia-11', drvr.pgpu_type_default)
|
||||
msg = ("Mdev type default already set to "
|
||||
" %(default_type)s so %(this_type)s will not "
|
||||
"be used." % {
|
||||
'default_type': 'nvidia-11',
|
||||
'this_type': 'nvidia-12'})
|
||||
mock_warning.assert_called_once_with(msg)
|
||||
|
||||
def test_get_vgpu_type_per_pgpu(self):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
device = 'pci_0000_84_00_0'
|
||||
@ -27046,6 +27075,20 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
|
||||
# 0000:86:00.0 wasn't configured
|
||||
self.assertIsNone(drvr._get_vgpu_type_per_pgpu('pci_0000_86_00_0'))
|
||||
|
||||
def test_get_vgpu_type_per_pgpu_with_default_type(self):
|
||||
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
|
||||
group='devices')
|
||||
# we need to call the below again to ensure the updated
|
||||
# 'device_addresses' value is read and the new groups created
|
||||
nova.conf.devices.register_dynamic_opts(CONF)
|
||||
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
self.assertEqual('nvidia-11',
|
||||
drvr._get_vgpu_type_per_pgpu('pci_0000_84_00_0'))
|
||||
# Any GPU but 0000:84:00.0 defaults now to nvidia-12
|
||||
self.assertEqual('nvidia-12',
|
||||
drvr._get_vgpu_type_per_pgpu('pci_0000_85_00_0'))
|
||||
|
||||
def test_get_resource_class_for_device(self):
|
||||
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
|
||||
group='devices')
|
||||
|
@ -540,6 +540,8 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
self.mdev_classes = set([])
|
||||
# this is for knowing how many mdevs can be created by a type
|
||||
self.mdev_type_max_mapping = collections.defaultdict(str)
|
||||
# if we have a wildcard, we default to use this mdev type
|
||||
self.pgpu_type_default = None
|
||||
self.supported_vgpu_types = self._get_supported_vgpu_types()
|
||||
|
||||
# This dict is for knowing which mdevs are already claimed by some
|
||||
@ -8190,38 +8192,36 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
# be calling this method before init_host()
|
||||
nova.conf.devices.register_dynamic_opts(CONF)
|
||||
|
||||
enabled_mdev_types = []
|
||||
for vgpu_type in CONF.devices.enabled_mdev_types:
|
||||
enabled_mdev_types.append(vgpu_type)
|
||||
# NOTE(sbauza) group is now always set because we register the
|
||||
# dynamic options above
|
||||
group = getattr(CONF, 'mdev_%s' % vgpu_type, None)
|
||||
if group is None or not group.device_addresses:
|
||||
first_type = CONF.devices.enabled_mdev_types[0]
|
||||
if len(CONF.devices.enabled_mdev_types) > 1:
|
||||
# Only provide the warning if the operator provided more
|
||||
# than one type as it's not needed to provide groups
|
||||
# if you only use one vGPU type.
|
||||
msg = ("The mdev type '%(type)s' was listed in '[devices] "
|
||||
"enabled_mdev_types' but no corresponding "
|
||||
"'[mdev_%(type)s]' group or "
|
||||
"'[mdev_%(type)s] device_addresses' "
|
||||
"option was defined. Only the first type "
|
||||
"'%(ftype)s' will be used." % {'type': vgpu_type,
|
||||
'ftype': first_type})
|
||||
LOG.warning(msg)
|
||||
# We need to reset the mapping tables that we started to
|
||||
# provide keys and values from previously processed vGPUs but
|
||||
# since there is a problem for this vGPU type, we only want to
|
||||
# support only the first type.
|
||||
self.pgpu_type_mapping.clear()
|
||||
self.mdev_class_mapping.clear()
|
||||
first_group = getattr(CONF, 'mdev_%s' % first_type, None)
|
||||
if first_group is None:
|
||||
self.mdev_classes = {orc.VGPU}
|
||||
else:
|
||||
self.mdev_classes = {first_group.mdev_class}
|
||||
return [first_type]
|
||||
if group is None:
|
||||
# Should never happen but if so, just fails early.
|
||||
raise exception.InvalidLibvirtMdevConfig(
|
||||
reason="can't find '[devices]/mdev_%s group' "
|
||||
"in the configuration" % group
|
||||
)
|
||||
mdev_class = group.mdev_class
|
||||
# By default, max_instances is None
|
||||
if group.max_instances:
|
||||
self.mdev_type_max_mapping[vgpu_type] = group.max_instances
|
||||
if not group.device_addresses:
|
||||
if not self.pgpu_type_default:
|
||||
self.pgpu_type_default = vgpu_type
|
||||
self.mdev_classes.add(mdev_class)
|
||||
else:
|
||||
msg = ("Mdev type default already set to "
|
||||
" %(default_type)s so %(this_type)s will not "
|
||||
"be used." % {
|
||||
'default_type': self.pgpu_type_default,
|
||||
'this_type': vgpu_type})
|
||||
LOG.warning(msg)
|
||||
# we remove the type from the supported list.
|
||||
enabled_mdev_types.remove(vgpu_type)
|
||||
continue
|
||||
for device_address in group.device_addresses:
|
||||
if device_address in self.pgpu_type_mapping:
|
||||
raise exception.InvalidLibvirtMdevConfig(
|
||||
@ -8238,7 +8238,7 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
self.pgpu_type_mapping[device_address] = vgpu_type
|
||||
self.mdev_class_mapping[device_address] = mdev_class
|
||||
self.mdev_classes.add(mdev_class)
|
||||
return CONF.devices.enabled_mdev_types
|
||||
return enabled_mdev_types
|
||||
|
||||
@staticmethod
|
||||
def _get_pci_id_from_libvirt_name(
|
||||
@ -8272,16 +8272,14 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
if not self.supported_vgpu_types:
|
||||
return
|
||||
|
||||
if len(self.supported_vgpu_types) == 1:
|
||||
first_type = self.supported_vgpu_types[0]
|
||||
group = getattr(CONF, 'mdev_%s' % first_type, None)
|
||||
if group is None or not group.device_addresses:
|
||||
return first_type
|
||||
|
||||
device_address = self._get_pci_id_from_libvirt_name(device_address)
|
||||
if not device_address:
|
||||
return
|
||||
return self.pgpu_type_mapping.get(device_address)
|
||||
mdev_type = self.pgpu_type_mapping.get(device_address)
|
||||
# if we can't find the mdev type by the config, do we have a default
|
||||
# type because of a config group not using device_addresses ?
|
||||
# NOTE(sbauza): By default pgpu_type_default is None if unset
|
||||
return mdev_type or self.pgpu_type_default
|
||||
|
||||
def _get_resource_class_for_device(self, device_address):
|
||||
"""Returns the resource class for the inventory of this device.
|
||||
|
Loading…
Reference in New Issue
Block a user