Merge "vgpu: Allow device_addresses to not be set"

This commit is contained in:
Zuul 2024-03-18 16:58:28 +00:00 committed by Gerrit Code Review
commit 3e358bc37c
3 changed files with 130 additions and 73 deletions

View File

@ -29,15 +29,20 @@ guest instance.
If more than one single mdev type is provided, then for each *mdev type* an If more than one single mdev type is provided, then for each *mdev type* an
additional section, ``[mdev_$(MDEV_TYPE)]``, must be added to the configuration additional section, ``[mdev_$(MDEV_TYPE)]``, must be added to the configuration
file. Each section then **must** be configured with a single configuration file. Each section then can be configured with a single configuration option,
option, ``device_addresses``, which should be a list of PCI addresses ``device_addresses``, which should be a list of PCI addresses corresponding to
corresponding to the physical GPU(s) or mdev-capable hardware to assign to this the physical GPU(s) or mdev-capable hardware to assign to this type. If
type. `device_addresses` is not provided, then the related GPU type will be the
default for all the found GPUs that aren't used by other types.
If one or more sections are missing (meaning that a specific type is not wanted If one or more sections are missing (meaning that a specific type is not wanted
to use for at least one physical device) or if no device addresses are provided to use for at least one physical device), then Nova will only use the first
, then Nova will only use the first type that was provided by type that was provided by ``[devices]/enabled_mdev_types``.
``[devices]/enabled_mdev_types``.
If two or more sections do not set ``device_addresses``, then only the first
of them will be used as the default type for all the GPUs that are not
explicitly assigned to another type.
If the same PCI address is provided for two different types, nova-compute will If the same PCI address is provided for two different types, nova-compute will
return an InvalidLibvirtMdevConfig exception at restart. return an InvalidLibvirtMdevConfig exception at restart.
@ -54,6 +59,17 @@ will be accepted. A valid configuration could then be::
[vgpu_nvidia-36] [vgpu_nvidia-36]
device_addresses = 0000:86:00.0 device_addresses = 0000:86:00.0
Another valid configuration could be::
[devices]
enabled_mdev_types = nvidia-35, nvidia-36
[mdev_nvidia-35]
[mdev_nvidia-36]
device_addresses = 0000:86:00.0
""") """)
] ]

View File

@ -26870,7 +26870,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
'nvidia-12']) 'nvidia-12'])
@mock.patch.object(libvirt_driver.LOG, 'warning') @mock.patch.object(libvirt_driver.LOG, 'warning')
def test_get_supported_vgpu_types(self, mock_warning): def test_get_supported_vgpu_types_fails(self, mock_warning):
# Verify that by default we don't support vGPU types # Verify that by default we don't support vGPU types
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertEqual([], drvr._get_supported_vgpu_types()) self.assertEqual([], drvr._get_supported_vgpu_types())
@ -26889,34 +26889,34 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
# Since the operator wanted to only support one type, it's fine to not # Since the operator wanted to only support one type, it's fine to not
# provide config groups # provide config groups
mock_warning.assert_not_called() mock_warning.assert_not_called()
# For further checking
mock_warning.reset_mock()
# Now two types without forgetting to provide the pGPU addresses @mock.patch.object(libvirt_driver.LOG, 'warning')
def test_get_supported_vgpu_types_two_types_unset(self, mock_warning):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
# Now two types without providing the pGPU addresses
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'], self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices') group='devices')
# we need to call the below again to ensure the updated
# 'device_addresses' value is read and the new groups created
nova.conf.devices.register_dynamic_opts(CONF)
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
self.assertEqual(['nvidia-11'], drvr._get_supported_vgpu_types()) self.assertEqual(['nvidia-11'], drvr._get_supported_vgpu_types())
self.assertEqual({}, drvr.pgpu_type_mapping) self.assertEqual({}, drvr.pgpu_type_mapping)
self.assertEqual({}, drvr.mdev_class_mapping) self.assertEqual({}, drvr.mdev_class_mapping)
self.assertEqual({}, drvr.mdev_type_max_mapping) self.assertEqual({}, drvr.mdev_type_max_mapping)
# Here we only support one vGPU type # Here we only support one vGPU type
self.assertEqual({orc.VGPU}, drvr.mdev_classes) self.assertEqual({orc.VGPU}, drvr.mdev_classes)
msg = ("The mdev type '%(type)s' was listed in '[devices] " msg = ("Mdev type default already set to "
"enabled_mdev_types' but no corresponding " " %(default_type)s so %(this_type)s will not "
"'[mdev_%(type)s]' group or " "be used." % {
"'[mdev_%(type)s] device_addresses' " 'default_type': 'nvidia-11',
"option was defined. Only the first type '%(ftype)s' " 'this_type': 'nvidia-12'})
"will be used." % {'type': 'nvidia-12',
'ftype': 'nvidia-11'})
mock_warning.assert_called_once_with(msg) mock_warning.assert_called_once_with(msg)
# For further checking
mock_warning.reset_mock()
# And now do it correctly ! @mock.patch.object(libvirt_driver.LOG, 'warning')
def test_get_supported_vgpu_types(self, mock_warning):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices')
# we need to call the below again to ensure the updated
# 'device_addresses' value is read and the new groups created
nova.conf.devices.register_dynamic_opts(CONF)
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11') self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
self.flags(device_addresses=['0000:85:00.0'], group='mdev_nvidia-12') self.flags(device_addresses=['0000:85:00.0'], group='mdev_nvidia-12')
self.flags(mdev_class='CUSTOM_NOTVGPU', group='mdev_nvidia-12') self.flags(mdev_class='CUSTOM_NOTVGPU', group='mdev_nvidia-12')
@ -26957,20 +26957,6 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
libvirt_driver.LibvirtDriver, libvirt_driver.LibvirtDriver,
fake.FakeVirtAPI(), False) fake.FakeVirtAPI(), False)
@mock.patch.object(nova.conf.devices, 'register_dynamic_opts')
def test_get_supported_vgpu_types_registering_dynamic_opts(self, rdo):
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices')
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
drvr._get_supported_vgpu_types()
# Okay below is confusing, but remember, ._get_supported_vgpu_types()
# is first called by the LibvirtDriver object creation, so when
# calling the above drvr._get_supported_vgpu_types() method, it will
# be the second time that register_dynamic_opts() will be called.
rdo.assert_has_calls([mock.call(CONF), mock.call(CONF)])
@mock.patch.object(libvirt_driver.LOG, 'warning') @mock.patch.object(libvirt_driver.LOG, 'warning')
def test_get_supported_vgpu_types_with_a_single_type(self, mock_warning): def test_get_supported_vgpu_types_with_a_single_type(self, mock_warning):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
@ -26990,6 +26976,49 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
self.assertEqual({'CUSTOM_NOTVGPU'}, drvr.mdev_classes) self.assertEqual({'CUSTOM_NOTVGPU'}, drvr.mdev_classes)
mock_warning.assert_not_called() mock_warning.assert_not_called()
def test_get_supported_vgpu_types_with_default_type(self):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices')
# we need to call the below again to ensure the updated
# 'device_addresses' value is read and the new groups created
nova.conf.devices.register_dynamic_opts(CONF)
# Enable nvidia-11 as the default type for all GPUs but 0000:84:00.0
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-12')
self.assertEqual(['nvidia-11', 'nvidia-12'],
drvr._get_supported_vgpu_types())
self.assertEqual({'0000:84:00.0': 'nvidia-12'}, drvr.pgpu_type_mapping)
self.assertEqual('nvidia-11', drvr.pgpu_type_default)
@mock.patch.object(libvirt_driver.LOG, 'warning')
def test_get_supported_vgpu_types_with_duplicate_default_type(
self, mock_warning):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12', 'nvidia-13'],
group='devices')
# we need to call the below again to ensure the updated
# 'device_addresses' value is read and the new groups created
nova.conf.devices.register_dynamic_opts(CONF)
# Add a specific GPU for a third type
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-13')
# As both nvidia-11 and nvidia-12 aren't set with device_addresses,
# only one of them should be the default.
# nvidia-12 won't be supported since none of the GPUs will use it.
self.assertEqual(['nvidia-11', 'nvidia-13'],
drvr._get_supported_vgpu_types())
self.assertEqual({'0000:84:00.0': 'nvidia-13'}, drvr.pgpu_type_mapping)
# There can be only one :-)
self.assertEqual('nvidia-11', drvr.pgpu_type_default)
msg = ("Mdev type default already set to "
" %(default_type)s so %(this_type)s will not "
"be used." % {
'default_type': 'nvidia-11',
'this_type': 'nvidia-12'})
mock_warning.assert_called_once_with(msg)
def test_get_vgpu_type_per_pgpu(self): def test_get_vgpu_type_per_pgpu(self):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
device = 'pci_0000_84_00_0' device = 'pci_0000_84_00_0'
@ -27046,6 +27075,20 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin):
# 0000:86:00.0 wasn't configured # 0000:86:00.0 wasn't configured
self.assertIsNone(drvr._get_vgpu_type_per_pgpu('pci_0000_86_00_0')) self.assertIsNone(drvr._get_vgpu_type_per_pgpu('pci_0000_86_00_0'))
def test_get_vgpu_type_per_pgpu_with_default_type(self):
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices')
# we need to call the below again to ensure the updated
# 'device_addresses' value is read and the new groups created
nova.conf.devices.register_dynamic_opts(CONF)
self.flags(device_addresses=['0000:84:00.0'], group='mdev_nvidia-11')
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertEqual('nvidia-11',
drvr._get_vgpu_type_per_pgpu('pci_0000_84_00_0'))
# Any GPU but 0000:84:00.0 defaults now to nvidia-12
self.assertEqual('nvidia-12',
drvr._get_vgpu_type_per_pgpu('pci_0000_85_00_0'))
def test_get_resource_class_for_device(self): def test_get_resource_class_for_device(self):
self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'], self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
group='devices') group='devices')

View File

@ -540,6 +540,8 @@ class LibvirtDriver(driver.ComputeDriver):
self.mdev_classes = set([]) self.mdev_classes = set([])
# this is for knowing how many mdevs can be created by a type # this is for knowing how many mdevs can be created by a type
self.mdev_type_max_mapping = collections.defaultdict(str) self.mdev_type_max_mapping = collections.defaultdict(str)
# if we have a wildcard, we default to use this mdev type
self.pgpu_type_default = None
self.supported_vgpu_types = self._get_supported_vgpu_types() self.supported_vgpu_types = self._get_supported_vgpu_types()
# This dict is for knowing which mdevs are already claimed by some # This dict is for knowing which mdevs are already claimed by some
@ -8199,38 +8201,36 @@ class LibvirtDriver(driver.ComputeDriver):
# be calling this method before init_host() # be calling this method before init_host()
nova.conf.devices.register_dynamic_opts(CONF) nova.conf.devices.register_dynamic_opts(CONF)
enabled_mdev_types = []
for vgpu_type in CONF.devices.enabled_mdev_types: for vgpu_type in CONF.devices.enabled_mdev_types:
enabled_mdev_types.append(vgpu_type)
# NOTE(sbauza) group is now always set because we register the
# dynamic options above
group = getattr(CONF, 'mdev_%s' % vgpu_type, None) group = getattr(CONF, 'mdev_%s' % vgpu_type, None)
if group is None or not group.device_addresses: if group is None:
first_type = CONF.devices.enabled_mdev_types[0] # Should never happen but if so, just fails early.
if len(CONF.devices.enabled_mdev_types) > 1: raise exception.InvalidLibvirtMdevConfig(
# Only provide the warning if the operator provided more reason="can't find '[devices]/mdev_%s group' "
# than one type as it's not needed to provide groups "in the configuration" % group
# if you only use one vGPU type. )
msg = ("The mdev type '%(type)s' was listed in '[devices] "
"enabled_mdev_types' but no corresponding "
"'[mdev_%(type)s]' group or "
"'[mdev_%(type)s] device_addresses' "
"option was defined. Only the first type "
"'%(ftype)s' will be used." % {'type': vgpu_type,
'ftype': first_type})
LOG.warning(msg)
# We need to reset the mapping tables that we started to
# provide keys and values from previously processed vGPUs but
# since there is a problem for this vGPU type, we only want to
# support only the first type.
self.pgpu_type_mapping.clear()
self.mdev_class_mapping.clear()
first_group = getattr(CONF, 'mdev_%s' % first_type, None)
if first_group is None:
self.mdev_classes = {orc.VGPU}
else:
self.mdev_classes = {first_group.mdev_class}
return [first_type]
mdev_class = group.mdev_class mdev_class = group.mdev_class
# By default, max_instances is None # By default, max_instances is None
if group.max_instances: if group.max_instances:
self.mdev_type_max_mapping[vgpu_type] = group.max_instances self.mdev_type_max_mapping[vgpu_type] = group.max_instances
if not group.device_addresses:
if not self.pgpu_type_default:
self.pgpu_type_default = vgpu_type
self.mdev_classes.add(mdev_class)
else:
msg = ("Mdev type default already set to "
" %(default_type)s so %(this_type)s will not "
"be used." % {
'default_type': self.pgpu_type_default,
'this_type': vgpu_type})
LOG.warning(msg)
# we remove the type from the supported list.
enabled_mdev_types.remove(vgpu_type)
continue
for device_address in group.device_addresses: for device_address in group.device_addresses:
if device_address in self.pgpu_type_mapping: if device_address in self.pgpu_type_mapping:
raise exception.InvalidLibvirtMdevConfig( raise exception.InvalidLibvirtMdevConfig(
@ -8247,7 +8247,7 @@ class LibvirtDriver(driver.ComputeDriver):
self.pgpu_type_mapping[device_address] = vgpu_type self.pgpu_type_mapping[device_address] = vgpu_type
self.mdev_class_mapping[device_address] = mdev_class self.mdev_class_mapping[device_address] = mdev_class
self.mdev_classes.add(mdev_class) self.mdev_classes.add(mdev_class)
return CONF.devices.enabled_mdev_types return enabled_mdev_types
@staticmethod @staticmethod
def _get_pci_id_from_libvirt_name( def _get_pci_id_from_libvirt_name(
@ -8281,16 +8281,14 @@ class LibvirtDriver(driver.ComputeDriver):
if not self.supported_vgpu_types: if not self.supported_vgpu_types:
return return
if len(self.supported_vgpu_types) == 1:
first_type = self.supported_vgpu_types[0]
group = getattr(CONF, 'mdev_%s' % first_type, None)
if group is None or not group.device_addresses:
return first_type
device_address = self._get_pci_id_from_libvirt_name(device_address) device_address = self._get_pci_id_from_libvirt_name(device_address)
if not device_address: if not device_address:
return return
return self.pgpu_type_mapping.get(device_address) mdev_type = self.pgpu_type_mapping.get(device_address)
# if we can't find the mdev type by the config, do we have a default
# type because of a config group not using device_addresses ?
# NOTE(sbauza): By default pgpu_type_default is None if unset
return mdev_type or self.pgpu_type_default
def _get_resource_class_for_device(self, device_address): def _get_resource_class_for_device(self, device_address):
"""Returns the resource class for the inventory of this device. """Returns the resource class for the inventory of this device.