Merge "Map PCI pools to RP UUIDs"

Zuul 2022-12-15 13:10:45 +00:00 committed by Gerrit Code Review
commit 2f4feeabc2
6 changed files with 236 additions and 6 deletions

View File

@@ -261,6 +261,12 @@ class PciResourceProvider:
)
provider_tree.update_traits(self.name, self.traits)
# Here we are sure the RP exists in the provider_tree. So, we can
# record the RP UUID in each PciDevice this RP represents
rp_uuid = provider_tree.data(self.name).uuid
for dev in self.devs:
dev.extra_info['rp_uuid'] = rp_uuid
def update_allocations(
self,
allocations: dict,
@@ -598,6 +604,11 @@ def update_provider_tree_for_pci(
pv.update_provider_tree(provider_tree)
old_alloc = copy.deepcopy(allocations)
# update_provider_tree correlated the PciDevice objects with RPs in
# placement and recorded the RP UUID in the PciDevice object. We need to
# trigger an update on the device pools in the tracker to get the device
# RP UUID mapped to the device pools
pci_tracker.stats.populate_pools_metadata_from_assigned_devices()
updated = pv.update_allocations(allocations, provider_tree)
if updated:
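
For context, the flow these hunks set up is: update_provider_tree guarantees the RP exists in the provider tree, stamps its UUID onto every backing PciDevice via extra_info, and populate_pools_metadata_from_assigned_devices later lifts that UUID onto the device pools. A minimal standalone sketch of the stamping step (Device and record_rp_uuid are illustrative stand-ins, not nova code):

import uuid

class Device:
    """Illustrative stand-in for nova's PciDevice."""
    def __init__(self, address):
        self.address = address
        self.extra_info = {}  # mirrors PciDevice.extra_info

def record_rp_uuid(rp_uuid, devices):
    # Once the RP is known to exist in the provider tree, stamp its UUID
    # on every device it represents so the pool layer can pick it up later.
    for dev in devices:
        dev.extra_info['rp_uuid'] = rp_uuid

devs = [Device("0000:81:00.1"), Device("0000:81:00.2")]
record_rp_uuid(str(uuid.uuid4()), devs)
assert all('rp_uuid' in d.extra_info for d in devs)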

View File

@@ -991,8 +991,6 @@ class ResourceTracker(object):
# notified when instances are deleted, we need to remove all usages
# from deleted instances.
self.pci_tracker.clean_usage(instances, migrations)
dev_pools_obj = self.pci_tracker.stats.to_device_pools_obj()
cn.pci_device_pools = dev_pools_obj
self._report_final_resource_view(nodename)
@@ -1314,13 +1312,23 @@ class ResourceTracker(object):
def _update(self, context, compute_node, startup=False):
"""Update partial stats locally and populate them to Scheduler."""
self._update_to_placement(context, compute_node, startup)
if self.pci_tracker:
# sync PCI device pool state stored in the compute node with
# the actual state from the PCI tracker as we commit changes in
# the DB and in the PCI tracker below
dev_pools_obj = self.pci_tracker.stats.to_device_pools_obj()
compute_node.pci_device_pools = dev_pools_obj
# _resource_change will update self.old_resources if it detects changes
# but we want to restore those if compute_node.save() fails.
nodename = compute_node.hypervisor_hostname
old_compute = self.old_resources[nodename]
if self._resource_change(compute_node):
# If the compute_node's resource changed, update to DB. Note that
-# _update_to_placement below does not supersede the need to do this
+# _update_to_placement above does not supersede the need to do this
# because there are stats-related fields in the ComputeNode object
# which could have changed and still need to be reported to the
# scheduler filters/weighers (which could be out of tree as well).
@@ -1333,8 +1341,6 @@
with excutils.save_and_reraise_exception(logger=LOG):
self.old_resources[nodename] = old_compute
self._update_to_placement(context, compute_node, startup)
if self.pci_tracker:
self.pci_tracker.save(context)
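
As an aside on the hunk above: _resource_change updates self.old_resources as a side effect, so when compute_node.save() fails the tracker restores the previous snapshot before re-raising, guaranteeing the change is still detected and the save retried on the next periodic update. A rough sketch of that pattern, assuming a simplified tracker (excutils.save_and_reraise_exception is the real oslo.utils helper; everything else is illustrative):

from oslo_utils import excutils

class MiniTracker:
    """Illustrative stand-in for the resource tracker's save/restore logic."""

    def __init__(self):
        self.old_resources = {}

    def _resource_change(self, node):
        # Records the new snapshot and reports whether anything changed.
        old = self.old_resources.get(node['name'])
        self.old_resources[node['name']] = dict(node['stats'])
        return old != node['stats']

    def _update(self, node, save):
        old = self.old_resources.get(node['name'])
        if self._resource_change(node):
            try:
                save(node)
            except Exception:
                with excutils.save_and_reraise_exception():
                    # Restore the snapshot so the next periodic run still
                    # sees the node as changed and retries the save.
                    self.old_resources[node['name']] = old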

View File

@@ -96,6 +96,8 @@ class PciDeviceStats(object):
pool_keys = pool.copy()
del pool_keys['count']
del pool_keys['devices']
# FIXME(gibi): do we need this?
pool_keys.pop('rp_uuid', None)
if (len(pool_keys.keys()) == len(dev_pool.keys()) and
self._equal_properties(dev_pool, pool_keys, list(dev_pool))):
return pool
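
A plausible answer to the FIXME above: pool matching compares key sets, and a candidate pool key derived from a device never carries rp_uuid, so an existing pool that already holds that metadata would otherwise fail to match and devices could be split into duplicate pools. A toy illustration with plain dicts (this mirrors only the length-and-equality check above, not the full _equal_properties logic):

existing_pool = {
    "vendor_id": "dead", "product_id": "beef",
    "count": 2, "devices": [],
    "rp_uuid": "rp-1",  # metadata added after the pool was created
}
dev_pool = {"vendor_id": "dead", "product_id": "beef"}  # built from a device

pool_keys = existing_pool.copy()
del pool_keys['count']
del pool_keys['devices']
pool_keys.pop('rp_uuid', None)  # without this pop the key sets differ

assert len(pool_keys.keys()) == len(dev_pool.keys())  # pools now comparable
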
@@ -779,3 +781,40 @@
)
pools = self._filter_pools_for_spec(self.pools, dummy_req)
return bool(pools)
def populate_pools_metadata_from_assigned_devices(self):
"""Populate the rp_uuid of each pool based on the rp_uuid of the
devices assigned to the pool. This can only be called from the compute
where devices are assigned to each pool. This should not be called from
the scheduler as there device - pool assignment is not known.
"""
# PciDevices are tracked in placement and flavor based PCI requests
# are scheduled and allocated in placement. To be able to correlate
# what is allocated in placement and what is consumed in nova we
# need to map device pools to RPs. We can do that as the PciDevice
# contains the RP UUID that represents it in placement.
# NOTE(gibi): We cannot do this when the device is originally added to
# the pool as the device -> placement translation, which creates the
# RPs, runs after all the devices are created and assigned to pools.
for pool in self.pools:
pool_rps = {
dev.extra_info.get("rp_uuid")
for dev in pool["devices"]
if "rp_uuid" in dev.extra_info
}
if len(pool_rps) >= 2:
# FIXME(gibi): Do we have a 1:1 pool - RP mapping even
# if two PFs provide very similar VFs?
raise ValueError(
"We have a pool %s connected to more than one RP %s in "
"placement via devs %s" % (pool, pool_rps, pool["devices"])
)
if not pool_rps:
# this can happen if the nova-compute is upgraded to have the
# PCI in placement inventory handling code but
# [pci]report_in_placement is not turned on yet.
continue
# now we know that the pool maps to a single RP
pool['rp_uuid'] = next(iter(pool_rps))
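
To see why pool_rps normally collapses to a single element: all VFs of one PF are stamped with the same RP UUID by update_provider_tree, so the set comprehension deduplicates them. A small worked example with plain dicts standing in for PciDevice objects:

pool = {
    "vendor_id": "dead",
    "product_id": "beef",
    "devices": [
        {"address": "0000:81:00.1", "extra_info": {"rp_uuid": "rp-1"}},
        {"address": "0000:81:00.2", "extra_info": {"rp_uuid": "rp-1"}},
    ],
}
pool_rps = {
    dev["extra_info"]["rp_uuid"]
    for dev in pool["devices"]
    if "rp_uuid" in dev["extra_info"]
}
assert pool_rps == {"rp-1"}  # both VFs share the PF's RP, so one element
pool["rp_uuid"] = next(iter(pool_rps))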

View File

@@ -12,12 +12,15 @@
# License for the specific language governing permissions and limitations
# under the License.
import ddt
from oslo_utils.fixture import uuidsentinel as uuids
from unittest import mock
from nova.compute import pci_placement_translator as ppt
from nova.compute import provider_tree
from nova import exception
from nova.objects import fields
from nova.objects import pci_device
from nova.pci import devspec
from nova import test
@@ -235,3 +238,54 @@ class TestTranslator(test.NoDBTestCase):
"CUSTOM_BAR,CUSTOM_BAZ,CUSTOM_FOO for 0000:81:00.0,0000:81:00.1.",
str(ex),
)
def test_translator_maps_pci_device_to_rp(self):
pv = ppt.PlacementView(
"fake-node", instances_under_same_host_resize=[])
vf = pci_device.PciDevice(
address="0000:81:00.1",
parent_addr="0000:71:00.0",
dev_type=fields.PciDeviceType.SRIOV_VF,
vendor_id="dead",
product_id="beef",
)
pf = pci_device.PciDevice(
address="0000:72:00.0",
parent_addr=None,
dev_type=fields.PciDeviceType.SRIOV_PF,
vendor_id="dead",
product_id="beef",
)
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
pv._add_dev(vf, {})
pv._add_dev(pf, {})
pv.update_provider_tree(pt)
self.assertEqual(
pt.data("fake-node_0000:71:00.0").uuid, vf.extra_info["rp_uuid"]
)
self.assertEqual(
pt.data("fake-node_0000:72:00.0").uuid, pf.extra_info["rp_uuid"]
)
def test_update_provider_tree_for_pci_update_pools(self):
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
pf = pci_device.PciDevice(
address="0000:72:00.0",
parent_addr=None,
dev_type=fields.PciDeviceType.SRIOV_PF,
vendor_id="dead",
product_id="beef",
status=fields.PciDeviceStatus.AVAILABLE,
)
pci_tracker = mock.Mock()
pci_tracker.pci_devs = [pf]
pci_tracker.dev_filter.specs = [devspec.PciDeviceSpec({})]
ppt.update_provider_tree_for_pci(pt, 'fake-node', pci_tracker, {}, [])
pci_tracker.stats.populate_pools_metadata_from_assigned_devices.\
assert_called_once_with()

View File

@@ -1580,6 +1580,7 @@ class TestUpdateComputeNode(BaseTestCase):
self.rt._update(mock.sentinel.ctx, new_compute)
save_mock.assert_called_once_with()
@mock.patch('nova.objects.ComputeNode.save', new=mock.Mock())
@mock.patch(
'nova.pci.stats.PciDeviceStats.has_remote_managed_device_pools',
return_value=True)
@@ -1773,7 +1774,7 @@
self.assertEqual(4, ufpt_mock.call_count)
self.assertEqual(4, mock_sync_disabled.call_count)
# The retry is restricted to _update_to_placement
-self.assertEqual(1, mock_resource_change.call_count)
+self.assertEqual(0, mock_resource_change.call_count)
@mock.patch(
'nova.compute.resource_tracker.ResourceTracker.'
@@ -2041,6 +2042,10 @@
self.assertIn('Unable to find services table record for nova-compute',
mock_log_error.call_args[0][0])
@mock.patch(
'nova.compute.resource_tracker.ResourceTracker.'
'_update_to_placement',
new=mock.Mock())
def test_update_compute_node_save_fails_restores_old_resources(self):
"""Tests the scenario that compute_node.save() fails and the
old_resources value for the node is restored to its previous value

View File

@@ -17,6 +17,7 @@ from unittest import mock
from oslo_config import cfg
from oslo_serialization import jsonutils
from oslo_utils.fixture import uuidsentinel as uuids
from nova import exception
from nova import objects
@@ -896,6 +897,120 @@ class PciDeviceStatsPlacementSupportTestCase(test.NoDBTestCase):
self.assertEqual(pools, matching_pools)
def test_populate_pools_metadata_from_assigned_devices(self):
device_spec = [
jsonutils.dumps(
{
"address": "0000:81:00.*",
}
),
]
self.flags(device_spec=device_spec, group="pci")
dev_filter = whitelist.Whitelist(device_spec)
pci_stats = stats.PciDeviceStats(
objects.NUMATopology(),
dev_filter=dev_filter)
pci_dev1 = objects.PciDevice(
vendor_id="dead",
product_id="beef",
address="0000:81:00.1",
parent_addr="0000:81:00.0",
numa_node=0,
dev_type="type-VF",
)
pci_dev2 = objects.PciDevice(
vendor_id="dead",
product_id="beef",
address="0000:81:00.2",
parent_addr="0000:81:00.0",
numa_node=0,
dev_type="type-VF",
)
pci_stats.add_device(pci_dev1)
pci_dev1.extra_info = {'rp_uuid': uuids.rp1}
pci_stats.add_device(pci_dev2)
pci_dev2.extra_info = {'rp_uuid': uuids.rp1}
self.assertEqual(1, len(pci_stats.pools))
pci_stats.populate_pools_metadata_from_assigned_devices()
self.assertEqual(uuids.rp1, pci_stats.pools[0]['rp_uuid'])
def test_populate_pools_metadata_from_assigned_devices_device_without_rp(
self
):
device_spec = [
jsonutils.dumps(
{
"address": "0000:81:00.*",
}
),
]
self.flags(device_spec=device_spec, group="pci")
dev_filter = whitelist.Whitelist(device_spec)
pci_stats = stats.PciDeviceStats(
objects.NUMATopology(),
dev_filter=dev_filter)
pci_dev1 = objects.PciDevice(
vendor_id="dead",
product_id="beef",
address="0000:81:00.1",
parent_addr="0000:81:00.0",
numa_node=0,
dev_type="type-VF",
)
pci_stats.add_device(pci_dev1)
self.assertEqual(1, len(pci_stats.pools))
pci_stats.populate_pools_metadata_from_assigned_devices()
self.assertNotIn('rp_uuid', pci_stats.pools[0])
def test_populate_pools_metadata_from_assigned_devices_multiple_rp(self):
device_spec = [
jsonutils.dumps(
{
"address": "0000:81:00.*",
}
),
]
self.flags(device_spec=device_spec, group="pci")
dev_filter = whitelist.Whitelist(device_spec)
pci_stats = stats.PciDeviceStats(
objects.NUMATopology(),
dev_filter=dev_filter)
pci_dev1 = objects.PciDevice(
compute_node_id=1,
vendor_id="dead",
product_id="beef",
address="0000:81:00.1",
parent_addr="0000:81:00.0",
numa_node=0,
dev_type="type-VF",
)
pci_dev2 = objects.PciDevice(
compute_node_id=1,
vendor_id="dead",
product_id="beef",
address="0000:81:00.2",
parent_addr="0000:81:00.0",
numa_node=0,
dev_type="type-VF",
)
pci_stats.add_device(pci_dev1)
pci_dev1.extra_info = {'rp_uuid': uuids.rp1}
pci_stats.add_device(pci_dev2)
pci_dev2.extra_info = {'rp_uuid': uuids.rp2}
self.assertEqual(1, len(pci_stats.pools))
self.assertRaises(
ValueError,
pci_stats.populate_pools_metadata_from_assigned_devices,
)
class PciDeviceVFPFStatsTestCase(test.NoDBTestCase):