Merge "[PCI tracker]Remove non configured devs when freed" into stable/2025.1

Zuul
2025-10-10 17:54:46 +00:00
committed by Gerrit Code Review
3 changed files with 53 additions and 93 deletions

View File

@@ -69,6 +69,7 @@ class PciDevTracker(object):
tracking.
"""
self.stale: ty.Dict[str, objects.PciDevice] = {}
self.to_be_removed_when_freed: ty.Dict[str, objects.PciDevice] = {}
self.node_id: str = compute_node.id
self.dev_filter = whitelist.Whitelist(CONF.pci.device_spec)
numa_topology = compute_node.numa_topology
@@ -256,6 +257,12 @@ class PciDevTracker(object):
# device to a second vm. To prevent this bug we skip
# deleting the device from the db in this iteration and
# will try again on the next sync.
# NOTE(gibi): We keep a list of these devices in memory
# so that when the VM using the device is deleted the
# tracker does not just free the device but also marks
# it for removal. This prevents a bug where such a freed
# device is re-allocated before it is removed.
self.to_be_removed_when_freed[existed.address] = existed
continue
else:
# Note(yjiang5): no need to update stats if an assigned
@@ -397,8 +404,15 @@ class PciDevTracker(object):
stale = self.stale.pop(dev.address, None)
if stale:
dev.update_device(stale)
to_be_removed = self.to_be_removed_when_freed.pop(dev.address, None)
if to_be_removed:
    dev.remove()
    if dev in self.stats.get_free_devs():
        self.stats.remove_device(dev)
else:
    for dev in freed_devs:
        self.stats.add_device(dev)

def free_instance_allocations(
self, context: ctx.RequestContext, instance: 'objects.Instance',

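Taken together, the two hunks above implement a deferred-removal scheme: during the periodic sync an allocated device that has disappeared from [pci]device_spec is only recorded in to_be_removed_when_freed, and the device is actually removed later, when the instance frees it. Below is a minimal, self-contained sketch of that scheme, not Nova's real code: FakeDevice, sync_device, free_device, pending_removal and free_pool are illustrative names standing in for PciDevice, the tracker sync, _free_device and the PciDeviceStats pool.

from dataclasses import dataclass


@dataclass
class FakeDevice:
    # Stand-in for objects.PciDevice: just an address and a status.
    address: str
    status: str  # "available", "allocated" or "removed"


def sync_device(dev, configured_addresses, pending_removal, free_pool):
    # Sync-time half: the device is gone from the configured device spec.
    if dev.address in configured_addresses:
        return
    if dev.status == "allocated":
        # Still used by an instance, so it cannot be deleted yet; remember
        # it and deal with it when the instance frees it.
        pending_removal[dev.address] = dev
    else:
        # An unused device can be dropped right away.
        dev.status = "removed"
        if dev in free_pool:
            free_pool.remove(dev)


def free_device(dev, pending_removal, free_pool):
    # Free-time half: the instance released the device.
    if pending_removal.pop(dev.address, None):
        # Marked while it was still allocated: remove it instead of putting
        # it back into the pool, so it can never be re-allocated.
        dev.status = "removed"
        if dev in free_pool:
            free_pool.remove(dev)
    else:
        dev.status = "available"
        free_pool.append(dev)


# A device used by a VM disappears from the device spec, then the VM goes.
dev = FakeDevice("0000:81:00.0", "allocated")
pending, pool = {}, []
sync_device(dev, configured_addresses=set(), pending_removal=pending,
            free_pool=pool)
free_device(dev, pending_removal=pending, free_pool=pool)
assert dev.status == "removed" and dev not in pool

The real tracker of course goes through PciDeviceStats and persists the status change, but the branching in free_device mirrors what the second hunk adds.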
View File

@@ -892,48 +892,26 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
)
self.stdlog.delete_stored_logs()
# Delete the server as the warning suggests
self._delete_server(server)
# Delete the server as the warning suggests. Unfortunately the deletion
# fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
ex = self.assertRaises(
client.OpenStackApiException, self._delete_server, server)
self.assertIn("Unexpected API Error. Please report this", str(ex))
# The deletion triggers a warning suggesting we have a bug. Indeed,
# this is part of https://bugs.launchpad.net/nova/+bug/2115905
self.assertIn(
"WARNING [nova.compute.pci_placement_translator] "
"Device spec is not found for device 0000:81:00.0 in "
"[pci]device_spec. Ignoring device in Placement resource view. "
"This should not happen. Please file a bug",
self.stdlog.logger.output
)
# The server delete fails as nova tries to delete the RP while it still
# has allocations.
self.assertRegex(
self.stdlog.logger.output,
"ERROR .nova.scheduler.client.report..*Failed to delete "
"resource provider with UUID.*from the placement API. "
"Got 409.*Unable to delete resource provider.*Resource "
"provider has allocations.")
# The allocation is successfully removed
compute1_expected_placement_view["usages"] = {
"0000:81:00.0": {
self.PF_RC: 0,
}
}
compute1_expected_placement_view["allocations"].pop(server["id"])
# However the RP and the inventory are not removed from Placement
# due to pci tracker caching. The PciDevice remains in the DB until
# the next nova-compute restart and therefore the RP remains in
# Placement until then too. This is a potential bug that keeps a
# device that looks available even though it should not be, as the
# device is no longer in the device spec.
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
# The instance is put into ERROR state.
server = self.api.get_server(server['id'])
self.assertEqual(server['status'], 'ERROR')
self.stdlog.delete_stored_logs()
self.restart_compute_service(hostname="compute1")
self._run_periodics()
# The next compute restart does not trigger the PCI warning
self.assertNotIn(
"WARNING [nova.compute.pci_placement_translator]",
self.stdlog.logger.output)
# And the device is now removed from Placement
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
# And the allocation is not removed.
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
@@ -1022,49 +1000,26 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
self.stdlog.logger.output,
)
self.stdlog.delete_stored_logs()
# Delete the server as the warning suggests
self._delete_server(server)
# Delete the server as the warning suggests. Unfortunately the deletion
# fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
ex = self.assertRaises(
client.OpenStackApiException, self._delete_server, server)
self.assertIn("Unexpected API Error. Please report this", str(ex))
# The deletion triggers a warning suggesting we have a bug. Indeed,
# this is part of https://bugs.launchpad.net/nova/+bug/2115905
self.assertIn(
"WARNING [nova.compute.pci_placement_translator] "
"Device spec is not found for device 0000:81:00.1 in "
"[pci]device_spec. Ignoring device in Placement resource view. "
"This should not happen. Please file a bug",
self.stdlog.logger.output
)
# The server delete fails as nova tries to delete the RP while it still
# has allocations.
self.assertRegex(
self.stdlog.logger.output,
"ERROR .nova.scheduler.client.report..*Failed to delete "
"resource provider with UUID.*from the placement API. "
"Got 409.*Unable to delete resource provider.*Resource "
"provider has allocations.")
# The allocation is successfully removed
compute1_expected_placement_view["usages"] = {
"0000:81:00.0": {
self.VF_RC: 0,
}
}
compute1_expected_placement_view["allocations"].pop(server["id"])
# However the RP and the inventory are not removed from Placement
# due to pci tracker caching. The PciDevice remains in the DB until
# the next nova-compute restart and therefore the RP remains in
# Placement until then too. This is a potential bug that keeps a
# device that looks available even though it should not be, as the
# device is no longer in the device spec.
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
# The instance is put into ERROR state.
server = self.api.get_server(server['id'])
self.assertEqual(server['status'], 'ERROR')
self.stdlog.delete_stored_logs()
self.restart_compute_service(hostname="compute1")
self._run_periodics()
# The next compute restart does not trigger the PCI warning
self.assertNotIn(
"WARNING [nova.compute.pci_placement_translator]",
self.stdlog.logger.output)
# And the device is now removed from Placement
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
# And the allocation is not removed.
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
@@ -1113,16 +1068,7 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
client.OpenStackApiException, self._delete_server, server)
self.assertIn("Unexpected API Error. Please report this", str(ex))
# The deletion triggers a warning as well, suggesting we have a bug.
self.assertIn(
"WARNING [nova.compute.pci_placement_translator] "
"Device spec is not found for device 0000:81:00.2 in "
"[pci]device_spec. Ignoring device in Placement resource view. "
"This should not happen. Please file a bug",
self.stdlog.logger.output
)
# and the same RP deletion error as before.
# We have the same RP deletion error as before.
self.assertRegex(
self.stdlog.logger.output,
"ERROR .nova.scheduler.client.report..*Failed to delete "

View File

@@ -516,9 +516,9 @@ class PciDevTrackerTestCase(test.NoDBTestCase):
0,
len([dev for dev in self.tracker.pci_devs
if dev.status == fields.PciDeviceStatus.REMOVED]))
# free the device that was allocated and update tracker again
# free the device that was allocated
self.tracker._free_device(claimed_dev)
self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
# and assert that one device is removed from the tracker
self.assertEqual(
1,