Merge "[PCI tracker]Remove non configured devs when freed" into stable/2025.1

This commit is contained in:
Zuul
2025-10-10 17:54:46 +00:00
committed by Gerrit Code Review
3 changed files with 53 additions and 93 deletions

View File

@@ -69,6 +69,7 @@ class PciDevTracker(object):
tracking. tracking.
""" """
self.stale: ty.Dict[str, objects.PciDevice] = {} self.stale: ty.Dict[str, objects.PciDevice] = {}
self.to_be_removed_when_freed: ty.Dict[str, objects.PciDevice] = {}
self.node_id: str = compute_node.id self.node_id: str = compute_node.id
self.dev_filter = whitelist.Whitelist(CONF.pci.device_spec) self.dev_filter = whitelist.Whitelist(CONF.pci.device_spec)
numa_topology = compute_node.numa_topology numa_topology = compute_node.numa_topology
@@ -256,6 +257,12 @@ class PciDevTracker(object):
# device to a second vm. To prevent this bug we skip # device to a second vm. To prevent this bug we skip
# deleting the device from the db in this iteration and # deleting the device from the db in this iteration and
# will try again on the next sync. # will try again on the next sync.
# NOTE(gibi): We keep a list of these devices in memory
# so that when the VM using the device is deleted then
# the tracker can not only free the device but also
# mark it for removal. This will prevent a bug where
# such a freed device is re-allocated before removed.
self.to_be_removed_when_freed[existed.address] = existed
continue continue
else: else:
# Note(yjiang5): no need to update stats if an assigned # Note(yjiang5): no need to update stats if an assigned
@@ -397,8 +404,15 @@ class PciDevTracker(object):
stale = self.stale.pop(dev.address, None) stale = self.stale.pop(dev.address, None)
if stale: if stale:
dev.update_device(stale) dev.update_device(stale)
for dev in freed_devs:
self.stats.add_device(dev) to_be_removed = self.to_be_removed_when_freed.pop(dev.address, None)
if to_be_removed:
dev.remove()
if dev in self.stats.get_free_devs():
self.stats.remove_device(dev)
else:
for dev in freed_devs:
self.stats.add_device(dev)
def free_instance_allocations( def free_instance_allocations(
self, context: ctx.RequestContext, instance: 'objects.Instance', self, context: ctx.RequestContext, instance: 'objects.Instance',

View File

@@ -892,48 +892,26 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
) )
self.stdlog.delete_stored_logs() self.stdlog.delete_stored_logs()
# Delete the server as the warning suggests # Delete the server as the warning suggests. Unfortunately the deletion
self._delete_server(server) # fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
ex = self.assertRaises(
client.OpenStackApiException, self._delete_server, server)
self.assertIn("Unexpected API Error. Please report this", str(ex))
# The deletion triggers a warning suggesting we have a bug. Indeed, # The server delete fails as nova tries to delete the RP while it still
# this is part of https://bugs.launchpad.net/nova/+bug/2115905 # has allocations.
self.assertIn( self.assertRegex(
"WARNING [nova.compute.pci_placement_translator] " self.stdlog.logger.output,
"Device spec is not found for device 0000:81:00.0 in " "ERROR .nova.scheduler.client.report..*Failed to delete "
"[pci]device_spec. Ignoring device in Placement resource view. " "resource provider with UUID.*from the placement API. "
"This should not happen. Please file a bug", "Got 409.*Unable to delete resource provider.*Resource "
self.stdlog.logger.output "provider has allocations.")
)
# The allocation successfully removed # The instance is put into ERROR state.
compute1_expected_placement_view["usages"] = { server = self.api.get_server(server['id'])
"0000:81:00.0": { self.assertEqual(server['status'], 'ERROR')
self.PF_RC: 0,
}
}
compute1_expected_placement_view["allocations"].pop(server["id"])
# However the RP and the inventory are not removed from Placement
# due to pci tracker caching. The PciDevice remains in the DB until
# the next nova-compute restart and therefore the RP remains in
# Placement until then too. This is a potential bug that keeps a device
# that seems to be available, but it should not as the device is not
# in the device spec anymore.
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
self.stdlog.delete_stored_logs() # And the allocation is not removed.
self.restart_compute_service(hostname="compute1")
self._run_periodics()
# The next compute restart does not trigger a PCI warning
self.assertNotIn(
"WARNING [nova.compute.pci_placement_translator]",
self.stdlog.logger.output)
# And the device is now removed from Placement
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
self.assert_placement_pci_view( self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view) "compute1", **compute1_expected_placement_view)
@@ -1022,49 +1000,26 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
self.stdlog.logger.output, self.stdlog.logger.output,
) )
self.stdlog.delete_stored_logs() # Delete the server as the warning suggests. Unfortunately the deletion
# Delete the server as the warning suggests # fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
self._delete_server(server) ex = self.assertRaises(
client.OpenStackApiException, self._delete_server, server)
self.assertIn("Unexpected API Error. Please report this", str(ex))
# The deletion triggers a warning suggesting we have a bug. Indeed, # The server delete fails as nova tries to delete the RP while it still
# this is part of https://bugs.launchpad.net/nova/+bug/2115905 # has allocations.
self.assertIn( self.assertRegex(
"WARNING [nova.compute.pci_placement_translator] " self.stdlog.logger.output,
"Device spec is not found for device 0000:81:00.1 in " "ERROR .nova.scheduler.client.report..*Failed to delete "
"[pci]device_spec. Ignoring device in Placement resource view. " "resource provider with UUID.*from the placement API. "
"This should not happen. Please file a bug", "Got 409.*Unable to delete resource provider.*Resource "
self.stdlog.logger.output "provider has allocations.")
)
# The allocation successfully removed # The instance is put into ERROR state.
compute1_expected_placement_view["usages"] = { server = self.api.get_server(server['id'])
"0000:81:00.0": { self.assertEqual(server['status'], 'ERROR')
self.VF_RC: 0,
}
}
compute1_expected_placement_view["allocations"].pop(server["id"])
# However the RP and the inventory are not removed from Placement
# due to pci tracker caching. The PciDevice remains in the DB until
# the next nova-compute restart and therefore the RP remains in
# Placement until then too. This is a potential bug that keeps a device
# that seems to be available, but it should not as the device is not
# in the device spec anymore.
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
self.stdlog.delete_stored_logs() # And the allocation is not removed.
self.restart_compute_service(hostname="compute1")
self._run_periodics()
# The next compute restart does not trigger a PCI warning
self.assertNotIn(
"WARNING [nova.compute.pci_placement_translator]",
self.stdlog.logger.output)
# And the device is now removed from Placement
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
self.assert_placement_pci_view( self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view) "compute1", **compute1_expected_placement_view)
@@ -1113,16 +1068,7 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
client.OpenStackApiException, self._delete_server, server) client.OpenStackApiException, self._delete_server, server)
self.assertIn("Unexpected API Error. Please report this", str(ex)) self.assertIn("Unexpected API Error. Please report this", str(ex))
# The deletion triggers a warning as well suggesting we have a bug. # We have the same RP deletion error as before.
self.assertIn(
"WARNING [nova.compute.pci_placement_translator] "
"Device spec is not found for device 0000:81:00.2 in "
"[pci]device_spec. Ignoring device in Placement resource view. "
"This should not happen. Please file a bug",
self.stdlog.logger.output
)
# and the same RP deletion error as before.
self.assertRegex( self.assertRegex(
self.stdlog.logger.output, self.stdlog.logger.output,
"ERROR .nova.scheduler.client.report..*Failed to delete " "ERROR .nova.scheduler.client.report..*Failed to delete "

View File

@@ -516,9 +516,9 @@ class PciDevTrackerTestCase(test.NoDBTestCase):
0, 0,
len([dev for dev in self.tracker.pci_devs len([dev for dev in self.tracker.pci_devs
if dev.status == fields.PciDeviceStatus.REMOVED])) if dev.status == fields.PciDeviceStatus.REMOVED]))
# free the device that was allocated and update tracker again
# free the device that was allocated
self.tracker._free_device(claimed_dev) self.tracker._free_device(claimed_dev)
self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
# and assert that one device is removed from the tracker # and assert that one device is removed from the tracker
self.assertEqual( self.assertEqual(
1, 1,