Merge "[PCI tracker]Remove non configured devs when freed" into stable/2025.1
This commit is contained in:
@@ -69,6 +69,7 @@ class PciDevTracker(object):
|
||||
tracking.
|
||||
"""
|
||||
self.stale: ty.Dict[str, objects.PciDevice] = {}
|
||||
self.to_be_removed_when_freed: ty.Dict[str, objects.PciDevice] = {}
|
||||
self.node_id: str = compute_node.id
|
||||
self.dev_filter = whitelist.Whitelist(CONF.pci.device_spec)
|
||||
numa_topology = compute_node.numa_topology
|
||||
@@ -256,6 +257,12 @@ class PciDevTracker(object):
|
||||
# device to a second vm. To prevent this bug we skip
|
||||
# deleting the device from the db in this iteration and
|
||||
# will try again on the next sync.
|
||||
# NOTE(gibi): We keep a list of these devices in memory
|
||||
# so that when the VM using the device is deleted then
|
||||
# the tracker can not just free the device but also
|
||||
# mark them for removal. This will prevent a bug where
|
||||
# such a freed device is re-allocated before removed.
|
||||
self.to_be_removed_when_freed[existed.address] = existed
|
||||
continue
|
||||
else:
|
||||
# Note(yjiang5): no need to update stats if an assigned
|
||||
@@ -397,8 +404,15 @@ class PciDevTracker(object):
|
||||
stale = self.stale.pop(dev.address, None)
|
||||
if stale:
|
||||
dev.update_device(stale)
|
||||
for dev in freed_devs:
|
||||
self.stats.add_device(dev)
|
||||
|
||||
to_be_removed = self.to_be_removed_when_freed.pop(dev.address, None)
|
||||
if to_be_removed:
|
||||
dev.remove()
|
||||
if dev in self.stats.get_free_devs():
|
||||
self.stats.remove_device(dev)
|
||||
else:
|
||||
for dev in freed_devs:
|
||||
self.stats.add_device(dev)
|
||||
|
||||
def free_instance_allocations(
|
||||
self, context: ctx.RequestContext, instance: 'objects.Instance',
|
||||
|
||||
@@ -892,48 +892,26 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
|
||||
)
|
||||
|
||||
self.stdlog.delete_stored_logs()
|
||||
# Delete the server as the warning suggests
|
||||
self._delete_server(server)
|
||||
# Delete the server as the warning suggests. Unfortunately the deletion
|
||||
# fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
|
||||
ex = self.assertRaises(
|
||||
client.OpenStackApiException, self._delete_server, server)
|
||||
self.assertIn("Unexpected API Error. Please report this", str(ex))
|
||||
|
||||
# The deletion triggers a warning suggesting we have a bug. Indeed,
|
||||
# this is part of https://bugs.launchpad.net/nova/+bug/2115905
|
||||
self.assertIn(
|
||||
"WARNING [nova.compute.pci_placement_translator] "
|
||||
"Device spec is not found for device 0000:81:00.0 in "
|
||||
"[pci]device_spec. Ignoring device in Placement resource view. "
|
||||
"This should not happen. Please file a bug",
|
||||
self.stdlog.logger.output
|
||||
)
|
||||
# The sever delete fails as nova tries to delete the RP while it still
|
||||
# has allocations.
|
||||
self.assertRegex(
|
||||
self.stdlog.logger.output,
|
||||
"ERROR .nova.scheduler.client.report..*Failed to delete "
|
||||
"resource provider with UUID.*from the placement API. "
|
||||
"Got 409.*Unable to delete resource provider.*Resource "
|
||||
"provider has allocations.")
|
||||
|
||||
# The allocation successfully removed
|
||||
compute1_expected_placement_view["usages"] = {
|
||||
"0000:81:00.0": {
|
||||
self.PF_RC: 0,
|
||||
}
|
||||
}
|
||||
compute1_expected_placement_view["allocations"].pop(server["id"])
|
||||
# However the RP and the inventory are not removed from Placement
|
||||
# due to pci tracker caching. The PciDevice remains in the DB until
|
||||
# the next nova-compute restart and therefore the RP remains in
|
||||
# Placement until too. This is a potential bug that keeps a device
|
||||
# that seems to be available, but it should not as the device is not
|
||||
# in the device spec anymore.
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
# The instance is put into ERROR state.
|
||||
server = self.api.get_server(server['id'])
|
||||
self.assertEqual(server['status'], 'ERROR')
|
||||
|
||||
self.stdlog.delete_stored_logs()
|
||||
self.restart_compute_service(hostname="compute1")
|
||||
self._run_periodics()
|
||||
|
||||
# The next compute restart not trigger PCI warning
|
||||
self.assertNotIn(
|
||||
"WARNING [nova.compute.pci_placement_translator]",
|
||||
self.stdlog.logger.output)
|
||||
|
||||
# And the device is now removed from Placement
|
||||
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
|
||||
# And the allocation is not removed.
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
|
||||
@@ -1022,49 +1000,26 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
|
||||
self.stdlog.logger.output,
|
||||
)
|
||||
|
||||
self.stdlog.delete_stored_logs()
|
||||
# Delete the server as the warning suggests
|
||||
self._delete_server(server)
|
||||
# Delete the server as the warning suggests. Unfortunately the deletion
|
||||
# fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
|
||||
ex = self.assertRaises(
|
||||
client.OpenStackApiException, self._delete_server, server)
|
||||
self.assertIn("Unexpected API Error. Please report this", str(ex))
|
||||
|
||||
# The deletion triggers a warning suggesting we have a bug. Indeed,
|
||||
# this is part of https://bugs.launchpad.net/nova/+bug/2115905
|
||||
self.assertIn(
|
||||
"WARNING [nova.compute.pci_placement_translator] "
|
||||
"Device spec is not found for device 0000:81:00.1 in "
|
||||
"[pci]device_spec. Ignoring device in Placement resource view. "
|
||||
"This should not happen. Please file a bug",
|
||||
self.stdlog.logger.output
|
||||
)
|
||||
# The sever delete fails as nova tries to delete the RP while it still
|
||||
# has allocations.
|
||||
self.assertRegex(
|
||||
self.stdlog.logger.output,
|
||||
"ERROR .nova.scheduler.client.report..*Failed to delete "
|
||||
"resource provider with UUID.*from the placement API. "
|
||||
"Got 409.*Unable to delete resource provider.*Resource "
|
||||
"provider has allocations.")
|
||||
|
||||
# The allocation successfully removed
|
||||
compute1_expected_placement_view["usages"] = {
|
||||
"0000:81:00.0": {
|
||||
self.VF_RC: 0,
|
||||
}
|
||||
}
|
||||
compute1_expected_placement_view["allocations"].pop(server["id"])
|
||||
# However the RP and the inventory are not removed from Placement
|
||||
# due to pci tracker caching. The PciDevice remains in the DB until
|
||||
# the next nova-compute restart and therefore the RP remains in
|
||||
# Placement until too. This is a potential bug that keeps a device
|
||||
# that seems to be available, but it should not as the device is not
|
||||
# in the device spec anymore.
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
# The instance is put into ERROR state.
|
||||
server = self.api.get_server(server['id'])
|
||||
self.assertEqual(server['status'], 'ERROR')
|
||||
|
||||
self.stdlog.delete_stored_logs()
|
||||
self.restart_compute_service(hostname="compute1")
|
||||
self._run_periodics()
|
||||
|
||||
# The next compute restart not trigger PCI warning
|
||||
self.assertNotIn(
|
||||
"WARNING [nova.compute.pci_placement_translator]",
|
||||
self.stdlog.logger.output)
|
||||
|
||||
# And the device is now removed from Placement
|
||||
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
|
||||
# And the allocation is not removed.
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
|
||||
@@ -1113,16 +1068,7 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
|
||||
client.OpenStackApiException, self._delete_server, server)
|
||||
self.assertIn("Unexpected API Error. Please report this", str(ex))
|
||||
|
||||
# The deletion triggers a warning as well suggesting we have a bug.
|
||||
self.assertIn(
|
||||
"WARNING [nova.compute.pci_placement_translator] "
|
||||
"Device spec is not found for device 0000:81:00.2 in "
|
||||
"[pci]device_spec. Ignoring device in Placement resource view. "
|
||||
"This should not happen. Please file a bug",
|
||||
self.stdlog.logger.output
|
||||
)
|
||||
|
||||
# and the same RP deletion error as before.
|
||||
# We have the same RP deletion error as before.
|
||||
self.assertRegex(
|
||||
self.stdlog.logger.output,
|
||||
"ERROR .nova.scheduler.client.report..*Failed to delete "
|
||||
|
||||
@@ -516,9 +516,9 @@ class PciDevTrackerTestCase(test.NoDBTestCase):
|
||||
0,
|
||||
len([dev for dev in self.tracker.pci_devs
|
||||
if dev.status == fields.PciDeviceStatus.REMOVED]))
|
||||
# free the device that was allocated and update tracker again
|
||||
|
||||
# free the device that was allocated
|
||||
self.tracker._free_device(claimed_dev)
|
||||
self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
|
||||
# and assert that one device is removed from the tracker
|
||||
self.assertEqual(
|
||||
1,
|
||||
|
||||
Reference in New Issue
Block a user