diff --git a/nova/pci/manager.py b/nova/pci/manager.py index 2cbce90516d6..9efa98a01623 100644 --- a/nova/pci/manager.py +++ b/nova/pci/manager.py @@ -69,6 +69,7 @@ class PciDevTracker(object): tracking. """ self.stale: ty.Dict[str, objects.PciDevice] = {} + self.to_be_removed_when_freed: ty.Dict[str, objects.PciDevice] = {} self.node_id: str = compute_node.id self.dev_filter = whitelist.Whitelist(CONF.pci.device_spec) numa_topology = compute_node.numa_topology @@ -256,6 +257,12 @@ class PciDevTracker(object): # device to a second vm. To prevent this bug we skip # deleting the device from the db in this iteration and # will try again on the next sync. + # NOTE(gibi): We keep a list of these devices in memory + # so that when the VM using the device is deleted then + # the tracker can not just free the device but also + # mark them for removal. This will prevent a bug where + # such a freed device is re-allocated before removed. + self.to_be_removed_when_freed[existed.address] = existed continue else: # Note(yjiang5): no need to update stats if an assigned @@ -397,8 +404,15 @@ class PciDevTracker(object): stale = self.stale.pop(dev.address, None) if stale: dev.update_device(stale) - for dev in freed_devs: - self.stats.add_device(dev) + + to_be_removed = self.to_be_removed_when_freed.pop(dev.address, None) + if to_be_removed: + dev.remove() + if dev in self.stats.get_free_devs(): + self.stats.remove_device(dev) + else: + for dev in freed_devs: + self.stats.add_device(dev) def free_instance_allocations( self, context: ctx.RequestContext, instance: 'objects.Instance', diff --git a/nova/tests/functional/libvirt/test_pci_in_placement.py b/nova/tests/functional/libvirt/test_pci_in_placement.py index 7eb804aef199..cfa8b2a1af22 100644 --- a/nova/tests/functional/libvirt/test_pci_in_placement.py +++ b/nova/tests/functional/libvirt/test_pci_in_placement.py @@ -892,48 +892,26 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests): ) self.stdlog.delete_stored_logs() - # Delete the server as the warning suggests - self._delete_server(server) + # Delete the server as the warning suggests. Unfortunately the deletion + # fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905 + ex = self.assertRaises( + client.OpenStackApiException, self._delete_server, server) + self.assertIn("Unexpected API Error. Please report this", str(ex)) - # The deletion triggers a warning suggesting we have a bug. Indeed, - # this is part of https://bugs.launchpad.net/nova/+bug/2115905 - self.assertIn( - "WARNING [nova.compute.pci_placement_translator] " - "Device spec is not found for device 0000:81:00.0 in " - "[pci]device_spec. Ignoring device in Placement resource view. " - "This should not happen. Please file a bug", - self.stdlog.logger.output - ) + # The sever delete fails as nova tries to delete the RP while it still + # has allocations. + self.assertRegex( + self.stdlog.logger.output, + "ERROR .nova.scheduler.client.report..*Failed to delete " + "resource provider with UUID.*from the placement API. " + "Got 409.*Unable to delete resource provider.*Resource " + "provider has allocations.") - # The allocation successfully removed - compute1_expected_placement_view["usages"] = { - "0000:81:00.0": { - self.PF_RC: 0, - } - } - compute1_expected_placement_view["allocations"].pop(server["id"]) - # However the RP and the inventory are not removed from Placement - # due to pci tracker caching. The PciDevice remains in the DB until - # the next nova-compute restart and therefore the RP remains in - # Placement until too. This is a potential bug that keeps a device - # that seems to be available, but it should not as the device is not - # in the device spec anymore. - self.assert_placement_pci_view( - "compute1", **compute1_expected_placement_view) + # The instance is put into ERROR state. + server = self.api.get_server(server['id']) + self.assertEqual(server['status'], 'ERROR') - self.stdlog.delete_stored_logs() - self.restart_compute_service(hostname="compute1") - self._run_periodics() - - # The next compute restart not trigger PCI warning - self.assertNotIn( - "WARNING [nova.compute.pci_placement_translator]", - self.stdlog.logger.output) - - # And the device is now removed from Placement - compute1_expected_placement_view["inventories"].pop("0000:81:00.0") - compute1_expected_placement_view["traits"].pop("0000:81:00.0") - compute1_expected_placement_view["usages"].pop("0000:81:00.0") + # And the allocation is not removed. self.assert_placement_pci_view( "compute1", **compute1_expected_placement_view) @@ -1022,49 +1000,26 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests): self.stdlog.logger.output, ) - self.stdlog.delete_stored_logs() - # Delete the server as the warning suggests - self._delete_server(server) + # Delete the server as the warning suggests. Unfortunately the deletion + # fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905 + ex = self.assertRaises( + client.OpenStackApiException, self._delete_server, server) + self.assertIn("Unexpected API Error. Please report this", str(ex)) - # The deletion triggers a warning suggesting we have a bug. Indeed, - # this is part of https://bugs.launchpad.net/nova/+bug/2115905 - self.assertIn( - "WARNING [nova.compute.pci_placement_translator] " - "Device spec is not found for device 0000:81:00.1 in " - "[pci]device_spec. Ignoring device in Placement resource view. " - "This should not happen. Please file a bug", - self.stdlog.logger.output - ) + # The sever delete fails as nova tries to delete the RP while it still + # has allocations. + self.assertRegex( + self.stdlog.logger.output, + "ERROR .nova.scheduler.client.report..*Failed to delete " + "resource provider with UUID.*from the placement API. " + "Got 409.*Unable to delete resource provider.*Resource " + "provider has allocations.") - # The allocation successfully removed - compute1_expected_placement_view["usages"] = { - "0000:81:00.0": { - self.VF_RC: 0, - } - } - compute1_expected_placement_view["allocations"].pop(server["id"]) - # However the RP and the inventory are not removed from Placement - # due to pci tracker caching. The PciDevice remains in the DB until - # the next nova-compute restart and therefore the RP remains in - # Placement until too. This is a potential bug that keeps a device - # that seems to be available, but it should not as the device is not - # in the device spec anymore. - self.assert_placement_pci_view( - "compute1", **compute1_expected_placement_view) + # The instance is put into ERROR state. + server = self.api.get_server(server['id']) + self.assertEqual(server['status'], 'ERROR') - self.stdlog.delete_stored_logs() - self.restart_compute_service(hostname="compute1") - self._run_periodics() - - # The next compute restart not trigger PCI warning - self.assertNotIn( - "WARNING [nova.compute.pci_placement_translator]", - self.stdlog.logger.output) - - # And the device is now removed from Placement - compute1_expected_placement_view["inventories"].pop("0000:81:00.0") - compute1_expected_placement_view["traits"].pop("0000:81:00.0") - compute1_expected_placement_view["usages"].pop("0000:81:00.0") + # And the allocation is not removed. self.assert_placement_pci_view( "compute1", **compute1_expected_placement_view) @@ -1113,16 +1068,7 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests): client.OpenStackApiException, self._delete_server, server) self.assertIn("Unexpected API Error. Please report this", str(ex)) - # The deletion triggers a warning as well suggesting we have a bug. - self.assertIn( - "WARNING [nova.compute.pci_placement_translator] " - "Device spec is not found for device 0000:81:00.2 in " - "[pci]device_spec. Ignoring device in Placement resource view. " - "This should not happen. Please file a bug", - self.stdlog.logger.output - ) - - # and the same RP deletion error as before. + # We have the same RP deletion error as before. self.assertRegex( self.stdlog.logger.output, "ERROR .nova.scheduler.client.report..*Failed to delete " diff --git a/nova/tests/unit/pci/test_manager.py b/nova/tests/unit/pci/test_manager.py index 7826c3e72935..e880cdac25f0 100644 --- a/nova/tests/unit/pci/test_manager.py +++ b/nova/tests/unit/pci/test_manager.py @@ -516,9 +516,9 @@ class PciDevTrackerTestCase(test.NoDBTestCase): 0, len([dev for dev in self.tracker.pci_devs if dev.status == fields.PciDeviceStatus.REMOVED])) - # free the device that was allocated and update tracker again + + # free the device that was allocated self.tracker._free_device(claimed_dev) - self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs)) # and assert that one device is removed from the tracker self.assertEqual( 1,