Handle PCI dev reconf with allocations

PCI devices which are allocated to instances can be removed from the
[pci]device_spec configuration or can be removed from the hypervisor
directly. The existing PciTracker code handles these cases by keeping the
PciDevice in the nova DB as existing and allocated, and by issuing a warning in the
logs during the compute service startup that nova is in an inconsistent
state. Similar behavior is now added to the PCI placement tracking code
so the PCI inventories and allocations in placement are kept in such
situations.

There is one case where we cannot simply accept the PCI device
reconfiguration by keeping the existing allocations and applying the new
config. It is when a PF that is configured and allocated is removed and
VFs from this PF are now configured in the [pci]device_spec, and vice
versa when VFs are removed and their parent PF is configured. In this case
keeping the existing inventory and allocations and adding the new inventory
to placement would result in placement model where a single PCI device
would provide both PF and VF inventories. This dependent device
configuration is not supported as it could lead to double consumption.
In such situation the compute service will refuse to start.

blueprint: pci-device-tracking-in-placement
Change-Id: Id130893de650cc2d38953cea7cf9f53af71ced93
This commit is contained in:
Balazs Gibizer 2022-08-08 12:42:56 +02:00
parent ab439dadb1
commit 9268bc36a3
7 changed files with 281 additions and 42 deletions

View File

@ -404,4 +404,20 @@ is upgraded. If there is an in-progress migration, then the PCI allocation on
the source host of the migration will not be healed. The placement view will be
consistent after such migration is completed or reverted.
Reconfiguring the PCI devices on the hypervisor or changing the
:oslo.config:option:`pci.device_spec` configuration option and restarting the
nova-compute service is supported in the following cases:
* new devices are added
* devices without allocations are removed
Removing a device that has allocations is not supported. If a device with any
allocation is removed then the nova-compute service will keep the device and
the allocation in the nova DB and in placement, and will log a warning. If
a device with any allocation is reconfigured in a way that an allocated PF is
removed and VFs from the same PF are configured (or vice versa) then
nova-compute will refuse to start as it would create a situation where both
the PF and its VFs are made available for consumption.
For deeper technical details please read the `nova specification <https://specs.openstack.org/openstack/nova-specs/specs/zed/approved/pci-device-tracking-in-placement.html>`_.

View File

@ -443,15 +443,33 @@ class PlacementView:
fields.PciDeviceStatus.DELETED,
fields.PciDeviceStatus.REMOVED,
):
# If the PCI tracker marked the device DELETED or REMOVED then
# such device is not allocated, so we are free to drop it from
# placement too.
self._remove_dev(dev)
else:
if not dev_spec:
LOG.warning(
"Device spec is not found for device %s in "
"[pci]device_spec. Ignoring device in Placement resource "
"view. This should not happen. Please file a bug.",
dev.address
)
if dev.instance_uuid:
LOG.warning(
"Device spec is not found for device %s in "
"[pci]device_spec. We are skipping this devices "
"during Placement update. The device is allocated by "
"%s. You should not remove an allocated device from "
"the configuration. Please restore the configuration "
"or cold migrate the instance to resolve the "
"inconsistency.",
dev.address,
dev.instance_uuid
)
else:
LOG.warning(
"Device spec is not found for device %s in "
"[pci]device_spec. Ignoring device in Placement "
"resource view. This should not happen. Please file a "
"bug.",
dev.address
)
return
self._add_dev(dev, dev_spec.get_tags())

View File

@ -1295,14 +1295,22 @@ class ResourceTracker(object):
# This merges in changes from the provider config files loaded in init
self._merge_provider_configs(self.provider_configs, prov_tree)
# Flush any changes. If we either processed ReshapeNeeded above or
# update_provider_tree_for_pci did reshape, then we need to pass allocs
# to update_from_provider_tree to hit placement's POST /reshaper route.
self.reportclient.update_from_provider_tree(
context,
prov_tree,
allocations=allocs if driver_reshaped or pci_reshaped else None
)
try:
# Flush any changes. If we either processed ReshapeNeeded above or
# update_provider_tree_for_pci did reshape, then we need to pass
# allocs to update_from_provider_tree to hit placement's POST
# /reshaper route.
self.reportclient.update_from_provider_tree(
context,
prov_tree,
allocations=allocs if driver_reshaped or pci_reshaped else None
)
except exception.InventoryInUse as e:
# This means an inventory reconfiguration (e.g.: removing a parent
# PF and adding a VF under that parent) was not possible due to
# existing allocations. Translate the exception to prevent the
# compute service to start
raise exception.PlacementPciException(error=str(e))
def _update(self, context, compute_node, startup=False):
"""Update partial stats locally and populate them to Scheduler."""

View File

@ -1373,7 +1373,6 @@ class SchedulerReportClient(object):
# can inherit.
helper_exceptions = (
exception.InvalidResourceClass,
exception.InventoryInUse,
exception.ResourceProviderAggregateRetrievalFailed,
exception.ResourceProviderDeletionFailed,
exception.ResourceProviderInUse,

View File

@ -36,7 +36,28 @@ class PlacementPCIReportingTests(test_pci_sriov_servers._PCIServersTestBase):
# Just placeholders to satisfy the base class. The real value will be
# redefined by the tests
PCI_DEVICE_SPEC = []
PCI_ALIAS = None
# PCI aliases matching the PCI, PF and VF devices emulated by fakelibvirt
# so that tests can request such devices via flavor extra_specs, e.g.
# "pci_passthrough:alias": "a-pf:1".
PCI_ALIAS = [
    jsonutils.dumps(x)
    for x in (
        {
            "vendor_id": fakelibvirt.PCI_VEND_ID,
            "product_id": fakelibvirt.PCI_PROD_ID,
            "name": "a-pci-dev",
        },
        {
            "vendor_id": fakelibvirt.PCI_VEND_ID,
            "product_id": fakelibvirt.PF_PROD_ID,
            "device_type": "type-PF",
            "name": "a-pf",
        },
        {
            "vendor_id": fakelibvirt.PCI_VEND_ID,
            "product_id": fakelibvirt.VF_PROD_ID,
            "device_type": "type-VF",
            "name": "a-vf",
        },
    )
]
def setUp(self):
super().setUp()
@ -681,6 +702,179 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
},
)
def _create_one_compute_with_a_pf_consumed_by_an_instance(self):
    """Start a compute with a single whitelisted PF and boot a server
    consuming that PF.

    :returns: a tuple of the created server and the expected placement
        PCI view dict of the compute, so callers can assert against it
        after reconfiguring the compute.
    """
    # The fake libvirt will emulate on the host:
    # * one type-PF in slot 0, with one type-VF under it
    #   (NOTE: only the PF is whitelisted below, the VF is ignored)
    pci_info = fakelibvirt.HostPCIDevicesInfo(
        num_pci=0, num_pfs=1, num_vfs=1)
    # we match the PF only and ignore the VF
    device_spec = self._to_device_spec_conf(
        [
            {
                "vendor_id": fakelibvirt.PCI_VEND_ID,
                "product_id": fakelibvirt.PF_PROD_ID,
                "address": "0000:81:00.0",
            },
        ]
    )
    self.flags(group='pci', device_spec=device_spec)
    self.mock_pci_report_in_placement.return_value = True
    self.start_compute(hostname="compute1", pci_info=pci_info)

    self.assertPCIDeviceCounts("compute1", total=1, free=1)
    compute1_expected_placement_view = {
        "inventories": {
            "0000:81:00.0": {self.PF_RC: 1},
        },
        "traits": {
            "0000:81:00.0": [],
        },
        "usages": {
            "0000:81:00.0": {self.PF_RC: 0},
        },
        "allocations": {},
    }
    self.assert_placement_pci_view(
        "compute1", **compute1_expected_placement_view)

    # Create an instance consuming the PF
    extra_spec = {"pci_passthrough:alias": "a-pf:1"}
    flavor_id = self._create_flavor(extra_spec=extra_spec)
    server = self._create_server(flavor_id=flavor_id, networks=[])
    self.assertPCIDeviceCounts("compute1", total=1, free=0)
    compute1_expected_placement_view["usages"] = {
        "0000:81:00.0": {self.PF_RC: 1},
    }
    compute1_expected_placement_view["allocations"][server["id"]] = {
        "0000:81:00.0": {self.PF_RC: 1},
    }
    self.assert_placement_pci_view(
        "compute1", **compute1_expected_placement_view)
    self._run_periodics()
    self.assert_placement_pci_view(
        "compute1", **compute1_expected_placement_view)

    return server, compute1_expected_placement_view
def test_device_reconfiguration_with_allocations_config_change_warn(self):
    """Removing an allocated PF from [pci]device_spec is accepted with
    warnings; the device, its placement RP, and the allocation are kept
    intact.
    """
    server, compute1_expected_placement_view = (
        self._create_one_compute_with_a_pf_consumed_by_an_instance())

    # remove 0000:81:00.0 from the device spec and restart the compute
    device_spec = self._to_device_spec_conf([])
    self.flags(group='pci', device_spec=device_spec)
    # The PF is used but removed from the config. The PciTracker warns
    # but keeps the device, so the placement logic mimics this and only
    # warns but keeps the RP and the allocation in placement intact.
    self.restart_compute_service(hostname="compute1")
    self.assert_placement_pci_view(
        "compute1", **compute1_expected_placement_view)
    self._run_periodics()
    self.assert_placement_pci_view(
        "compute1", **compute1_expected_placement_view)

    # the warning from the PciTracker
    self.assertIn(
        "WARNING [nova.pci.manager] Unable to remove device with status "
        "'allocated' and ownership %s because of PCI device "
        "1:0000:81:00.0 is allocated instead of ['available', "
        "'unavailable', 'unclaimable']. Check your [pci]device_spec "
        "configuration to make sure this allocated device is whitelisted. "
        "If you have removed the device from the whitelist intentionally "
        "or the device is no longer available on the host you will need "
        "to delete the server or migrate it to another host to silence "
        "this warning."
        % server['id'],
        self.stdlog.logger.output,
    )
    # the warning from the placement PCI tracking logic
    self.assertIn(
        "WARNING [nova.compute.pci_placement_translator] Device spec is "
        "not found for device 0000:81:00.0 in [pci]device_spec. We are "
        "skipping this devices during Placement update. The device is "
        "allocated by %s. You should not remove an allocated device from "
        "the configuration. Please restore the configuration or cold "
        "migrate the instance to resolve the inconsistency."
        % server['id'],
        self.stdlog.logger.output,
    )
def test_device_reconfiguration_with_allocations_config_change_stop(self):
    """Switching the whitelist from an allocated PF to one of its VFs
    makes the compute service refuse to start.
    """
    self._create_one_compute_with_a_pf_consumed_by_an_instance()

    # switch 0000:81:00.0 PF to 0000:81:00.1 VF
    # in the config, then restart the compute service

    # only match the VF now
    device_spec = self._to_device_spec_conf(
        [
            {
                "vendor_id": fakelibvirt.PCI_VEND_ID,
                "product_id": fakelibvirt.VF_PROD_ID,
                "address": "0000:81:00.1",
            },
        ]
    )
    self.flags(group='pci', device_spec=device_spec)

    # The compute fails to start as the new config would mean that the
    # PF inventory is removed from the 0000:81:00.0 RP and the VF
    # inventory is added there instead, but the PF inventory has
    # allocations. Keeping the old inventory as in
    # test_device_reconfiguration_with_allocations_config_change_warn is
    # not an option as it would result in two resource classes on the
    # same RP, one for the PF and one for the VF. That would allow
    # consuming the same physical device twice. Such dependent device
    # configuration is intentionally not supported so we are stopping
    # the compute service.
    ex = self.assertRaises(
        exception.PlacementPciException,
        self.restart_compute_service,
        hostname="compute1"
    )
    self.assertRegex(
        str(ex),
        "Failed to gather or report PCI resources to Placement: There was "
        "a conflict when trying to complete your request.\n\n "
        "update conflict: Inventory for 'CUSTOM_PCI_8086_1528' on "
        "resource provider '.*' in use.",
    )
def test_device_reconfiguration_with_allocations_hyp_change(self):
    """Removing an allocated PF from the hypervisor while the device
    spec is left intact only warns; the placement view and allocation
    are kept.
    """
    server, compute1_expected_placement_view = (
        self._create_one_compute_with_a_pf_consumed_by_an_instance())

    # restart the compute but simulate that the device 0000:81:00.0 is
    # removed from the hypervisor while the device spec config left
    # intact. The PciTracker will notice this and log a warning. The
    # placement tracking logic simply keeps the allocation intact in
    # placement as both the PciDevice and the DeviceSpec is available.
    pci_info = fakelibvirt.HostPCIDevicesInfo(
        num_pci=0, num_pfs=0, num_vfs=0)
    self.restart_compute_service(
        hostname="compute1",
        pci_info=pci_info,
        keep_hypervisor_state=False
    )
    self.assert_placement_pci_view(
        "compute1", **compute1_expected_placement_view)
    self._run_periodics()
    self.assert_placement_pci_view(
        "compute1", **compute1_expected_placement_view)

    # the warning from the PciTracker
    self.assertIn(
        "WARNING [nova.pci.manager] Unable to remove device with status "
        "'allocated' and ownership %s because of PCI device "
        "1:0000:81:00.0 is allocated instead of ['available', "
        "'unavailable', 'unclaimable']. Check your [pci]device_spec "
        "configuration to make sure this allocated device is whitelisted. "
        "If you have removed the device from the whitelist intentionally "
        "or the device is no longer available on the host you will need "
        "to delete the server or migrate it to another host to silence "
        "this warning."
        % server['id'],
        self.stdlog.logger.output,
    )
def test_reporting_disabled_nothing_is_reported(self):
# The fake libvirt will emulate on the host:
# * one type-PCI in slot 0
@ -765,32 +959,6 @@ class PlacementPCIAllocationHealingTests(PlacementPCIReportingTests):
)
)
# Pre-configure a PCI alias to consume our devs
alias_pci = {
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PCI_PROD_ID,
"name": "a-pci-dev",
}
alias_pf = {
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PF_PROD_ID,
"device_type": "type-PF",
"name": "a-pf",
}
alias_vf = {
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.VF_PROD_ID,
"device_type": "type-VF",
"name": "a-vf",
}
self.flags(
group='pci',
alias=self._to_pci_alias_conf([alias_pci, alias_pf, alias_vf]))
@staticmethod
def _to_pci_alias_conf(alias_list):
return [jsonutils.dumps(x) for x in alias_list]
@staticmethod
def _move_allocation(allocations, from_uuid, to_uuid):
allocations[to_uuid] = allocations[from_uuid]

View File

@ -50,6 +50,7 @@ class TestTranslator(test.NoDBTestCase):
pci_device.PciDevice(
address="0000:81:00.0",
status=fields.PciDeviceStatus.AVAILABLE,
instance_uuid=None,
)
]
)

View File

@ -1960,6 +1960,35 @@ class TestUpdateComputeNode(BaseTestCase):
upt = self.rt.reportclient.update_from_provider_tree
upt.assert_called_once_with(mock.sentinel.ctx, ptree, allocations=None)
@mock.patch(
    'nova.compute.resource_tracker.ResourceTracker.'
    '_sync_compute_service_disabled_trait',
    new=mock.Mock()
)
@mock.patch(
    'nova.compute.resource_tracker.ResourceTracker._resource_change',
    new=mock.Mock(return_value=False)
)
def test_update_pci_reporting_allocation_in_use_error_propagated(self):
    """Ensure an InventoryInUse error from placement is translated.

    If the PCI placement reporting code tries to remove inventory that
    still has allocations (e.g. due to an invalid hypervisor or
    [pci]device_spec reconfiguration) then the InventoryInUse error is
    surfaced as PlacementPciException and the compute startup fails.
    """
    self._setup_rt()
    node = _COMPUTE_NODE_FIXTURES[0].obj_clone()
    # Simulate placement refusing the inventory update because the
    # inventory to be removed is still allocated.
    in_use = exc.InventoryInUse(
        resource_class="FOO", resource_provider="bar")
    self.rt.reportclient.update_from_provider_tree.side_effect = in_use
    self.assertRaises(
        exc.PlacementPciException,
        self.rt._update,
        mock.sentinel.ctx,
        node,
        startup=True,
    )
@mock.patch('nova.objects.Service.get_by_compute_host',
return_value=objects.Service(disabled=True))
def test_sync_compute_service_disabled_trait_add(self, mock_get_by_host):