Merge "Handle PCI dev reconf with allocations"

2022-09-01 18:21:45 +00:00 · 2022-09-01 18:21:45 +00:00 · 82498dfe4b
commit 82498dfe4b
parent 2925fd1427 9268bc36a3
7 changed files with 281 additions and 42 deletions
--- a/doc/source/admin/pci-passthrough.rst
+++ b/doc/source/admin/pci-passthrough.rst
@ -404,4 +404,20 @@ is upgraded. If there is an in-progress migration, then the PCI allocation on
 the source host of the migration will not be healed. The placement view will be
 consistent after such migration is completed or reverted.

+Reconfiguring the PCI devices on the hypervisor or changing the
+:oslo.config:option:`pci.device_spec` configuration option and restarting the
+nova-compute service is supported in the following cases:
+
+* new devices are added
+* devices without allocation is removed
+
+Removing a device that has allocations is not supported. If a device having any
+allocation is removed then the nova-compute service will keep the device and
+the allocation exists in the nova DB and in placement and logs a warning. If
+a device with any allocation is reconfigured in a way that an allocated PF is
+removed and VFs from the same PF is configured (or vice versa) then
+nova-compute will refuse to start as it would create a situation where both
+the PF and its VFs are made available for consumption.
+
+
 For deeper technical details please read the `nova specification. <https://specs.openstack.org/openstack/nova-specs/specs/zed/approved/pci-device-tracking-in-placement.html>`_
--- a/nova/compute/pci_placement_translator.py
+++ b/nova/compute/pci_placement_translator.py
@ -443,15 +443,33 @@ class PlacementView:
            fields.PciDeviceStatus.DELETED,
            fields.PciDeviceStatus.REMOVED,
        ):
+            # If the PCI tracker marked the device DELETED or REMOVED then
+            # such device is not allocated, so we are free to drop it from
+            # placement too.
            self._remove_dev(dev)
        else:
            if not dev_spec:
+                if dev.instance_uuid:
                    LOG.warning(
                        "Device spec is not found for device %s in "
-                    "[pci]device_spec. Ignoring device in Placement resource "
-                    "view. This should not happen. Please file a bug.",
+                        "[pci]device_spec. We are skipping this devices "
+                        "during Placement update. The device is allocated by "
+                        "%s. You should not remove an allocated device from "
+                        "the configuration. Please restore the configuration "
+                        "or cold migrate the instance to resolve the "
+                        "inconsistency.",
+                        dev.address,
+                        dev.instance_uuid
+                    )
+                else:
+                    LOG.warning(
+                        "Device spec is not found for device %s in "
+                        "[pci]device_spec. Ignoring device in Placement "
+                        "resource view. This should not happen. Please file a "
+                        "bug.",
                        dev.address
                    )
+
                return

            self._add_dev(dev, dev_spec.get_tags())
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@ -1295,14 +1295,22 @@ class ResourceTracker(object):
        # This merges in changes from the provider config files loaded in init
        self._merge_provider_configs(self.provider_configs, prov_tree)

+        try:
            # Flush any changes. If we either processed ReshapeNeeded above or
-        # update_provider_tree_for_pci did reshape, then we need to pass allocs
-        # to update_from_provider_tree to hit placement's POST /reshaper route.
+            # update_provider_tree_for_pci did reshape, then we need to pass
+            # allocs to update_from_provider_tree to hit placement's POST
+            # /reshaper route.
            self.reportclient.update_from_provider_tree(
                context,
                prov_tree,
                allocations=allocs if driver_reshaped or pci_reshaped else None
            )
+        except exception.InventoryInUse as e:
+            # This means an inventory reconfiguration (e.g.: removing a parent
+            # PF and adding a VF under that parent) was not possible due to
+            # existing allocations. Translate the exception to prevent the
+            # compute service to start
+            raise exception.PlacementPciException(error=str(e))

    def _update(self, context, compute_node, startup=False):
        """Update partial stats locally and populate them to Scheduler."""
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@ -1373,7 +1373,6 @@ class SchedulerReportClient(object):
            # can inherit.
            helper_exceptions = (
                exception.InvalidResourceClass,
-                exception.InventoryInUse,
                exception.ResourceProviderAggregateRetrievalFailed,
                exception.ResourceProviderDeletionFailed,
                exception.ResourceProviderInUse,
--- a/nova/tests/functional/libvirt/test_pci_in_placement.py
+++ b/nova/tests/functional/libvirt/test_pci_in_placement.py
@ -36,7 +36,28 @@ class PlacementPCIReportingTests(test_pci_sriov_servers._PCIServersTestBase):
    # Just placeholders to satisfy the base class. The real value will be
    # redefined by the tests
    PCI_DEVICE_SPEC = []
-    PCI_ALIAS = None
+    PCI_ALIAS = [
+        jsonutils.dumps(x)
+        for x in (
+            {
+                "vendor_id": fakelibvirt.PCI_VEND_ID,
+                "product_id": fakelibvirt.PCI_PROD_ID,
+                "name": "a-pci-dev",
+            },
+            {
+                "vendor_id": fakelibvirt.PCI_VEND_ID,
+                "product_id": fakelibvirt.PF_PROD_ID,
+                "device_type": "type-PF",
+                "name": "a-pf",
+            },
+            {
+                "vendor_id": fakelibvirt.PCI_VEND_ID,
+                "product_id": fakelibvirt.VF_PROD_ID,
+                "device_type": "type-VF",
+                "name": "a-vf",
+            },
+        )
+    ]

    def setUp(self):
        super().setUp()
@ -681,6 +702,179 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
            },
        )

+    def _create_one_compute_with_a_pf_consumed_by_an_instance(self):
+        # The fake libvirt will emulate on the host:
+        # * two type-PFs in slot 0, with one type-VF
+        pci_info = fakelibvirt.HostPCIDevicesInfo(
+            num_pci=0, num_pfs=1, num_vfs=1)
+        # we match the PF only and ignore the VF
+        device_spec = self._to_device_spec_conf(
+            [
+                {
+                    "vendor_id": fakelibvirt.PCI_VEND_ID,
+                    "product_id": fakelibvirt.PF_PROD_ID,
+                    "address": "0000:81:00.0",
+                },
+            ]
+        )
+        self.flags(group='pci', device_spec=device_spec)
+        self.mock_pci_report_in_placement.return_value = True
+        self.start_compute(hostname="compute1", pci_info=pci_info)
+
+        self.assertPCIDeviceCounts("compute1", total=1, free=1)
+        compute1_expected_placement_view = {
+            "inventories": {
+                "0000:81:00.0": {self.PF_RC: 1},
+            },
+            "traits": {
+                "0000:81:00.0": [],
+            },
+            "usages": {
+                "0000:81:00.0": {self.PF_RC: 0},
+            },
+            "allocations": {},
+        }
+        self.assert_placement_pci_view(
+            "compute1", **compute1_expected_placement_view)
+
+        # Create an instance consuming the PF
+        extra_spec = {"pci_passthrough:alias": "a-pf:1"}
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        server = self._create_server(flavor_id=flavor_id, networks=[])
+
+        self.assertPCIDeviceCounts("compute1", total=1, free=0)
+        compute1_expected_placement_view["usages"] = {
+            "0000:81:00.0": {self.PF_RC: 1},
+        }
+        compute1_expected_placement_view["allocations"][server["id"]] = {
+            "0000:81:00.0": {self.PF_RC: 1},
+        }
+        self.assert_placement_pci_view(
+            "compute1", **compute1_expected_placement_view)
+        self._run_periodics()
+        self.assert_placement_pci_view(
+            "compute1", **compute1_expected_placement_view)
+
+        return server, compute1_expected_placement_view
+
+    def test_device_reconfiguration_with_allocations_config_change_warn(self):
+        server, compute1_expected_placement_view = (
+            self._create_one_compute_with_a_pf_consumed_by_an_instance())
+
+        # remove 0000:81:00.0 from the device spec and restart the compute
+        device_spec = self._to_device_spec_conf([])
+        self.flags(group='pci', device_spec=device_spec)
+        # The PF is used but removed from the config. The PciTracker warns
+        # but keeps the device so the placement logic mimic this and only warns
+        # but keeps the RP and the allocation in placement intact.
+        self.restart_compute_service(hostname="compute1")
+        self.assert_placement_pci_view(
+            "compute1", **compute1_expected_placement_view)
+        self._run_periodics()
+        self.assert_placement_pci_view(
+            "compute1", **compute1_expected_placement_view)
+        # the warning from the PciTracker
+        self.assertIn(
+            "WARNING [nova.pci.manager] Unable to remove device with status "
+            "'allocated' and ownership %s because of PCI device "
+            "1:0000:81:00.0 is allocated instead of ['available', "
+            "'unavailable', 'unclaimable']. Check your [pci]device_spec "
+            "configuration to make sure this allocated device is whitelisted. "
+            "If you have removed the device from the whitelist intentionally "
+            "or the device is no longer available on the host you will need "
+            "to delete the server or migrate it to another host to silence "
+            "this warning."
+            % server['id'],
+            self.stdlog.logger.output,
+        )
+        # the warning from the placement PCI tracking logic
+        self.assertIn(
+            "WARNING [nova.compute.pci_placement_translator] Device spec is "
+            "not found for device 0000:81:00.0 in [pci]device_spec. We are "
+            "skipping this devices during Placement update. The device is "
+            "allocated by %s. You should not remove an allocated device from "
+            "the configuration. Please restore the configuration or cold "
+            "migrate the instance to resolve the inconsistency."
+            % server['id'],
+            self.stdlog.logger.output,
+        )
+
+    def test_device_reconfiguration_with_allocations_config_change_stop(self):
+        self._create_one_compute_with_a_pf_consumed_by_an_instance()
+
+        # switch 0000:81:00.0 PF to 0000:81:00.1 VF
+        # in the config, then restart the compute service
+
+        # only match the VF now
+        device_spec = self._to_device_spec_conf(
+            [
+                {
+                    "vendor_id": fakelibvirt.PCI_VEND_ID,
+                    "product_id": fakelibvirt.VF_PROD_ID,
+                    "address": "0000:81:00.1",
+                },
+            ]
+        )
+        self.flags(group='pci', device_spec=device_spec)
+        # The compute fails to start as the new config would mean that the PF
+        # inventory is removed from the 0000:81:00.0 RP and the PF inventory is
+        # added instead there, but the VF inventory has allocations. Keeping
+        # the old inventory as in
+        # test_device_reconfiguration_with_allocations_config_change_warn is
+        # not an option as it would result in two resource class on the same RP
+        # one for the PF and one for the VF. That would allow consuming
+        # the same physical device twice. Such dependent device configuration
+        # is intentionally not supported so we are stopping the compute
+        # service.
+        ex = self.assertRaises(
+            exception.PlacementPciException,
+            self.restart_compute_service,
+            hostname="compute1"
+        )
+        self.assertRegex(
+            str(ex),
+            "Failed to gather or report PCI resources to Placement: There was "
+            "a conflict when trying to complete your request.\n\n "
+            "update conflict: Inventory for 'CUSTOM_PCI_8086_1528' on "
+            "resource provider '.*' in use.",
+        )
+
+    def test_device_reconfiguration_with_allocations_hyp_change(self):
+        server, compute1_expected_placement_view = (
+            self._create_one_compute_with_a_pf_consumed_by_an_instance())
+
+        # restart the compute but simulate that the device 0000:81:00.0 is
+        # removed from the hypervisor while the device spec config left
+        # intact. The PciTracker will notice this and log a warning. The
+        # placement tracking logic simply keeps the allocation intact in
+        # placement as both the PciDevice and the DeviceSpec is available.
+        pci_info = fakelibvirt.HostPCIDevicesInfo(
+            num_pci=0, num_pfs=0, num_vfs=0)
+        self.restart_compute_service(
+            hostname="compute1",
+            pci_info=pci_info,
+            keep_hypervisor_state=False
+        )
+        self.assert_placement_pci_view(
+            "compute1", **compute1_expected_placement_view)
+        self._run_periodics()
+        self.assert_placement_pci_view(
+            "compute1", **compute1_expected_placement_view)
+        # the warning from the PciTracker
+        self.assertIn(
+            "WARNING [nova.pci.manager] Unable to remove device with status "
+            "'allocated' and ownership %s because of PCI device "
+            "1:0000:81:00.0 is allocated instead of ['available', "
+            "'unavailable', 'unclaimable']. Check your [pci]device_spec "
+            "configuration to make sure this allocated device is whitelisted. "
+            "If you have removed the device from the whitelist intentionally "
+            "or the device is no longer available on the host you will need "
+            "to delete the server or migrate it to another host to silence "
+            "this warning."
+            % server['id'],
+            self.stdlog.logger.output,
+        )
+
    def test_reporting_disabled_nothing_is_reported(self):
        # The fake libvirt will emulate on the host:
        # * one type-PCI in slot 0
@ -765,32 +959,6 @@ class PlacementPCIAllocationHealingTests(PlacementPCIReportingTests):
            )
        )

-        # Pre-configure a PCI alias to consume our devs
-        alias_pci = {
-            "vendor_id": fakelibvirt.PCI_VEND_ID,
-            "product_id": fakelibvirt.PCI_PROD_ID,
-            "name": "a-pci-dev",
-        }
-        alias_pf = {
-            "vendor_id": fakelibvirt.PCI_VEND_ID,
-            "product_id": fakelibvirt.PF_PROD_ID,
-            "device_type": "type-PF",
-            "name": "a-pf",
-        }
-        alias_vf = {
-            "vendor_id": fakelibvirt.PCI_VEND_ID,
-            "product_id": fakelibvirt.VF_PROD_ID,
-            "device_type": "type-VF",
-            "name": "a-vf",
-        }
-        self.flags(
-            group='pci',
-            alias=self._to_pci_alias_conf([alias_pci, alias_pf, alias_vf]))
-
-    @staticmethod
-    def _to_pci_alias_conf(alias_list):
-        return [jsonutils.dumps(x) for x in alias_list]
-
    @staticmethod
    def _move_allocation(allocations, from_uuid, to_uuid):
        allocations[to_uuid] = allocations[from_uuid]
--- a/nova/tests/unit/compute/test_pci_placement_translator.py
+++ b/nova/tests/unit/compute/test_pci_placement_translator.py
@ -50,6 +50,7 @@ class TestTranslator(test.NoDBTestCase):
                pci_device.PciDevice(
                    address="0000:81:00.0",
                    status=fields.PciDeviceStatus.AVAILABLE,
+                    instance_uuid=None,
                )
            ]
        )
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@ -1960,6 +1960,35 @@ class TestUpdateComputeNode(BaseTestCase):
        upt = self.rt.reportclient.update_from_provider_tree
        upt.assert_called_once_with(mock.sentinel.ctx, ptree, allocations=None)

+    @mock.patch(
+        'nova.compute.resource_tracker.ResourceTracker.'
+        '_sync_compute_service_disabled_trait',
+        new=mock.Mock()
+    )
+    @mock.patch(
+        'nova.compute.resource_tracker.ResourceTracker._resource_change',
+        new=mock.Mock(return_value=False)
+    )
+    def test_update_pci_reporting_allocation_in_use_error_propagated(self):
+        """Assert that if the pci placement reporting code tries to remove
+        inventory with allocation from placement due to invalid hypervisor
+        or [pci]device_spec reconfiguration then the InventoryInUse error from
+        placement is propagated and makes the compute startup to fail.
+        """
+        compute_obj = _COMPUTE_NODE_FIXTURES[0].obj_clone()
+        self._setup_rt()
+        self.rt.reportclient.update_from_provider_tree.side_effect = (
+            exc.InventoryInUse(
+                resource_class="FOO", resource_provider="bar"))
+
+        self.assertRaises(
+            exc.PlacementPciException,
+            self.rt._update,
+            mock.sentinel.ctx,
+            compute_obj,
+            startup=True,
+        )
+
    @mock.patch('nova.objects.Service.get_by_compute_host',
                return_value=objects.Service(disabled=True))
    def test_sync_compute_service_disabled_trait_add(self, mock_get_by_host):