From 00f1d4757e503bb9807d7a8d7035c061a97db983 Mon Sep 17 00:00:00 2001 From: Artom Lifshitz Date: Wed, 31 Mar 2021 16:57:35 -0400 Subject: [PATCH] Update SRIOV port pci_slot when unshelving There are a few things we need to do to make that work: * Always set the PCIRequest's requester_id. Previously, this was only done for resource requests. The requester_id is the port UUID, so we can use that to correlate which port to update with which pci_slot (in the case of multiple SRIOV ports per instance). This has the side effect of making the fix work only for instances created *after* this patch has been applied. It's not ideal, but there does not appear to be a better way. * Call setup_networks_on_host() within the instance_claim context. This means the instance's pci_devices are updated when we call it, allowing us to get the pci_slot information from them. With the two previous changes in place, we can figure out the port's new pci_slot in _update_port_binding_for_instance(). Closes: bug 1851545 Change-Id: Icfa8c1d6e84eab758af6223a2870078685584aaa --- nova/compute/manager.py | 14 ++-- nova/network/neutron.py | 83 ++++++++++++++----- .../libvirt/test_pci_sriov_servers.py | 28 ++----- nova/tests/unit/network/test_neutron.py | 41 ++++++++- .../notes/bug-1851545-781c358939d96cea.yaml | 12 +++ 5 files changed, 126 insertions(+), 52 deletions(-) create mode 100644 releasenotes/notes/bug-1851545-781c358939d96cea.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 8e7b8885590e..69286438f3f6 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -6556,12 +6556,6 @@ class ComputeManager(manager.Manager): context, self.reportclient, instance.pci_requests.requests, provider_mappings) - self.network_api.setup_instance_network_on_host( - context, instance, self.host, - provider_mappings=provider_mappings) - network_info = self.network_api.get_instance_nw_info( - context, instance) - accel_info = [] if accel_uuids: try: @@ -6571,11 +6565,17 @@ class ComputeManager(manager.Manager): LOG.exception('Failure getting accelerator requests ' 'with the exception: %s', exc, instance=instance) - self._build_resources_cleanup(instance, network_info) + self._build_resources_cleanup(instance, None) raise with self.rt.instance_claim(context, instance, node, allocations, limits): + self.network_api.setup_instance_network_on_host( + context, instance, self.host, + provider_mappings=provider_mappings) + network_info = self.network_api.get_instance_nw_info( + context, instance) + self.driver.spawn(context, instance, image_meta, injected_files=[], admin_password=None, diff --git a/nova/network/neutron.py b/nova/network/neutron.py index a45e48bf2485..6d42682e8225 100644 --- a/nova/network/neutron.py +++ b/nova/network/neutron.py @@ -2100,6 +2100,9 @@ class API(base.Base): port_numa_policy = None if request_net.port_id: + # InstancePCIRequest.requester_id is semantically linked + # to a port with a resource_request. + requester_id = request_net.port_id (vnic_type, trusted, network_id, resource_request, port_numa_policy) = self._get_port_vnic_info( context, neutron, request_net.port_id) @@ -2107,9 +2110,6 @@ class API(base.Base): context, neutron, network_id) if resource_request: - # InstancePCIRequest.requester_id is semantically linked - # to a port with a resource_request. - requester_id = request_net.port_id # NOTE(gibi): explicitly orphan the RequestGroup by setting # context=None as we never intended to save it to the DB. resource_requests.append( @@ -3357,6 +3357,37 @@ class API(base.Base): migration.get('status') == 'reverted') return instance.migration_context.get_pci_mapping_for_migration(revert) + def _get_port_pci_slot(self, context, instance, port): + """Find the PCI address of the device corresponding to the port. + Assumes the port is an SRIOV one. + + :param context: The request context. + :param instance: The instance to which the port is attached. + :param port: The Neutron port, as obtained from the Neutron API + JSON form. + :return: The PCI address as a string, or None if unable to find. + """ + # Find the port's PCIRequest, or return None + for r in instance.pci_requests.requests: + if r.requester_id == port['id']: + request = r + break + else: + LOG.debug('No PCI request found for port %s', port['id'], + instance=instance) + return None + # Find the request's device, or return None + for d in instance.pci_devices: + if d.request_id == request.request_id: + device = d + break + else: + LOG.debug('No PCI device found for request %s', + request.request_id, instance=instance) + return None + # Return the device's PCI address + return device.address + def _update_port_binding_for_instance( self, context, instance, host, migration=None, provider_mappings=None): @@ -3365,7 +3396,6 @@ class API(base.Base): search_opts = {'device_id': instance.uuid, 'tenant_id': instance.project_id} data = neutron.list_ports(**search_opts) - pci_mapping = None port_updates = [] ports = data['ports'] FAILED_VIF_TYPES = (network_model.VIF_TYPE_UNBOUND, @@ -3398,25 +3428,36 @@ class API(base.Base): # that this function is called without a migration object, such # as in an unshelve operation. vnic_type = p.get('binding:vnic_type') - if (vnic_type in network_model.VNIC_TYPES_SRIOV and - migration is not None and - not migration.is_live_migration): - # Note(adrianc): for live migration binding profile was already - # updated in conductor when calling bind_ports_to_host() - if not pci_mapping: - pci_mapping = self._get_pci_mapping_for_migration( - instance, migration) + if vnic_type in network_model.VNIC_TYPES_SRIOV: + # NOTE(artom) For migrations, update the binding profile from + # the migration object... + if migration is not None: + # NOTE(artom) ... except for live migrations, because the + # conductor has already done that whe calling + # bind_ports_to_host(). + if not migration.is_live_migration: + pci_mapping = self._get_pci_mapping_for_migration( + instance, migration) - pci_slot = binding_profile.get('pci_slot') - new_dev = pci_mapping.get(pci_slot) - if new_dev: - binding_profile.update( - self._get_pci_device_profile(new_dev)) - updates[constants.BINDING_PROFILE] = binding_profile + pci_slot = binding_profile.get('pci_slot') + new_dev = pci_mapping.get(pci_slot) + if new_dev: + binding_profile.update( + self._get_pci_device_profile(new_dev)) + updates[ + constants.BINDING_PROFILE] = binding_profile + else: + raise exception.PortUpdateFailed(port_id=p['id'], + reason=_("Unable to correlate PCI slot %s") % + pci_slot) + # NOTE(artom) If migration is None, this is an unshevle, and we + # need to figure out the pci_slot from the InstancePCIRequest + # and PciDevice objects. else: - raise exception.PortUpdateFailed(port_id=p['id'], - reason=_("Unable to correlate PCI slot %s") % - pci_slot) + pci_slot = self._get_port_pci_slot(context, instance, p) + if pci_slot: + binding_profile.update({'pci_slot': pci_slot}) + updates[constants.BINDING_PROFILE] = binding_profile # NOTE(gibi): during live migration the conductor already sets the # allocation key in the port binding. However during resize, cold diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py index 1e1dbd0433a9..66fe0594002d 100644 --- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py +++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py @@ -434,30 +434,17 @@ class SRIOVServersTest(_PCIServersTestBase): source_port = self.neutron.show_port(source_port['port']['id']) same_slot_port = self.neutron.show_port(same_slot_port['port']['id']) - # FIXME(artom) Until bug 1851545 is fixed, unshelve will not update the - # pci_slot. - if expect_fail: - self.assertEqual( - source_port['port']['binding:profile']['pci_slot'], - same_slot_port['port']['binding:profile']['pci_slot']) - else: - self.assertNotEqual( - source_port['port']['binding:profile']['pci_slot'], - same_slot_port['port']['binding:profile']['pci_slot']) + self.assertNotEqual( + source_port['port']['binding:profile']['pci_slot'], + same_slot_port['port']['binding:profile']['pci_slot']) conn = self.computes['dest'].driver._host.get_connection() vms = [vm._def for vm in conn._vms.values()] self.assertEqual(2, len(vms)) for vm in vms: self.assertEqual(1, len(vm['devices']['nics'])) - # FIXME(artom) Until bug 1851545 is fixed, unshelve will not update the - # XML. - if expect_fail: - self.assertEqual(vms[0]['devices']['nics'][0]['source'], - vms[1]['devices']['nics'][0]['source']) - else: - self.assertNotEqual(vms[0]['devices']['nics'][0]['source'], - vms[1]['devices']['nics'][0]['source']) + self.assertNotEqual(vms[0]['devices']['nics'][0]['source'], + vms[1]['devices']['nics'][0]['source']) def test_unshelve_server_with_neutron(self): def move_operation(source_server): @@ -466,10 +453,7 @@ class SRIOVServersTest(_PCIServersTestBase): self.api.put_service(self.computes['source'].service_ref.uuid, {'status': 'disabled'}) self._unshelve_server(source_server) - # FIXME(artom) Bug 1851545 means we explain failure here: the pci_slot - # and XML will not get updated. - self._test_move_operation_with_neutron(move_operation, - expect_fail=True) + self._test_move_operation_with_neutron(move_operation) def test_cold_migrate_server_with_neutron(self): def move_operation(source_server): diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py index 5ec7ff3de89b..3176cf48f11d 100644 --- a/nova/tests/unit/network/test_neutron.py +++ b/nova/tests/unit/network/test_neutron.py @@ -5828,9 +5828,11 @@ class TestAPI(TestAPIBase): else: self.assertNotIn(pci_request.PCI_TRUSTED_TAG, spec) - # Only the port with a resource_request will have pci_req.requester_id. + # Only SRIOV ports and those with a resource_request will have + # pci_req.requester_id. self.assertEqual( - [None, None, None, None, uuids.trusted_port, None], + [uuids.portid_1, uuids.portid_3, uuids.portid_4, uuids.portid_5, + uuids.trusted_port, uuids.portid_vdpa], [pci_req.requester_id for pci_req in pci_requests.requests]) self.assertCountEqual( @@ -6381,6 +6383,41 @@ class TestAPI(TestAPIBase): self.api.get_segment_id_for_subnet, self.context, uuids.subnet_id) + @mock.patch.object(neutronapi.LOG, 'debug') + def test_get_port_pci_slot(self, mock_debug): + fake_port = {'id': uuids.fake_port_id} + request = objects.InstancePCIRequest(requester_id=uuids.fake_port_id, + request_id=uuids.pci_request_id) + bad_request = objects.InstancePCIRequest( + requester_id=uuids.wrong_port_id) + device = objects.PciDevice(request_id=uuids.pci_request_id, + address='fake-pci-address') + bad_device = objects.PciDevice(request_id=uuids.wrong_request_id) + # Test the happy path + instance = objects.Instance( + pci_requests=objects.InstancePCIRequests(requests=[request]), + pci_devices=objects.PciDeviceList(objects=[device])) + self.assertEqual( + 'fake-pci-address', + self.api._get_port_pci_slot(self.context, instance, fake_port)) + # Test not finding the request + instance = objects.Instance( + pci_requests=objects.InstancePCIRequests( + requests=[objects.InstancePCIRequest(bad_request)])) + self.assertIsNone( + self.api._get_port_pci_slot(self.context, instance, fake_port)) + mock_debug.assert_called_with('No PCI request found for port %s', + uuids.fake_port_id, instance=instance) + mock_debug.reset_mock() + # Test not finding the device + instance = objects.Instance( + pci_requests=objects.InstancePCIRequests(requests=[request]), + pci_devices=objects.PciDeviceList(objects=[bad_device])) + self.assertIsNone( + self.api._get_port_pci_slot(self.context, instance, fake_port)) + mock_debug.assert_called_with('No PCI device found for request %s', + uuids.pci_request_id, instance=instance) + class TestAPIModuleMethods(test.NoDBTestCase): diff --git a/releasenotes/notes/bug-1851545-781c358939d96cea.yaml b/releasenotes/notes/bug-1851545-781c358939d96cea.yaml new file mode 100644 index 000000000000..cab86d9545eb --- /dev/null +++ b/releasenotes/notes/bug-1851545-781c358939d96cea.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - | + `Bug 1851545 `_, wherein + unshelving an instance with SRIOV Neutron ports did not update the port + binding's ``pci_slot`` and could cause libvirt PCI conflicts, has been + fixed. + + .. important:: Constraints in the fix's implementation mean that it only + applies to instances booted **after** it has been applied. Existing + instances will still experience bug 1851545 after being shelved and + unshelved, even with the fix applied.