Update SRIOV port pci_slot when unshelving

There are a few things we need to do to make that work:

* Always set the PCIRequest's requester_id. Previously, this was only
  done for resource requests. The requester_id is the port UUID, so we
  can use that to correlate which port to update with which pci_slot
  (in the case of multiple SRIOV ports per instance).

  This has the side effect of making the fix work only for instances
  created *after* this patch has been applied. It's not ideal, but
  there does not appear to be a better way.

* Call setup_networks_on_host() within the instance_claim context.
  This means the instance's pci_devices are updated when we call it,
  allowing us to get the pci_slot information from them.

With the two previous changes in place, we can figure out the port's
new pci_slot in _update_port_binding_for_instance().

Closes: bug 1851545
Change-Id: Icfa8c1d6e84eab758af6223a2870078685584aaa
This commit is contained in:
Artom Lifshitz 2021-03-31 16:57:35 -04:00
parent bea06123db
commit 00f1d4757e
5 changed files with 126 additions and 52 deletions

View File

@ -6556,12 +6556,6 @@ class ComputeManager(manager.Manager):
context, self.reportclient, instance.pci_requests.requests,
provider_mappings)
self.network_api.setup_instance_network_on_host(
context, instance, self.host,
provider_mappings=provider_mappings)
network_info = self.network_api.get_instance_nw_info(
context, instance)
accel_info = []
if accel_uuids:
try:
@ -6571,11 +6565,17 @@ class ComputeManager(manager.Manager):
LOG.exception('Failure getting accelerator requests '
'with the exception: %s', exc,
instance=instance)
self._build_resources_cleanup(instance, network_info)
self._build_resources_cleanup(instance, None)
raise
with self.rt.instance_claim(context, instance, node, allocations,
limits):
self.network_api.setup_instance_network_on_host(
context, instance, self.host,
provider_mappings=provider_mappings)
network_info = self.network_api.get_instance_nw_info(
context, instance)
self.driver.spawn(context, instance, image_meta,
injected_files=[],
admin_password=None,

View File

@ -2100,6 +2100,9 @@ class API(base.Base):
port_numa_policy = None
if request_net.port_id:
# InstancePCIRequest.requester_id is semantically linked
# to a port with a resource_request.
requester_id = request_net.port_id
(vnic_type, trusted, network_id, resource_request,
port_numa_policy) = self._get_port_vnic_info(
context, neutron, request_net.port_id)
@ -2107,9 +2110,6 @@ class API(base.Base):
context, neutron, network_id)
if resource_request:
# InstancePCIRequest.requester_id is semantically linked
# to a port with a resource_request.
requester_id = request_net.port_id
# NOTE(gibi): explicitly orphan the RequestGroup by setting
# context=None as we never intended to save it to the DB.
resource_requests.append(
@ -3357,6 +3357,37 @@ class API(base.Base):
migration.get('status') == 'reverted')
return instance.migration_context.get_pci_mapping_for_migration(revert)
def _get_port_pci_slot(self, context, instance, port):
"""Find the PCI address of the device corresponding to the port.
Assumes the port is an SRIOV one.
:param context: The request context.
:param instance: The instance to which the port is attached.
:param port: The Neutron port, as obtained from the Neutron API
JSON form.
:return: The PCI address as a string, or None if unable to find.
"""
# Find the port's PCIRequest, or return None
for r in instance.pci_requests.requests:
if r.requester_id == port['id']:
request = r
break
else:
LOG.debug('No PCI request found for port %s', port['id'],
instance=instance)
return None
# Find the request's device, or return None
for d in instance.pci_devices:
if d.request_id == request.request_id:
device = d
break
else:
LOG.debug('No PCI device found for request %s',
request.request_id, instance=instance)
return None
# Return the device's PCI address
return device.address
def _update_port_binding_for_instance(
self, context, instance, host, migration=None,
provider_mappings=None):
@ -3365,7 +3396,6 @@ class API(base.Base):
search_opts = {'device_id': instance.uuid,
'tenant_id': instance.project_id}
data = neutron.list_ports(**search_opts)
pci_mapping = None
port_updates = []
ports = data['ports']
FAILED_VIF_TYPES = (network_model.VIF_TYPE_UNBOUND,
@ -3398,25 +3428,36 @@ class API(base.Base):
# that this function is called without a migration object, such
# as in an unshelve operation.
vnic_type = p.get('binding:vnic_type')
if (vnic_type in network_model.VNIC_TYPES_SRIOV and
migration is not None and
not migration.is_live_migration):
# Note(adrianc): for live migration binding profile was already
# updated in conductor when calling bind_ports_to_host()
if not pci_mapping:
pci_mapping = self._get_pci_mapping_for_migration(
instance, migration)
if vnic_type in network_model.VNIC_TYPES_SRIOV:
# NOTE(artom) For migrations, update the binding profile from
# the migration object...
if migration is not None:
# NOTE(artom) ... except for live migrations, because the
# conductor has already done that whe calling
# bind_ports_to_host().
if not migration.is_live_migration:
pci_mapping = self._get_pci_mapping_for_migration(
instance, migration)
pci_slot = binding_profile.get('pci_slot')
new_dev = pci_mapping.get(pci_slot)
if new_dev:
binding_profile.update(
self._get_pci_device_profile(new_dev))
updates[constants.BINDING_PROFILE] = binding_profile
pci_slot = binding_profile.get('pci_slot')
new_dev = pci_mapping.get(pci_slot)
if new_dev:
binding_profile.update(
self._get_pci_device_profile(new_dev))
updates[
constants.BINDING_PROFILE] = binding_profile
else:
raise exception.PortUpdateFailed(port_id=p['id'],
reason=_("Unable to correlate PCI slot %s") %
pci_slot)
# NOTE(artom) If migration is None, this is an unshevle, and we
# need to figure out the pci_slot from the InstancePCIRequest
# and PciDevice objects.
else:
raise exception.PortUpdateFailed(port_id=p['id'],
reason=_("Unable to correlate PCI slot %s") %
pci_slot)
pci_slot = self._get_port_pci_slot(context, instance, p)
if pci_slot:
binding_profile.update({'pci_slot': pci_slot})
updates[constants.BINDING_PROFILE] = binding_profile
# NOTE(gibi): during live migration the conductor already sets the
# allocation key in the port binding. However during resize, cold

View File

@ -434,30 +434,17 @@ class SRIOVServersTest(_PCIServersTestBase):
source_port = self.neutron.show_port(source_port['port']['id'])
same_slot_port = self.neutron.show_port(same_slot_port['port']['id'])
# FIXME(artom) Until bug 1851545 is fixed, unshelve will not update the
# pci_slot.
if expect_fail:
self.assertEqual(
source_port['port']['binding:profile']['pci_slot'],
same_slot_port['port']['binding:profile']['pci_slot'])
else:
self.assertNotEqual(
source_port['port']['binding:profile']['pci_slot'],
same_slot_port['port']['binding:profile']['pci_slot'])
self.assertNotEqual(
source_port['port']['binding:profile']['pci_slot'],
same_slot_port['port']['binding:profile']['pci_slot'])
conn = self.computes['dest'].driver._host.get_connection()
vms = [vm._def for vm in conn._vms.values()]
self.assertEqual(2, len(vms))
for vm in vms:
self.assertEqual(1, len(vm['devices']['nics']))
# FIXME(artom) Until bug 1851545 is fixed, unshelve will not update the
# XML.
if expect_fail:
self.assertEqual(vms[0]['devices']['nics'][0]['source'],
vms[1]['devices']['nics'][0]['source'])
else:
self.assertNotEqual(vms[0]['devices']['nics'][0]['source'],
vms[1]['devices']['nics'][0]['source'])
self.assertNotEqual(vms[0]['devices']['nics'][0]['source'],
vms[1]['devices']['nics'][0]['source'])
def test_unshelve_server_with_neutron(self):
def move_operation(source_server):
@ -466,10 +453,7 @@ class SRIOVServersTest(_PCIServersTestBase):
self.api.put_service(self.computes['source'].service_ref.uuid,
{'status': 'disabled'})
self._unshelve_server(source_server)
# FIXME(artom) Bug 1851545 means we explain failure here: the pci_slot
# and XML will not get updated.
self._test_move_operation_with_neutron(move_operation,
expect_fail=True)
self._test_move_operation_with_neutron(move_operation)
def test_cold_migrate_server_with_neutron(self):
def move_operation(source_server):

View File

@ -5828,9 +5828,11 @@ class TestAPI(TestAPIBase):
else:
self.assertNotIn(pci_request.PCI_TRUSTED_TAG, spec)
# Only the port with a resource_request will have pci_req.requester_id.
# Only SRIOV ports and those with a resource_request will have
# pci_req.requester_id.
self.assertEqual(
[None, None, None, None, uuids.trusted_port, None],
[uuids.portid_1, uuids.portid_3, uuids.portid_4, uuids.portid_5,
uuids.trusted_port, uuids.portid_vdpa],
[pci_req.requester_id for pci_req in pci_requests.requests])
self.assertCountEqual(
@ -6381,6 +6383,41 @@ class TestAPI(TestAPIBase):
self.api.get_segment_id_for_subnet,
self.context, uuids.subnet_id)
@mock.patch.object(neutronapi.LOG, 'debug')
def test_get_port_pci_slot(self, mock_debug):
fake_port = {'id': uuids.fake_port_id}
request = objects.InstancePCIRequest(requester_id=uuids.fake_port_id,
request_id=uuids.pci_request_id)
bad_request = objects.InstancePCIRequest(
requester_id=uuids.wrong_port_id)
device = objects.PciDevice(request_id=uuids.pci_request_id,
address='fake-pci-address')
bad_device = objects.PciDevice(request_id=uuids.wrong_request_id)
# Test the happy path
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(requests=[request]),
pci_devices=objects.PciDeviceList(objects=[device]))
self.assertEqual(
'fake-pci-address',
self.api._get_port_pci_slot(self.context, instance, fake_port))
# Test not finding the request
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(
requests=[objects.InstancePCIRequest(bad_request)]))
self.assertIsNone(
self.api._get_port_pci_slot(self.context, instance, fake_port))
mock_debug.assert_called_with('No PCI request found for port %s',
uuids.fake_port_id, instance=instance)
mock_debug.reset_mock()
# Test not finding the device
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(requests=[request]),
pci_devices=objects.PciDeviceList(objects=[bad_device]))
self.assertIsNone(
self.api._get_port_pci_slot(self.context, instance, fake_port))
mock_debug.assert_called_with('No PCI device found for request %s',
uuids.pci_request_id, instance=instance)
class TestAPIModuleMethods(test.NoDBTestCase):

View File

@ -0,0 +1,12 @@
---
fixes:
- |
`Bug 1851545 <https://bugs.launchpad.net/nova/+bug/1851545>`_, wherein
unshelving an instance with SRIOV Neutron ports did not update the port
binding's ``pci_slot`` and could cause libvirt PCI conflicts, has been
fixed.
.. important:: Constraints in the fix's implementation mean that it only
applies to instances booted **after** it has been applied. Existing
instances will still experience bug 1851545 after being shelved and
unshelved, even with the fix applied.