Update SRIOV port pci_slot when unshelving
There are a few things we need to do to make that work: * Always set the PCIRequest's requester_id. Previously, this was only done for resource requests. The requester_id is the port UUID, so we can use that to correlate which port to update with which pci_slot (in the case of multiple SRIOV ports per instance). This has the side effect of making the fix work only for instances created *after* this patch has been applied. It's not ideal, but there does not appear to be a better way. * Call setup_networks_on_host() within the instance_claim context. This means the instance's pci_devices are updated when we call it, allowing us to get the pci_slot information from them. With the two previous changes in place, we can figure out the port's new pci_slot in _update_port_binding_for_instance(). Closes: bug 1851545 Change-Id: Icfa8c1d6e84eab758af6223a2870078685584aaa
This commit is contained in:
parent
bea06123db
commit
00f1d4757e
@ -6556,12 +6556,6 @@ class ComputeManager(manager.Manager):
|
||||
context, self.reportclient, instance.pci_requests.requests,
|
||||
provider_mappings)
|
||||
|
||||
self.network_api.setup_instance_network_on_host(
|
||||
context, instance, self.host,
|
||||
provider_mappings=provider_mappings)
|
||||
network_info = self.network_api.get_instance_nw_info(
|
||||
context, instance)
|
||||
|
||||
accel_info = []
|
||||
if accel_uuids:
|
||||
try:
|
||||
@ -6571,11 +6565,17 @@ class ComputeManager(manager.Manager):
|
||||
LOG.exception('Failure getting accelerator requests '
|
||||
'with the exception: %s', exc,
|
||||
instance=instance)
|
||||
self._build_resources_cleanup(instance, network_info)
|
||||
self._build_resources_cleanup(instance, None)
|
||||
raise
|
||||
|
||||
with self.rt.instance_claim(context, instance, node, allocations,
|
||||
limits):
|
||||
self.network_api.setup_instance_network_on_host(
|
||||
context, instance, self.host,
|
||||
provider_mappings=provider_mappings)
|
||||
network_info = self.network_api.get_instance_nw_info(
|
||||
context, instance)
|
||||
|
||||
self.driver.spawn(context, instance, image_meta,
|
||||
injected_files=[],
|
||||
admin_password=None,
|
||||
|
@ -2100,6 +2100,9 @@ class API(base.Base):
|
||||
port_numa_policy = None
|
||||
|
||||
if request_net.port_id:
|
||||
# InstancePCIRequest.requester_id is semantically linked
|
||||
# to a port with a resource_request.
|
||||
requester_id = request_net.port_id
|
||||
(vnic_type, trusted, network_id, resource_request,
|
||||
port_numa_policy) = self._get_port_vnic_info(
|
||||
context, neutron, request_net.port_id)
|
||||
@ -2107,9 +2110,6 @@ class API(base.Base):
|
||||
context, neutron, network_id)
|
||||
|
||||
if resource_request:
|
||||
# InstancePCIRequest.requester_id is semantically linked
|
||||
# to a port with a resource_request.
|
||||
requester_id = request_net.port_id
|
||||
# NOTE(gibi): explicitly orphan the RequestGroup by setting
|
||||
# context=None as we never intended to save it to the DB.
|
||||
resource_requests.append(
|
||||
@ -3357,6 +3357,37 @@ class API(base.Base):
|
||||
migration.get('status') == 'reverted')
|
||||
return instance.migration_context.get_pci_mapping_for_migration(revert)
|
||||
|
||||
def _get_port_pci_slot(self, context, instance, port):
|
||||
"""Find the PCI address of the device corresponding to the port.
|
||||
Assumes the port is an SRIOV one.
|
||||
|
||||
:param context: The request context.
|
||||
:param instance: The instance to which the port is attached.
|
||||
:param port: The Neutron port, as obtained from the Neutron API
|
||||
JSON form.
|
||||
:return: The PCI address as a string, or None if unable to find.
|
||||
"""
|
||||
# Find the port's PCIRequest, or return None
|
||||
for r in instance.pci_requests.requests:
|
||||
if r.requester_id == port['id']:
|
||||
request = r
|
||||
break
|
||||
else:
|
||||
LOG.debug('No PCI request found for port %s', port['id'],
|
||||
instance=instance)
|
||||
return None
|
||||
# Find the request's device, or return None
|
||||
for d in instance.pci_devices:
|
||||
if d.request_id == request.request_id:
|
||||
device = d
|
||||
break
|
||||
else:
|
||||
LOG.debug('No PCI device found for request %s',
|
||||
request.request_id, instance=instance)
|
||||
return None
|
||||
# Return the device's PCI address
|
||||
return device.address
|
||||
|
||||
def _update_port_binding_for_instance(
|
||||
self, context, instance, host, migration=None,
|
||||
provider_mappings=None):
|
||||
@ -3365,7 +3396,6 @@ class API(base.Base):
|
||||
search_opts = {'device_id': instance.uuid,
|
||||
'tenant_id': instance.project_id}
|
||||
data = neutron.list_ports(**search_opts)
|
||||
pci_mapping = None
|
||||
port_updates = []
|
||||
ports = data['ports']
|
||||
FAILED_VIF_TYPES = (network_model.VIF_TYPE_UNBOUND,
|
||||
@ -3398,25 +3428,36 @@ class API(base.Base):
|
||||
# that this function is called without a migration object, such
|
||||
# as in an unshelve operation.
|
||||
vnic_type = p.get('binding:vnic_type')
|
||||
if (vnic_type in network_model.VNIC_TYPES_SRIOV and
|
||||
migration is not None and
|
||||
not migration.is_live_migration):
|
||||
# Note(adrianc): for live migration binding profile was already
|
||||
# updated in conductor when calling bind_ports_to_host()
|
||||
if not pci_mapping:
|
||||
pci_mapping = self._get_pci_mapping_for_migration(
|
||||
instance, migration)
|
||||
if vnic_type in network_model.VNIC_TYPES_SRIOV:
|
||||
# NOTE(artom) For migrations, update the binding profile from
|
||||
# the migration object...
|
||||
if migration is not None:
|
||||
# NOTE(artom) ... except for live migrations, because the
|
||||
# conductor has already done that whe calling
|
||||
# bind_ports_to_host().
|
||||
if not migration.is_live_migration:
|
||||
pci_mapping = self._get_pci_mapping_for_migration(
|
||||
instance, migration)
|
||||
|
||||
pci_slot = binding_profile.get('pci_slot')
|
||||
new_dev = pci_mapping.get(pci_slot)
|
||||
if new_dev:
|
||||
binding_profile.update(
|
||||
self._get_pci_device_profile(new_dev))
|
||||
updates[constants.BINDING_PROFILE] = binding_profile
|
||||
pci_slot = binding_profile.get('pci_slot')
|
||||
new_dev = pci_mapping.get(pci_slot)
|
||||
if new_dev:
|
||||
binding_profile.update(
|
||||
self._get_pci_device_profile(new_dev))
|
||||
updates[
|
||||
constants.BINDING_PROFILE] = binding_profile
|
||||
else:
|
||||
raise exception.PortUpdateFailed(port_id=p['id'],
|
||||
reason=_("Unable to correlate PCI slot %s") %
|
||||
pci_slot)
|
||||
# NOTE(artom) If migration is None, this is an unshevle, and we
|
||||
# need to figure out the pci_slot from the InstancePCIRequest
|
||||
# and PciDevice objects.
|
||||
else:
|
||||
raise exception.PortUpdateFailed(port_id=p['id'],
|
||||
reason=_("Unable to correlate PCI slot %s") %
|
||||
pci_slot)
|
||||
pci_slot = self._get_port_pci_slot(context, instance, p)
|
||||
if pci_slot:
|
||||
binding_profile.update({'pci_slot': pci_slot})
|
||||
updates[constants.BINDING_PROFILE] = binding_profile
|
||||
|
||||
# NOTE(gibi): during live migration the conductor already sets the
|
||||
# allocation key in the port binding. However during resize, cold
|
||||
|
@ -434,30 +434,17 @@ class SRIOVServersTest(_PCIServersTestBase):
|
||||
source_port = self.neutron.show_port(source_port['port']['id'])
|
||||
same_slot_port = self.neutron.show_port(same_slot_port['port']['id'])
|
||||
|
||||
# FIXME(artom) Until bug 1851545 is fixed, unshelve will not update the
|
||||
# pci_slot.
|
||||
if expect_fail:
|
||||
self.assertEqual(
|
||||
source_port['port']['binding:profile']['pci_slot'],
|
||||
same_slot_port['port']['binding:profile']['pci_slot'])
|
||||
else:
|
||||
self.assertNotEqual(
|
||||
source_port['port']['binding:profile']['pci_slot'],
|
||||
same_slot_port['port']['binding:profile']['pci_slot'])
|
||||
self.assertNotEqual(
|
||||
source_port['port']['binding:profile']['pci_slot'],
|
||||
same_slot_port['port']['binding:profile']['pci_slot'])
|
||||
|
||||
conn = self.computes['dest'].driver._host.get_connection()
|
||||
vms = [vm._def for vm in conn._vms.values()]
|
||||
self.assertEqual(2, len(vms))
|
||||
for vm in vms:
|
||||
self.assertEqual(1, len(vm['devices']['nics']))
|
||||
# FIXME(artom) Until bug 1851545 is fixed, unshelve will not update the
|
||||
# XML.
|
||||
if expect_fail:
|
||||
self.assertEqual(vms[0]['devices']['nics'][0]['source'],
|
||||
vms[1]['devices']['nics'][0]['source'])
|
||||
else:
|
||||
self.assertNotEqual(vms[0]['devices']['nics'][0]['source'],
|
||||
vms[1]['devices']['nics'][0]['source'])
|
||||
self.assertNotEqual(vms[0]['devices']['nics'][0]['source'],
|
||||
vms[1]['devices']['nics'][0]['source'])
|
||||
|
||||
def test_unshelve_server_with_neutron(self):
|
||||
def move_operation(source_server):
|
||||
@ -466,10 +453,7 @@ class SRIOVServersTest(_PCIServersTestBase):
|
||||
self.api.put_service(self.computes['source'].service_ref.uuid,
|
||||
{'status': 'disabled'})
|
||||
self._unshelve_server(source_server)
|
||||
# FIXME(artom) Bug 1851545 means we explain failure here: the pci_slot
|
||||
# and XML will not get updated.
|
||||
self._test_move_operation_with_neutron(move_operation,
|
||||
expect_fail=True)
|
||||
self._test_move_operation_with_neutron(move_operation)
|
||||
|
||||
def test_cold_migrate_server_with_neutron(self):
|
||||
def move_operation(source_server):
|
||||
|
@ -5828,9 +5828,11 @@ class TestAPI(TestAPIBase):
|
||||
else:
|
||||
self.assertNotIn(pci_request.PCI_TRUSTED_TAG, spec)
|
||||
|
||||
# Only the port with a resource_request will have pci_req.requester_id.
|
||||
# Only SRIOV ports and those with a resource_request will have
|
||||
# pci_req.requester_id.
|
||||
self.assertEqual(
|
||||
[None, None, None, None, uuids.trusted_port, None],
|
||||
[uuids.portid_1, uuids.portid_3, uuids.portid_4, uuids.portid_5,
|
||||
uuids.trusted_port, uuids.portid_vdpa],
|
||||
[pci_req.requester_id for pci_req in pci_requests.requests])
|
||||
|
||||
self.assertCountEqual(
|
||||
@ -6381,6 +6383,41 @@ class TestAPI(TestAPIBase):
|
||||
self.api.get_segment_id_for_subnet,
|
||||
self.context, uuids.subnet_id)
|
||||
|
||||
@mock.patch.object(neutronapi.LOG, 'debug')
|
||||
def test_get_port_pci_slot(self, mock_debug):
|
||||
fake_port = {'id': uuids.fake_port_id}
|
||||
request = objects.InstancePCIRequest(requester_id=uuids.fake_port_id,
|
||||
request_id=uuids.pci_request_id)
|
||||
bad_request = objects.InstancePCIRequest(
|
||||
requester_id=uuids.wrong_port_id)
|
||||
device = objects.PciDevice(request_id=uuids.pci_request_id,
|
||||
address='fake-pci-address')
|
||||
bad_device = objects.PciDevice(request_id=uuids.wrong_request_id)
|
||||
# Test the happy path
|
||||
instance = objects.Instance(
|
||||
pci_requests=objects.InstancePCIRequests(requests=[request]),
|
||||
pci_devices=objects.PciDeviceList(objects=[device]))
|
||||
self.assertEqual(
|
||||
'fake-pci-address',
|
||||
self.api._get_port_pci_slot(self.context, instance, fake_port))
|
||||
# Test not finding the request
|
||||
instance = objects.Instance(
|
||||
pci_requests=objects.InstancePCIRequests(
|
||||
requests=[objects.InstancePCIRequest(bad_request)]))
|
||||
self.assertIsNone(
|
||||
self.api._get_port_pci_slot(self.context, instance, fake_port))
|
||||
mock_debug.assert_called_with('No PCI request found for port %s',
|
||||
uuids.fake_port_id, instance=instance)
|
||||
mock_debug.reset_mock()
|
||||
# Test not finding the device
|
||||
instance = objects.Instance(
|
||||
pci_requests=objects.InstancePCIRequests(requests=[request]),
|
||||
pci_devices=objects.PciDeviceList(objects=[bad_device]))
|
||||
self.assertIsNone(
|
||||
self.api._get_port_pci_slot(self.context, instance, fake_port))
|
||||
mock_debug.assert_called_with('No PCI device found for request %s',
|
||||
uuids.pci_request_id, instance=instance)
|
||||
|
||||
|
||||
class TestAPIModuleMethods(test.NoDBTestCase):
|
||||
|
||||
|
12
releasenotes/notes/bug-1851545-781c358939d96cea.yaml
Normal file
12
releasenotes/notes/bug-1851545-781c358939d96cea.yaml
Normal file
@ -0,0 +1,12 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
`Bug 1851545 <https://bugs.launchpad.net/nova/+bug/1851545>`_, wherein
|
||||
unshelving an instance with SRIOV Neutron ports did not update the port
|
||||
binding's ``pci_slot`` and could cause libvirt PCI conflicts, has been
|
||||
fixed.
|
||||
|
||||
.. important:: Constraints in the fix's implementation mean that it only
|
||||
applies to instances booted **after** it has been applied. Existing
|
||||
instances will still experience bug 1851545 after being shelved and
|
||||
unshelved, even with the fix applied.
|
Loading…
Reference in New Issue
Block a user