Merge "Update SRIOV port pci_slot when unshelving"

This commit is contained in:
Zuul 2021-05-03 21:57:16 +00:00 committed by Gerrit Code Review
commit 34cad6dfbe
5 changed files with 126 additions and 52 deletions

View File

@ -6556,12 +6556,6 @@ class ComputeManager(manager.Manager):
context, self.reportclient, instance.pci_requests.requests, context, self.reportclient, instance.pci_requests.requests,
provider_mappings) provider_mappings)
self.network_api.setup_instance_network_on_host(
context, instance, self.host,
provider_mappings=provider_mappings)
network_info = self.network_api.get_instance_nw_info(
context, instance)
accel_info = [] accel_info = []
if accel_uuids: if accel_uuids:
try: try:
@ -6571,11 +6565,17 @@ class ComputeManager(manager.Manager):
LOG.exception('Failure getting accelerator requests ' LOG.exception('Failure getting accelerator requests '
'with the exception: %s', exc, 'with the exception: %s', exc,
instance=instance) instance=instance)
self._build_resources_cleanup(instance, network_info) self._build_resources_cleanup(instance, None)
raise raise
with self.rt.instance_claim(context, instance, node, allocations, with self.rt.instance_claim(context, instance, node, allocations,
limits): limits):
self.network_api.setup_instance_network_on_host(
context, instance, self.host,
provider_mappings=provider_mappings)
network_info = self.network_api.get_instance_nw_info(
context, instance)
self.driver.spawn(context, instance, image_meta, self.driver.spawn(context, instance, image_meta,
injected_files=[], injected_files=[],
admin_password=None, admin_password=None,

View File

@ -2114,6 +2114,9 @@ class API(base.Base):
port_numa_policy = None port_numa_policy = None
if request_net.port_id: if request_net.port_id:
# InstancePCIRequest.requester_id is semantically linked
# to a port with a resource_request.
requester_id = request_net.port_id
(vnic_type, trusted, network_id, resource_request, (vnic_type, trusted, network_id, resource_request,
port_numa_policy) = self._get_port_vnic_info( port_numa_policy) = self._get_port_vnic_info(
context, neutron, request_net.port_id) context, neutron, request_net.port_id)
@ -2121,9 +2124,6 @@ class API(base.Base):
context, neutron, network_id) context, neutron, network_id)
if resource_request: if resource_request:
# InstancePCIRequest.requester_id is semantically linked
# to a port with a resource_request.
requester_id = request_net.port_id
# NOTE(gibi): explicitly orphan the RequestGroup by setting # NOTE(gibi): explicitly orphan the RequestGroup by setting
# context=None as we never intended to save it to the DB. # context=None as we never intended to save it to the DB.
resource_requests.append( resource_requests.append(
@ -3373,6 +3373,37 @@ class API(base.Base):
migration.get('status') == 'reverted') migration.get('status') == 'reverted')
return instance.migration_context.get_pci_mapping_for_migration(revert) return instance.migration_context.get_pci_mapping_for_migration(revert)
def _get_port_pci_slot(self, context, instance, port):
"""Find the PCI address of the device corresponding to the port.
Assumes the port is an SRIOV one.
:param context: The request context.
:param instance: The instance to which the port is attached.
:param port: The Neutron port, as obtained from the Neutron API
JSON form.
:return: The PCI address as a string, or None if unable to find.
"""
# Find the port's PCIRequest, or return None
for r in instance.pci_requests.requests:
if r.requester_id == port['id']:
request = r
break
else:
LOG.debug('No PCI request found for port %s', port['id'],
instance=instance)
return None
# Find the request's device, or return None
for d in instance.pci_devices:
if d.request_id == request.request_id:
device = d
break
else:
LOG.debug('No PCI device found for request %s',
request.request_id, instance=instance)
return None
# Return the device's PCI address
return device.address
def _update_port_binding_for_instance( def _update_port_binding_for_instance(
self, context, instance, host, migration=None, self, context, instance, host, migration=None,
provider_mappings=None): provider_mappings=None):
@ -3381,7 +3412,6 @@ class API(base.Base):
search_opts = {'device_id': instance.uuid, search_opts = {'device_id': instance.uuid,
'tenant_id': instance.project_id} 'tenant_id': instance.project_id}
data = neutron.list_ports(**search_opts) data = neutron.list_ports(**search_opts)
pci_mapping = None
port_updates = [] port_updates = []
ports = data['ports'] ports = data['ports']
FAILED_VIF_TYPES = (network_model.VIF_TYPE_UNBOUND, FAILED_VIF_TYPES = (network_model.VIF_TYPE_UNBOUND,
@ -3414,25 +3444,36 @@ class API(base.Base):
# that this function is called without a migration object, such # that this function is called without a migration object, such
# as in an unshelve operation. # as in an unshelve operation.
vnic_type = p.get('binding:vnic_type') vnic_type = p.get('binding:vnic_type')
if (vnic_type in network_model.VNIC_TYPES_SRIOV and if vnic_type in network_model.VNIC_TYPES_SRIOV:
migration is not None and # NOTE(artom) For migrations, update the binding profile from
not migration.is_live_migration): # the migration object...
# Note(adrianc): for live migration binding profile was already if migration is not None:
# updated in conductor when calling bind_ports_to_host() # NOTE(artom) ... except for live migrations, because the
if not pci_mapping: # conductor has already done that whe calling
pci_mapping = self._get_pci_mapping_for_migration( # bind_ports_to_host().
instance, migration) if not migration.is_live_migration:
pci_mapping = self._get_pci_mapping_for_migration(
instance, migration)
pci_slot = binding_profile.get('pci_slot') pci_slot = binding_profile.get('pci_slot')
new_dev = pci_mapping.get(pci_slot) new_dev = pci_mapping.get(pci_slot)
if new_dev: if new_dev:
binding_profile.update( binding_profile.update(
self._get_pci_device_profile(new_dev)) self._get_pci_device_profile(new_dev))
updates[constants.BINDING_PROFILE] = binding_profile updates[
constants.BINDING_PROFILE] = binding_profile
else:
raise exception.PortUpdateFailed(port_id=p['id'],
reason=_("Unable to correlate PCI slot %s") %
pci_slot)
# NOTE(artom) If migration is None, this is an unshevle, and we
# need to figure out the pci_slot from the InstancePCIRequest
# and PciDevice objects.
else: else:
raise exception.PortUpdateFailed(port_id=p['id'], pci_slot = self._get_port_pci_slot(context, instance, p)
reason=_("Unable to correlate PCI slot %s") % if pci_slot:
pci_slot) binding_profile.update({'pci_slot': pci_slot})
updates[constants.BINDING_PROFILE] = binding_profile
# NOTE(gibi): during live migration the conductor already sets the # NOTE(gibi): during live migration the conductor already sets the
# allocation key in the port binding. However during resize, cold # allocation key in the port binding. However during resize, cold

View File

@ -434,30 +434,17 @@ class SRIOVServersTest(_PCIServersTestBase):
source_port = self.neutron.show_port(source_port['port']['id']) source_port = self.neutron.show_port(source_port['port']['id'])
same_slot_port = self.neutron.show_port(same_slot_port['port']['id']) same_slot_port = self.neutron.show_port(same_slot_port['port']['id'])
# FIXME(artom) Until bug 1851545 is fixed, unshelve will not update the self.assertNotEqual(
# pci_slot. source_port['port']['binding:profile']['pci_slot'],
if expect_fail: same_slot_port['port']['binding:profile']['pci_slot'])
self.assertEqual(
source_port['port']['binding:profile']['pci_slot'],
same_slot_port['port']['binding:profile']['pci_slot'])
else:
self.assertNotEqual(
source_port['port']['binding:profile']['pci_slot'],
same_slot_port['port']['binding:profile']['pci_slot'])
conn = self.computes['dest'].driver._host.get_connection() conn = self.computes['dest'].driver._host.get_connection()
vms = [vm._def for vm in conn._vms.values()] vms = [vm._def for vm in conn._vms.values()]
self.assertEqual(2, len(vms)) self.assertEqual(2, len(vms))
for vm in vms: for vm in vms:
self.assertEqual(1, len(vm['devices']['nics'])) self.assertEqual(1, len(vm['devices']['nics']))
# FIXME(artom) Until bug 1851545 is fixed, unshelve will not update the self.assertNotEqual(vms[0]['devices']['nics'][0]['source'],
# XML. vms[1]['devices']['nics'][0]['source'])
if expect_fail:
self.assertEqual(vms[0]['devices']['nics'][0]['source'],
vms[1]['devices']['nics'][0]['source'])
else:
self.assertNotEqual(vms[0]['devices']['nics'][0]['source'],
vms[1]['devices']['nics'][0]['source'])
def test_unshelve_server_with_neutron(self): def test_unshelve_server_with_neutron(self):
def move_operation(source_server): def move_operation(source_server):
@ -466,10 +453,7 @@ class SRIOVServersTest(_PCIServersTestBase):
self.api.put_service(self.computes['source'].service_ref.uuid, self.api.put_service(self.computes['source'].service_ref.uuid,
{'status': 'disabled'}) {'status': 'disabled'})
self._unshelve_server(source_server) self._unshelve_server(source_server)
# FIXME(artom) Bug 1851545 means we explain failure here: the pci_slot self._test_move_operation_with_neutron(move_operation)
# and XML will not get updated.
self._test_move_operation_with_neutron(move_operation,
expect_fail=True)
def test_cold_migrate_server_with_neutron(self): def test_cold_migrate_server_with_neutron(self):
def move_operation(source_server): def move_operation(source_server):

View File

@ -5828,9 +5828,11 @@ class TestAPI(TestAPIBase):
else: else:
self.assertNotIn(pci_request.PCI_TRUSTED_TAG, spec) self.assertNotIn(pci_request.PCI_TRUSTED_TAG, spec)
# Only the port with a resource_request will have pci_req.requester_id. # Only SRIOV ports and those with a resource_request will have
# pci_req.requester_id.
self.assertEqual( self.assertEqual(
[None, None, None, None, uuids.trusted_port, None], [uuids.portid_1, uuids.portid_3, uuids.portid_4, uuids.portid_5,
uuids.trusted_port, uuids.portid_vdpa],
[pci_req.requester_id for pci_req in pci_requests.requests]) [pci_req.requester_id for pci_req in pci_requests.requests])
self.assertCountEqual( self.assertCountEqual(
@ -6381,6 +6383,41 @@ class TestAPI(TestAPIBase):
self.api.get_segment_id_for_subnet, self.api.get_segment_id_for_subnet,
self.context, uuids.subnet_id) self.context, uuids.subnet_id)
@mock.patch.object(neutronapi.LOG, 'debug')
def test_get_port_pci_slot(self, mock_debug):
fake_port = {'id': uuids.fake_port_id}
request = objects.InstancePCIRequest(requester_id=uuids.fake_port_id,
request_id=uuids.pci_request_id)
bad_request = objects.InstancePCIRequest(
requester_id=uuids.wrong_port_id)
device = objects.PciDevice(request_id=uuids.pci_request_id,
address='fake-pci-address')
bad_device = objects.PciDevice(request_id=uuids.wrong_request_id)
# Test the happy path
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(requests=[request]),
pci_devices=objects.PciDeviceList(objects=[device]))
self.assertEqual(
'fake-pci-address',
self.api._get_port_pci_slot(self.context, instance, fake_port))
# Test not finding the request
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(
requests=[objects.InstancePCIRequest(bad_request)]))
self.assertIsNone(
self.api._get_port_pci_slot(self.context, instance, fake_port))
mock_debug.assert_called_with('No PCI request found for port %s',
uuids.fake_port_id, instance=instance)
mock_debug.reset_mock()
# Test not finding the device
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(requests=[request]),
pci_devices=objects.PciDeviceList(objects=[bad_device]))
self.assertIsNone(
self.api._get_port_pci_slot(self.context, instance, fake_port))
mock_debug.assert_called_with('No PCI device found for request %s',
uuids.pci_request_id, instance=instance)
class TestAPIModuleMethods(test.NoDBTestCase): class TestAPIModuleMethods(test.NoDBTestCase):

View File

@ -0,0 +1,12 @@
---
fixes:
- |
`Bug 1851545 <https://bugs.launchpad.net/nova/+bug/1851545>`_, wherein
unshelving an instance with SRIOV Neutron ports did not update the port
binding's ``pci_slot`` and could cause libvirt PCI conflicts, has been
fixed.
.. important:: Constraints in the fix's implementation mean that it only
applies to instances booted **after** it has been applied. Existing
instances will still experience bug 1851545 after being shelved and
unshelved, even with the fix applied.