Merge "Remove unavailable but not reported PCI devices at startup" into stable/yoga

This commit is contained in:
Zuul 2022-06-14 04:22:41 +00:00 committed by Gerrit Code Review
commit 5f4ead1000
4 changed files with 123 additions and 7 deletions

View File

@ -447,11 +447,30 @@ class PciDevice(base.NovaPersistentObject, base.NovaObject):
instance.pci_devices.objects.append(copy.copy(self))
def remove(self):
if self.status != fields.PciDeviceStatus.AVAILABLE:
# We allow removal of a device is if it is unused. It can be unused
# either by being in available state or being in a state that shows
# that the parent or child device blocks the consumption of this device
expected_states = [
fields.PciDeviceStatus.AVAILABLE,
fields.PciDeviceStatus.UNAVAILABLE,
fields.PciDeviceStatus.UNCLAIMABLE,
]
if self.status not in expected_states:
raise exception.PciDeviceInvalidStatus(
compute_node_id=self.compute_node_id,
address=self.address, status=self.status,
hopestatus=[fields.PciDeviceStatus.AVAILABLE])
hopestatus=expected_states)
# Just to be on the safe side, do not allow removal of device that has
# an owner even if the state of the device suggests that it is not
# owned.
if 'instance_uuid' in self and self.instance_uuid is not None:
raise exception.PciDeviceInvalidOwner(
compute_node_id=self.compute_node_id,
address=self.address,
owner=self.instance_uuid,
hopeowner=None,
)
self.status = fields.PciDeviceStatus.REMOVED
self.instance_uuid = None
self.request_id = None

View File

@ -217,10 +217,13 @@ class PciDevTracker(object):
# from the pci whitelist.
try:
existed.remove()
except exception.PciDeviceInvalidStatus as e:
LOG.warning("Unable to remove device with %(status)s "
"ownership %(instance_uuid)s because of "
"%(pci_exception)s. "
except (
exception.PciDeviceInvalidStatus,
exception.PciDeviceInvalidOwner,
) as e:
LOG.warning("Unable to remove device with status "
"'%(status)s' and ownership %(instance_uuid)s "
"because of %(pci_exception)s. "
"Check your [pci]passthrough_whitelist "
"configuration to make sure this allocated "
"device is whitelisted. If you have removed "
@ -250,7 +253,10 @@ class PciDevTracker(object):
else:
# Note(yjiang5): no need to update stats if an assigned
# device is hot removed.
self.stats.remove_device(existed)
# NOTE(gibi): only remove the device from the pools if it
# is not already removed
if existed in self.stats.get_free_devs():
self.stats.remove_device(existed)
else:
# Update tracked devices.
new_value: ty.Dict[str, ty.Any]

View File

@ -467,6 +467,16 @@ class _TestPciDeviceObject(object):
devobj.claim(self.inst.uuid)
self.assertRaises(exception.PciDeviceInvalidStatus, devobj.remove)
def test_remove_device_fail_owned_with_unavailable_state(self):
# This test creates an PCI device in an invalid state. This should
# not happen in any known scenario. But we want to be save not to allow
# removing a device that has an owner. See bug 1969496 for more details
self._create_fake_instance()
devobj = pci_device.PciDevice.create(None, dev_dict)
devobj.claim(self.inst.uuid)
devobj.status = fields.PciDeviceStatus.UNAVAILABLE
self.assertRaises(exception.PciDeviceInvalidOwner, devobj.remove)
class TestPciDeviceObject(test_objects._LocalTest,
_TestPciDeviceObject):

View File

@ -397,6 +397,87 @@ class PciDevTrackerTestCase(test.NoDBTestCase):
self.assertEqual(len(self.tracker.stale), 1)
self.assertEqual(self.tracker.stale['0000:00:00.2']['vendor_id'], 'v2')
def _get_device_by_address(self, address):
devs = [dev for dev in self.tracker.pci_devs if dev.address == address]
if len(devs) == 1:
return devs[0]
if devs:
raise ValueError('ambiguous address', devs)
else:
raise ValueError('device not found', address)
def test_set_hvdevs_unavailable_vf_removed(self):
# We start with a PF parent and two VF children
self._create_tracker([fake_db_dev_3, fake_db_dev_4, fake_db_dev_5])
pci_requests_obj = self._create_pci_requests_object(
[
{
'count': 1,
'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
}
],
instance_uuid=uuidsentinel.instance1,
)
# then claim and allocate the PF that makes the VFs unavailable
self.tracker.claim_instance(
mock.sentinel.context, pci_requests_obj, None)
self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
self.assertEqual('allocated', dev3_pf.status)
self.assertEqual(uuidsentinel.instance1, dev3_pf.instance_uuid)
dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
self.assertEqual('unavailable', dev4_vf.status)
dev5_vf = self._get_device_by_address(fake_db_dev_5['address'])
self.assertEqual('unavailable', dev5_vf.status)
# now simulate that one VF (dev_5) is removed from the hypervisor and
# the compute is restarted. As the VF is not claimed or allocated we
# are free to remove it from the tracker.
self.tracker._set_hvdevs(copy.deepcopy([fake_pci_3, fake_pci_4]))
dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
self.assertEqual('allocated', dev3_pf.status)
self.assertEqual(uuidsentinel.instance1, dev3_pf.instance_uuid)
dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
self.assertEqual('unavailable', dev4_vf.status)
dev5_vf = self._get_device_by_address(fake_db_dev_5['address'])
self.assertEqual('removed', dev5_vf.status)
def test_set_hvdevs_unavailable_pf_removed(self):
# We start with one PF parent and one child VF
self._create_tracker([fake_db_dev_3, fake_db_dev_4])
pci_requests_obj = self._create_pci_requests_object(
[
{
'count': 1,
'spec': [{'dev_type': fields.PciDeviceType.SRIOV_VF}]
}
],
instance_uuid=uuidsentinel.instance1,
)
# Then we claim and allocate the VF that makes the PF unavailable
self.tracker.claim_instance(
mock.sentinel.context, pci_requests_obj, None)
self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
self.assertEqual('unavailable', dev3_pf.status)
dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
self.assertEqual('allocated', dev4_vf.status)
self.assertEqual(uuidsentinel.instance1, dev4_vf.instance_uuid)
# now simulate that the parent PF is removed from the hypervisor and
# the compute is restarted. As the PF is not claimed or allocated we
# are free to remove it from the tracker.
self.tracker._set_hvdevs(copy.deepcopy([fake_pci_4]))
dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
self.assertEqual('removed', dev3_pf.status)
dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
self.assertEqual('allocated', dev4_vf.status)
self.assertEqual(uuidsentinel.instance1, dev4_vf.instance_uuid)
def test_update_pci_for_instance_active(self):
pci_requests_obj = self._create_pci_requests_object(fake_pci_requests)
self.tracker.claim_instance(mock.sentinel.context,