Merge "PCI: do not force remove allocated devices"

This commit is contained in:
Zuul 2019-02-05 23:17:45 +00:00 committed by Gerrit Code Review
commit ca14fb1eea
2 changed files with 77 additions and 5 deletions

View File

@ -163,23 +163,47 @@ class PciDevTracker(object):
for existed in self.pci_devs:
if existed.address in exist_addrs - new_addrs:
# Remove previously tracked PCI devices that are either
# no longer reported by the hypervisor or have been removed
# from the pci whitelist.
try:
existed.remove()
except exception.PciDeviceInvalidStatus as e:
LOG.warning("Trying to remove device with %(status)s "
LOG.warning("Unable to remove device with %(status)s "
"ownership %(instance_uuid)s because of "
"%(pci_exception)s",
"%(pci_exception)s. "
"Check your [pci]passthrough_whitelist "
"configuration to make sure this allocated "
"device is whitelisted. If you have removed "
"the device from the whitelist intentionally "
"or the device is no longer available on the "
"host you will need to delete the server or "
"migrate it to another host to silence this "
"warning.",
{'status': existed.status,
'instance_uuid': existed.instance_uuid,
'pci_exception': e.format_message()})
# Note(yjiang5): remove the device by force so that
# db entry is cleaned in next sync.
existed.status = fields.PciDeviceStatus.REMOVED
# NOTE(sean-k-mooney): the device may not be tracked for
# two reasons: first the device could have been removed
# from the host or second the whitelist could have been
# updated. While force removing may seam reasonable, if
# the device is allocated to a vm, force removing the
# device entry from the resource tracker can prevent the vm
# from rebooting. If the PCI device was removed due to an
# update to the PCI whitelist which was later reverted,
# removing the entry from the database and adding it back
# later may lead to the scheduler incorrectly selecting
# this host and the ResourceTracker assigning the PCI
# device to a second vm. To prevent this bug we skip
# deleting the device from the db in this iteration and
# will try again on the next sync.
continue
else:
# Note(yjiang5): no need to update stats if an assigned
# device is hot removed.
self.stats.remove_device(existed)
else:
# Update tracked devices.
new_value = next((dev for dev in devices if
dev['address'] == existed.address))
new_value['compute_node_id'] = self.node_id
@ -202,6 +226,7 @@ class PciDevTracker(object):
else:
existed.update_device(new_value)
# Track newly discovered devices.
for dev in [dev for dev in devices if
dev['address'] in new_addrs - exist_addrs]:
dev['compute_node_id'] = self.node_id

View File

@ -312,6 +312,53 @@ class PciDevTrackerTestCase(test.NoDBTestCase):
self.assertEqual(vfs, pf.child_devices)
self.assertEqual(vfs[0].parent_device, pf)
def test_set_hvdev_remove_tree_maintained_with_allocations(self):
# Make sure the device tree is properly maintained when there are
# devices removed from the system that are allocated to vms.
all_devs = fake_db_devs_tree[:]
self._create_tracker(all_devs)
# we start with 3 devices
self.assertEqual(
3,
len([dev for dev in self.tracker.pci_devs
if dev.status != fields.PciDeviceStatus.REMOVED]))
# we then allocate one device
pci_requests_obj = self._create_pci_requests_object(
[{'count': 1, 'spec': [{'vendor_id': 'v2'}]}])
# NOTE(sean-k-mooney): context, pci request, numa topology
claimed_dev = self.tracker.claim_instance(
mock.sentinel.context, pci_requests_obj, None)[0]
self.tracker._set_hvdevs(all_devs)
# and assert that no devices were removed
self.assertEqual(
0,
len([dev for dev in self.tracker.pci_devs
if dev.status == fields.PciDeviceStatus.REMOVED]))
# we then try to remove the allocated device from the set reported
# by the driver.
fake_pci_devs = [dev for dev in all_devs
if dev['address'] != claimed_dev.address]
with mock.patch("nova.pci.manager.LOG.warning") as log:
self.tracker._set_hvdevs(fake_pci_devs)
log.assert_called_once()
args = log.call_args_list[0][0] # args of first call
self.assertIn('Unable to remove device with', args[0])
# and assert no devices are removed from the tracker
self.assertEqual(
0,
len([dev for dev in self.tracker.pci_devs
if dev.status == fields.PciDeviceStatus.REMOVED]))
# free the device that was allocated and update tracker again
self.tracker._free_device(claimed_dev)
self.tracker._set_hvdevs(fake_pci_devs)
# and assert that one device is removed from the tracker
self.assertEqual(
1,
len([dev for dev in self.tracker.pci_devs
if dev.status == fields.PciDeviceStatus.REMOVED]))
def test_set_hvdev_changed_stal(self):
pci_requests_obj = self._create_pci_requests_object(
[{'count': 1, 'spec': [{'vendor_id': 'v1'}]}])