Fix PCI passthrough cleanup on reschedule

The resource tracker Claim object works on a copy of the instance object
got from the compute manager. But the PCI claim logic does not use the
copy but use the original instance object. However the abort claim logic
including the abort PCI claim logic worked on the copy only. Therefore the
claimed PCI devices are visible to the compute manager in the
instance.pci_decives list even after the claim is aborted.

There was another bug in the PCIDevice object where the instance object
wasn't passed to the free() function and therefore the
instance.pci_devices list wasn't updated when the device was freed.

Closes-Bug: #1860555
Change-Id: Iff343d4d78996cd17a6a584fefa7071c81311673
This commit is contained in:
Balazs Gibizer
2024-08-15 13:06:39 +02:00
parent a2d77845ab
commit f8b98390dc
6 changed files with 19 additions and 19 deletions

View File

@@ -18,7 +18,9 @@
"ip_addresses": [], "ip_addresses": [],
"launched_at": null, "launched_at": null,
"power_state": "pending", "power_state": "pending",
"state": "building" "state": "building",
"host": null,
"node": null
} }
}, },
"priority":"ERROR", "priority":"ERROR",

View File

@@ -64,6 +64,7 @@ class Claim(NopClaim):
super().__init__(migration=migration) super().__init__(migration=migration)
# Stash a copy of the instance at the current point of time # Stash a copy of the instance at the current point of time
self.instance = instance.obj_clone() self.instance = instance.obj_clone()
self.instance_ref = instance
self.nodename = nodename self.nodename = nodename
self.tracker = tracker self.tracker = tracker
self._pci_requests = pci_requests self._pci_requests = pci_requests
@@ -82,7 +83,7 @@ class Claim(NopClaim):
been aborted. been aborted.
""" """
LOG.debug("Aborting claim: %s", self, instance=self.instance) LOG.debug("Aborting claim: %s", self, instance=self.instance)
self.tracker.abort_instance_claim(self.context, self.instance, self.tracker.abort_instance_claim(self.context, self.instance_ref,
self.nodename) self.nodename)
def _claim_test(self, compute_node, limits=None): def _claim_test(self, compute_node, limits=None):

View File

@@ -407,7 +407,7 @@ class PciDevTracker(object):
for dev in self.pci_devs: for dev in self.pci_devs:
if (dev.status == fields.PciDeviceStatus.ALLOCATED and if (dev.status == fields.PciDeviceStatus.ALLOCATED and
dev.instance_uuid == instance['uuid']): dev.instance_uuid == instance['uuid']):
self._free_device(dev) self._free_device(dev, instance)
def free_instance_claims( def free_instance_claims(
self, context: ctx.RequestContext, instance: 'objects.Instance', self, context: ctx.RequestContext, instance: 'objects.Instance',
@@ -423,7 +423,7 @@ class PciDevTracker(object):
for dev in self.pci_devs: for dev in self.pci_devs:
if (dev.status == fields.PciDeviceStatus.CLAIMED and if (dev.status == fields.PciDeviceStatus.CLAIMED and
dev.instance_uuid == instance['uuid']): dev.instance_uuid == instance['uuid']):
self._free_device(dev) self._free_device(dev, instance)
def free_instance( def free_instance(
self, context: ctx.RequestContext, instance: 'objects.Instance', self, context: ctx.RequestContext, instance: 'objects.Instance',

View File

@@ -3042,11 +3042,8 @@ class PCIResourceRequestReschedulingTest(_PCIServersTestBase):
def validate_group_policy(manager, instance, *args, **kwargs): def validate_group_policy(manager, instance, *args, **kwargs):
nonlocal validate_group_policy_called nonlocal validate_group_policy_called
if validate_group_policy_called:
# FIXME(johngarbutt): This is bug 1860555, it should be 1.
self.assertEqual(2, len(instance.pci_devices))
else:
self.assertEqual(1, len(instance.pci_devices)) self.assertEqual(1, len(instance.pci_devices))
if not validate_group_policy_called:
validate_group_policy_called = True validate_group_policy_called = True
raise exception.RescheduledException( raise exception.RescheduledException(
instance_uuid='fake-uuid', instance_uuid='fake-uuid',

View File

@@ -596,14 +596,6 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
# This is before we've failed. # This is before we've failed.
self.assertEqual(task_states.SPAWNING, instance.task_state) self.assertEqual(task_states.SPAWNING, instance.task_state)
tracking['last_state'] = instance.task_state tracking['last_state'] = instance.task_state
elif tracking['last_state'] == task_states.SPAWNING:
# This is after we've failed.
self.assertIsNone(instance.host)
self.assertIsNone(instance.node)
self.assertIsNone(instance.task_state)
tracking['last_state'] = instance.task_state
else:
self.fail('Unexpected save!')
with mock.patch.object(instance, 'save') as mock_save: with mock.patch.object(instance, 'save') as mock_save:
mock_save.side_effect = check_save mock_save.side_effect = check_save
@@ -614,6 +606,10 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
request_spec=objects.RequestSpec(), request_spec=objects.RequestSpec(),
accel_uuids=[]) accel_uuids=[])
self.assertIsNone(instance.host)
self.assertIsNone(instance.node)
self.assertIsNone(instance.task_state)
mock_notify_instance_action.assert_called_once_with( mock_notify_instance_action.assert_called_once_with(
self.context, instance, 'fake-mini', action='unshelve', self.context, instance, 'fake-mini', action='unshelve',
phase='start', bdms=mock_bdms) phase='start', bdms=mock_bdms)

View File

@@ -528,7 +528,9 @@ class PciDevTrackerTestCase(test.NoDBTestCase):
# Ensure that the claim actually fixes the inconsistency so when the # Ensure that the claim actually fixes the inconsistency so when the
# parent if freed the children become available too. # parent if freed the children become available too.
self.tracker.free_instance( self.tracker.free_instance(
mock.sentinel.context, {'uuid': uuidsentinel.instance1}) mock.sentinel.context,
{'uuid': uuidsentinel.instance1,
'pci_devices': [objects.PciDevice(id=pf['id'])]})
pf_dev = self._get_device_by_address(pf['address']) pf_dev = self._get_device_by_address(pf['address'])
self.assertEqual('available', pf_dev.status) self.assertEqual('available', pf_dev.status)
@@ -586,7 +588,9 @@ class PciDevTrackerTestCase(test.NoDBTestCase):
# Ensure that the claim actually fixes the inconsistency so when the # Ensure that the claim actually fixes the inconsistency so when the
# parent if freed the children become available too. # parent if freed the children become available too.
self.tracker.free_instance( self.tracker.free_instance(
mock.sentinel.context, {'uuid': uuidsentinel.instance1}) mock.sentinel.context,
{'uuid': uuidsentinel.instance1,
'pci_devices': [objects.PciDevice(id=pf['id'])]})
pf_dev = self._get_device_by_address(pf['address']) pf_dev = self._get_device_by_address(pf['address'])
self.assertEqual('available', pf_dev.status) self.assertEqual('available', pf_dev.status)