Support same host resize with PCI in placement

Id02e445c55fc956965b7d725f0260876d42422f2 added a special case to the
healing logic for same host resize. Now that the scheduler also creates
allocations on the destination host during resize, we need to make sure
that the drop_move_claim code that runs during revert and confirm drops
the tracked migration from the resource tracker only after the healing
logic has run, as the migrations being confirmed / reverted are still
affecting PciDevices at that point.

blueprint: pci-device-tracking-in-placement
Change-Id: I6241965fe6c1cc1f2560fcce65d5e32ef308d502
Balazs Gibizer 2022-08-24 17:47:53 +02:00
parent b387401187
commit fa4832c660
3 changed files with 151 additions and 13 deletions
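
To make the ordering requirement described in the commit message concrete, here is a minimal, self-contained model of the new drop_move_claim flow. The class and attribute names below only mirror the roles of Nova's ResourceTracker; this is an illustration, not the actual implementation (which follows in the diff).

class TrackerModel:
    def __init__(self):
        # migrations whose PCI devices are still accounted for, keyed by
        # instance uuid (stand-in for ResourceTracker.tracked_migrations)
        self.tracked_migrations = {}
        self.healed_with_migration = None

    def _update(self, instance_uuid):
        # stand-in for ResourceTracker._update(): the healing logic can only
        # attribute the PCI devices correctly while the migration is tracked
        self.healed_with_migration = self.tracked_migrations.get(instance_uuid)

    def drop_move_claim(self, instance_uuid):
        # heal first, while the migration is still visible ...
        self._update(instance_uuid)
        # ... and only then forget it
        self.tracked_migrations.pop(instance_uuid, None)


tracker = TrackerModel()
tracker.tracked_migrations["inst-1"] = "migration-1"
tracker.drop_move_claim("inst-1")
assert tracker.healed_with_migration == "migration-1"  # healing saw it in time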


@@ -619,18 +619,11 @@ class ResourceTracker(object):
         :param prefix: Prefix to use when accessing migration context
                        attributes. 'old_' or 'new_', with 'new_' being the default.
         """
-        # Remove usage for an instance that is tracked in migrations, such as
-        # on the dest node during revert resize.
-        if instance['uuid'] in self.tracked_migrations:
-            migration = self.tracked_migrations.pop(instance['uuid'])
+        if instance["uuid"] in self.tracked_migrations:
             if not flavor:
-                flavor = self._get_flavor(instance, prefix, migration)
-        # Remove usage for an instance that is not tracked in migrations (such
-        # as on the source node after a migration).
-        # NOTE(lbeliveau): On resize on the same node, the instance is
-        # included in both tracked_migrations and tracked_instances.
-        elif instance['uuid'] in self.tracked_instances:
-            self.tracked_instances.remove(instance['uuid'])
+                flavor = self._get_flavor(
+                    instance, prefix, self.tracked_migrations[instance["uuid"]]
+                )

         if flavor is not None:
             numa_topology = self._get_migration_context_resource(
@@ -646,6 +639,15 @@ class ResourceTracker(object):
             ctxt = context.elevated()
             self._update(ctxt, self.compute_nodes[nodename])

+        # Remove usage for an instance that is tracked in migrations, such as
+        # on the dest node during revert resize.
+        self.tracked_migrations.pop(instance['uuid'], None)
+        # Remove usage for an instance that is not tracked in migrations (such
+        # as on the source node after a migration).
+        # NOTE(lbeliveau): On resize on the same node, the instance is
+        # included in both tracked_migrations and tracked_instances.
+        self.tracked_instances.discard(instance['uuid'])
+
     @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
     def update_usage(self, context, instance, nodename):
         """Update the resource usage and stats after a change in an


@@ -1597,8 +1597,17 @@ class PlacementPCIAllocationHealingTests(PlacementPCIReportingTests):
         compute1_expected_placement_view["allocations"][server["id"]] = {
             "0000:81:00.0": {self.VF_RC: 2}
         }
-        self.assert_placement_pci_view(
-            "compute1", **compute1_expected_placement_view)
+        # NOTE(gibi): This is unfortunate, but during same host resize
+        # confirm, when the PCI scheduling is not enabled, the healing logic
+        # cannot heal the dest host allocation during the claim. It will only
+        # heal it in the next run of ResourceTracker._update(). This is due
+        # to the fact that ResourceTracker.drop_move_claim runs both for
+        # revert (on the dest) and confirm (on the source), and in a same
+        # host resize this means that it runs on both the source and the dest
+        # as they are the same.
+        # Anyhow, the healing will happen just a bit later. And the end goal
+        # is to make the scheduler support enabled by default and delete the
+        # whole healing logic. So I think this is acceptable.
         self._run_periodics()
         self.assert_placement_pci_view(
             "compute1", **compute1_expected_placement_view)


@@ -2356,6 +2356,133 @@ class PCIServersTest(_PCIServersTestBase):
self.assert_no_pci_healing("test_compute0")
self.assert_no_pci_healing("test_compute1")
def test_same_host_resize_with_pci(self):
"""Start a single compute with 3 PCI devs and resize and instance
from one dev to two devs
"""
self.flags(allow_resize_to_same_host=True)
self.start_compute(
hostname='test_compute0',
pci_info=fakelibvirt.HostPCIDevicesInfo(num_pci=3))
self.assertPCIDeviceCounts('test_compute0', total=3, free=3)
test_compute0_placement_pci_view = {
"inventories": {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
"0000:81:02.0": {self.PCI_RC: 1},
},
"traits": {
"0000:81:00.0": [],
"0000:81:01.0": [],
"0000:81:02.0": [],
},
"usages": {
"0000:81:00.0": {self.PCI_RC: 0},
"0000:81:01.0": {self.PCI_RC: 0},
"0000:81:02.0": {self.PCI_RC: 0},
},
"allocations": {},
}
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
# Boot a server with a single PCI device.
        # To stabilize the test we reserve 81.01 and 81.02 in placement so
        # we can be sure that the instance will use 81.00; otherwise the
        # allocation would land randomly on 00, 01, or 02.
self._reserve_placement_resource(
"test_compute0_0000:81:01.0", self.PCI_RC, 1)
self._reserve_placement_resource(
"test_compute0_0000:81:02.0", self.PCI_RC, 1)
extra_spec = {'pci_passthrough:alias': f'{self.ALIAS_NAME}:1'}
pci_flavor_id = self._create_flavor(extra_spec=extra_spec)
server = self._create_server(flavor_id=pci_flavor_id, networks='none')
self.assertPCIDeviceCounts('test_compute0', total=3, free=2)
test_compute0_placement_pci_view[
"usages"]["0000:81:00.0"][self.PCI_RC] = 1
test_compute0_placement_pci_view[
"allocations"][server['id']] = {"0000:81:00.0": {self.PCI_RC: 1}}
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
# remove the reservations, so we can resize on the same host and
# consume 01 and 02
self._reserve_placement_resource(
"test_compute0_0000:81:01.0", self.PCI_RC, 0)
self._reserve_placement_resource(
"test_compute0_0000:81:02.0", self.PCI_RC, 0)
# Resize the server to use 2 PCI devices
extra_spec = {'pci_passthrough:alias': f'{self.ALIAS_NAME}:2'}
pci_flavor_id = self._create_flavor(extra_spec=extra_spec)
with mock.patch(
'nova.virt.libvirt.driver.LibvirtDriver'
'.migrate_disk_and_power_off',
return_value='{}',
):
self._resize_server(server, pci_flavor_id)
self.assertPCIDeviceCounts('test_compute0', total=3, free=0)
# the source host side of the allocation is now held by the migration
# UUID
self._move_server_allocation(
test_compute0_placement_pci_view["allocations"], server['id'])
# but we have the dest host side of the allocations on the same host
test_compute0_placement_pci_view[
"usages"]["0000:81:01.0"][self.PCI_RC] = 1
test_compute0_placement_pci_view[
"usages"]["0000:81:02.0"][self.PCI_RC] = 1
test_compute0_placement_pci_view["allocations"][server['id']] = {
"0000:81:01.0": {self.PCI_RC: 1},
"0000:81:02.0": {self.PCI_RC: 1},
}
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
# revert the resize so the instance should go back to use a single
# device
self._revert_resize(server)
self.assertPCIDeviceCounts('test_compute0', total=3, free=2)
# the migration allocation is moved back to the instance UUID
self._move_server_allocation(
test_compute0_placement_pci_view["allocations"],
server["id"],
revert=True,
)
# and the "dest" side of the allocation is dropped
test_compute0_placement_pci_view[
"usages"]["0000:81:01.0"][self.PCI_RC] = 0
test_compute0_placement_pci_view[
"usages"]["0000:81:02.0"][self.PCI_RC] = 0
test_compute0_placement_pci_view["allocations"][server['id']] = {
"0000:81:00.0": {self.PCI_RC: 1},
}
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
# resize again but now confirm the same host resize and assert that
# only the new flavor usage remains
with mock.patch(
'nova.virt.libvirt.driver.LibvirtDriver'
'.migrate_disk_and_power_off',
return_value='{}',
):
self._resize_server(server, pci_flavor_id)
self._confirm_resize(server)
self.assertPCIDeviceCounts('test_compute0', total=3, free=1)
test_compute0_placement_pci_view["usages"] = {
"0000:81:01.0": {self.PCI_RC: 1},
"0000:81:02.0": {self.PCI_RC: 1},
}
test_compute0_placement_pci_view["allocations"][
server['id']] = {self.PCI_RC: 1}
test_compute0_placement_pci_view["allocations"][server['id']] = {
"0000:81:01.0": {self.PCI_RC: 1},
"0000:81:02.0": {self.PCI_RC: 1},
}
self.assert_no_pci_healing("test_compute0")
def _confirm_resize(self, server, host='host1'):
# NOTE(sbauza): Unfortunately, _cleanup_resize() in libvirt checks the
# host option to know the source hostname but given we have a global