Support cold migrate and resize with PCI tracking in placement

This patch adds support for cold migrate and resize with PCI
devices when PCI tracking in placement is enabled.

Same-host resize, evacuate, and unshelve will be supported by subsequent
patches. Live migration was not supported with flavor-based PCI requests
before, so it is not supported now either.

blueprint: pci-device-tracking-in-placement
Change-Id: I8eec331ab3c30e5958ed19c173eff9998c1f41b0
Author: Balazs Gibizer
Date: 2022-08-23 19:58:25 +02:00
Parent: 1462883dcc
Commit: e667a7f8d8
5 changed files with 277 additions and 65 deletions

nova/conductor/tasks/migrate.py

@@ -258,6 +258,11 @@ class MigrationTask(base.TaskBase):
# resource requests in a single list and add them to the RequestSpec.
self.request_spec.requested_resources = port_res_req
self.request_spec.request_level_params = req_lvl_params
# NOTE(gibi): as PCI devices are tracked in placement we need to
# generate request groups from InstancePCIRequests. This will append
# new RequestGroup objects to the request_spec.requested_resources list
# if needed
self.request_spec.generate_request_groups_from_pci_requests()
self._set_requested_destination_cell(legacy_props)
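For readers new to this flow, here is a minimal, self-contained sketch of what generating request groups from flavor-based PCI requests amounts to. It is an illustration only: the RequestGroup dataclass, the NEUTRON_PORT constant, and the resource_class attribute are simplified stand-ins for the real Nova objects, not the actual implementation.

    import dataclasses
    from types import SimpleNamespace

    NEUTRON_PORT = 1  # stand-in for objects.InstancePCIRequest.NEUTRON_PORT

    @dataclasses.dataclass
    class RequestGroup:
        """Simplified stand-in for nova.objects.RequestGroup."""
        requester_id: str
        resources: dict

    def generate_request_groups(pci_requests, requested_resources):
        for req in pci_requests:
            # port-based PCI requests already carry resource requests from
            # neutron; only flavor (alias) based requests are converted
            if req.source == NEUTRON_PORT:
                continue
            # one single-device group per requested device, so each device
            # can be satisfied by a different PCI resource provider
            for i in range(req.count):
                requested_resources.append(RequestGroup(
                    requester_id=f"{req.request_id}-{i}",
                    resources={req.resource_class: 1},
                ))

    # a flavor alias asking for two devices yields two request groups
    reqs = [SimpleNamespace(source=0, request_id="req-1", count=2,
                            resource_class="CUSTOM_PCI_8086_1533")]
    groups = []
    generate_request_groups(reqs, groups)
    assert len(groups) == 2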

nova/objects/request_spec.py

@@ -495,7 +495,7 @@ class RequestSpec(base.NovaObject):
def _pci_in_placement_enabled():
return False
-def _generate_request_groups_from_pci_requests(self):
+def generate_request_groups_from_pci_requests(self):
if not self._pci_in_placement_enabled():
return False
@@ -656,7 +656,7 @@ class RequestSpec(base.NovaObject):
if port_resource_requests:
spec_obj.requested_resources.extend(port_resource_requests)
-spec_obj._generate_request_groups_from_pci_requests()
+spec_obj.generate_request_groups_from_pci_requests()
# NOTE(gibi): later the scheduler adds more request level params but
# never overrides existing ones so we can initialize them here.
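The rename above also signals the method's new second caller: besides RequestSpec.from_components() at boot, the conductor now invokes it on an existing spec during a move operation. The observable effect, mirrored by the unit tests at the end of this commit: a flavor with pci_passthrough:alias requesting two devices leaves two single-device request groups on the spec, so placement may satisfy them from two different device providers. The resource class name below is illustrative, following the blueprint's CUSTOM_PCI_{vendor_id}_{product_id} convention:

    # illustrative shape of request_spec.requested_resources after the
    # call, for a flavor requesting a PCI alias with count 2
    expected_groups = [
        {"requester_id": "<pci-request-uuid>-0",
         "resources": {"CUSTOM_PCI_8086_1533": 1}},
        {"requester_id": "<pci-request-uuid>-1",
         "resources": {"CUSTOM_PCI_8086_1533": 1}},
    ]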

nova/tests/functional/libvirt/test_pci_in_placement.py

@@ -949,18 +949,6 @@ class PlacementPCIAllocationHealingTests(PlacementPCIReportingTests):
)
)
-@staticmethod
-def _move_allocation(allocations, from_uuid, to_uuid):
-allocations[to_uuid] = allocations[from_uuid]
-del allocations[from_uuid]
-def _move_server_allocation(self, allocations, server_uuid, revert=False):
-migration_uuid = self.get_migration_uuid_for_instance(server_uuid)
-if revert:
-self._move_allocation(allocations, migration_uuid, server_uuid)
-else:
-self._move_allocation(allocations, server_uuid, migration_uuid)
def test_heal_single_pci_allocation(self):
# The fake libvirt will emulate on the host:
# * one type-PCI in slot 0

nova/tests/functional/libvirt/test_pci_sriov_servers.py

@@ -246,6 +246,18 @@ class _PCIServersTestBase(base.ServersTestBase):
def _to_device_spec_conf(spec_list):
return [jsonutils.dumps(x) for x in spec_list]
+@staticmethod
+def _move_allocation(allocations, from_uuid, to_uuid):
+allocations[to_uuid] = allocations[from_uuid]
+del allocations[from_uuid]
+def _move_server_allocation(self, allocations, server_uuid, revert=False):
+migration_uuid = self.get_migration_uuid_for_instance(server_uuid)
+if revert:
+self._move_allocation(allocations, migration_uuid, server_uuid)
+else:
+self._move_allocation(allocations, server_uuid, migration_uuid)
class _PCIServersWithMigrationTestBase(_PCIServersTestBase):
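The helpers just moved into _PCIServersTestBase model a key part of the resize flow asserted below: while a migration is in flight, placement holds the source-host allocation under the migration UUID rather than the instance UUID, and a revert moves it back. A tiny standalone illustration (PCI_RC and the UUID strings are placeholders):

    PCI_RC = "CUSTOM_PCI_DEV"  # placeholder resource class

    allocations = {"server-uuid": {"0000:81:00.0": {PCI_RC: 1}}}

    def move_allocation(allocations, from_uuid, to_uuid):
        allocations[to_uuid] = allocations.pop(from_uuid)

    # during a resize the source allocation is re-keyed to the migration...
    move_allocation(allocations, "server-uuid", "migration-uuid")
    assert "migration-uuid" in allocations

    # ...and a revert hands it back to the instance
    move_allocation(allocations, "migration-uuid", "server-uuid")
    assert allocations == {"server-uuid": {"0000:81:00.0": {PCI_RC: 1}}}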
@@ -2102,6 +2114,207 @@ class PCIServersTest(_PCIServersTestBase):
self.assert_no_pci_healing("test_compute0")
self.assert_no_pci_healing("test_compute1")
def test_resize_vanilla_to_pci(self):
"""Resize an instance from a non PCI flavor to a PCI flavor"""
# Start two computes, one with PCI and one without.
self.start_compute(
hostname='test_compute0',
pci_info=fakelibvirt.HostPCIDevicesInfo(num_pci=1))
test_compute0_placement_pci_view = {
"inventories": {"0000:81:00.0": {self.PCI_RC: 1}},
"traits": {"0000:81:00.0": []},
"usages": {"0000:81:00.0": {self.PCI_RC: 0}},
"allocations": {},
}
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
self.start_compute(hostname='test_compute1')
test_compute1_placement_pci_view = {
"inventories": {},
"traits": {},
"usages": {},
"allocations": {},
}
self.assert_placement_pci_view(
"test_compute1", **test_compute1_placement_pci_view)
# Boot a server without a PCI device and make sure it lands on the
# compute that has no devices, so that we can later resize it to the
# other host, which has a PCI device.
extra_spec = {}
flavor_id = self._create_flavor(extra_spec=extra_spec)
server = self._create_server(
flavor_id=flavor_id, networks='none', host="test_compute1")
self.assertPCIDeviceCounts('test_compute0', total=1, free=1)
self.assertPCIDeviceCounts('test_compute1', total=0, free=0)
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
self.assert_placement_pci_view(
"test_compute1", **test_compute1_placement_pci_view)
# Resize it to a flavor with a PCI device. We expect this to work, as
# test_compute0 is available and has PCI devices.
extra_spec = {'pci_passthrough:alias': f'{self.ALIAS_NAME}:1'}
pci_flavor_id = self._create_flavor(extra_spec=extra_spec)
with mock.patch(
'nova.virt.libvirt.driver.LibvirtDriver'
'.migrate_disk_and_power_off',
return_value='{}',
):
self._resize_server(server, pci_flavor_id)
self._confirm_resize(server)
self.assertPCIDeviceCounts('test_compute0', total=1, free=0)
self.assertPCIDeviceCounts('test_compute1', total=0, free=0)
test_compute0_placement_pci_view[
"usages"]["0000:81:00.0"][self.PCI_RC] = 1
test_compute0_placement_pci_view[
"allocations"][server['id']] = {"0000:81:00.0": {self.PCI_RC: 1}}
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
self.assert_placement_pci_view(
"test_compute1", **test_compute1_placement_pci_view)
self.assert_no_pci_healing("test_compute0")
self.assert_no_pci_healing("test_compute1")
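Every assert_placement_pci_view call in these tests compares against the same four-key structure. As a reading aid (plain data, not Nova code): inventories, traits and usages are keyed by PCI address because each device is modeled as its own child resource provider, while allocations are keyed by the consumer, i.e. a server UUID or, mid-migration, a migration UUID. CUSTOM_PCI_8086_1533 stands in for self.PCI_RC, following the blueprint's CUSTOM_PCI_{vendor_id}_{product_id} naming:

    example_view = {
        "inventories": {"0000:81:00.0": {"CUSTOM_PCI_8086_1533": 1}},
        "traits": {"0000:81:00.0": []},
        "usages": {"0000:81:00.0": {"CUSTOM_PCI_8086_1533": 1}},
        "allocations": {
            # consumer UUID: per-device, per-resource-class amounts
            "<server-or-migration-uuid>": {
                "0000:81:00.0": {"CUSTOM_PCI_8086_1533": 1},
            },
        },
    }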
def test_resize_from_one_dev_to_two(self):
self.start_compute(
hostname='test_compute0',
pci_info=fakelibvirt.HostPCIDevicesInfo(num_pci=1))
self.assertPCIDeviceCounts('test_compute0', total=1, free=1)
test_compute0_placement_pci_view = {
"inventories": {"0000:81:00.0": {self.PCI_RC: 1}},
"traits": {"0000:81:00.0": []},
"usages": {"0000:81:00.0": {self.PCI_RC: 0}},
"allocations": {},
}
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
self.start_compute(
hostname='test_compute1',
pci_info=fakelibvirt.HostPCIDevicesInfo(num_pci=2),
)
self.assertPCIDeviceCounts('test_compute1', total=2, free=2)
test_compute1_placement_pci_view = {
"inventories": {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
},
"traits": {
"0000:81:00.0": [],
"0000:81:01.0": [],
},
"usages": {
"0000:81:00.0": {self.PCI_RC: 0},
"0000:81:01.0": {self.PCI_RC: 0},
},
"allocations": {},
}
self.assert_placement_pci_view(
"test_compute1", **test_compute1_placement_pci_view)
# boot a VM on test_compute0 with a single PCI dev
extra_spec = {'pci_passthrough:alias': f'{self.ALIAS_NAME}:1'}
pci_flavor_id = self._create_flavor(extra_spec=extra_spec)
server = self._create_server(
flavor_id=pci_flavor_id, networks='none', host="test_compute0")
self.assertPCIDeviceCounts('test_compute0', total=1, free=0)
test_compute0_placement_pci_view["usages"][
"0000:81:00.0"][self.PCI_RC] = 1
test_compute0_placement_pci_view["allocations"][
server['id']] = {"0000:81:00.0": {self.PCI_RC: 1}}
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
# resize the server to a flavor requesting two devices
extra_spec = {'pci_passthrough:alias': f'{self.ALIAS_NAME}:2'}
pci_flavor_id = self._create_flavor(extra_spec=extra_spec)
with mock.patch(
'nova.virt.libvirt.driver.LibvirtDriver'
'.migrate_disk_and_power_off',
return_value='{}',
):
self._resize_server(server, pci_flavor_id)
self.assertPCIDeviceCounts('test_compute0', total=1, free=0)
# on the source host the PCI allocation is now held by the migration
self._move_server_allocation(
test_compute0_placement_pci_view['allocations'], server['id'])
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
# on the dest we now have two devices allocated
self.assertPCIDeviceCounts('test_compute1', total=2, free=0)
test_compute1_placement_pci_view["usages"] = {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
}
test_compute1_placement_pci_view["allocations"][
server['id']] = {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
}
self.assert_placement_pci_view(
"test_compute1", **test_compute1_placement_pci_view)
# now revert the resize
self._revert_resize(server)
self.assertPCIDeviceCounts('test_compute0', total=1, free=0)
# on the source host the allocation should move back to the instance UUID
self._move_server_allocation(
test_compute0_placement_pci_view["allocations"],
server["id"],
revert=True,
)
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
# so the dest should be freed
self.assertPCIDeviceCounts('test_compute1', total=2, free=2)
test_compute1_placement_pci_view["usages"] = {
"0000:81:00.0": {self.PCI_RC: 0},
"0000:81:01.0": {self.PCI_RC: 0},
}
del test_compute1_placement_pci_view["allocations"][server['id']]
self.assert_placement_pci_view(
"test_compute1", **test_compute1_placement_pci_view)
# now resize again and confirm it
with mock.patch(
'nova.virt.libvirt.driver.LibvirtDriver'
'.migrate_disk_and_power_off',
return_value='{}',
):
self._resize_server(server, pci_flavor_id)
self._confirm_resize(server)
# the source host now needs to be freed up
self.assertPCIDeviceCounts('test_compute0', total=1, free=1)
test_compute0_placement_pci_view["usages"] = {
"0000:81:00.0": {self.PCI_RC: 0},
}
test_compute0_placement_pci_view["allocations"] = {}
self.assert_placement_pci_view(
"test_compute0", **test_compute0_placement_pci_view)
# and the dest allocated
self.assertPCIDeviceCounts('test_compute1', total=2, free=0)
test_compute1_placement_pci_view["usages"] = {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
}
test_compute1_placement_pci_view["allocations"][
server['id']] = {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
}
self.assert_placement_pci_view(
"test_compute1", **test_compute1_placement_pci_view)
self.assert_no_pci_healing("test_compute0")
self.assert_no_pci_healing("test_compute1")
def _confirm_resize(self, server, host='host1'):
# NOTE(sbauza): Unfortunately, _cleanup_resize() in libvirt checks the
# host option to know the source hostname but given we have a global
@@ -2115,11 +2328,6 @@ class PCIServersTest(_PCIServersTestBase):
self.flags(host=orig_host)
def test_cold_migrate_server_with_pci(self):
-# FIXME(gibi): enable this once the allocation candidate filtering
-# in hardware.py and the allocation correlation in the PCI claim is
-# implemented
-self.mock_pci_in_placement_enabled.return_value = False
host_devices = {}
orig_create = nova.virt.libvirt.guest.Guest.create
@@ -2190,8 +2398,16 @@
}
flavor_id = self._create_flavor(extra_spec=extra_spec)
# force the allocation on test_compute0 to 81:00 to make it easy
# to assert the placement allocation
self._reserve_placement_resource(
"test_compute0_0000:81:01.0", self.PCI_RC, 1)
server_a = self._create_server(
flavor_id=flavor_id, networks='none', host='test_compute0')
# force the allocation on test_compute1 to 81:00 to make it easy
# to assert the placement allocation
self._reserve_placement_resource(
"test_compute1_0000:81:01.0", self.PCI_RC, 1)
server_b = self._create_server(
flavor_id=flavor_id, networks='none', host='test_compute1')
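The _reserve_placement_resource calls above make the expected placement view deterministic: reserving the full inventory of 0000:81:01.0 leaves only 0000:81:00.0 allocatable, and the reservation is reset to 0 later (next hunk) so the device becomes usable again. A hedged sketch of what such a helper can do under the hood via placement's inventory API; the real helper's internals may differ, and placement_client is a hypothetical client object:

    def reserve_all(placement_client, rp_uuid, resource_class, generation,
                    total=1):
        # reserved == total makes the inventory unallocatable; placement
        # accepts this in recent API microversions
        payload = {
            "resource_provider_generation": generation,  # concurrency guard
            "total": total,
            "reserved": total,
        }
        placement_client.put(
            "/resource_providers/%s/inventories/%s"
            % (rp_uuid, resource_class),
            payload,
        )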
@@ -2203,22 +2419,24 @@
for hostname in ('test_compute0', 'test_compute1'):
self.assertPCIDeviceCounts(hostname, total=2, free=1)
-# FIXME(gibi): This fails as the scheduler allocates different PCI dev
-# in placement than what the pci claim allocates on the host.
-# test_compute0_placement_pci_view[
-# "usages"]["0000:81:00.0"][self.PCI_RC] = 1
-# test_compute0_placement_pci_view[
-# "allocations"][server_a['id']] =
-# {"0000:81:00.0": {self.PCI_RC: 1}}
-# self.assert_placement_pci_view(
-# "test_compute0", **test_compute0_placement_pci_view)
-# test_compute1_placement_pci_view[
-# "usages"]["0000:81:00.0"][self.PCI_RC] = 1
-# test_compute1_placement_pci_view[
-# "allocations"][server_b['id']] =
-# {"0000:81:00.0": {self.PCI_RC: 1}}
-# self.assert_placement_pci_view(
-# "test_compute1", **test_compute1_placement_pci_view)
+test_compute0_placement_pci_view["usages"][
+"0000:81:00.0"][self.PCI_RC] = 1
+test_compute0_placement_pci_view["allocations"][
+server_a['id']] = {"0000:81:00.0": {self.PCI_RC: 1}}
+self.assert_placement_pci_view(
+"test_compute0", **test_compute0_placement_pci_view)
+test_compute1_placement_pci_view[
+"usages"]["0000:81:00.0"][self.PCI_RC] = 1
+test_compute1_placement_pci_view["allocations"][
+server_b['id']] = {"0000:81:00.0": {self.PCI_RC: 1}}
+self.assert_placement_pci_view(
+"test_compute1", **test_compute1_placement_pci_view)
+# remove the resource reservation from test_compute1 to be able to
+# migrate server_a there
+self._reserve_placement_resource(
+"test_compute1_0000:81:01.0", self.PCI_RC, 0)
# TODO(stephenfin): The mock of 'migrate_disk_and_power_off' should
# probably be less...dumb
@@ -2237,40 +2455,41 @@
server_a['OS-EXT-SRV-ATTR:host'], server_b['OS-EXT-SRV-ATTR:host'],
)
self.assertPCIDeviceCounts('test_compute0', total=2, free=1)
-# migration_uuid = self.get_migration_uuid_for_instance(server_a['id'])
-# test_compute0_placement_pci_view["allocations"][migration_uuid] = (
-# test_compute0_placement_pci_view["allocations"][server_a['id']])
-# del test_compute0_placement_pci_view["allocations"][server_a['id']]
-# self.assert_placement_pci_view(
-# "test_compute0", **test_compute0_placement_pci_view)
+# on the source host the allocation is now held by the migration UUID
+self._move_server_allocation(
+test_compute0_placement_pci_view["allocations"], server_a['id'])
+self.assert_placement_pci_view(
+"test_compute0", **test_compute0_placement_pci_view)
self.assertPCIDeviceCounts('test_compute1', total=2, free=0)
-# test_compute1_placement_pci_view[
-# "usages"]["0000:81:01.0"][self.PCI_RC] = 1
-# test_compute1_placement_pci_view[
-# "allocations"][server_a['id']] =
-# {"0000:81:01.0": {self.PCI_RC: 1}}
-# self.assert_placement_pci_view(
-# "test_compute1", **test_compute1_placement_pci_view)
+# server_a now has an allocation on test_compute1 on 81:01
+test_compute1_placement_pci_view["usages"][
+"0000:81:01.0"][self.PCI_RC] = 1
+test_compute1_placement_pci_view["allocations"][
+server_a['id']] = {"0000:81:01.0": {self.PCI_RC: 1}}
+self.assert_placement_pci_view(
+"test_compute1", **test_compute1_placement_pci_view)
# now, confirm the migration and check our counts once again
self._confirm_resize(server_a)
self.assertPCIDeviceCounts('test_compute0', total=2, free=2)
-# test_compute0_placement_pci_view["usages"] = {
-# "0000:81:00.0": {self.PCI_RC: 0},
-# "0000:81:01.0": {self.PCI_RC: 0},
-# }
-# del test_compute0_placement_pci_view["allocations"][migration_uuid]
-# self.assert_placement_pci_view(
-# "test_compute0", **test_compute0_placement_pci_view)
+# the source host now has no allocations as the migration allocation
+# is removed by confirm resize
+test_compute0_placement_pci_view["usages"] = {
+"0000:81:00.0": {self.PCI_RC: 0},
+"0000:81:01.0": {self.PCI_RC: 0},
+}
+test_compute0_placement_pci_view["allocations"] = {}
+self.assert_placement_pci_view(
+"test_compute0", **test_compute0_placement_pci_view)
self.assertPCIDeviceCounts('test_compute1', total=2, free=0)
-# self.assert_placement_pci_view(
-# "test_compute1", **test_compute1_placement_pci_view)
-#
-# self.assert_no_pci_healing("test_compute0")
-# self.assert_no_pci_healing("test_compute1")
+self.assert_placement_pci_view(
+"test_compute1", **test_compute1_placement_pci_view)
+self.assert_no_pci_healing("test_compute0")
+self.assert_no_pci_healing("test_compute1")
def test_request_two_pci_but_host_has_one(self):
# simulate a single type-PCI device on the host

nova/tests/unit/objects/test_request_spec.py

@@ -1146,7 +1146,7 @@ class TestInstancePCIRequestToRequestGroups(test.NoDBTestCase):
),
)
-spec._generate_request_groups_from_pci_requests()
+spec.generate_request_groups_from_pci_requests()
self.assertEqual(0, len(spec.requested_resources))
@@ -1164,7 +1164,7 @@ class TestInstancePCIRequestToRequestGroups(test.NoDBTestCase):
objects.InstancePCIRequest.NEUTRON_PORT, pci_req.source
)
-spec._generate_request_groups_from_pci_requests()
+spec.generate_request_groups_from_pci_requests()
self.assertEqual(0, len(spec.requested_resources))
@@ -1189,7 +1189,7 @@ class TestInstancePCIRequestToRequestGroups(test.NoDBTestCase):
),
)
-spec._generate_request_groups_from_pci_requests()
+spec.generate_request_groups_from_pci_requests()
self.assertEqual(2, len(spec.requested_resources))
self.assertEqual(
@@ -1224,7 +1224,7 @@ class TestInstancePCIRequestToRequestGroups(test.NoDBTestCase):
),
)
-spec._generate_request_groups_from_pci_requests()
+spec.generate_request_groups_from_pci_requests()
self.assertEqual(2, len(spec.requested_resources))
self.assertEqual(
@@ -1277,7 +1277,7 @@ class TestInstancePCIRequestToRequestGroups(test.NoDBTestCase):
),
)
-spec._generate_request_groups_from_pci_requests()
+spec.generate_request_groups_from_pci_requests()
self.assertEqual(2, len(spec.requested_resources))
self.assertEqual(