diff --git a/doc/source/admin/index.rst b/doc/source/admin/index.rst
index 6b031ba96806..3632294526e7 100644
--- a/doc/source/admin/index.rst
+++ b/doc/source/admin/index.rst
@@ -199,6 +199,7 @@ instance for these kind of workloads.
    virtual-gpu
    file-backed-memory
    ports-with-resource-requests
+   vdpa
    virtual-persistent-memory
    emulated-tpm
    uefi
diff --git a/doc/source/admin/vdpa.rst b/doc/source/admin/vdpa.rst
new file mode 100644
index 000000000000..8583d327cccf
--- /dev/null
+++ b/doc/source/admin/vdpa.rst
@@ -0,0 +1,92 @@
+============================
+Using ports vnic_type='vdpa'
+============================
+
+.. versionadded:: 23.0.0 (Wallaby)
+
+   Introduced support for vDPA.
+
+.. important::
+   The functionality described below is only supported by the
+   libvirt/KVM virt driver.
+
+The kernel vDPA (virtio Data Path Acceleration) framework provides a
+vendor-independent mechanism for offloading data-plane processing to
+software or hardware virtio device backends. While the kernel vDPA
+framework supports many types of vDPA devices, at this time nova only
+supports ``virtio-net`` devices using the ``vhost-vdpa`` front-end driver.
+Support for ``virtio-blk`` or ``virtio-gpu`` may be added in the future but
+is not currently planned for any specific release.
+
+vDPA device tracking
+~~~~~~~~~~~~~~~~~~~~
+
+When implementing support for vDPA-based neutron ports, one of the first
+decisions nova had to make was how to model the availability of vDPA
+devices and the capability to virtualize them. As the initial use case for
+this technology was to offload networking to hardware-offloaded OVS via
+neutron ports, the decision was made to extend the existing PCI tracker
+that is used for SR-IOV and PCI passthrough to support vDPA devices. As a
+simplification, nova assumes that the parent device of a vDPA device is an
+SR-IOV Virtual Function (VF); consequently, software-only vDPA devices such
+as those created by the kernel ``vdpa-sim`` sample module are not
+supported.
+
+To make vDPA devices available to be scheduled to guests, the operator
+should include the device using the PCI address or vendor ID and product ID
+of the parent VF in the PCI ``device_spec`` (an illustrative snippet is
+shown below).
+See: :nova-doc:`pci-passthrough ` for details.
+
+Nova will not create the VFs or vDPA devices automatically. It is expected
+that the operator will allocate them before starting the nova-compute
+agent. While no specific mechanism is prescribed to do this, udev rules or
+systemd service files are generally the recommended approach to ensure the
+devices are created consistently across reboots (a minimal sketch is shown
+below).
+
+.. note::
+   As vDPA is an offload only for the data plane and not the control plane,
+   a vDPA control plane is required to properly support vDPA device
+   passthrough. At the time of writing, only hardware-offloaded OVS is
+   supported when using vDPA with nova. Because of this, vDPA devices
+   cannot be requested using the PCI alias. While nova could allow vDPA
+   devices to be requested by the flavor using a PCI alias, we would not be
+   able to correctly configure the device as there would be no suitable
+   control plane. For this reason, vDPA devices are currently only
+   consumable via neutron ports.
+
+Virt driver support
+~~~~~~~~~~~~~~~~~~~
+
+Supporting neutron ports with ``vnic_type=vdpa`` depends on the capability
+of the virt driver. At this time only the ``libvirt`` virt driver with KVM
+is fully supported. QEMU may also work but is untested.
+
+vDPA support depends on kernel 5.7+, libvirt 6.9.0+ and QEMU 5.1+.
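+
+As an illustration of the device preparation described above, the following
+is a minimal sketch using the iproute2 ``vdpa`` tool. The interface name,
+VF count and PCI address are assumptions for this example; adjust them to
+match your hardware:
+
+.. code-block:: bash
+
+   # create the VFs on the parent PF (assumed to be enp6s0f0)
+   echo 8 > /sys/class/net/enp6s0f0/device/sriov_numvfs
+
+   # create a vDPA device on top of one of the resulting VFs
+   # (assumed to be the VF at PCI address 0000:06:00.2)
+   vdpa dev add name vdpa0 mgmtdev pci/0000:06:00.2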
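+
+The parent VFs can then be exposed to nova through the PCI ``device_spec``
+described in the device tracking section. The vendor and product IDs shown
+here are illustrative only; use the IDs your VFs actually report:
+
+.. code-block:: ini
+
+   [pci]
+   device_spec = { "vendor_id": "15b3", "product_id": "101e" }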
+
+vDPA lifecycle operations
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+At this time vDPA ports can only be added to a VM when it is first created.
+To do this, the normal SR-IOV workflow is used, whereby the port is first
+created in neutron and passed into nova as part of the server create
+request.
+
+.. code-block:: bash
+
+   openstack port create --network <network> --vnic-type vdpa vdpa-port
+   openstack server create --flavor <flavor> --image <image> --port vdpa-port vdpa-vm
+
+When vDPA support was first introduced, no move operations were supported.
+As this documentation was added in the change that enabled some move
+operations, the following should be interpreted both as a retrospective and
+a forward-looking viewpoint, and treated as a living document which will be
+updated as functionality evolves.
+
+23.0.0: initial support is added for creating a VM with vDPA ports; move
+operations are blocked in the API but implemented in code.
+
+26.0.0: support for all move operations except live migration is tested and
+the API blocks are removed.
+
+25.x.y: (planned) API block removal backported to stable/Yoga.
+
+24.x.y: (planned) API block removal backported to stable/Xena.
+
+23.x.y: (planned) API block removal backported to stable/Wallaby.
+
+26.0.0: (in progress) interface attach/detach, suspend/resume and hot-plug
+live migration are implemented to fully support all lifecycle operations on
+instances with vDPA ports.
+
+.. note::
+   The ``(planned)`` and ``(in progress)`` qualifiers will be removed when
+   those items are completed. If your current version of the document
+   contains those qualifiers then those lifecycle operations are
+   unsupported.
diff --git a/nova/compute/api.py b/nova/compute/api.py
index 66543f57dc0a..1d19473b2f30 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -4100,9 +4100,6 @@ class API:
     # finally split resize and cold migration into separate code paths
     @block_extended_resource_request
     @block_port_accelerators()
-    # FIXME(sean-k-mooney): Cold migrate and resize to different hosts
-    # probably works but they have not been tested so block them for now
-    @reject_vdpa_instances(instance_actions.RESIZE)
     @block_accelerators()
     @check_instance_lock
     @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED])
@@ -4341,10 +4338,7 @@ class API:
         allow_same_host = CONF.allow_resize_to_same_host
         return allow_same_host

-    # FIXME(sean-k-mooney): Shelve works but unshelve does not due to bug
-    # #1851545, so block it for now
     @block_port_accelerators()
-    @reject_vdpa_instances(instance_actions.SHELVE)
     @reject_vtpm_instances(instance_actions.SHELVE)
     @block_accelerators(until_service=54)
     @check_instance_lock
@@ -5565,8 +5559,6 @@ class API:

     @block_extended_resource_request
     @block_port_accelerators()
-    # FIXME(sean-k-mooney): rebuild works but we have not tested evacuate yet
-    @reject_vdpa_instances(instance_actions.EVACUATE)
     @reject_vtpm_instances(instance_actions.EVACUATE)
     @block_accelerators(until_service=SUPPORT_ACCELERATOR_SERVICE_FOR_REBUILD)
     @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
index d5e5be8ebf1d..2ca6c3f9c93e 100644
--- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py
+++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
@@ -1101,7 +1101,7 @@ class VDPAServersTest(_PCIServersTestBase):
         # fixture already stubbed.
         self.neutron = self.useFixture(base.LibvirtNeutronFixture(self))

-    def start_compute(self):
+    def start_vdpa_compute(self, hostname='compute-0'):
         vf_ratio = self.NUM_VFS // self.NUM_PFS

         pci_info = fakelibvirt.HostPCIDevicesInfo(
@@ -1139,7 +1139,7 @@ class VDPAServersTest(_PCIServersTestBase):
                 driver_name='mlx5_core')
             vdpa_info.add_device(f'vdpa_vdpa{idx}', idx, vf)

-        return super().start_compute(
+        return super().start_compute(hostname=hostname,
             pci_info=pci_info, vdpa_info=vdpa_info,
             libvirt_version=self.FAKE_LIBVIRT_VERSION,
             qemu_version=self.FAKE_QEMU_VERSION)
@@ -1194,7 +1194,7 @@ class VDPAServersTest(_PCIServersTestBase):
             fake_create,
         )

-        hostname = self.start_compute()
+        hostname = self.start_vdpa_compute()

         num_pci = self.NUM_PFS + self.NUM_VFS
         # both the PF and VF with vDPA capabilities (dev_type=vdpa) should have
@@ -1227,12 +1227,16 @@ class VDPAServersTest(_PCIServersTestBase):
             port['binding:profile'],
         )

-    def _test_common(self, op, *args, **kwargs):
-        self.start_compute()
-
+    def _create_port_and_server(self):
         # create the port and a server, with the port attached to the server
         vdpa_port = self.create_vdpa_port()
         server = self._create_server(networks=[{'port': vdpa_port['id']}])
+        return vdpa_port, server
+
+    def _test_common(self, op, *args, **kwargs):
+        self.start_vdpa_compute()
+
+        vdpa_port, server = self._create_port_and_server()

         # attempt the unsupported action and ensure it fails
         ex = self.assertRaises(
@@ -1243,13 +1247,11 @@ class VDPAServersTest(_PCIServersTestBase):
             ex.response.text)

     def test_attach_interface(self):
-        self.start_compute()
-
+        self.start_vdpa_compute()
         # create the port and a server, but don't attach the port to the
         # server yet
         vdpa_port = self.create_vdpa_port()
         server = self._create_server(networks='none')
-
         # attempt to attach the port to the server
         ex = self.assertRaises(
             client.OpenStackApiException,
@@ -1261,22 +1263,283 @@ class VDPAServersTest(_PCIServersTestBase):
     def test_detach_interface(self):
         self._test_common(self._detach_interface, uuids.vdpa_port)

-    def test_shelve(self):
-        self._test_common(self._shelve_server)
+    def test_shelve_offload(self):
+        hostname = self.start_vdpa_compute()
+        vdpa_port, server = self._create_port_and_server()
+        # assert the port is bound to the vm and the compute host
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(server['id'], port['device_id'])
+        self.assertEqual(hostname, port['binding:host_id'])
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        # -2 because claiming the vDPA device makes the parent PF unavailable
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci - 2)
+        server = self._shelve_server(server)
+        # now that the vm is shelve offloaded it should not be bound
+        # to any host but should still be owned by the vm
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(server['id'], port['device_id'])
+        # FIXME(sean-k-mooney): we should be unbinding the port from
+        # the host when we shelve offload but we don't today.
+        # This is unrelated to vdpa ports and is a general issue.
+        self.assertEqual(hostname, port['binding:host_id'])
+        self.assertIn('binding:profile', port)
+        self.assertIsNone(server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        self.assertIsNone(server['OS-EXT-SRV-ATTR:host'])
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci)
+
+    def test_unshelve_to_same_host(self):
+        hostname = self.start_vdpa_compute()
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci)
+
+        vdpa_port, server = self._create_port_and_server()
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            hostname, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(hostname, port['binding:host_id'])
+
+        server = self._shelve_server(server)
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci)
+        self.assertIsNone(server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        # FIXME(sean-k-mooney): shelve offload should unbind the port
+        # self.assertEqual('', port['binding:host_id'])
+        self.assertEqual(hostname, port['binding:host_id'])
+
+        server = self._unshelve_server(server)
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            hostname, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(hostname, port['binding:host_id'])
+
+    def test_unshelve_to_different_host(self):
+        source = self.start_vdpa_compute(hostname='source')
+        dest = self.start_vdpa_compute(hostname='dest')
+
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+
+        # ensure we boot the vm on the "source" compute
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'disabled'})
+        vdpa_port, server = self._create_port_and_server()
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            source, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(source, port['binding:host_id'])
+
+        server = self._shelve_server(server)
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertIsNone(server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        # FIXME(sean-k-mooney): shelve should unbind the port
+        # self.assertEqual('', port['binding:host_id'])
+        self.assertEqual(source, port['binding:host_id'])
+
+        # force the unshelve to the other host
+        self.api.put_service(
+            self.computes['source'].service_ref.uuid, {'status': 'disabled'})
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'enabled'})
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+        server = self._unshelve_server(server)
+        # the dest devices should be claimed
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+        # and the source host devices should still be free
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertEqual(
+            dest, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(dest, port['binding:host_id'])
+
+    def test_evacuate(self):
+        source = self.start_vdpa_compute(hostname='source')
+        dest = self.start_vdpa_compute(hostname='dest')
+
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+
+        # ensure we boot the vm on the "source" compute
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'disabled'})
+        vdpa_port, server = self._create_port_and_server()
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            source, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(source, port['binding:host_id'])
+
+        # stop the source compute and enable the dest
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'enabled'})
+        self.computes['source'].stop()
+        # down the source compute to enable the evacuation
+        self.api.put_service(
+            self.computes['source'].service_ref.uuid, {'forced_down': True})
+
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+        server = self._evacuate_server(server)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            dest, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(dest, port['binding:host_id'])
+
+        # as the source compute is offline the pci claims will not be cleaned
+        # up on the source compute.
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        # but if you fix/restart the source node the allocations for evacuated
+        # instances should be released.
+        self.restart_compute_service(source)
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+
+    def test_resize_same_host(self):
+        self.flags(allow_resize_to_same_host=True)
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        source = self.start_vdpa_compute()
+        vdpa_port, server = self._create_port_and_server()
+        # before we resize, the vm should be using 1 VF, but that marks the
+        # parent PF as unavailable, so we assert 2 devices are in use.
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        flavor_id = self._create_flavor(name='new-flavor')
+        self.assertNotEqual(server['flavor']['original_name'], 'new-flavor')
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off', return_value='{}',
+        ):
+            server = self._resize_server(server, flavor_id)
+        self.assertEqual(
+            server['flavor']['original_name'], 'new-flavor')
+        # while the resize is unconfirmed the VF claims are doubled, even for
+        # a same-host resize, so assert that 3 devices are in use:
+        # 1 PF and 2 VFs.
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 3)
+        server = self._confirm_resize(server)
+        # but once we confirm it should be reduced back to 1 PF and 1 VF
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        # assert the hostname has not changed as part of the resize
+        self.assertEqual(
+            source, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+
+    def test_resize_different_host(self):
+        self.flags(allow_resize_to_same_host=False)
+        source = self.start_vdpa_compute(hostname='source')
+        dest = self.start_vdpa_compute(hostname='dest')
+
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+
+        # ensure we boot the vm on the "source" compute
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'disabled'})
+        vdpa_port, server = self._create_port_and_server()
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        flavor_id = self._create_flavor(name='new-flavor')
+        self.assertNotEqual(server['flavor']['original_name'], 'new-flavor')
+        # disable the source compute and enable the dest
+        self.api.put_service(
+            self.computes['source'].service_ref.uuid, {'status': 'disabled'})
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'enabled'})
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off', return_value='{}',
+        ):
+            server = self._resize_server(server, flavor_id)
+        self.assertEqual(
+            server['flavor']['original_name'], 'new-flavor')
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+        server = self._confirm_resize(server)
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            dest, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+
+    def test_resize_revert(self):
+        self.flags(allow_resize_to_same_host=False)
+        source = self.start_vdpa_compute(hostname='source')
+        dest = self.start_vdpa_compute(hostname='dest')
+
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+
+        # ensure we boot the vm on the "source" compute
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'disabled'})
+        vdpa_port, server = self._create_port_and_server()
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        flavor_id = self._create_flavor(name='new-flavor')
+        self.assertNotEqual(server['flavor']['original_name'], 'new-flavor')
+        # disable the source compute and enable the dest
+        self.api.put_service(
+            self.computes['source'].service_ref.uuid, {'status': 'disabled'})
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'enabled'})
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off', return_value='{}',
+        ):
+            server = self._resize_server(server, flavor_id)
+        self.assertEqual(
+            server['flavor']['original_name'], 'new-flavor')
+        # while the resize is unconfirmed both the dest and source pci
+        # claims should be present.
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+        server = self._revert_resize(server)
+        # but once we revert the dest claims should be freed.
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            source, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+
+    def test_cold_migrate(self):
+        source = self.start_vdpa_compute(hostname='source')
+        dest = self.start_vdpa_compute(hostname='dest')
+
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+
+        # ensure we boot the vm on the "source" compute
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'disabled'})
+        vdpa_port, server = self._create_port_and_server()
+        self.assertEqual(
+            source, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        # enable the dest; we do not need to disable the source since cold
+        # migrate won't happen to the same host in the libvirt driver
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'enabled'})
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off', return_value='{}',
+        ):
+            server = self._migrate_server(server)
+        self.assertEqual(
+            dest, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+        server = self._confirm_resize(server)
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            dest, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])

     def test_suspend(self):
         self._test_common(self._suspend_server)

-    def test_evacuate(self):
-        self._test_common(self._evacuate_server)
-
-    def test_resize(self):
-        flavor_id = self._create_flavor()
-        self._test_common(self._resize_server, flavor_id)
-
-    def test_cold_migrate(self):
-        self._test_common(self._migrate_server)
-

 class PCIServersTest(_PCIServersTestBase):
diff --git a/releasenotes/notes/vdpa-move-ops-a7b3799807807a92.yaml b/releasenotes/notes/vdpa-move-ops-a7b3799807807a92.yaml
new file mode 100644
index 000000000000..2580f73d35b4
--- /dev/null
+++ b/releasenotes/notes/vdpa-move-ops-a7b3799807807a92.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    When vDPA was first introduced, move operations were implemented in the
+    code but untested, either in a real environment or in functional tests.
+    Due to this gap, nova elected to block move operations for instances
+    with vDPA devices. All move operations except live migration have now
+    been tested and found to work, so the API blocks have been removed and
+    functional tests added. Other operations, such as suspend and live
+    migration, require code changes to support and will be enabled as new
+    features in the future.