From 803f85d7e638c7367db709a93f732f12d81e083a Mon Sep 17 00:00:00 2001
From: Stephen Finucane
Date: Thu, 14 Jun 2018 17:04:48 +0100
Subject: [PATCH] scheduler: Start utilizing RequestSpec.network_metadata

Now that we have this information, we can use it to pre-filter suitable
hosts. With this patch we complete the blueprint. As a result, documentation
and release notes are bundled in the patch and previously inactive tests are
now enabled.

Part of blueprint numa-aware-vswitches

Change-Id: Ide262733ffd7714fdc702b31c61bdd42dbf7acc3
---
 doc/source/admin/cpu-topologies.rst                |   8 +
 doc/source/admin/index.rst                         |   1 +
 doc/source/admin/networking.rst                    | 184 ++++++++++++++++++
 nova/compute/api.py                                |  22 ++-
 .../scheduler/filters/numa_topology_filter.py      |   8 +
 .../functional/libvirt/test_numa_servers.py        |   4 +-
 nova/tests/unit/compute/test_compute_api.py        |   5 +-
 nova/tests/unit/compute/test_compute_cells.py      |   2 +-
 .../filters/test_numa_topology_filters.py          |  65 ++++++-
 ...numa-aware-vswitches-162132290dd6ef17.yaml      |  13 ++
 10 files changed, 298 insertions(+), 14 deletions(-)
 create mode 100644 doc/source/admin/networking.rst
 create mode 100644 releasenotes/notes/numa-aware-vswitches-162132290dd6ef17.yaml

diff --git a/doc/source/admin/cpu-topologies.rst b/doc/source/admin/cpu-topologies.rst
index ccc57514b848..f75bc95925a1 100644
--- a/doc/source/admin/cpu-topologies.rst
+++ b/doc/source/admin/cpu-topologies.rst
@@ -31,6 +31,14 @@ Simultaneous Multi-Threading (SMT)
   CPUs on the system and can execute workloads in parallel. However, as with
   NUMA, threads compete for shared resources.
 
+Non Uniform I/O Access (NUMA I/O)
+  In a NUMA system, I/O to a device mapped to a local memory region is more
+  efficient than I/O to a remote device. A device connected to the same socket
+  providing the CPU and memory offers lower latencies for I/O operations due
+  to its physical proximity. This generally manifests itself in devices
+  connected to the PCIe bus, such as NICs or vGPUs, but applies to any device
+  supporting memory-mapped I/O.
+
 In OpenStack, SMP CPUs are known as *cores*, NUMA cells or nodes are known as
 *sockets*, and SMT CPUs are known as *threads*. For example, a quad-socket,
 eight core system with Hyper-Threading would have four sockets, eight cores per
diff --git a/doc/source/admin/index.rst b/doc/source/admin/index.rst
index 01a777589f9b..c3ff1c2da8f1 100644
--- a/doc/source/admin/index.rst
+++ b/doc/source/admin/index.rst
@@ -31,6 +31,7 @@ operating system, and exposes functionality over a web-based API.
   manage-volumes.rst
   migration.rst
   networking-nova.rst
+  networking.rst
   node-down.rst
   pci-passthrough.rst
   quotas2.rst
diff --git a/doc/source/admin/networking.rst b/doc/source/admin/networking.rst
new file mode 100644
index 000000000000..83e4d3df3a21
--- /dev/null
+++ b/doc/source/admin/networking.rst
@@ -0,0 +1,184 @@
+=======================
+Networking with neutron
+=======================
+
+While nova uses the :neutron-doc:`OpenStack Networking service (neutron) <>`
+to provide network connectivity for instances, nova itself provides some
+additional features not possible with neutron alone. These are described
+below.
+
+
+SR-IOV
+------
+
+.. versionchanged:: 2014.2
+
+   The feature described below was first introduced in the Juno release.
+
+The SR-IOV specification defines a standardized mechanism to virtualize PCIe
+devices. This mechanism can virtualize a single PCIe Ethernet controller to
+appear as multiple PCIe devices. Each device can be directly assigned to an
+instance, bypassing the hypervisor and virtual switch layer. As a result,
+users are able to achieve low latency and near line-rate speeds.
+
+A full guide on configuring and using SR-IOV is provided in the
+:neutron-doc:`OpenStack Networking service documentation
+`
+
+
+NUMA Affinity
+-------------
+
+.. versionadded:: 18.0.0
+
+   The feature described below was first introduced in the Rocky release.
+
+.. important::
+
+   The functionality described below is currently only supported by the
+   libvirt/KVM driver.
+
+As described in :doc:`cpu-topologies`, NUMA is a computer architecture where
+memory accesses to certain regions of system memory can have higher latencies
+than other regions, depending on the CPU(s) your process is running on. This
+effect extends to devices connected to the PCIe bus, a concept known as NUMA
+I/O. Many Network Interface Cards (NICs) connect using the PCIe interface,
+meaning they are susceptible to the ill effects of poor NUMA affinitization.
+As a result, NUMA locality must be considered when creating an instance where
+high dataplane performance is a requirement.
+
+Fortunately, nova provides functionality to ensure NUMA affinity for
+instances using neutron. How this works depends on the type of port you are
+trying to use.
+
+.. todo::
+
+   Add documentation for PCI NUMA affinity and PCI policies and link to it
+   from here.
+
+For SR-IOV ports, virtual functions, which are PCI devices, are attached to
+the instance. This means the instance can benefit from the NUMA affinity
+guarantees provided for PCI devices. This happens automatically.
+
+For all other types of ports, some manual configuration is required.
+
+#. Identify the type of network(s) you wish to provide NUMA affinity for.
+
+   - If a network is an L2-type network (``provider:network_type`` of ``flat``
+     or ``vlan``), affinity of the network to given NUMA node(s) can vary
+     depending on the value of the ``provider:physical_network`` attribute of
+     the network, commonly referred to as the *physnet* of the network. This
+     is because most neutron drivers map each *physnet* to a different bridge,
+     to which multiple NICs are attached, or to a different (logical) NIC.
+
+   - If a network is an L3-type network (``provider:network_type`` of
+     ``vxlan``, ``gre`` or ``geneve``), all traffic will use the device to
+     which the *endpoint IP* is assigned. This means all L3 networks on a
+     given host will have affinity to the same NUMA node(s). Refer to
+     :neutron-doc:`the neutron documentation
+     ` for more information.
+
+#. Determine the NUMA affinity of the NICs attached to the given network(s).
+
+   How this should be achieved varies depending on the switching solution
+   used and whether the network is an L2-type network or an L3-type network.
+
+   Consider an L2-type network using the Linux Bridge mechanism driver. As
+   noted in the :neutron-doc:`neutron documentation
+   `, *physnets* are mapped to interfaces
+   using the ``[linux_bridge] physical_interface_mappings`` configuration
+   option. For example:
+
+   .. code-block:: ini
+
+      [linux_bridge]
+      physical_interface_mappings = provider:PROVIDER_INTERFACE
+
+   Once you have the device name, you can query *sysfs* to retrieve the NUMA
+   affinity for this device. For example:
+
+   .. code-block:: shell
+
+      $ cat /sys/class/net/PROVIDER_INTERFACE/device/numa_node
+
+   For an L3-type network using the Linux Bridge mechanism driver, the device
+   used will be configured using a protocol-specific endpoint IP
+   configuration option. For VXLAN, this is the ``[vxlan] local_ip`` option.
+   For example:
+
+   .. code-block:: ini
+
+      [vxlan]
+      local_ip = OVERLAY_INTERFACE_IP_ADDRESS
+
+   Once you have the IP address in question, you can use :command:`ip` to
+   identify the device that has been assigned this IP address and, from
+   there, query the NUMA affinity using *sysfs* as above.
+
+   .. note::
+
+      The example provided above is merely that: an example. How one should
+      identify this information can vary massively depending on the driver
+      used, whether bonding is used, the type of network used, etc.
+
+#. Configure NUMA affinity in ``nova.conf``.
+
+   Once you have identified the NUMA affinity of the devices used for your
+   networks, you need to configure this in ``nova.conf``. As before, how this
+   should be achieved varies depending on the type of network.
+
+   For L2-type networks, NUMA affinity is defined based on the
+   ``provider:physical_network`` attribute of the network. There are two
+   configuration options that must be set:
+
+   ``[neutron] physnets``
+     This should be set to the list of physnets for which you wish to provide
+     NUMA affinity. Refer to the :oslo.config:option:`documentation
+     ` for more information.
+
+   ``[neutron_physnet_{physnet}] numa_nodes``
+     This should be set to the list of NUMA node(s) that networks with the
+     given ``{physnet}`` should be affinitized to.
+
+   For L3-type networks, NUMA affinity is defined globally for all tunneled
+   networks on a given host. There is only one configuration option that must
+   be set:
+
+   ``[neutron_tunneled] numa_nodes``
+     This should be set to a list of one or more NUMA nodes to which
+     instances using tunneled networks will be affinitized.
+
+Examples
+~~~~~~~~
+
+Take an example of a deployment using L2-type networks first.
+
+.. code-block:: ini
+
+   [neutron]
+   physnets = foo,bar
+
+   [neutron_physnet_foo]
+   numa_nodes = 0
+
+   [neutron_physnet_bar]
+   numa_nodes = 2, 3
+
+This configuration ensures that instances using one or more L2-type networks
+with ``provider:physical_network=foo`` are scheduled on host cores from NUMA
+node 0, while instances using one or more networks with
+``provider:physical_network=bar`` are scheduled on host cores from both NUMA
+nodes 2 and 3. For the latter case, it will be necessary to split the guest
+across two or more host NUMA nodes using the ``hw:numa_nodes``
+:ref:`flavor extra spec `.
+
+Now, take an example of a deployment using L3-type networks.
+
+.. code-block:: ini
+
+   [neutron_tunneled]
+   numa_nodes = 0
+
+This is much simpler as all tunneled traffic uses the same logical interface.
+As with the L2-type networks, this configuration ensures that instances using
+one or more L3-type networks are scheduled on host cores from NUMA node 0. It
+is also possible to define more than one NUMA node, in which case the
+instance must be split across these nodes.
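The *sysfs* lookup described in the documentation above lends itself to a
small helper script. The following is a minimal sketch, not part of this
patch: the ``PHYSNET_INTERFACES`` mapping and the interface names in it are
assumptions chosen purely for illustration.

.. code-block:: python

   # Illustrative helper: report the NUMA node of the NIC backing each
   # physnet by reading the same sysfs attribute shown in the documentation.
   import os

   # Hypothetical physnet-to-interface mapping, mirroring what would be set
   # in the L2 agent configuration (e.g. physical_interface_mappings).
   PHYSNET_INTERFACES = {'foo': 'eth2', 'bar': 'eth3'}


   def physnet_numa_node(interface):
       """Return the NUMA node of a NIC, or None if it is not reported."""
       path = '/sys/class/net/%s/device/numa_node' % interface
       try:
           with open(path) as f:
               node = int(f.read().strip())
       except (IOError, OSError, ValueError):
           return None
       # sysfs reports -1 when the platform does not expose NUMA locality.
       return node if node >= 0 else None


   if __name__ == '__main__':
       for physnet, interface in sorted(PHYSNET_INTERFACES.items()):
           print('%s (%s): NUMA node %s'
                 % (physnet, interface, physnet_numa_node(interface)))
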
diff --git a/nova/compute/api.py b/nova/compute/api.py index 3cde93928446..6e20e6df22da 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -818,8 +818,9 @@ class API(base.Base): # InstancePCIRequests object pci_request_info = pci_request.get_pci_requests_from_flavor( instance_type) - self.network_api.create_resource_requests( - context, requested_networks, pci_request_info) + + network_metadata = self.network_api.create_resource_requests(context, + requested_networks, pci_request_info) base_options = { 'reservation_id': reservation_id, @@ -859,13 +860,15 @@ class API(base.Base): # return the validated options and maximum number of instances allowed # by the network quotas - return base_options, max_network_count, key_pair, security_groups + return (base_options, max_network_count, key_pair, security_groups, + network_metadata) def _provision_instances(self, context, instance_type, min_count, max_count, base_options, boot_meta, security_groups, block_device_mapping, shutdown_terminate, instance_group, check_server_group_quota, filter_properties, - key_pair, tags, trusted_certs, supports_multiattach=False): + key_pair, tags, trusted_certs, supports_multiattach=False, + network_metadata=None): # Check quotas num_instances = compute_utils.check_num_instances_quota( context, instance_type, min_count, max_count) @@ -901,6 +904,11 @@ class API(base.Base): req_spec.num_instances = num_instances req_spec.create() + # NOTE(stephenfin): The network_metadata field is not persisted + # and is therefore set after 'create' is called. + if network_metadata: + req_spec.network_metadata = network_metadata + # Create an instance object, but do not store in db yet. instance = objects.Instance(context=context) instance.uuid = instance_uuid @@ -1148,8 +1156,8 @@ class API(base.Base): self._check_auto_disk_config(image=boot_meta, auto_disk_config=auto_disk_config) - base_options, max_net_count, key_pair, security_groups = \ - self._validate_and_build_base_options( + base_options, max_net_count, key_pair, security_groups, \ + network_metadata = self._validate_and_build_base_options( context, instance_type, boot_meta, image_href, image_id, kernel_id, ramdisk_id, display_name, display_description, key_name, key_data, security_groups, availability_zone, @@ -1189,7 +1197,7 @@ class API(base.Base): boot_meta, security_groups, block_device_mapping, shutdown_terminate, instance_group, check_server_group_quota, filter_properties, key_pair, tags, trusted_certs, - supports_multiattach) + supports_multiattach, network_metadata) instances = [] request_specs = [] diff --git a/nova/scheduler/filters/numa_topology_filter.py b/nova/scheduler/filters/numa_topology_filter.py index 34a314279dab..6fc74d083d44 100644 --- a/nova/scheduler/filters/numa_topology_filter.py +++ b/nova/scheduler/filters/numa_topology_filter.py @@ -76,6 +76,10 @@ class NUMATopologyFilter(filters.BaseHostFilter): host_state) pci_requests = spec_obj.pci_requests + network_metadata = None + if 'network_metadata' in spec_obj: + network_metadata = spec_obj.network_metadata + if pci_requests: pci_requests = pci_requests.requests @@ -87,6 +91,10 @@ class NUMATopologyFilter(filters.BaseHostFilter): limits = objects.NUMATopologyLimits( cpu_allocation_ratio=cpu_ratio, ram_allocation_ratio=ram_ratio) + + if network_metadata: + limits.network_metadata = network_metadata + instance_topology = (hardware.numa_fit_instance_to_host( host_topology, requested_topology, limits=limits, diff --git a/nova/tests/functional/libvirt/test_numa_servers.py 
b/nova/tests/functional/libvirt/test_numa_servers.py index e7dd39d766ce..815656b4951d 100644 --- a/nova/tests/functional/libvirt/test_numa_servers.py +++ b/nova/tests/functional/libvirt/test_numa_servers.py @@ -326,9 +326,7 @@ class NUMAServersWithNetworksTest(NUMAServersTestBase): flavor_id, networks) self.assertTrue(filter_mock.called) - # TODO(stephenfin): Switch this to 'ERROR' once the final patch is - # merged - self.assertEqual('ACTIVE', status) + self.assertEqual('ERROR', status) def test_create_server_with_physnet_and_tunneled_net(self): """Test combination of physnet and tunneled network. diff --git a/nova/tests/unit/compute/test_compute_api.py b/nova/tests/unit/compute/test_compute_api.py index 9bef37e4067f..cea45f3a28a5 100644 --- a/nova/tests/unit/compute/test_compute_api.py +++ b/nova/tests/unit/compute/test_compute_api.py @@ -293,7 +293,7 @@ class _ComputeAPIUnitTestMixIn(object): mock.patch.object(self.compute_api, '_validate_and_build_base_options', return_value=({}, max_net_count, None, - ['default'])) + ['default'], None)) ) as ( get_image, check_auto_disk_config, @@ -6076,7 +6076,8 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase): with mock.patch.object( self.compute_api.security_group_api, 'get', return_value={'id': uuids.secgroup_uuid}) as scget: - base_options, max_network_count, key_pair, security_groups = ( + base_options, max_network_count, key_pair, security_groups, \ + network_metadata = ( self.compute_api._validate_and_build_base_options( self.context, instance_type, boot_meta, uuids.image_href, mock.sentinel.image_id, kernel_id, ramdisk_id, diff --git a/nova/tests/unit/compute/test_compute_cells.py b/nova/tests/unit/compute/test_compute_cells.py index b8e03ade71a2..1f9dde7313a3 100644 --- a/nova/tests/unit/compute/test_compute_cells.py +++ b/nova/tests/unit/compute/test_compute_cells.py @@ -628,7 +628,7 @@ class CellsConductorAPIRPCRedirect(test.NoDBTestCase): _validate, _get_image, _check_bdm, _provision, _record_action_start): _get_image.return_value = (None, 'fake-image') - _validate.return_value = ({}, 1, None, ['default']) + _validate.return_value = ({}, 1, None, ['default'], None) _check_bdm.return_value = objects.BlockDeviceMappingList() _provision.return_value = [] diff --git a/nova/tests/unit/scheduler/filters/test_numa_topology_filters.py b/nova/tests/unit/scheduler/filters/test_numa_topology_filters.py index 6d3ba5e0042c..935ac7e86bd1 100644 --- a/nova/tests/unit/scheduler/filters/test_numa_topology_filters.py +++ b/nova/tests/unit/scheduler/filters/test_numa_topology_filters.py @@ -28,13 +28,18 @@ class TestNUMATopologyFilter(test.NoDBTestCase): super(TestNUMATopologyFilter, self).setUp() self.filt_cls = numa_topology_filter.NUMATopologyFilter() - def _get_spec_obj(self, numa_topology): + def _get_spec_obj(self, numa_topology, network_metadata=None): image_meta = objects.ImageMeta(properties=objects.ImageMetaProps()) + spec_obj = objects.RequestSpec(numa_topology=numa_topology, pci_requests=None, instance_uuid=uuids.fake, flavor=objects.Flavor(extra_specs={}), image=image_meta) + + if network_metadata: + spec_obj.network_metadata = network_metadata + return spec_obj def test_numa_topology_filter_pass(self): @@ -230,3 +235,61 @@ class TestNUMATopologyFilter(test.NoDBTestCase): 'cpu_allocation_ratio': 16.0, 'ram_allocation_ratio': 1.5}) self.assertFalse(self.filt_cls.host_passes(host, spec_obj)) + + def _get_fake_host_state_with_networks(self): + network_a = objects.NetworkMetadata(physnets=set(['foo', 'bar']), + 
tunneled=False) + network_b = objects.NetworkMetadata(physnets=set(), tunneled=True) + host_topology = objects.NUMATopology(cells=[ + objects.NUMACell(id=1, cpuset=set([1, 2]), memory=2048, + cpu_usage=2, memory_usage=2048, mempages=[], + siblings=[set([1]), set([2])], + pinned_cpus=set([]), + network_metadata=network_a), + objects.NUMACell(id=2, cpuset=set([3, 4]), memory=2048, + cpu_usage=2, memory_usage=2048, mempages=[], + siblings=[set([3]), set([4])], + pinned_cpus=set([]), + network_metadata=network_b)]) + + return fakes.FakeHostState('host1', 'node1', { + 'numa_topology': host_topology, + 'pci_stats': None, + 'cpu_allocation_ratio': 16.0, + 'ram_allocation_ratio': 1.5}) + + def test_numa_topology_filter_pass_networks(self): + host = self._get_fake_host_state_with_networks() + + instance_topology = objects.InstanceNUMATopology(cells=[ + objects.InstanceNUMACell(id=0, cpuset=set([1]), memory=512), + objects.InstanceNUMACell(id=1, cpuset=set([3]), memory=512)]) + + network_metadata = objects.NetworkMetadata( + physnets=set(['foo']), tunneled=False) + spec_obj = self._get_spec_obj(numa_topology=instance_topology, + network_metadata=network_metadata) + self.assertTrue(self.filt_cls.host_passes(host, spec_obj)) + + # this should pass because while the networks are affined to different + # host NUMA nodes, our guest itself has multiple NUMA nodes + network_metadata = objects.NetworkMetadata( + physnets=set(['foo', 'bar']), tunneled=True) + spec_obj = self._get_spec_obj(numa_topology=instance_topology, + network_metadata=network_metadata) + self.assertTrue(self.filt_cls.host_passes(host, spec_obj)) + + def test_numa_topology_filter_fail_networks(self): + host = self._get_fake_host_state_with_networks() + + instance_topology = objects.InstanceNUMATopology(cells=[ + objects.InstanceNUMACell(id=0, cpuset=set([1]), memory=512)]) + + # this should fail because the networks are affined to different host + # NUMA nodes but our guest only has a single NUMA node + network_metadata = objects.NetworkMetadata( + physnets=set(['foo']), tunneled=True) + spec_obj = self._get_spec_obj(numa_topology=instance_topology, + network_metadata=network_metadata) + + self.assertFalse(self.filt_cls.host_passes(host, spec_obj)) diff --git a/releasenotes/notes/numa-aware-vswitches-162132290dd6ef17.yaml b/releasenotes/notes/numa-aware-vswitches-162132290dd6ef17.yaml new file mode 100644 index 000000000000..ac2452a8d9ee --- /dev/null +++ b/releasenotes/notes/numa-aware-vswitches-162132290dd6ef17.yaml @@ -0,0 +1,13 @@ +--- +features: + - | + It is now possible to configure NUMA affinity for most neutron networks. + This is available for networks that use a ``provider:network_type`` of + ``flat`` or ``vlan`` and a ``provider:physical_network`` (L2 networks) or + networks that use a ``provider:network_type`` of ``vxlan``, ``gre`` or + ``geneve`` (L3 networks). + + For more information, refer to the `spec`__ and `documentation`__. + + __ https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/numa-aware-vswitches.html + __ https://docs.openstack.org/nova/latest/admin/networking.html
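
To make the behaviour encoded by the new NUMATopologyFilter tests easier to
follow, here is a minimal, standalone sketch of the per-cell network check.
This is not the real ``nova.virt.hardware.numa_fit_instance_to_host``
implementation that the filter delegates to; the ``HostCell`` and
``RequestedNetworks`` types and the greedy provider selection are
simplifications introduced only for illustration.

.. code-block:: python

   # Simplified model: each host cell advertises the physnets and tunnel
   # support it is affined to, and a request's networks can only land on
   # cells that provide them. A guest fits only if it has enough cells of
   # its own to span the required host cells.
   import collections

   HostCell = collections.namedtuple('HostCell', ['id', 'physnets', 'tunneled'])
   RequestedNetworks = collections.namedtuple(
       'RequestedNetworks', ['physnets', 'tunneled'])


   def required_host_cells(host_cells, requested):
       """Greedily pick host cells that can serve the requested networks.

       Returns a set of host cell IDs, or None if some network cannot be
       served by any cell on this host.
       """
       required = set()
       for physnet in sorted(requested.physnets):
           providers = {cell.id for cell in host_cells
                        if physnet in cell.physnets}
           if not providers:
               return None
           if not required & providers:
               required.add(min(providers))
       if requested.tunneled:
           providers = {cell.id for cell in host_cells if cell.tunneled}
           if not providers:
               return None
           if not required & providers:
               required.add(min(providers))
       return required


   def networks_fit(host_cells, requested, guest_cell_count):
       required = required_host_cells(host_cells, requested)
       return required is not None and len(required) <= guest_cell_count


   host = [HostCell(1, {'foo', 'bar'}, False), HostCell(2, set(), True)]

   # Mirrors test_numa_topology_filter_pass_networks: 'foo' lives on cell 1.
   print(networks_fit(host, RequestedNetworks({'foo'}, False), 2))        # True
   # The networks span cells 1 and 2, but so does the two-cell guest.
   print(networks_fit(host, RequestedNetworks({'foo', 'bar'}, True), 2))  # True
   # Mirrors test_numa_topology_filter_fail_networks: a one-cell guest
   # cannot span cell 1 (physnet 'foo') and cell 2 (tunneled).
   print(networks_fit(host, RequestedNetworks({'foo'}, True), 1))         # False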