From 83cd67cd89ba58243d85db8e82485bda6fd00fde Mon Sep 17 00:00:00 2001
From: Stephen Finucane
Date: Tue, 2 Jun 2015 13:16:41 +0100
Subject: [PATCH] Add 'hw:cpu_thread_policy=require' scheduling

The 'require' CPU thread policy ensures that hosts have an SMT
architecture and that it is possible to schedule an instance utilising
the thread siblings. The 'require' policy is only applied when
explicitly requested by the instance or image.

The existing 'prefer' policy (though it wasn't called this before) is
retained and is used both when requested and as the default, per the
spec. A stub is included for the remaining 'isolate' case.

A check is added to the NUMATopologyFilter to exclude hosts without
sibling support, ensuring that hosts which cannot satisfy this thread
policy are quickly discarded.

Finally, some tests that exercised the 'require'-like aspects of the
'prefer' policy are reworked to request the 'require' policy
explicitly.

Change-Id: Idedf16a734d9d85db08269d05f8c147959a17679
Implements: blueprint virt-driver-cpu-thread-pinning
---
 .../scheduler/filters/numa_topology_filter.py |  43 +++++++
 nova/tests/unit/scheduler/fakes.py            |  11 ++
 .../filters/test_numa_topology_filters.py     | 112 ++++++++++++++----
 nova/tests/unit/virt/test_hardware.py         |  49 ++++++++
 nova/virt/hardware.py                         | 106 +++++++++++------
 5 files changed, 262 insertions(+), 59 deletions(-)

diff --git a/nova/scheduler/filters/numa_topology_filter.py b/nova/scheduler/filters/numa_topology_filter.py
index 680a97ad1845..2417f43927b0 100644
--- a/nova/scheduler/filters/numa_topology_filter.py
+++ b/nova/scheduler/filters/numa_topology_filter.py
@@ -13,6 +13,7 @@
 from oslo_log import log as logging
 
 from nova import objects
+from nova.objects import fields
 from nova.scheduler import filters
 from nova.virt import hardware
 
@@ -22,15 +23,57 @@ LOG = logging.getLogger(__name__)
 class NUMATopologyFilter(filters.BaseHostFilter):
     """Filter on requested NUMA topology."""
 
+    def _satisfies_cpu_policy(self, host_state, extra_specs, image_props):
+        """Check that the host_state provided satisfies any available
+        CPU policy requirements.
+        """
+        host_topology, _ = hardware.host_topology_and_format_from_host(
+            host_state)
+        # NOTE(stephenfin): There can be conflicts between the policy
+        # specified by the image and that specified by the instance, but this
+        # is not the place to resolve them. We do this during scheduling.
+        cpu_policy = [extra_specs.get('hw:cpu_policy'),
+                      image_props.get('hw_cpu_policy')]
+        cpu_thread_policy = [extra_specs.get('hw:cpu_thread_policy'),
+                             image_props.get('hw_cpu_thread_policy')]
+
+        if not host_topology:
+            return True
+
+        if fields.CPUAllocationPolicy.DEDICATED not in cpu_policy:
+            return True
+
+        if fields.CPUThreadAllocationPolicy.REQUIRE not in cpu_thread_policy:
+            return True
+
+        for cell in host_topology.cells:
+            # the presence of siblings indicates hyperthreading (HT)
+            if not cell.siblings:
+                LOG.debug("%(host_state)s fails CPU policy requirements. "
" + "Host does not have hyperthreading or " + "hyperthreading is disabled, but 'require' threads " + "policy was requested.", {'host_state': host_state}) + return False + + return True + def host_passes(self, host_state, spec_obj): ram_ratio = host_state.ram_allocation_ratio cpu_ratio = host_state.cpu_allocation_ratio + extra_specs = spec_obj.flavor.extra_specs + image_props = spec_obj.image.properties requested_topology = spec_obj.numa_topology host_topology, _fmt = hardware.host_topology_and_format_from_host( host_state) pci_requests = spec_obj.pci_requests + if pci_requests: pci_requests = pci_requests.requests + + if not self._satisfies_cpu_policy(host_state, extra_specs, + image_props): + return False + if requested_topology and host_topology: limits = objects.NUMATopologyLimits( cpu_allocation_ratio=cpu_ratio, diff --git a/nova/tests/unit/scheduler/fakes.py b/nova/tests/unit/scheduler/fakes.py index c6991f8da7e8..646e6cdb9c24 100644 --- a/nova/tests/unit/scheduler/fakes.py +++ b/nova/tests/unit/scheduler/fakes.py @@ -32,6 +32,17 @@ NUMA_TOPOLOGY = objects.NUMATopology( cpu_usage=0, memory_usage=0, mempages=[], siblings=[], pinned_cpus=set([]))]) +NUMA_TOPOLOGY_W_HT = objects.NUMATopology(cells=[ + objects.NUMACell( + id=0, cpuset=set([1, 2, 5, 6]), memory=512, + cpu_usage=0, memory_usage=0, mempages=[], + siblings=[set([1, 5]), set([2, 6])], pinned_cpus=set([])), + objects.NUMACell( + id=1, cpuset=set([3, 4, 7, 8]), memory=512, + cpu_usage=0, memory_usage=0, mempages=[], + siblings=[set([3, 4]), set([7, 8])], pinned_cpus=set([])) +]) + COMPUTE_NODES = [ objects.ComputeNode( id=1, local_gb=1024, memory_mb=1024, vcpus=1, diff --git a/nova/tests/unit/scheduler/filters/test_numa_topology_filters.py b/nova/tests/unit/scheduler/filters/test_numa_topology_filters.py index fc57ad2a9341..abcce20e3aa0 100644 --- a/nova/tests/unit/scheduler/filters/test_numa_topology_filters.py +++ b/nova/tests/unit/scheduler/filters/test_numa_topology_filters.py @@ -10,9 +10,11 @@ # License for the specific language governing permissions and limitations # under the License. 
+import itertools
 import uuid
 
 from nova import objects
+from nova.objects import fields
 from nova.scheduler.filters import numa_topology_filter
 from nova import test
 from nova.tests.unit.scheduler import fakes
@@ -24,14 +26,21 @@ class TestNUMATopologyFilter(test.NoDBTestCase):
         super(TestNUMATopologyFilter, self).setUp()
         self.filt_cls = numa_topology_filter.NUMATopologyFilter()
 
+    def _get_spec_obj(self, numa_topology):
+        image_meta = objects.ImageMeta(properties=objects.ImageMetaProps())
+        spec_obj = objects.RequestSpec(numa_topology=numa_topology,
+                                       pci_requests=None,
+                                       instance_uuid=str(uuid.uuid4()),
+                                       flavor=objects.Flavor(extra_specs={}),
+                                       image=image_meta)
+        return spec_obj
+
     def test_numa_topology_filter_pass(self):
         instance_topology = objects.InstanceNUMATopology(
             cells=[objects.InstanceNUMACell(id=0, cpuset=set([1]), memory=512),
                    objects.InstanceNUMACell(id=1, cpuset=set([3]), memory=512)
             ])
-        spec_obj = objects.RequestSpec(numa_topology=instance_topology,
-                                       pci_requests=None,
-                                       instance_uuid=str(uuid.uuid4()))
+        spec_obj = self._get_spec_obj(numa_topology=instance_topology)
         host = fakes.FakeHostState('host1', 'node1',
                                    {'numa_topology': fakes.NUMA_TOPOLOGY,
                                     'pci_stats': None,
@@ -45,16 +54,12 @@ class TestNUMATopologyFilter(test.NoDBTestCase):
                    objects.InstanceNUMACell(id=1, cpuset=set([3]), memory=512)
             ])
-        spec_obj = objects.RequestSpec(numa_topology=instance_topology,
-                                       pci_requests=None,
-                                       instance_uuid=str(uuid.uuid4()))
+        spec_obj = self._get_spec_obj(numa_topology=instance_topology)
 
         host = fakes.FakeHostState('host1', 'node1', {'pci_stats': None})
         self.assertFalse(self.filt_cls.host_passes(host, spec_obj))
 
     def test_numa_topology_filter_numa_host_no_numa_instance_pass(self):
-        spec_obj = objects.RequestSpec(numa_topology=None,
-                                       pci_requests=None,
-                                       instance_uuid=str(uuid.uuid4()))
+        spec_obj = self._get_spec_obj(numa_topology=None)
         host = fakes.FakeHostState('host1', 'node1',
                                    {'numa_topology': fakes.NUMA_TOPOLOGY})
         self.assertTrue(self.filt_cls.host_passes(host, spec_obj))
@@ -65,9 +70,7 @@ class TestNUMATopologyFilter(test.NoDBTestCase):
                    objects.InstanceNUMACell(id=1, cpuset=set([2]), memory=512),
                    objects.InstanceNUMACell(id=2, cpuset=set([3]), memory=512)
             ])
-        spec_obj = objects.RequestSpec(numa_topology=instance_topology,
-                                       pci_requests=None,
-                                       instance_uuid=str(uuid.uuid4()))
+        spec_obj = self._get_spec_obj(numa_topology=instance_topology)
         host = fakes.FakeHostState('host1', 'node1',
                                    {'numa_topology': fakes.NUMA_TOPOLOGY,
                                     'pci_stats': None,
@@ -81,9 +84,7 @@ class TestNUMATopologyFilter(test.NoDBTestCase):
                                             memory=1024),
                    objects.InstanceNUMACell(id=1, cpuset=set([3]), memory=512)
             ])
-        spec_obj = objects.RequestSpec(numa_topology=instance_topology,
-                                       pci_requests=None,
-                                       instance_uuid=str(uuid.uuid4()))
+        spec_obj = self._get_spec_obj(numa_topology=instance_topology)
         host = fakes.FakeHostState('host1', 'node1',
                                    {'numa_topology': fakes.NUMA_TOPOLOGY,
                                     'pci_stats': None,
@@ -96,9 +97,7 @@ class TestNUMATopologyFilter(test.NoDBTestCase):
             cells=[objects.InstanceNUMACell(id=0, cpuset=set([1]), memory=512),
                    objects.InstanceNUMACell(id=1, cpuset=set([3, 4, 5]),
                                             memory=512)])
-        spec_obj = objects.RequestSpec(numa_topology=instance_topology,
-                                       pci_requests=None,
-                                       instance_uuid=str(uuid.uuid4()))
+        spec_obj = self._get_spec_obj(numa_topology=instance_topology)
         host = fakes.FakeHostState('host1', 'node1',
                                    {'numa_topology': fakes.NUMA_TOPOLOGY,
                                     'pci_stats': None,
@@ -111,9 +110,7 @@ class TestNUMATopologyFilter(test.NoDBTestCase):
             cells=[objects.InstanceNUMACell(id=0, cpuset=set([1]), memory=512),
                    objects.InstanceNUMACell(id=1, cpuset=set([3]), memory=512)
             ])
-        spec_obj = objects.RequestSpec(numa_topology=instance_topology,
-                                       pci_requests=None,
-                                       instance_uuid=str(uuid.uuid4()))
+        spec_obj = self._get_spec_obj(numa_topology=instance_topology)
         host = fakes.FakeHostState('host1', 'node1',
                                    {'numa_topology': fakes.NUMA_TOPOLOGY,
                                     'pci_stats': None,
@@ -123,3 +120,76 @@ class TestNUMATopologyFilter(test.NoDBTestCase):
         limits = host.limits['numa_topology']
         self.assertEqual(limits.cpu_allocation_ratio, 21)
         self.assertEqual(limits.ram_allocation_ratio, 1.3)
+
+    def _do_test_numa_topology_filter_cpu_policy(
+            self, numa_topology, cpu_policy, cpu_thread_policy, passes):
+        instance_topology = objects.InstanceNUMATopology(
+            cells=[objects.InstanceNUMACell(id=0, cpuset=set([1]), memory=512),
+                   objects.InstanceNUMACell(id=1, cpuset=set([3]), memory=512)
+            ])
+        spec_obj = objects.RequestSpec(numa_topology=instance_topology,
+                                       pci_requests=None,
+                                       instance_uuid=str(uuid.uuid4()))
+
+        extra_specs = [
+            {},
+            {
+                'hw:cpu_policy': cpu_policy,
+                'hw:cpu_thread_policy': cpu_thread_policy,
+            }
+        ]
+        image_props = [
+            {},
+            {
+                'hw_cpu_policy': cpu_policy,
+                'hw_cpu_thread_policy': cpu_thread_policy,
+            }
+        ]
+        host = fakes.FakeHostState('host1', 'node1', {
+            'numa_topology': numa_topology,
+            'pci_stats': None,
+            'cpu_allocation_ratio': 1,
+            'ram_allocation_ratio': 1.5})
+        assertion = self.assertTrue if passes else self.assertFalse
+
+        # test combinations of image properties and extra specs
+        for specs, props in itertools.product(extra_specs, image_props):
+            # ...except for the one where no policy is specified
+            if specs == props == {}:
+                continue
+
+            fake_flavor = objects.Flavor(memory_mb=1024, extra_specs=specs)
+            fake_image_props = objects.ImageMetaProps(**props)
+            fake_image = objects.ImageMeta(properties=fake_image_props)
+
+            spec_obj.image = fake_image
+            spec_obj.flavor = fake_flavor
+
+            assertion(self.filt_cls.host_passes(host, spec_obj))
+
+    def test_numa_topology_filter_fail_cpu_thread_policy_require(self):
+        cpu_policy = fields.CPUAllocationPolicy.DEDICATED
+        cpu_thread_policy = fields.CPUThreadAllocationPolicy.REQUIRE
+        numa_topology = fakes.NUMA_TOPOLOGY
+
+        self._do_test_numa_topology_filter_cpu_policy(
+            numa_topology, cpu_policy, cpu_thread_policy, False)
+
+    def test_numa_topology_filter_pass_cpu_thread_policy_require(self):
+        cpu_policy = fields.CPUAllocationPolicy.DEDICATED
+        cpu_thread_policy = fields.CPUThreadAllocationPolicy.REQUIRE
+        numa_topology = fakes.NUMA_TOPOLOGY_W_HT
+
+        self._do_test_numa_topology_filter_cpu_policy(
+            numa_topology, cpu_policy, cpu_thread_policy, True)
+
+    def test_numa_topology_filter_pass_cpu_thread_policy_others(self):
+        cpu_policy = fields.CPUAllocationPolicy.DEDICATED
+        cpu_thread_policy = fields.CPUThreadAllocationPolicy.PREFER
+        numa_topology = fakes.NUMA_TOPOLOGY
+
+        for cpu_thread_policy in [
+                fields.CPUThreadAllocationPolicy.PREFER,
+                fields.CPUThreadAllocationPolicy.ISOLATE]:
+            self._do_test_numa_topology_filter_cpu_policy(
+                numa_topology, cpu_policy, cpu_thread_policy, True)
diff --git a/nova/tests/unit/virt/test_hardware.py b/nova/tests/unit/virt/test_hardware.py
index 6f4c140fcb13..69579ad21246 100644
--- a/nova/tests/unit/virt/test_hardware.py
+++ b/nova/tests/unit/virt/test_hardware.py
@@ -2103,6 +2103,55 @@ class CPUPinningCellTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
         got_pinning = {x: x for x in range(0, 4)}
         self.assertEqual(got_pinning, inst_pin.cpu_pinning)
 
+    def test_get_pinning_require_policy_too_few_siblings(self):
+        host_pin = objects.NUMACell(
+            id=0,
+            cpuset=set([0, 1, 2, 3, 4, 5, 6, 7]),
+            memory=4096, memory_usage=0,
+            pinned_cpus=set([0, 1, 2]),
+            siblings=[set([0, 4]), set([1, 5]), set([2, 6]), set([3, 7])],
+            mempages=[])
+        inst_pin = objects.InstanceNUMACell(
+            cpuset=set([0, 1, 2, 3]),
+            memory=2048,
+            cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+            cpu_thread_policy=fields.CPUThreadAllocationPolicy.REQUIRE)
+        inst_pin = hw._numa_fit_instance_cell_with_pinning(host_pin, inst_pin)
+        self.assertIsNone(inst_pin)
+
+    def test_get_pinning_require_policy_fits(self):
+        host_pin = objects.NUMACell(id=0, cpuset=set([0, 1, 2, 3]),
+                                    memory=4096, memory_usage=0,
+                                    siblings=[set([0, 1]), set([2, 3])],
+                                    mempages=[], pinned_cpus=set([]))
+        inst_pin = objects.InstanceNUMACell(
+            cpuset=set([0, 1, 2, 3]),
+            memory=2048,
+            cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+            cpu_thread_policy=fields.CPUThreadAllocationPolicy.REQUIRE)
+        inst_pin = hw._numa_fit_instance_cell_with_pinning(host_pin, inst_pin)
+        self.assertInstanceCellPinned(inst_pin)
+        got_topo = objects.VirtCPUTopology(sockets=1, cores=2, threads=2)
+        self.assertEqualTopology(got_topo, inst_pin.cpu_topology)
+
+    def test_get_pinning_require_policy_fits_w_usage(self):
+        host_pin = objects.NUMACell(
+            id=0,
+            cpuset=set([0, 1, 2, 3, 4, 5, 6, 7]),
+            memory=4096, memory_usage=0,
+            pinned_cpus=set([0, 1]),
+            siblings=[set([0, 4]), set([1, 5]), set([2, 6]), set([3, 7])],
+            mempages=[])
+        inst_pin = objects.InstanceNUMACell(
+            cpuset=set([0, 1, 2, 3]),
+            memory=2048,
+            cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+            cpu_thread_policy=fields.CPUThreadAllocationPolicy.REQUIRE)
+        inst_pin = hw._numa_fit_instance_cell_with_pinning(host_pin, inst_pin)
+        self.assertInstanceCellPinned(inst_pin)
+        got_topo = objects.VirtCPUTopology(sockets=1, cores=2, threads=2)
+        self.assertEqualTopology(got_topo, inst_pin.cpu_topology)
+
     def test_get_pinning_host_siblings_instance_odd_fit(self):
         host_pin = objects.NUMACell(id=0, cpuset=set([0, 1, 2, 3, 4, 5, 6, 7]),
                                     memory=4096, memory_usage=0,
diff --git a/nova/virt/hardware.py b/nova/virt/hardware.py
index f60d58e7d7d8..c51aebca4ff1 100644
--- a/nova/virt/hardware.py
+++ b/nova/virt/hardware.py
@@ -670,31 +670,27 @@ def _pack_instance_onto_cores(available_siblings, instance_cell, host_cell_id):
     topology, making sure that hyperthreads of the instance match up with
     those of the host when the pinning takes effect.
 
-    Currently the strategy for packing is to prefer siblings and try use
-    cores evenly, by using emptier cores first. This is achieved by the way we
-    order cores in the can_pack structure, and the order in which we iterate
+    Currently the strategy for packing is to prefer siblings and to use cores
+    evenly, by using emptier cores first. This is achieved by the way we order
+    cores in the sibling_sets structure, and the order in which we iterate
     through it.
 
-    The main packing loop that iterates over the can_pack dictionary will not
-    currently try to look for a fit that maximizes number of siblings, but will
-    simply rely on the iteration ordering and picking the first viable
+    The main packing loop that iterates over the sibling_sets dictionary will
+    not currently try to look for a fit that maximizes number of siblings, but
+    will simply rely on the iteration ordering and picking the first viable
     placement.
""" - # We build up a data structure 'can_pack' that answers the question: - # 'Given the number of threads I want to pack, give me a list of all - # the available sibling sets that can accommodate it' - can_pack = collections.defaultdict(list) + # We build up a data structure that answers the question: 'Given the + # number of threads I want to pack, give me a list of all the available + # sibling sets (or groups thereof) that can accommodate it' + sibling_sets = collections.defaultdict(list) for sib in available_siblings: for threads_no in range(1, len(sib) + 1): - can_pack[threads_no].append(sib) + sibling_sets[threads_no].append(sib) - def _can_pack_instance_cell(instance_cell, threads_per_core, cores_list): - """Determines if instance cell can fit an avail set of cores.""" - - if threads_per_core * len(cores_list) < len(instance_cell): - return False - return True + pinning = None + threads_no = 1 def _orphans(instance_cell, threads_per_core): """Number of instance CPUs which will not fill up a host core. @@ -702,7 +698,7 @@ def _pack_instance_onto_cores(available_siblings, instance_cell, host_cell_id): Best explained by an example: consider set of free host cores as such: [(0, 1), (3, 5), (6, 7, 8)] This would be a case of 2 threads_per_core AKA an entry for 2 in the - can_pack structure. + sibling_sets structure. If we attempt to pack a 5 core instance on it - due to the fact that we iterate the list in order, we will end up with a single core of the @@ -733,25 +729,59 @@ def _pack_instance_onto_cores(available_siblings, instance_cell, host_cell_id): return fractions.gcd(threads_per_core, _orphans(instance_cell, threads_per_core)) - # We iterate over the can_pack dict in descending order of cores that - # can be packed - an attempt to get even distribution over time - for cores_per_sib, sib_list in sorted( - (t for t in can_pack.items()), reverse=True): - if _can_pack_instance_cell(instance_cell, - cores_per_sib, sib_list): - sliced_sibs = map(lambda s: list(s)[:cores_per_sib], sib_list) - pinning = zip(sorted(instance_cell.cpuset), - itertools.chain(*sliced_sibs)) + if (instance_cell.cpu_thread_policy == + fields.CPUThreadAllocationPolicy.REQUIRE): + LOG.debug("Requested 'require' thread policy for %d cores", + len(instance_cell)) + elif (instance_cell.cpu_thread_policy == + fields.CPUThreadAllocationPolicy.PREFER): + LOG.debug("Request 'prefer' thread policy for %d cores", + len(instance_cell)) + elif (instance_cell.cpu_thread_policy == + fields.CPUThreadAllocationPolicy.ISOLATE): + LOG.debug("Requested 'isolate' thread policy for %d cores", + len(instance_cell)) + raise NotImplementedError("The 'isolate' policy is not supported.") + else: + LOG.debug("User did not specify a thread policy. Using default " + "for %d cores", len(instance_cell)) - threads = _threads(instance_cell, cores_per_sib) - cores = len(instance_cell) / threads - topology = objects.VirtCPUTopology(sockets=1, - cores=cores, - threads=threads) - instance_cell.pin_vcpus(*pinning) - instance_cell.cpu_topology = topology - instance_cell.id = host_cell_id - return instance_cell + # NOTE(ndipanov): We iterate over the sibling sets in descending order + # of cores that can be packed. 
+    # instances among physical cores
+    for threads_no, sibling_set in sorted(
+            (t for t in sibling_sets.items()), reverse=True):
+        if threads_no * len(sibling_set) < len(instance_cell):
+            continue
+
+        usable_cores = map(lambda s: list(s)[:threads_no], sibling_set)
+
+        threads_no = _threads(instance_cell, threads_no)
+
+        # NOTE(sfinucan): The key difference between the require and
+        # prefer policies is that require will not settle for non-siblings
+        # if this is all that is available. Enforce this by ensuring we're
+        # using sibling sets that contain at least one sibling
+        if (instance_cell.cpu_thread_policy ==
+                fields.CPUThreadAllocationPolicy.REQUIRE):
+            if threads_no <= 1:
+                continue
+
+        pinning = zip(sorted(instance_cell.cpuset),
+                      itertools.chain(*usable_cores))
+
+        break
+
+    if not pinning:
+        return
+
+    topology = objects.VirtCPUTopology(sockets=1,
+                                       cores=len(pinning) / threads_no,
+                                       threads=threads_no)
+    instance_cell.pin_vcpus(*pinning)
+    instance_cell.cpu_topology = topology
+    instance_cell.id = host_cell_id
+    return instance_cell
 
 
 def _numa_fit_instance_cell_with_pinning(host_cell, instance_cell):
@@ -1063,7 +1093,6 @@ def _add_cpu_pinning_constraint(flavor, image_meta, numa_topology):
             for cell in numa_topology.cells:
                 cell.cpu_policy = cpu_policy
                 cell.cpu_thread_policy = cpu_thread_policy
-            return numa_topology
         else:
             single_cell = objects.InstanceNUMACell(
                 id=0,
@@ -1072,7 +1101,8 @@ def _add_cpu_pinning_constraint(flavor, image_meta, numa_topology):
                 cpu_policy=cpu_policy,
                 cpu_thread_policy=cpu_thread_policy)
             numa_topology = objects.InstanceNUMATopology(cells=[single_cell])
-    return numa_topology
+
+    return numa_topology
 
 
 # TODO(sahid): Move numa related to hardward/numa.py
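
The sketch below is editorial illustration only, not part of the patch and not Nova code: under simplified assumptions it shows the sibling-set packing idea that _pack_instance_onto_cores implements, and how the 'require' policy refuses to fall back to lone (non-sibling) threads while 'prefer' accepts them. The helper name pack_cell and its arguments are invented for this example; the real code additionally balances thread counts with a gcd-based helper and builds a VirtCPUTopology from the result.

import collections
import itertools


def pack_cell(available_siblings, vcpus, thread_policy='prefer'):
    """Return host CPU ids to pin `vcpus` guest CPUs to, or None if no fit."""
    # 'Given N threads per core, which sibling sets can offer that many?'
    sibling_sets = collections.defaultdict(list)
    for siblings in available_siblings:
        for threads_no in range(1, len(siblings) + 1):
            sibling_sets[threads_no].append(siblings)

    # Try the largest thread counts first, mirroring the descending
    # iteration over sibling_sets in the patch.
    for threads_no, sets in sorted(sibling_sets.items(), reverse=True):
        if threads_no * len(sets) < vcpus:
            continue  # not enough free threads at this granularity

        if thread_policy == 'require' and threads_no <= 1:
            # 'require' will not settle for non-sibling threads.
            continue

        usable = [sorted(s)[:threads_no] for s in sets]
        return list(itertools.chain(*usable))[:vcpus]

    return None


if __name__ == '__main__':
    # Two cores with both hyperthreads free, plus one lone thread.
    free = [{0, 4}, {1, 5}, {2}]
    print(pack_cell(free, 4, 'require'))                  # [0, 4, 1, 5]
    print(pack_cell([{0}, {1}, {2}, {3}], 4, 'require'))  # None: no SMT
    print(pack_cell([{0}, {1}, {2}, {3}], 4, 'prefer'))   # [0, 1, 2, 3]

In deployment terms, the behaviour added here is requested with the flavor extra specs hw:cpu_policy=dedicated and hw:cpu_thread_policy=require (or the matching hw_cpu_policy/hw_cpu_thread_policy image properties), which is exactly the combination the new NUMATopologyFilter check looks for before discarding hosts whose cells report no siblings.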