From 7ddab327675d36a4ba59d02d22d042d418236336 Mon Sep 17 00:00:00 2001
From: Stephen Finucane
Date: Thu, 30 Jul 2020 17:36:24 +0100
Subject: [PATCH] hardware: Reject requests for no hyperthreads on hosts with HT

Attempting to boot an instance with 'hw:cpu_policy=dedicated' will
result in a request from nova-scheduler to placement for allocation
candidates with $flavor.vcpu 'PCPU' inventory. Similarly, booting an
instance with 'hw:cpu_thread_policy=isolate' will result in a request
for allocation candidates with 'HW_CPU_HYPERTHREADING=forbidden', i.e.
hosts without hyperthreading. This has been the case since the
cpu-resources feature was implemented in Train.

However, as part of that work and to enable upgrades from hosts that
predated Train, we also make a second request for candidates with
$flavor.vcpu 'VCPU' inventory. The idea behind this is that old compute
nodes would only report 'VCPU' and should be usable, and any new
compute nodes that got caught up in this second request could never
actually be scheduled to, since there wouldn't be enough cores from
'ComputeNode.numa_topology.cells.[*].pcpuset' available to schedule to,
resulting in rejection by the 'NUMATopologyFilter'. However, if a host
was rejected in the first query because it reported the
'HW_CPU_HYPERTHREADING' trait, it could get picked up by the second
query and would happily be scheduled to, resulting in an instance
consuming 'VCPU' inventory from a host that properly supported 'PCPU'
inventory.

The solution is simple, though also a huge hack. If we detect that the
host is using new-style configuration and should be able to report
'PCPU', check whether the instance asked for no hyperthreading and
whether the host has it. If all are true, reject the request.

Change-Id: Id39aaaac09585ca1a754b669351c86e234b89dd9
Signed-off-by: Stephen Finucane
Closes-Bug: #1889633
(cherry picked from commit 9c270332041d6b98951c0b57d7b344fd551a413c)
---
 .../functional/libvirt/test_numa_servers.py   |  7 +-
 nova/tests/unit/virt/test_hardware.py         | 75 ++++++++++++++++---
 nova/virt/hardware.py                         | 24 ++++++
 .../notes/bug-1889633-37e524fb6c20fbdf.yaml   |  9 +++
 4 files changed, 99 insertions(+), 16 deletions(-)
 create mode 100644 releasenotes/notes/bug-1889633-37e524fb6c20fbdf.yaml

diff --git a/nova/tests/functional/libvirt/test_numa_servers.py b/nova/tests/functional/libvirt/test_numa_servers.py
index a5cee9946d54..184fbb822b28 100644
--- a/nova/tests/functional/libvirt/test_numa_servers.py
+++ b/nova/tests/functional/libvirt/test_numa_servers.py
@@ -383,12 +383,7 @@ class NUMAServersTest(NUMAServersTestBase):
         }
         flavor_id = self._create_flavor(vcpu=2, extra_spec=extra_spec)
 
-        # FIXME(stephenfin): This should go to error status since there should
-        # not be a host available
-        expected_usage = {
-            'DISK_GB': 20, 'MEMORY_MB': 2048, 'PCPU': 0, 'VCPU': 2,
-        }
-        self._run_build_test(flavor_id, expected_usage=expected_usage)
+        self._run_build_test(flavor_id, end_status='ERROR')
 
     def test_create_server_with_pcpu(self):
         """Create a server using an explicit 'resources:PCPU' request.
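
For illustration, here is a minimal sketch of the two allocation candidate
queries described in the commit message. It uses plain dicts rather than
nova's request-spec machinery, and build_queries() is a hypothetical helper,
not a nova function; only the resource classes and trait named above are
taken from the described behaviour.

# A 2-vCPU flavor with hw:cpu_policy=dedicated and
# hw:cpu_thread_policy=isolate results in a primary query for PCPU
# inventory on hosts without the HW_CPU_HYPERTHREADING trait, plus a
# fallback query for VCPU inventory to accommodate pre-Train computes.
def build_queries(vcpus):
    primary = {
        'resources': {'PCPU': vcpus},
        'forbidden_traits': {'HW_CPU_HYPERTHREADING'},
    }
    # The fallback drops both the PCPU resource class and the trait
    # requirement; hosts rejected by the primary query for having
    # hyperthreads can therefore reappear here, which is bug #1889633.
    fallback = {'resources': {'VCPU': vcpus}}
    return [primary, fallback]


if __name__ == '__main__':
    for query in build_queries(2):
        print(query)
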
diff --git a/nova/tests/unit/virt/test_hardware.py b/nova/tests/unit/virt/test_hardware.py
index 7bbaa0fee29b..4349f47c1284 100644
--- a/nova/tests/unit/virt/test_hardware.py
+++ b/nova/tests/unit/virt/test_hardware.py
@@ -3022,10 +3022,13 @@ class CPUPinningCellTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
         got_topo = objects.VirtCPUTopology(sockets=1, cores=5, threads=1)
         self.assertEqualTopology(got_topo, inst_pin.cpu_topology)
 
+    # TODO(stephenfin): Remove when we drop support for vcpu_pin_set
     def test_get_pinning_isolate_policy_too_few_fully_free_cores(self):
         host_pin = objects.NUMACell(
             id=0,
-            cpuset=set(),
+            # Simulate a legacy host with vcpu_pin_set configuration,
+            # meaning cpuset == pcpuset
+            cpuset=set([0, 1, 2, 3]),
             pcpuset=set([0, 1, 2, 3]),
             memory=4096,
             memory_usage=0,
@@ -3040,10 +3043,13 @@ class CPUPinningCellTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
         inst_pin = hw._numa_fit_instance_cell_with_pinning(host_pin, inst_pin)
         self.assertIsNone(inst_pin)
 
+    # TODO(stephenfin): Remove when we drop support for vcpu_pin_set
     def test_get_pinning_isolate_policy_no_fully_free_cores(self):
         host_pin = objects.NUMACell(
             id=0,
-            cpuset=set(),
+            # Simulate a legacy host with vcpu_pin_set configuration,
+            # meaning cpuset == pcpuset
+            cpuset=set([0, 1, 2, 3]),
             pcpuset=set([0, 1, 2, 3]),
             memory=4096,
             memory_usage=0,
@@ -3058,10 +3064,13 @@ class CPUPinningCellTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
         inst_pin = hw._numa_fit_instance_cell_with_pinning(host_pin, inst_pin)
         self.assertIsNone(inst_pin)
 
+    # TODO(stephenfin): Remove when we drop support for vcpu_pin_set
     def test_get_pinning_isolate_policy_fits(self):
         host_pin = objects.NUMACell(
             id=0,
-            cpuset=set(),
+            # Simulate a legacy host with vcpu_pin_set configuration,
+            # meaning cpuset == pcpuset
+            cpuset=set([0, 1, 2, 3]),
             pcpuset=set([0, 1, 2, 3]),
             memory=4096,
             memory_usage=0,
@@ -3078,10 +3087,13 @@ class CPUPinningCellTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
         got_topo = objects.VirtCPUTopology(sockets=1, cores=2, threads=1)
         self.assertEqualTopology(got_topo, inst_pin.cpu_topology)
 
+    # TODO(stephenfin): Remove when we drop support for vcpu_pin_set
     def test_get_pinning_isolate_policy_fits_ht_host(self):
         host_pin = objects.NUMACell(
             id=0,
-            cpuset=set(),
+            # Simulate a legacy host with vcpu_pin_set configuration,
+            # meaning cpuset == pcpuset
+            cpuset=set([0, 1, 2, 3]),
             pcpuset=set([0, 1, 2, 3]),
             memory=4096,
             memory_usage=0,
@@ -3098,10 +3110,13 @@ class CPUPinningCellTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
         got_topo = objects.VirtCPUTopology(sockets=1, cores=2, threads=1)
         self.assertEqualTopology(got_topo, inst_pin.cpu_topology)
 
+    # TODO(stephenfin): Remove when we drop support for vcpu_pin_set
    def test_get_pinning_isolate_policy_fits_w_usage(self):
         host_pin = objects.NUMACell(
             id=0,
-            cpuset=set(),
+            # Simulate a legacy host with vcpu_pin_set configuration,
+            # meaning cpuset == pcpuset
+            cpuset=set([0, 1, 2, 3, 4, 5, 6, 7]),
             pcpuset=set([0, 1, 2, 3, 4, 5, 6, 7]),
             memory=4096,
             memory_usage=0,
@@ -3118,6 +3133,38 @@ class CPUPinningCellTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
         got_topo = objects.VirtCPUTopology(sockets=1, cores=2, threads=1)
         self.assertEqualTopology(got_topo, inst_pin.cpu_topology)
 
+    # TODO(stephenfin): Remove when we drop support for vcpu_pin_set
+    @mock.patch.object(hw, 'LOG')
+    def test_get_pinning_isolate_policy_bug_1889633(self, mock_log):
+        host_pin = objects.NUMACell(
+            id=0,
+            cpuset={0, 1, 4, 5},
+            pcpuset={2, 3, 6, 7},
+            memory=4096,
+            memory_usage=0,
+            pinned_cpus=set(),
+            siblings=[{0, 4}, {1, 5}, {2, 6}, {3, 7}],
+            mempages=[],
+        )
+        inst_pin = objects.InstanceNUMACell(
+            cpuset=set(),
+            pcpuset={0, 1},
+            memory=2048,
+            cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+            cpu_thread_policy=fields.CPUThreadAllocationPolicy.ISOLATE,
+        )
+        limits = objects.NUMATopologyLimits(
+            cpu_allocation_ratio=2, ram_allocation_ratio=2,
+        )
+
+        # host has hyperthreads, which means no NUMA topology should be built
+        inst_topo = hw._numa_fit_instance_cell(host_pin, inst_pin, limits)
+        self.assertIsNone(inst_topo)
+        self.assertIn(
+            'Host supports hyperthreads, but instance requested no',
+            mock_log.warning.call_args[0][0],
+        )
+
 
 class CPUPinningTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
     def test_host_numa_fit_instance_to_host_single_cell(self):
@@ -3825,11 +3872,14 @@ class EmulatorThreadsTestCase(test.NoDBTestCase):
 
         self.assertEqual(set([0, 1]), host_topo.cells[0].pinned_cpus)
 
+    # TODO(stephenfin): Remove when we drop support for vcpu_pin_set
     def test_isolate_w_isolate_thread_alloc(self):
         host_topo = objects.NUMATopology(cells=[
             objects.NUMACell(
                 id=0,
-                cpuset=set(),
+                # Simulate a legacy host with vcpu_pin_set configuration,
+                # meaning cpuset == pcpuset
+                cpuset=set([0, 1, 2, 3, 4, 5]),
                 pcpuset=set([0, 1, 2, 3, 4, 5]),
                 memory=2048,
                 cpu_usage=0,
@@ -3852,11 +3902,14 @@ class EmulatorThreadsTestCase(test.NoDBTestCase):
         self.assertEqual({0: 0, 1: 2}, inst_topo.cells[0].cpu_pinning)
         self.assertEqual(set([4]), inst_topo.cells[0].cpuset_reserved)
 
+    # TODO(stephenfin): Remove when we drop support for vcpu_pin_set
     def test_isolate_w_isolate_thread_alloc_usage(self):
         host_topo = objects.NUMATopology(cells=[
             objects.NUMACell(
                 id=0,
-                cpuset=set(),
+                # Simulate a legacy host with vcpu_pin_set configuration,
+                # meaning cpuset == pcpuset
+                cpuset=set([0, 1, 2, 3, 4, 5]),
                 pcpuset=set([0, 1, 2, 3, 4, 5]),
                 memory=2048,
                 cpu_usage=0,
@@ -3912,11 +3965,14 @@ class EmulatorThreadsTestCase(test.NoDBTestCase):
         self.assertEqual({0: 2, 1: 3}, inst_topo.cells[0].cpu_pinning)
         self.assertEqual(set([1]), inst_topo.cells[0].cpuset_reserved)
 
+    # TODO(stephenfin): Remove when we drop support for vcpu_pin_set
     def test_asymmetric_host_w_isolate_thread_alloc(self):
         host_topo = objects.NUMATopology(cells=[
             objects.NUMACell(
                 id=0,
-                cpuset=set(),
+                # Simulate a legacy host with vcpu_pin_set configuration,
+                # meaning cpuset == pcpuset
+                cpuset=set([1, 2, 3, 4, 5]),
                 pcpuset=set([1, 2, 3, 4, 5]),
                 memory=2048,
                 cpu_usage=0,
@@ -3926,8 +3982,7 @@ class EmulatorThreadsTestCase(test.NoDBTestCase):
             mempages=[objects.NUMAPagesTopology(
                 size_kb=4, total=524288, used=0)])])
         inst_topo = objects.InstanceNUMATopology(
-            emulator_threads_policy=(
-                fields.CPUEmulatorThreadsPolicy.ISOLATE),
+            emulator_threads_policy=fields.CPUEmulatorThreadsPolicy.ISOLATE,
             cells=[objects.InstanceNUMACell(
                 id=0,
                 cpuset=set([0, 1]), memory=2048,
diff --git a/nova/virt/hardware.py b/nova/virt/hardware.py
index 0253dd992786..ccd5eb958bda 100644
--- a/nova/virt/hardware.py
+++ b/nova/virt/hardware.py
@@ -903,6 +903,30 @@ def _pack_instance_onto_cores(host_cell, instance_cell,
                       'for the isolate policy without this.')
             return
 
+        # TODO(stephenfin): Drop this when we drop support for 'vcpu_pin_set'
+        # NOTE(stephenfin): This is a total hack. We're relying on the fact
+        # that the libvirt driver, which is the only one that currently
+        # supports pinned CPUs, will set cpuset and pcpuset to the same value
+        # if using legacy configuration, i.e. 'vcpu_pin_set', as part of
+        # '_get_host_numa_topology'. They can't be equal otherwise since
+        # 'cpu_dedicated_set' and 'cpu_shared_set' must be disjoint.
+        # Therefore, if these are equal, the host that this NUMA cell
+        # corresponds to is using legacy configuration and it's okay to use
+        # the old, "pin a core and reserve its siblings" implementation of
+        # the 'isolate' policy. If they're not, the host is using new-style
+        # configuration and we've just hit bug #1889633.
+        if threads_per_core != 1 and host_cell.pcpuset != host_cell.cpuset:
+            LOG.warning(
+                "Host supports hyperthreads, but instance requested no "
+                "hyperthreads. This should have been rejected by the "
+                "scheduler but we likely got here due to the fallback VCPU "
+                "query. Consider setting '[workarounds] "
+                "disable_fallback_pcpu_query' to 'True' once hosts are no "
+                "longer using 'vcpu_pin_set'. Refer to bug #1889633 for more "
+                "information."
+            )
+            return
+
         pinning = _get_pinning(
             1,  # we only want to "use" one thread per core
             sibling_sets[threads_per_core],
diff --git a/releasenotes/notes/bug-1889633-37e524fb6c20fbdf.yaml b/releasenotes/notes/bug-1889633-37e524fb6c20fbdf.yaml
new file mode 100644
index 000000000000..77c86013e072
--- /dev/null
+++ b/releasenotes/notes/bug-1889633-37e524fb6c20fbdf.yaml
@@ -0,0 +1,9 @@
+---
+fixes:
+  - |
+    An issue that could result in instances with the ``isolate`` thread policy
+    (``hw:cpu_thread_policy=isolate``) being scheduled to hosts with SMT
+    (HyperThreading) and consuming ``VCPU`` instead of ``PCPU`` has been
+    resolved. See `bug #1889633`__ for more information.
+
+    .. __: https://bugs.launchpad.net/nova/+bug/1889633
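
As a standalone illustration of the heuristic added to
_pack_instance_onto_cores above, the following sketch reproduces the check
outside of nova, using the host shapes from the unit tests;
should_reject_isolate() is an illustrative helper rather than nova code.

# Legacy ('vcpu_pin_set') hosts report cpuset == pcpuset, while new-style
# hosts split CPUs between 'cpu_shared_set' and 'cpu_dedicated_set', so the
# two sets are disjoint. The isolate policy is only rejected on a host that
# both has hyperthreads and uses new-style configuration.
def should_reject_isolate(cpuset, pcpuset, threads_per_core):
    host_has_hyperthreads = threads_per_core != 1
    new_style_config = pcpuset != cpuset
    return host_has_hyperthreads and new_style_config


# Legacy host from test_get_pinning_isolate_policy_fits_ht_host: the old
# "pin a core and reserve its siblings" behaviour still applies.
assert not should_reject_isolate(
    cpuset={0, 1, 2, 3}, pcpuset={0, 1, 2, 3}, threads_per_core=2)

# New-style host from test_get_pinning_isolate_policy_bug_1889633: the
# request is rejected instead of silently consuming VCPU inventory.
assert should_reject_isolate(
    cpuset={0, 1, 4, 5}, pcpuset={2, 3, 6, 7}, threads_per_core=2)

Once every compute host reports 'PCPU' inventory (i.e. no longer uses
'vcpu_pin_set'), operators can avoid the fallback query entirely by setting
'[workarounds] disable_fallback_pcpu_query' to 'True', as the warning message
added above suggests.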