From f14c16af82ecc44cff07be8589fb020bd6625af2 Mon Sep 17 00:00:00 2001
From: Stephen Finucane
Date: Tue, 15 Sep 2020 15:58:40 +0100
Subject: [PATCH] Make overcommit check for pinned instance pagesize aware

When working on a fix for bug #1811870, it was noted that the check to
ensure pinned instances do not overcommit was not pagesize aware. This
means that if an instance without hugepages boots on a host with a
large number of hugepages allocated, it may not get all of the memory
allocated to it.

Put in concrete terms, consider a host with 1 NUMA cell, 2 CPUs, 1G of
4k pages, and a single 1G page. If you boot a first instance with 1
CPU, CPU pinning, 1G of RAM, and no specific page size, the instance
should boot successfully. An attempt to boot a second instance with the
same configuration should fail because there is only the single 1G page
available; however, this is not currently the case. The reason this
happens is that we currently have two tests: a first that checks total
(not free!) host pages and a second that checks free memory but with no
consideration for page size. The first check passes because we have 1G
worth of 4K pages configured, and the second check passes because we
have the single 1G page.

Close this gap.

Change-Id: I74861a67827dda1ab2b8451967f5cf0ae93a4ad3
Signed-off-by: Stephen Finucane
Closes-Bug: #1811886
---
 nova/tests/unit/virt/test_hardware.py | 181 ++++++++++++++++----------
 nova/virt/hardware.py                 |  56 +++++---
 2 files changed, 150 insertions(+), 87 deletions(-)

diff --git a/nova/tests/unit/virt/test_hardware.py b/nova/tests/unit/virt/test_hardware.py
index be4ccb17ddc5..6285b96d8e87 100644
--- a/nova/tests/unit/virt/test_hardware.py
+++ b/nova/tests/unit/virt/test_hardware.py
@@ -2529,73 +2529,146 @@ class NUMATopologyTest(test.NoDBTestCase):
 class VirtNUMATopologyCellUsageTestCase(test.NoDBTestCase):
 
-    def test_fit_instance_cell_success_no_limit(self):
+    def test_fit_instance_cell_no_host_mempages(self):
+        """Validate fitting without host or guest mempages.
+
+        This tests overcommitting without host mempages, which is allowed, and
+        self overcommit, which is not.
+        """
+        # host cell has 1024 MiB memory with no mempages reported
         host_cell = objects.NUMACell(
             id=4,
             cpuset=set([1, 2]),
+            pcpuset=set(),
             memory=1024,
             cpu_usage=0,
-            memory_usage=0,
-            pinned_cpus=set(),
-            mempages=[objects.NUMAPagesTopology(
-                size_kb=4, total=524288, used=0)],
-            siblings=[set([1]), set([2])])
+            memory_usage=512,
+            mempages=[],
+            siblings=[set([1]), set([2])],
+            pinned_cpus=set([]))
+
+        # instance cell requests 1024 MiB memory, no mempages -> PASS
         instance_cell = objects.InstanceNUMACell(
             id=0, cpuset=set([1, 2]), pcpuset=set(), memory=1024)
         fitted_cell = hw._numa_fit_instance_cell(host_cell, instance_cell)
         self.assertIsInstance(fitted_cell, objects.InstanceNUMACell)
         self.assertEqual(host_cell.id, fitted_cell.id)
 
-    def test_fit_instance_cell_success_w_limit(self):
+        # instance cell requests 4096 MiB memory, no mempages -> FAIL
+        instance_cell = objects.InstanceNUMACell(
+            cpuset=set([1, 2, 3]), memory=4096)
+        fitted_cell = hw._numa_fit_instance_cell(host_cell, instance_cell)
+        self.assertIsNone(fitted_cell)
+
+        # instance cell requests 1024 MiB memory, no mempages, but has
+        # dedicated CPUs -> FAIL
+        instance_cell = objects.InstanceNUMACell(
+            cpuset=set([0, 1]), memory=1024,
+            cpu_policy=fields.CPUAllocationPolicy.DEDICATED)
+        fitted_cell = hw._numa_fit_instance_cell(host_cell, instance_cell)
+        self.assertIsNone(fitted_cell)
+
+    def test_fit_instance_cell_no_guest_mempages(self):
+        """Validate fitting without guest mempages.
+
+        This tests overcommitting with host small mempages, which is allowed,
+        and self overcommit, which is not.
+        """
+        # host cell has 1024 MiB memory, all small pages
         host_cell = objects.NUMACell(
             id=4,
             cpuset=set([1, 2]),
+            pcpuset=set(),
+            memory=1024,
+            cpu_usage=0,
+            memory_usage=512,
+            mempages=[
+                # 262144 * 4 KiB (1024 MiB) 4k pages, which are all currently
+                # "used"
+                objects.NUMAPagesTopology(size_kb=4, total=262144, used=262144)
+            ],
+            siblings=[set([1]), set([2])],
+            pinned_cpus=set([]))
+
+        # instance cell requests 1024 MiB memory, no mempages -> PASS
+        instance_cell = objects.InstanceNUMACell(
+            cpuset=set([1, 2]), memory=1024)
+        fitted_cell = hw._numa_fit_instance_cell(host_cell, instance_cell)
+        self.assertIsInstance(fitted_cell, objects.InstanceNUMACell)
+        self.assertEqual(host_cell.id, fitted_cell.id)
+
+        # instance cell requests 4096 MiB memory, no mempages -> FAIL
+        instance_cell = objects.InstanceNUMACell(
+            id=0, cpuset=set([1, 2, 3]), memory=4096)
+        fitted_cell = hw._numa_fit_instance_cell(host_cell, instance_cell)
+        self.assertIsNone(fitted_cell)
+
+        # instance cell requests 1024 MiB memory, no mempages, but has
+        # dedicated CPUs -> FAIL
+        instance_cell = objects.InstanceNUMACell(
+            cpuset=set([0, 1]), memory=1024,
+            cpu_policy=fields.CPUAllocationPolicy.DEDICATED)
+        fitted_cell = hw._numa_fit_instance_cell(host_cell, instance_cell)
+        self.assertIsNone(fitted_cell)
+
+    def test_fit_instance_cell_mempages(self):
+        """Validate fitting with guest (and host) mempages.
+
+        This also tests overcommitting with explicitly requested guest
+        mempages, which is not allowed.
+        """
+        # host cell has 1024 MiB memory, all small pages
+        host_cell = objects.NUMACell(
+            id=4,
+            cpuset=set([0, 1]),
+            pcpuset=set(),
+            memory=1024,
+            cpu_usage=0,
             memory_usage=0,
             mempages=[
+                # 262144 * 4 KiB (1024 MiB) 4k pages, none used
+                objects.NUMAPagesTopology(size_kb=4, total=262144, used=0)
+            ],
+            siblings=[set([0]), set([1])],
+            pinned_cpus=set([]))
+
+        # instance cell requests 1024 MiB memory, all small pages -> PASS
+        instance_cell = objects.InstanceNUMACell(
+            cpuset=set([0, 1]), memory=1024, pagesize=4)
+        fitted_cell = hw._numa_fit_instance_cell(host_cell, instance_cell)
+        self.assertEqual(host_cell.id, fitted_cell.id)
+
+        # host cell now only has 1023 MiB memory, all small pages
+        host_cell.memory_usage = 1
+        host_cell.mempages[0].used = 1
+
+        # instance cell requests 1024 MiB memory again, all small pages -> FAIL
+        instance_cell = objects.InstanceNUMACell(
+            cpuset=set([0, 1]), memory=1024, pagesize=4)
+        fitted_cell = hw._numa_fit_instance_cell(host_cell, instance_cell)
+        self.assertIsNone(fitted_cell)
+
+    def test_fit_instance_cell_with_limit(self):
+        host_cell = objects.NUMACell(
+            id=4,
+            cpuset=set([1, 2]),
+            pcpuset=set(),
             memory=1024,
             cpu_usage=2,
             memory_usage=1024,
-            pinned_cpus=set(),
-            mempages=[objects.NUMAPagesTopology(
-                size_kb=4, total=524288, used=0)],
-            siblings=[set([1]), set([2])])
+            mempages=[
+                objects.NUMAPagesTopology(size_kb=4, total=524288, used=0)
+            ],
+            siblings=[set([1]), set([2])],
+            pinned_cpus=set([]))
         limits = objects.NUMATopologyLimits(
             cpu_allocation_ratio=2, ram_allocation_ratio=2)
+
         instance_cell = objects.InstanceNUMACell(
-            id=0, cpuset=set([1, 2]), pcpuset=set(), memory=1024)
+            cpuset=set([1, 2]), memory=1024)
         fitted_cell = hw._numa_fit_instance_cell(
             host_cell, instance_cell, limits=limits)
         self.assertIsInstance(fitted_cell, objects.InstanceNUMACell)
         self.assertEqual(host_cell.id, fitted_cell.id)
 
-    def test_fit_instance_cell_self_overcommit(self):
-        host_cell = objects.NUMACell(
-            id=4,
-            cpuset=set([1, 2]),
-            memory=1024,
-            cpu_usage=0,
-            memory_usage=0,
-            mempages=[objects.NUMAPagesTopology(
-                size_kb=4, total=524288, used=0)],
-            siblings=[set([1]), set([2])],
-            pinned_cpus=set())
-        limits = objects.NUMATopologyLimits(
-            cpu_allocation_ratio=2, ram_allocation_ratio=2)
-        instance_cell = objects.InstanceNUMACell(
-            id=0, cpuset=set([1, 2, 3]), memory=4096)
-        fitted_cell = hw._numa_fit_instance_cell(
-            host_cell, instance_cell, limits=limits)
-        self.assertIsNone(fitted_cell)
-
-    def test_fit_instance_cell_fail_w_limit(self):
-        host_cell = objects.NUMACell(
-            id=4,
-            cpuset=set([1, 2]),
-            memory=1024,
-            cpu_usage=2,
-            memory_usage=1024,
-            mempages=[objects.NUMAPagesTopology(
-                size_kb=4, total=524288, used=0)],
-            siblings=[set([1]), set([2])],
-            pinned_cpus=set())
         instance_cell = objects.InstanceNUMACell(
             id=0, cpuset=set([1, 2]), pcpuset=set(), memory=4096)
         limits = objects.NUMATopologyLimits(
@@ -3169,30 +3242,6 @@ class CPUPinningCellTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
         self.assertIsNone(inst_pin)
 
-    def test_get_pinning_inst_too_large_mem(self):
-        host_pin = objects.NUMACell(
-            id=0,
-            cpuset=set(),
-            pcpuset=set([0, 1, 2]),
-            memory=2048,
-            memory_usage=1024,
-            pinned_cpus=set(),
-            mempages=[],
-            siblings=[set([0]), set([1]), set([2])])
-        inst_pin = objects.InstanceNUMACell(
-            cpuset=set(),
-            pcpuset=set([0, 1, 2]),
-            memory=2048,
-            cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
-        )
-        limits = objects.NUMATopologyLimits(
-            cpu_allocation_ratio=2, ram_allocation_ratio=2,
-        )
-
-        inst_pin = hw._numa_fit_instance_cell(host_pin, inst_pin, limits)
-
-        self.assertIsNone(inst_pin)
-
     def test_get_pinning_inst_not_avail(self):
         host_pin = objects.NUMACell(
             id=0,
diff --git a/nova/virt/hardware.py b/nova/virt/hardware.py
index e869af876730..2648240e41a1 100644
--- a/nova/virt/hardware.py
+++ b/nova/virt/hardware.py
@@ -961,10 +961,15 @@ def _numa_fit_instance_cell(
         LOG.debug('No specific pagesize requested for instance, '
                   'selected pagesize: %d', pagesize)
         # we want to allow overcommit in this case as we're not using
-        # hugepages
-        if not host_cell.can_fit_pagesize(pagesize,
-                                          instance_cell.memory * units.Ki,
-                                          use_free=False):
+        # hugepages *except* if using CPU pinning, which for legacy reasons
+        # does not allow overcommit
+        use_free = instance_cell.cpu_policy in (
+            fields.CPUAllocationPolicy.DEDICATED,
+            fields.CPUAllocationPolicy.MIXED,
+        )
+        if not host_cell.can_fit_pagesize(
+            pagesize, instance_cell.memory * units.Ki, use_free=use_free
+        ):
             LOG.debug('Not enough available memory to schedule instance '
                       'with pagesize %(pagesize)d. Required: '
                       '%(required)s, available: %(available)s, total: '
@@ -977,16 +982,35 @@ def _numa_fit_instance_cell(
     else:
         # The host does not support explicit page sizes. Ignore pagesizes
        # completely.
+
         # NOTE(stephenfin): Do not allow an instance to overcommit against
         # itself on any NUMA cell, i.e. with 'ram_allocation_ratio = 2.0'
         # on a host with 1GB RAM, we should allow two 1GB instances but not
-        # one 2GB instance.
-        if instance_cell.memory > host_cell.memory:
-            LOG.debug('Not enough host cell memory to fit instance cell. '
-                      'Required: %(required)d, actual: %(actual)d',
-                      {'required': instance_cell.memory,
-                       'actual': host_cell.memory})
-            return None
+        # one 2GB instance. If CPU pinning is in use, don't allow
+        # overcommit at all.
+        if instance_cell.cpu_policy in (
+            fields.CPUAllocationPolicy.DEDICATED,
+            fields.CPUAllocationPolicy.MIXED,
+        ):
+            if host_cell.avail_memory < instance_cell.memory:
+                LOG.debug(
+                    'Not enough host cell memory to fit instance cell. '
+                    'Oversubscription is not possible with pinned '
+                    'instances. '
+                    'Required: %(required)d, available: %(available)d, '
+                    'total: %(total)d. ',
+                    {'required': instance_cell.memory,
+                     'available': host_cell.avail_memory,
+                     'total': host_cell.memory})
+                return None
+        else:
+            if host_cell.memory < instance_cell.memory:
+                LOG.debug(
+                    'Not enough host cell memory to fit instance cell. '
+                    'Required: %(required)d, actual: %(total)d',
+                    {'required': instance_cell.memory,
+                     'total': host_cell.memory})
+                return None
 
     # NOTE(stephenfin): As with memory, do not allow an instance to overcommit
     # against itself on any NUMA cell
@@ -1036,16 +1060,6 @@ def _numa_fit_instance_cell(
                        'num_cpu_reserved': cpuset_reserved})
             return None
 
-        if instance_cell.memory > host_cell.avail_memory:
-            LOG.debug('Not enough available memory to schedule instance. '
-                      'Oversubscription is not possible with pinned '
-                      'instances. Required: %(required)s, available: '
-                      '%(available)s, total: %(total)s. ',
-                      {'required': instance_cell.memory,
-                       'available': host_cell.avail_memory,
-                       'total': host_cell.memory})
-            return None
-
         # Try to pack the instance cell onto cores
         instance_cell = _pack_instance_onto_cores(
             host_cell, instance_cell, num_cpu_reserved=cpuset_reserved,
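
The sketch below is not part of the patch; it is a minimal, self-contained Python illustration of the fit decision the change describes, for readers who want to see the intended behaviour outside the Nova object model. The PageTopology, HostCell, and fits names are illustrative stand-ins (assumptions), not the real objects.NUMAPagesTopology/objects.NUMACell API or _numa_fit_instance_cell itself, and CPU fitting, limits, and NUMA cell selection are all omitted. The one point it mirrors is the new use_free switch: pinned guests are checked against free pages of the selected size, while floating guests with no explicit page size may still overcommit against the page total.

# Illustrative sketch only: simplified stand-ins, not the real Nova objects.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class PageTopology:
    """Simplified stand-in for objects.NUMAPagesTopology."""
    size_kb: int
    total: int
    used: int

    @property
    def free_kb(self) -> int:
        return (self.total - self.used) * self.size_kb

    @property
    def total_kb(self) -> int:
        return self.total * self.size_kb


@dataclass
class HostCell:
    """Simplified stand-in for objects.NUMACell."""
    memory_mb: int                 # total cell memory, in MiB
    memory_usage_mb: int           # memory already claimed, in MiB
    mempages: List[PageTopology]


def fits(host: HostCell, request_mb: int, pagesize_kb: Optional[int] = None,
         pinned: bool = False) -> bool:
    """Decide whether an instance cell of request_mb MiB fits on host.

    Mirrors the behaviour the patch describes: pinned guests must fit in
    *free* pages of the selected size, while floating guests with no
    explicit page size may still overcommit against the page total.
    """
    request_kb = request_mb * 1024

    if host.mempages:
        if pagesize_kb is None:
            # No explicit page size requested: pick the smallest host page
            # and only insist on free pages when the guest is pinned (the
            # new use_free switch).
            pages = min(host.mempages, key=lambda p: p.size_kb)
            use_free = pinned
        else:
            # An explicitly requested page size was never allowed to
            # overcommit.
            pages = next(p for p in host.mempages if p.size_kb == pagesize_kb)
            use_free = True
        return request_kb <= (pages.free_kb if use_free else pages.total_kb)

    # Host reports no mempages at all: plain memory accounting.
    if pinned:
        return request_mb <= host.memory_mb - host.memory_usage_mb
    return request_mb <= host.memory_mb


# The scenario from the commit message, after the first guest has landed: a
# cell with 1 GiB of 4K pages (all consumed) and a single, used 1 GiB page.
cell = HostCell(
    memory_mb=2048,
    memory_usage_mb=1024,
    mempages=[
        PageTopology(size_kb=4, total=262144, used=262144),  # 1 GiB of 4K
        PageTopology(size_kb=1048576, total=1, used=1),      # one 1 GiB page
    ],
)
assert fits(cell, 1024, pinned=False)      # floating guest may overcommit
assert not fits(cell, 1024, pinned=True)   # pinned guest may not

Running the snippet as-is exercises the commit-message scenario: the same 1 GiB request is accepted for a floating guest but rejected once the guest is pinned, which is the behaviour the patch's second test case asserts.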