diff --git a/nova/tests/unit/virt/test_hardware.py b/nova/tests/unit/virt/test_hardware.py
index 1e98dfba6a5e..be6f9d20c726 100644
--- a/nova/tests/unit/virt/test_hardware.py
+++ b/nova/tests/unit/virt/test_hardware.py
@@ -3836,9 +3836,16 @@ class CPUPinningTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
                 siblings=[set([2]), set([3])])
         ])
         inst_topo = objects.InstanceNUMATopology(
-                cells=[objects.InstanceNUMACell(
-                    cpuset=set(), pcpuset=set([0, 1]), memory=2048,
-                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED)])
+            cells=[
+                objects.InstanceNUMACell(
+                    id=0,
+                    cpuset=set(),
+                    pcpuset=set([0, 1]),
+                    memory=2048,
+                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+                )
+            ]
+        )
 
         inst_topo = hw.numa_fit_instance_to_host(host_topo, inst_topo)
 
@@ -3867,9 +3874,16 @@ class CPUPinningTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
                 siblings=[set([2]), set([3])])
         ])
         inst_topo = objects.InstanceNUMATopology(
-                cells=[objects.InstanceNUMACell(
-                    cpuset=set(), pcpuset=set([0, 1]), memory=2048,
-                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED)])
+            cells=[
+                objects.InstanceNUMACell(
+                    id=0,
+                    cpuset=set(),
+                    pcpuset=set([0, 1]),
+                    memory=2048,
+                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+                )
+            ]
+        )
 
         inst_topo = hw.numa_fit_instance_to_host(host_topo, inst_topo)
 
@@ -3898,9 +3912,16 @@ class CPUPinningTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
                 siblings=[set([2]), set([3])])
         ])
         inst_topo = objects.InstanceNUMATopology(
-                cells=[objects.InstanceNUMACell(
-                    cpuset=set(), pcpuset=set([0, 1]), memory=2048,
-                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED)])
+            cells=[
+                objects.InstanceNUMACell(
+                    id=0,
+                    cpuset=set(),
+                    pcpuset=set([0, 1]),
+                    memory=2048,
+                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+                )
+            ]
+        )
 
         inst_topo = hw.numa_fit_instance_to_host(host_topo, inst_topo)
         self.assertIsNone(inst_topo)
@@ -3927,12 +3948,24 @@ class CPUPinningTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
                 siblings=[set([4]), set([5]), set([6]), set([7])])
         ])
         inst_topo = objects.InstanceNUMATopology(
-                cells=[objects.InstanceNUMACell(
-                            cpuset=set(), pcpuset=set([0, 1]), memory=2048,
-                            cpu_policy=fields.CPUAllocationPolicy.DEDICATED),
-                       objects.InstanceNUMACell(
-                            cpuset=set(), pcpuset=set([2, 3]), memory=2048,
-                            cpu_policy=fields.CPUAllocationPolicy.DEDICATED)])
+            cells=[
+                objects.InstanceNUMACell(
+                    id=0,
+                    cpuset=set(),
+                    pcpuset=set([0, 1]),
+                    memory=2048,
+                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+                ),
+                objects.InstanceNUMACell(
+                    id=1,
+                    cpuset=set(),
+                    pcpuset=set([2, 3]),
+                    memory=2048,
+                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+                ),
+            ]
+        )
+
         inst_topo = hw.numa_fit_instance_to_host(host_topo, inst_topo)
 
         for cell in inst_topo.cells:
@@ -3970,12 +4003,24 @@ class CPUPinningTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
         ])
 
         inst_topo = objects.InstanceNUMATopology(
-                cells=[objects.InstanceNUMACell(
-                            cpuset=set(), pcpuset=set([0, 1]), memory=2048,
-                            cpu_policy=fields.CPUAllocationPolicy.DEDICATED),
-                       objects.InstanceNUMACell(
-                            cpuset=set(), pcpuset=set([2, 3]), memory=2048,
-                            cpu_policy=fields.CPUAllocationPolicy.DEDICATED)])
+            cells=[
+                objects.InstanceNUMACell(
+                    id=0,
+                    cpuset=set(),
+                    pcpuset=set([0, 1]),
+                    memory=2048,
+                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+                ),
+                objects.InstanceNUMACell(
+                    id=1,
+                    cpuset=set(),
+                    pcpuset=set([2, 3]),
+                    memory=2048,
+                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+                ),
+            ]
+        )
+
         inst_topo = hw.numa_fit_instance_to_host(host_topo, inst_topo)
 
         for cell in inst_topo.cells:
@@ -4003,12 +4048,24 @@ class CPUPinningTestCase(test.NoDBTestCase, _CPUPinningTestCaseBase):
                 siblings=[set([4]), set([5]), set([6]), set([7])])
         ])
         inst_topo = objects.InstanceNUMATopology(
-                cells=[objects.InstanceNUMACell(
-                            cpuset=set(), pcpuset=set([0, 1]), memory=2048,
-                            cpu_policy=fields.CPUAllocationPolicy.DEDICATED),
-                       objects.InstanceNUMACell(
-                            cpuset=set(), pcpuset=set([2, 3]), memory=2048,
-                            cpu_policy=fields.CPUAllocationPolicy.DEDICATED)])
+            cells=[
+                objects.InstanceNUMACell(
+                    id=0,
+                    cpuset=set(),
+                    pcpuset=set([0, 1]),
+                    memory=2048,
+                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+                ),
+                objects.InstanceNUMACell(
+                    id=1,
+                    cpuset=set(),
+                    pcpuset=set([2, 3]),
+                    memory=2048,
+                    cpu_policy=fields.CPUAllocationPolicy.DEDICATED,
+                ),
+            ]
+        )
+
         inst_topo = hw.numa_fit_instance_to_host(host_topo, inst_topo)
         self.assertIsNone(inst_topo)
 
diff --git a/nova/virt/hardware.py b/nova/virt/hardware.py
index c4ebae11ca0f..f6f96a1af202 100644
--- a/nova/virt/hardware.py
+++ b/nova/virt/hardware.py
@@ -2357,12 +2357,37 @@ def numa_fit_instance_to_host(
                     host_cells,
                     key=lambda cell: total_pci_in_cell.get(cell.id, 0))
 
+    # a set of host_cell.id, instance_cell.id pairs where we already checked
+    # that the instance cell does not fit
+    not_fit_cache = set()
+    # a set of host_cell.id, instance_cell.id pairs where we already checked
+    # that the instance cell does fit
+    fit_cache = set()
     for host_cell_perm in itertools.permutations(
             host_cells, len(instance_topology)):
         chosen_instance_cells: ty.List['objects.InstanceNUMACell'] = []
         chosen_host_cells: ty.List['objects.NUMACell'] = []
         for host_cell, instance_cell in zip(
                 host_cell_perm, instance_topology.cells):
+
+            cell_pair = (host_cell.id, instance_cell.id)
+
+            # if we already checked this pair and they did not fit, there is
+            # no need to check again; just move to the next permutation
+            if cell_pair in not_fit_cache:
+                break
+
+            # if we already checked this pair and they fit before, then they
+            # will fit now too. So no need to check again. Just continue with
+            # the next cell pair in the permutation
+            if cell_pair in fit_cache:
+                chosen_host_cells.append(host_cell)
+                # _numa_fit_instance_cell would normally have done this,
+                # but we optimized that out here based on the cache
+                instance_cell.id = host_cell.id
+                chosen_instance_cells.append(instance_cell)
+                continue
+
             try:
                 cpuset_reserved = 0
                 if (instance_topology.emulator_threads_isolated and
@@ -2379,11 +2404,18 @@ def numa_fit_instance_to_host(
                 # This exception will been raised if instance cell's
                 # custom pagesize is not supported with host cell in
                 # _numa_cell_supports_pagesize_request function.
+
+                # cache the result
+                not_fit_cache.add(cell_pair)
                 break
             if got_cell is None:
+                # cache the result
+                not_fit_cache.add(cell_pair)
                 break
             chosen_host_cells.append(host_cell)
             chosen_instance_cells.append(got_cell)
+            # cache the result
+            fit_cache.add(cell_pair)
 
         if len(chosen_instance_cells) != len(host_cell_perm):
             continue
diff --git a/releasenotes/notes/bug-1978372-optimized-numa-fitting-algorithm-5d5b922b0bdbf818.yaml b/releasenotes/notes/bug-1978372-optimized-numa-fitting-algorithm-5d5b922b0bdbf818.yaml
new file mode 100644
index 000000000000..3f42f7090828
--- /dev/null
+++ b/releasenotes/notes/bug-1978372-optimized-numa-fitting-algorithm-5d5b922b0bdbf818.yaml
@@ -0,0 +1,9 @@
+---
+fixes:
+  - |
+    The algorithm that is used to see if a multi-NUMA guest fits on
+    a multi-NUMA host has been optimized to speed up the decision
+    on hosts with a high number of NUMA nodes (> 8). For details see
+    `bug 1978372`_.
+
+    .. _bug 1978372: https://bugs.launchpad.net/nova/+bug/1978372