From 34e0c0205b1053d3bbe064177740aba654997fe0 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Fri, 24 Sep 2021 15:17:28 +0200
Subject: [PATCH] Store old_flavor already on source host during resize

During resize, on the source host, in resize_instance(), instance.host
and instance.node are updated to point to the destination host. This
indicates to the source host's resource tracker that the allocation of
this instance no longer needs to be tracked as an instance but as an
outbound migration instead. However, for the source host's resource
tracker to do that, it needs to use instance.old_flavor. Unfortunately,
instance.old_flavor is only set during finish_resize() on the
destination host (resize_instance() casts to finish_resize()).

So it is possible that a running resize_instance() sets instance.host
to point to the destination and then, before finish_resize() can set
old_flavor, an update_available_resources periodic runs on the source
host. As a result, the allocation of this instance is not tracked as an
instance, because instance.host points to the destination, but it is
not tracked as a migration either, because instance.old_flavor is not
yet set. So the allocation on the source host is simply dropped by the
periodic job.

When such a migration is confirmed, confirm_resize() tries to drop the
same resource allocation again but fails, as the pinned CPUs of the
instance are already freed.

When such a migration is reverted instead, the revert succeeds, but the
source host resource allocation will not contain the resource
allocation of the instance until the next update_available_resources
periodic runs and corrects it.

This does not affect resources tracked exclusively in placement (e.g.
VCPU, MEMORY_MB, DISK_GB) but it does affect NUMA-related resources
that are still tracked in the resource tracker (e.g. huge pages, pinned
CPUs).

This patch moves the instance.old_flavor assignment on the source node
into the same transaction that sets instance.host to point to the
destination host, hence resolving the race condition. (An illustrative
sketch of this ordering follows the diff below.)

Change-Id: Ic0d6c59147abe5e094e8f13e0d3523b178daeba9
Closes-Bug: #1944759
(cherry picked from commit b841e553214be9a732703e2dfed6c97698ef9b71)
(cherry picked from commit d4edcd62bae44c01885268a6cf7b7fae92617060)
(cherry picked from commit c8b04d183f560a616a79577c4d4ae9465d03e541)
---
 nova/compute/manager.py                        | 12 ++++++
 .../functional/libvirt/test_numa_servers.py    | 40 ++++++-------------
 2 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 72e88ce6e6e0..281f2733aab7 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -5654,6 +5654,14 @@ class ComputeManager(manager.Manager):
 
         instance.host = migration.dest_compute
         instance.node = migration.dest_node
+        # NOTE(gibi): as the instance now tracked on the destination we
+        # have to make sure that the source compute resource track can
+        # track this instance as a migration. For that the resource tracker
+        # needs to see the old_flavor set on the instance.
+        # The old_flavor setting used to be done on the destination host in
+        # finish_resize but that is racy with a source host
+        # update_available_resource periodic run
+        instance.old_flavor = instance.flavor
         instance.task_state = task_states.RESIZE_MIGRATED
         instance.save(expected_task_state=task_states.RESIZE_MIGRATING)
@@ -5767,6 +5775,10 @@ class ComputeManager(manager.Manager):
         # to ACTIVE for backwards compatibility
         old_vm_state = instance.system_metadata.get('old_vm_state',
                                                     vm_states.ACTIVE)
+        # NOTE(gibi): this is already set by the resize_instance on the source
+        # node before calling finish_resize on destination but during upgrade
+        # it can be that the source node is not having the fix for bug 1944759
+        # yet. This assignment can be removed in Z release.
         instance.old_flavor = old_flavor
 
         if old_instance_type_id != new_instance_type_id:
diff --git a/nova/tests/functional/libvirt/test_numa_servers.py b/nova/tests/functional/libvirt/test_numa_servers.py
index 90afeb763c06..144bad33c82e 100644
--- a/nova/tests/functional/libvirt/test_numa_servers.py
+++ b/nova/tests/functional/libvirt/test_numa_servers.py
@@ -766,10 +766,10 @@ class NUMAServersTest(NUMAServersTestBase):
 
         dst_host = server['OS-EXT-SRV-ATTR:host']
 
-        # This is a resource accounting bug, we should have 2 cpus pinned on
-        # both computes. The source should have it due to the outbound
-        # migration and the destination due to the instance running there
-        self._assert_pinned_cpus(src_host, 0)
+        # we have 2 cpus pinned on both computes. The source should have it
+        # due to the outbound migration and the destination due to the
+        # instance running there
+        self._assert_pinned_cpus(src_host, 2)
         self._assert_pinned_cpus(dst_host, 2)
 
         return server, src_host, dst_host
@@ -781,30 +781,17 @@ class NUMAServersTest(NUMAServersTestBase):
         # Now confirm the resize
         post = {'confirmResize': None}
 
-        # FIXME(gibi): This is bug 1944759 where during resize, on the source
-        # node the resize_instance() call at the point of calling finish_resize
-        # overlaps with a update_available_resources() periodic job. This
-        # causes that the periodic job will not track the migration nor the
-        # instance and therefore freeing the resource allocation. Then when
-        # later the resize is confirmed the confirm_resize on the source
-        # compute also wants to free up the resources, the pinned CPUs, and it
-        # fails as they are already freed.
-        exc = self.assertRaises(
-            client.OpenStackApiException,
-            self.api.post_server_action, server['id'], post
-        )
-        self.assertEqual(500, exc.response.status_code)
-        self.assertIn('CPUUnpinningInvalid', str(exc))
+        self.api.post_server_action(server['id'], post)
+        self._wait_for_state_change(server, 'ACTIVE')
 
-        # confirm failed above but the resource allocation reflects that the
-        # VM is running on the dest node
+        # the resource allocation reflects that the VM is running on the dest
+        # node
         self._assert_pinned_cpus(src_host, 0)
         self._assert_pinned_cpus(dst_host, 2)
 
+        # and running periodics does not break it either
         self._run_periodics()
 
-        # and such allocation situation is stable so as a recovery the VM
-        # can be reset-state to ACTIVE without problem.
         self._assert_pinned_cpus(src_host, 0)
         self._assert_pinned_cpus(dst_host, 2)
 
@@ -820,15 +807,14 @@ class NUMAServersTest(NUMAServersTestBase):
         self.api.post_server_action(server['id'], post)
         self._wait_for_state_change(server, 'ACTIVE')
 
-        # This is a resource accounting bug. After the revert the source host
-        # should have 2 cpus pinned due to the instance.
-        self._assert_pinned_cpus(src_host, 0)
+        # After the revert the source host should have 2 cpus pinned due to
+        # the instance.
+        self._assert_pinned_cpus(src_host, 2)
         self._assert_pinned_cpus(dst_host, 0)
 
-        # running the periodic job will fix the resource accounting
+        # running the periodic job will not break it either
         self._run_periodics()
 
-        # this is now correct
         self._assert_pinned_cpus(src_host, 2)
         self._assert_pinned_cpus(dst_host, 0)
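
The following is an illustrative, self-contained sketch of the ordering
problem described in the commit message above. It is not nova's
resource-tracker code: Instance and source_tracks_allocation() are
hypothetical stand-ins that condense the decision the source host's
update_available_resource periodic makes, namely "track as an instance
while instance.host still points at this host, otherwise track as an
outbound migration only if old_flavor is available".

    import dataclasses
    from typing import Optional


    @dataclasses.dataclass
    class Instance:
        """Bare stand-in for the real instance object (illustrative only)."""
        host: str
        flavor: str                       # stands in for the full flavor object
        old_flavor: Optional[str] = None


    def source_tracks_allocation(instance: Instance, source_host: str) -> bool:
        """Condensed model of the source host's periodic accounting decision."""
        if instance.host == source_host:
            # Still homed here: tracked as a normal instance.
            return True
        # Already pointing at the destination: the source can only keep the
        # allocation alive as an outbound migration, which needs old_flavor.
        return instance.old_flavor is not None


    # Racy ordering (before the fix): host flips first, old_flavor is only set
    # later by finish_resize() on the destination.
    inst = Instance(host="src", flavor="flavor-with-2-pinned-cpus")
    inst.host = "dest"                      # resize_instance() on the source
    # ... update_available_resource periodic fires on the source host here ...
    assert source_tracks_allocation(inst, "src") is False  # allocation dropped
    inst.old_flavor = inst.flavor           # finish_resize(), but too late

    # Fixed ordering (this patch): old_flavor is set in the same transaction
    # that flips host/node, so the periodic never sees the in-between state.
    inst = Instance(host="src", flavor="flavor-with-2-pinned-cpus")
    inst.old_flavor = inst.flavor
    inst.host = "dest"                      # both visible together after save()
    assert source_tracks_allocation(inst, "src") is True   # kept as migration

Under this simplification, the old_flavor assignment kept in finish_resize()
on the destination host remains only for rolling-upgrade compatibility, as
the second NOTE(gibi) comment in the diff explains.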