From 035b8404fce878b0a88c4741bea46135b6af51e8 Mon Sep 17 00:00:00 2001
From: Matthew N Heler <matthew.heler@hotmail.com>
Date: Wed, 11 Sep 2024 12:28:15 -0500
Subject: [PATCH] Fix regression with live migration on shared storage

Commit c1ccc1a3165ec1556c605b3b036274e992b0a09d introduced a
regression when NUMA instances are live migrated on shared storage.

The live migration support for the power mgmt feature means we need to
call driver.cleanup() for all NUMA instances to potentially offline
pcpus that are not used any more after the instance is migrated away.
However this change exposed an issue with the disk cleanup logic. Nova
should never delete the instance directory if that directory is on
shared storage (e.g. the nova instances path is backed by NFS).

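This patch fixes that behavior: when the instance path is shared, the
post-migration cleanup no longer requests disk destruction or removal
of the instance directory, so live migration on shared storage works
again.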

Closes-Bug: #2080436
Change-Id: Ia2bbb5b4ac728563a8aabd857ed0503449991df1
---
 nova/compute/manager.py                       |  4 +++-
 nova/tests/unit/compute/test_compute_mgr.py   | 14 ++++++++++--
 nova/tests/unit/virt/libvirt/test_driver.py   | 22 ++++++++++++++++++-
 nova/virt/libvirt/driver.py                   | 10 ++++-----
 .../notes/bug-2080436-568b03b5b5ba5760.yaml   |  8 +++++++
 5 files changed, 49 insertions(+), 9 deletions(-)
 create mode 100644 releasenotes/notes/bug-2080436-568b03b5b5ba5760.yaml

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 0d169e7a4cc4..d5434a15cd0b 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -9205,7 +9205,9 @@ class ComputeManager(manager.Manager):
             # vpmem must be cleaned
             do_cleanup = (not migrate_data.is_shared_instance_path or
                           has_vpmem or has_mdevs or power_management_possible)
-            destroy_disks = not migrate_data.is_shared_block_storage
+            destroy_disks = not (
+                    migrate_data.is_shared_block_storage or
+                    migrate_data.is_shared_instance_path)
         elif isinstance(migrate_data, migrate_data_obj.HyperVLiveMigrateData):
             # NOTE(claudiub): We need to cleanup any zombie Planned VM.
             do_cleanup = True
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 0b16fb821880..4b1b22f78009 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -11380,7 +11380,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         do_cleanup, destroy_disks = self.compute._live_migration_cleanup_flags(
                 migrate_data, migr_ctxt)
         self.assertTrue(do_cleanup)
-        self.assertTrue(destroy_disks)
+        self.assertFalse(destroy_disks)
 
     def test_live_migration_cleanup_flags_block_migrate_libvirt(self):
         migrate_data = objects.LibvirtLiveMigrateData(
@@ -11407,7 +11407,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         do_cleanup, destroy_disks = self.compute._live_migration_cleanup_flags(
             migrate_data)
         self.assertFalse(do_cleanup)
-        self.assertTrue(destroy_disks)
+        self.assertFalse(destroy_disks)
 
     def test_live_migration_cleanup_flags_shared_libvirt(self):
         migrate_data = objects.LibvirtLiveMigrateData(
@@ -11418,6 +11418,16 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         self.assertFalse(do_cleanup)
         self.assertFalse(destroy_disks)
 
+    def test_live_migration_cleanup_flags_shared_path_libvirt_mdev(self):
+        migrate_data = objects.LibvirtLiveMigrateData(
+            is_shared_block_storage=False,
+            is_shared_instance_path=True,
+            target_mdevs={})
+        do_cleanup, destroy_disks = self.compute._live_migration_cleanup_flags(
+            migrate_data)
+        self.assertTrue(do_cleanup)
+        self.assertFalse(destroy_disks)
+
     def test_live_migration_cleanup_flags_live_migrate(self):
         do_cleanup, destroy_disks = self.compute._live_migration_cleanup_flags(
             {})
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index edea0177ac17..170d8bfc9a61 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -21002,7 +21002,8 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         # is_shared_block_storage=True and destroy_disks=False.
         instance = objects.Instance(self.context, **self.test_instance)
         migrate_data = objects.LibvirtLiveMigrateData(
-                is_shared_block_storage=True)
+                is_shared_block_storage=True,
+                is_shared_instance_path=False)
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI())
         drvr.cleanup(
             self.context, instance, network_info={}, destroy_disks=False,
@@ -21012,6 +21013,25 @@ class LibvirtConnTestCase(test.NoDBTestCase,
         self.assertTrue(instance.cleaned)
         save.assert_called_once_with()
 
+    @mock.patch.object(libvirt_driver.LibvirtDriver, 'delete_instance_files',
+                       return_value=True)
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(libvirt_driver.LibvirtDriver, '_undefine_domain')
+    def test_cleanup_migrate_data_block_storage_and_share_instance_dir(
+        self, _undefine_domain, save, delete_instance_files
+    ):
+        # Test the case when the instance directory is on shared storage
+        # (e.g. NFS) and the instance is booted from volume.
+        instance = objects.Instance(self.context, **self.test_instance)
+        migrate_data = objects.LibvirtLiveMigrateData(
+                is_shared_block_storage=True,
+                is_shared_instance_path=True)
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI())
+        drvr.cleanup(
+            self.context, instance, network_info={}, destroy_disks=False,
+            migrate_data=migrate_data, destroy_vifs=False)
+        delete_instance_files.assert_not_called()
+
     @mock.patch.object(libvirt_driver.LibvirtDriver, 'delete_instance_files',
                        return_value=True)
     @mock.patch.object(objects.Instance, 'save')
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 16834846e2d4..4e1fe43b236a 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -1705,12 +1705,12 @@ class LibvirtDriver(driver.ComputeDriver):
             cleanup_instance_dir = True
             cleanup_instance_disks = True
         else:
-            # NOTE(mdbooth): I think the theory here was that if this is a
-            # migration with shared block storage then we need to delete the
-            # instance directory because that's not shared. I'm pretty sure
-            # this is wrong.
+            # NOTE(mheler): For shared block storage we only need to clean up
+            # the instance directory when it's not on a shared path.
             if migrate_data and 'is_shared_block_storage' in migrate_data:
-                cleanup_instance_dir = migrate_data.is_shared_block_storage
+                cleanup_instance_dir = (
+                        migrate_data.is_shared_block_storage and
+                        not migrate_data.is_shared_instance_path)
 
             # NOTE(lyarwood): The following workaround allows operators to
             # ensure that non-shared instance directories are removed after an
diff --git a/releasenotes/notes/bug-2080436-568b03b5b5ba5760.yaml b/releasenotes/notes/bug-2080436-568b03b5b5ba5760.yaml
new file mode 100644
index 000000000000..fad6e75b0024
--- /dev/null
+++ b/releasenotes/notes/bug-2080436-568b03b5b5ba5760.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixes a regression for live migration on shared storage that
+    was removing the backing disk and instance folder during the
+    cleanup of a virtual machine post live migration.
+    See `bug 2080436
+    <https://bugs.launchpad.net/nova/+bug/2080436>`__ for details.