From 18163761d02fc02d5484f91bf52cd4f25536f95e Mon Sep 17 00:00:00 2001
From: Gorka Eguileor <geguileo@redhat.com>
Date: Tue, 12 Sep 2023 20:53:15 +0200
Subject: [PATCH] Fix guard for NVMe-oF volumes

When detaching multiple NVMe-oF volumes from the same host we may end
up with an NVMe subsystem stuck in the "connecting" state, and we'll
see a number of nvme errors in dmesg.

This happens on storage systems that share the same subsystem for
multiple volumes because Nova has not been updated to support the
tri-state "shared_targets" option that groups the detach and unmap of
volumes to prevent race conditions.

This is related to the issue mentioned in an os-brick commit message [1].

For the guard_connection method of os-brick to work as expected for
NVMe-oF volumes, we need to use microversion 3.69 when retrieving the
cinder volume.
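
To illustrate, here is a simplified sketch of the detach flow this
enables. It is only an approximation: guard_connection is the os-brick
context manager mentioned above, but the exact Nova call site and
arguments are condensed here.

    volume = self._get_volume(context, volume_api, self.volume_id)
    # With 3.69 the volume carries the tri-state shared_targets value.
    with brick_utils.guard_connection(volume):
        # Group the host-side detach and the attachment delete (unmap)
        # under the same guard so concurrent detaches of volumes on a
        # shared NVMe-oF subsystem cannot race each other.
        self.driver_detach(context, instance, volume_api, virt_driver)
        volume_api.attachment_delete(context, self['attachment_id'])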

In microversion 3.69 we started reporting 3 states for shared_targets:
True, False, and None.

- True is to guard iSCSI volumes and will only be used if the iSCSI
  initiator running on the host doesn't have the manual scans feature.

- False means that no target/subsystem is being shared, so no guard is
  necessary.

- None is to force guarding, and it's currently used for NVMe-oF
  volumes when sharing the subsystem (see the sketch below).

[1]: https://review.opendev.org/c/openstack/os-brick/+/836062/12//COMMIT_MSG

Closes-Bug: #2035375
Change-Id: I4def1c0f20118d0b8eb7d3bbb09af2948ffd70e1
---
 nova/tests/unit/virt/test_block_device.py     | 28 +++++++++++++++++--
 nova/virt/block_device.py                     | 15 +++++-----
 .../notes/nvmeof-guard-0f99effdd03983b6.yaml  |  6 ++++
 3 files changed, 40 insertions(+), 9 deletions(-)
 create mode 100644 releasenotes/notes/nvmeof-guard-0f99effdd03983b6.yaml

diff --git a/nova/tests/unit/virt/test_block_device.py b/nova/tests/unit/virt/test_block_device.py
index 703f15967cba..3973b7fb5f40 100644
--- a/nova/tests/unit/virt/test_block_device.py
+++ b/nova/tests/unit/virt/test_block_device.py
@@ -15,6 +15,7 @@
 from os_brick import encryptors
 from unittest import mock
 
+import ddt
 from oslo_serialization import jsonutils
 from oslo_utils.fixture import uuidsentinel as uuids
 
@@ -35,6 +36,7 @@ from nova.volume import cinder
 ATTACHMENT_ID = uuids.attachment_id
 
 
+@ddt.ddt
 class TestDriverBlockDevice(test.NoDBTestCase):
     # os-brick>=5.1 now uses external file system locks instead of internal
     # locks so we need to set up locking
@@ -613,6 +615,7 @@ class TestDriverBlockDevice(test.NoDBTestCase):
             # First call to get() fails because the API isn't new enough.
             # So we fallback to the old call.
             self.volume_api.get.side_effect = [
+                exception.CinderAPIVersionNotAvailable(version='3.69'),
                 exception.CinderAPIVersionNotAvailable(version='3.48'),
                 fake_volume]
 
@@ -688,14 +691,17 @@ class TestDriverBlockDevice(test.NoDBTestCase):
 
         if include_shared_targets:
             self.volume_api.get.assert_called_once_with(
-                self.context, fake_volume['id'], microversion='3.48')
+                self.context, fake_volume['id'], microversion='3.69')
         else:
             # First call to get() fails because the API isn't new enough.
             # So we fallback to the old call.
             self.volume_api.get.assert_has_calls([
+                mock.call(self.context, fake_volume['id'],
+                          microversion='3.69'),
                 mock.call(self.context, fake_volume['id'],
                           microversion='3.48'),
-                mock.call(self.context, fake_volume['id'])])
+                mock.call(self.context, fake_volume['id'],
+                          microversion=None)])
 
         try:
             self.volume_api.check_availability_zone.assert_called_once_with(
@@ -1557,6 +1563,24 @@ class TestDriverBlockDevice(test.NoDBTestCase):
         self._test_boot_from_volume_source_snapshot_volume_type(
             bdm, 'fake-lvm-1')
 
+    @ddt.data(['3.69'], ['3.69', '3.48'], ['3.69', '3.48', None])
+    def test__get_volume(self, microversions):
+        volume_api = mock.Mock()
+        exp = mock.Mock()
+        exc = exception.CinderAPIVersionNotAvailable
+        side_effect = [exc(version=mv) for mv in microversions[:-1]] + [exp]
+        volume_api.get.side_effect = side_effect
+
+        res = self.driver_classes['volume']._get_volume(
+            self.context, volume_api, mock.sentinel.volume_id)
+
+        self.assertEqual(exp, res)
+
+        self.assertEqual(len(microversions), volume_api.get.call_count)
+        volume_api.get.assert_has_calls(
+            [mock.call(self.context, mock.sentinel.volume_id, microversion=mv)
+             for mv in microversions])
+
 
 class TestDriverBlockDeviceNewFlow(TestDriverBlockDevice):
     """Virt block_device tests for the Cinder 3.44 volume attach flow
diff --git a/nova/virt/block_device.py b/nova/virt/block_device.py
index 28a866a817fb..059131c25095 100644
--- a/nova/virt/block_device.py
+++ b/nova/virt/block_device.py
@@ -399,13 +399,14 @@ class DriverVolumeBlockDevice(DriverBlockDevice):
 
     @staticmethod
     def _get_volume(context, volume_api, volume_id):
-        # First try to get the volume at microversion 3.48 so we can get the
-        # shared_targets parameter exposed in that version. If that API version
-        # is not available, we just fallback.
-        try:
-            return volume_api.get(context, volume_id, microversion='3.48')
-        except exception.CinderAPIVersionNotAvailable:
-            return volume_api.get(context, volume_id)
+        # First try 3.69 to get the tri-state shared_targets, then 3.48 for
+        # the boolean shared_targets, and finally fall back to the base API.
+        versions = ('3.69', '3.48', None)
+        for mv in versions:
+            try:
+                return volume_api.get(context, volume_id, microversion=mv)
+            except exception.CinderAPIVersionNotAvailable:
+                pass
 
     def _create_volume(self, context, instance, volume_api, size,
                        wait_func=None, **create_kwargs):
diff --git a/releasenotes/notes/nvmeof-guard-0f99effdd03983b6.yaml b/releasenotes/notes/nvmeof-guard-0f99effdd03983b6.yaml
new file mode 100644
index 000000000000..379ae9daace3
--- /dev/null
+++ b/releasenotes/notes/nvmeof-guard-0f99effdd03983b6.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    `Bug #2035375 <https://bugs.launchpad.net/nova/+bug/2035375>`_: Fixed
+    leftover NVMe-oF subsystems when disconnecting multiple NVMe-oF volumes
+    on the same host from backends that share one subsystem across volumes.