From 18163761d02fc02d5484f91bf52cd4f25536f95e Mon Sep 17 00:00:00 2001 From: Gorka Eguileor <geguileo@redhat.com> Date: Tue, 12 Sep 2023 20:53:15 +0200 Subject: [PATCH] Fix guard for NVMeOF volumes When detaching multiple NVMe-oF volumes from the same host we may end with an NVMe subsystem in "connecting" state, and we'll see a bunch of nvme errors in dmesg. This happens on storage systems that share the same subsystem for multiple volumes because Nova has not been updated to support the tri-state "shared_targets" option that groups the detach and unmap of volumes to prevent race conditions. This is related to the issue mentioned in an os-brick commit message [1] For the guard_connection method of os-brick to work as expected for NVMe-oF volumes we need to use microversion 3.69 when retrieving the cinder volume. In microversion 3.69 we started reporting 3 states for shared_targets: True, False, and None. - True is to guard iSCSI volumes and will only be used if the iSCSI initiator running on the host doesn't have the manual scans feature. - False is that no target/subsystem is being shared so no guard is necessary. - None is to force guarding, and it's currently used for NVMe-oF volumes when sharing the subsystem. 
[1]: https://review.opendev.org/c/openstack/os-brick/+/836062/12//COMMIT_MSG Closes-Bug: #2035375 Change-Id: I4def1c0f20118d0b8eb7d3bbb09af2948ffd70e1 --- nova/tests/unit/virt/test_block_device.py | 28 +++++++++++++++++-- nova/virt/block_device.py | 15 +++++----- .../notes/nvmeof-guard-0f99effdd03983b6.yaml | 6 ++++ 3 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 releasenotes/notes/nvmeof-guard-0f99effdd03983b6.yaml diff --git a/nova/tests/unit/virt/test_block_device.py b/nova/tests/unit/virt/test_block_device.py index 703f15967cba..3973b7fb5f40 100644 --- a/nova/tests/unit/virt/test_block_device.py +++ b/nova/tests/unit/virt/test_block_device.py @@ -15,6 +15,7 @@ from os_brick import encryptors from unittest import mock +import ddt from oslo_serialization import jsonutils from oslo_utils.fixture import uuidsentinel as uuids @@ -35,6 +36,7 @@ from nova.volume import cinder ATTACHMENT_ID = uuids.attachment_id +@ddt.ddt class TestDriverBlockDevice(test.NoDBTestCase): # os-brick>=5.1 now uses external file system locks instead of internal # locks so we need to set up locking @@ -613,6 +615,7 @@ class TestDriverBlockDevice(test.NoDBTestCase): # First call to get() fails because the API isn't new enough. # So we fallback to the old call. self.volume_api.get.side_effect = [ + exception.CinderAPIVersionNotAvailable(version='3.69'), exception.CinderAPIVersionNotAvailable(version='3.48'), fake_volume] @@ -688,14 +691,17 @@ class TestDriverBlockDevice(test.NoDBTestCase): if include_shared_targets: self.volume_api.get.assert_called_once_with( - self.context, fake_volume['id'], microversion='3.48') + self.context, fake_volume['id'], microversion='3.69') else: # First call to get() fails because the API isn't new enough. # So we fallback to the old call. 
self.volume_api.get.assert_has_calls([ + mock.call(self.context, fake_volume['id'], + microversion='3.69'), mock.call(self.context, fake_volume['id'], microversion='3.48'), - mock.call(self.context, fake_volume['id'])]) + mock.call(self.context, fake_volume['id'], + microversion=None)]) try: self.volume_api.check_availability_zone.assert_called_once_with( @@ -1557,6 +1563,24 @@ class TestDriverBlockDevice(test.NoDBTestCase): self._test_boot_from_volume_source_snapshot_volume_type( bdm, 'fake-lvm-1') + @ddt.data(['3.69'], ['3.69', '3.48'], ['3.69', '3.48', None]) + def test__get_volume(self, microversions): + volume_api = mock.Mock() + exp = mock.Mock() + exc = exception.CinderAPIVersionNotAvailable + side_effect = [exc(version=mv) for mv in microversions[:-1]] + [exp] + volume_api.get.side_effect = side_effect + + res = self.driver_classes['volume']._get_volume( + self.context, volume_api, mock.sentinel.volume_id) + + self.assertEqual(exp, res) + + self.assertEqual(len(microversions), volume_api.get.call_count) + volume_api.get.assert_has_calls( + [mock.call(self.context, mock.sentinel.volume_id, microversion=mv) + for mv in microversions]) + class TestDriverBlockDeviceNewFlow(TestDriverBlockDevice): """Virt block_device tests for the Cinder 3.44 volume attach flow diff --git a/nova/virt/block_device.py b/nova/virt/block_device.py index 28a866a817fb..059131c25095 100644 --- a/nova/virt/block_device.py +++ b/nova/virt/block_device.py @@ -399,13 +399,14 @@ class DriverVolumeBlockDevice(DriverBlockDevice): @staticmethod def _get_volume(context, volume_api, volume_id): - # First try to get the volume at microversion 3.48 so we can get the - # shared_targets parameter exposed in that version. If that API version - # is not available, we just fallback. 
- try: - return volume_api.get(context, volume_id, microversion='3.48') - except exception.CinderAPIVersionNotAvailable: - return volume_api.get(context, volume_id) + # First try microversion for tri-state shared_targets, then older + # shared_targets, finally fallback to standard v3. + versions = ('3.69', '3.48', None) + for mv in versions: + try: + return volume_api.get(context, volume_id, microversion=mv) + except exception.CinderAPIVersionNotAvailable: + pass def _create_volume(self, context, instance, volume_api, size, wait_func=None, **create_kwargs): diff --git a/releasenotes/notes/nvmeof-guard-0f99effdd03983b6.yaml b/releasenotes/notes/nvmeof-guard-0f99effdd03983b6.yaml new file mode 100644 index 000000000000..379ae9daace3 --- /dev/null +++ b/releasenotes/notes/nvmeof-guard-0f99effdd03983b6.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + `Bug #2035375 <https://bugs.launchpad.net/nova/+bug/2035375>`_: Fixed + leftover NVMe-oF subsystems when disconnecting multiple NVMe-oF volumes on + the same host from storage sharing the subsystem for different volumes.