Use file locks in connectors
Currently os-brick is using in-process locks that will only prevent concurrent
access to critical sections to threads within a single process.
But based on the comment from iSCSI it seems like the code assumed that
these were file based locks that prevented concurrent access from
multiple processes.
Mentioned iSCSI comment is being removed because it's not correct that
our current retry mechanism will work with connect and disconnect
concurrency issues.
The reason why we haven't seen errors in Nova is because it runs a
single process and locks will be effective.
This is probably also not an issue in some transport protocols, such as
FC and RBD, and it wouldn't be an issue in iSCSI connections that don't
share targets.
But for others, such as iSCSI with shared targets and NVMe-OF, not
using file locks will create race conditions in the following cases:
- More than 1 cinder backend: Because we can have one backend doing a
detach in a create volume from image and the other an attach for an
offline migration.
- Backup/Restore if backup and volume services are running on the same
host.
- HCI scenarios where cinder volume and nova compute are running on the
same host, even if the same lock path if configured.
- Glance using Cinder as backend and is running on the same node as
cinder-volume or cinder-backup.
The problematic race conditions happen because the disconnect will do a
logout of the iSCSI target once the connect call has already confirmed
that the session to the target exists.
We could just add the file locks to iSCSI and NVMe, but I think it's
safer to add it to all the connectors and then, after proper testing, we
can can change back the locks that can be changed, and remove or reduce
the critical section in others.
Closes-Bug: #1947370
Change-Id: I6f7f7d19540361204d4ae3ead2bd6dcddb8fcd68
(cherry picked from commit 6a43669edc
)
This commit is contained in:
parent
4d116483af
commit
19a4820f5c
|
@ -168,7 +168,7 @@ class FibreChannelConnector(base.BaseLinuxConnector):
|
|||
return volume_paths
|
||||
|
||||
@utils.trace
|
||||
@synchronized('extend_volume')
|
||||
@synchronized('extend_volume', external=True)
|
||||
def extend_volume(self, connection_properties):
|
||||
"""Update the local kernel's size information.
|
||||
|
||||
|
@ -189,7 +189,7 @@ class FibreChannelConnector(base.BaseLinuxConnector):
|
|||
raise exception.VolumePathsNotFound()
|
||||
|
||||
@utils.trace
|
||||
@synchronized('connect_volume')
|
||||
@synchronized('connect_volume', external=True)
|
||||
def connect_volume(self, connection_properties):
|
||||
"""Attach the volume to instance_name.
|
||||
|
||||
|
@ -314,7 +314,7 @@ class FibreChannelConnector(base.BaseLinuxConnector):
|
|||
return raw_devices
|
||||
|
||||
@utils.trace
|
||||
@synchronized('connect_volume')
|
||||
@synchronized('connect_volume', external=True)
|
||||
def disconnect_volume(self, connection_properties, device_info,
|
||||
force=False, ignore_errors=False):
|
||||
"""Detach the volume from instance_name.
|
||||
|
|
|
@ -85,7 +85,7 @@ class HuaweiStorHyperConnector(base.BaseLinuxConnector):
|
|||
return out['dev_addr']
|
||||
|
||||
@utils.trace
|
||||
@synchronized('connect_volume')
|
||||
@synchronized('connect_volume', external=True)
|
||||
def connect_volume(self, connection_properties):
|
||||
"""Connect to a volume.
|
||||
|
||||
|
@ -116,7 +116,7 @@ class HuaweiStorHyperConnector(base.BaseLinuxConnector):
|
|||
return device_info
|
||||
|
||||
@utils.trace
|
||||
@synchronized('connect_volume')
|
||||
@synchronized('connect_volume', external=True)
|
||||
def disconnect_volume(self, connection_properties, device_info,
|
||||
force=False, ignore_errors=False):
|
||||
"""Disconnect a volume from the local host.
|
||||
|
|
|
@ -474,7 +474,7 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector):
|
|||
root_helper=self._root_helper)
|
||||
|
||||
@utils.trace
|
||||
@synchronized('extend_volume')
|
||||
@synchronized('extend_volume', external=True)
|
||||
def extend_volume(self, connection_properties: dict):
|
||||
"""Update the local kernel's size information.
|
||||
|
||||
|
@ -497,18 +497,10 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector):
|
|||
raise exception.VolumePathsNotFound()
|
||||
|
||||
@utils.trace
|
||||
@synchronized('connect_volume')
|
||||
@synchronized('connect_volume', external=True)
|
||||
def connect_volume(self, connection_properties: dict):
|
||||
"""Attach the volume to instance_name.
|
||||
|
||||
NOTE: Will retry up to three times to handle the case where c-vol
|
||||
and n-cpu are both using os-brick to manage iSCSI sessions but they
|
||||
are on the same node and using different locking directories. In this
|
||||
case, even though this call is synchronized, they will be separate
|
||||
locks and can still overlap with connect and disconnect. Since a
|
||||
disconnect during an initial attach can't cause IO failure (the device
|
||||
has not been made available yet), we just try the connection again.
|
||||
|
||||
:param connection_properties: The valid dictionary that describes all
|
||||
of the target volume attributes.
|
||||
:type connection_properties: dict
|
||||
|
@ -870,7 +862,7 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector):
|
|||
return device_map
|
||||
|
||||
@utils.trace
|
||||
@synchronized('connect_volume')
|
||||
@synchronized('connect_volume', external=True)
|
||||
def disconnect_volume(self, connection_properties, device_info,
|
||||
force=False, ignore_errors=False):
|
||||
"""Detach the volume from instance_name.
|
||||
|
@ -1056,7 +1048,8 @@ class ISCSIConnector(base.BaseLinuxConnector, base_iscsi.BaseISCSIConnector):
|
|||
target_iqn = connection_properties['target_iqn']
|
||||
|
||||
lock_name = f'connect_to_iscsi_portal-{portal}-{target_iqn}'
|
||||
method = synchronized(lock_name)(self._connect_to_iscsi_portal_unsafe)
|
||||
method = synchronized(
|
||||
lock_name, external=True)(self._connect_to_iscsi_portal_unsafe)
|
||||
return method(connection_properties)
|
||||
|
||||
@utils.retry((exception.BrickException))
|
||||
|
|
|
@ -317,7 +317,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
return self._is_nvme_available(nvme_name)
|
||||
|
||||
@utils.trace
|
||||
@synchronized('connect_volume')
|
||||
@synchronized('connect_volume', external=True)
|
||||
def connect_volume(self, connection_properties):
|
||||
"""Discover and attach the volume.
|
||||
|
||||
|
@ -368,7 +368,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
return device_info
|
||||
|
||||
@utils.trace
|
||||
@synchronized('connect_volume')
|
||||
@synchronized('connect_volume', external=True)
|
||||
def disconnect_volume(self, connection_properties, device_info,
|
||||
force=False, ignore_errors=False):
|
||||
"""Flush the volume.
|
||||
|
@ -412,7 +412,7 @@ class NVMeOFConnector(base.BaseLinuxConnector):
|
|||
raise
|
||||
|
||||
@utils.trace
|
||||
@synchronized('extend_volume')
|
||||
@synchronized('extend_volume', external=True)
|
||||
def extend_volume(self, connection_properties):
|
||||
"""Update the local kernel's size information.
|
||||
|
||||
|
|
|
@ -336,7 +336,7 @@ class ScaleIOConnector(base.BaseLinuxConnector):
|
|||
return device_info
|
||||
|
||||
@utils.trace
|
||||
@lockutils.synchronized('scaleio', 'scaleio-')
|
||||
@lockutils.synchronized('scaleio', 'scaleio-', external=True)
|
||||
def connect_volume(self, connection_properties):
|
||||
"""Connect the volume.
|
||||
|
||||
|
@ -450,7 +450,7 @@ class ScaleIOConnector(base.BaseLinuxConnector):
|
|||
return device_info
|
||||
|
||||
@utils.trace
|
||||
@lockutils.synchronized('scaleio', 'scaleio-')
|
||||
@lockutils.synchronized('scaleio', 'scaleio-', external=True)
|
||||
def disconnect_volume(self, connection_properties, device_info,
|
||||
force=False, ignore_errors=False):
|
||||
"""Disconnect the ScaleIO volume.
|
||||
|
|
|
@ -42,5 +42,6 @@ def guard_connection(device):
|
|||
else:
|
||||
# Cinder passes an OVO, but Nova passes a dictionary, so we use dict
|
||||
# key access that works with both.
|
||||
with lockutils.lock(device['service_uuid'], 'os-brick-'):
|
||||
with lockutils.lock(device['service_uuid'], 'os-brick-',
|
||||
external=True):
|
||||
yield
|
||||
|
|
|
@ -18,6 +18,8 @@ import os
|
|||
from unittest import mock
|
||||
|
||||
import fixtures
|
||||
from oslo_concurrency import lockutils
|
||||
from oslo_config import fixture as config_fixture
|
||||
from oslo_utils import strutils
|
||||
import testtools
|
||||
|
||||
|
@ -65,6 +67,13 @@ class TestCase(testtools.TestCase):
|
|||
patcher.start()
|
||||
self.addCleanup(patcher.stop)
|
||||
|
||||
# At runtime this would be set by the library user: Cinder, Nova, etc.
|
||||
self.useFixture(fixtures.NestedTempfile())
|
||||
lock_path = self.useFixture(fixtures.TempDir()).path
|
||||
self.fixture = self.useFixture(config_fixture.Config(lockutils.CONF))
|
||||
self.fixture.config(lock_path=lock_path, group='oslo_concurrency')
|
||||
lockutils.set_defaults(lock_path)
|
||||
|
||||
def _common_cleanup(self):
|
||||
"""Runs after each test method to tear down test environment."""
|
||||
|
||||
|
|
|
@ -59,4 +59,5 @@ class InitiatorUtilsTestCase(base.TestCase):
|
|||
utils.ISCSI_SUPPORTS_MANUAL_SCAN = False
|
||||
with utils.guard_connection({'service_uuid': mock.sentinel.uuid,
|
||||
'shared_targets': True}):
|
||||
mock_lock.assert_called_once_with(mock.sentinel.uuid, 'os-brick-')
|
||||
mock_lock.assert_called_once_with(mock.sentinel.uuid, 'os-brick-',
|
||||
external=True)
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
---
|
||||
upgrade:
|
||||
- |
|
||||
Services using os-brick need to set the ``lock_path`` configuration option
|
||||
in their ``[oslo_concurrency]`` section since it doesn't have a valid
|
||||
default (related `bug #1947370
|
||||
<https://bugs.launchpad.net/cinder/+bug/1947370>`_).
|
||||
fixes:
|
||||
- |
|
||||
`Bug #1947370 <https://bugs.launchpad.net/cinder/+bug/1947370>`_: Fixed
|
||||
race conditions on iSCSI with shared targets and NVMe ``connect_volume``
|
||||
and ``disconnect_volume`` calls.
|
Loading…
Reference in New Issue