Deadlock prevention support in synchronize

Some drivers may require multiple locks for a critical section, which
could potentially create deadlocks.  Like in the following example,
where `PowerMax` method `move_volume_between_storage_groups` creates 2
locks:

    @coordination.synchronized(
        "emc-sg-{source_storagegroup_name}-{serial_number}")
    @coordination.synchronized(
        "emc-sg-{target_storagegroup_name}-{serial_number}")
    def move_volume_between_storage_groups(
            self, serial_number, device_id, source_storagegroup_name,
            target_storagegroup_name, extra_specs, force=False,
            parent_sg=None):

That code can result in a deadlock if 2 opposite requests come in
concurrently and their first lock acquisition interleaves.

To make it easier for drivers to request multiple locks ensuring that no
deadlock happens the "synchronize" decorator in cinder.coordination
accepts multiple lock templates and acquires them sorted by rendered
name removing duplicates.

This helps prevent deadlocks because the same lock names will always be
acquired in the same order even if they are received in different order.

Change-Id: I820dd137892e032d076762608d746be187ba5019
This commit is contained in:
Gorka Eguileor 2022-07-06 19:08:45 +02:00 committed by Eric Harney
parent 2db3fc3ecd
commit 749772a6fb
3 changed files with 160 additions and 25 deletions

View File

@ -135,18 +135,43 @@ def synchronized_remove(glob_name, coordinator=COORDINATOR):
coordinator.remove_lock(glob_name)
def synchronized(lock_name: str,
def __acquire(lock, blocking, f_name):
"""Acquire a lock and return the time when it was acquired."""
t1 = timeutils.now()
name = utils.convert_str(lock.name)
LOG.debug('Acquiring lock "%s" by "%s"', name, f_name)
lock.acquire(blocking)
t2 = timeutils.now()
LOG.debug('Lock "%s" acquired by "%s" :: waited %0.3fs',
name, f_name, t2 - t1)
return t2
def __release(lock, acquired_time, f_name):
"""Release a lock ignoring exceptions."""
name = utils.convert_str(lock.name)
try:
lock.release()
held = timeutils.now() - acquired_time
LOG.debug('Lock "%s" released by "%s" :: held %0.3fs',
name, f_name, held)
except Exception as e:
LOG.error('Failed to release lock "%s": %s', name, e)
def synchronized(*lock_names: str,
blocking: bool = True,
coordinator: Coordinator = COORDINATOR) -> Callable:
"""Synchronization decorator.
:param str lock_name: Lock name.
:param str lock_names: Arbitrary number of Lock names.
:param blocking: If True, blocks until the lock is acquired.
If False, raises exception when not acquired. Otherwise,
the value is used as a timeout value and if lock is not acquired
after this number of seconds exception is raised.
after this number of seconds exception is raised. This is a keyword
only argument.
:param coordinator: Coordinator class to use when creating lock.
Defaults to the global coordinator.
Defaults to the global coordinator. This is a keyword only argument.
:raises tooz.coordination.LockAcquireFailed: if lock is not acquired
Decorating a method like so::
@ -175,37 +200,44 @@ def synchronized(lock_name: str,
def foo(self, vol, snap):
...
Multiple locks can be requested simultaneously and the decorator will
reorder the names by rendered lock names to prevent potential deadlocks.
@synchronized('{f_name}-{vol.id}-{snap[name]}',
'{f_name}-{vol.id}.delete')
def foo(self, vol, snap):
...
Available field names are: decorated function parameters and
`f_name` as a decorated function name.
"""
@decorator.decorator
def _synchronized(f, *a, **k) -> Callable:
call_args = inspect.getcallargs(f, *a, **k)
call_args['f_name'] = f.__name__
lock = coordinator.get_lock(lock_name.format(**call_args))
name = utils.convert_str(lock.name)
# Prevent deadlocks not duplicating and sorting them by name to always
# acquire them in the same order.
names = sorted(set([name.format(**call_args) for name in lock_names]))
locks = [coordinator.get_lock(name) for name in names]
acquired_times = []
f_name = f.__name__
t1 = timeutils.now()
t2 = None
try:
LOG.debug('Acquiring lock "%(name)s" by "%(f_name)s"',
{'name': name, 'f_name': f_name})
with lock(blocking):
t2 = timeutils.now()
LOG.debug('Lock "%(name)s" acquired by "%(f_name)s" :: '
'waited %(wait)s',
{'name': name, 'f_name': f_name,
'wait': "%0.3fs" % (t2 - t1)})
return f(*a, **k)
if len(locks) > 1: # Don't pollute logs for single locks
LOG.debug('Acquiring %s locks by %s', len(locks), f_name)
for lock in locks:
acquired_times.append(__acquire(lock, blocking, f_name))
if len(locks) > 1:
t = timeutils.now() - t1
LOG.debug('Acquired %s locks by %s in %0.3fs',
len(locks), f_name, t)
return f(*a, **k)
finally:
t3 = timeutils.now()
if t2 is None:
held_secs = "N/A"
else:
held_secs = "%0.3fs" % (t3 - t2)
LOG.debug(
'Lock "%(name)s" released by "%(f_name)s" :: held %(held)s',
{'name': name, 'f_name': f_name, 'held': held_secs})
for lock, acquired_time in zip(locks, acquired_times):
__release(lock, acquired_time, f_name)
return _synchronized

View File

@ -209,3 +209,71 @@ class CoordinationTestCase(test.TestCase):
coordination.synchronized_remove(mock.sentinel.glob_name, coordinator)
coordinator.remove_lock.assert_called_once_with(
mock.sentinel.glob_name)
@mock.patch.object(coordination.COORDINATOR, 'get_lock')
def test_synchronized_multiple_templates(self, get_lock):
"""Test locks requested in the right order and duplicates removed."""
locks = ['lock-{f_name}-%s-{foo.val}-{bar[val]}' % i for i in range(3)]
expect = [f'lock-func-{i}-7-8' for i in range(3)]
@coordination.synchronized(locks[1], locks[0], locks[1], locks[2])
def func(foo, bar):
pass
foo = mock.Mock(val=7)
bar = mock.MagicMock()
bar.__getitem__.return_value = 8
func(foo, bar)
self.assertEqual(len(expect), get_lock.call_count)
get_lock.assert_has_calls([mock.call(lock) for lock in expect])
self.assertEqual(len(expect), get_lock.return_value.acquire.call_count)
get_lock.return_value.acquire.assert_has_calls(
[mock.call(True)] * len(expect))
self.assertEqual(['foo', 'bar'], inspect.getfullargspec(func)[0])
@mock.patch('oslo_utils.timeutils.now', side_effect=[1, 2])
def test___acquire(self, mock_now):
lock = mock.Mock()
# Using getattr to avoid AttributeError: module 'cinder.coordination'
# has no attribute '_CoordinationTestCase__acquire'
res = getattr(coordination, '__acquire')(lock, mock.sentinel.blocking,
mock.sentinel.f_name)
self.assertEqual(2, res)
self.assertEqual(2, mock_now.call_count)
mock_now.assert_has_calls([mock.call(), mock.call()])
lock.acquire.assert_called_once_with(mock.sentinel.blocking)
@mock.patch('oslo_utils.timeutils.now')
def test___acquire_propagates_exception(self, mock_now):
lock = mock.Mock()
lock.acquire.side_effect = ValueError
# Using getattr to avoid AttributeError: module 'cinder.coordination'
# has no attribute '_CoordinationTestCase__acquire'
self.assertRaises(ValueError,
getattr(coordination, '__acquire'),
lock, mock.sentinel.blocking, mock.sentinel.f_name)
mock_now.assert_called_once_with()
lock.acquire.assert_called_once_with(mock.sentinel.blocking)
@mock.patch('oslo_utils.timeutils.now', return_value=2)
def test___release(self, mock_now):
lock = mock.Mock()
# Using getattr to avoid AttributeError: module 'cinder.coordination'
# has no attribute '_CoordinationTestCase__release'
getattr(coordination, '__release')(lock, 1, mock.sentinel.f_name)
mock_now.assert_called_once_with()
lock.release.assert_called_once_with()
@mock.patch('oslo_utils.timeutils.now')
def test___release_ignores_exception(self, mock_now):
lock = mock.Mock()
lock.release.side_effect = ValueError
# Using getattr to avoid AttributeError: module 'cinder.coordination'
# has no attribute '_CoordinationTestCase__release'
getattr(coordination, '__release')(lock, 1, mock.sentinel.f_name)
mock_now.assert_not_called()
lock.release.assert_called_once_with()

View File

@ -642,6 +642,10 @@ race conditions.
Global locking functionality is provided by the `synchronized` decorator from
`cinder.coordination`.
.. attention:: Optional `blocking` and `coordinator` arguments to the
`synchronized` decorator are **keyword** arguments only and cannot be passed
as positional arguments.
This method is more advanced than the one used for the `Process locks`_ and the
`Node locks`_, as the name supports templates. For the template we have all
the method parameters as well as `f_name` that represents that name of the
@ -673,6 +677,37 @@ attribute from `self`, and a recursive reference in the `snapshot` parameter.
@coordination.synchronized('{self.driver_prefix}-{snapshot.volume.id}')
def create_snapshot(self, snapshot):
Some drivers may require multiple locks for a critical section, which could
potentially create deadlocks. Like in the following example, where `PowerMax`
method `move_volume_between_storage_groups` creates 2 locks:
.. code-block:: python
@coordination.synchronized(
"emc-sg-{source_storagegroup_name}-{serial_number}")
@coordination.synchronized(
"emc-sg-{target_storagegroup_name}-{serial_number}")
def move_volume_between_storage_groups(
self, serial_number, device_id, source_storagegroup_name,
target_storagegroup_name, extra_specs, force=False,
parent_sg=None):
That code can result in a deadlock if 2 opposite requests come in concurrently
and their first lock acquisition interleaves.
The solution is calling the `synchronized` decorator with both lock names and
let it resolve the acquire ordering issue for us. The right code would be:
.. code-block:: python
@coordination.synchronized(
"emc-sg-{source_storagegroup_name}-{serial_number}",
"emc-sg-{target_storagegroup_name}-{serial_number}")
def move_volume_between_storage_groups(
self, serial_number, device_id, source_storagegroup_name,
target_storagegroup_name, extra_specs, force=False,
parent_sg=None):
Internally Cinder uses the `Tooz library`_ to provide the distributed locking.
By default, this library is configured for Active-Passive deployments, where it
uses file locks equivalent to those used for `Node locks`_.