From d2f6ec5569b1c1b68f567392cfc03a6476169b4b Mon Sep 17 00:00:00 2001
From: Gorka Eguileor
Date: Fri, 18 Oct 2019 15:51:13 +0200
Subject: [PATCH] Remove file locks once we delete a resource

File locks are never removed from the system, so they keep accumulating
in the locks directory, which can become problematic.

In this patch we start trying to delete these lock files when we delete
a volume or a snapshot.

This affects the two types of file locks we currently have:

- Using oslo lockutils synchronized with external=True
- Using coordination.synchronized when deployed in Active-Passive
  without a DLM

This will alleviate the ever-increasing number of files in the locks
directory. Deployment tools should implement a service that runs when
the host is booting and cleans up the locks directory before the
OpenStack services are started.

Partial-Bug: #1432387
Change-Id: Ic73ee64257aeb024383c6cb79f2e8c04810aaf69
---
 cinder/coordination.py                        |  39 ++++++-
 cinder/tests/unit/api/v3/test_attachments.py  |   2 +-
 cinder/tests/unit/test_coordination.py        | 102 +++++++++++++++++-
 cinder/tests/unit/test_utils.py               |  62 +++++++++++
 cinder/tests/unit/volume/__init__.py          |   4 +
 cinder/tests/unit/volume/test_snapshot.py     |  20 +++-
 cinder/tests/unit/volume/test_volume.py       |  59 +++++++---
 cinder/utils.py                               |  43 ++++++++
 cinder/volume/api.py                          |   3 +
 cinder/volume/driver.py                       |  53 ++++++++-
 cinder/volume/drivers/hpe/hpe_3par_base.py    |   5 +
 cinder/volume/flows/manager/create_volume.py  |   8 +-
 cinder/volume/manager.py                      |  68 ++++++++++--
 doc/source/contributor/high_availability.rst  |  34 ++++++
 ...file-locks-on-remove-e5898012f4114d3c.yaml |   7 ++
 15 files changed, 471 insertions(+), 38 deletions(-)
 create mode 100644 releasenotes/notes/clean-file-locks-on-remove-e5898012f4114d3c.yaml

diff --git a/cinder/coordination.py b/cinder/coordination.py
index 7fa6f55ade0..2de82db9948 100644
--- a/cinder/coordination.py
+++ b/cinder/coordination.py
@@ -15,7 +15,12 @@
 
 """Coordination and locking utilities."""
 
+import errno
+import glob
 import inspect
+import os
+import re
+import sys
 import uuid
 
 import decorator
@@ -55,16 +60,28 @@ class Coordinator(object):
         self.agent_id = agent_id or str(uuid.uuid4())
         self.started = False
         self.prefix = prefix
+        self._file_path = None
+
+    def _get_file_path(self, backend_url):
+        if backend_url.startswith('file://'):
+            path = backend_url[7:]
+            # Copied from TooZ's _normalize_path to get the same path they use
+            if sys.platform == 'win32':
+                path = re.sub(r'\\(?=\w:\\)', '', os.path.normpath(path))
+            return os.path.abspath(os.path.join(path, self.prefix))
+        return None
 
     def start(self):
         if self.started:
             return
 
+        backend_url = cfg.CONF.coordination.backend_url
+
         # NOTE(bluex): Tooz expects member_id as a byte string.
         member_id = (self.prefix + self.agent_id).encode('ascii')
-        self.coordinator = coordination.get_coordinator(
-            cfg.CONF.coordination.backend_url, member_id)
+        self.coordinator = coordination.get_coordinator(backend_url, member_id)
         self.coordinator.start(start_heart=True)
+        self._file_path = self._get_file_path(backend_url)
         self.started = True
 
     def stop(self):
@@ -87,10 +104,28 @@
         else:
             raise exception.LockCreationFailed(_('Coordinator uninitialized.'))
 
+    def remove_lock(self, glob_name):
+        # Most locks clean up on release, but not the file lock, so we
+        # manually clean them.
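+        # Note that _file_path already ends in this coordinator's 'cinder-'
+        # prefix, so the glob matches files like <lock_dir>/cinder-<glob_name>.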
+ if self._file_path: + files = glob.glob(self._file_path + glob_name) + for file_name in files: + try: + os.remove(file_name) + except Exception as exc: + if not (isinstance(exc, OSError) and + exc.errno == errno.ENOENT): + LOG.warning('Failed to cleanup lock %(name)s: %(exc)s', + {'name': file_name, 'exc': exc}) + COORDINATOR = Coordinator(prefix='cinder-') +def synchronized_remove(glob_name, coordinator=COORDINATOR): + coordinator.remove_lock(glob_name) + + def synchronized(lock_name, blocking=True, coordinator=COORDINATOR): """Synchronization decorator. diff --git a/cinder/tests/unit/api/v3/test_attachments.py b/cinder/tests/unit/api/v3/test_attachments.py index 24934e67b53..3f27cb28a26 100644 --- a/cinder/tests/unit/api/v3/test_attachments.py +++ b/cinder/tests/unit/api/v3/test_attachments.py @@ -216,7 +216,7 @@ class AttachmentsAPITestCase(test.TestCase): def _create_attachment(self, ctxt=None, volume_uuid=None, instance_uuid=None, mountpoint=None, attach_time=None, detach_time=None, - attach_status=None, attach_mode=None, host=None): + attach_status=None, attach_mode=None, host=''): """Create an attachment object.""" ctxt = ctxt or self.ctxt attachment = objects.VolumeAttachment(ctxt) diff --git a/cinder/tests/unit/test_coordination.py b/cinder/tests/unit/test_coordination.py index 365e37aab99..e291c84fde9 100644 --- a/cinder/tests/unit/test_coordination.py +++ b/cinder/tests/unit/test_coordination.py @@ -13,6 +13,7 @@ # License for the specific language governing permissions and limitations # under the License. +import errno import inspect from unittest import mock @@ -43,14 +44,18 @@ class MockToozLock(tooz.locking.Lock): self.active_locks.remove(self.name) -@mock.patch('tooz.coordination.get_coordinator') class CoordinatorTestCase(test.TestCase): MOCK_TOOZ = False - def test_coordinator_start(self, get_coordinator): + @mock.patch('cinder.coordination.cfg.CONF.coordination.backend_url') + @mock.patch('cinder.coordination.Coordinator._get_file_path') + @mock.patch('tooz.coordination.get_coordinator') + def test_coordinator_start(self, get_coordinator, mock_get_file_path, + mock_backend_url): crd = get_coordinator.return_value agent = coordination.Coordinator() + self.assertIsNone(agent._file_path) agent.start() self.assertTrue(get_coordinator.called) self.assertTrue(crd.start.called) @@ -58,6 +63,10 @@ class CoordinatorTestCase(test.TestCase): agent.start() crd.start.assert_called_once_with(start_heart=True) + mock_get_file_path.assert_called_once_with(mock_backend_url) + self.assertEqual(mock_get_file_path.return_value, agent._file_path) + + @mock.patch('tooz.coordination.get_coordinator') def test_coordinator_stop(self, get_coordinator): crd = get_coordinator.return_value @@ -71,6 +80,7 @@ class CoordinatorTestCase(test.TestCase): agent.stop() crd.stop.assert_called_once_with() + @mock.patch('tooz.coordination.get_coordinator') def test_coordinator_lock(self, get_coordinator): crd = get_coordinator.return_value crd.get_lock.side_effect = lambda n: MockToozLock(n) @@ -90,6 +100,7 @@ class CoordinatorTestCase(test.TestCase): self.assertRaises(Locked, agent2.get_lock(lock_name).acquire) self.assertNotIn(expected_name, MockToozLock.active_locks) + @mock.patch('tooz.coordination.get_coordinator') def test_coordinator_offline(self, get_coordinator): crd = get_coordinator.return_value crd.start.side_effect = tooz.coordination.ToozConnectionError('err') @@ -98,9 +109,82 @@ class CoordinatorTestCase(test.TestCase): self.assertRaises(tooz.coordination.ToozError, agent.start) 
         self.assertFalse(agent.started)
 
+    def test_get_file_path(self):
+        backend_url = 'file:///opt/stack/data/cinder'
+        res = coordination.COORDINATOR._get_file_path(backend_url)
+        self.assertEqual('/opt/stack/data/cinder/cinder-', res)
+
+    def test_get_file_path_non_file(self):
+        backend_url = 'etcd3+http://192.168.1.95:2379'
+        res = coordination.COORDINATOR._get_file_path(backend_url)
+        self.assertIsNone(res)
+
+    @mock.patch('cinder.coordination.COORDINATOR._file_path', None)
+    @mock.patch('glob.glob')
+    @mock.patch('os.remove')
+    def test_remove_lock_non_file_lock(self, mock_remove, mock_glob):
+        coordination.COORDINATOR.remove_lock('lock-file')
+        mock_glob.assert_not_called()
+        mock_remove.assert_not_called()
+
+    @mock.patch('cinder.coordination.COORDINATOR._file_path', '/data/cinder-')
+    @mock.patch('glob.glob')
+    @mock.patch('os.remove')
+    def test_remove_lock(self, mock_remove, mock_glob):
+        mock_glob.return_value = ['/data/cinder-attachment_update-UUID-1',
+                                  '/data/cinder-attachment_update-UUID-2']
+
+        coordination.COORDINATOR.remove_lock('attachment_update-UUID-*')
+
+        mock_glob.assert_called_once_with(
+            '/data/cinder-attachment_update-UUID-*')
+        self.assertEqual(2, mock_remove.call_count)
+        mock_remove.assert_has_calls(
+            [mock.call('/data/cinder-attachment_update-UUID-1'),
+             mock.call('/data/cinder-attachment_update-UUID-2')])
+
+    @mock.patch('cinder.coordination.COORDINATOR._file_path', '/data/cinder-')
+    @mock.patch('cinder.coordination.LOG.warning')
+    @mock.patch('glob.glob')
+    @mock.patch('os.remove')
+    def test_remove_lock_missing_file(self, mock_remove, mock_glob, mock_log):
+        mock_glob.return_value = ['/data/cinder-attachment_update-UUID-1',
+                                  '/data/cinder-attachment_update-UUID-2']
+        mock_remove.side_effect = [OSError(errno.ENOENT, ''), None]
+
+        coordination.COORDINATOR.remove_lock('attachment_update-UUID-*')
+
+        mock_glob.assert_called_once_with(
+            '/data/cinder-attachment_update-UUID-*')
+        self.assertEqual(2, mock_remove.call_count)
+        mock_remove.assert_has_calls(
+            [mock.call('/data/cinder-attachment_update-UUID-1'),
+             mock.call('/data/cinder-attachment_update-UUID-2')])
+        mock_log.assert_not_called()
+
+    @mock.patch('cinder.coordination.COORDINATOR._file_path', '/data/cinder-')
+    @mock.patch('cinder.coordination.LOG.warning')
+    @mock.patch('glob.glob')
+    @mock.patch('os.remove')
+    def test_remove_lock_unknown_failure(self, mock_remove, mock_glob,
+                                         mock_log):
+        mock_glob.return_value = ['/data/cinder-attachment_update-UUID-1',
+                                  '/data/cinder-attachment_update-UUID-2']
+        mock_remove.side_effect = [ValueError(), None]
+
+        coordination.COORDINATOR.remove_lock('attachment_update-UUID-*')
+
+        mock_glob.assert_called_once_with(
+            '/data/cinder-attachment_update-UUID-*')
+        self.assertEqual(2, mock_remove.call_count)
+        mock_remove.assert_has_calls(
+            [mock.call('/data/cinder-attachment_update-UUID-1'),
+             mock.call('/data/cinder-attachment_update-UUID-2')])
+        self.assertEqual(1, mock_log.call_count)
+
 
-@mock.patch.object(coordination.COORDINATOR, 'get_lock')
 class CoordinationTestCase(test.TestCase):
+    @mock.patch.object(coordination.COORDINATOR, 'get_lock')
     def test_synchronized(self, get_lock):
         @coordination.synchronized('lock-{f_name}-{foo.val}-{bar[val]}')
         def func(foo, bar):
@@ -113,3 +197,15 @@ class CoordinationTestCase(test.TestCase):
             func(foo, bar)
         get_lock.assert_called_with('lock-func-7-8')
         self.assertEqual(['foo', 'bar'], inspect.getfullargspec(func)[0])
+
+    @mock.patch('cinder.coordination.COORDINATOR.remove_lock')
+    def test_synchronized_remove(self, mock_remove):
+        coordination.synchronized_remove(mock.sentinel.glob_name)
+        mock_remove.assert_called_once_with(mock.sentinel.glob_name)
+
+    @mock.patch('cinder.coordination.COORDINATOR.remove_lock')
+    def test_synchronized_remove_custom_coordinator(self, mock_remove):
+        coordinator = mock.Mock()
+        coordination.synchronized_remove(mock.sentinel.glob_name, coordinator)
+        coordinator.remove_lock.assert_called_once_with(
+            mock.sentinel.glob_name)
diff --git a/cinder/tests/unit/test_utils.py b/cinder/tests/unit/test_utils.py
index b97c5c533a6..e4184179275 100644
--- a/cinder/tests/unit/test_utils.py
+++ b/cinder/tests/unit/test_utils.py
@@ -1443,3 +1443,65 @@ class TestKeystoneProjectGet(test.TestCase):
             project = api_utils.get_project(
                 self.context, self.context.project_id)
             self.assertEqual(expected_project.__dict__, project.__dict__)
+
+
+class TestCleanFileLocks(test.TestCase):
+
+    @mock.patch('cinder.utils.LOG.warning')
+    @mock.patch('cinder.utils.synchronized_remove')
+    def test_clean_volume_file_locks(self, mock_remove, mock_log):
+        driver = mock.Mock()
+
+        utils.clean_volume_file_locks('UUID', driver)
+
+        self.assertEqual(3, mock_remove.call_count)
+        mock_remove.assert_has_calls([mock.call('UUID-delete_volume'),
+                                      mock.call('UUID'),
+                                      mock.call('UUID-detach_volume')])
+        driver.clean_volume_file_locks.assert_called_once_with('UUID')
+        mock_log.assert_not_called()
+
+    @mock.patch('cinder.utils.LOG.warning')
+    @mock.patch('cinder.utils.synchronized_remove')
+    def test_clean_volume_file_locks_errors(self, mock_remove, mock_log):
+        driver = mock.Mock()
+        driver.clean_volume_file_locks.side_effect = Exception
+        mock_remove.side_effect = [None, Exception, None]
+
+        utils.clean_volume_file_locks('UUID', driver)
+
+        self.assertEqual(3, mock_remove.call_count)
+        mock_remove.assert_has_calls([mock.call('UUID-delete_volume'),
+                                      mock.call('UUID'),
+                                      mock.call('UUID-detach_volume')])
+        driver.clean_volume_file_locks.assert_called_once_with('UUID')
+        self.assertEqual(2, mock_log.call_count)
+
+    @mock.patch('cinder.utils.LOG.warning')
+    @mock.patch('cinder.utils.synchronized_remove')
+    def test_clean_snapshot_file_locks(self, mock_remove, mock_log):
+        driver = mock.Mock()
+
+        utils.clean_snapshot_file_locks('UUID', driver)
+
+        mock_remove.assert_called_once_with('UUID-delete_snapshot')
+        driver.clean_snapshot_file_locks.assert_called_once_with('UUID')
+        mock_log.assert_not_called()
+
+    @mock.patch('cinder.utils.LOG.warning')
+    @mock.patch('cinder.utils.synchronized_remove')
+    def test_clean_snapshot_file_locks_failures(self, mock_remove, mock_log):
+        driver = mock.Mock()
+        driver.clean_snapshot_file_locks.side_effect = Exception
+        mock_remove.side_effect = Exception
+
+        utils.clean_snapshot_file_locks('UUID', driver)
+
+        mock_remove.assert_called_once_with('UUID-delete_snapshot')
+        driver.clean_snapshot_file_locks.assert_called_once_with('UUID')
+        self.assertEqual(2, mock_log.call_count)
+
+    @mock.patch('cinder.coordination.synchronized_remove')
+    def test_api_clean_volume_file_locks(self, mock_remove):
+        utils.api_clean_volume_file_locks('UUID')
+        mock_remove.assert_called_once_with('attachment_update-UUID-*')
diff --git a/cinder/tests/unit/volume/__init__.py b/cinder/tests/unit/volume/__init__.py
index 8f8d9de70fd..a8f8f9a5eda 100644
--- a/cinder/tests/unit/volume/__init__.py
+++ b/cinder/tests/unit/volume/__init__.py
@@ -81,6 +81,10 @@ class BaseVolumeTestCase(test.TestCase):
         self.called = []
         self.volume_api = volume_api.API()
 
+        # Don't accidentally make a call to delete a file from the system
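+        # (both cinder.utils.synchronized_remove for oslo file locks and the
+        # os.remove used by coordination's file-lock cleanup are patched out)
+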
self.mock_lock_remove = self.patch('cinder.utils.synchronized_remove') + self.mock_dlm_lock_remove = self.patch('cinder.coordination.os.remove') + def _cleanup(self): try: shutil.rmtree(CONF.volumes_dir) diff --git a/cinder/tests/unit/volume/test_snapshot.py b/cinder/tests/unit/volume/test_snapshot.py index 672a4566b40..9f6129ad915 100644 --- a/cinder/tests/unit/volume/test_snapshot.py +++ b/cinder/tests/unit/volume/test_snapshot.py @@ -76,13 +76,15 @@ class SnapshotTestCase(base.BaseVolumeTestCase): self.vol_type = db.volume_type_get_by_name(self.context, 'vol_type_name') - def test_delete_snapshot_frozen(self): + @mock.patch('cinder.utils.clean_snapshot_file_locks') + def test_delete_snapshot_frozen(self, mock_clean): service = tests_utils.create_service(self.context, {'frozen': True}) volume = tests_utils.create_volume(self.context, host=service.host) snapshot = tests_utils.create_snapshot(self.context, volume.id) self.assertRaises(exception.InvalidInput, self.volume_api.delete_snapshot, self.context, snapshot) + mock_clean.assert_not_called() @ddt.data('create_snapshot', 'create_snapshot_force') def test_create_snapshot_frozen(self, method): @@ -114,8 +116,9 @@ class SnapshotTestCase(base.BaseVolumeTestCase): self.volume.delete_snapshot(self.context, snapshot_obj) self.volume.delete_volume(self.context, volume_src) + @mock.patch('cinder.utils.clean_snapshot_file_locks') @mock.patch('cinder.tests.unit.fake_notifier.FakeNotifier._notify') - def test_create_delete_snapshot(self, mock_notify): + def test_create_delete_snapshot(self, mock_notify, mock_clean): """Test snapshot can be created and deleted.""" volume = tests_utils.create_volume( self.context, @@ -146,6 +149,7 @@ class SnapshotTestCase(base.BaseVolumeTestCase): any_order=True) self.volume.delete_snapshot(self.context, snapshot) + mock_clean.assert_called_once_with(snapshot.id, self.volume.driver) self.assert_notify_called(mock_notify, (['INFO', 'volume.create.start'], ['INFO', 'volume.create.end'], @@ -514,7 +518,8 @@ class SnapshotTestCase(base.BaseVolumeTestCase): snapshot.destroy() db.volume_destroy(self.context, volume_id) - def test_delete_busy_snapshot(self): + @mock.patch('cinder.utils.clean_snapshot_file_locks') + def test_delete_busy_snapshot(self, mock_clean): """Test snapshot can be created and deleted.""" self.volume.driver.vg = fake_lvm.FakeBrickLVM('cinder-volumes', @@ -540,9 +545,11 @@ class SnapshotTestCase(base.BaseVolumeTestCase): self.assertEqual(fields.SnapshotStatus.AVAILABLE, snapshot_ref.status) mock_del_snap.assert_called_once_with(snapshot) + mock_clean.assert_not_called() + @mock.patch('cinder.utils.clean_snapshot_file_locks') @test.testtools.skipIf(sys.platform == "darwin", "SKIP on OSX") - def test_delete_no_dev_fails(self): + def test_delete_no_dev_fails(self, mock_clean): """Test delete snapshot with no dev file fails.""" self.mock_object(os.path, 'exists', lambda x: False) self.volume.driver.vg = fake_lvm.FakeBrickLVM('cinder-volumes', @@ -567,6 +574,7 @@ class SnapshotTestCase(base.BaseVolumeTestCase): self.assertEqual(fields.SnapshotStatus.AVAILABLE, snapshot_ref.status) mock_del_snap.assert_called_once_with(snapshot) + mock_clean.assert_not_called() def test_force_delete_snapshot(self): """Test snapshot can be forced to delete.""" @@ -618,7 +626,8 @@ class SnapshotTestCase(base.BaseVolumeTestCase): self.context, snap) - def test_delete_snapshot_driver_not_initialized(self): + @mock.patch('cinder.utils.clean_snapshot_file_locks') + def test_delete_snapshot_driver_not_initialized(self, 
mock_clean): volume = tests_utils.create_volume(self.context, **self.volume_params) snapshot = tests_utils.create_snapshot(self.context, volume.id) @@ -630,6 +639,7 @@ class SnapshotTestCase(base.BaseVolumeTestCase): snapshot.refresh() self.assertEqual(fields.SnapshotStatus.ERROR_DELETING, snapshot.status) + mock_clean.assert_not_called() @ddt.data({'all_tenants': '1', 'name': 'snap1'}, {'all_tenants': 'true', 'name': 'snap1'}, diff --git a/cinder/tests/unit/volume/test_volume.py b/cinder/tests/unit/volume/test_volume.py index 3b5108b98eb..6a4cc5ccefd 100644 --- a/cinder/tests/unit/volume/test_volume.py +++ b/cinder/tests/unit/volume/test_volume.py @@ -346,11 +346,13 @@ class VolumeTestCase(base.BaseVolumeTestCase): self.assertEqual("error_deleting", volume.status) volume.destroy() + @mock.patch('cinder.utils.clean_volume_file_locks') @mock.patch('cinder.tests.unit.fake_notifier.FakeNotifier._notify') @mock.patch('cinder.quota.QUOTAS.rollback', new=mock.Mock()) @mock.patch('cinder.quota.QUOTAS.commit', new=mock.Mock()) @mock.patch('cinder.quota.QUOTAS.reserve', return_value=['RESERVATION']) - def test_create_delete_volume(self, _mock_reserve, mock_notify): + def test_create_delete_volume(self, _mock_reserve, mock_notify, + mock_clean): """Test volume can be created and deleted.""" volume = tests_utils.create_volume( self.context, @@ -387,6 +389,7 @@ class VolumeTestCase(base.BaseVolumeTestCase): db.volume_get, self.context, volume_id) + mock_clean.assert_called_once_with(volume_id, self.volume.driver) def test_create_delete_volume_with_metadata(self): """Test volume can be created with metadata and deleted.""" @@ -403,11 +406,13 @@ class VolumeTestCase(base.BaseVolumeTestCase): self.context, volume_id) - def test_delete_volume_frozen(self): + @mock.patch('cinder.utils.clean_volume_file_locks') + def test_delete_volume_frozen(self, mock_clean): service = tests_utils.create_service(self.context, {'frozen': True}) volume = tests_utils.create_volume(self.context, host=service.host) self.assertRaises(exception.InvalidInput, self.volume_api.delete, self.context, volume) + mock_clean.assert_not_called() def test_delete_volume_another_cluster_fails(self): """Test delete of volume from another cluster fails.""" @@ -564,7 +569,8 @@ class VolumeTestCase(base.BaseVolumeTestCase): 'fake_key1', FAKE_METADATA_TYPE.fake_type) - def test_delete_volume_metadata_maintenance(self): + @mock.patch('cinder.utils.clean_volume_file_locks') + def test_delete_volume_metadata_maintenance(self, mock_clean): """Test delete volume metadata in maintenance.""" FAKE_METADATA_TYPE = enum.Enum('METADATA_TYPES', 'fake_type') test_meta1 = {'fake_key1': 'fake_value1', 'fake_key2': 'fake_value2'} @@ -577,6 +583,7 @@ class VolumeTestCase(base.BaseVolumeTestCase): volume, 'fake_key1', FAKE_METADATA_TYPE.fake_type) + mock_clean.assert_not_called() def test_accept_transfer_maintenance(self): """Test accept transfer in maintenance.""" @@ -974,7 +981,8 @@ class VolumeTestCase(base.BaseVolumeTestCase): self.assertEqual("deleting", volume.status) volume.destroy() - def test_delete_busy_volume(self): + @mock.patch('cinder.utils.clean_volume_file_locks') + def test_delete_busy_volume(self, mock_clean): """Test volume survives deletion if driver reports it as busy.""" volume = tests_utils.create_volume(self.context, **self.volume_params) volume_id = volume['id'] @@ -989,8 +997,10 @@ class VolumeTestCase(base.BaseVolumeTestCase): self.assertEqual(volume_id, volume_ref.id) self.assertEqual("available", volume_ref.status) 
mock_del_vol.assert_called_once_with(volume) + mock_clean.assert_not_called() - def test_unmanage_encrypted_volume_fails(self): + @mock.patch('cinder.utils.clean_volume_file_locks') + def test_unmanage_encrypted_volume_fails(self, mock_clean): volume = tests_utils.create_volume( self.context, encryption_key_id=fake.ENCRYPTION_KEY_ID, @@ -1002,6 +1012,7 @@ class VolumeTestCase(base.BaseVolumeTestCase): self.context, volume, unmanage_only=True) + mock_clean.assert_not_called() self.volume.delete_volume(self.context, volume) def test_unmanage_cascade_delete_fails(self): @@ -1082,7 +1093,8 @@ class VolumeTestCase(base.BaseVolumeTestCase): volume_api.get_all(self.context, filters={'all_tenants': '1'}) self.assertTrue(get_all.called) - def test_delete_volume_in_error_extending(self): + @mock.patch('cinder.utils.clean_volume_file_locks') + def test_delete_volume_in_error_extending(self, mock_clean): """Test volume can be deleted in error_extending stats.""" # create a volume volume = tests_utils.create_volume(self.context, **self.volume_params) @@ -1094,16 +1106,19 @@ class VolumeTestCase(base.BaseVolumeTestCase): self.volume.delete_volume(self.context, volume) self.assertRaises(exception.NotFound, db.volume_get, self.context, volume['id']) + mock_clean.assert_called_once_with(volume.id, self.volume.driver) + @mock.patch('cinder.utils.clean_volume_file_locks') @mock.patch.object(db.sqlalchemy.api, 'volume_get', side_effect=exception.VolumeNotFound( volume_id='12345678-1234-5678-1234-567812345678')) - def test_delete_volume_not_found(self, mock_get_volume): + def test_delete_volume_not_found(self, mock_get_volume, mock_clean): """Test delete volume moves on if the volume does not exist.""" volume_id = '12345678-1234-5678-1234-567812345678' volume = objects.Volume(self.context, status='available', id=volume_id) self.volume.delete_volume(self.context, volume) self.assertTrue(mock_get_volume.called) + mock_clean.assert_called_once_with(volume_id, self.volume.driver) @mock.patch('cinder.volume.drivers.lvm.LVMVolumeDriver.' 
'create_volume_from_snapshot') @@ -2119,7 +2134,8 @@ class VolumeTestCase(base.BaseVolumeTestCase): """Test volume can't be deleted in maintenance status.""" self._test_cannot_delete_volume('maintenance') - def _test_cannot_delete_volume(self, status): + @mock.patch('cinder.utils.clean_volume_file_locks') + def _test_cannot_delete_volume(self, status, mock_clean): """Test volume can't be deleted in invalid stats.""" # create a volume and assign to host volume = tests_utils.create_volume(self.context, CONF.host, @@ -2130,6 +2146,7 @@ class VolumeTestCase(base.BaseVolumeTestCase): self.volume_api.delete, self.context, volume) + mock_clean.assert_not_called() # clean up self.volume.delete_volume(self.context, volume) @@ -2172,7 +2189,8 @@ class VolumeTestCase(base.BaseVolumeTestCase): db.volume_destroy(self.context, volume.id) - def test__revert_to_snapshot_generic_failed(self): + @mock.patch('cinder.utils.clean_volume_file_locks') + def test__revert_to_snapshot_generic_failed(self, mock_clean): fake_volume = tests_utils.create_volume(self.context, status='available') fake_snapshot = tests_utils.create_snapshot(self.context, @@ -2197,8 +2215,11 @@ class VolumeTestCase(base.BaseVolumeTestCase): mock_copy.assert_called_once_with( self.context, temp_volume, fake_volume) mock_driver_delete.assert_called_once_with(temp_volume) + mock_clean.assert_called_once_with(temp_volume.id, + self.volume.driver) - def test__revert_to_snapshot_generic(self): + @mock.patch('cinder.utils.clean_volume_file_locks') + def test__revert_to_snapshot_generic(self, mock_clean): fake_volume = tests_utils.create_volume(self.context, status='available') fake_snapshot = tests_utils.create_snapshot(self.context, @@ -2218,6 +2239,8 @@ class VolumeTestCase(base.BaseVolumeTestCase): mock_copy.assert_called_once_with( self.context, temp_volume, fake_volume) mock_driver_delete.assert_called_once_with(temp_volume) + mock_clean.assert_called_once_with(temp_volume.id, + self.volume.driver) @ddt.data({'driver_error': True}, {'driver_error': False}) @@ -3133,14 +3156,16 @@ class VolumeTestCase(base.BaseVolumeTestCase): self.assertRaises(exception.VolumeNotFound, self.volume.create_volume, self.context, test_vol, - {'volume_properties': self.volume_params}, + {'volume_properties': self.volume_params, + 'source_volid': fake.VOLUME_ID}, {'retry': {'num_attempts': 1, 'host': []}}) volume = db.volume_get(context.get_admin_context(), test_vol_id) self.assertEqual('error', volume['status']) self.assertEqual({'_pool0': {'allocated_capacity_gb': 1}}, self.volume.stats['pools']) - def test_cascade_delete_volume_with_snapshots(self): + @mock.patch('cinder.utils.api_clean_volume_file_locks') + def test_cascade_delete_volume_with_snapshots(self, mock_api_clean): """Test volume deletion with dependent snapshots.""" volume = tests_utils.create_volume(self.context, **self.volume_params) self.volume.create_volume(self.context, volume) @@ -3158,8 +3183,10 @@ class VolumeTestCase(base.BaseVolumeTestCase): volume_api.delete(self.context, volume, cascade=True) + mock_api_clean.assert_called_once_with(volume.id) - def test_cascade_delete_volume_with_snapshots_error(self): + @mock.patch('cinder.utils.api_clean_volume_file_locks') + def test_cascade_delete_volume_with_snapshots_error(self, mock_api_clean): """Test volume deletion with dependent snapshots.""" volume = tests_utils.create_volume(self.context, **self.volume_params) self.volume.create_volume(self.context, volume) @@ -3182,8 +3209,11 @@ class VolumeTestCase(base.BaseVolumeTestCase): self.context, 
                           volume,
                           cascade=True)
+        mock_api_clean.assert_not_called()
 
-    def test_cascade_force_delete_volume_with_snapshots_error(self):
+    @mock.patch('cinder.utils.api_clean_volume_file_locks')
+    def test_cascade_force_delete_volume_with_snapshots_error(self,
+                                                              mock_api_clean):
         """Test volume force deletion with errored dependent snapshots."""
         volume = tests_utils.create_volume(self.context,
                                            host='fakehost')
@@ -3202,6 +3232,7 @@ class VolumeTestCase(base.BaseVolumeTestCase):
 
         volume = objects.Volume.get_by_id(self.context, volume.id)
         self.assertEqual('deleting', volume.status)
+        mock_api_clean.assert_called_once_with(volume.id)
 
     def test_cascade_delete_volume_with_snapshots_in_other_project(self):
         """Test volume deletion with dependent snapshots in other project."""
diff --git a/cinder/utils.py b/cinder/utils.py
index fcbb9f6af31..b96df875f4b 100644
--- a/cinder/utils.py
+++ b/cinder/utils.py
@@ -57,6 +57,7 @@
 from oslo_utils import strutils
 from oslo_utils import timeutils
 import tenacity
 
+from cinder import coordination
 from cinder import exception
 from cinder.i18n import _
@@ -69,6 +70,48 @@ INFINITE_UNKNOWN_VALUES = ('infinite', 'unknown')
 
 synchronized = lockutils.synchronized_with_prefix('cinder-')
+synchronized_remove = lockutils.remove_external_lock_file_with_prefix(
+    'cinder-')
+
+
+def clean_volume_file_locks(volume_id, driver):
+    """Remove file locks used by Cinder.
+
+    This doesn't take care of driver locks; those should be handled by the
+    driver's delete_volume method.
+    """
+    for name in (volume_id + '-delete_volume', volume_id,
+                 volume_id + '-detach_volume'):
+        try:
+            synchronized_remove(name)
+        except Exception as exc:
+            LOG.warning('Failed to cleanup volume lock %(name)s: %(exc)s',
+                        {'name': name, 'exc': exc})
+
+    try:
+        driver.clean_volume_file_locks(volume_id)
+    except Exception as exc:
+        LOG.warning('Failed to cleanup driver locks for volume %(id)s: '
+                    '%(exc)s', {'id': volume_id, 'exc': exc})
+
+
+def api_clean_volume_file_locks(volume_id):
+    coordination.synchronized_remove('attachment_update-' + volume_id + '-*')
+
+
+def clean_snapshot_file_locks(snapshot_id, driver):
+    try:
+        name = snapshot_id + '-delete_snapshot'
+        synchronized_remove(name)
+    except Exception as exc:
+        LOG.warning('Failed to cleanup snapshot lock %(name)s: %(exc)s',
+                    {'name': name, 'exc': exc})
+
+    try:
+        driver.clean_snapshot_file_locks(snapshot_id)
+    except Exception as exc:
+        LOG.warning('Failed to cleanup driver locks for snapshot %(id)s: '
+                    '%(exc)s', {'id': snapshot_id, 'exc': exc})
 
 
 def as_int(obj: Union[int, float, str], quiet: bool = True) -> int:
diff --git a/cinder/volume/api.py b/cinder/volume/api.py
index 8c5df3f50b7..fdd0c5f46bc 100644
--- a/cinder/volume/api.py
+++ b/cinder/volume/api.py
@@ -425,6 +425,7 @@ class API(base.Base):
             LOG.info("Delete volume request issued successfully.",
                      resource={'type': 'volume',
                                'id': volume.id})
+            utils.api_clean_volume_file_locks(volume.id)
             return
 
         if not unmanage_only:
@@ -533,6 +534,7 @@ class API(base.Base):
                                        volume,
                                        unmanage_only,
                                        cascade)
+        utils.api_clean_volume_file_locks(volume.id)
         LOG.info("Delete volume request issued successfully.",
                  resource=volume)
 
@@ -2297,6 +2299,7 @@ class API(base.Base):
             volume.status = status_updates['status']
             volume.attach_status = status_updates['attach_status']
         volume.save()
+
         return remaining_attachments
 
diff --git a/cinder/volume/driver.py b/cinder/volume/driver.py
index e5d434e0ef2..1433d211db1 100644
--- a/cinder/volume/driver.py
+++ b/cinder/volume/driver.py
@@ -620,6 +620,9 @@ class BaseVD(object, metaclass=abc.ABCMeta):
         It is imperative that this operation ensures that the data from the
         deleted volume cannot leak into new volumes when they are created, as
         new volumes are likely to belong to a different tenant/project.
+
+        If the driver uses custom file locks, they should be cleaned up on
+        success using cinder.utils.synchronized_remove.
         """
         return
@@ -1953,6 +1956,50 @@ class BaseVD(object, metaclass=abc.ABCMeta):
         return [CONF.backend_defaults._group._opts[cfg_name]['opt']
                 for cfg_name in cfg_names]
 
+    @classmethod
+    def clean_volume_file_locks(cls, volume_id):
+        """Clean up driver specific volume locks.
+
+        This method will be called when a volume has been removed from Cinder
+        or when we detect that the volume doesn't exist.
+
+        There are 3 types of locks in Cinder:
+
+        - Process locks: Don't need cleanup
+        - Node locks: Must use cinder.utils.synchronized_remove
+        - Global locks: Must use cinder.coordination.synchronized_remove
+
+        When using cinder.utils.synchronized_remove we must pass the exact
+        lock name, whereas cinder.coordination.synchronized_remove accepts a
+        glob.
+
+        Refer to clean_volume_file_locks, api_clean_volume_file_locks, and
+        clean_snapshot_file_locks in cinder.utils for examples.
+        """
+        pass
+
+    @classmethod
+    def clean_snapshot_file_locks(cls, snapshot_id):
+        """Clean up driver specific snapshot locks.
+
+        This method will be called when a snapshot has been removed from
+        Cinder or when we detect that the snapshot doesn't exist.
+
+        There are 3 types of locks in Cinder:
+
+        - Process locks: Don't need cleanup
+        - Node locks: Must use cinder.utils.synchronized_remove
+        - Global locks: Must use cinder.coordination.synchronized_remove
+
+        When using cinder.utils.synchronized_remove we must pass the exact
+        lock name, whereas cinder.coordination.synchronized_remove accepts a
+        glob.
+
+        Refer to clean_volume_file_locks, api_clean_volume_file_locks, and
+        clean_snapshot_file_locks in cinder.utils for examples.
+        """
+        pass
+
 
 class CloneableImageVD(object, metaclass=abc.ABCMeta):
     @abc.abstractmethod
@@ -2217,7 +2264,11 @@ class VolumeDriver(ManageableVD, CloneableImageVD, ManageableSnapshotsVD,
         raise NotImplementedError()
 
     def delete_snapshot(self, snapshot):
-        """Deletes a snapshot."""
+        """Deletes a snapshot.
+
+        If the driver uses custom file locks, they should be cleaned up on
+        success using cinder.utils.synchronized_remove.
+        """
         raise NotImplementedError()
 
     def local_path(self, volume):
diff --git a/cinder/volume/drivers/hpe/hpe_3par_base.py b/cinder/volume/drivers/hpe/hpe_3par_base.py
index b6eb1d9ec2e..67ecd4d3336 100644
--- a/cinder/volume/drivers/hpe/hpe_3par_base.py
+++ b/cinder/volume/drivers/hpe/hpe_3par_base.py
@@ -32,6 +32,7 @@ except ImportError:
 
 from oslo_log import log as logging
 
+from cinder import coordination
 from cinder import exception
 from cinder.i18n import _
 from cinder.volume import driver
@@ -668,3 +669,7 @@ class HPE3PARDriverBase(driver.ManageableVD,
                                  default="normal")
 
         return properties, 'HPE:3PAR'
+
+    @classmethod
+    def clean_volume_file_locks(cls, volume_id):
+        coordination.synchronized_remove('3par-' + volume_id)
diff --git a/cinder/volume/flows/manager/create_volume.py b/cinder/volume/flows/manager/create_volume.py
index 7775e82b716..7ad0313d131 100644
--- a/cinder/volume/flows/manager/create_volume.py
+++ b/cinder/volume/flows/manager/create_volume.py
@@ -75,7 +75,7 @@ class OnFailureRescheduleTask(flow_utils.CinderTask):
     this volume elsewhere.
""" - def __init__(self, reschedule_context, db, driver, scheduler_rpcapi, + def __init__(self, reschedule_context, db, manager, scheduler_rpcapi, do_reschedule): requires = ['filter_properties', 'request_spec', 'volume', 'context'] @@ -84,7 +84,7 @@ class OnFailureRescheduleTask(flow_utils.CinderTask): self.do_reschedule = do_reschedule self.scheduler_rpcapi = scheduler_rpcapi self.db = db - self.driver = driver + self.manager = manager self.reschedule_context = reschedule_context # These exception types will trigger the volume to be set into error # status rather than being rescheduled. @@ -175,7 +175,7 @@ class OnFailureRescheduleTask(flow_utils.CinderTask): # host field will be erased. Just in case volume was already created at # the backend, we attempt to delete it. try: - self.driver.delete_volume(volume) + self.manager.driver_delete_volume(volume) except Exception: # Most likely the volume weren't created at the backend. We can # safely ignore this. @@ -1257,7 +1257,7 @@ def get_flow(context, manager, db, driver, scheduler_rpcapi, host, volume, # status when reverting the flow. Meanwhile, no need to revert process of # ExtractVolumeRefTask. do_reschedule = allow_reschedule and request_spec and retry - volume_flow.add(OnFailureRescheduleTask(reschedule_context, db, driver, + volume_flow.add(OnFailureRescheduleTask(reschedule_context, db, manager, scheduler_rpcapi, do_reschedule)) LOG.debug("Volume reschedule parameters: %(allow)s " diff --git a/cinder/volume/manager.py b/cinder/volume/manager.py index f40149db55b..800d94c4c64 100644 --- a/cinder/volume/manager.py +++ b/cinder/volume/manager.py @@ -35,6 +35,7 @@ intact. """ +import functools import time import typing as ty @@ -74,6 +75,7 @@ from cinder.objects import cgsnapshot from cinder.objects import consistencygroup from cinder.objects import fields from cinder import quota +from cinder import utils from cinder import volume as cinder_volume from cinder.volume import configuration as config from cinder.volume.flows.manager import create_volume @@ -184,6 +186,37 @@ MAPPING = { } +def clean_volume_locks(func): + @functools.wraps(func) + def wrapper(self, context, volume, *args, **kwargs): + try: + skip_clean = func(self, context, volume, *args, **kwargs) + except Exception: + # On quota failure volume will have been deleted from the DB + skip_clean = not volume.deleted + raise + finally: + if not skip_clean: + # Most TooZ drivers clean after themselves (like etcd3), so + # we clean TooZ file locks that are the same as oslo's. 
+                utils.clean_volume_file_locks(volume.id, self.driver)
+    return wrapper
+
+
+def clean_snapshot_locks(func):
+    @functools.wraps(func)
+    def wrapper(self, context, snapshot, *args, **kwargs):
+        try:
+            skip_clean = func(self, context, snapshot, *args, **kwargs)
+        except Exception:
+            skip_clean = not snapshot.deleted
+            raise
+        finally:
+            if not skip_clean:
+                utils.clean_snapshot_file_locks(snapshot.id, self.driver)
+    return wrapper
+
+
 class VolumeManager(manager.CleanableManager,
                     manager.SchedulerDependentManager):
     """Manages attachable block storage devices."""
@@ -772,6 +805,12 @@ class VolumeManager(manager.CleanableManager,
             else:
                 with coordination.COORDINATOR.get_lock(locked_action):
                     _run_flow()
+        except exception.VolumeNotFound:
+            with excutils.save_and_reraise_exception():
+                utils.clean_volume_file_locks(source_volid, self.driver)
+        except exception.SnapshotNotFound:
+            with excutils.save_and_reraise_exception():
+                utils.clean_snapshot_file_locks(snapshot_id, self.driver)
         finally:
             try:
                 flow_engine.storage.fetch('refreshed')
@@ -818,7 +857,19 @@ class VolumeManager(manager.CleanableManager,
                        'backend': backend})
             raise exception.Invalid(msg)
 
-    @coordination.synchronized('{volume.id}-{f_name}')
+    def driver_delete_volume(self, volume):
+        self.driver.delete_volume(volume)
+        # Most TooZ drivers clean up after themselves (like etcd3), so we
+        # don't worry about those locks, only about TooZ file locks that are
+        # the same as oslo's.
+        utils.clean_volume_file_locks(volume.id, self.driver)
+
+    def driver_delete_snapshot(self, snapshot):
+        self.driver.delete_snapshot(snapshot)
+        utils.clean_snapshot_file_locks(snapshot.id, self.driver)
+
+    @clean_volume_locks
+    @coordination.synchronized('{volume.id}-delete_volume')
     @objects.Volume.set_workers
     def delete_volume(self,
                       context: context.RequestContext,
@@ -926,7 +977,7 @@ class VolumeManager(manager.CleanableManager,
                 # If this is a destination volume, we have to clear the database
                 # record to avoid user confusion.
                 self._clear_db(is_migrating_dest, volume, 'available')
-                return
+                return True  # Let caller know we skipped deletion
             except Exception:
                 with excutils.save_and_reraise_exception():
                     # If this is a destination volume, we have to clear the
                     # database record to avoid user confusion.
@@ -1014,7 +1065,7 @@ class VolumeManager(manager.CleanableManager,
                 temp_vol = self.driver._create_temp_volume_from_snapshot(
                     ctxt, volume, snapshot, volume_options=v_options)
                 self._copy_volume_data(ctxt, temp_vol, volume)
-                self.driver.delete_volume(temp_vol)
+                self.driver_delete_volume(temp_vol)
                 temp_vol.destroy()
             except Exception:
                 with excutils.save_and_reraise_exception():
@@ -1025,7 +1076,7 @@ class VolumeManager(manager.CleanableManager,
                               {'snapshot': snapshot.id,
                                'volume': volume.id})
                     if temp_vol and temp_vol.status == 'available':
-                        self.driver.delete_volume(temp_vol)
+                        self.driver_delete_volume(temp_vol)
                         temp_vol.destroy()
 
     def _revert_to_snapshot(self, context, volume, snapshot) -> None:
@@ -1220,7 +1271,8 @@ class VolumeManager(manager.CleanableManager,
                  resource=snapshot)
         return snapshot.id
 
-    @coordination.synchronized('{snapshot.id}-{f_name}')
+    @clean_snapshot_locks
+    @coordination.synchronized('{snapshot.id}-delete_snapshot')
     def delete_snapshot(self,
                         context: context.RequestContext,
                         snapshot: objects.Snapshot,
@@ -1257,7 +1309,7 @@ class VolumeManager(manager.CleanableManager,
                 resource_type=message_field.Resource.VOLUME_SNAPSHOT,
                 resource_uuid=snapshot['id'],
                 exception=busy_error)
-            return
+            return True  # Let caller know we skipped deletion
         except Exception as delete_error:
             with excutils.save_and_reraise_exception():
                 snapshot.status = fields.SnapshotStatus.ERROR_DELETING
@@ -3787,7 +3839,7 @@ class VolumeManager(manager.CleanableManager,
             volume_model_update = {'id': volume_ref.id}
             try:
                 self.driver.remove_export(context, volume_ref)
-                self.driver.delete_volume(volume_ref)
+                self.driver_delete_volume(volume_ref)
                 volume_model_update['status'] = 'deleted'
             except exception.VolumeIsBusy:
                 volume_model_update['status'] = 'available'
@@ -4102,7 +4154,7 @@ class VolumeManager(manager.CleanableManager,
         for snapshot in snapshots:
             snapshot_model_update = {'id': snapshot.id}
             try:
-                self.driver.delete_snapshot(snapshot)
+                self.driver_delete_snapshot(snapshot)
                 snapshot_model_update['status'] = (
                     fields.SnapshotStatus.DELETED)
             except exception.SnapshotIsBusy:
diff --git a/doc/source/contributor/high_availability.rst b/doc/source/contributor/high_availability.rst
index 3d59ba0dabf..a75d2c085b4 100644
--- a/doc/source/contributor/high_availability.rst
+++ b/doc/source/contributor/high_availability.rst
@@ -607,6 +607,27 @@ Example from `cinder/backup/manager.py`:
    choose a generic lock name for all your locks and try to create a unique
    name for each locking domain.
 
+Drivers that use node locks based on volumes should implement method
+``clean_volume_file_locks``, and if they use locks based on snapshots they
+should also implement ``clean_snapshot_file_locks``; both methods should use
+``synchronized_remove`` from ``cinder.utils``.
+
+Example for a driver that uses ``cinder.utils.synchronized``:
+
+.. code-block:: python
+
+    def my_operation(self, volume):
+        @utils.synchronized('my-driver-lock-' + volume.id)
+        def method():
+            pass
+
+        method()
+
+    @classmethod
+    def clean_volume_file_locks(cls, volume_id):
+        utils.synchronized_remove('my-driver-lock-' + volume_id)
+
+
 Global locks
 ~~~~~~~~~~~~
@@ -664,6 +685,19 @@ For a detailed description of the requirement for global locks in cinder
 please refer to the `replacing local locks with Tooz`_ and `manager local
 locks`_ specs.
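+
+Global locks are normally acquired with the ``cinder.coordination.synchronized``
+decorator, so the lock name template used there determines the glob that must
+be removed on cleanup. A minimal sketch (the lock name and method are
+illustrative, not from a real driver):
+
+.. code-block:: python
+
+    @coordination.synchronized('my-driver-{volume.id}')
+    def delete_volume(self, volume):
+        ...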
+Drivers that use global locks based on volumes should implement method
+``clean_volume_file_locks``, and if they use locks based on snapshots they
+should also implement ``clean_snapshot_file_locks``; both methods should use
+``synchronized_remove`` from ``cinder.coordination``.
+
+Example for the 3PAR driver:
+
+.. code-block:: python
+
+    @classmethod
+    def clean_volume_file_locks(cls, volume_id):
+        coordination.synchronized_remove('3par-' + volume_id)
+
 
 Cinder locking
 ~~~~~~~~~~~~~~
diff --git a/releasenotes/notes/clean-file-locks-on-remove-e5898012f4114d3c.yaml b/releasenotes/notes/clean-file-locks-on-remove-e5898012f4114d3c.yaml
new file mode 100644
index 00000000000..732e35713aa
--- /dev/null
+++ b/releasenotes/notes/clean-file-locks-on-remove-e5898012f4114d3c.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    `Bug #1432387 <https://bugs.launchpad.net/cinder/+bug/1432387>`_: Cinder
+    now tries to automatically clean up file locks after a resource (volume,
+    snapshot) is deleted. This alleviates the problem of the locks directory
+    filling up with an ever-increasing number of files.