From dbcbfefe517268c3a2159b3bcd2d9a865b9d0a81 Mon Sep 17 00:00:00 2001
From: Gorka Eguileor
Date: Mon, 8 Jun 2020 18:35:06 +0200
Subject: [PATCH] Add cinder-manage command to remove file locks

This patch adds a new subgroup of commands called `util` with a single
command ``clean_locks`` that takes care of deleting file locks that will no
longer be used because the resources they relate to are no longer present.

This works for volumes, snapshots, and backups, and with the Cinder services
running or stopped.

It finds the file locks by filtering on their names (they must have the
"cinder-" prefix and contain a UUID) and, if the services are online, it will
not delete file locks for resources that are still present in the database.

Closes-Bug: #1432387
Change-Id: I2535017e112c8bcb9a2e516876f52a945e9c7da8
---
 cinder/cmd/manage.py                          | 112 +++++++++++++
 cinder/tests/unit/test_cmd.py                 | 147 ++++++++++++++++++
 doc/source/cli/cinder-manage.rst              |  42 +++++
 ...lean-file-locks-tool-3a62ba05ef2d2239.yaml |  10 ++
 4 files changed, 311 insertions(+)
 create mode 100644 releasenotes/notes/clean-file-locks-tool-3a62ba05ef2d2239.yaml

diff --git a/cinder/cmd/manage.py b/cinder/cmd/manage.py
index 5f1dd810fd8..3f10f9fbd58 100644
--- a/cinder/cmd/manage.py
+++ b/cinder/cmd/manage.py
@@ -52,7 +52,12 @@
 import collections
 import collections.abc as collections_abc
+import errno
+import glob
+import itertools
 import logging as python_logging
+import os
+import re
 import sys
 import time
 
@@ -830,6 +835,112 @@ class ConsistencyGroupCommands(object):
             gr.save()
 
 
+class UtilCommands(object):
+    """Generic utils."""
+
+    @staticmethod
+    def _get_resources_locks():
+        """Get all vol/snap/backup file lock paths."""
+        backup_locks_prefix = 'cinder-cleanup_incomplete_backups_'
+        oslo_dir = os.path.abspath(cfg.CONF.oslo_concurrency.lock_path)
+        filenames = glob.glob(os.path.join(oslo_dir, 'cinder-*'))
+
+        backend_url = cfg.CONF.coordination.backend_url
+        if backend_url.startswith('file://'):
+            tooz_dir = os.path.abspath(backend_url[7:])
+            if tooz_dir != oslo_dir:
+                filenames += glob.glob(os.path.join(tooz_dir, 'cinder-*'))
+
+        volumes = collections.defaultdict(list)
+        snapshots = collections.defaultdict(list)
+        backups = collections.defaultdict(list)
+        matcher = re.compile('.*?([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-'
+                             '[0-9a-f]{4}-[0-9a-f]{12}).*?', re.IGNORECASE)
+        for filename in filenames:
+            basename = os.path.basename(filename)
+            match = matcher.match(basename)
+            if match:
+                dest = snapshots if 'snapshot' in basename else volumes
+                res_id = match.group(1)
+                dest[res_id].append(filename)
+            elif basename.startswith(backup_locks_prefix):
+                pgrp = basename[34:]
+                backups[pgrp].append(filename)
+
+        return volumes, snapshots, backups
+
+    def _exclude_running_backups(self, backups):
+        """Remove backup entries from the dict for running backup services."""
+        for backup_pgrp in list(backups.keys()):
+            # The PGRP is the same as the PID of the parent process, so we know
+            # the lock could be in use if the process is running and it's the
+            # cinder-backup command (the PID could have been reused).
+            cmdline_file = os.path.join('/proc', backup_pgrp, 'cmdline')
+            try:
+                with open(cmdline_file, 'r') as f:
+                    if 'cinder-backup' in f.read():
+                        del backups[backup_pgrp]
+            except FileNotFoundError:
+                continue
+            except Exception:
+                # Unexpected error, leaving the lock file just in case
+                del backups[backup_pgrp]
+
+    @args('--services-offline', dest='online',
+          action='store_false', default=True,
+          help='All locks can be deleted as Cinder services are not running.')
+    def clean_locks(self, online):
+        """Clean file locks for vols, snaps, and backups on the current host.
+
+        Should be run on any host where we are running a Cinder service (API,
+        Scheduler, Volume, Backup) and can be run with the Cinder services
+        running or stopped.
+
+        If the services are running, it will check existing resources in the
+        Cinder database in order to know which resources are still available
+        (it's not safe to remove their file locks) and will only remove the
+        file locks for the resources that are no longer present. Deleting
+        locks while the services are offline is faster as there's no need to
+        check the database.
+
+        For backups, the way to know if we can remove the startup lock is by
+        checking if the PGRP in the file name is currently running
+        cinder-backup.
+
+        By default it assumes services are online; ``--services-offline``
+        must be passed to specify that they are offline.
+
+        Doesn't clean DLM locks (except when using file locks), as those don't
+        leave lock leftovers.
+        """
+        self.ctxt = context.get_admin_context()
+        # Find volume and snapshot ids, and backup PGRPs, based on the
+        # existing file locks
+        volumes, snapshots, backups = self._get_resources_locks()
+
+        # If services are online we cannot delete locks for existing resources
+        if online:
+            # We don't want to delete file locks for existing resources
+            volumes = {vol_id: files for vol_id, files in volumes.items()
+                       if not objects.Volume.exists(vol_id)}
+            snapshots = {snap_id: files for snap_id, files in snapshots.items()
+                         if not objects.Snapshot.exists(snap_id)}
+            self._exclude_running_backups(backups)
+
+        # Now clean
+        for filenames in itertools.chain(volumes.values(),
+                                         snapshots.values(),
+                                         backups.values()):
+            for filename in filenames:
+                try:
+                    os.remove(filename)
+                except Exception as exc:
+                    if not (isinstance(exc, OSError) and
+                            exc.errno == errno.ENOENT):
+                        print('Failed to cleanup lock %(name)s: %(exc)s' %
+                              {'name': filename, 'exc': exc})
+
+
 CATEGORIES = {
     'backup': BackupCommands,
     'config': ConfigCommands,
@@ -841,6 +952,7 @@ CATEGORIES = {
     'service': ServiceCommands,
     'version': VersionCommands,
     'volume': VolumeCommands,
+    'util': UtilCommands,
 }
 
 
diff --git a/cinder/tests/unit/test_cmd.py b/cinder/tests/unit/test_cmd.py
index 4d7818acdff..b95c22c1f4e 100644
--- a/cinder/tests/unit/test_cmd.py
+++ b/cinder/tests/unit/test_cmd.py
@@ -10,7 +10,9 @@
 # License for the specific language governing permissions and limitations
 # under the License.
+import collections import datetime +import errno import io import re import sys @@ -1242,6 +1244,151 @@ class TestCinderManageCmd(test.TestCase): service_commands = cinder_manage.ServiceCommands() self.assertIsNone(service_commands.remove('abinary', 'ahost')) + @mock.patch('glob.glob') + def test_util__get_resources_locks(self, mock_glob): + cinder_manage.cfg.CONF.set_override('lock_path', '/locks', + group='oslo_concurrency') + cinder_manage.cfg.CONF.set_override('backend_url', 'file:///dlm', + group='coordination') + + vol1 = fake.VOLUME_ID + vol2 = fake.VOLUME2_ID + snap = fake.SNAPSHOT_ID + attach = fake.ATTACHMENT_ID + + files = [ + 'cinder-something', # Non UUID files are ignored + f'/locks/cinder-{vol1}-delete_volume', + f'/locks/cinder-{vol2}-delete_volume', + f'/locks/cinder-{vol2}', + f'/locks/cinder-{vol2}-detach_volume', + f'/locks/cinder-{snap}-delete_snapshot', + '/locks/cinder-cleanup_incomplete_backups_12345', + '/locks/cinder-unrelated-backup-named-file', + ] + dlm_locks = [ + f'/dlm/cinder-attachment_update-{vol2}-{attach}', + ] + mock_glob.side_effect = [files, dlm_locks] + + commands = cinder_manage.UtilCommands() + res = commands._get_resources_locks() + + self.assertEqual(2, mock_glob.call_count) + mock_glob.assert_has_calls([ + mock.call('/locks/cinder-*'), + mock.call('/dlm/cinder-*') + ]) + + expected_vols = { + vol1: [f'/locks/cinder-{vol1}-delete_volume'], + vol2: [f'/locks/cinder-{vol2}-delete_volume', + f'/locks/cinder-{vol2}', + f'/locks/cinder-{vol2}-detach_volume', + f'/dlm/cinder-attachment_update-{vol2}-{attach}'], + } + expected_snaps = { + snap: [f'/locks/cinder-{snap}-delete_snapshot'] + } + expected_backups = { + '12345': ['/locks/cinder-cleanup_incomplete_backups_12345'] + } + expected = (expected_vols, expected_snaps, expected_backups) + self.assertEqual(expected, res) + + @mock.patch.object(cinder_manage, 'open') + def test__exclude_running_backups(self, mock_open): + mock_running = mock.mock_open(read_data='cinder-backup --config-file ' + '/etc/cinder/cinder.conf') + file_running = mock_running.return_value.__enter__.return_value + mock_other = mock.mock_open(read_data='python') + file_other = mock_other.return_value.__enter__.return_value + + mock_open.side_effect = (FileNotFoundError, mock_running.return_value, + mock_other.return_value, + ValueError) + + backups = {'12341': '/locks/cinder-cleanup_incomplete_backups_12341', + '12342': '/locks/cinder-cleanup_incomplete_backups_12342', + '12343': '/locks/cinder-cleanup_incomplete_backups_12343', + '12344': '/locks/cinder-cleanup_incomplete_backups_12344'} + + expected = {'12341': '/locks/cinder-cleanup_incomplete_backups_12341', + '12343': '/locks/cinder-cleanup_incomplete_backups_12343'} + + commands = cinder_manage.UtilCommands() + res = commands._exclude_running_backups(backups) + + self.assertIsNone(res) + self.assertEqual(expected, backups) + + self.assertEqual(4, mock_open.call_count) + mock_open.assert_has_calls([mock.call('/proc/12341/cmdline', 'r'), + mock.call('/proc/12342/cmdline', 'r'), + mock.call('/proc/12343/cmdline', 'r'), + mock.call('/proc/12344/cmdline', 'r')]) + file_running.read.assert_called_once_with() + file_other.read.assert_called_once_with() + + @ddt.data(True, False) + @mock.patch.object(cinder_manage, 'print') + @mock.patch.object(cinder_manage.os, 'remove') + @mock.patch.object(cinder_manage.UtilCommands, '_exclude_running_backups') + @mock.patch('cinder.objects.Snapshot.exists') + @mock.patch('cinder.objects.Volume.exists') + 
@mock.patch.object(cinder_manage.UtilCommands, '_get_resources_locks')
+    @mock.patch.object(cinder_manage.context, 'get_admin_context')
+    def test_clean_locks(self, online, mock_ctxt, mock_get_locks,
+                         mock_vol_exists, mock_snap_exists, mock_exclude_backs,
+                         mock_remove, mock_print):
+        vol1_files = [f'/locks/cinder-{fake.VOLUME_ID}-delete_volume']
+        vol2_files = [f'/locks/cinder-{fake.VOLUME2_ID}-delete_volume',
+                      f'/locks/cinder-{fake.VOLUME2_ID}',
+                      f'/locks/cinder-{fake.VOLUME2_ID}-detach_volume',
+                      f'/dlm/cinder-attachment_update-{fake.VOLUME2_ID}-'
+                      f'{fake.ATTACHMENT_ID}']
+        vols = collections.OrderedDict(((fake.VOLUME_ID, vol1_files),
+                                        (fake.VOLUME2_ID, vol2_files)))
+        snap_files = [f'/locks/cinder-{fake.SNAPSHOT_ID}-delete_snapshot']
+        snaps = {fake.SNAPSHOT_ID: snap_files}
+        back_files = ['/locks/cinder-cleanup_incomplete_backups_12345']
+        backs = {'12345': back_files}
+        mock_get_locks.return_value = (vols, snaps, backs)
+        mock_vol_exists.side_effect = (True, False)
+        mock_snap_exists.return_value = False
+        mock_remove.side_effect = [None, OSError(errno.ENOENT, ''), None,
+                                   None, OSError(errno.ENOENT, ''),
+                                   ValueError, None]
+
+        commands = cinder_manage.UtilCommands()
+        commands.clean_locks(online=online)
+
+        mock_ctxt.assert_called_once_with()
+        mock_get_locks.assert_called_once_with()
+        expected_calls = ([mock.call(v) for v in vol1_files] +
+                          [mock.call(v) for v in vol2_files] +
+                          [mock.call(s) for s in snap_files] +
+                          [mock.call(b) for b in back_files])
+        if online:
+            self.assertEqual(2, mock_vol_exists.call_count)
+            mock_vol_exists.assert_has_calls((mock.call(fake.VOLUME_ID),
+                                              mock.call(fake.VOLUME2_ID)))
+            mock_snap_exists.assert_called_once_with(fake.SNAPSHOT_ID)
+            mock_exclude_backs.assert_called_once_with(backs)
+            # If services are online we check which resources still exist
+            # and don't delete the locks of those that do, in this case the
+            # files of the first volume.
+            del expected_calls[0]
+        else:
+            mock_vol_exists.assert_not_called()
+            mock_snap_exists.assert_not_called()
+            mock_exclude_backs.assert_not_called()
+
+        self.assertEqual(len(expected_calls), mock_remove.call_count)
+        mock_remove.assert_has_calls(expected_calls)
+
+        # Only the ValueError exception should be logged
+        self.assertEqual(1, mock_print.call_count)
+
 
 @test.testtools.skipIf(sys.platform == 'darwin', 'Not supported on macOS')
 class TestCinderRtstoolCmd(test.TestCase):
diff --git a/doc/source/cli/cinder-manage.rst b/doc/source/cli/cinder-manage.rst
index 0ff00bd3dcb..8aa57df42ee 100644
--- a/doc/source/cli/cinder-manage.rst
+++ b/doc/source/cli/cinder-manage.rst
@@ -243,6 +243,48 @@
 Displays the current configuration parameters (options) for Cinder. The
 optional flag parameter may be used to display the configuration of one
 parameter.
 
+Cinder Util
+~~~~~~~~~~~
+
+``cinder-manage util clean_locks [-h] [--services-offline]``
+
+Clean the file locks present on the current host that were created and are
+used by the drivers and Cinder services for volumes, snapshots, and the
+backup service.
+
+Should be run on any host where we are running a Cinder service (API,
+Scheduler, Volume, Backup) and can be run with the Cinder services running or
+stopped.
+
+If the services are running it will check the existing resources in the
+Cinder database and will only remove the file locks of resources that are no
+longer present, so deleting those files is safe.
+
+For backups, the way to know whether the startup lock can be removed is to
+check whether the PGRP in the file name belongs to a running cinder-backup.
+
+Deleting locks while the services are offline is faster as there's no need to
+check the database or the running processes.
+
+By default it assumes that services are online; ``--services-offline`` must
+be passed to indicate that they are offline.
+
+The common use case for running the command with ``--services-offline`` is
+calling it on host startup, as a service unit that runs before any Cinder
+service is started. Without the ``--services-offline`` parameter the command
+is usually run manually or from a cron job.
+
+.. warning::
+
+   Passing ``--services-offline`` when the Cinder services are still running
+   breaks the locking mechanism and can lead to undesired behavior in ongoing
+   Cinder operations.
+
+.. note::
+
+   This command doesn't clean DLM locks (except when using file locks), as
+   those don't leave lock leftovers.
+
 FILES
 =====
diff --git a/releasenotes/notes/clean-file-locks-tool-3a62ba05ef2d2239.yaml b/releasenotes/notes/clean-file-locks-tool-3a62ba05ef2d2239.yaml
new file mode 100644
index 00000000000..bba0b9da063
--- /dev/null
+++ b/releasenotes/notes/clean-file-locks-tool-3a62ba05ef2d2239.yaml
@@ -0,0 +1,10 @@
+---
+features:
+  - |
+    `Bug #1432387 `_: Add a
+    command to cinder-manage to clean up the file locks existing on hosts
+    where a Cinder service is running (API, Scheduler, Volume, Backup).
+    The command works with the Cinder services running, which is useful to
+    call it from a cron job, as well as with them stopped, to call it on
+    host startup. It is invoked as ``cinder-manage util clean_locks``, with
+    the optional parameter ``--services-offline``.
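
For reviewers unfamiliar with the lock naming scheme, the following standalone
sketch (not part of the patch above) illustrates the two checks the new
command relies on: matching "cinder-" lock file names that contain a UUID, and
reading /proc/<PGRP>/cmdline to decide whether a backup startup lock may still
be in use. The helper names and the lock paths in the __main__ block are
hypothetical examples; the actual implementation is the UtilCommands class
added to cinder/cmd/manage.py above.

    # Minimal, self-contained sketch of the lock classification and the
    # backup PGRP liveness check described in the patch.
    import os
    import re

    # Same UUID pattern and backup lock prefix used by cinder-manage above.
    UUID_RE = re.compile('.*?([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-'
                         '[0-9a-f]{4}-[0-9a-f]{12}).*?', re.IGNORECASE)
    BACKUP_PREFIX = 'cinder-cleanup_incomplete_backups_'


    def classify_lock(path):
        """Return ('volume'|'snapshot', id), ('backup', pgrp), or None."""
        basename = os.path.basename(path)
        match = UUID_RE.match(basename)
        if match:
            kind = 'snapshot' if 'snapshot' in basename else 'volume'
            return kind, match.group(1)
        if basename.startswith(BACKUP_PREFIX):
            return 'backup', basename[len(BACKUP_PREFIX):]
        return None


    def backup_lock_in_use(pgrp):
        """Check /proc/<pgrp>/cmdline to see if cinder-backup still runs."""
        try:
            with open(os.path.join('/proc', pgrp, 'cmdline'), 'r') as cmdline:
                return 'cinder-backup' in cmdline.read()
        except FileNotFoundError:
            return False


    if __name__ == '__main__':
        # Hypothetical lock paths, mirroring the ones used in the unit tests.
        print(classify_lock(
            '/locks/cinder-c954c1a8-2e88-4e27-a93d-5918a47eb6d9-delete_volume'))
        print(classify_lock('/locks/cinder-cleanup_incomplete_backups_12345'))
        print(backup_lock_in_use('12345'))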