diff --git a/cinder/cmd/manage.py b/cinder/cmd/manage.py
index 5f1dd810fd8..3f10f9fbd58 100644
--- a/cinder/cmd/manage.py
+++ b/cinder/cmd/manage.py
@@ -52,7 +52,12 @@
 import collections
 import collections.abc as collections_abc
+import errno
+import glob
+import itertools
 import logging as python_logging
+import os
+import re
 import sys
 import time
 
 from oslo_config import cfg
@@ -830,6 +835,112 @@ class ConsistencyGroupCommands(object):
             gr.save()
 
 
+class UtilCommands(object):
+    """Generic utils."""
+
+    @staticmethod
+    def _get_resources_locks():
+        """Get all vol/snap/backup file lock paths."""
+        backup_locks_prefix = 'cinder-cleanup_incomplete_backups_'
+        oslo_dir = os.path.abspath(cfg.CONF.oslo_concurrency.lock_path)
+        filenames = glob.glob(os.path.join(oslo_dir, 'cinder-*'))
+
+        backend_url = cfg.CONF.coordination.backend_url
+        if backend_url.startswith('file://'):
+            tooz_dir = os.path.abspath(backend_url[7:])
+            if tooz_dir != oslo_dir:
+                filenames += glob.glob(os.path.join(tooz_dir, 'cinder-*'))
+
+        volumes = collections.defaultdict(list)
+        snapshots = collections.defaultdict(list)
+        backups = collections.defaultdict(list)
+        matcher = re.compile('.*?([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-'
+                             '[0-9a-f]{4}-[0-9a-f]{12}).*?', re.IGNORECASE)
+        for filename in filenames:
+            basename = os.path.basename(filename)
+            match = matcher.match(basename)
+            if match:
+                dest = snapshots if 'snapshot' in basename else volumes
+                res_id = match.group(1)
+                dest[res_id].append(filename)
+            elif basename.startswith(backup_locks_prefix):
+                pgrp = basename[len(backup_locks_prefix):]
+                backups[pgrp].append(filename)
+
+        return volumes, snapshots, backups
+
+    def _exclude_running_backups(self, backups):
+        """Remove backup entries from the dict for running backup services."""
+        for backup_pgrp in list(backups.keys()):
+            # The PGRP is the same as the PID of the parent process, so we know
+            # the lock could be in use if the process is running and it's the
+            # cinder-backup command (the PID could have been reused).
+            cmdline_file = os.path.join('/proc', backup_pgrp, 'cmdline')
+            try:
+                with open(cmdline_file, 'r') as f:
+                    if 'cinder-backup' in f.read():
+                        del backups[backup_pgrp]
+            except FileNotFoundError:
+                continue
+            except Exception:
+                # Unexpected error, leaving the lock file just in case
+                del backups[backup_pgrp]
+
+    @args('--services-offline', dest='online',
+          action='store_false', default=True,
+          help='All locks can be deleted as Cinder services are not running.')
+    def clean_locks(self, online):
+        """Clean file locks for vols, snaps, and backups on the current host.
+
+        Should be run on any host where we are running a Cinder service (API,
+        Scheduler, Volume, Backup) and can be run with the Cinder services
+        running or stopped.
+
+        If the services are running it will check existing resources in the
+        Cinder database in order to know which resources are still available
+        (it's not safe to remove their file locks) and will only remove the
+        file locks for the resources that are no longer present. Deleting
+        locks while the services are offline is faster as there's no need to
+        check the database.
+
+        For backups, the way to know if we can remove the startup lock is by
+        checking if the PGRP in the file name is currently running
+        cinder-backup.
+
+        By default it assumes that services are online; pass
+        ``--services-offline`` to specify that they are offline.
+
+        Doesn't clean DLM locks (except when using file locks), as those don't
+        leave lock leftovers.
+        """
+        self.ctxt = context.get_admin_context()
+        # Find volume and snapshot ids, and backup PGRPs, based on the
+        # existing file locks
+        volumes, snapshots, backups = self._get_resources_locks()
+
+        # If services are online we cannot delete locks for existing resources
+        if online:
+            # We don't want to delete file locks for existing resources
+            volumes = {vol_id: files for vol_id, files in volumes.items()
+                       if not objects.Volume.exists(vol_id)}
+            snapshots = {snap_id: files for snap_id, files in snapshots.items()
+                         if not objects.Snapshot.exists(snap_id)}
+            self._exclude_running_backups(backups)
+
+        # Now clean
+        for filenames in itertools.chain(volumes.values(),
+                                         snapshots.values(),
+                                         backups.values()):
+            for filename in filenames:
+                try:
+                    os.remove(filename)
+                except Exception as exc:
+                    if not (isinstance(exc, OSError) and
+                            exc.errno == errno.ENOENT):
+                        print('Failed to cleanup lock %(name)s: %(exc)s'
+                              % {'name': filename, 'exc': exc})
+
+
 CATEGORIES = {
     'backup': BackupCommands,
     'config': ConfigCommands,
@@ -841,6 +952,7 @@ CATEGORIES = {
     'service': ServiceCommands,
     'version': VersionCommands,
     'volume': VolumeCommands,
+    'util': UtilCommands,
 }
 
 
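For illustration, the name classification done by _get_resources_locks() can
be exercised on its own. The sketch below is not part of the patch: it reuses
the patch's UUID regex and backup-lock prefix on a few hypothetical lock
paths.

# Standalone sketch of the lock-name classification in
# _get_resources_locks(); all file names below are made up.
import collections
import os
import re

BACKUP_PREFIX = 'cinder-cleanup_incomplete_backups_'
UUID_RE = re.compile('.*?([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-'
                     '[0-9a-f]{4}-[0-9a-f]{12}).*?', re.IGNORECASE)

def classify(filenames):
    volumes = collections.defaultdict(list)
    snapshots = collections.defaultdict(list)
    backups = collections.defaultdict(list)
    for filename in filenames:
        basename = os.path.basename(filename)
        match = UUID_RE.match(basename)
        if match:
            # Snapshot locks embed 'snapshot' in their name; any other
            # name carrying a UUID is treated as a volume lock.
            dest = snapshots if 'snapshot' in basename else volumes
            dest[match.group(1)].append(filename)
        elif basename.startswith(BACKUP_PREFIX):
            # The suffix is the PGRP of the backup service that created it.
            backups[basename[len(BACKUP_PREFIX):]].append(filename)
    return volumes, snapshots, backups

vols, snaps, backs = classify([
    '/locks/cinder-5ee5f093-d90f-4e80-8bbf-8979bd121f53-delete_volume',
    '/locks/cinder-fd2a66f2-d267-4c31-9a29-61f755cec05a-delete_snapshot',
    '/locks/cinder-cleanup_incomplete_backups_12345',
])
# vols/snaps/backs now map resource id (or PGRP) -> list of lock paths.
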
+ """ + self.ctxt = context.get_admin_context() + # Find volume and snapshots ids, and backups PGRP based on the existing + # file locks + volumes, snapshots, backups = self._get_resources_locks() + + # If services are online we cannot delete locks for existing resources + if online: + # We don't want to delete file locks for existing resources + volumes = {vol_id: files for vol_id, files in volumes.items() + if not objects.Volume.exists(vol_id)} + snapshots = {snap_id: files for snap_id, files in snapshots.items() + if not objects.Snapshot.exists(snap_id)} + self._exclude_running_backups(backups) + + # Now clean + for filenames in itertools.chain(volumes.values(), + snapshots.values(), + backups.values()): + for filename in filenames: + try: + os.remove(filename) + except Exception as exc: + if not (isinstance(exc, OSError) and + exc.errno == errno.ENOENT): + print('Failed to cleanup lock %(name)s: %(exc)s', + {'name': filename, 'exc': exc}) + + CATEGORIES = { 'backup': BackupCommands, 'config': ConfigCommands, @@ -841,6 +952,7 @@ CATEGORIES = { 'service': ServiceCommands, 'version': VersionCommands, 'volume': VolumeCommands, + 'util': UtilCommands, } diff --git a/cinder/tests/unit/test_cmd.py b/cinder/tests/unit/test_cmd.py index 4d7818acdff..b95c22c1f4e 100644 --- a/cinder/tests/unit/test_cmd.py +++ b/cinder/tests/unit/test_cmd.py @@ -10,7 +10,9 @@ # License for the specific language governing permissions and limitations # under the License. +import collections import datetime +import errno import io import re import sys @@ -1242,6 +1244,151 @@ class TestCinderManageCmd(test.TestCase): service_commands = cinder_manage.ServiceCommands() self.assertIsNone(service_commands.remove('abinary', 'ahost')) + @mock.patch('glob.glob') + def test_util__get_resources_locks(self, mock_glob): + cinder_manage.cfg.CONF.set_override('lock_path', '/locks', + group='oslo_concurrency') + cinder_manage.cfg.CONF.set_override('backend_url', 'file:///dlm', + group='coordination') + + vol1 = fake.VOLUME_ID + vol2 = fake.VOLUME2_ID + snap = fake.SNAPSHOT_ID + attach = fake.ATTACHMENT_ID + + files = [ + 'cinder-something', # Non UUID files are ignored + f'/locks/cinder-{vol1}-delete_volume', + f'/locks/cinder-{vol2}-delete_volume', + f'/locks/cinder-{vol2}', + f'/locks/cinder-{vol2}-detach_volume', + f'/locks/cinder-{snap}-delete_snapshot', + '/locks/cinder-cleanup_incomplete_backups_12345', + '/locks/cinder-unrelated-backup-named-file', + ] + dlm_locks = [ + f'/dlm/cinder-attachment_update-{vol2}-{attach}', + ] + mock_glob.side_effect = [files, dlm_locks] + + commands = cinder_manage.UtilCommands() + res = commands._get_resources_locks() + + self.assertEqual(2, mock_glob.call_count) + mock_glob.assert_has_calls([ + mock.call('/locks/cinder-*'), + mock.call('/dlm/cinder-*') + ]) + + expected_vols = { + vol1: [f'/locks/cinder-{vol1}-delete_volume'], + vol2: [f'/locks/cinder-{vol2}-delete_volume', + f'/locks/cinder-{vol2}', + f'/locks/cinder-{vol2}-detach_volume', + f'/dlm/cinder-attachment_update-{vol2}-{attach}'], + } + expected_snaps = { + snap: [f'/locks/cinder-{snap}-delete_snapshot'] + } + expected_backups = { + '12345': ['/locks/cinder-cleanup_incomplete_backups_12345'] + } + expected = (expected_vols, expected_snaps, expected_backups) + self.assertEqual(expected, res) + + @mock.patch.object(cinder_manage, 'open') + def test__exclude_running_backups(self, mock_open): + mock_running = mock.mock_open(read_data='cinder-backup --config-file ' + '/etc/cinder/cinder.conf') + file_running = 
+        mock_other = mock.mock_open(read_data='python')
+        file_other = mock_other.return_value.__enter__.return_value
+
+        mock_open.side_effect = (FileNotFoundError, mock_running.return_value,
+                                 mock_other.return_value, ValueError)
+
+        backups = {'12341': '/locks/cinder-cleanup_incomplete_backups_12341',
+                   '12342': '/locks/cinder-cleanup_incomplete_backups_12342',
+                   '12343': '/locks/cinder-cleanup_incomplete_backups_12343',
+                   '12344': '/locks/cinder-cleanup_incomplete_backups_12344'}
+
+        expected = {'12341': '/locks/cinder-cleanup_incomplete_backups_12341',
+                    '12343': '/locks/cinder-cleanup_incomplete_backups_12343'}
+
+        commands = cinder_manage.UtilCommands()
+        res = commands._exclude_running_backups(backups)
+
+        self.assertIsNone(res)
+        self.assertEqual(expected, backups)
+
+        self.assertEqual(4, mock_open.call_count)
+        mock_open.assert_has_calls([mock.call('/proc/12341/cmdline', 'r'),
+                                    mock.call('/proc/12342/cmdline', 'r'),
+                                    mock.call('/proc/12343/cmdline', 'r'),
+                                    mock.call('/proc/12344/cmdline', 'r')])
+        file_running.read.assert_called_once_with()
+        file_other.read.assert_called_once_with()
+
+    @ddt.data(True, False)
+    @mock.patch.object(cinder_manage, 'print')
+    @mock.patch.object(cinder_manage.os, 'remove')
+    @mock.patch.object(cinder_manage.UtilCommands, '_exclude_running_backups')
+    @mock.patch('cinder.objects.Snapshot.exists')
+    @mock.patch('cinder.objects.Volume.exists')
+    @mock.patch.object(cinder_manage.UtilCommands, '_get_resources_locks')
+    @mock.patch.object(cinder_manage.context, 'get_admin_context')
+    def test_clean_locks(self, online, mock_ctxt, mock_get_locks,
+                         mock_vol_exists, mock_snap_exists, mock_exclude_backs,
+                         mock_remove, mock_print):
+        vol1_files = [f'/locks/cinder-{fake.VOLUME_ID}-delete_volume']
+        vol2_files = [f'/locks/cinder-{fake.VOLUME2_ID}-delete_volume',
+                      f'/locks/cinder-{fake.VOLUME2_ID}',
+                      f'/locks/cinder-{fake.VOLUME2_ID}-detach_volume',
+                      f'/dlm/cinder-attachment_update-{fake.VOLUME2_ID}-'
+                      f'{fake.ATTACHMENT_ID}']
+        vols = collections.OrderedDict(((fake.VOLUME_ID, vol1_files),
+                                        (fake.VOLUME2_ID, vol2_files)))
+        snap_files = [f'/locks/cinder-{fake.SNAPSHOT_ID}-delete_snapshot']
+        snaps = {fake.SNAPSHOT_ID: snap_files}
+        back_files = ['/locks/cinder-cleanup_incomplete_backups_12345']
+        backs = {'12345': back_files}
+        mock_get_locks.return_value = (vols, snaps, backs)
+        mock_vol_exists.side_effect = (True, False)
+        mock_snap_exists.return_value = False
+        mock_remove.side_effect = [None, OSError(errno.ENOENT, ''), None,
+                                   None, OSError(errno.ENOENT, ''),
+                                   ValueError, None]
+
+        commands = cinder_manage.UtilCommands()
+        commands.clean_locks(online=online)
+
+        mock_ctxt.assert_called_once_with()
+        mock_get_locks.assert_called_once_with()
+        expected_calls = ([mock.call(v) for v in vol1_files] +
+                          [mock.call(v) for v in vol2_files] +
+                          [mock.call(s) for s in snap_files] +
+                          [mock.call(b) for b in back_files])
+        if online:
+            self.assertEqual(2, mock_vol_exists.call_count)
+            mock_vol_exists.assert_has_calls((mock.call(fake.VOLUME_ID),
+                                              mock.call(fake.VOLUME2_ID)))
+            mock_snap_exists.assert_called_once_with(fake.SNAPSHOT_ID)
+            mock_exclude_backs.assert_called_once_with(backs)
+            # If services are online we'll check which resources still
+            # exist and won't delete the locks of those that do; in this
+            # case, the files for the first volume.
+            del expected_calls[0]
+        else:
+            mock_vol_exists.assert_not_called()
+            mock_snap_exists.assert_not_called()
+            mock_exclude_backs.assert_not_called()
+
+        self.assertEqual(len(expected_calls), mock_remove.call_count)
+        mock_remove.assert_has_calls(expected_calls)
+
+        # Only the ValueError exception should be logged
+        self.assertEqual(1, mock_print.call_count)
+
 
 @test.testtools.skipIf(sys.platform == 'darwin', 'Not supported on macOS')
 class TestCinderRtstoolCmd(test.TestCase):
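The tests above replace the /proc reads with mocks; on a real Linux host the
liveness check behind _exclude_running_backups() amounts to the sketch below,
which is not part of the patch (the PGRP value is made up).

# Minimal sketch of the /proc-based check: a backup startup lock may
# still be in use only if a process with that PGRP/PID is currently
# running the cinder-backup command.
import os

def backup_lock_may_be_in_use(pgrp: str) -> bool:
    cmdline_file = os.path.join('/proc', pgrp, 'cmdline')
    try:
        # /proc/<pid>/cmdline separates arguments with NUL bytes, so a
        # plain substring test is enough to spot cinder-backup.
        with open(cmdline_file, 'r') as f:
            return 'cinder-backup' in f.read()
    except FileNotFoundError:
        return False  # No such process, so the lock can be removed.
    # Note: the patch additionally treats any other read error
    # conservatively and keeps the lock file.

print(backup_lock_may_be_in_use('12345'))  # hypothetical PGRP
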
diff --git a/doc/source/cli/cinder-manage.rst b/doc/source/cli/cinder-manage.rst
index 0ff00bd3dcb..8aa57df42ee 100644
--- a/doc/source/cli/cinder-manage.rst
+++ b/doc/source/cli/cinder-manage.rst
@@ -243,6 +243,47 @@ Displays the current configuration parameters (options) for Cinder. The
 optional flag parameter may be used to display the configuration of one
 parameter.
 
+Cinder Util
+~~~~~~~~~~~
+
+``cinder-manage util clean_locks [-h] [--services-offline]``
+
+Clean file locks created by drivers and Cinder services for volumes,
+snapshots, and backups on the current host.
+
+Should be run on any host where we are running a Cinder service (API,
+Scheduler, Volume, Backup) and can be run with the Cinder services running or
+stopped.
+
+If the services are running, the command will check the existing resources in
+the Cinder database and only remove the file locks of resources that are no
+longer present (those files are safe to delete).
+
+For backups, the way to know if we can remove the startup lock is by checking
+if the PGRP in the file name is currently running cinder-backup.
+
+Deleting locks while the services are offline is faster as there's no need to
+check the database or the running processes.
+
+By default the command assumes that services are online; pass
+``--services-offline`` to specify that they are offline.
+
+The common use case for ``--services-offline`` is to run the command on host
+startup, as a service unit, before any Cinder service is started. Without
+``--services-offline`` the command is usually run manually or from a cron
+job.
+
+.. warning::
+
+   Passing ``--services-offline`` when the Cinder services are still running
+   breaks the locking mechanism and can lead to undesired behavior in ongoing
+   Cinder operations.
+
+.. note::
+
+   This command doesn't clean DLM locks (except when using file locks), as
+   those don't leave lock leftovers.
+
 FILES
 =====
 
diff --git a/releasenotes/notes/clean-file-locks-tool-3a62ba05ef2d2239.yaml b/releasenotes/notes/clean-file-locks-tool-3a62ba05ef2d2239.yaml
new file mode 100644
index 00000000000..bba0b9da063
--- /dev/null
+++ b/releasenotes/notes/clean-file-locks-tool-3a62ba05ef2d2239.yaml
@@ -0,0 +1,10 @@
+---
+features:
+  - |
+    `Bug #1432387 <https://bugs.launchpad.net/cinder/+bug/1432387>`_: Add a
+    command to cinder-manage to clean up file locks on hosts where a Cinder
+    service is running (API, Scheduler, Volume, Backup). The command works
+    with the Cinder services running, which is useful when called as a cron
+    job, as well as stopped, to be called on host startup. Command
+    invocation: ``cinder-manage util clean_locks``, with the optional
+    parameter ``--services-offline``.
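
Because the command may run while services are online, another process can
legitimately delete a lock between the glob and the os.remove() call, so the
patch's removal loop treats ENOENT as success. That idiom, shown as a
standalone sketch outside the patch (the path is hypothetical):

# ENOENT-tolerant removal, mirroring the cleanup loop in clean_locks():
# a lock that is already gone is not an error; anything else is reported.
import errno
import os

def remove_lock(path):
    try:
        os.remove(path)
    except Exception as exc:
        if not (isinstance(exc, OSError) and exc.errno == errno.ENOENT):
            print('Failed to cleanup lock %(name)s: %(exc)s'
                  % {'name': path, 'exc': exc})

remove_lock('/tmp/cinder-hypothetical-lock')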