Add cinder-manage command to remove file locks

This patch adds a new subgroup of commands called ``util`` with a single
command ``clean_locks`` that deletes file locks that are no longer needed
because the resources they protect no longer exist.

This works for volumes, snapshots, and backups, and with the Cinder
services running or stopped.

It finds the file locks by name (they must have the "cinder-" prefix and
contain a UUID), and when the services are online it will not delete file
locks for resources that are still present in the database.
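
For illustration, a minimal standalone sketch of that name-based filtering
(the sample filenames are made up; the real matching happens in
_get_resources_locks() below):

    import re

    # Same UUID pattern the patch uses to extract resource ids from lock
    # file names.
    matcher = re.compile('.*?([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-'
                         '[0-9a-f]{4}-[0-9a-f]{12}).*?', re.IGNORECASE)

    # Hypothetical lock file basenames
    names = ('cinder-33c6972e-7a0e-4dc5-a6f2-91a3e68d0466-delete_volume',
             'cinder-cleanup_incomplete_backups_12345',
             'cinder-something')
    for name in names:
        match = matcher.match(name)
        print(name, '->', match.group(1) if match else 'no UUID: skipped')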

Closes-Bug: #1432387
Change-Id: I2535017e112c8bcb9a2e516876f52a945e9c7da8
Gorka Eguileor 2020-06-08 18:35:06 +02:00 committed by Eric Harney
parent d2f6ec5569
commit dbcbfefe51
4 changed files with 311 additions and 0 deletions


@@ -52,7 +52,12 @@
import collections
import collections.abc as collections_abc
import errno
import glob
import itertools
import logging as python_logging
import os
import re
import sys
import time
@@ -830,6 +835,112 @@ class ConsistencyGroupCommands(object):
            gr.save()

class UtilCommands(object):
    """Generic utils."""

    @staticmethod
    def _get_resources_locks():
        """Get all vol/snap/backup file lock paths."""
        backup_locks_prefix = 'cinder-cleanup_incomplete_backups_'
        oslo_dir = os.path.abspath(cfg.CONF.oslo_concurrency.lock_path)
        filenames = glob.glob(os.path.join(oslo_dir, 'cinder-*'))
        backend_url = cfg.CONF.coordination.backend_url
        if backend_url.startswith('file://'):
            tooz_dir = os.path.abspath(backend_url[7:])
            if tooz_dir != oslo_dir:
                filenames += glob.glob(os.path.join(tooz_dir, 'cinder-*'))

        volumes = collections.defaultdict(list)
        snapshots = collections.defaultdict(list)
        backups = collections.defaultdict(list)
        matcher = re.compile('.*?([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-'
                             '[0-9a-f]{4}-[0-9a-f]{12}).*?', re.IGNORECASE)
        for filename in filenames:
            basename = os.path.basename(filename)
            match = matcher.match(basename)
            if match:
                dest = snapshots if 'snapshot' in basename else volumes
                res_id = match.group(1)
                dest[res_id].append(filename)
            elif basename.startswith(backup_locks_prefix):
                # What follows the 34-character prefix is the PGRP of the
                # cinder-backup process that created the lock.
                pgrp = basename[34:]
                backups[pgrp].append(filename)
        return volumes, snapshots, backups

    def _exclude_running_backups(self, backups):
        """Remove backup entries from the dict for running backup services."""
        for backup_pgrp in list(backups.keys()):
            # The PGRP is the same as the PID of the parent process, so we
            # know the lock could be in use if the process is running and
            # it's the cinder-backup command (the PID could have been
            # reused).
            cmdline_file = os.path.join('/proc', backup_pgrp, 'cmdline')
            try:
                with open(cmdline_file, 'r') as f:
                    if 'cinder-backup' in f.read():
                        del backups[backup_pgrp]
            except FileNotFoundError:
                # No such process: the lock file can be deleted
                continue
            except Exception:
                # Unexpected error, leave the lock file just in case
                del backups[backup_pgrp]

    @args('--services-offline', dest='online',
          action='store_false', default=True,
          help='All locks can be deleted as Cinder services are not '
               'running.')
    def clean_locks(self, online):
        """Clean file locks for vols, snaps, and backups on the current host.

        Should be run on any host where we are running a Cinder service (API,
        Scheduler, Volume, Backup) and can be run with the Cinder services
        running or stopped.

        If the services are running it will check existing resources in the
        Cinder database in order to know which resources are still available
        (it's not safe to remove their file locks) and will only remove the
        file locks for the resources that are no longer present.  Deleting
        locks while the services are offline is faster as there's no need to
        check the database.

        For backups, the way to know if we can remove the startup lock is by
        checking if the PGRP in the file name is currently running
        cinder-backup.

        Default assumes that services are online, must pass
        ``--services-offline`` to specify that they are offline.

        Doesn't clean DLM locks (except when using file locks), as those
        don't leave lock leftovers.
        """
        self.ctxt = context.get_admin_context()
        # Find volume and snapshot ids, and backup PGRPs, based on the
        # existing file locks
        volumes, snapshots, backups = self._get_resources_locks()

        # If services are online we cannot delete locks for existing
        # resources
        if online:
            # We don't want to delete file locks for existing resources
            volumes = {vol_id: files for vol_id, files in volumes.items()
                       if not objects.Volume.exists(vol_id)}
            snapshots = {snap_id: files
                         for snap_id, files in snapshots.items()
                         if not objects.Snapshot.exists(snap_id)}
            self._exclude_running_backups(backups)

        # Now clean
        for filenames in itertools.chain(volumes.values(),
                                         snapshots.values(),
                                         backups.values()):
            for filename in filenames:
                try:
                    os.remove(filename)
                except Exception as exc:
                    # A lock that vanished on its own is fine; report
                    # anything else.
                    if not (isinstance(exc, OSError) and
                            exc.errno == errno.ENOENT):
                        print('Failed to cleanup lock %(name)s: %(exc)s' %
                              {'name': filename, 'exc': exc})
CATEGORIES = {
    'backup': BackupCommands,
    'config': ConfigCommands,
@@ -841,6 +952,7 @@ CATEGORIES = {
    'service': ServiceCommands,
    'version': VersionCommands,
    'volume': VolumeCommands,
    'util': UtilCommands,
}
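
The ``CATEGORIES`` mapping above is what exposes the new class as the
``util`` subcommand group. A minimal sketch of the equivalent direct call
(hypothetical standalone use; in practice this runs as ``cinder-manage util
clean_locks``, and it assumes Cinder's configuration is already loaded):

    from cinder import objects
    from cinder.cmd import manage

    objects.register_all()  # load Cinder's versioned objects

    commands = manage.UtilCommands()
    # online=True (the default) keeps locks whose resources still exist in
    # the DB; pass online=False only when all Cinder services are stopped.
    commands.clean_locks(online=True)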


@@ -10,7 +10,9 @@
# License for the specific language governing permissions and limitations
# under the License.
import collections
import datetime
import errno
import io
import re
import sys
@@ -1242,6 +1244,151 @@ class TestCinderManageCmd(test.TestCase):
        service_commands = cinder_manage.ServiceCommands()
        self.assertIsNone(service_commands.remove('abinary', 'ahost'))

    @mock.patch('glob.glob')
    def test_util__get_resources_locks(self, mock_glob):
        cinder_manage.cfg.CONF.set_override('lock_path', '/locks',
                                            group='oslo_concurrency')
        cinder_manage.cfg.CONF.set_override('backend_url', 'file:///dlm',
                                            group='coordination')
        vol1 = fake.VOLUME_ID
        vol2 = fake.VOLUME2_ID
        snap = fake.SNAPSHOT_ID
        attach = fake.ATTACHMENT_ID
        files = [
            'cinder-something',  # Non UUID files are ignored
            f'/locks/cinder-{vol1}-delete_volume',
            f'/locks/cinder-{vol2}-delete_volume',
            f'/locks/cinder-{vol2}',
            f'/locks/cinder-{vol2}-detach_volume',
            f'/locks/cinder-{snap}-delete_snapshot',
            '/locks/cinder-cleanup_incomplete_backups_12345',
            '/locks/cinder-unrelated-backup-named-file',
        ]
        dlm_locks = [
            f'/dlm/cinder-attachment_update-{vol2}-{attach}',
        ]
        mock_glob.side_effect = [files, dlm_locks]

        commands = cinder_manage.UtilCommands()
        res = commands._get_resources_locks()

        self.assertEqual(2, mock_glob.call_count)
        mock_glob.assert_has_calls([
            mock.call('/locks/cinder-*'),
            mock.call('/dlm/cinder-*')
        ])
        expected_vols = {
            vol1: [f'/locks/cinder-{vol1}-delete_volume'],
            vol2: [f'/locks/cinder-{vol2}-delete_volume',
                   f'/locks/cinder-{vol2}',
                   f'/locks/cinder-{vol2}-detach_volume',
                   f'/dlm/cinder-attachment_update-{vol2}-{attach}'],
        }
        expected_snaps = {
            snap: [f'/locks/cinder-{snap}-delete_snapshot']
        }
        expected_backups = {
            '12345': ['/locks/cinder-cleanup_incomplete_backups_12345']
        }
        expected = (expected_vols, expected_snaps, expected_backups)
        self.assertEqual(expected, res)

    @mock.patch.object(cinder_manage, 'open')
    def test__exclude_running_backups(self, mock_open):
        mock_running = mock.mock_open(read_data='cinder-backup --config-file '
                                                '/etc/cinder/cinder.conf')
        file_running = mock_running.return_value.__enter__.return_value
        mock_other = mock.mock_open(read_data='python')
        file_other = mock_other.return_value.__enter__.return_value
        mock_open.side_effect = (FileNotFoundError, mock_running.return_value,
                                 mock_other.return_value,
                                 ValueError)
        backups = {'12341': '/locks/cinder-cleanup_incomplete_backups_12341',
                   '12342': '/locks/cinder-cleanup_incomplete_backups_12342',
                   '12343': '/locks/cinder-cleanup_incomplete_backups_12343',
                   '12344': '/locks/cinder-cleanup_incomplete_backups_12344'}
        expected = {'12341': '/locks/cinder-cleanup_incomplete_backups_12341',
                    '12343': '/locks/cinder-cleanup_incomplete_backups_12343'}

        commands = cinder_manage.UtilCommands()
        res = commands._exclude_running_backups(backups)

        self.assertIsNone(res)
        self.assertEqual(expected, backups)
        self.assertEqual(4, mock_open.call_count)
        mock_open.assert_has_calls([mock.call('/proc/12341/cmdline', 'r'),
                                    mock.call('/proc/12342/cmdline', 'r'),
                                    mock.call('/proc/12343/cmdline', 'r'),
                                    mock.call('/proc/12344/cmdline', 'r')])
        file_running.read.assert_called_once_with()
        file_other.read.assert_called_once_with()

    @ddt.data(True, False)
    @mock.patch.object(cinder_manage, 'print')
    @mock.patch.object(cinder_manage.os, 'remove')
    @mock.patch.object(cinder_manage.UtilCommands, '_exclude_running_backups')
    @mock.patch('cinder.objects.Snapshot.exists')
    @mock.patch('cinder.objects.Volume.exists')
    @mock.patch.object(cinder_manage.UtilCommands, '_get_resources_locks')
    @mock.patch.object(cinder_manage.context, 'get_admin_context')
    def test_clean_locks(self, online, mock_ctxt, mock_get_locks,
                         mock_vol_exists, mock_snap_exists,
                         mock_exclude_backs, mock_remove, mock_print):
        vol1_files = [f'/locks/cinder-{fake.VOLUME_ID}-delete_volume']
        vol2_files = [f'/locks/cinder-{fake.VOLUME2_ID}-delete_volume',
                      f'/locks/cinder-{fake.VOLUME2_ID}',
                      f'/locks/cinder-{fake.VOLUME2_ID}-detach_volume',
                      f'/dlm/cinder-attachment_update-{fake.VOLUME2_ID}-'
                      f'{fake.ATTACHMENT_ID}']
        vols = collections.OrderedDict(((fake.VOLUME_ID, vol1_files),
                                        (fake.VOLUME2_ID, vol2_files)))
        snap_files = [f'/locks/cinder-{fake.SNAPSHOT_ID}-delete_snapshot']
        snaps = {fake.SNAPSHOT_ID: snap_files}
        back_files = ['/locks/cinder-cleanup_incomplete_backups_12345']
        backs = {'12345': back_files}
        mock_get_locks.return_value = (vols, snaps, backs)
        mock_vol_exists.side_effect = (True, False)
        mock_snap_exists.return_value = False
        # ENOENT errors must be ignored silently, so raise them for two of
        # the removals; only the ValueError should end up being printed.
        mock_remove.side_effect = [None, OSError(errno.ENOENT, ''), None,
                                   None, OSError(errno.ENOENT, ''),
                                   ValueError, None]

        commands = cinder_manage.UtilCommands()
        commands.clean_locks(online=online)

        mock_ctxt.assert_called_once_with()
        mock_get_locks.assert_called_once_with()
        expected_calls = ([mock.call(v) for v in vol1_files] +
                          [mock.call(v) for v in vol2_files] +
                          [mock.call(s) for s in snap_files] +
                          [mock.call(b) for b in back_files])
        if online:
            self.assertEqual(2, mock_vol_exists.call_count)
            mock_vol_exists.assert_has_calls((mock.call(fake.VOLUME_ID),
                                              mock.call(fake.VOLUME2_ID)))
            mock_snap_exists.assert_called_once_with(fake.SNAPSHOT_ID)
            mock_exclude_backs.assert_called_once_with(backs)
            # If services are online we'll check resources that still exist
            # and then we won't delete those that do. In this case the files
            # for the first volume.
            del expected_calls[0]
        else:
            mock_vol_exists.assert_not_called()
            mock_snap_exists.assert_not_called()
            mock_exclude_backs.assert_not_called()
        self.assertEqual(len(expected_calls), mock_remove.call_count)
        mock_remove.assert_has_calls(expected_calls)
        # Only the ValueError exception should be logged
        self.assertEqual(1, mock_print.call_count)

@test.testtools.skipIf(sys.platform == 'darwin', 'Not supported on macOS')
class TestCinderRtstoolCmd(test.TestCase):


@@ -243,6 +243,48 @@
Displays the current configuration parameters (options) for Cinder. The
optional flag parameter may be used to display the configuration of one
parameter.

Cinder Util
~~~~~~~~~~~

``cinder-manage util clean_locks [-h] [--services-offline]``

Clean file locks on the current host that were created by Cinder services
and drivers for volumes, snapshots, and backups.

Should be run on any host where we are running a Cinder service (API,
Scheduler, Volume, Backup) and can be run with the Cinder services running
or stopped.

If the services are running it will check the existing resources in the
Cinder database and only remove the file locks of resources that are no
longer present (those files are safe to delete).

For backups, the way to know if we can remove the startup lock is by
checking if the PGRP in the file name is currently running cinder-backup.

Deleting locks while the services are offline is faster as there's no need
to check the database or the running processes.

Default assumes that services are online; you must pass
``--services-offline`` to specify that they are offline.

The common use case for running the command with ``--services-offline`` is
calling it on startup, as a service unit, before any Cinder service is
started. Without the parameter, the command is usually run manually or from
a cron job.

.. warning::

   Passing ``--services-offline`` when the Cinder services are still running
   breaks the locking mechanism and can lead to undesired behavior in
   ongoing Cinder operations.

.. note::

   This command doesn't clean DLM locks (except when using file locks), as
   those don't leave lock leftovers.
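
For illustration, a minimal sketch that lists the lock files the command
would consider (the ``lock_path`` value here is hypothetical; use the
``[oslo_concurrency]/lock_path`` value from your cinder.conf)::

    import glob
    import os

    # Hypothetical lock directory; match your cinder.conf setting.
    lock_path = '/var/lib/cinder/tmp'

    for path in glob.glob(os.path.join(lock_path, 'cinder-*')):
        print(path)
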
FILES
=====


@@ -0,0 +1,10 @@
---
features:
  - |
    `Bug #1432387 <https://bugs.launchpad.net/cinder/+bug/1432387>`_: Added
    a command to cinder-manage to clean up file locks existing in hosts
    where there is a Cinder service running (API, Scheduler, Volume,
    Backup). The command works with the Cinder services running (useful as
    a cron job) as well as stopped (to be called on host startup). Invoke
    it with ``cinder-manage util clean_locks``, optionally passing the
    ``--services-offline`` parameter.