Merge "relinker: Remove replication locks for empty parts"

This commit is contained in:
Zuul 2021-06-02 20:09:21 +00:00 committed by Gerrit Code Review
commit 4f96fe5a84
3 changed files with 178 additions and 28 deletions

View File

@ -24,10 +24,11 @@ import os
import time
from collections import defaultdict
from swift.common.exceptions import LockTimeout
from swift.common.storage_policy import POLICIES
from swift.common.utils import replace_partition_in_path, config_true_value, \
audit_location_generator, get_logger, readconf, drop_privileges, \
RateLimitedIterator, lock_path, PrefixLoggerAdapter, distribute_evenly, \
RateLimitedIterator, PrefixLoggerAdapter, distribute_evenly, \
non_negative_float, non_negative_int, config_auto_int_value, \
dump_recon_cache, get_partition_from_path
from swift.obj import diskfile
@ -325,8 +326,11 @@ class Relinker(object):
device, dirty_partition, [], self.policy)
if self.do_cleanup:
hashes = self.diskfile_mgr.get_hashes(
device, int(partition), [], self.policy)
try:
hashes = self.diskfile_mgr.get_hashes(
device, int(partition), [], self.policy)
except LockTimeout:
hashes = 1 # truthy, but invalid
# In any reasonably-large cluster, we'd expect all old
# partitions P to be empty after cleanup (i.e., it's unlikely
# that there's another partition Q := P//2 that also has data
@ -338,21 +342,24 @@ class Relinker(object):
# starts and little data is written to handoffs during the
# increase).
if not hashes:
with lock_path(partition_path):
# Same lock used by invalidate_hashes, consolidate_hashes,
# get_hashes
try:
for f in ('hashes.pkl', 'hashes.invalid', '.lock'):
try:
with self.diskfile_mgr.replication_lock(
device, self.policy, partition), \
self.diskfile_mgr.partition_lock(
device, self.policy, partition):
# Order here is somewhat important for crash-tolerance
for f in ('hashes.pkl', 'hashes.invalid', '.lock',
'.lock-replication'):
try:
os.unlink(os.path.join(partition_path, f))
except OSError as e:
if e.errno != errno.ENOENT:
raise
except OSError:
pass
try:
# Note that as soon as we've deleted the lock files, some
# other process could come along and make new ones -- so
# this may well complain that the directory is not empty
os.rmdir(partition_path)
except OSError:
except (OSError, LockTimeout):
# Most likely, some data landed in here or we hit an error
# above. Let the replicator deal with things; it was worth
# a shot.
@ -512,7 +519,17 @@ class Relinker(object):
if created_links:
self.linked_into_partitions.add(get_partition_from_path(
self.conf['devices'], new_hash_path))
diskfile.invalidate_hash(os.path.dirname(new_hash_path))
try:
diskfile.invalidate_hash(os.path.dirname(new_hash_path))
except (Exception, LockTimeout) as exc:
# at this point, the link's created. even if we counted it as
# an error, a subsequent run wouldn't find any work to do. so,
# don't bother; instead, wait for replication to be re-enabled
# so post-replication rehashing or periodic rehashing can
# eventually pick up the change
self.logger.warning(
'Error invalidating suffix for %s: %r',
new_hash_path, exc)
if self.do_cleanup and not missing_links:
# use the sorted list to help unit testing
@ -539,7 +556,7 @@ class Relinker(object):
# relinking into the new part-power space
try:
diskfile.invalidate_hash(os.path.dirname(hash_path))
except Exception as exc:
except (Exception, LockTimeout) as exc:
# note: not counted as an error
self.logger.warning(
'Error invalidating suffix for %s: %r',

View File

@ -205,6 +205,7 @@ LOG_LINE_DEFAULT_FORMAT = '{remote_addr} - - [{time.d}/{time.b}/{time.Y}' \
'"{referer}" "{txn_id}" "{user_agent}" ' \
'{trans_time:.4f} "{additional_info}" {pid} ' \
'{policy_index}'
DEFAULT_LOCK_TIMEOUT = 10
class InvalidHashPathConfigError(ValueError):
@ -2849,7 +2850,8 @@ def _get_any_lock(fds):
@contextmanager
def lock_path(directory, timeout=10, timeout_class=None, limit=1, name=None):
def lock_path(directory, timeout=None, timeout_class=None,
limit=1, name=None):
"""
Context manager that acquires a lock on a directory. This will block until
the lock can be acquired, or the timeout time has expired (whichever occurs
@ -2860,7 +2862,8 @@ def lock_path(directory, timeout=10, timeout_class=None, limit=1, name=None):
workaround by locking a hidden file in the directory.
:param directory: directory to be locked
:param timeout: timeout (in seconds)
:param timeout: timeout (in seconds). If None, defaults to
DEFAULT_LOCK_TIMEOUT
:param timeout_class: The class of the exception to raise if the
lock cannot be granted within the timeout. Will be
constructed as timeout_class(timeout, lockpath). Default:
@ -2874,10 +2877,12 @@ def lock_path(directory, timeout=10, timeout_class=None, limit=1, name=None):
:raises TypeError: if limit is not an int.
:raises ValueError: if limit is less than 1.
"""
if limit < 1:
raise ValueError('limit must be greater than or equal to 1')
if timeout is None:
timeout = DEFAULT_LOCK_TIMEOUT
if timeout_class is None:
timeout_class = swift.common.exceptions.LockTimeout
if limit < 1:
raise ValueError('limit must be greater than or equal to 1')
mkdirs(directory)
lockpath = '%s/.lock' % directory
if name:
@ -2905,17 +2910,20 @@ def lock_path(directory, timeout=10, timeout_class=None, limit=1, name=None):
@contextmanager
def lock_file(filename, timeout=10, append=False, unlink=True):
def lock_file(filename, timeout=None, append=False, unlink=True):
"""
Context manager that acquires a lock on a file. This will block until
the lock can be acquired, or the timeout time has expired (whichever occurs
first).
:param filename: file to be locked
:param timeout: timeout (in seconds)
:param timeout: timeout (in seconds). If None, defaults to
DEFAULT_LOCK_TIMEOUT
:param append: True if file should be opened in append mode
:param unlink: True if the file should be unlinked at the end
"""
if timeout is None:
timeout = DEFAULT_LOCK_TIMEOUT
flags = os.O_CREAT | os.O_RDWR
if append:
flags |= os.O_APPEND
@ -2950,14 +2958,15 @@ def lock_file(filename, timeout=10, append=False, unlink=True):
file_obj.close()
def lock_parent_directory(filename, timeout=10):
def lock_parent_directory(filename, timeout=None):
"""
Context manager that acquires a lock on the parent directory of the given
file path. This will block until the lock can be acquired, or the timeout
time has expired (whichever occurs first).
:param filename: file path of the parent directory to be locked
:param timeout: timeout (in seconds)
:param timeout: timeout (in seconds). If None, defaults to
DEFAULT_LOCK_TIMEOUT
"""
return lock_path(os.path.dirname(filename), timeout=timeout)

View File

@ -38,7 +38,7 @@ from swift.common.storage_policy import (
get_policy_string)
from swift.obj.diskfile import write_metadata, DiskFileRouter, \
DiskFileManager, relink_paths
DiskFileManager, relink_paths, BaseDiskFileManager
from test.debug_logger import debug_logger
from test.unit import skip_if_no_xattrs, DEFAULT_TEST_EC_TYPE, \
@ -206,10 +206,12 @@ class TestRelinker(unittest.TestCase):
@contextmanager
def _mock_relinker(self):
with mock.patch.object(relinker.logging, 'getLogger',
return_value=self.logger):
with mock.patch('swift.cli.relinker.DEFAULT_RECON_CACHE_PATH',
self.recon_cache_path):
yield
return_value=self.logger), \
mock.patch.object(relinker, 'get_logger',
return_value=self.logger), \
mock.patch('swift.cli.relinker.DEFAULT_RECON_CACHE_PATH',
self.recon_cache_path):
yield
def test_workers_parent(self):
os.mkdir(os.path.join(self.devices, 'sda2'))
@ -3394,7 +3396,7 @@ class TestRelinker(unittest.TestCase):
'(0 files, 0 linked, 1 removed, 0 errors)', info_lines)
self.assertEqual([], self.logger.get_lines_for_level('error'))
def test_cleanup_old_part_careful(self):
def test_cleanup_old_part_careful_file(self):
self._common_test_cleanup()
# make some extra junk file in the part
extra_file = os.path.join(self.part_dir, 'extra')
@ -3411,6 +3413,124 @@ class TestRelinker(unittest.TestCase):
self.assertTrue(os.path.exists(self.part_dir))
self.assertEqual([], self.logger.get_lines_for_level('error'))
def test_cleanup_old_part_careful_dir(self):
self._common_test_cleanup()
# make some extra junk directory in the part
extra_dir = os.path.join(self.part_dir, 'extra')
os.mkdir(extra_dir)
self.assertEqual(0, relinker.main([
'cleanup',
'--swift-dir', self.testdir,
'--devices', self.devices,
'--skip-mount',
]))
# old partition can't be cleaned up
self.assertTrue(os.path.exists(self.part_dir))
self.assertTrue(os.path.exists(extra_dir))
def test_cleanup_old_part_replication_lock_taken(self):
# verify that relinker must take the replication lock before deleting
# it, and handles the LockTimeout when unable to take it
self._common_test_cleanup()
config = """
[DEFAULT]
swift_dir = %s
devices = %s
mount_check = false
replication_lock_timeout = 1
[object-relinker]
""" % (self.testdir, self.devices)
conf_file = os.path.join(self.testdir, 'relinker.conf')
with open(conf_file, 'w') as f:
f.write(dedent(config))
with utils.lock_path(self.part_dir, name='replication'):
# lock taken so relinker should be unable to remove the lock file
with self._mock_relinker():
self.assertEqual(0, relinker.main(['cleanup', conf_file]))
# old partition can't be cleaned up
self.assertTrue(os.path.exists(self.part_dir))
self.assertTrue(os.path.exists(
os.path.join(self.part_dir, '.lock-replication')))
self.assertEqual([], self.logger.get_lines_for_level('error'))
def test_cleanup_old_part_partition_lock_taken_during_get_hashes(self):
# verify that relinker handles LockTimeouts when rehashing
self._common_test_cleanup()
config = """
[DEFAULT]
swift_dir = %s
devices = %s
mount_check = false
replication_lock_timeout = 1
[object-relinker]
""" % (self.testdir, self.devices)
conf_file = os.path.join(self.testdir, 'relinker.conf')
with open(conf_file, 'w') as f:
f.write(dedent(config))
orig_get_hashes = BaseDiskFileManager.get_hashes
def new_get_hashes(*args, **kwargs):
# lock taken so relinker should be unable to rehash
with utils.lock_path(self.part_dir):
return orig_get_hashes(*args, **kwargs)
with self._mock_relinker(), \
mock.patch('swift.common.utils.DEFAULT_LOCK_TIMEOUT', 0.1), \
mock.patch.object(BaseDiskFileManager,
'get_hashes', new_get_hashes):
self.assertEqual(0, relinker.main(['cleanup', conf_file]))
# old partition can't be cleaned up
self.assertTrue(os.path.exists(self.part_dir))
self.assertTrue(os.path.exists(
os.path.join(self.part_dir, '.lock')))
self.assertEqual([], self.logger.get_lines_for_level('error'))
self.assertEqual([], self.logger.get_lines_for_level('warning'))
def test_cleanup_old_part_lock_taken_between_get_hashes_and_rm(self):
# verify that relinker must take the partition lock before deleting
# it, and handles the LockTimeout when unable to take it
self._common_test_cleanup()
config = """
[DEFAULT]
swift_dir = %s
devices = %s
mount_check = false
replication_lock_timeout = 1
[object-relinker]
""" % (self.testdir, self.devices)
conf_file = os.path.join(self.testdir, 'relinker.conf')
with open(conf_file, 'w') as f:
f.write(dedent(config))
orig_replication_lock = BaseDiskFileManager.replication_lock
@contextmanager
def new_lock(*args, **kwargs):
# lock taken so relinker should be unable to rehash
with utils.lock_path(self.part_dir):
with orig_replication_lock(*args, **kwargs) as cm:
yield cm
with self._mock_relinker(), \
mock.patch('swift.common.utils.DEFAULT_LOCK_TIMEOUT', 0.1), \
mock.patch.object(BaseDiskFileManager,
'replication_lock', new_lock):
self.assertEqual(0, relinker.main(['cleanup', conf_file]))
# old partition can't be cleaned up
self.assertTrue(os.path.exists(self.part_dir))
self.assertTrue(os.path.exists(
os.path.join(self.part_dir, '.lock')))
self.assertEqual([], self.logger.get_lines_for_level('error'))
self.assertEqual([], self.logger.get_lines_for_level('warning'))
def test_cleanup_old_part_robust(self):
self._common_test_cleanup()
@ -3425,6 +3545,10 @@ class TestRelinker(unittest.TestCase):
set(os.listdir(self.part_dir)))
# unlink a random file, should be empty
os.unlink(os.path.join(self.part_dir, 'hashes.pkl'))
# create an ssync replication lock, too
with open(os.path.join(self.part_dir,
'.lock-replication'), 'w'):
pass
calls.append(True)
elif part == self.next_part:
# sometimes our random obj needs to rehash the next part too