reconstructor: make quarantine delay configurable

Previously the reconstructor would quarantine isolated durable
fragments that were more than reclaim_age old. This patch adds a
quarantine_age option for the reconstructor which defaults to
reclaim_age but can be used to configure the age that a fragment must
reach before quarantining.

Change-Id: I867f3ea0cf60620c576da0c1f2c65cec2cf19aa0
This commit is contained in:
Alistair Coles 2021-07-02 15:42:33 +01:00
parent 2117a32b99
commit 2fd5b87dc5
5 changed files with 77 additions and 6 deletions

View File

@ -495,6 +495,19 @@ ionice_priority None I/O scheduling priority o
Work only with ionice_class.
Ignored if IOPRIO_CLASS_IDLE
is set.
quarantine_threshold 0 The reconstructor may quarantine
stale isolated fragments
when it fails to fetch
more than the
quarantine_threshold
number of fragments
(including the stale
fragment) during an
attempt to reconstruct.
quarantine_age reclaim_age Fragments are not quarantined
until they are older than
quarantine_age, which defaults
to the value of reclaim_age.
=========================== ======================== ================================
****************

View File

@ -412,9 +412,12 @@ use = egg:swift#recon
# Note: the quarantine_threshold applies equally to all policies, but for each
# policy it is effectively capped at (ec_ndata - 1) so that a fragment is never
# quarantined when sufficient fragments exist to reconstruct the object.
# Fragments are not quarantined until they are older than the reclaim_age.
# quarantine_threshold = 0
#
# Fragments are not quarantined until they are older than
# quarantine_age, which defaults to the value of reclaim_age.
# quarantine_age =
#
# Sets the maximum number of nodes to which requests will be made before
# quarantining a fragment. You can use '* replicas' at the end to have it use
# the number given times the number of replicas for the ring being used for the

View File

@ -44,7 +44,7 @@ from swift.obj.ssync_sender import Sender as ssync_sender
from swift.common.http import HTTP_OK, HTTP_NOT_FOUND, \
HTTP_INSUFFICIENT_STORAGE
from swift.obj.diskfile import DiskFileRouter, get_data_dir, \
get_tmp_dir
get_tmp_dir, DEFAULT_RECLAIM_AGE
from swift.common.storage_policy import POLICIES, EC_POLICY
from swift.common.exceptions import ConnectionTimeout, DiskFileError, \
SuffixSyncError, PartitionLockTimeout
@ -236,6 +236,9 @@ class ObjectReconstructor(Daemon):
'rebuild_handoff_node_count', 2))
self.quarantine_threshold = non_negative_int(
conf.get('quarantine_threshold', 0))
self.quarantine_age = int(
conf.get('quarantine_age',
conf.get('reclaim_age', DEFAULT_RECLAIM_AGE)))
self.request_node_count = config_request_node_count_value(
conf.get('request_node_count', '2 * replicas'))
self.nondurable_purge_delay = non_negative_float(
@ -507,7 +510,7 @@ class ObjectReconstructor(Daemon):
# worth more investigation
return False
if time.time() - float(local_timestamp) <= df.manager.reclaim_age:
if time.time() - float(local_timestamp) <= self.quarantine_age:
# If the fragment has not yet passed quarantine_age then it is
# likely that a tombstone will be reverted to this node, or
# neighbor frags will get reverted from handoffs to *other* nodes

View File

@ -425,7 +425,7 @@ class TestReconstructorRebuild(ECProbeTest):
for conf_index in self.configs['object-reconstructor'].keys():
reconstructor = self.run_custom_daemon(
ObjectReconstructor, 'object-reconstructor', conf_index,
{'reclaim_age': '0'})
{'quarantine_age': '0'})
logger = reconstructor.logger.logger
error_lines.append(logger.get_lines_for_level('error'))
warning_lines.append(logger.get_lines_for_level('warning'))
@ -462,7 +462,7 @@ class TestReconstructorRebuild(ECProbeTest):
for conf_index in self.configs['object-reconstructor'].keys():
reconstructor = self.run_custom_daemon(
ObjectReconstructor, 'object-reconstructor', conf_index,
{'reclaim_age': '0', 'quarantine_threshold': '1'})
{'quarantine_age': '0', 'quarantine_threshold': '1'})
logger = reconstructor.logger.logger
error_lines.append(logger.get_lines_for_level('error'))
warning_lines.append(logger.get_lines_for_level('warning'))
@ -515,7 +515,7 @@ class TestReconstructorRebuild(ECProbeTest):
for conf_index in self.configs['object-reconstructor'].keys():
reconstructor = self.run_custom_daemon(
ObjectReconstructor, 'object-reconstructor', conf_index,
{'reclaim_age': '0', 'quarantine_threshold': '1'})
{'quarantine_age': '0', 'quarantine_threshold': '1'})
logger = reconstructor.logger.logger
error_lines.append(logger.get_lines_for_level('error'))
warning_lines.append(logger.get_lines_for_level('warning'))

View File

@ -5407,6 +5407,37 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
object_reconstructor.ObjectReconstructor(
{'nondurable_purge_delay': bad})
def test_quarantine_age_conf(self):
    # Exercise parsing of the quarantine_age option: its default, explicit
    # values, precedence over reclaim_age and rejection of bad values.
    def configured_age(conf):
        daemon = object_reconstructor.ObjectReconstructor(conf)
        return daemon.quarantine_age

    # nothing configured: falls back to DEFAULT_RECLAIM_AGE
    self.assertEqual(604800, configured_age({}))

    # explicit values given as strings are honoured, zero included
    self.assertEqual(0, configured_age({'quarantine_age': '0'}))
    self.assertEqual(1, configured_age({'quarantine_age': '1'}))

    # an explicit quarantine_age wins over reclaim_age, whether the latter
    # is smaller or larger
    self.assertEqual(
        1, configured_age({'quarantine_age': '1', 'reclaim_age': 0}))
    self.assertEqual(
        1, configured_age({'quarantine_age': '1', 'reclaim_age': 2}))

    # a float value is truncated to an int
    self.assertEqual(2, configured_age({'quarantine_age': 2.2}))

    # strings that are not integer literals must raise ValueError
    for invalid in ('1.1', 'auto', 'bad'):
        with annotate_failure(invalid), self.assertRaises(ValueError):
            object_reconstructor.ObjectReconstructor(
                {'quarantine_age': invalid})
def test_request_node_count_conf(self):
# default is 2 * replicas
reconstructor = object_reconstructor.ObjectReconstructor({})
@ -5600,6 +5631,18 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
self._assert_diskfile_quarantined()
self._verify_error_lines(2, other_responses, 2)
def test_reconstruct_fa_quarantine_threshold_two_with_quarantine_age(self):
    # quarantine_age trumps reclaim_age: with a zero quarantine_age the
    # fragment is expected to be quarantined even though reclaim_age is
    # configured as 1000
    conf = {'quarantine_threshold': 2,
            'quarantine_age': 0,
            'reclaim_age': 1000}
    num_other_resps = 2 * self.policy.object_ring.replicas - 3
    # every other response is a 404
    other_responses = [(404, None, None) for _ in range(num_other_resps)]
    raised = self._do_test_reconstruct_insufficient_frags(
        conf, 2, other_responses)
    self.assertIsInstance(raised, DiskFileQuarantined)
    self._assert_diskfile_quarantined()
    self._verify_error_lines(2, other_responses, 2)
def test_reconstruct_fa_no_quarantine_more_than_threshold_frags(self):
# default config
num_other_resps = self.policy.object_ring.replicas - 2
@ -5727,6 +5770,15 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor):
self._assert_diskfile_not_quarantined()
self._verify_error_lines(1, other_responses, 1)
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1,
'quarantine_age': 10000, # quarantine_age trumps reclaim_age
'reclaim_age': 0},
1, other_responses)
self.assertIsInstance(exc, DiskFileError)
self._assert_diskfile_not_quarantined()
self._verify_error_lines(1, other_responses, 1)
exc = self._do_test_reconstruct_insufficient_frags(
{'quarantine_threshold': 1}, # default reclaim_age
1, other_responses)