From 2fd5b87dc5b0e0fe99421becef77bd888ca02368 Mon Sep 17 00:00:00 2001 From: Alistair Coles Date: Fri, 2 Jul 2021 15:42:33 +0100 Subject: [PATCH] reconstructor: make quarantine delay configurable Previously the reconstructor would quarantine isolated durable fragments that were more than reclaim_age old. This patch adds a quarantine_age option for the reconstructor which defaults to reclaim_age but can be used to configure the age that a fragment must reach before quarantining. Change-Id: I867f3ea0cf60620c576da0c1f2c65cec2cf19aa0 --- doc/source/config/object_server_config.rst | 13 ++++++ etc/object-server.conf-sample | 5 ++- swift/obj/reconstructor.py | 7 ++- test/probe/test_reconstructor_rebuild.py | 6 +-- test/unit/obj/test_reconstructor.py | 52 ++++++++++++++++++++++ 5 files changed, 77 insertions(+), 6 deletions(-) diff --git a/doc/source/config/object_server_config.rst b/doc/source/config/object_server_config.rst index 6a2b8edcad..6dac546322 100644 --- a/doc/source/config/object_server_config.rst +++ b/doc/source/config/object_server_config.rst @@ -495,6 +495,19 @@ ionice_priority None I/O scheduling priority o Work only with ionice_class. Ignored if IOPRIO_CLASS_IDLE is set. +quarantine_threshold 0 The reconstructor may quarantine + stale isolated fragments + when it fails to fetch + more than the + quarantine_threshold + number of fragments + (including the stale + fragment) during an + attempt to reconstruct. +quarantine_age reclaim_age Fragments are not quarantined + until they are older than + quarantine_age, which defaults + to the value of reclaim_age. =========================== ======================== ================================ **************** diff --git a/etc/object-server.conf-sample b/etc/object-server.conf-sample index a1719e1fab..d82a18a65f 100644 --- a/etc/object-server.conf-sample +++ b/etc/object-server.conf-sample @@ -412,9 +412,12 @@ use = egg:swift#recon # Note: the quarantine_threshold applies equally to all policies, but for each # policy it is effectively capped at (ec_ndata - 1) so that a fragment is never # quarantined when sufficient fragments exist to reconstruct the object. -# Fragments are not quarantined until they are older than the reclaim_age. # quarantine_threshold = 0 # +# Fragments are not quarantined until they are older than +# quarantine_age, which defaults to the value of reclaim_age. +# quarantine_age = +# # Sets the maximum number of nodes to which requests will be made before # quarantining a fragment. You can use '* replicas' at the end to have it use # the number given times the number of replicas for the ring being used for the diff --git a/swift/obj/reconstructor.py b/swift/obj/reconstructor.py index cc051b4513..7a2def7c82 100644 --- a/swift/obj/reconstructor.py +++ b/swift/obj/reconstructor.py @@ -44,7 +44,7 @@ from swift.obj.ssync_sender import Sender as ssync_sender from swift.common.http import HTTP_OK, HTTP_NOT_FOUND, \ HTTP_INSUFFICIENT_STORAGE from swift.obj.diskfile import DiskFileRouter, get_data_dir, \ - get_tmp_dir + get_tmp_dir, DEFAULT_RECLAIM_AGE from swift.common.storage_policy import POLICIES, EC_POLICY from swift.common.exceptions import ConnectionTimeout, DiskFileError, \ SuffixSyncError, PartitionLockTimeout @@ -236,6 +236,9 @@ class ObjectReconstructor(Daemon): 'rebuild_handoff_node_count', 2)) self.quarantine_threshold = non_negative_int( conf.get('quarantine_threshold', 0)) + self.quarantine_age = int( + conf.get('quarantine_age', + conf.get('reclaim_age', DEFAULT_RECLAIM_AGE))) self.request_node_count = config_request_node_count_value( conf.get('request_node_count', '2 * replicas')) self.nondurable_purge_delay = non_negative_float( @@ -507,7 +510,7 @@ class ObjectReconstructor(Daemon): # worth more investigation return False - if time.time() - float(local_timestamp) <= df.manager.reclaim_age: + if time.time() - float(local_timestamp) <= self.quarantine_age: # If the fragment has not yet passed reclaim age then it is # likely that a tombstone will be reverted to this node, or # neighbor frags will get reverted from handoffs to *other* nodes diff --git a/test/probe/test_reconstructor_rebuild.py b/test/probe/test_reconstructor_rebuild.py index 2e44ed1548..4fc546f72a 100644 --- a/test/probe/test_reconstructor_rebuild.py +++ b/test/probe/test_reconstructor_rebuild.py @@ -425,7 +425,7 @@ class TestReconstructorRebuild(ECProbeTest): for conf_index in self.configs['object-reconstructor'].keys(): reconstructor = self.run_custom_daemon( ObjectReconstructor, 'object-reconstructor', conf_index, - {'reclaim_age': '0'}) + {'quarantine_age': '0'}) logger = reconstructor.logger.logger error_lines.append(logger.get_lines_for_level('error')) warning_lines.append(logger.get_lines_for_level('warning')) @@ -462,7 +462,7 @@ class TestReconstructorRebuild(ECProbeTest): for conf_index in self.configs['object-reconstructor'].keys(): reconstructor = self.run_custom_daemon( ObjectReconstructor, 'object-reconstructor', conf_index, - {'reclaim_age': '0', 'quarantine_threshold': '1'}) + {'quarantine_age': '0', 'quarantine_threshold': '1'}) logger = reconstructor.logger.logger error_lines.append(logger.get_lines_for_level('error')) warning_lines.append(logger.get_lines_for_level('warning')) @@ -515,7 +515,7 @@ class TestReconstructorRebuild(ECProbeTest): for conf_index in self.configs['object-reconstructor'].keys(): reconstructor = self.run_custom_daemon( ObjectReconstructor, 'object-reconstructor', conf_index, - {'reclaim_age': '0', 'quarantine_threshold': '1'}) + {'quarantine_age': '0', 'quarantine_threshold': '1'}) logger = reconstructor.logger.logger error_lines.append(logger.get_lines_for_level('error')) warning_lines.append(logger.get_lines_for_level('warning')) diff --git a/test/unit/obj/test_reconstructor.py b/test/unit/obj/test_reconstructor.py index a897b4ae1d..51bf5d9308 100644 --- a/test/unit/obj/test_reconstructor.py +++ b/test/unit/obj/test_reconstructor.py @@ -5407,6 +5407,37 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor): object_reconstructor.ObjectReconstructor( {'nondurable_purge_delay': bad}) + def test_quarantine_age_conf(self): + # defaults to DEFAULT_RECLAIM_AGE + reconstructor = object_reconstructor.ObjectReconstructor({}) + self.assertEqual(604800, reconstructor.quarantine_age) + + reconstructor = object_reconstructor.ObjectReconstructor( + {'quarantine_age': '0'}) + self.assertEqual(0, reconstructor.quarantine_age) + + reconstructor = object_reconstructor.ObjectReconstructor( + {'quarantine_age': '1'}) + self.assertEqual(1, reconstructor.quarantine_age) + + # trumps reclaim_age + reconstructor = object_reconstructor.ObjectReconstructor( + {'quarantine_age': '1', 'reclaim_age': 0}) + self.assertEqual(1, reconstructor.quarantine_age) + reconstructor = object_reconstructor.ObjectReconstructor( + {'quarantine_age': '1', 'reclaim_age': 2}) + self.assertEqual(1, reconstructor.quarantine_age) + + reconstructor = object_reconstructor.ObjectReconstructor( + {'quarantine_age': 2.2}) + self.assertEqual(2, reconstructor.quarantine_age) + + for bad in ('1.1', 'auto', 'bad'): + with annotate_failure(bad): + with self.assertRaises(ValueError): + object_reconstructor.ObjectReconstructor( + {'quarantine_age': bad}) + def test_request_node_count_conf(self): # default is 1 * replicas reconstructor = object_reconstructor.ObjectReconstructor({}) @@ -5600,6 +5631,18 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor): self._assert_diskfile_quarantined() self._verify_error_lines(2, other_responses, 2) + def test_reconstruct_fa_quarantine_threshold_two_with_quarantine_age(self): + num_other_resps = 2 * self.policy.object_ring.replicas - 3 + other_responses = [(404, None, None)] * num_other_resps + conf = {'quarantine_threshold': 2, + 'quarantine_age': 0, # quarantine age trumps reclaim age + 'reclaim_age': 1000} + exc = self._do_test_reconstruct_insufficient_frags( + conf, 2, other_responses) + self.assertIsInstance(exc, DiskFileQuarantined) + self._assert_diskfile_quarantined() + self._verify_error_lines(2, other_responses, 2) + def test_reconstruct_fa_no_quarantine_more_than_threshold_frags(self): # default config num_other_resps = self.policy.object_ring.replicas - 2 @@ -5727,6 +5770,15 @@ class TestReconstructFragmentArchive(BaseTestObjectReconstructor): self._assert_diskfile_not_quarantined() self._verify_error_lines(1, other_responses, 1) + exc = self._do_test_reconstruct_insufficient_frags( + {'quarantine_threshold': 1, + 'quarantine_age': 10000, # quarantine_age trumps reclaim_age + 'reclaim_age': 0}, + 1, other_responses) + self.assertIsInstance(exc, DiskFileError) + self._assert_diskfile_not_quarantined() + self._verify_error_lines(1, other_responses, 1) + exc = self._do_test_reconstruct_insufficient_frags( {'quarantine_threshold': 1}, # default reclaim_age 1, other_responses)