From 926c61bccf722ef0080d7c88a590b2d6a76f424b Mon Sep 17 00:00:00 2001 From: Tim Burke Date: Mon, 26 Apr 2021 14:48:40 -0700 Subject: [PATCH] relinker: Only mark partitions "done" if there were no (new) errors This way operators can re-run the relinker in the face of errors without needing to manually clear the state file. Change-Id: Ida1c1c0c8a695b1b226121b426b8226a43f3056b Co-Authored-By: Clay Gerrard --- swift/cli/relinker.py | 48 ++++++++++++++++++++++------------ test/unit/cli/test_relinker.py | 38 ++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 23 deletions(-) diff --git a/swift/cli/relinker.py b/swift/cli/relinker.py index 566ab60362..042398dc93 100644 --- a/swift/cli/relinker.py +++ b/swift/cli/relinker.py @@ -73,6 +73,14 @@ class Relinker(object): 'policies': 0, } + @property + def total_errors(self): + return sum([ + self.stats['errors'], + self.stats.get('unmounted', 0), + self.stats.get('unlistable_partitions', 0), + ]) + def devices_filter(self, _, devices): if self.device_list: devices = [d for d in devices if d in self.device_list] @@ -169,7 +177,9 @@ class Relinker(object): return partitions - # Save states when a partition is done + def hook_pre_partition(self, partition_path): + self.pre_partition_errors = self.total_errors + def hook_post_partition(self, partition_path): datadir_path, part = os.path.split(os.path.abspath(partition_path)) device_path, datadir_name = os.path.split(datadir_path) @@ -241,13 +251,15 @@ class Relinker(object): # a shot. pass - # Then mark this part as done, in case the process is interrupted and - # needs to resume. - self.states["state"][part] = True - with open(state_tmp_file, 'wt') as f: - json.dump(self.states, f) - os.fsync(f.fileno()) - os.rename(state_tmp_file, state_file) + # If there were no errors, mark this partition as done. This is handy + # in case the process is interrupted and needs to resume, or there + # were errors and the relinker needs to run again. + if self.pre_partition_errors == self.total_errors: + self.states["state"][part] = True + with open(state_tmp_file, 'wt') as f: + json.dump(self.states, f) + os.fsync(f.fileno()) + os.rename(state_tmp_file, state_file) num_parts_done = sum( 1 for part in self.states["state"].values() if part) @@ -441,6 +453,7 @@ class Relinker(object): hook_pre_device=self.hook_pre_device, hook_post_device=self.hook_post_device, partitions_filter=self.partitions_filter, + hook_pre_partition=self.hook_pre_partition, hook_post_partition=self.hook_post_partition, hashes_filter=self.hashes_filter, logger=self.logger, @@ -480,6 +493,15 @@ class Relinker(object): "No policy found to increase the partition power.") return EXIT_NO_APPLICABLE_POLICY + if self.total_errors > 0: + log_method = self.logger.warning + # NB: audit_location_generator logs unmounted disks as warnings, + # but we want to treat them as errors + status = EXIT_ERROR + else: + log_method = self.logger.info + status = EXIT_SUCCESS + hash_dirs = self.stats.pop('hash_dirs') files = self.stats.pop('files') linked = self.stats.pop('linked') @@ -498,19 +520,11 @@ class Relinker(object): 'There were unexpected errors while enumerating disk ' 'files: %r', self.stats) - if action_errors + listdir_errors + unmounted > 0: - log_method = self.logger.warning - # NB: audit_location_generator logs unmounted disks as warnings, - # but we want to treat them as errors - status = EXIT_ERROR - else: - log_method = self.logger.info - status = EXIT_SUCCESS - log_method( '%d hash dirs processed (cleanup=%s) (%d files, %d linked, ' '%d removed, %d errors)', hash_dirs, self.do_cleanup, files, linked, removed, action_errors + listdir_errors) + return status diff --git a/test/unit/cli/test_relinker.py b/test/unit/cli/test_relinker.py index 2545cd5ebf..ff118d29f8 100644 --- a/test/unit/cli/test_relinker.py +++ b/test/unit/cli/test_relinker.py @@ -2537,11 +2537,13 @@ class TestRelinker(unittest.TestCase): }, self.logger)[r.policy] # Ack partition 96 + r.hook_pre_partition(os.path.join(datadir_path, '96')) r.hook_post_partition(os.path.join(datadir_path, '96')) self.assertEqual(r.states["state"], {'96': True, '227': False}) - self.assertIn("Step: relink Device: sda1 Policy: %s " - "Partitions: 1/2" % r.policy.name, - self.logger.get_lines_for_level("info")) + self.assertEqual(self.logger.get_lines_for_level("info"), [ + "Step: relink Device: sda1 Policy: %s " + "Partitions: 1/2" % r.policy.name, + ]) with open(state_file, 'rt') as f: self.assertEqual(json.load(f), { "part_power": PART_POWER, @@ -2549,15 +2551,39 @@ class TestRelinker(unittest.TestCase): "state": {'96': True, '227': False}}) # Restart relinking after only part 96 was done + self.logger.clear() + self.assertEqual(['227'], + r.partitions_filter("", ['96', '227', '312'])) + self.assertEqual(r.states["state"], {'96': True, '227': False}) + + # ...but there's an error + r.hook_pre_partition(os.path.join(datadir_path, '227')) + r.stats['errors'] += 1 + r.hook_post_partition(os.path.join(datadir_path, '227')) + self.assertEqual(self.logger.get_lines_for_level("info"), [ + "Step: relink Device: sda1 Policy: %s " + "Partitions: 1/2" % r.policy.name, + ]) + self.assertEqual(r.states["state"], {'96': True, '227': False}) + with open(state_file, 'rt') as f: + self.assertEqual(json.load(f), { + "part_power": PART_POWER, + "next_part_power": PART_POWER + 1, + "state": {'96': True, '227': False}}) + + # OK, one more try + self.logger.clear() self.assertEqual(['227'], r.partitions_filter("", ['96', '227', '312'])) self.assertEqual(r.states["state"], {'96': True, '227': False}) # Ack partition 227 + r.hook_pre_partition(os.path.join(datadir_path, '227')) r.hook_post_partition(os.path.join(datadir_path, '227')) - self.assertIn("Step: relink Device: sda1 Policy: %s " - "Partitions: 2/2" % r.policy.name, - self.logger.get_lines_for_level("info")) + self.assertEqual(self.logger.get_lines_for_level("info"), [ + "Step: relink Device: sda1 Policy: %s " + "Partitions: 2/2" % r.policy.name, + ]) self.assertEqual(r.states["state"], {'96': True, '227': True}) with open(state_file, 'rt') as f: self.assertEqual(json.load(f), {