From e39a0d1959b2167022fdb5dab6fed462d51b2fe8 Mon Sep 17 00:00:00 2001 From: Clay Gerrard Date: Mon, 6 Jan 2025 17:03:45 -0600 Subject: [PATCH] Leave updater per-device stats in recon for debugging It's confusing and unecessary to have the last cycles per-device object-updater stats reaped from recon immediately during aggregation and make it impossible to debug stats aggregation. Drive-by: fix some bugs with stats aggregation Change-Id: I9df7c2d1c31646a3200614b629598576eb9e64c0 --- swift/obj/updater.py | 24 +++++++++-------- test/unit/obj/test_updater.py | 49 +++++++++++++++++++++++++++++------ 2 files changed, 54 insertions(+), 19 deletions(-) diff --git a/swift/obj/updater.py b/swift/obj/updater.py index 2500e73df3..8253f099ca 100644 --- a/swift/obj/updater.py +++ b/swift/obj/updater.py @@ -583,11 +583,9 @@ class ObjectUpdater(Daemon): device_stats = recon_cache.get('object_updater_per_device', {}) if not isinstance(device_stats, dict): raise TypeError('object_updater_per_device must be a dict') - device_stats = {k: (v if v is not None else {}) - for k, v in device_stats.items()} - devices_to_remove = set(device_stats) - set(devices) - update_device_stats = {d: {} for d in devices_to_remove} + device_stats = {dev: device_stats.get(dev) or {} + for dev in devices} aggregated_oldest_entries = [] @@ -615,7 +613,7 @@ class ObjectUpdater(Daemon): )) / float(len(device_stats)) ) - * max(self.updater_workers, 1) + * max(1, min(self.updater_workers, len(device_stats))) if device_stats else 0, 'failures_oldest_timestamp': min( @@ -635,13 +633,17 @@ class ObjectUpdater(Daemon): 'oldest_entries': aggregated_oldest_entries, }, } + + recon_dump = { + 'object_updater_sweep': elapsed, + 'object_updater_stats': aggregated_stats, + 'object_updater_last': now + } + if devices_to_remove: + recon_dump['object_updater_per_device'] = { + d: {} for d in devices_to_remove} dump_recon_cache( - { - 'object_updater_sweep': elapsed, - 'object_updater_stats': aggregated_stats, - 'object_updater_per_device': update_device_stats, - 'object_updater_last': now - }, + recon_dump, self.rcache, self.logger, ) diff --git a/test/unit/obj/test_updater.py b/test/unit/obj/test_updater.py index c4bea6a352..6c886ec7c7 100644 --- a/test/unit/obj/test_updater.py +++ b/test/unit/obj/test_updater.py @@ -74,6 +74,7 @@ def _sorted_listdir(path): @patch_policies(_mocked_policies) class TestObjectUpdater(unittest.TestCase): + maxDiff = None def setUp(self): utils.HASH_PATH_SUFFIX = b'endcap' @@ -1051,6 +1052,12 @@ class TestObjectUpdater(unittest.TestCase): 'tracker_memory_usage': 256, 'failures_account_container_count': 1, }, + # N.B. this is unrealistic test stub to cover failures modes + # where workers die weird, for the expected aggregated stats + # calculation sda2 & sda3 are contributing to the count of + # devices when averaging per-worker device memory but no actual + # values; so the reported "tracker_memory_usage" appears + # artifically deflated under failure 'sda2': { 'failures_oldest_timestamp': 124.56789, 'failures_oldest_timestamp_age': 789.012, @@ -1080,6 +1087,17 @@ class TestObjectUpdater(unittest.TestCase): 'object_updater_sweep': 30, 'object_updater_stats': expected_aggregated_stats, 'object_updater_last': now, + 'object_updater_per_device': { + 'sda1': { + 'tracker_memory_usage': 256, + 'failures_account_container_count': 1, + }, + 'sda2': { + 'failures_oldest_timestamp': 124.56789, + 'failures_oldest_timestamp_age': 789.012, + }, + 'sda3': None, + }, } self.assertEqual(expected_recon, found_data) @@ -1106,8 +1124,13 @@ class TestObjectUpdater(unittest.TestCase): } } utils.dump_recon_cache(empty_recon, ou.rcache, ou.logger) + with open(recon_file) as f: + found_data = json.load(f) + # N.B. recon doesn't let you write second level keys with empty values; + # because it's hijacked that spelling to mean "remove this sub-key" + self.assertEqual({'object_updater_per_device': {}}, found_data) now = float(next(self.ts_iter)) - ou.aggregate_and_dump_recon(['sda1', 'sda2'], 30, now) + ou.aggregate_and_dump_recon(['sda1', 'sda2', 'sda3'], 30, now) with open(recon_file) as f: found_data = json.load(f) @@ -1126,7 +1149,8 @@ class TestObjectUpdater(unittest.TestCase): expected_recon = { 'object_updater_sweep': 30, 'object_updater_stats': expected_aggregated_stats, - 'object_updater_last': now + 'object_updater_last': now, + 'object_updater_per_device': {}, } self.assertEqual(expected_recon, found_data) @@ -1169,6 +1193,7 @@ class TestObjectUpdater(unittest.TestCase): ou = object_updater.ObjectUpdater({ 'devices': self.devices_dir, 'swift_dir': self.testdir, + # N.B. we have less devices than workers! 'updater_workers': 2, 'recon_cache_path': recon_path, 'async_tracker_dump_count': 2, @@ -1178,17 +1203,24 @@ class TestObjectUpdater(unittest.TestCase): 'object_updater_per_device': { 'sda1': { 'failures_account_container_count': 2, - 'tracker_memory_usage': 512, + 'tracker_memory_usage': 384, }, + # this is a relatively realistic example of existing recon data + # for a pre-existing per-device key for a removed device 'sda2': { 'failures_account_container_count': 1, 'tracker_memory_usage': 256, }, + # N.B. this sda3 is an unrealistic test stub to cover failures + # modes where workers die weird 'sda3': None, } } utils.dump_recon_cache(existing_recon, ou.rcache, ou.logger) now = float(next(self.ts_iter)) + # N.B. because neither sda2 nor sda3 are passed in as expected devices + # to aggregate neither contribute to device count when calculating + # per-worker averages in during aggregate ou.aggregate_and_dump_recon(['sda1'], 30, now) with open(recon_file) as f: @@ -1196,7 +1228,9 @@ class TestObjectUpdater(unittest.TestCase): expected_aggregated_stats = { 'failures_account_container_count': 2, - 'tracker_memory_usage': 512.0, + # only sda1 should be considered because it's the only device value + # that wouldn't be stale + 'tracker_memory_usage': 384, 'failures_oldest_timestamp': None, 'failures_oldest_timestamp_age': None, 'failures_oldest_timestamp_account_containers': { @@ -1209,7 +1243,7 @@ class TestObjectUpdater(unittest.TestCase): 'object_updater_per_device': { 'sda1': { 'failures_account_container_count': 2, - 'tracker_memory_usage': 512, + 'tracker_memory_usage': 384, }, }, 'object_updater_sweep': 30, @@ -1407,10 +1441,9 @@ class TestObjectUpdater(unittest.TestCase): del expected_recon['object_updater_per_device']['sdx'] self.assertEqual(expected_recon, found_data) - self.assertAlmostEqual( - 755.5, + self.assertEqual( + (512 + 256 + 128) / 3 * 2, found_data['object_updater_stats']['tracker_memory_usage'], - delta=200 ) def test_obj_put_legacy_updates(self):