From 7a105b5ef01e990d52ef2111b23707904cd41867 Mon Sep 17 00:00:00 2001
From: Matthew Oliver
Date: Tue, 6 Feb 2018 06:19:25 +0000
Subject: [PATCH] Add and pipe reconstructor stats through recon

This patch plumbs the object-reconstructor stats that are dropped into
recon cache out through the middleware and swift-recon tool.

This adds a '/recon/reconstruction/object' endpoint to the middleware. As
such, the swift-recon tool has grown a '-R' or '--reconstruction' option
to access this data from each node.

Plus some tests and documentation updates.

Change-Id: I98582732ca5ccb2e7d2369b53abf9aa8c0ede00c
---
 doc/manpages/swift-recon.1                |   2 +
 doc/source/admin_guide.rst                |   3 +-
 swift/cli/recon.py                        | 146 +++++++++++++++-------
 swift/common/middleware/recon.py          |   9 ++
 test/unit/cli/test_recon.py               |  83 ++++++++++--
 test/unit/common/middleware/test_recon.py |  26 ++++
 6 files changed, 216 insertions(+), 53 deletions(-)

diff --git a/doc/manpages/swift-recon.1 b/doc/manpages/swift-recon.1
index c5a30b6d47..dc5a19d002 100644
--- a/doc/manpages/swift-recon.1
+++ b/doc/manpages/swift-recon.1
@@ -58,6 +58,8 @@ Get updater stats
 Get expirer stats
 .IP "\fB-r, --replication\fR"
 Get replication stats
+.IP "\fB-R, --reconstruction\fR"
+Get reconstruction stats
 .IP "\fB-u, --unmounted\fR"
 Check cluster for unmounted devices
 .IP "\fB-d, --diskusage\fR"
diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst
index d44f671387..c3f4a10788 100644
--- a/doc/source/admin_guide.rst
+++ b/doc/source/admin_guide.rst
@@ -764,7 +764,7 @@ This information can also be queried via the swift-recon command line utility::
     fhines@ubuntu:~$ swift-recon -h
     Usage:
             usage: swift-recon [-v] [--suppress] [-a] [-r] [-u] [-d]
-            [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
+            [-R] [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
 
             account|container|object
             Defaults to object server.
@@ -778,6 +778,7 @@ This information can also be queried via the swift-recon command line utility::
       --suppress            Suppress most connection related errors
       -a, --async           Get async stats
       -r, --replication     Get replication stats
+      -R, --reconstruction  Get reconstruction stats
       --auditor             Get auditor stats
       --updater             Get updater stats
       --expirer             Get expirer stats
diff --git a/swift/cli/recon.py b/swift/cli/recon.py
index 63f8afb3f1..cd09528750 100644
--- a/swift/cli/recon.py
+++ b/swift/cli/recon.py
@@ -431,55 +431,28 @@ class SwiftRecon(object):
                 print("[%s] - No hosts returned valid data." % k)
         print("=" * 79)
 
-    def replication_check(self, hosts):
-        """
-        Obtain and print replication statistics
+    def _calculate_least_and_most_recent(self, url_time_data):
+        """calculate and print the least and most recent urls
 
-        :param hosts: set of hosts to check. in the format of:
-            set([('127.0.0.1', 6220), ('127.0.0.2', 6230)])
+        Given a list of url and time tuples, calculate the most and least
+        recent timings and print them out.
+        :param url_time_data: list of url and time tuples: [(url, time_), ..]
""" - stats = {'replication_time': [], 'failure': [], 'success': [], - 'attempted': []} - recon = Scout("replication/%s" % self.server_type, self.verbose, - self.suppress_errors, self.timeout) - print("[%s] Checking on replication" % self._ptime()) least_recent_time = 9999999999 least_recent_url = None most_recent_time = 0 most_recent_url = None - for url, response, status, ts_start, ts_end in self.pool.imap( - recon.scout, hosts): - if status == 200: - stats['replication_time'].append( - response.get('replication_time', - response.get('object_replication_time', 0))) - repl_stats = response.get('replication_stats') - if repl_stats: - for stat_key in ['attempted', 'failure', 'success']: - stats[stat_key].append(repl_stats.get(stat_key)) - last = response.get('replication_last', - response.get('object_replication_last', 0)) - if last is None: - continue - if last < least_recent_time: - least_recent_time = last - least_recent_url = url - if last > most_recent_time: - most_recent_time = last - most_recent_url = url - for k in stats: - if stats[k]: - if k != 'replication_time': - computed = self._gen_stats(stats[k], - name='replication_%s' % k) - else: - computed = self._gen_stats(stats[k], name=k) - if computed['reported'] > 0: - self._print_stats(computed) - else: - print("[%s] - No hosts returned valid data." % k) - else: - print("[%s] - No hosts returned valid data." % k) + + for url, last in url_time_data: + if last is None: + continue + if last < least_recent_time: + least_recent_time = last + least_recent_url = url + if last > most_recent_time: + most_recent_time = last + most_recent_url = url + if least_recent_url is not None: host = urlparse(least_recent_url).netloc if not least_recent_time: @@ -497,6 +470,79 @@ class SwiftRecon(object): print('Most recent completion was %s (%d %s ago) by %s.' % ( self._ptime(most_recent_time), elapsed, elapsed_unit, host)) + + def reconstruction_check(self, hosts): + """ + Obtain and print reconstructon statistics + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) + """ + stats = [] + last_stats = [] + recon = Scout("reconstruction/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print("[%s] Checking on reconstructors" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + stats.append(response.get('object_reconstruction_time')) + last = response.get('object_reconstruction_last', 0) + last_stats.append((url, last)) + if stats: + computed = self._gen_stats(stats, + name='object_reconstruction_time') + if computed['reported'] > 0: + self._print_stats(computed) + else: + print("[object_reconstruction_time] - No hosts returned " + "valid data.") + else: + print("[object_reconstruction_time] - No hosts returned " + "valid data.") + self._calculate_least_and_most_recent(last_stats) + print("=" * 79) + + def replication_check(self, hosts): + """ + Obtain and print replication statistics + + :param hosts: set of hosts to check. 
in the format of: + set([('127.0.0.1', 6220), ('127.0.0.2', 6230)]) + """ + stats = {'replication_time': [], 'failure': [], 'success': [], + 'attempted': []} + last_stats = [] + recon = Scout("replication/%s" % self.server_type, self.verbose, + self.suppress_errors, self.timeout) + print("[%s] Checking on replication" % self._ptime()) + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status == 200: + stats['replication_time'].append( + response.get('replication_time', + response.get('object_replication_time', 0))) + repl_stats = response.get('replication_stats') + if repl_stats: + for stat_key in ['attempted', 'failure', 'success']: + stats[stat_key].append(repl_stats.get(stat_key)) + last = response.get('replication_last', + response.get('object_replication_last', 0)) + last_stats.append((url, last)) + for k in stats: + if stats[k]: + if k != 'replication_time': + computed = self._gen_stats(stats[k], + name='replication_%s' % k) + else: + computed = self._gen_stats(stats[k], name=k) + if computed['reported'] > 0: + self._print_stats(computed) + else: + print("[%s] - No hosts returned valid data." % k) + else: + print("[%s] - No hosts returned valid data." % k) + self._calculate_least_and_most_recent(last_stats) print("=" * 79) def updater_check(self, hosts): @@ -1036,7 +1082,7 @@ class SwiftRecon(object): print("=" * 79) usage = ''' usage: %prog [ []] - [-v] [--suppress] [-a] [-r] [-u] [-d] + [-v] [--suppress] [-a] [-r] [-u] [-d] [-R] [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] [--human-readable] @@ -1055,6 +1101,8 @@ class SwiftRecon(object): help="Get async stats") args.add_option('--replication', '-r', action="store_true", help="Get replication stats") + args.add_option('--reconstruction', '-R', action="store_true", + help="Get reconstruction stats") args.add_option('--auditor', action="store_true", help="Get auditor stats") args.add_option('--updater', action="store_true", @@ -1094,7 +1142,7 @@ class SwiftRecon(object): help='Also show the lowest COUNT entries in rank \ order.') args.add_option('--all', action="store_true", - help="Perform all checks. Equal to \t\t\t-arudlqT " + help="Perform all checks. 
Equal to \t\t\t-arRudlqT " "--md5 --sockstat --auditor --updater --expirer " "--driveaudit --validate-servers --swift-versions") args.add_option('--region', type="int", @@ -1152,6 +1200,7 @@ class SwiftRecon(object): self.object_auditor_check(hosts) self.updater_check(hosts) self.expirer_check(hosts) + self.reconstruction_check(hosts) elif self.server_type == 'container': self.auditor_check(hosts) self.updater_check(hosts) @@ -1209,6 +1258,13 @@ class SwiftRecon(object): print("Error: Can't check sharding on non container " "servers.") print("=" * 79) + if options.reconstruction: + if self.server_type == 'object': + self.reconstruction_check(hosts) + else: + print("Error: Can't check reconstruction stats on " + "non object servers.") + print("=" * 79) if options.validate_servers: self.server_type_check(hosts) if options.loadstats: diff --git a/swift/common/middleware/recon.py b/swift/common/middleware/recon.py index e1b5d7e579..2b58179878 100644 --- a/swift/common/middleware/recon.py +++ b/swift/common/middleware/recon.py @@ -169,6 +169,13 @@ class ReconMiddleware(object): else: return None + def get_reconstruction_info(self): + """get reconstruction info""" + reconstruction_list = ['object_reconstruction_last', + 'object_reconstruction_time'] + return self._from_recon_cache(reconstruction_list, + self.object_recon_cache) + def get_device_info(self): """get devices""" try: @@ -399,6 +406,8 @@ class ReconMiddleware(object): content = self.get_sharding_info() elif rcheck == "relinker": content = self.get_relinker_info() + elif rcheck == "reconstruction" and rtype == 'object': + content = self.get_reconstruction_info() else: content = "Invalid path: %s" % req.path return Response(request=req, status="404 Not Found", diff --git a/test/unit/cli/test_recon.py b/test/unit/cli/test_recon.py index b5290ad919..421e68aeb1 100644 --- a/test/unit/cli/test_recon.py +++ b/test/unit/cli/test_recon.py @@ -648,6 +648,45 @@ aliases = %s self.assertRaises(SystemExit, recon.main) self.assertIn('Invalid Storage Policy', stdout.getvalue()) + def test_calculate_least_and_most_recent(self): + now = 1517894596 + + def test_least_most(data, expected): + stdout = StringIO() + with mock.patch('sys.stdout', new=stdout), \ + mock.patch('time.time', return_value=now): + self.recon_instance._calculate_least_and_most_recent(data) + self.assertEqual(stdout.getvalue(), expected) + + # first the empty set + test_least_most([], '') + expected = 'Oldest completion was NEVER by my.url.\n' + test_least_most([('http://my.url/is/awesome', 0)], expected) + + expected = ( + 'Oldest completion was 2018-02-06 05:23:11 (5 seconds ago) ' + 'by my.url.\n' + 'Most recent completion was 2018-02-06 05:23:11 (5 seconds ago) ' + 'by my.url.\n') + data = [('http://my.url/is/awesome', now - 5)] + test_least_most(data, expected) + + expected = ( + 'Oldest completion was 2018-02-06 05:06:36 (16 minutes ago) ' + 'by a.diff.url.\n' + 'Most recent completion was 2018-02-06 05:23:11 (5 seconds ago) ' + 'by my.url.\n') + data.append(('http://a.diff.url/not/as/awesome', now - 1000)) + test_least_most(data, expected) + + # now through larger sets at it + for extra in (5, 10, 40, 100): + data.extend([ + ('http://extra.%d.url/blah' % (extra + r), + now - random.randint(6, 999)) for r in range(extra)]) + random.shuffle(data) + test_least_most(data, expected) + class TestReconCommands(unittest.TestCase): def setUp(self): @@ -1067,6 +1106,41 @@ class TestReconCommands(unittest.TestCase): cli.sharding_check([('127.0.0.1', 6011), ('127.0.0.1', 6021)]) 
mock_print.assert_has_calls(default_calls, any_order=True) + @ mock.patch('six.moves.builtins.print') + @ mock.patch('time.time') + def test_reconstruction_check(self, mock_now, mock_print): + now = 1430000000.0 + + def dummy_request(*args, **kwargs): + return [ + ('http://127.0.0.1:6011/recon/reconstruction', + {"object_reconstruction_last": now, + "object_reconstruction_time": 42}, + 200, 0, 0), + ('http://127.0.0.1:6021/recon/reconstruction', + {"object_reconstruction_last": now, + "object_reconstruction_time": 23}, + 200, 0, 0)] + + cli = recon.SwiftRecon() + cli.pool.imap = dummy_request + + default_calls = [ + mock.call('[object_reconstruction_time] low: 23, high: 42, ' + 'avg: 32.5, total: 65, Failed: 0.0%, no_result: 0, ' + 'reported: 2'), + mock.call('Oldest completion was 2015-04-25 22:13:20 ' + + '(42 seconds ago) by 127.0.0.1:6011.'), + mock.call('Most recent completion was 2015-04-25 22:13:20 ' + + '(42 seconds ago) by 127.0.0.1:6011.'), + ] + + mock_now.return_value = now + 42 + cli.reconstruction_check([('127.0.0.1', 6011), ('127.0.0.1', 6021)]) + # We need any_order=True because the order of calls depends on the dict + # that is returned from the recon middleware, thus can't rely on it + mock_print.assert_has_calls(default_calls, any_order=True) + @mock.patch('six.moves.builtins.print') @mock.patch('time.time') def test_load_check(self, mock_now, mock_print): @@ -1077,16 +1151,11 @@ class TestReconCommands(unittest.TestCase): ('http://127.0.0.1:6010/recon/load', {"1m": 0.2, "5m": 0.4, "15m": 0.25, "processes": 10000, "tasks": "1/128"}, - 200, - 0, - 0), + 200, 0, 0), ('http://127.0.0.1:6020/recon/load', {"1m": 0.4, "5m": 0.8, "15m": 0.75, "processes": 9000, "tasks": "1/200"}, - 200, - 0, - 0), - ] + 200, 0, 0)] cli = recon.SwiftRecon() cli.pool.imap = dummy_request diff --git a/test/unit/common/middleware/test_recon.py b/test/unit/common/middleware/test_recon.py index 10ef95dd4c..4b4656d26f 100644 --- a/test/unit/common/middleware/test_recon.py +++ b/test/unit/common/middleware/test_recon.py @@ -168,6 +168,9 @@ class FakeRecon(object): def fake_relinker(self): return {"relinktest": "1"} + def fake_reconstruction(self): + return {'reconstructiontest': "1"} + def fake_updater(self, recon_type): self.fake_updater_rtype = recon_type return {'updatertest': "1"} @@ -807,6 +810,21 @@ class TestReconSuccess(TestCase): rv = self.app.get_replication_info('unrecognized_recon_type') self.assertIsNone(rv) + def test_get_reconstruction(self): + from_cache_response = { + "object_reconstruction_time": 0.2615511417388916, + "object_reconstruction_last": 1357969645.25} + self.fakecache.fakeout_calls = [] + self.fakecache.fakeout = from_cache_response + rv = self.app.get_reconstruction_info() + self.assertEqual(self.fakecache.fakeout_calls, + [((['object_reconstruction_last', + 'object_reconstruction_time'], + '/var/cache/swift/object.recon'), {})]) + self.assertEqual(rv, { + "object_reconstruction_time": 0.2615511417388916, + "object_reconstruction_last": 1357969645.25}) + def test_get_updater_info_container(self): from_cache_response = {"container_updater_sweep": 18.476239919662476} self.fakecache.fakeout_calls = [] @@ -1333,6 +1351,7 @@ class TestReconMiddleware(unittest.TestCase): self.app.get_async_info = self.frecon.fake_async self.app.get_device_info = self.frecon.fake_get_device_info self.app.get_replication_info = self.frecon.fake_replication + self.app.get_reconstruction_info = self.frecon.fake_reconstruction self.app.get_auditor_info = self.frecon.fake_auditor 
self.app.get_updater_info = self.frecon.fake_updater self.app.get_expirer_info = self.frecon.fake_expirer @@ -1380,6 +1399,13 @@ class TestReconMiddleware(unittest.TestCase): resp = self.app(req.environ, start_response) self.assertEqual(resp, get_device_resp) + def test_reconstruction_info(self): + get_reconstruction_resp = [b'{"reconstructiontest": "1"}'] + req = Request.blank('/recon/reconstruction/object', + environ={'REQUEST_METHOD': 'GET'}) + resp = self.app(req.environ, start_response) + self.assertEqual(resp, get_reconstruction_resp) + def test_recon_get_replication_notype(self): get_replication_resp = [b'{"replicationtest": "1"}'] req = Request.blank('/recon/replication',