Add and pipe reconstructor stats through recon

This patch plumbs the object-reconstructor stats that are dropped
into recon cache out through the middleware and swift-recon tool.

This adds a '/recon/reconstruction/object' endpoint to the middleware.
As such, the swift-recon tool has grown a '-R' or '--reconstruction'
option to access this data from each node.

Plus some tests and documentation updates.

Change-Id: I98582732ca5ccb2e7d2369b53abf9aa8c0ede00c
This commit is contained in:
Matthew Oliver 2018-02-06 06:19:25 +00:00 committed by Pete Zaitcev
parent a8f1512863
commit 7a105b5ef0
6 changed files with 216 additions and 53 deletions

View File

@ -58,6 +58,8 @@ Get updater stats
Get expirer stats
.IP "\fB-r, --replication\fR"
Get replication stats
.IP "\fB-R, --reconstruction\fR"
Get reconstruction stats
.IP "\fB-u, --unmounted\fR"
Check cluster for unmounted devices
.IP "\fB-d, --diskusage\fR"

View File

@ -764,7 +764,7 @@ This information can also be queried via the swift-recon command line utility::
fhines@ubuntu:~$ swift-recon -h
Usage:
usage: swift-recon <server_type> [-v] [--suppress] [-a] [-r] [-u] [-d]
[-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
[-R] [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
<server_type> account|container|object
Defaults to object server.
@ -778,6 +778,7 @@ This information can also be queried via the swift-recon command line utility::
--suppress Suppress most connection related errors
-a, --async Get async stats
-r, --replication Get replication stats
-R, --reconstruction Get reconstruction stats
--auditor Get auditor stats
--updater Get updater stats
--expirer Get expirer stats

View File

@ -431,55 +431,28 @@ class SwiftRecon(object):
print("[%s] - No hosts returned valid data." % k)
print("=" * 79)
def replication_check(self, hosts):
"""
Obtain and print replication statistics
def _calculate_least_and_most_recent(self, url_time_data):
"""calculate and print the least and most recent urls
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6220), ('127.0.0.2', 6230)])
Given a list of url and time tuples, calculate the most and least
recent timings and print them out.
:param url_time_data: list of url and time tuples: [(url, time_), ..]
"""
stats = {'replication_time': [], 'failure': [], 'success': [],
'attempted': []}
recon = Scout("replication/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print("[%s] Checking on replication" % self._ptime())
least_recent_time = 9999999999
least_recent_url = None
most_recent_time = 0
most_recent_url = None
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
stats['replication_time'].append(
response.get('replication_time',
response.get('object_replication_time', 0)))
repl_stats = response.get('replication_stats')
if repl_stats:
for stat_key in ['attempted', 'failure', 'success']:
stats[stat_key].append(repl_stats.get(stat_key))
last = response.get('replication_last',
response.get('object_replication_last', 0))
if last is None:
continue
if last < least_recent_time:
least_recent_time = last
least_recent_url = url
if last > most_recent_time:
most_recent_time = last
most_recent_url = url
for k in stats:
if stats[k]:
if k != 'replication_time':
computed = self._gen_stats(stats[k],
name='replication_%s' % k)
else:
computed = self._gen_stats(stats[k], name=k)
if computed['reported'] > 0:
self._print_stats(computed)
else:
print("[%s] - No hosts returned valid data." % k)
else:
print("[%s] - No hosts returned valid data." % k)
for url, last in url_time_data:
if last is None:
continue
if last < least_recent_time:
least_recent_time = last
least_recent_url = url
if last > most_recent_time:
most_recent_time = last
most_recent_url = url
if least_recent_url is not None:
host = urlparse(least_recent_url).netloc
if not least_recent_time:
@ -497,6 +470,79 @@ class SwiftRecon(object):
print('Most recent completion was %s (%d %s ago) by %s.' % (
self._ptime(most_recent_time),
elapsed, elapsed_unit, host))
def reconstruction_check(self, hosts):
    """
    Obtain and print reconstruction statistics

    Scouts the ``reconstruction/<server_type>`` recon path on every host,
    prints the generated stats for the collected
    ``object_reconstruction_time`` values and then reports the least and
    most recent ``object_reconstruction_last`` completions.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
    """
    stat_name = 'object_reconstruction_time'
    durations = []
    completions = []
    scout = Scout("reconstruction/%s" % self.server_type, self.verbose,
                  self.suppress_errors, self.timeout)
    print("[%s] Checking on reconstructors" % self._ptime())
    for url, response, status, _ts_start, _ts_end in self.pool.imap(
            scout.scout, hosts):
        if status != 200:
            # only 200 responses contribute to the collected stats
            continue
        durations.append(response.get(stat_name))
        completions.append(
            (url, response.get('object_reconstruction_last', 0)))

    have_data = False
    if durations:
        summary = self._gen_stats(durations, name=stat_name)
        have_data = summary['reported'] > 0
    if have_data:
        self._print_stats(summary)
    else:
        print("[object_reconstruction_time] - No hosts returned "
              "valid data.")
    self._calculate_least_and_most_recent(completions)
    print("=" * 79)
def replication_check(self, hosts):
    """
    Obtain and print replication statistics

    Scouts the ``replication/<server_type>`` recon path on every host,
    prints the generated stats for the replication time and the
    attempted/failure/success counters, then reports the least and most
    recent completion times.

    :param hosts: set of hosts to check. in the format of:
        set([('127.0.0.1', 6220), ('127.0.0.2', 6230)])
    """
    stats = {'replication_time': [], 'failure': [], 'success': [],
             'attempted': []}
    completions = []
    scout = Scout("replication/%s" % self.server_type, self.verbose,
                  self.suppress_errors, self.timeout)
    print("[%s] Checking on replication" % self._ptime())
    for url, response, status, _ts_start, _ts_end in self.pool.imap(
            scout.scout, hosts):
        if status != 200:
            # only 200 responses contribute to the collected stats
            continue
        # fall back to the object_-prefixed key when the generic key
        # is absent from the response
        duration = response.get(
            'replication_time',
            response.get('object_replication_time', 0))
        stats['replication_time'].append(duration)
        repl_stats = response.get('replication_stats')
        if repl_stats:
            for stat_key in ('attempted', 'failure', 'success'):
                stats[stat_key].append(repl_stats.get(stat_key))
        last = response.get('replication_last',
                            response.get('object_replication_last', 0))
        completions.append((url, last))

    for key, values in stats.items():
        have_data = False
        if values:
            if key == 'replication_time':
                label = key
            else:
                label = 'replication_%s' % key
            summary = self._gen_stats(values, name=label)
            have_data = summary['reported'] > 0
        if have_data:
            self._print_stats(summary)
        else:
            print("[%s] - No hosts returned valid data." % key)
    self._calculate_least_and_most_recent(completions)
    print("=" * 79)
def updater_check(self, hosts):
@ -1036,7 +1082,7 @@ class SwiftRecon(object):
print("=" * 79)
usage = '''
usage: %prog <server_type> [<server_type> [<server_type>]]
[-v] [--suppress] [-a] [-r] [-u] [-d]
[-v] [--suppress] [-a] [-r] [-u] [-d] [-R]
[-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
[--human-readable]
@ -1055,6 +1101,8 @@ class SwiftRecon(object):
help="Get async stats")
args.add_option('--replication', '-r', action="store_true",
help="Get replication stats")
args.add_option('--reconstruction', '-R', action="store_true",
help="Get reconstruction stats")
args.add_option('--auditor', action="store_true",
help="Get auditor stats")
args.add_option('--updater', action="store_true",
@ -1094,7 +1142,7 @@ class SwiftRecon(object):
help='Also show the lowest COUNT entries in rank \
order.')
args.add_option('--all', action="store_true",
help="Perform all checks. Equal to \t\t\t-arudlqT "
help="Perform all checks. Equal to \t\t\t-arRudlqT "
"--md5 --sockstat --auditor --updater --expirer "
"--driveaudit --validate-servers --swift-versions")
args.add_option('--region', type="int",
@ -1152,6 +1200,7 @@ class SwiftRecon(object):
self.object_auditor_check(hosts)
self.updater_check(hosts)
self.expirer_check(hosts)
self.reconstruction_check(hosts)
elif self.server_type == 'container':
self.auditor_check(hosts)
self.updater_check(hosts)
@ -1209,6 +1258,13 @@ class SwiftRecon(object):
print("Error: Can't check sharding on non container "
"servers.")
print("=" * 79)
if options.reconstruction:
if self.server_type == 'object':
self.reconstruction_check(hosts)
else:
print("Error: Can't check reconstruction stats on "
"non object servers.")
print("=" * 79)
if options.validate_servers:
self.server_type_check(hosts)
if options.loadstats:

View File

@ -169,6 +169,13 @@ class ReconMiddleware(object):
else:
return None
def get_reconstruction_info(self):
    """get reconstruction info

    Looks up the reconstructor's last-completion and duration keys in
    the object recon cache via ``_from_recon_cache``.
    """
    wanted_keys = [
        'object_reconstruction_last',
        'object_reconstruction_time',
    ]
    return self._from_recon_cache(wanted_keys, self.object_recon_cache)
def get_device_info(self):
"""get devices"""
try:
@ -399,6 +406,8 @@ class ReconMiddleware(object):
content = self.get_sharding_info()
elif rcheck == "relinker":
content = self.get_relinker_info()
elif rcheck == "reconstruction" and rtype == 'object':
content = self.get_reconstruction_info()
else:
content = "Invalid path: %s" % req.path
return Response(request=req, status="404 Not Found",

View File

@ -648,6 +648,45 @@ aliases = %s
self.assertRaises(SystemExit, recon.main)
self.assertIn('Invalid Storage Policy', stdout.getvalue())
def test_calculate_least_and_most_recent(self):
    # Fixed "current" time so the printed elapsed times are stable.
    now = 1517894596

    def check_output(samples, expected_text):
        # Capture everything printed while time.time() is pinned to `now`.
        captured = StringIO()
        with mock.patch('sys.stdout', new=captured), \
                mock.patch('time.time', return_value=now):
            self.recon_instance._calculate_least_and_most_recent(samples)
        self.assertEqual(captured.getvalue(), expected_text)

    # first the empty set
    check_output([], '')

    # a completion time of 0 is reported as NEVER
    check_output([('http://my.url/is/awesome', 0)],
                 'Oldest completion was NEVER by my.url.\n')

    # a single host is both the oldest and the most recent
    data = [('http://my.url/is/awesome', now - 5)]
    expected = (
        'Oldest completion was 2018-02-06 05:23:11 (5 seconds ago) '
        'by my.url.\n'
        'Most recent completion was 2018-02-06 05:23:11 (5 seconds ago) '
        'by my.url.\n')
    check_output(data, expected)

    # a second, older host takes over the "oldest" slot
    data.append(('http://a.diff.url/not/as/awesome', now - 1000))
    expected = (
        'Oldest completion was 2018-02-06 05:06:36 (16 minutes ago) '
        'by a.diff.url.\n'
        'Most recent completion was 2018-02-06 05:23:11 (5 seconds ago) '
        'by my.url.\n')
    check_output(data, expected)

    # now throw larger sets at it; the extra entries all fall strictly
    # between the two extremes, so the expected output does not change
    for extra in (5, 10, 40, 100):
        for r in range(extra):
            data.append(('http://extra.%d.url/blah' % (extra + r),
                         now - random.randint(6, 999)))
        random.shuffle(data)
        check_output(data, expected)
class TestReconCommands(unittest.TestCase):
def setUp(self):
@ -1067,6 +1106,41 @@ class TestReconCommands(unittest.TestCase):
cli.sharding_check([('127.0.0.1', 6011), ('127.0.0.1', 6021)])
mock_print.assert_has_calls(default_calls, any_order=True)
@mock.patch('six.moves.builtins.print')
@mock.patch('time.time')
def test_reconstruction_check(self, mock_now, mock_print):
    now = 1430000000.0

    def dummy_request(*args, **kwargs):
        # Stand-in for pool.imap: one successful scout result per host.
        return [
            ('http://127.0.0.1:%d/recon/reconstruction' % port,
             {"object_reconstruction_last": now,
              "object_reconstruction_time": duration},
             200, 0, 0)
            for port, duration in ((6011, 42), (6021, 23))]

    cli = recon.SwiftRecon()
    cli.pool.imap = dummy_request

    stats_line = ('[object_reconstruction_time] low: 23, high: 42, '
                  'avg: 32.5, total: 65, Failed: 0.0%, no_result: 0, '
                  'reported: 2')
    oldest_line = ('Oldest completion was 2015-04-25 22:13:20 ' +
                   '(42 seconds ago) by 127.0.0.1:6011.')
    newest_line = ('Most recent completion was 2015-04-25 22:13:20 ' +
                   '(42 seconds ago) by 127.0.0.1:6011.')
    default_calls = [
        mock.call(stats_line),
        mock.call(oldest_line),
        mock.call(newest_line),
    ]

    mock_now.return_value = now + 42
    cli.reconstruction_check([('127.0.0.1', 6011), ('127.0.0.1', 6021)])
    # We need any_order=True because the order of calls depends on the dict
    # that is returned from the recon middleware, thus can't rely on it
    mock_print.assert_has_calls(default_calls, any_order=True)
@mock.patch('six.moves.builtins.print')
@mock.patch('time.time')
def test_load_check(self, mock_now, mock_print):
@ -1077,16 +1151,11 @@ class TestReconCommands(unittest.TestCase):
('http://127.0.0.1:6010/recon/load',
{"1m": 0.2, "5m": 0.4, "15m": 0.25,
"processes": 10000, "tasks": "1/128"},
200,
0,
0),
200, 0, 0),
('http://127.0.0.1:6020/recon/load',
{"1m": 0.4, "5m": 0.8, "15m": 0.75,
"processes": 9000, "tasks": "1/200"},
200,
0,
0),
]
200, 0, 0)]
cli = recon.SwiftRecon()
cli.pool.imap = dummy_request

View File

@ -168,6 +168,9 @@ class FakeRecon(object):
def fake_relinker(self):
    """Return a canned relinker payload."""
    payload = {"relinktest": "1"}
    return payload
def fake_reconstruction(self):
    """Return a canned reconstruction payload."""
    payload = {'reconstructiontest': "1"}
    return payload
def fake_updater(self, recon_type):
    """Record the requested recon type and return a canned payload.

    :param recon_type: value stored on ``self.fake_updater_rtype``
    """
    payload = {'updatertest': "1"}
    self.fake_updater_rtype = recon_type
    return payload
@ -807,6 +810,21 @@ class TestReconSuccess(TestCase):
rv = self.app.get_replication_info('unrecognized_recon_type')
self.assertIsNone(rv)
def test_get_reconstruction(self):
    # The fake cache hands back this payload and records how it was
    # called, so both the lookup arguments and the result can be checked.
    cached = {
        "object_reconstruction_time": 0.2615511417388916,
        "object_reconstruction_last": 1357969645.25}
    self.fakecache.fakeout_calls = []
    self.fakecache.fakeout = cached

    rv = self.app.get_reconstruction_info()

    expected_call = (
        (['object_reconstruction_last', 'object_reconstruction_time'],
         '/var/cache/swift/object.recon'),
        {})
    self.assertEqual(self.fakecache.fakeout_calls, [expected_call])
    self.assertEqual(rv, {
        "object_reconstruction_time": 0.2615511417388916,
        "object_reconstruction_last": 1357969645.25})
def test_get_updater_info_container(self):
from_cache_response = {"container_updater_sweep": 18.476239919662476}
self.fakecache.fakeout_calls = []
@ -1333,6 +1351,7 @@ class TestReconMiddleware(unittest.TestCase):
self.app.get_async_info = self.frecon.fake_async
self.app.get_device_info = self.frecon.fake_get_device_info
self.app.get_replication_info = self.frecon.fake_replication
self.app.get_reconstruction_info = self.frecon.fake_reconstruction
self.app.get_auditor_info = self.frecon.fake_auditor
self.app.get_updater_info = self.frecon.fake_updater
self.app.get_expirer_info = self.frecon.fake_expirer
@ -1380,6 +1399,13 @@ class TestReconMiddleware(unittest.TestCase):
resp = self.app(req.environ, start_response)
self.assertEqual(resp, get_device_resp)
def test_reconstruction_info(self):
    # A GET on the object reconstruction recon path should return the
    # serialized payload.
    req = Request.blank('/recon/reconstruction/object',
                        environ={'REQUEST_METHOD': 'GET'})
    resp = self.app(req.environ, start_response)
    expected_body = [b'{"reconstructiontest": "1"}']
    self.assertEqual(resp, expected_body)
def test_recon_get_replication_notype(self):
get_replication_resp = [b'{"replicationtest": "1"}']
req = Request.blank('/recon/replication',