Merge "Time synchronization check in recon."

This commit is contained in:
Jenkins 2015-08-18 01:21:22 +00:00 committed by Gerrit Code Review
commit 617c6b0107
7 changed files with 219 additions and 37 deletions

View File

@ -156,6 +156,7 @@ Maru Newby (mnewby@internap.com)
Newptone (xingchao@unitedstack.com)
Colin Nicholson (colin.nicholson@iomart.com)
Zhenguo Niu (zhenguo@unitedstack.com)
Ondrej Novy (ondrej.novy@firma.seznam.cz)
Timothy Okwii (tokwii@cisco.com)
Matthew Oliver (matt@oliver.net.au)
Hisashi Osanai (osanai.hisashi@jp.fujitsu.com)

View File

@ -25,7 +25,7 @@
.SH SYNOPSIS
.LP
.B swift-recon
\ <server_type> [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
\ <server_type> [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
.SH DESCRIPTION
.PP
@ -80,8 +80,10 @@ Get md5sum of servers ring and compare to local copy
Get cluster socket usage stats
.IP "\fB--driveaudit\fR"
Get drive audit error stats
.IP "\fB-T, --time\fR"
Check time synchronization
.IP "\fB--all\fR"
Perform all checks. Equivalent to \-arudlq \-\-md5
Perform all checks. Equivalent to \-arudlqT \-\-md5
.IP "\fB--region=REGION\fR"
Only query servers in specified region
.IP "\fB-z ZONE, --zone=ZONE\fR"

View File

@ -560,7 +560,7 @@ This information can also be queried via the swift-recon command line utility::
fhines@ubuntu:~$ swift-recon -h
Usage:
usage: swift-recon <server_type> [-v] [--suppress] [-a] [-r] [-u] [-d]
[-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
[-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
<server_type> account|container|object
Defaults to object server.
@ -583,7 +583,8 @@ This information can also be queried via the swift-recon command line utility::
-q, --quarantined Get cluster quarantine stats
--md5 Get md5sum of servers ring and compare to local copy
--sockstat Get cluster socket usage stats
--all Perform all checks. Equal to -arudlq --md5 --sockstat
-T, --time Check time synchronization
--all Perform all checks. Equal to -arudlqT --md5 --sockstat
-z ZONE, --zone=ZONE Only query servers in specified zone
-t SECONDS, --timeout=SECONDS
Time to wait for a response from a server

View File

@ -100,11 +100,14 @@ class Scout(object):
Obtain telemetry from a host running the swift recon middleware.
:param host: host to check
:returns: tuple of (recon url used, response body, and status)
:returns: tuple of (recon url used, response body, status, time start
and time end)
"""
base_url = "http://%s:%s/recon/" % (host[0], host[1])
ts_start = time.time()
url, content, status = self.scout_host(base_url, self.recon_type)
return url, content, status
ts_end = time.time()
return url, content, status, ts_start, ts_end
def scout_server_type(self, host):
"""
@ -253,7 +256,8 @@ class SwiftRecon(object):
if self.verbose:
for ring_file, ring_sum in rings.items():
print("-> On disk %s md5sum: %s" % (ring_file, ring_sum))
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status != 200:
errors = errors + 1
continue
@ -291,7 +295,8 @@ class SwiftRecon(object):
printfn("[%s] Checking swift.conf md5sum" % self._ptime())
if self.verbose:
printfn("-> On disk swift.conf md5sum: %s" % (conf_sum,))
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
if response[SWIFT_CONF_FILE] != conf_sum:
printfn("!! %s (%s) doesn't match on disk md5sum" %
@ -317,7 +322,8 @@ class SwiftRecon(object):
recon = Scout("async", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking async pendings" % self._ptime())
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
scan[url] = response['async_pending']
stats = self._gen_stats(scan.values(), 'async_pending')
@ -338,7 +344,8 @@ class SwiftRecon(object):
recon = Scout("driveaudit", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking drive-audit errors" % self._ptime())
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
scan[url] = response['drive_audit_errors']
stats = self._gen_stats(scan.values(), 'drive_audit_errors')
@ -361,7 +368,8 @@ class SwiftRecon(object):
self.timeout)
print("[%s] Getting unmounted drives from %s hosts..." %
(self._ptime(), len(hosts)))
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
unmounted[url] = []
errors[url] = []
@ -414,7 +422,8 @@ class SwiftRecon(object):
recon = Scout("expirer/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print("[%s] Checking on expirers" % self._ptime())
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
stats['object_expiration_pass'].append(
response.get('object_expiration_pass'))
@ -447,7 +456,8 @@ class SwiftRecon(object):
least_recent_url = None
most_recent_time = 0
most_recent_url = None
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
stats['replication_time'].append(
response.get('replication_time'))
@ -511,7 +521,8 @@ class SwiftRecon(object):
least_recent_url = None
most_recent_time = 0
most_recent_url = None
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
stats[url] = response['object_replication_time']
last = response.get('object_replication_last', 0)
@ -562,7 +573,8 @@ class SwiftRecon(object):
recon = Scout("updater/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print("[%s] Checking updater times" % self._ptime())
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
if response['%s_updater_sweep' % self.server_type]:
stats.append(response['%s_updater_sweep' %
@ -592,7 +604,8 @@ class SwiftRecon(object):
recon = Scout("auditor/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print("[%s] Checking auditor stats" % self._ptime())
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
scan[url] = response
if len(scan) < 1:
@ -665,7 +678,8 @@ class SwiftRecon(object):
recon = Scout("auditor/object", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking auditor stats " % self._ptime())
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
if response['object_auditor_stats_ALL']:
all_scan[url] = response['object_auditor_stats_ALL']
@ -736,7 +750,8 @@ class SwiftRecon(object):
recon = Scout("load", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking load averages" % self._ptime())
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
load1[url] = response['1m']
load5[url] = response['5m']
@ -765,7 +780,8 @@ class SwiftRecon(object):
recon = Scout("quarantined", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking quarantine" % self._ptime())
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
objq[url] = response['objects']
conq[url] = response['containers']
@ -799,7 +815,8 @@ class SwiftRecon(object):
recon = Scout("sockstat", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking socket usage" % self._ptime())
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
inuse4[url] = response['tcp_in_use']
mem[url] = response['tcp_mem_allocated_bytes']
@ -835,7 +852,8 @@ class SwiftRecon(object):
recon = Scout("diskusage", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking disk usage now" % self._ptime())
for url, response, status in self.pool.imap(recon.scout, hosts):
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
hostusage = []
for entry in response:
@ -915,6 +933,47 @@ class SwiftRecon(object):
host = urlparse(url).netloc.split(':')[0]
print('%.02f%% %s' % (used, '%-15s %s' % (host, device)))
def time_check(self, hosts):
"""
Check a time synchronization of hosts with current time
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
matches = 0
errors = 0
recon = Scout("time", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking time-sync" % self._ptime())
for url, ts_remote, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status != 200:
errors = errors + 1
continue
if (ts_remote < ts_start or ts_remote > ts_end):
diff = abs(ts_end - ts_remote)
ts_end_f = time.strftime(
"%Y-%m-%d %H:%M:%S",
time.localtime(ts_end))
ts_remote_f = time.strftime(
"%Y-%m-%d %H:%M:%S",
time.localtime(ts_remote))
print("!! %s current time is %s, but remote is %s, "
"differs by %.2f sec" % (
url,
ts_end_f,
ts_remote_f,
diff))
continue
matches += 1
if self.verbose:
print("-> %s matches." % url)
print("%s/%s hosts matched, %s error[s] while checking hosts." % (
matches, len(hosts), errors))
print("=" * 79)
def main(self):
"""
Retrieve and report cluster info from hosts running recon middleware.
@ -922,7 +981,7 @@ class SwiftRecon(object):
print("=" * 79)
usage = '''
usage: %prog <server_type> [-v] [--suppress] [-a] [-r] [-u] [-d]
[-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
[-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
[--human-readable]
<server_type>\taccount|container|object
@ -964,13 +1023,15 @@ class SwiftRecon(object):
help="Get cluster socket usage stats")
args.add_option('--driveaudit', action="store_true",
help="Get drive audit error stats")
args.add_option('--time', '-T', action="store_true",
help="Check time synchronization")
args.add_option('--top', type='int', metavar='COUNT', default=0,
help='Also show the top COUNT entries in rank order.')
args.add_option('--lowest', type='int', metavar='COUNT', default=0,
help='Also show the lowest COUNT entries in rank \
order.')
args.add_option('--all', action="store_true",
help="Perform all checks. Equal to \t\t\t-arudlq "
help="Perform all checks. Equal to \t\t\t-arudlqT "
"--md5 --sockstat --auditor --updater --expirer")
args.add_option('--region', type="int",
help="Only query servers in specified region")
@ -1031,6 +1092,7 @@ class SwiftRecon(object):
self.socket_usage(hosts)
self.server_type_check(hosts)
self.driveaudit_check(hosts)
self.time_check(hosts)
else:
if options.async:
if self.server_type == 'object':
@ -1075,6 +1137,8 @@ class SwiftRecon(object):
self.socket_usage(hosts)
if options.driveaudit:
self.driveaudit_check(hosts)
if options.time:
self.time_check(hosts)
def main():

View File

@ -15,6 +15,7 @@
import errno
import os
import time
from swift import gettext_ as _
from swift import __version__ as swiftver
@ -328,6 +329,11 @@ class ReconMiddleware(object):
raise
return sockstat
def get_time(self):
"""get current time"""
return time.time()
def GET(self, req):
root, rcheck, rtype = req.split_path(1, 3, True)
all_rtypes = ['account', 'container', 'object']
@ -368,6 +374,8 @@ class ReconMiddleware(object):
content = self.get_version()
elif rcheck == "driveaudit":
content = self.get_driveaudit_error()
elif rcheck == "time":
content = self.get_time()
else:
content = "Invalid path: %s" % req.path
return Response(request=req, status="404 Not Found",

View File

@ -61,7 +61,7 @@ class TestScout(unittest.TestCase):
@mock.patch('eventlet.green.urllib2.urlopen')
def test_scout_ok(self, mock_urlopen):
mock_urlopen.return_value.read = lambda: json.dumps([])
url, content, status = self.scout_instance.scout(
url, content, status, ts_start, ts_end = self.scout_instance.scout(
("127.0.0.1", "8080"))
self.assertEqual(url, self.url)
self.assertEqual(content, [])
@ -70,7 +70,7 @@ class TestScout(unittest.TestCase):
@mock.patch('eventlet.green.urllib2.urlopen')
def test_scout_url_error(self, mock_urlopen):
mock_urlopen.side_effect = urllib2.URLError("")
url, content, status = self.scout_instance.scout(
url, content, status, ts_start, ts_end = self.scout_instance.scout(
("127.0.0.1", "8080"))
self.assertTrue(isinstance(content, urllib2.URLError))
self.assertEqual(url, self.url)
@ -80,7 +80,7 @@ class TestScout(unittest.TestCase):
def test_scout_http_error(self, mock_urlopen):
mock_urlopen.side_effect = urllib2.HTTPError(
self.url, 404, "Internal error", None, None)
url, content, status = self.scout_instance.scout(
url, content, status, ts_start, ts_end = self.scout_instance.scout(
("127.0.0.1", "8080"))
self.assertEqual(url, self.url)
self.assertTrue(isinstance(content, urllib2.HTTPError))
@ -218,7 +218,7 @@ class TestRecon(unittest.TestCase):
'/etc/swift/object-1.ring.gz': empty_file_hash,
}
status = 200
scout_instance.scout.return_value = (url, response, status)
scout_instance.scout.return_value = (url, response, status, 0, 0)
mock_scout.return_value = scout_instance
stdout = StringIO()
mock_hash = mock.MagicMock()
@ -274,7 +274,7 @@ class TestRecon(unittest.TestCase):
url = 'http://%s:%s/recon/quarantined' % host
response = responses[host[1]]
status = 200
return url, response, status
return url, response, status, 0, 0
stdout = StringIO()
patches = [
@ -311,7 +311,7 @@ class TestRecon(unittest.TestCase):
url = 'http://%s:%s/recon/driveaudit' % host
response = responses[host[1]]
status = 200
return url, response, status
return url, response, status, 0, 0
stdout = StringIO()
patches = [
@ -491,7 +491,7 @@ class TestReconCommands(unittest.TestCase):
return [('http://127.0.0.1:6010/recon/auditor/object', {
'object_auditor_stats_ALL': values,
'object_auditor_stats_ZBF': values,
}, 200)]
}, 200, 0, 0)]
response = {}
@ -535,7 +535,9 @@ class TestReconCommands(unittest.TestCase):
"avail": 15, "used": 85, "size": 100},
{"device": "sdd1", "mounted": True,
"avail": 15, "used": 85, "size": 100}],
200)]
200,
0,
0)]
cli = recon.SwiftRecon()
cli.pool.imap = dummy_request
@ -586,11 +588,15 @@ class TestReconCommands(unittest.TestCase):
('http://127.0.0.1:6010/recon/replication/object',
{"object_replication_time": 61,
"object_replication_last": now},
200),
200,
0,
0),
('http://127.0.0.1:6020/recon/replication/object',
{"object_replication_time": 23,
"object_replication_last": now},
200),
200,
0,
0),
]
cli = recon.SwiftRecon()
@ -625,7 +631,9 @@ class TestReconCommands(unittest.TestCase):
"remote_merge": 0, "diff_capped": 0, "start": now,
"hashmatch": 0, "diff": 0, "empty": 0},
"replication_time": 42},
200),
200,
0,
0),
('http://127.0.0.1:6021/recon/replication/container',
{"replication_last": now,
"replication_stats": {
@ -634,7 +642,9 @@ class TestReconCommands(unittest.TestCase):
"remote_merge": 0, "diff_capped": 0, "start": now,
"hashmatch": 0, "diff": 0, "empty": 0},
"replication_time": 23},
200),
200,
0,
0),
]
cli = recon.SwiftRecon()
@ -671,11 +681,15 @@ class TestReconCommands(unittest.TestCase):
('http://127.0.0.1:6010/recon/load',
{"1m": 0.2, "5m": 0.4, "15m": 0.25,
"processes": 10000, "tasks": "1/128"},
200),
200,
0,
0),
('http://127.0.0.1:6020/recon/load',
{"1m": 0.4, "5m": 0.8, "15m": 0.75,
"processes": 9000, "tasks": "1/200"},
200),
200,
0,
0),
]
cli = recon.SwiftRecon()
@ -695,3 +709,75 @@ class TestReconCommands(unittest.TestCase):
# We need any_order=True because the order of calls depends on the dict
# that is returned from the recon middleware, thus can't rely on it
mock_print.assert_has_calls(default_calls, any_order=True)
@mock.patch('__builtin__.print')
@mock.patch('time.time')
def test_time_check(self, mock_now, mock_print):
now = 1430000000.0
mock_now.return_value = now
def dummy_request(*args, **kwargs):
return [
('http://127.0.0.1:6010/recon/load',
now,
200,
now - 0.5,
now + 0.5),
('http://127.0.0.1:6020/recon/load',
now,
200,
now,
now),
]
cli = recon.SwiftRecon()
cli.pool.imap = dummy_request
default_calls = [
mock.call('2/2 hosts matched, 0 error[s] while checking hosts.')
]
cli.time_check([('127.0.0.1', 6010), ('127.0.0.1', 6020)])
# We need any_order=True because the order of calls depends on the dict
# that is returned from the recon middleware, thus can't rely on it
mock_print.assert_has_calls(default_calls, any_order=True)
@mock.patch('__builtin__.print')
@mock.patch('time.time')
def test_time_check_mismatch(self, mock_now, mock_print):
now = 1430000000.0
mock_now.return_value = now
def dummy_request(*args, **kwargs):
return [
('http://127.0.0.1:6010/recon/time',
now,
200,
now + 0.5,
now + 1.3),
('http://127.0.0.1:6020/recon/time',
now,
200,
now,
now),
]
cli = recon.SwiftRecon()
cli.pool.imap = dummy_request
default_calls = [
mock.call("!! http://127.0.0.1:6010/recon/time current time is "
"2015-04-25 22:13:21, but remote is "
"2015-04-25 22:13:20, differs by 1.30 sec"),
mock.call('1/2 hosts matched, 0 error[s] while checking hosts.'),
]
def mock_localtime(*args, **kwargs):
return time.gmtime(*args, **kwargs)
with mock.patch("time.localtime", mock_localtime):
cli.time_check([('127.0.0.1', 6010), ('127.0.0.1', 6020)])
# We need any_order=True because the order of calls depends on the dict
# that is returned from the recon middleware, thus can't rely on it
mock_print.assert_has_calls(default_calls, any_order=True)

View File

@ -175,6 +175,9 @@ class FakeRecon(object):
def fake_driveaudit(self):
return {'driveaudittest': "1"}
def fake_time(self):
return {'timetest': "1"}
def nocontent(self):
return None
@ -855,6 +858,15 @@ class TestReconSuccess(TestCase):
'/var/cache/swift/drive.recon'), {})])
self.assertEquals(rv, {'drive_audit_errors': 7})
def test_get_time(self):
def fake_time():
return 1430000000.0
with mock.patch("time.time", fake_time):
now = fake_time()
rv = self.app.get_time()
self.assertEquals(rv, now)
class TestReconMiddleware(unittest.TestCase):
@ -884,6 +896,7 @@ class TestReconMiddleware(unittest.TestCase):
self.app.get_quarantine_count = self.frecon.fake_quarantined
self.app.get_socket_info = self.frecon.fake_sockstat
self.app.get_driveaudit_error = self.frecon.fake_driveaudit
self.app.get_time = self.frecon.fake_time
def test_recon_get_mem(self):
get_mem_resp = ['{"memtest": "1"}']
@ -1118,5 +1131,12 @@ class TestReconMiddleware(unittest.TestCase):
resp = self.app(req.environ, start_response)
self.assertEquals(resp, get_driveaudit_resp)
def test_recon_get_time(self):
get_time_resp = ['{"timetest": "1"}']
req = Request.blank('/recon/time',
environ={'REQUEST_METHOD': 'GET'})
resp = self.app(req.environ, start_response)
self.assertEquals(resp, get_time_resp)
if __name__ == '__main__':
unittest.main()