From dd2f1be3b124d3901ebbc176a7adc462b6449667 Mon Sep 17 00:00:00 2001 From: Ondrej Novy Date: Tue, 16 Jun 2015 17:42:58 +0200 Subject: [PATCH] Time synchronization check in recon. This change add call time to recon middleware and param --time to recon CLI. This is usefull for checking if time in cluster is synchronized. Change-Id: I62373e681f64d0bd71f4aeb287953dd3b2ea5662 --- AUTHORS | 1 + doc/manpages/swift-recon.1 | 6 +- doc/source/admin_guide.rst | 5 +- swift/cli/recon.py | 102 +++++++++++++++---- swift/common/middleware/recon.py | 8 ++ test/unit/cli/test_recon.py | 114 +++++++++++++++++++--- test/unit/common/middleware/test_recon.py | 20 ++++ 7 files changed, 219 insertions(+), 37 deletions(-) diff --git a/AUTHORS b/AUTHORS index ebde31a39a..1f628d9c3b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -156,6 +156,7 @@ Maru Newby (mnewby@internap.com) Newptone (xingchao@unitedstack.com) Colin Nicholson (colin.nicholson@iomart.com) Zhenguo Niu (zhenguo@unitedstack.com) +Ondrej Novy (ondrej.novy@firma.seznam.cz) Timothy Okwii (tokwii@cisco.com) Matthew Oliver (matt@oliver.net.au) Hisashi Osanai (osanai.hisashi@jp.fujitsu.com) diff --git a/doc/manpages/swift-recon.1 b/doc/manpages/swift-recon.1 index c635861aca..3120405005 100644 --- a/doc/manpages/swift-recon.1 +++ b/doc/manpages/swift-recon.1 @@ -25,7 +25,7 @@ .SH SYNOPSIS .LP .B swift-recon -\ [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] +\ [-v] [--suppress] [-a] [-r] [-u] [-d] [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] .SH DESCRIPTION .PP @@ -80,8 +80,10 @@ Get md5sum of servers ring and compare to local copy Get cluster socket usage stats .IP "\fB--driveaudit\fR" Get drive audit error stats +.IP "\fB-T, --time\fR" +Check time synchronization .IP "\fB--all\fR" -Perform all checks. Equivalent to \-arudlq \-\-md5 +Perform all checks. Equivalent to \-arudlqT \-\-md5 .IP "\fB--region=REGION\fR" Only query servers in specified region .IP "\fB-z ZONE, --zone=ZONE\fR" diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index f27c20741e..5c151aabeb 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -555,7 +555,7 @@ This information can also be queried via the swift-recon command line utility:: fhines@ubuntu:~$ swift-recon -h Usage: usage: swift-recon [-v] [--suppress] [-a] [-r] [-u] [-d] - [-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] + [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] account|container|object Defaults to object server. @@ -578,7 +578,8 @@ This information can also be queried via the swift-recon command line utility:: -q, --quarantined Get cluster quarantine stats --md5 Get md5sum of servers ring and compare to local copy --sockstat Get cluster socket usage stats - --all Perform all checks. Equal to -arudlq --md5 --sockstat + -T, --time Check time synchronization + --all Perform all checks. Equal to -arudlqT --md5 --sockstat -z ZONE, --zone=ZONE Only query servers in specified zone -t SECONDS, --timeout=SECONDS Time to wait for a response from a server diff --git a/swift/cli/recon.py b/swift/cli/recon.py index f57f75c22a..79e0721c04 100755 --- a/swift/cli/recon.py +++ b/swift/cli/recon.py @@ -100,11 +100,14 @@ class Scout(object): Obtain telemetry from a host running the swift recon middleware. :param host: host to check - :returns: tuple of (recon url used, response body, and status) + :returns: tuple of (recon url used, response body, status, time start + and time end) """ base_url = "http://%s:%s/recon/" % (host[0], host[1]) + ts_start = time.time() url, content, status = self.scout_host(base_url, self.recon_type) - return url, content, status + ts_end = time.time() + return url, content, status, ts_start, ts_end def scout_server_type(self, host): """ @@ -253,7 +256,8 @@ class SwiftRecon(object): if self.verbose: for ring_file, ring_sum in rings.items(): print("-> On disk %s md5sum: %s" % (ring_file, ring_sum)) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status != 200: errors = errors + 1 continue @@ -291,7 +295,8 @@ class SwiftRecon(object): printfn("[%s] Checking swift.conf md5sum" % self._ptime()) if self.verbose: printfn("-> On disk swift.conf md5sum: %s" % (conf_sum,)) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: if response[SWIFT_CONF_FILE] != conf_sum: printfn("!! %s (%s) doesn't match on disk md5sum" % @@ -317,7 +322,8 @@ class SwiftRecon(object): recon = Scout("async", self.verbose, self.suppress_errors, self.timeout) print("[%s] Checking async pendings" % self._ptime()) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: scan[url] = response['async_pending'] stats = self._gen_stats(scan.values(), 'async_pending') @@ -338,7 +344,8 @@ class SwiftRecon(object): recon = Scout("driveaudit", self.verbose, self.suppress_errors, self.timeout) print("[%s] Checking drive-audit errors" % self._ptime()) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: scan[url] = response['drive_audit_errors'] stats = self._gen_stats(scan.values(), 'drive_audit_errors') @@ -361,7 +368,8 @@ class SwiftRecon(object): self.timeout) print("[%s] Getting unmounted drives from %s hosts..." % (self._ptime(), len(hosts))) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: unmounted[url] = [] errors[url] = [] @@ -414,7 +422,8 @@ class SwiftRecon(object): recon = Scout("expirer/%s" % self.server_type, self.verbose, self.suppress_errors, self.timeout) print("[%s] Checking on expirers" % self._ptime()) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: stats['object_expiration_pass'].append( response.get('object_expiration_pass')) @@ -447,7 +456,8 @@ class SwiftRecon(object): least_recent_url = None most_recent_time = 0 most_recent_url = None - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: stats['replication_time'].append( response.get('replication_time')) @@ -511,7 +521,8 @@ class SwiftRecon(object): least_recent_url = None most_recent_time = 0 most_recent_url = None - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: stats[url] = response['object_replication_time'] last = response.get('object_replication_last', 0) @@ -562,7 +573,8 @@ class SwiftRecon(object): recon = Scout("updater/%s" % self.server_type, self.verbose, self.suppress_errors, self.timeout) print("[%s] Checking updater times" % self._ptime()) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: if response['%s_updater_sweep' % self.server_type]: stats.append(response['%s_updater_sweep' % @@ -592,7 +604,8 @@ class SwiftRecon(object): recon = Scout("auditor/%s" % self.server_type, self.verbose, self.suppress_errors, self.timeout) print("[%s] Checking auditor stats" % self._ptime()) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: scan[url] = response if len(scan) < 1: @@ -665,7 +678,8 @@ class SwiftRecon(object): recon = Scout("auditor/object", self.verbose, self.suppress_errors, self.timeout) print("[%s] Checking auditor stats " % self._ptime()) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: if response['object_auditor_stats_ALL']: all_scan[url] = response['object_auditor_stats_ALL'] @@ -736,7 +750,8 @@ class SwiftRecon(object): recon = Scout("load", self.verbose, self.suppress_errors, self.timeout) print("[%s] Checking load averages" % self._ptime()) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: load1[url] = response['1m'] load5[url] = response['5m'] @@ -765,7 +780,8 @@ class SwiftRecon(object): recon = Scout("quarantined", self.verbose, self.suppress_errors, self.timeout) print("[%s] Checking quarantine" % self._ptime()) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: objq[url] = response['objects'] conq[url] = response['containers'] @@ -799,7 +815,8 @@ class SwiftRecon(object): recon = Scout("sockstat", self.verbose, self.suppress_errors, self.timeout) print("[%s] Checking socket usage" % self._ptime()) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: inuse4[url] = response['tcp_in_use'] mem[url] = response['tcp_mem_allocated_bytes'] @@ -835,7 +852,8 @@ class SwiftRecon(object): recon = Scout("diskusage", self.verbose, self.suppress_errors, self.timeout) print("[%s] Checking disk usage now" % self._ptime()) - for url, response, status in self.pool.imap(recon.scout, hosts): + for url, response, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): if status == 200: hostusage = [] for entry in response: @@ -915,6 +933,47 @@ class SwiftRecon(object): host = urlparse(url).netloc.split(':')[0] print('%.02f%% %s' % (used, '%-15s %s' % (host, device))) + def time_check(self, hosts): + """ + Check a time synchronization of hosts with current time + + :param hosts: set of hosts to check. in the format of: + set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]) + """ + + matches = 0 + errors = 0 + recon = Scout("time", self.verbose, self.suppress_errors, + self.timeout) + print("[%s] Checking time-sync" % self._ptime()) + for url, ts_remote, status, ts_start, ts_end in self.pool.imap( + recon.scout, hosts): + if status != 200: + errors = errors + 1 + continue + if (ts_remote < ts_start or ts_remote > ts_end): + diff = abs(ts_end - ts_remote) + ts_end_f = time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(ts_end)) + ts_remote_f = time.strftime( + "%Y-%m-%d %H:%M:%S", + time.localtime(ts_remote)) + + print("!! %s current time is %s, but remote is %s, " + "differs by %.2f sec" % ( + url, + ts_end_f, + ts_remote_f, + diff)) + continue + matches += 1 + if self.verbose: + print("-> %s matches." % url) + print("%s/%s hosts matched, %s error[s] while checking hosts." % ( + matches, len(hosts), errors)) + print("=" * 79) + def main(self): """ Retrieve and report cluster info from hosts running recon middleware. @@ -922,7 +981,7 @@ class SwiftRecon(object): print("=" * 79) usage = ''' usage: %prog [-v] [--suppress] [-a] [-r] [-u] [-d] - [-l] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] + [-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat] [--human-readable] \taccount|container|object @@ -964,13 +1023,15 @@ class SwiftRecon(object): help="Get cluster socket usage stats") args.add_option('--driveaudit', action="store_true", help="Get drive audit error stats") + args.add_option('--time', '-T', action="store_true", + help="Check time synchronization") args.add_option('--top', type='int', metavar='COUNT', default=0, help='Also show the top COUNT entries in rank order.') args.add_option('--lowest', type='int', metavar='COUNT', default=0, help='Also show the lowest COUNT entries in rank \ order.') args.add_option('--all', action="store_true", - help="Perform all checks. Equal to \t\t\t-arudlq " + help="Perform all checks. Equal to \t\t\t-arudlqT " "--md5 --sockstat --auditor --updater --expirer") args.add_option('--region', type="int", help="Only query servers in specified region") @@ -1031,6 +1092,7 @@ class SwiftRecon(object): self.socket_usage(hosts) self.server_type_check(hosts) self.driveaudit_check(hosts) + self.time_check(hosts) else: if options.async: if self.server_type == 'object': @@ -1075,6 +1137,8 @@ class SwiftRecon(object): self.socket_usage(hosts) if options.driveaudit: self.driveaudit_check(hosts) + if options.time: + self.time_check(hosts) def main(): diff --git a/swift/common/middleware/recon.py b/swift/common/middleware/recon.py index 88d5243a4d..a20e197cfd 100644 --- a/swift/common/middleware/recon.py +++ b/swift/common/middleware/recon.py @@ -15,6 +15,7 @@ import errno import os +import time from swift import gettext_ as _ from swift import __version__ as swiftver @@ -328,6 +329,11 @@ class ReconMiddleware(object): raise return sockstat + def get_time(self): + """get current time""" + + return time.time() + def GET(self, req): root, rcheck, rtype = req.split_path(1, 3, True) all_rtypes = ['account', 'container', 'object'] @@ -368,6 +374,8 @@ class ReconMiddleware(object): content = self.get_version() elif rcheck == "driveaudit": content = self.get_driveaudit_error() + elif rcheck == "time": + content = self.get_time() else: content = "Invalid path: %s" % req.path return Response(request=req, status="404 Not Found", diff --git a/test/unit/cli/test_recon.py b/test/unit/cli/test_recon.py index dd53ae9d54..26dd6fb2c1 100644 --- a/test/unit/cli/test_recon.py +++ b/test/unit/cli/test_recon.py @@ -61,7 +61,7 @@ class TestScout(unittest.TestCase): @mock.patch('eventlet.green.urllib2.urlopen') def test_scout_ok(self, mock_urlopen): mock_urlopen.return_value.read = lambda: json.dumps([]) - url, content, status = self.scout_instance.scout( + url, content, status, ts_start, ts_end = self.scout_instance.scout( ("127.0.0.1", "8080")) self.assertEqual(url, self.url) self.assertEqual(content, []) @@ -70,7 +70,7 @@ class TestScout(unittest.TestCase): @mock.patch('eventlet.green.urllib2.urlopen') def test_scout_url_error(self, mock_urlopen): mock_urlopen.side_effect = urllib2.URLError("") - url, content, status = self.scout_instance.scout( + url, content, status, ts_start, ts_end = self.scout_instance.scout( ("127.0.0.1", "8080")) self.assertTrue(isinstance(content, urllib2.URLError)) self.assertEqual(url, self.url) @@ -80,7 +80,7 @@ class TestScout(unittest.TestCase): def test_scout_http_error(self, mock_urlopen): mock_urlopen.side_effect = urllib2.HTTPError( self.url, 404, "Internal error", None, None) - url, content, status = self.scout_instance.scout( + url, content, status, ts_start, ts_end = self.scout_instance.scout( ("127.0.0.1", "8080")) self.assertEqual(url, self.url) self.assertTrue(isinstance(content, urllib2.HTTPError)) @@ -218,7 +218,7 @@ class TestRecon(unittest.TestCase): '/etc/swift/object-1.ring.gz': empty_file_hash, } status = 200 - scout_instance.scout.return_value = (url, response, status) + scout_instance.scout.return_value = (url, response, status, 0, 0) mock_scout.return_value = scout_instance stdout = StringIO() mock_hash = mock.MagicMock() @@ -274,7 +274,7 @@ class TestRecon(unittest.TestCase): url = 'http://%s:%s/recon/quarantined' % host response = responses[host[1]] status = 200 - return url, response, status + return url, response, status, 0, 0 stdout = StringIO() patches = [ @@ -311,7 +311,7 @@ class TestRecon(unittest.TestCase): url = 'http://%s:%s/recon/driveaudit' % host response = responses[host[1]] status = 200 - return url, response, status + return url, response, status, 0, 0 stdout = StringIO() patches = [ @@ -491,7 +491,7 @@ class TestReconCommands(unittest.TestCase): return [('http://127.0.0.1:6010/recon/auditor/object', { 'object_auditor_stats_ALL': values, 'object_auditor_stats_ZBF': values, - }, 200)] + }, 200, 0, 0)] response = {} @@ -535,7 +535,9 @@ class TestReconCommands(unittest.TestCase): "avail": 15, "used": 85, "size": 100}, {"device": "sdd1", "mounted": True, "avail": 15, "used": 85, "size": 100}], - 200)] + 200, + 0, + 0)] cli = recon.SwiftRecon() cli.pool.imap = dummy_request @@ -586,11 +588,15 @@ class TestReconCommands(unittest.TestCase): ('http://127.0.0.1:6010/recon/replication/object', {"object_replication_time": 61, "object_replication_last": now}, - 200), + 200, + 0, + 0), ('http://127.0.0.1:6020/recon/replication/object', {"object_replication_time": 23, "object_replication_last": now}, - 200), + 200, + 0, + 0), ] cli = recon.SwiftRecon() @@ -625,7 +631,9 @@ class TestReconCommands(unittest.TestCase): "remote_merge": 0, "diff_capped": 0, "start": now, "hashmatch": 0, "diff": 0, "empty": 0}, "replication_time": 42}, - 200), + 200, + 0, + 0), ('http://127.0.0.1:6021/recon/replication/container', {"replication_last": now, "replication_stats": { @@ -634,7 +642,9 @@ class TestReconCommands(unittest.TestCase): "remote_merge": 0, "diff_capped": 0, "start": now, "hashmatch": 0, "diff": 0, "empty": 0}, "replication_time": 23}, - 200), + 200, + 0, + 0), ] cli = recon.SwiftRecon() @@ -671,11 +681,15 @@ class TestReconCommands(unittest.TestCase): ('http://127.0.0.1:6010/recon/load', {"1m": 0.2, "5m": 0.4, "15m": 0.25, "processes": 10000, "tasks": "1/128"}, - 200), + 200, + 0, + 0), ('http://127.0.0.1:6020/recon/load', {"1m": 0.4, "5m": 0.8, "15m": 0.75, "processes": 9000, "tasks": "1/200"}, - 200), + 200, + 0, + 0), ] cli = recon.SwiftRecon() @@ -695,3 +709,75 @@ class TestReconCommands(unittest.TestCase): # We need any_order=True because the order of calls depends on the dict # that is returned from the recon middleware, thus can't rely on it mock_print.assert_has_calls(default_calls, any_order=True) + + @mock.patch('__builtin__.print') + @mock.patch('time.time') + def test_time_check(self, mock_now, mock_print): + now = 1430000000.0 + mock_now.return_value = now + + def dummy_request(*args, **kwargs): + return [ + ('http://127.0.0.1:6010/recon/load', + now, + 200, + now - 0.5, + now + 0.5), + ('http://127.0.0.1:6020/recon/load', + now, + 200, + now, + now), + ] + + cli = recon.SwiftRecon() + cli.pool.imap = dummy_request + + default_calls = [ + mock.call('2/2 hosts matched, 0 error[s] while checking hosts.') + ] + + cli.time_check([('127.0.0.1', 6010), ('127.0.0.1', 6020)]) + # We need any_order=True because the order of calls depends on the dict + # that is returned from the recon middleware, thus can't rely on it + mock_print.assert_has_calls(default_calls, any_order=True) + + @mock.patch('__builtin__.print') + @mock.patch('time.time') + def test_time_check_mismatch(self, mock_now, mock_print): + now = 1430000000.0 + mock_now.return_value = now + + def dummy_request(*args, **kwargs): + return [ + ('http://127.0.0.1:6010/recon/time', + now, + 200, + now + 0.5, + now + 1.3), + ('http://127.0.0.1:6020/recon/time', + now, + 200, + now, + now), + ] + + cli = recon.SwiftRecon() + cli.pool.imap = dummy_request + + default_calls = [ + mock.call("!! http://127.0.0.1:6010/recon/time current time is " + "2015-04-25 22:13:21, but remote is " + "2015-04-25 22:13:20, differs by 1.30 sec"), + mock.call('1/2 hosts matched, 0 error[s] while checking hosts.'), + ] + + def mock_localtime(*args, **kwargs): + return time.gmtime(*args, **kwargs) + + with mock.patch("time.localtime", mock_localtime): + cli.time_check([('127.0.0.1', 6010), ('127.0.0.1', 6020)]) + + # We need any_order=True because the order of calls depends on the dict + # that is returned from the recon middleware, thus can't rely on it + mock_print.assert_has_calls(default_calls, any_order=True) diff --git a/test/unit/common/middleware/test_recon.py b/test/unit/common/middleware/test_recon.py index 05a11ce859..c383e43563 100644 --- a/test/unit/common/middleware/test_recon.py +++ b/test/unit/common/middleware/test_recon.py @@ -175,6 +175,9 @@ class FakeRecon(object): def fake_driveaudit(self): return {'driveaudittest': "1"} + def fake_time(self): + return {'timetest': "1"} + def nocontent(self): return None @@ -855,6 +858,15 @@ class TestReconSuccess(TestCase): '/var/cache/swift/drive.recon'), {})]) self.assertEquals(rv, {'drive_audit_errors': 7}) + def test_get_time(self): + def fake_time(): + return 1430000000.0 + + with mock.patch("time.time", fake_time): + now = fake_time() + rv = self.app.get_time() + self.assertEquals(rv, now) + class TestReconMiddleware(unittest.TestCase): @@ -884,6 +896,7 @@ class TestReconMiddleware(unittest.TestCase): self.app.get_quarantine_count = self.frecon.fake_quarantined self.app.get_socket_info = self.frecon.fake_sockstat self.app.get_driveaudit_error = self.frecon.fake_driveaudit + self.app.get_time = self.frecon.fake_time def test_recon_get_mem(self): get_mem_resp = ['{"memtest": "1"}'] @@ -1118,5 +1131,12 @@ class TestReconMiddleware(unittest.TestCase): resp = self.app(req.environ, start_response) self.assertEquals(resp, get_driveaudit_resp) + def test_recon_get_time(self): + get_time_resp = ['{"timetest": "1"}'] + req = Request.blank('/recon/time', + environ={'REQUEST_METHOD': 'GET'}) + resp = self.app(req.environ, start_response) + self.assertEquals(resp, get_time_resp) + if __name__ == '__main__': unittest.main()