Files
swift/swift/cli/recon.py
lijunbo 6788bf4aad Fixed get ring name from recon cli
This patch uses a more specific string to match
the ring name in recon cli.

Almost all the places in the project where need to get the
suffix (like ring.gz, ts, data and .etc) will include the '.'
in front, if a file named 'object.sring.gz' in the swift_dir
will be added in the ring_names, which is not what we want.

Co-Authored-By: Kota Tsuyuzaki <tsuyuzaki.kota@lab.ntt.co.jp>
Closes-Bug: #1680704
Change-Id: Ida659fa71585f9b0cf36da75b58b28e6a25533df
2017-04-13 07:01:03 +00:00

1112 lines
46 KiB
Python

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
cmdline utility to perform cluster reconnaissance
"""
from __future__ import print_function
from eventlet.green import socket
from six.moves.urllib.parse import urlparse
from swift.common.utils import SWIFT_CONF_FILE, md5_hash_for_file
from swift.common.ring import Ring
from swift.common.storage_policy import POLICIES
import eventlet
import json
import optparse
import time
import sys
import six
import os
if six.PY3:
from eventlet.green.urllib import request as urllib2
else:
from eventlet.green import urllib2
def seconds2timeunit(seconds):
elapsed = seconds
unit = 'seconds'
if elapsed >= 60:
elapsed = elapsed / 60.0
unit = 'minutes'
if elapsed >= 60:
elapsed = elapsed / 60.0
unit = 'hours'
if elapsed >= 24:
elapsed = elapsed / 24.0
unit = 'days'
return elapsed, unit
def size_suffix(size):
suffixes = ['bytes', 'kB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
for suffix in suffixes:
if size < 1000:
return "%s %s" % (size, suffix)
size = size // 1000
return "%s %s" % (size, suffix)
class Scout(object):
"""
Obtain swift recon information
"""
def __init__(self, recon_type, verbose=False, suppress_errors=False,
timeout=5):
self.recon_type = recon_type
self.verbose = verbose
self.suppress_errors = suppress_errors
self.timeout = timeout
def scout_host(self, base_url, recon_type):
"""
Perform the actual HTTP request to obtain swift recon telemtry.
:param base_url: the base url of the host you wish to check. str of the
format 'http://127.0.0.1:6200/recon/'
:param recon_type: the swift recon check to request.
:returns: tuple of (recon url used, response body, and status)
"""
url = base_url + recon_type
try:
body = urllib2.urlopen(url, timeout=self.timeout).read()
content = json.loads(body)
if self.verbose:
print("-> %s: %s" % (url, content))
status = 200
except urllib2.HTTPError as err:
if not self.suppress_errors or self.verbose:
print("-> %s: %s" % (url, err))
content = err
status = err.code
except (urllib2.URLError, socket.timeout) as err:
if not self.suppress_errors or self.verbose:
print("-> %s: %s" % (url, err))
content = err
status = -1
return url, content, status
def scout(self, host):
"""
Obtain telemetry from a host running the swift recon middleware.
:param host: host to check
:returns: tuple of (recon url used, response body, status, time start
and time end)
"""
base_url = "http://%s:%s/recon/" % (host[0], host[1])
ts_start = time.time()
url, content, status = self.scout_host(base_url, self.recon_type)
ts_end = time.time()
return url, content, status, ts_start, ts_end
def scout_server_type(self, host):
"""
Obtain Server header by calling OPTIONS.
:param host: host to check
:returns: Server type, status
"""
try:
url = "http://%s:%s/" % (host[0], host[1])
req = urllib2.Request(url)
req.get_method = lambda: 'OPTIONS'
conn = urllib2.urlopen(req)
header = conn.info().getheader('Server')
server_header = header.split('/')
content = server_header[0]
status = 200
except urllib2.HTTPError as err:
if not self.suppress_errors or self.verbose:
print("-> %s: %s" % (url, err))
content = err
status = err.code
except (urllib2.URLError, socket.timeout) as err:
if not self.suppress_errors or self.verbose:
print("-> %s: %s" % (url, err))
content = err
status = -1
return url, content, status
class SwiftRecon(object):
"""
Retrieve and report cluster info from hosts running recon middleware.
"""
def __init__(self):
self.verbose = False
self.suppress_errors = False
self.timeout = 5
self.pool_size = 30
self.pool = eventlet.GreenPool(self.pool_size)
self.check_types = ['account', 'container', 'object']
self.server_type = 'object'
def _gen_stats(self, stats, name=None):
"""Compute various stats from a list of values."""
cstats = [x for x in stats if x is not None]
if len(cstats) > 0:
ret_dict = {'low': min(cstats), 'high': max(cstats),
'total': sum(cstats), 'reported': len(cstats),
'number_none': len(stats) - len(cstats), 'name': name}
ret_dict['average'] = \
ret_dict['total'] / float(len(cstats))
ret_dict['perc_none'] = \
ret_dict['number_none'] * 100.0 / len(stats)
else:
ret_dict = {'reported': 0}
return ret_dict
def _print_stats(self, stats):
"""
print out formatted stats to console
:param stats: dict of stats generated by _gen_stats
"""
print('[%(name)s] low: %(low)d, high: %(high)d, avg: '
'%(average).1f, total: %(total)d, '
'Failed: %(perc_none).1f%%, no_result: %(number_none)d, '
'reported: %(reported)d' % stats)
def _ptime(self, timev=None):
"""
:param timev: a unix timestamp or None
:returns: a pretty string of the current time or provided time in UTC
"""
if timev:
return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(timev))
else:
return time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
def get_hosts(self, region_filter, zone_filter, swift_dir, ring_names):
"""
Get a list of hosts in the rings.
:param region_filter: Only list regions matching given filter
:param zone_filter: Only list zones matching given filter
:param swift_dir: Directory of swift config, usually /etc/swift
:param ring_names: Collection of ring names, such as
['object', 'object-2']
:returns: a set of tuples containing the ip and port of hosts
"""
rings = [Ring(swift_dir, ring_name=n) for n in ring_names]
devs = [d for r in rings for d in r.devs if d]
if region_filter is not None:
devs = [d for d in devs if d['region'] == region_filter]
if zone_filter is not None:
devs = [d for d in devs if d['zone'] == zone_filter]
return set((d['ip'], d['port']) for d in devs)
def get_ringmd5(self, hosts, swift_dir):
"""
Compare ring md5sum's with those on remote host
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
:param swift_dir: The local directory with the ring files.
"""
matches = 0
errors = 0
ring_names = set()
if self.server_type == 'object':
for ring_name in os.listdir(swift_dir):
if ring_name.startswith('object') and \
ring_name.endswith('.ring.gz'):
ring_names.add(ring_name)
else:
ring_name = '%s.ring.gz' % self.server_type
ring_names.add(ring_name)
rings = {}
for ring_name in ring_names:
rings[ring_name] = md5_hash_for_file(
os.path.join(swift_dir, ring_name))
recon = Scout("ringmd5", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking ring md5sums" % self._ptime())
if self.verbose:
for ring_file, ring_sum in rings.items():
print("-> On disk %s md5sum: %s" % (ring_file, ring_sum))
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status != 200:
errors = errors + 1
continue
success = True
for remote_ring_file, remote_ring_sum in response.items():
remote_ring_name = os.path.basename(remote_ring_file)
if not remote_ring_name.startswith(self.server_type):
continue
ring_sum = rings.get(remote_ring_name, None)
if remote_ring_sum != ring_sum:
success = False
print("!! %s (%s => %s) doesn't match on disk md5sum" % (
url, remote_ring_name, remote_ring_sum))
if not success:
errors += 1
continue
matches += 1
if self.verbose:
print("-> %s matches." % url)
print("%s/%s hosts matched, %s error[s] while checking hosts." % (
matches, len(hosts), errors))
print("=" * 79)
def get_swiftconfmd5(self, hosts, printfn=print):
"""
Compare swift.conf md5sum with that on remote hosts
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
:param printfn: function to print text; defaults to print()
"""
matches = 0
errors = 0
conf_sum = md5_hash_for_file(SWIFT_CONF_FILE)
recon = Scout("swiftconfmd5", self.verbose, self.suppress_errors,
self.timeout)
printfn("[%s] Checking swift.conf md5sum" % self._ptime())
if self.verbose:
printfn("-> On disk swift.conf md5sum: %s" % (conf_sum,))
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
if response[SWIFT_CONF_FILE] != conf_sum:
printfn("!! %s (%s) doesn't match on disk md5sum" %
(url, response[SWIFT_CONF_FILE]))
else:
matches = matches + 1
if self.verbose:
printfn("-> %s matches." % url)
else:
errors = errors + 1
printfn("%s/%s hosts matched, %s error[s] while checking hosts."
% (matches, len(hosts), errors))
printfn("=" * 79)
def async_check(self, hosts):
"""
Obtain and print async pending statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
scan = {}
recon = Scout("async", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking async pendings" % self._ptime())
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
scan[url] = response['async_pending']
stats = self._gen_stats(scan.values(), 'async_pending')
if stats['reported'] > 0:
self._print_stats(stats)
else:
print("[async_pending] - No hosts returned valid data.")
print("=" * 79)
def driveaudit_check(self, hosts):
"""
Obtain and print drive audit error statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)]
"""
scan = {}
recon = Scout("driveaudit", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking drive-audit errors" % self._ptime())
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
scan[url] = response['drive_audit_errors']
stats = self._gen_stats(scan.values(), 'drive_audit_errors')
if stats['reported'] > 0:
self._print_stats(stats)
else:
print("[drive_audit_errors] - No hosts returned valid data.")
print("=" * 79)
def umount_check(self, hosts):
"""
Check for and print unmounted drives
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
unmounted = {}
errors = {}
recon = Scout("unmounted", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Getting unmounted drives from %s hosts..." %
(self._ptime(), len(hosts)))
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
unmounted[url] = []
errors[url] = []
for i in response:
if not isinstance(i['mounted'], bool):
errors[url].append(i['device'])
else:
unmounted[url].append(i['device'])
for host in unmounted:
node = urlparse(host).netloc
for entry in unmounted[host]:
print("Not mounted: %s on %s" % (entry, node))
for host in errors:
node = urlparse(host).netloc
for entry in errors[host]:
print("Device errors: %s on %s" % (entry, node))
print("=" * 79)
def server_type_check(self, hosts):
"""
Check for server types on the ring
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
errors = {}
recon = Scout("server_type_check", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Validating server type '%s' on %s hosts..." %
(self._ptime(), self.server_type, len(hosts)))
for url, response, status in self.pool.imap(
recon.scout_server_type, hosts):
if status == 200:
if response != self.server_type + '-server':
errors[url] = response
print("%s/%s hosts ok, %s error[s] while checking hosts." % (
len(hosts) - len(errors), len(hosts), len(errors)))
for host in errors:
print("Invalid: %s is %s" % (host, errors[host]))
print("=" * 79)
def expirer_check(self, hosts):
"""
Obtain and print expirer statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {'object_expiration_pass': [], 'expired_last_pass': []}
recon = Scout("expirer/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print("[%s] Checking on expirers" % self._ptime())
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
stats['object_expiration_pass'].append(
response.get('object_expiration_pass'))
stats['expired_last_pass'].append(
response.get('expired_last_pass'))
for k in stats:
if stats[k]:
computed = self._gen_stats(stats[k], name=k)
if computed['reported'] > 0:
self._print_stats(computed)
else:
print("[%s] - No hosts returned valid data." % k)
else:
print("[%s] - No hosts returned valid data." % k)
print("=" * 79)
def replication_check(self, hosts):
"""
Obtain and print replication statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {'replication_time': [], 'failure': [], 'success': [],
'attempted': []}
recon = Scout("replication/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print("[%s] Checking on replication" % self._ptime())
least_recent_time = 9999999999
least_recent_url = None
most_recent_time = 0
most_recent_url = None
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
stats['replication_time'].append(
response.get('replication_time',
response.get('object_replication_time', 0)))
repl_stats = response.get('replication_stats')
if repl_stats:
for stat_key in ['attempted', 'failure', 'success']:
stats[stat_key].append(repl_stats.get(stat_key))
last = response.get('replication_last',
response.get('object_replication_last', 0))
if last < least_recent_time:
least_recent_time = last
least_recent_url = url
if last > most_recent_time:
most_recent_time = last
most_recent_url = url
for k in stats:
if stats[k]:
if k != 'replication_time':
computed = self._gen_stats(stats[k],
name='replication_%s' % k)
else:
computed = self._gen_stats(stats[k], name=k)
if computed['reported'] > 0:
self._print_stats(computed)
else:
print("[%s] - No hosts returned valid data." % k)
else:
print("[%s] - No hosts returned valid data." % k)
if least_recent_url is not None:
host = urlparse(least_recent_url).netloc
if not least_recent_time:
print('Oldest completion was NEVER by %s.' % host)
else:
elapsed = time.time() - least_recent_time
elapsed, elapsed_unit = seconds2timeunit(elapsed)
print('Oldest completion was %s (%d %s ago) by %s.' % (
self._ptime(least_recent_time),
elapsed, elapsed_unit, host))
if most_recent_url is not None:
host = urlparse(most_recent_url).netloc
elapsed = time.time() - most_recent_time
elapsed, elapsed_unit = seconds2timeunit(elapsed)
print('Most recent completion was %s (%d %s ago) by %s.' % (
self._ptime(most_recent_time),
elapsed, elapsed_unit, host))
print("=" * 79)
def updater_check(self, hosts):
"""
Obtain and print updater statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = []
recon = Scout("updater/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print("[%s] Checking updater times" % self._ptime())
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
if response['%s_updater_sweep' % self.server_type]:
stats.append(response['%s_updater_sweep' %
self.server_type])
if len(stats) > 0:
computed = self._gen_stats(stats, name='updater_last_sweep')
if computed['reported'] > 0:
self._print_stats(computed)
else:
print("[updater_last_sweep] - No hosts returned valid data.")
else:
print("[updater_last_sweep] - No hosts returned valid data.")
print("=" * 79)
def auditor_check(self, hosts):
"""
Obtain and print obj auditor statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
scan = {}
adone = '%s_auditor_pass_completed' % self.server_type
afail = '%s_audits_failed' % self.server_type
apass = '%s_audits_passed' % self.server_type
asince = '%s_audits_since' % self.server_type
recon = Scout("auditor/%s" % self.server_type, self.verbose,
self.suppress_errors, self.timeout)
print("[%s] Checking auditor stats" % self._ptime())
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
scan[url] = response
if len(scan) < 1:
print("Error: No hosts available")
return
stats = {}
stats[adone] = [scan[i][adone] for i in scan
if scan[i][adone] is not None]
stats[afail] = [scan[i][afail] for i in scan
if scan[i][afail] is not None]
stats[apass] = [scan[i][apass] for i in scan
if scan[i][apass] is not None]
stats[asince] = [scan[i][asince] for i in scan
if scan[i][asince] is not None]
for k in stats:
if len(stats[k]) < 1:
print("[%s] - No hosts returned valid data." % k)
else:
if k != asince:
computed = self._gen_stats(stats[k], k)
if computed['reported'] > 0:
self._print_stats(computed)
if len(stats[asince]) >= 1:
low = min(stats[asince])
high = max(stats[asince])
total = sum(stats[asince])
average = total / len(stats[asince])
print('[last_pass] oldest: %s, newest: %s, avg: %s' %
(self._ptime(low), self._ptime(high), self._ptime(average)))
print("=" * 79)
def nested_get_value(self, key, recon_entry):
"""
Generator that yields all values for given key in a recon cache entry.
This is for use with object auditor recon cache entries. If the
object auditor has run in parallel, the recon cache will have entries
of the form: {'object_auditor_stats_ALL': { 'disk1': {..},
'disk2': {..},
'disk3': {..},
...}}
If the object auditor hasn't run in parallel, the recon cache will have
entries of the form: {'object_auditor_stats_ALL': {...}}.
The ZBF auditor doesn't run in parallel. However, if a subset of
devices is selected for auditing, the recon cache will have an entry
of the form: {'object_auditor_stats_ZBF': { 'disk1disk2..diskN': {}}
We use this generator to find all instances of a particular key in
these multi-level dictionaries.
"""
for k, v in recon_entry.items():
if isinstance(v, dict):
for value in self.nested_get_value(key, v):
yield value
if k == key:
yield v
def object_auditor_check(self, hosts):
"""
Obtain and print obj auditor statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
all_scan = {}
zbf_scan = {}
atime = 'audit_time'
bprocessed = 'bytes_processed'
passes = 'passes'
errors = 'errors'
quarantined = 'quarantined'
recon = Scout("auditor/object", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking auditor stats " % self._ptime())
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
if response['object_auditor_stats_ALL']:
all_scan[url] = response['object_auditor_stats_ALL']
if response['object_auditor_stats_ZBF']:
zbf_scan[url] = response['object_auditor_stats_ZBF']
if len(all_scan) > 0:
stats = {}
stats[atime] = [sum(self.nested_get_value(atime, all_scan[i]))
for i in all_scan]
stats[bprocessed] = [sum(self.nested_get_value(bprocessed,
all_scan[i])) for i in all_scan]
stats[passes] = [sum(self.nested_get_value(passes, all_scan[i]))
for i in all_scan]
stats[errors] = [sum(self.nested_get_value(errors, all_scan[i]))
for i in all_scan]
stats[quarantined] = [sum(self.nested_get_value(quarantined,
all_scan[i])) for i in all_scan]
for k in stats:
if None in stats[k]:
stats[k] = [x for x in stats[k] if x is not None]
if len(stats[k]) < 1:
print("[Auditor %s] - No hosts returned valid data." % k)
else:
computed = self._gen_stats(stats[k],
name='ALL_%s_last_path' % k)
if computed['reported'] > 0:
self._print_stats(computed)
else:
print("[ALL_auditor] - No hosts returned valid data.")
else:
print("[ALL_auditor] - No hosts returned valid data.")
if len(zbf_scan) > 0:
stats = {}
stats[atime] = [sum(self.nested_get_value(atime, zbf_scan[i]))
for i in zbf_scan]
stats[bprocessed] = [sum(self.nested_get_value(bprocessed,
zbf_scan[i])) for i in zbf_scan]
stats[errors] = [sum(self.nested_get_value(errors, zbf_scan[i]))
for i in zbf_scan]
stats[quarantined] = [sum(self.nested_get_value(quarantined,
zbf_scan[i])) for i in zbf_scan]
for k in stats:
if None in stats[k]:
stats[k] = [x for x in stats[k] if x is not None]
if len(stats[k]) < 1:
print("[Auditor %s] - No hosts returned valid data." % k)
else:
computed = self._gen_stats(stats[k],
name='ZBF_%s_last_path' % k)
if computed['reported'] > 0:
self._print_stats(computed)
else:
print("[ZBF_auditor] - No hosts returned valid data.")
else:
print("[ZBF_auditor] - No hosts returned valid data.")
print("=" * 79)
def load_check(self, hosts):
"""
Obtain and print load average statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
load1 = {}
load5 = {}
load15 = {}
recon = Scout("load", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking load averages" % self._ptime())
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
load1[url] = response['1m']
load5[url] = response['5m']
load15[url] = response['15m']
stats = {"1m": load1, "5m": load5, "15m": load15}
for item in stats:
if len(stats[item]) > 0:
computed = self._gen_stats(stats[item].values(),
name='%s_load_avg' % item)
self._print_stats(computed)
else:
print("[%s_load_avg] - No hosts returned valid data." % item)
print("=" * 79)
def quarantine_check(self, hosts):
"""
Obtain and print quarantine statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
objq = {}
conq = {}
acctq = {}
stats = {}
recon = Scout("quarantined", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking quarantine" % self._ptime())
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
objq[url] = response['objects']
conq[url] = response['containers']
acctq[url] = response['accounts']
for key in response.get('policies', {}):
pkey = "objects_%s" % key
stats.setdefault(pkey, {})
stats[pkey][url] = response['policies'][key]['objects']
stats.update({"objects": objq, "containers": conq, "accounts": acctq})
for item in stats:
if len(stats[item]) > 0:
computed = self._gen_stats(stats[item].values(),
name='quarantined_%s' % item)
self._print_stats(computed)
else:
print("No hosts returned valid data.")
print("=" * 79)
def socket_usage(self, hosts):
"""
Obtain and print /proc/net/sockstat statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
inuse4 = {}
mem = {}
inuse6 = {}
timewait = {}
orphan = {}
recon = Scout("sockstat", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking socket usage" % self._ptime())
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
inuse4[url] = response['tcp_in_use']
mem[url] = response['tcp_mem_allocated_bytes']
inuse6[url] = response.get('tcp6_in_use', 0)
timewait[url] = response['time_wait']
orphan[url] = response['orphan']
stats = {"tcp_in_use": inuse4, "tcp_mem_allocated_bytes": mem,
"tcp6_in_use": inuse6, "time_wait": timewait,
"orphan": orphan}
for item in stats:
if len(stats[item]) > 0:
computed = self._gen_stats(stats[item].values(), item)
self._print_stats(computed)
else:
print("No hosts returned valid data.")
print("=" * 79)
def disk_usage(self, hosts, top=0, lowest=0, human_readable=False):
"""
Obtain and print disk usage statistics
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
stats = {}
highs = []
lows = []
raw_total_used = []
raw_total_avail = []
percents = {}
top_percents = [(None, 0)] * top
low_percents = [(None, 100)] * lowest
recon = Scout("diskusage", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking disk usage now" % self._ptime())
for url, response, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status == 200:
hostusage = []
for entry in response:
if not isinstance(entry['mounted'], bool):
print("-> %s/%s: Error: %s" % (url, entry['device'],
entry['mounted']))
elif entry['mounted']:
used = float(entry['used']) / float(entry['size']) \
* 100.0
raw_total_used.append(entry['used'])
raw_total_avail.append(entry['avail'])
hostusage.append(round(used, 2))
for ident, oused in top_percents:
if oused < used:
top_percents.append(
(url + ' ' + entry['device'], used))
top_percents.sort(key=lambda x: -x[1])
top_percents.pop()
break
for ident, oused in low_percents:
if oused > used:
low_percents.append(
(url + ' ' + entry['device'], used))
low_percents.sort(key=lambda x: x[1])
low_percents.pop()
break
stats[url] = hostusage
for url in stats:
if len(stats[url]) > 0:
# get per host hi/los for another day
low = min(stats[url])
high = max(stats[url])
highs.append(high)
lows.append(low)
for percent in stats[url]:
percents[int(percent)] = percents.get(int(percent), 0) + 1
else:
print("-> %s: Error. No drive info available." % url)
if len(lows) > 0:
low = min(lows)
high = max(highs)
# dist graph shamelessly stolen from https://github.com/gholt/tcod
print("Distribution Graph:")
mul = 69.0 / max(percents.values())
for percent in sorted(percents):
print('% 3d%%%5d %s' % (percent, percents[percent],
'*' * int(percents[percent] * mul)))
raw_used = sum(raw_total_used)
raw_avail = sum(raw_total_avail)
raw_total = raw_used + raw_avail
avg_used = 100.0 * raw_used / raw_total
if human_readable:
raw_used = size_suffix(raw_used)
raw_avail = size_suffix(raw_avail)
raw_total = size_suffix(raw_total)
print("Disk usage: space used: %s of %s" % (raw_used, raw_total))
print("Disk usage: space free: %s of %s" % (raw_avail, raw_total))
print("Disk usage: lowest: %s%%, highest: %s%%, avg: %s%%" %
(low, high, avg_used))
else:
print("No hosts returned valid data.")
print("=" * 79)
if top_percents:
print('TOP %s' % top)
for ident, used in top_percents:
if ident:
url, device = ident.split()
host = urlparse(url).netloc.split(':')[0]
print('%.02f%% %s' % (used, '%-15s %s' % (host, device)))
if low_percents:
print('LOWEST %s' % lowest)
for ident, used in low_percents:
if ident:
url, device = ident.split()
host = urlparse(url).netloc.split(':')[0]
print('%.02f%% %s' % (used, '%-15s %s' % (host, device)))
def time_check(self, hosts):
"""
Check a time synchronization of hosts with current time
:param hosts: set of hosts to check. in the format of:
set([('127.0.0.1', 6020), ('127.0.0.2', 6030)])
"""
matches = 0
errors = 0
recon = Scout("time", self.verbose, self.suppress_errors,
self.timeout)
print("[%s] Checking time-sync" % self._ptime())
for url, ts_remote, status, ts_start, ts_end in self.pool.imap(
recon.scout, hosts):
if status != 200:
errors = errors + 1
continue
if (ts_remote < ts_start or ts_remote > ts_end):
diff = abs(ts_end - ts_remote)
ts_end_f = self._ptime(ts_end)
ts_remote_f = self._ptime(ts_remote)
print("!! %s current time is %s, but remote is %s, "
"differs by %.2f sec" % (
url,
ts_end_f,
ts_remote_f,
diff))
continue
matches += 1
if self.verbose:
print("-> %s matches." % url)
print("%s/%s hosts matched, %s error[s] while checking hosts." % (
matches, len(hosts), errors))
print("=" * 79)
def _get_ring_names(self, policy=None):
'''
Retrieve name of ring files.
If no policy is passed and the server type is object,
the ring names of all storage-policies are retrieved.
:param policy: name or index of storage policy, only applicable
with server_type==object.
:returns: list of ring names.
'''
if self.server_type == 'object':
ring_names = [p.ring_name for p in POLICIES if (
p.name == policy or not policy or (
policy.isdigit() and int(policy) == int(p)))]
else:
ring_names = [self.server_type]
return ring_names
def main(self):
"""
Retrieve and report cluster info from hosts running recon middleware.
"""
print("=" * 79)
usage = '''
usage: %prog <server_type> [<server_type> [<server_type>]]
[-v] [--suppress] [-a] [-r] [-u] [-d]
[-l] [-T] [--md5] [--auditor] [--updater] [--expirer] [--sockstat]
[--human-readable]
<server_type>\taccount|container|object
Defaults to object server.
ex: %prog container -l --auditor
'''
args = optparse.OptionParser(usage)
args.add_option('--verbose', '-v', action="store_true",
help="Print verbose info")
args.add_option('--suppress', action="store_true",
help="Suppress most connection related errors")
args.add_option('--async', '-a', action="store_true",
help="Get async stats")
args.add_option('--replication', '-r', action="store_true",
help="Get replication stats")
args.add_option('--auditor', action="store_true",
help="Get auditor stats")
args.add_option('--updater', action="store_true",
help="Get updater stats")
args.add_option('--expirer', action="store_true",
help="Get expirer stats")
args.add_option('--unmounted', '-u', action="store_true",
help="Check cluster for unmounted devices")
args.add_option('--diskusage', '-d', action="store_true",
help="Get disk usage stats")
args.add_option('--human-readable', action="store_true",
help="Use human readable suffix for disk usage stats")
args.add_option('--loadstats', '-l', action="store_true",
help="Get cluster load average stats")
args.add_option('--quarantined', '-q', action="store_true",
help="Get cluster quarantine stats")
args.add_option('--validate-servers', action="store_true",
help="Validate servers on the ring")
args.add_option('--md5', action="store_true",
help="Get md5sum of servers ring and compare to "
"local copy")
args.add_option('--sockstat', action="store_true",
help="Get cluster socket usage stats")
args.add_option('--driveaudit', action="store_true",
help="Get drive audit error stats")
args.add_option('--time', '-T', action="store_true",
help="Check time synchronization")
args.add_option('--top', type='int', metavar='COUNT', default=0,
help='Also show the top COUNT entries in rank order.')
args.add_option('--lowest', type='int', metavar='COUNT', default=0,
help='Also show the lowest COUNT entries in rank \
order.')
args.add_option('--all', action="store_true",
help="Perform all checks. Equal to \t\t\t-arudlqT "
"--md5 --sockstat --auditor --updater --expirer "
"--driveaudit --validate-servers")
args.add_option('--region', type="int",
help="Only query servers in specified region")
args.add_option('--zone', '-z', type="int",
help="Only query servers in specified zone")
args.add_option('--timeout', '-t', type="int", metavar="SECONDS",
help="Time to wait for a response from a server",
default=5)
args.add_option('--swiftdir', default="/etc/swift",
help="Default = /etc/swift")
args.add_option('--policy', '-p',
help='Only query object servers in specified '
'storage policy (specified as name or index).')
options, arguments = args.parse_args()
if len(sys.argv) <= 1 or len(arguments) > len(self.check_types):
args.print_help()
sys.exit(0)
if arguments:
arguments = set(arguments)
if arguments.issubset(self.check_types):
server_types = arguments
else:
print("Invalid Server Type")
args.print_help()
sys.exit(1)
else: # default
server_types = ['object']
swift_dir = options.swiftdir
self.verbose = options.verbose
self.suppress_errors = options.suppress
self.timeout = options.timeout
for server_type in server_types:
self.server_type = server_type
ring_names = self._get_ring_names(options.policy)
if not ring_names:
print('Invalid Storage Policy: %s' % options.policy)
args.print_help()
sys.exit(0)
hosts = self.get_hosts(options.region, options.zone,
swift_dir, ring_names)
print("--> Starting reconnaissance on %s hosts (%s)" %
(len(hosts), self.server_type))
print("=" * 79)
if options.all:
if self.server_type == 'object':
self.async_check(hosts)
self.object_auditor_check(hosts)
self.updater_check(hosts)
self.expirer_check(hosts)
elif self.server_type == 'container':
self.auditor_check(hosts)
self.updater_check(hosts)
elif self.server_type == 'account':
self.auditor_check(hosts)
self.replication_check(hosts)
self.umount_check(hosts)
self.load_check(hosts)
self.disk_usage(hosts, options.top, options.lowest,
options.human_readable)
self.get_ringmd5(hosts, swift_dir)
self.get_swiftconfmd5(hosts)
self.quarantine_check(hosts)
self.socket_usage(hosts)
self.server_type_check(hosts)
self.driveaudit_check(hosts)
self.time_check(hosts)
else:
if options.async:
if self.server_type == 'object':
self.async_check(hosts)
else:
print("Error: Can't check asyncs on non object "
"servers.")
print("=" * 79)
if options.unmounted:
self.umount_check(hosts)
if options.replication:
self.replication_check(hosts)
if options.auditor:
if self.server_type == 'object':
self.object_auditor_check(hosts)
else:
self.auditor_check(hosts)
if options.updater:
if self.server_type == 'account':
print("Error: Can't check updaters on account "
"servers.")
print("=" * 79)
else:
self.updater_check(hosts)
if options.expirer:
if self.server_type == 'object':
self.expirer_check(hosts)
else:
print("Error: Can't check expired on non object "
"servers.")
print("=" * 79)
if options.validate_servers:
self.server_type_check(hosts)
if options.loadstats:
self.load_check(hosts)
if options.diskusage:
self.disk_usage(hosts, options.top, options.lowest,
options.human_readable)
if options.md5:
self.get_ringmd5(hosts, swift_dir)
self.get_swiftconfmd5(hosts)
if options.quarantined:
self.quarantine_check(hosts)
if options.sockstat:
self.socket_usage(hosts)
if options.driveaudit:
self.driveaudit_check(hosts)
if options.time:
self.time_check(hosts)
def main():
try:
reconnoiter = SwiftRecon()
reconnoiter.main()
except KeyboardInterrupt:
print('\n')