#!/usr/bin/env python
# Copyright (c) 2010-2012 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from hashlib import md5
import getopt
from itertools import chain
import json

from eventlet.greenpool import GreenPool
from eventlet.event import Event
from six.moves.urllib.parse import quote

from swift.common.ring import Ring
from swift.common.utils import split_path
from swift.common.bufferedhttp import http_connect
# Command-line help text.  The script prints it when option parsing fails
# or when it is started on a terminal without any url arguments.  Every
# %(cmd)s placeholder is filled in with sys.argv[0] at import time.
usage = """
Usage!
%(cmd)s [options] [url 1] [url 2] ...
-c [concurrency] Set the concurrency, default 50
-r [ring dir] Ring locations, default /etc/swift
-e [filename] File for writing a list of inconsistent urls
-d Also download files and verify md5
You can also feed a list of urls to the script through stdin.
Examples!
%(cmd)s SOSO_88ad0b83-b2c5-4fa1-b2d6-60c597202076
%(cmd)s SOSO_88ad0b83-b2c5-4fa1-b2d6-60c597202076/container/object
%(cmd)s -e errors.txt SOSO_88ad0b83-b2c5-4fa1-b2d6-60c597202076/container
%(cmd)s < errors.txt
%(cmd)s -c 25 -d < errors.txt
""" % {'cmd': sys.argv[0]}
class Auditor(object):
    """Cross-check account, container and object replicas in a Swift cluster.

    Each ``audit_*`` method asks every node that the matching ring returns
    for the entity, prints one line per problem found and increments the
    corresponding counter attribute.  Listings are cached in ``list_cache``
    so an account or container is fetched only once, and ``in_progress``
    maps an entity to an eventlet ``Event`` while its listing is being
    fetched so that concurrent green threads wait for the result instead of
    repeating the requests.
    """

    def __init__(self, swift_dir='/etc/swift', concurrency=50, deep=False,
                 error_file=None):
        """Load the rings and reset all statistics.

        :param swift_dir: directory the object, container and account rings
                          are loaded from
        :param concurrency: size of the green thread pool used for audits
        :param deep: when true, objects are downloaded (GET) and their MD5
                     is compared with the ETag instead of only being HEADed
        :param error_file: optional file name; the path of every
                           inconsistent entity is appended to it
        """
        self.pool = GreenPool(concurrency)
        self.object_ring = Ring(swift_dir, ring_name='object')
        self.container_ring = Ring(swift_dir, ring_name='container')
        self.account_ring = Ring(swift_dir, ring_name='account')
        self.deep = deep
        self.error_file = error_file
        # zero out stats
        self.accounts_checked = self.account_exceptions = \
            self.account_not_found = self.account_container_mismatch = \
            self.account_object_mismatch = self.objects_checked = \
            self.object_exceptions = self.object_not_found = \
            self.object_checksum_mismatch = self.containers_checked = \
            self.container_exceptions = self.container_count_mismatch = \
            self.container_not_found = self.container_obj_mismatch = 0
        # Maps an account name, or an (account, container) tuple, to the
        # listing that was fetched for it.
        self.list_cache = {}
        # Same keys as list_cache; holds an Event while a fetch is running.
        self.in_progress = {}

    def _record_error(self, path):
        """Append ``path`` to the error file, if one was configured."""
        if self.error_file:
            # Open and close per write so no file handle is left dangling.
            with open(self.error_file, 'a') as error_fp:
                error_fp.write('%s\n' % path)

    def audit_object(self, account, container, name):
        """Check every replica of one object against the container listing.

        HEADs the object on each node returned by the object ring (or GETs
        it and verifies the MD5 of the body when ``self.deep`` is set), then
        compares the ETag reported by each node with the hash recorded in
        the container listing.

        :param account: account name
        :param container: container name
        :param name: object name
        """
        path = '/%s/%s/%s' % (account, container, name)
        part, nodes = self.object_ring.get_nodes(
            account, container.encode('utf-8'), name.encode('utf-8'))
        container_listing = self.audit_container(account, container)
        consistent = True
        if name not in container_listing:
            print(" Object %s missing in container listing!" % path)
            consistent = False
            listing_hash = None
        else:
            listing_hash = container_listing[name]['hash']

        method = 'GET' if self.deep else 'HEAD'
        verb = 'GETting' if self.deep else 'HEADing'
        # (node, etag) pairs, so a later mismatch can name the right node.
        etags = []
        for node in nodes:
            try:
                conn = http_connect(node['ip'], node['port'],
                                    node['device'], part, method,
                                    path.encode('utf-8'), {})
                resp = conn.getresponse()
                calc_hash = None
                if self.deep:
                    digest = md5()
                    chunk = resp.read(8192)
                    while chunk:
                        digest.update(chunk)
                        chunk = resp.read(8192)
                    calc_hash = digest.hexdigest()
                if resp.status // 100 != 2:
                    self.object_not_found += 1
                    consistent = False
                    print(' Bad status %s object "%s" on %s/%s'
                          % (verb, path, node['ip'], node['device']))
                    continue
                etag = resp.getheader('ETag')
                if self.deep and etag.strip('"') != calc_hash:
                    self.object_checksum_mismatch += 1
                    consistent = False
                    print(' MD5 does not match etag for "%s" on %s/%s'
                          % (path, node['ip'], node['device']))
                etags.append((node, etag))
            except Exception:
                self.object_exceptions += 1
                consistent = False
                print(' Exception fetching object "%s" on %s/%s'
                      % (path, node['ip'], node['device']))
                continue

        if not etags:
            consistent = False
            print(" Failed to fetch object %s at all!" % path)
        elif listing_hash:
            # Compare each replica's own ETag (not just the last response)
            # and report the node that actually served it.
            for etag_node, etag in etags:
                if (etag or '').strip('"') != listing_hash:
                    consistent = False
                    self.object_checksum_mismatch += 1
                    print(' ETag mismatch for "%s" on %s/%s'
                          % (path, etag_node['ip'], etag_node['device']))
        if not consistent:
            self._record_error(path)
        self.objects_checked += 1

    def audit_container(self, account, name, recurse=False):
        """Fetch and compare the listing of a container from every replica.

        :param account: account name
        :param name: container name
        :param recurse: when true, spawn an object audit for every object
                        found in the merged listing
        :returns: dict mapping object name to its listing entry; for objects
                  whose entries differ between replicas, the entry with the
                  greatest ``last_modified`` is kept
        """
        key = (account, name)
        if key in self.in_progress:
            self.in_progress[key].wait()
        if key in self.list_cache:
            return self.list_cache[key]
        self.in_progress[key] = Event()
        print('Auditing container "%s"' % name)
        path = '/%s/%s' % (account, name)
        account_listing = self.audit_account(account)
        consistent = True
        if name not in account_listing:
            consistent = False
            print(" Container %s not in account listing!" % path)
        part, nodes = \
            self.container_ring.get_nodes(account, name.encode('utf-8'))
        rec_d = {}
        responses = {}
        for node in nodes:
            marker = ''
            results = True
            # Page through the listing until a node returns an empty page.
            while results:
                try:
                    conn = http_connect(node['ip'], node['port'],
                                        node['device'], part, 'GET',
                                        path.encode('utf-8'), {},
                                        'format=json&marker=%s' %
                                        quote(marker.encode('utf-8')))
                    resp = conn.getresponse()
                    if resp.status // 100 != 2:
                        self.container_not_found += 1
                        consistent = False
                        print(' Bad status GETting container "%s" on %s/%s' %
                              (path, node['ip'], node['device']))
                        break
                    if node['id'] not in responses:
                        responses[node['id']] = dict(resp.getheaders())
                    results = json.loads(resp.read())
                except Exception:
                    self.container_exceptions += 1
                    consistent = False
                    print(' Exception GETting container "%s" on %s/%s' %
                          (path, node['ip'], node['device']))
                    break
                if results:
                    marker = results[-1]['name']
                    for obj in results:
                        obj_name = obj['name']
                        if obj_name not in rec_d:
                            rec_d[obj_name] = obj
                        if (obj['last_modified'] !=
                                rec_d[obj_name]['last_modified']):
                            self.container_obj_mismatch += 1
                            consistent = False
                            print(" Different versions of %s/%s "
                                  "in container dbs." % (name, obj['name']))
                            if (obj['last_modified'] >
                                    rec_d[obj_name]['last_modified']):
                                rec_d[obj_name] = obj
        obj_counts = [int(header['x-container-object-count'])
                      for header in responses.values()]
        if not obj_counts:
            consistent = False
            print(" Failed to fetch container %s at all!" % path)
        elif len(set(obj_counts)) != 1:
            self.container_count_mismatch += 1
            consistent = False
            print(" Container databases don't agree on number of objects.")
            print(" Max: %s, Min: %s" % (max(obj_counts), min(obj_counts)))
        self.containers_checked += 1
        self.list_cache[key] = rec_d
        self.in_progress[key].send(True)
        del self.in_progress[key]
        if recurse:
            # Iterate over a snapshot: spawn_n may yield to other green
            # threads while the pool is full.
            for obj in list(rec_d):
                self.pool.spawn_n(self.audit_object, account, name, obj)
        if not consistent:
            self._record_error(path)
        return rec_d

    def audit_account(self, account, recurse=False):
        """Fetch and compare the container listing of an account.

        :param account: account name
        :param recurse: when true, spawn a recursive container audit for
                        every container found
        :returns: set with the names of all containers listed by any replica
        """
        if account in self.in_progress:
            self.in_progress[account].wait()
        if account in self.list_cache:
            return self.list_cache[account]
        self.in_progress[account] = Event()
        print('Auditing account "%s"' % account)
        consistent = True
        path = '/%s' % account
        part, nodes = self.account_ring.get_nodes(account)
        # node id -> [response headers, accumulated listing entries]
        responses = {}
        for node in nodes:
            marker = ''
            results = True
            while results:
                node_id = node['id']
                try:
                    conn = http_connect(node['ip'], node['port'],
                                        node['device'], part, 'GET', path, {},
                                        'format=json&marker=%s' %
                                        quote(marker.encode('utf-8')))
                    resp = conn.getresponse()
                    if resp.status // 100 != 2:
                        self.account_not_found += 1
                        consistent = False
                        print(" Bad status GETting account '%s' "
                              " from %s:%s" %
                              (account, node['ip'], node['device']))
                        break
                    results = json.loads(resp.read())
                except Exception:
                    self.account_exceptions += 1
                    consistent = False
                    print(" Exception GETting account '%s' on %s:%s" %
                          (account, node['ip'], node['device']))
                    break
                if node_id not in responses:
                    responses[node_id] = [dict(resp.getheaders()), []]
                responses[node_id][1].extend(results)
                if results:
                    marker = results[-1]['name']
        headers = [entry[0] for entry in responses.values()]
        cont_counts = [int(header['x-account-container-count'])
                       for header in headers]
        # An empty list (no replica answered) also counts as a mismatch.
        if len(set(cont_counts)) != 1:
            self.account_container_mismatch += 1
            consistent = False
            print(" Account databases for '%s' don't agree on"
                  " number of containers." % account)
            if cont_counts:
                print(" Max: %s, Min: %s" % (max(cont_counts),
                                             min(cont_counts)))
        obj_counts = [int(header['x-account-object-count'])
                      for header in headers]
        if len(set(obj_counts)) != 1:
            self.account_object_mismatch += 1
            consistent = False
            print(" Account databases for '%s' don't agree on"
                  " number of objects." % account)
            if obj_counts:
                print(" Max: %s, Min: %s" % (max(obj_counts),
                                             min(obj_counts)))
        containers = set()
        for entry in responses.values():
            containers.update(container['name'] for container in entry[1])
        self.list_cache[account] = containers
        self.in_progress[account].send(True)
        del self.in_progress[account]
        self.accounts_checked += 1
        if recurse:
            for container in containers:
                self.pool.spawn_n(self.audit_container, account,
                                  container, True)
        if not consistent:
            self._record_error(path)
        return containers

    def audit(self, account, container=None, obj=None):
        """Schedule an audit on the pool at the most specific level given.

        An object audit is spawned when both ``container`` and ``obj`` are
        set, a recursive container audit when only ``container`` is set, and
        a recursive account audit otherwise.
        """
        if obj and container:
            self.pool.spawn_n(self.audit_object, account, container, obj)
        elif container:
            self.pool.spawn_n(self.audit_container, account, container, True)
        else:
            self.pool.spawn_n(self.audit_account, account, True)

    def wait(self):
        """Block until every audit spawned on the pool has finished."""
        self.pool.waitall()

    def print_stats(self):
        """Print the totals for accounts, containers and objects.

        The "checked" totals are always printed; the problem counters are
        printed only when they are non-zero.
        """

        def _print_stat(name, stat):
            # Right align stat name in a field of 18 characters
            print("{0:>18}: {1}".format(name, stat))

        print("")
        _print_stat("Accounts checked", self.accounts_checked)
        if self.account_not_found:
            _print_stat("Missing Replicas", self.account_not_found)
        if self.account_exceptions:
            _print_stat("Exceptions", self.account_exceptions)
        if self.account_container_mismatch:
            _print_stat("Container mismatch", self.account_container_mismatch)
        if self.account_object_mismatch:
            _print_stat("Object mismatch", self.account_object_mismatch)
        print("")
        _print_stat("Containers checked", self.containers_checked)
        if self.container_not_found:
            _print_stat("Missing Replicas", self.container_not_found)
        if self.container_exceptions:
            _print_stat("Exceptions", self.container_exceptions)
        if self.container_count_mismatch:
            _print_stat("Count mismatch", self.container_count_mismatch)
        if self.container_obj_mismatch:
            _print_stat("Object mismatch", self.container_obj_mismatch)
        print("")
        _print_stat("Objects checked", self.objects_checked)
        if self.object_not_found:
            _print_stat("Missing Replicas", self.object_not_found)
        if self.object_exceptions:
            _print_stat("Exceptions", self.object_exceptions)
        if self.object_checksum_mismatch:
            _print_stat("MD5 Mismatch", self.object_checksum_mismatch)
if __name__ == '__main__':
    # Parse the command line: -c, -r and -e take a value, -d is a flag.
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'c:r:e:d')
    except getopt.GetoptError as err:
        print(str(err))
        print(usage)
        sys.exit(2)
    # Nothing to audit: no url arguments and nothing piped in on stdin.
    if not args and os.isatty(sys.stdin.fileno()):
        print(usage)
        sys.exit()
    opts = dict(optlist)
    options = {
        'concurrency': int(opts.get('-c', 50)),
        'error_file': opts.get('-e', None),
        'swift_dir': opts.get('-r', '/etc/swift'),
        'deep': '-d' in opts,
    }
    auditor = Auditor(**options)
    # When stdin is not a terminal, its lines are audited after the
    # command-line urls.
    if not os.isatty(sys.stdin.fileno()):
        args = chain(args, sys.stdin)
    for path in args:
        path = '/' + path.rstrip('\r\n').lstrip('/')
        if path == '/':
            # Blank line (e.g. trailing newline in a piped error file):
            # split_path would reject it and abort the whole run.
            continue
        auditor.audit(*split_path(path, 1, 3, True))
    auditor.wait()
    auditor.print_stats()