Add option to skip downloading/uploading identical files

swift already has the upload option "--changed", which uploads only
changed files. This patch adds a similar option, "--skip-identical",
which compares the MD5 of the local file to the remote object's ETag.

When used in combination with download, the MD5 hexdigest of each existing
local file is sent in an "If-None-Match" header; a 304 response from the
server means the file is identical, and the download is skipped.

When used in combination with upload, the MD5 of the local file is compared
to the remote ETag using the already existing HEAD request.

Change-Id: I727b0456558c6a7742b2428c6d1c45c4bfaf66e9
This commit is contained in:
Christian Schwede 2014-02-11 09:13:26 +00:00
parent f13288ae32
commit 9b3ec3705f

View File

@ -21,7 +21,8 @@ from errno import EEXIST, ENOENT
from hashlib import md5
from optparse import OptionParser, SUPPRESS_HELP
from os import environ, listdir, makedirs, utime, _exit as os_exit
from os.path import basename, dirname, getmtime, getsize, isdir, join
from os.path import basename, dirname, getmtime, getsize, isdir, join, \
sep as os_path_sep
from random import shuffle
from sys import argv, exit, stderr, stdout
from time import sleep, time, gmtime, strftime
@ -274,7 +275,7 @@ Optional arguments:
Adds a customized request header to the query, like
"Range" or "If-Match". This argument is repeatable.
Example --header "content-type:text/plain"
--skip-identical Skip downloading files that are identical on both sides
'''.strip("\n")
@ -310,6 +311,10 @@ def st_download(parser, args, thread_manager):
'Adds a customized request header to the query, like "Range" or '
'"If-Match". This argument is repeatable. Example'
' --header "content-type:text/plain"')
parser.add_option(
'--skip-identical', action='store_true', dest='skip_identical',
default=False, help='Skip downloading files that are identical on '
'both sides')
(options, args) = parse_args(parser, args)
args = args[1:]
if options.out_file == '-':
@ -330,6 +335,23 @@ def st_download(parser, args, thread_manager):
container, obj, out_file = queue_arg
else:
raise Exception("Invalid queue_arg length of %s" % len(queue_arg))
path = options.yes_all and join(container, obj) or obj
path = path.lstrip(os_path_sep)
if options.skip_identical and out_file != '-':
filename = out_file if out_file else path
try:
fp = open(filename, 'rb')
except IOError:
pass
else:
with fp:
md5sum = md5()
while True:
data = fp.read(65536)
if not data:
break
md5sum.update(data)
req_headers['If-None-Match'] = md5sum.hexdigest()
try:
start_time = time()
headers, body = \
@ -342,9 +364,6 @@ def st_download(parser, args, thread_manager):
else:
content_length = None
etag = headers.get('etag')
path = options.yes_all and join(container, obj) or obj
if path[:1] in ('/', '\\'):
path = path[1:]
md5sum = None
make_dir = not options.no_download and out_file != "-"
if content_type.split(';', 1)[0] == 'text/directory':
@ -409,6 +428,9 @@ def st_download(parser, args, thread_manager):
else:
thread_manager.print_msg('%s [%s]', path, time_str)
except ClientException as err:
if err.http_status == 304 and options.skip_identical:
thread_manager.print_msg("Skipped identical file '%s'", path)
return
if err.http_status != 404:
raise
thread_manager.error("Object '%s/%s' not found", container, obj)
@ -762,7 +784,7 @@ def st_post(parser, args, thread_manager):
thread_manager.error('Usage: %s post %s\n%s', basename(argv[0]),
st_post_options, st_post_help)
st_upload_options = '''[--changed] [--segment-size <size>]
st_upload_options = '''[--changed] [--skip-identical] [--segment-size <size>]
[--segment-container <container>] [--leave-segments]
[--object-threads <thread>] [--segment-threads <threads>]
[--header <header>] [--use-slo]
@ -781,6 +803,7 @@ Positional arguments:
Optional arguments:
--changed Only upload files that have changed since the last
upload
--skip-identical Skip uploading files that are identical on both sides
--segment-size <size> Upload files in segments no larger than <size> and
then create a "manifest" file that will download all
the segments as if it were the original file
@ -815,6 +838,10 @@ def st_upload(parser, args, thread_manager):
'-c', '--changed', action='store_true', dest='changed',
default=False, help='Will only upload files that have changed since '
'the last upload')
parser.add_option(
'--skip-identical', action='store_true', dest='skip_identical',
default=False, help='Skip uploading files that are identical on '
'both sides')
parser.add_option(
'-S', '--segment-size', dest='segment_size', help='Will '
'upload files in segments no larger than <size> and then create a '
@ -922,11 +949,32 @@ def st_upload(parser, args, thread_manager):
old_manifest = None
old_slo_manifest_paths = []
new_slo_manifest_paths = set()
if options.changed or not options.leave_segments:
if options.changed or options.skip_identical \
or not options.leave_segments:
if options.skip_identical:
checksum = None
try:
fp = open(path, 'rb')
except IOError:
pass
else:
with fp:
md5sum = md5()
while True:
data = fp.read(65536)
if not data:
break
md5sum.update(data)
checksum = md5sum.hexdigest()
try:
headers = conn.head_object(container, obj)
cl = int(headers.get('content-length'))
mt = headers.get('x-object-meta-mtime')
if (options.skip_identical and
checksum == headers.get('etag')):
thread_manager.print_msg(
"Skipped identical file '%s'", path)
return
if options.changed and cl == getsize(path) and \
mt == put_headers['x-object-meta-mtime']:
return