Merge "Add option to skip downloading/uploading identical files"

This commit is contained in:
Jenkins 2014-02-18 01:45:54 +00:00 committed by Gerrit Code Review
commit 0aa0f4f6cc

@@ -21,7 +21,8 @@ from errno import EEXIST, ENOENT
from hashlib import md5 from hashlib import md5
from optparse import OptionParser, SUPPRESS_HELP from optparse import OptionParser, SUPPRESS_HELP
from os import environ, listdir, makedirs, utime, _exit as os_exit from os import environ, listdir, makedirs, utime, _exit as os_exit
from os.path import basename, dirname, getmtime, getsize, isdir, join from os.path import basename, dirname, getmtime, getsize, isdir, join, \
sep as os_path_sep
from random import shuffle from random import shuffle
from sys import argv, exit, stderr, stdout from sys import argv, exit, stderr, stdout
from time import sleep, time, gmtime, strftime from time import sleep, time, gmtime, strftime
@@ -274,7 +275,7 @@ Optional arguments:
Adds a customized request header to the query, like Adds a customized request header to the query, like
"Range" or "If-Match". This argument is repeatable. "Range" or "If-Match". This argument is repeatable.
Example --header "content-type:text/plain" Example --header "content-type:text/plain"
--skip-identical Skip downloading files that are identical on both sides
'''.strip("\n") '''.strip("\n")
@@ -310,6 +311,10 @@ def st_download(parser, args, thread_manager):
'Adds a customized request header to the query, like "Range" or ' 'Adds a customized request header to the query, like "Range" or '
'"If-Match". This argument is repeatable. Example' '"If-Match". This argument is repeatable. Example'
' --header "content-type:text/plain"') ' --header "content-type:text/plain"')
parser.add_option(
'--skip-identical', action='store_true', dest='skip_identical',
default=False, help='Skip downloading files that are identical on '
'both sides')
(options, args) = parse_args(parser, args) (options, args) = parse_args(parser, args)
args = args[1:] args = args[1:]
if options.out_file == '-': if options.out_file == '-':
@@ -330,6 +335,23 @@ def st_download(parser, args, thread_manager):
container, obj, out_file = queue_arg container, obj, out_file = queue_arg
else: else:
raise Exception("Invalid queue_arg length of %s" % len(queue_arg)) raise Exception("Invalid queue_arg length of %s" % len(queue_arg))
path = options.yes_all and join(container, obj) or obj
path = path.lstrip(os_path_sep)
if options.skip_identical and out_file != '-':
filename = out_file if out_file else path
try:
fp = open(filename, 'rb')
except IOError:
pass
else:
with fp:
md5sum = md5()
while True:
data = fp.read(65536)
if not data:
break
md5sum.update(data)
req_headers['If-None-Match'] = md5sum.hexdigest()
try: try:
start_time = time() start_time = time()
headers, body = \ headers, body = \
@@ -342,9 +364,6 @@ def st_download(parser, args, thread_manager):
else: else:
content_length = None content_length = None
etag = headers.get('etag') etag = headers.get('etag')
path = options.yes_all and join(container, obj) or obj
if path[:1] in ('/', '\\'):
path = path[1:]
md5sum = None md5sum = None
make_dir = not options.no_download and out_file != "-" make_dir = not options.no_download and out_file != "-"
if content_type.split(';', 1)[0] == 'text/directory': if content_type.split(';', 1)[0] == 'text/directory':
@@ -409,6 +428,9 @@ def st_download(parser, args, thread_manager):
else: else:
thread_manager.print_msg('%s [%s]', path, time_str) thread_manager.print_msg('%s [%s]', path, time_str)
except ClientException as err: except ClientException as err:
if err.http_status == 304 and options.skip_identical:
thread_manager.print_msg("Skipped identical file '%s'", path)
return
if err.http_status != 404: if err.http_status != 404:
raise raise
thread_manager.error("Object '%s/%s' not found", container, obj) thread_manager.error("Object '%s/%s' not found", container, obj)
@@ -762,7 +784,7 @@ def st_post(parser, args, thread_manager):
thread_manager.error('Usage: %s post %s\n%s', basename(argv[0]), thread_manager.error('Usage: %s post %s\n%s', basename(argv[0]),
st_post_options, st_post_help) st_post_options, st_post_help)
st_upload_options = '''[--changed] [--segment-size <size>] st_upload_options = '''[--changed] [--skip-identical] [--segment-size <size>]
[--segment-container <container>] [--leave-segments] [--segment-container <container>] [--leave-segments]
[--object-threads <thread>] [--segment-threads <threads>] [--object-threads <thread>] [--segment-threads <threads>]
[--header <header>] [--use-slo] [--header <header>] [--use-slo]
@@ -781,6 +803,7 @@ Positional arguments:
Optional arguments: Optional arguments:
--changed Only upload files that have changed since the last --changed Only upload files that have changed since the last
upload upload
--skip-identical Skip uploading files that are identical on both sides
--segment-size <size> Upload files in segments no larger than <size> and --segment-size <size> Upload files in segments no larger than <size> and
then create a "manifest" file that will download all then create a "manifest" file that will download all
the segments as if it were the original file the segments as if it were the original file
@@ -815,6 +838,10 @@ def st_upload(parser, args, thread_manager):
'-c', '--changed', action='store_true', dest='changed', '-c', '--changed', action='store_true', dest='changed',
default=False, help='Will only upload files that have changed since ' default=False, help='Will only upload files that have changed since '
'the last upload') 'the last upload')
parser.add_option(
'--skip-identical', action='store_true', dest='skip_identical',
default=False, help='Skip uploading files that are identical on '
'both sides')
parser.add_option( parser.add_option(
'-S', '--segment-size', dest='segment_size', help='Will ' '-S', '--segment-size', dest='segment_size', help='Will '
'upload files in segments no larger than <size> and then create a ' 'upload files in segments no larger than <size> and then create a '
@@ -922,11 +949,32 @@ def st_upload(parser, args, thread_manager):
old_manifest = None old_manifest = None
old_slo_manifest_paths = [] old_slo_manifest_paths = []
new_slo_manifest_paths = set() new_slo_manifest_paths = set()
if options.changed or not options.leave_segments: if options.changed or options.skip_identical \
or not options.leave_segments:
if options.skip_identical:
checksum = None
try:
fp = open(path, 'rb')
except IOError:
pass
else:
with fp:
md5sum = md5()
while True:
data = fp.read(65536)
if not data:
break
md5sum.update(data)
checksum = md5sum.hexdigest()
try: try:
headers = conn.head_object(container, obj) headers = conn.head_object(container, obj)
cl = int(headers.get('content-length')) cl = int(headers.get('content-length'))
mt = headers.get('x-object-meta-mtime') mt = headers.get('x-object-meta-mtime')
if (options.skip_identical and
checksum == headers.get('etag')):
thread_manager.print_msg(
"Skipped identical file '%s'", path)
return
if options.changed and cl == getsize(path) and \ if options.changed and cl == getsize(path) and \
mt == put_headers['x-object-meta-mtime']: mt == put_headers['x-object-meta-mtime']:
return return