Add option to skip downloading/uploading identical files
swift already has the upload option "--changed" to upload only changed files. This patch adds a similar feature by comparing the MD5 of the local file to the remote object's ETag. When used in combination with download, the MD5 hexdigest of each file is sent with an "If-None-Match" header to skip downloading identical files. When used in combination with upload, the MD5 is compared to the remote ETag by using the already existing HEAD request. Change-Id: I727b0456558c6a7742b2428c6d1c45c4bfaf66e9
This commit is contained in:
parent
f13288ae32
commit
9b3ec3705f
62
bin/swift
62
bin/swift
@ -21,7 +21,8 @@ from errno import EEXIST, ENOENT
|
||||
from hashlib import md5
|
||||
from optparse import OptionParser, SUPPRESS_HELP
|
||||
from os import environ, listdir, makedirs, utime, _exit as os_exit
|
||||
from os.path import basename, dirname, getmtime, getsize, isdir, join
|
||||
from os.path import basename, dirname, getmtime, getsize, isdir, join, \
|
||||
sep as os_path_sep
|
||||
from random import shuffle
|
||||
from sys import argv, exit, stderr, stdout
|
||||
from time import sleep, time, gmtime, strftime
|
||||
@ -274,7 +275,7 @@ Optional arguments:
|
||||
Adds a customized request header to the query, like
|
||||
"Range" or "If-Match". This argument is repeatable.
|
||||
Example --header "content-type:text/plain"
|
||||
|
||||
--skip-identical Skip downloading files that are identical on both sides
|
||||
'''.strip("\n")
|
||||
|
||||
|
||||
@ -310,6 +311,10 @@ def st_download(parser, args, thread_manager):
|
||||
'Adds a customized request header to the query, like "Range" or '
|
||||
'"If-Match". This argument is repeatable. Example'
|
||||
' --header "content-type:text/plain"')
|
||||
parser.add_option(
|
||||
'--skip-identical', action='store_true', dest='skip_identical',
|
||||
default=False, help='Skip downloading files that are identical on '
|
||||
'both sides')
|
||||
(options, args) = parse_args(parser, args)
|
||||
args = args[1:]
|
||||
if options.out_file == '-':
|
||||
@ -330,6 +335,23 @@ def st_download(parser, args, thread_manager):
|
||||
container, obj, out_file = queue_arg
|
||||
else:
|
||||
raise Exception("Invalid queue_arg length of %s" % len(queue_arg))
|
||||
path = options.yes_all and join(container, obj) or obj
|
||||
path = path.lstrip(os_path_sep)
|
||||
if options.skip_identical and out_file != '-':
|
||||
filename = out_file if out_file else path
|
||||
try:
|
||||
fp = open(filename, 'rb')
|
||||
except IOError:
|
||||
pass
|
||||
else:
|
||||
with fp:
|
||||
md5sum = md5()
|
||||
while True:
|
||||
data = fp.read(65536)
|
||||
if not data:
|
||||
break
|
||||
md5sum.update(data)
|
||||
req_headers['If-None-Match'] = md5sum.hexdigest()
|
||||
try:
|
||||
start_time = time()
|
||||
headers, body = \
|
||||
@ -342,9 +364,6 @@ def st_download(parser, args, thread_manager):
|
||||
else:
|
||||
content_length = None
|
||||
etag = headers.get('etag')
|
||||
path = options.yes_all and join(container, obj) or obj
|
||||
if path[:1] in ('/', '\\'):
|
||||
path = path[1:]
|
||||
md5sum = None
|
||||
make_dir = not options.no_download and out_file != "-"
|
||||
if content_type.split(';', 1)[0] == 'text/directory':
|
||||
@ -409,6 +428,9 @@ def st_download(parser, args, thread_manager):
|
||||
else:
|
||||
thread_manager.print_msg('%s [%s]', path, time_str)
|
||||
except ClientException as err:
|
||||
if err.http_status == 304 and options.skip_identical:
|
||||
thread_manager.print_msg("Skipped identical file '%s'", path)
|
||||
return
|
||||
if err.http_status != 404:
|
||||
raise
|
||||
thread_manager.error("Object '%s/%s' not found", container, obj)
|
||||
@ -762,7 +784,7 @@ def st_post(parser, args, thread_manager):
|
||||
thread_manager.error('Usage: %s post %s\n%s', basename(argv[0]),
|
||||
st_post_options, st_post_help)
|
||||
|
||||
st_upload_options = '''[--changed] [--segment-size <size>]
|
||||
st_upload_options = '''[--changed] [--skip-identical] [--segment-size <size>]
|
||||
[--segment-container <container>] [--leave-segments]
|
||||
[--object-threads <thread>] [--segment-threads <threads>]
|
||||
[--header <header>] [--use-slo]
|
||||
@ -781,6 +803,7 @@ Positional arguments:
|
||||
Optional arguments:
|
||||
--changed Only upload files that have changed since the last
|
||||
upload
|
||||
--skip-identical Skip uploading files that are identical on both sides
|
||||
--segment-size <size> Upload files in segments no larger than <size> and
|
||||
then create a "manifest" file that will download all
|
||||
the segments as if it were the original file
|
||||
@ -815,6 +838,10 @@ def st_upload(parser, args, thread_manager):
|
||||
'-c', '--changed', action='store_true', dest='changed',
|
||||
default=False, help='Will only upload files that have changed since '
|
||||
'the last upload')
|
||||
parser.add_option(
|
||||
'--skip-identical', action='store_true', dest='skip_identical',
|
||||
default=False, help='Skip uploading files that are identical on '
|
||||
'both sides')
|
||||
parser.add_option(
|
||||
'-S', '--segment-size', dest='segment_size', help='Will '
|
||||
'upload files in segments no larger than <size> and then create a '
|
||||
@ -922,11 +949,32 @@ def st_upload(parser, args, thread_manager):
|
||||
old_manifest = None
|
||||
old_slo_manifest_paths = []
|
||||
new_slo_manifest_paths = set()
|
||||
if options.changed or not options.leave_segments:
|
||||
if options.changed or options.skip_identical \
|
||||
or not options.leave_segments:
|
||||
if options.skip_identical:
|
||||
checksum = None
|
||||
try:
|
||||
fp = open(path, 'rb')
|
||||
except IOError:
|
||||
pass
|
||||
else:
|
||||
with fp:
|
||||
md5sum = md5()
|
||||
while True:
|
||||
data = fp.read(65536)
|
||||
if not data:
|
||||
break
|
||||
md5sum.update(data)
|
||||
checksum = md5sum.hexdigest()
|
||||
try:
|
||||
headers = conn.head_object(container, obj)
|
||||
cl = int(headers.get('content-length'))
|
||||
mt = headers.get('x-object-meta-mtime')
|
||||
if (options.skip_identical and
|
||||
checksum == headers.get('etag')):
|
||||
thread_manager.print_msg(
|
||||
"Skipped identical file '%s'", path)
|
||||
return
|
||||
if options.changed and cl == getsize(path) and \
|
||||
mt == put_headers['x-object-meta-mtime']:
|
||||
return
|
||||
|
Loading…
x
Reference in New Issue
Block a user