Bulk upload: treat user xattrs as object metadata

Currently, if you PUT a single object, then you can also associate
metadata with it by putting it in the request headers, prefixed with
"X-Object-Meta-". However, if you're bulk-uploading objects, then you
have no way to assign any metadata.

The tar file format* allows for arbitrary UTF-8 key/value pairs to be
associated with each file in an archive (as well as with the archive
itself, but we don't care about that here). If a file has extended
attributes, then tar will store those as key/value pairs.

This commit makes bulk upload read those extended attributes, if
present, and convert them to Swift object metadata. Attributes
starting with "user.meta." are converted to object metadata, and
"user.mime_type"** is converted to Content-Type.

For example, if you have a file "setup.py":

    $ setfattr -n user.mime_type -v "application/python-setup" setup.py
    $ setfattr -n user.meta.lunch -v "burger and fries" setup.py
    $ setfattr -n user.meta.dinner -v "baked ziti" setup.py
    $ setfattr -n user.stuff -v "whee" setup.py

This will get translated to headers:

    Content-Type: application/python-setup
    X-Object-Meta-Lunch: burger and fries
    X-Object-Meta-Dinner: baked ziti

Swift will handle xattrs stored by both GNU and BSD tar***. Only
xattrs user.mime_type and user.meta.* are processed; others are
ignored.

This brings bulk upload much closer to feature-parity with non-bulk upload.

* The POSIX 1003.1-2001 (pax) format, at least. There are a few
  different, mutually-incompatible tar formats out there, because of
  course there are. This is the default format on GNU tar 1.27.1 or
  later.

** http://standards.freedesktop.org/shared-mime-info-spec/latest/ar01s02.html#idm140622087713936

*** Even with pax-format tarballs, different encoders store xattrs
    slightly differently; for example, GNU tar stores the xattr
    "user.rubberducky" as pax header "SCHILY.xattr.user.rubberducky",
    while BSD tar (which uses libarchive) stores it as
    "LIBARCHIVE.xattr.user.rubberducky". One might wonder if this is
    some programmer's attempt at job security.

Change-Id: I5e3ce87d31054f5239e86d47c45adbde2bb93640
This commit is contained in:
Samuel Merritt 2015-04-21 17:38:04 -07:00
parent 843236a635
commit 215cd551df
2 changed files with 129 additions and 1 deletions
swift/common/middleware
test/unit/common/middleware

@ -75,6 +75,23 @@ def get_response_body(data_format, data_dict, error_list):
return output
def pax_key_to_swift_header(pax_key):
    """
    Map a pax extended-header key to the Swift header it should set.

    GNU tar stores the xattr "user.foo" under the pax key
    "SCHILY.xattr.user.foo"; BSD tar (libarchive) stores it under
    "LIBARCHIVE.xattr.user.foo". Both spellings are recognized.

    :param pax_key: pax header key, a text (unicode) string
    :returns: "Content-Type" for the user.mime_type xattr,
              "X-Object-Meta-<name>" for a user.meta.<name> xattr, or
              None for any other key. The result is always a native
              string: UTF-8 bytes on Python 2, and on Python 3 those same
              bytes decoded as Latin-1, so that string concatenation does
              not mix str and bytes.
    """
    mime_type_keys = (u"SCHILY.xattr.user.mime_type",
                      u"LIBARCHIVE.xattr.user.mime_type")
    meta_prefixes = (u"SCHILY.xattr.user.meta.",
                     u"LIBARCHIVE.xattr.user.meta.")

    if pax_key in mime_type_keys:
        return "Content-Type"

    for prefix in meta_prefixes:
        if pax_key.startswith(prefix):
            useful_part = pax_key[len(prefix):].encode("utf-8")
            if not isinstance(useful_part, str):
                # Python 3: encode() produced bytes, which cannot be
                # appended to a str. Decode as Latin-1 to obtain the
                # native-string form carrying the same UTF-8 bytes.
                useful_part = useful_part.decode("latin-1")
            return "X-Object-Meta-" + useful_part

    # You can get things like atime/mtime/ctime or filesystem ACLs in
    # pax headers; those aren't really user metadata. The same goes for
    # other, non-user metadata.
    return None
class Bulk(object):
"""
Middleware that will do many operations on a single request.
@ -464,6 +481,16 @@ class Bulk(object):
new_env['HTTP_USER_AGENT'] = \
'%s BulkExpand' % req.environ.get('HTTP_USER_AGENT')
create_obj_req = Request.blank(destination, new_env)
for pax_key, pax_value in tar_info.pax_headers.items():
header_name = pax_key_to_swift_header(pax_key)
if header_name:
# Both pax_key and pax_value are unicode
# strings; the key is already UTF-8 encoded, but
# we still have to encode the value.
create_obj_req.headers[header_name] = \
pax_value.encode("utf-8")
resp = create_obj_req.get_response(self.app)
containers_accessed.add(container)
if resp.is_success:

@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2012 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -25,9 +26,11 @@ from tempfile import mkdtemp
from StringIO import StringIO
from eventlet import sleep
from mock import patch, call
from test.unit.common.middleware.helpers import FakeSwift
from swift.common import utils, constraints
from swift.common.middleware import bulk
from swift.common.swob import Request, Response, HTTPException
from swift.common.swob import Request, Response, HTTPException, \
HTTPNoContent, HTTPCreated, HeaderKeyDict
from swift.common.http import HTTP_NOT_FOUND, HTTP_UNAUTHORIZED
@ -126,6 +129,104 @@ def build_tar_tree(tar, start_path, tree_obj, base_path=''):
tar.addfile(tar_info)
class TestUntarMetadata(unittest.TestCase):
    """
    Verify that bulk extraction turns pax xattr headers found in an
    uploaded tarball into headers on the per-object PUT requests.
    """

    def setUp(self):
        # FakeSwift stands in for the rest of the pipeline; the test later
        # inspects the calls it received via calls_with_headers.
        self.app = FakeSwift()
        self.bulk = bulk.filter_factory({})(self.app)
        self.testdir = mkdtemp(suffix='tmp_test_bulk')

    def tearDown(self):
        rmtree(self.testdir, ignore_errors=True)

    def test_extract_metadata(self):
        """
        Upload a pax-format tarball holding two files, one tagged with
        GNU-tar-style keys and one with BSD-tar-style keys, and check the
        headers of the resulting object PUTs.
        """
        self.app.register('HEAD', '/v1/a/c?extract-archive=tar',
                          HTTPNoContent, {}, None)
        self.app.register('PUT', '/v1/a/c/obj1?extract-archive=tar',
                          HTTPCreated, {}, None)
        self.app.register('PUT', '/v1/a/c/obj2?extract-archive=tar',
                          HTTPCreated, {}, None)

        # It's a real pain to instantiate TarInfo objects directly; they
        # really want to come from a file on disk or a tarball. So, we write
        # out some files and add pax headers to them as they get placed into
        # the tarball.
        with open(os.path.join(self.testdir, "obj1"), "w") as fh1:
            fh1.write("obj1 contents\n")
        with open(os.path.join(self.testdir, "obj2"), "w") as fh2:
            fh2.write("obj2 contents\n")

        tar_ball = StringIO()
        tar_file = tarfile.TarFile.open(fileobj=tar_ball, mode="w",
                                        format=tarfile.PAX_FORMAT)

        # With GNU tar 1.27.1 or later (possibly 1.27 as well), a file with
        # extended attribute user.thingy = dingy gets put into the tarfile
        # with pax_headers containing key/value pair
        # (SCHILY.xattr.user.thingy, dingy), both unicode strings (py2: type
        # unicode, not type str).
        #
        # With BSD tar (libarchive), you get key/value pair
        # (LIBARCHIVE.xattr.user.thingy, dingy), which strikes me as
        # gratuitous incompatibility.
        #
        # Still, we'll support uploads with both. Just heap more code on the
        # problem until you can forget it's under there.
        with open(os.path.join(self.testdir, "obj1")) as fh1:
            tar_info1 = tar_file.gettarinfo(fileobj=fh1,
                                            arcname="obj1")
            tar_info1.pax_headers[u'SCHILY.xattr.user.mime_type'] = \
                u'application/food-diary'
            tar_info1.pax_headers[u'SCHILY.xattr.user.meta.lunch'] = \
                u'sopa de albóndigas'
            tar_info1.pax_headers[
                u'SCHILY.xattr.user.meta.afternoon-snack'] = \
                u'gigantic bucket of coffee'
            tar_file.addfile(tar_info1, fh1)

        with open(os.path.join(self.testdir, "obj2")) as fh2:
            tar_info2 = tar_file.gettarinfo(fileobj=fh2,
                                            arcname="obj2")
            tar_info2.pax_headers[
                u'LIBARCHIVE.xattr.user.meta.muppet'] = u'bert'
            tar_info2.pax_headers[
                u'LIBARCHIVE.xattr.user.meta.cat'] = u'fluffy'
            tar_info2.pax_headers[
                u'LIBARCHIVE.xattr.user.notmeta'] = u'skipped'
            tar_file.addfile(tar_info2, fh2)

        # Finalize the archive so the uploaded body carries the
        # end-of-archive blocks, like a tarball produced by a real tar
        # program. Closing the TarFile leaves the caller-supplied
        # tar_ball buffer open.
        tar_file.close()

        tar_ball.seek(0)
        req = Request.blank('/v1/a/c?extract-archive=tar')
        req.environ['REQUEST_METHOD'] = 'PUT'
        req.environ['wsgi.input'] = tar_ball
        req.headers['transfer-encoding'] = 'chunked'
        req.headers['accept'] = 'application/json;q=1.0'

        resp = req.get_response(self.bulk)
        self.assertEqual(resp.status_int, 200)

        # sanity check to make sure the upload worked
        upload_status = utils.json.loads(resp.body)
        self.assertEqual(upload_status['Number Files Created'], 2)

        # Calls 1 and 2 are taken to be the PUTs for obj1 and obj2;
        # presumably call 0 is the container HEAD registered above.
        put1_headers = HeaderKeyDict(self.app.calls_with_headers[1][2])
        self.assertEqual(
            put1_headers.get('Content-Type'),
            'application/food-diary')
        # The non-ASCII value is expected as UTF-8 encoded bytes.
        self.assertEqual(
            put1_headers.get('X-Object-Meta-Lunch'),
            'sopa de alb\xc3\xb3ndigas')
        self.assertEqual(
            put1_headers.get('X-Object-Meta-Afternoon-Snack'),
            'gigantic bucket of coffee')

        put2_headers = HeaderKeyDict(self.app.calls_with_headers[2][2])
        self.assertEqual(put2_headers.get('X-Object-Meta-Muppet'), 'bert')
        self.assertEqual(put2_headers.get('X-Object-Meta-Cat'), 'fluffy')
        # obj2 carries no mime_type xattr, and its "user.notmeta" xattr is
        # outside the user.meta. namespace, so neither yields a header.
        self.assertEqual(put2_headers.get('Content-Type'), None)
        self.assertEqual(put2_headers.get('X-Object-Meta-Blah'), None)
class TestUntar(unittest.TestCase):
def setUp(self):