From bb716573ab5c8455348ec013feb894421e0e1f1c Mon Sep 17 00:00:00 2001
From: Tim Burke <tim.burke@gmail.com>
Date: Wed, 20 May 2015 00:39:41 -0700
Subject: [PATCH] Allow SLO PUTs to forgo per-segment integrity checks

While manifests still require 'etag' and 'size_bytes' fields for each
segment (to catch user errors like 'etaf' or 'size_btyes'), an explicit
null for either will skip that particular integrity check and instead
use whatever value is retrieved when HEADing the segment. So, if a user
uploads a manifest like:

    [{"path": "/con/obj_seg_1", "etag": null, "size_bytes": 1048576},
     {"path": "/con/obj_seg_2", "etag": "etag2", "size_bytes": null},
     {"path": "/con/obj_seg_3", "etag": null, "size_bytes": null}]

then the etag will only be verified for the /con/obj_seg_2 segment,
and the segment size will only be verified for the /con/obj_seg_1
segment. However, the manifest that's ultimately stored (and can be
retrieved with a ?multipart-manifest=get query-string) will still look
like:

    [{"name": "/con/obj_seg_1", "hash": "etag1", "bytes": 1048576, ...},
     {"name": "/con/obj_seg_2", "hash": "etag2", "bytes": 1048576, ...},
     {"name": "/con/obj_seg_3", "hash": "etag3", "bytes": 1234, ...}]

This allows the middleware to continue performing integrity checks on
object GET.

Change-Id: I2c4e585221387dd02a8679a50398d6b614407b12
DocImpact
---
 swift/common/middleware/slo.py          | 54 ++++++++++++-----
 test/functional/tests.py                | 79 +++++++++++++++++++++++++
 test/unit/common/middleware/test_slo.py | 55 +++++++++++++++++
 3 files changed, 172 insertions(+), 16 deletions(-)

diff --git a/swift/common/middleware/slo.py b/swift/common/middleware/slo.py
index d8df829981..3c3ad7feac 100644
--- a/swift/common/middleware/slo.py
+++ b/swift/common/middleware/slo.py
@@ -36,8 +36,8 @@ json data format. The data to be supplied for each segment is::
 
     path: the path to the segment (not including account)
           /container/object_name
-    etag: the etag given back when the segment was PUT
-    size_bytes: the size of the segment in bytes
+    etag: the etag given back when the segment was PUT, or null
+    size_bytes: the size of the segment in bytes, or null
 
 The format of the list will be::
 
@@ -48,15 +48,25 @@ The format of the list will be::
 
 The number of object segments is limited to a configurable amount, default
 1000. Each segment, except for the final one, must be at least 1 megabyte
-(configurable). On upload, the middleware will head every segment passed in and
-verify the size and etag of each. If any of the objects do not match (not
+(configurable). On upload, the middleware will head every segment passed in to
+verify:
+
+ 1. the segment exists (i.e. the HEAD was successful);
+ 2. the segment meets minimum size requirements (if not the last segment);
+ 3. if the user provided a non-null etag, the etag matches; and
+ 4. if the user provided a non-null size_bytes, the size_bytes matches.
+
+Note that the etag and size_bytes keys are still required; this acts as a guard
+against user errors such as typos. If any of the objects fail to verify (not
 found, size/etag mismatch, below minimum size) then the user will receive a 4xx
 error response. If everything does match, the user will receive a 2xx response
 and the SLO object is ready for downloading.
 
 Behind the scenes, on success, a json manifest generated from the user input is
 sent to object servers with an extra "X-Static-Large-Object: True" header
-and a modified Content-Type. The parameter: swift_bytes=$total_size will be
+and a modified Content-Type. The items in this manifest will include the etag
+and size_bytes for each segment, regardless of whether the client specified
+them for verification. The parameter: swift_bytes=$total_size will be
 appended to the existing Content-Type, where total_size is the sum of all
 the included segments' size_bytes. This extra parameter will be hidden from
 the user.
@@ -73,9 +83,11 @@ Retrieving a Large Object
 
 A GET request to the manifest object will return the concatenation of the
 objects from the manifest much like DLO. If any of the segments from the
-manifest are not found or their Etag/Content Length no longer match the
-connection will drop. In this case a 409 Conflict will be logged in the proxy
-logs and the user will receive incomplete results.
+manifest are not found or their Etag/Content Length have changed since upload,
+the connection will drop. In this case a 409 Conflict will be logged in the
+proxy logs and the user will receive incomplete results. Note that this will be
+enforced regardless of whether the user perfomed per-segment validation during
+upload.
 
 The headers from this GET or HEAD request will return the metadata attached
 to the manifest object itself with some exceptions::
@@ -594,8 +606,11 @@ class StaticLargeObject(object):
             try:
                 seg_size = int(seg_dict['size_bytes'])
             except (ValueError, TypeError):
-                raise HTTPBadRequest('Invalid Manifest File')
-            if seg_size < self.min_segment_size and \
+                if seg_dict['size_bytes'] is None:
+                    seg_size = None
+                else:
+                    raise HTTPBadRequest('Invalid Manifest File')
+            if seg_size is not None and seg_size < self.min_segment_size and \
                     index < len(parsed_data) - 1:
                 raise HTTPBadRequest(
                     'Each segment, except the last, must be at least '
@@ -613,11 +628,18 @@ class StaticLargeObject(object):
             head_seg_resp = \
                 Request.blank(obj_path, new_env).get_response(self)
             if head_seg_resp.is_success:
-                total_size += seg_size
-                if seg_size != head_seg_resp.content_length:
+                if head_seg_resp.content_length < self.min_segment_size and \
+                        index < len(parsed_data) - 1:
+                    raise HTTPBadRequest(
+                        'Each segment, except the last, must be at least '
+                        '%d bytes.' % self.min_segment_size)
+                total_size += head_seg_resp.content_length
+                if seg_size is not None and \
+                        seg_size != head_seg_resp.content_length:
                     problem_segments.append([quote(obj_name), 'Size Mismatch'])
-                if seg_dict['etag'] == head_seg_resp.etag:
-                    slo_etag.update(seg_dict['etag'])
+                if seg_dict['etag'] is None or \
+                        seg_dict['etag'] == head_seg_resp.etag:
+                    slo_etag.update(head_seg_resp.etag)
                 else:
                     problem_segments.append([quote(obj_name), 'Etag Mismatch'])
                 if head_seg_resp.last_modified:
@@ -629,8 +651,8 @@ class StaticLargeObject(object):
                 last_modified_formatted = \
                     last_modified.strftime('%Y-%m-%dT%H:%M:%S.%f')
                 seg_data = {'name': '/' + seg_dict['path'].lstrip('/'),
-                            'bytes': seg_size,
-                            'hash': seg_dict['etag'],
+                            'bytes': head_seg_resp.content_length,
+                            'hash': head_seg_resp.etag,
                             'content_type': head_seg_resp.content_type,
                             'last_modified': last_modified_formatted}
                 if config_true_value(
diff --git a/test/functional/tests.py b/test/functional/tests.py
index 3fbbdd784e..df96e5c4da 100644
--- a/test/functional/tests.py
+++ b/test/functional/tests.py
@@ -2152,6 +2152,15 @@ class TestSloEnv(object):
                 seg_info['seg_e']]),
             parms={'multipart-manifest': 'put'})
 
+        file_item = cls.container.file("manifest-db")
+        file_item.write(
+            json.dumps([
+                {'path': seg_info['seg_d']['path'], 'etag': None,
+                 'size_bytes': None},
+                {'path': seg_info['seg_b']['path'], 'etag': None,
+                 'size_bytes': None},
+            ]), parms={'multipart-manifest': 'put'})
+
 
 class TestSlo(Base):
     env = TestSloEnv
@@ -2259,6 +2268,52 @@ class TestSlo(Base):
         else:
             self.fail("Expected ResponseError but didn't get it")
 
+    def test_slo_unspecified_etag(self):
+        file_item = self.env.container.file("manifest-a-unspecified-etag")
+        file_item.write(
+            json.dumps([{
+                'size_bytes': 1024 * 1024,
+                'etag': None,
+                'path': '/%s/%s' % (self.env.container.name, 'seg_a')}]),
+            parms={'multipart-manifest': 'put'})
+        self.assert_status(201)
+
+    def test_slo_unspecified_size(self):
+        file_item = self.env.container.file("manifest-a-unspecified-size")
+        file_item.write(
+            json.dumps([{
+                'size_bytes': None,
+                'etag': hashlib.md5('a' * 1024 * 1024).hexdigest(),
+                'path': '/%s/%s' % (self.env.container.name, 'seg_a')}]),
+            parms={'multipart-manifest': 'put'})
+        self.assert_status(201)
+
+    def test_slo_missing_etag(self):
+        file_item = self.env.container.file("manifest-a-missing-etag")
+        try:
+            file_item.write(
+                json.dumps([{
+                    'size_bytes': 1024 * 1024,
+                    'path': '/%s/%s' % (self.env.container.name, 'seg_a')}]),
+                parms={'multipart-manifest': 'put'})
+        except ResponseError as err:
+            self.assertEqual(400, err.status)
+        else:
+            self.fail("Expected ResponseError but didn't get it")
+
+    def test_slo_missing_size(self):
+        file_item = self.env.container.file("manifest-a-missing-size")
+        try:
+            file_item.write(
+                json.dumps([{
+                    'etag': hashlib.md5('a' * 1024 * 1024).hexdigest(),
+                    'path': '/%s/%s' % (self.env.container.name, 'seg_a')}]),
+                parms={'multipart-manifest': 'put'})
+        except ResponseError as err:
+            self.assertEqual(400, err.status)
+        else:
+            self.fail("Expected ResponseError but didn't get it")
+
     def test_slo_overwrite_segment_with_manifest(self):
         file_item = self.env.container.file("seg_b")
         try:
@@ -2367,6 +2422,30 @@ class TestSlo(Base):
         except ValueError:
             self.fail("GET with multipart-manifest=get got invalid json")
 
+    def test_slo_get_the_manifest_with_details_from_server(self):
+        manifest = self.env.container.file("manifest-db")
+        got_body = manifest.read(parms={'multipart-manifest': 'get'})
+
+        self.assertEqual('application/json; charset=utf-8',
+                         manifest.content_type)
+        try:
+            value = json.loads(got_body)
+        except ValueError:
+            self.fail("GET with multipart-manifest=get got invalid json")
+
+        self.assertEqual(len(value), 2)
+        self.assertEqual(value[0]['bytes'], 1024 * 1024)
+        self.assertEqual(value[0]['hash'],
+                         hashlib.md5('d' * 1024 * 1024).hexdigest())
+        self.assertEqual(value[0]['name'],
+                         '/%s/seg_d' % self.env.container.name.decode("utf-8"))
+
+        self.assertEqual(value[1]['bytes'], 1024 * 1024)
+        self.assertEqual(value[1]['hash'],
+                         hashlib.md5('b' * 1024 * 1024).hexdigest())
+        self.assertEqual(value[1]['name'],
+                         '/%s/seg_b' % self.env.container.name.decode("utf-8"))
+
     def test_slo_head_the_manifest(self):
         manifest = self.env.container.file("manifest-abcde")
         got_info = manifest.info(parms={'multipart-manifest': 'get'})
diff --git a/test/unit/common/middleware/test_slo.py b/test/unit/common/middleware/test_slo.py
index d70a25ccc4..86a11734d3 100644
--- a/test/unit/common/middleware/test_slo.py
+++ b/test/unit/common/middleware/test_slo.py
@@ -441,6 +441,61 @@ class TestSloPutManifest(SloTestCase):
         self.assertEqual(status, '409 Conflict')
         self.assertEqual(self.app.call_count, 1)
 
+    def test_handle_multipart_put_skip_size_check(self):
+        good_data = json.dumps(
+            [{'path': '/checktest/a_1', 'etag': 'a', 'size_bytes': None},
+             {'path': '/checktest/b_2', 'etag': 'b', 'size_bytes': None}])
+        req = Request.blank(
+            '/v1/AUTH_test/checktest/man_3?multipart-manifest=put',
+            environ={'REQUEST_METHOD': 'PUT'}, body=good_data)
+        status, headers, body = self.call_slo(req)
+        self.assertEquals(self.app.call_count, 3)
+
+        # Check that we still populated the manifest properly from our HEADs
+        req = Request.blank(
+            # this string looks weird, but it's just an artifact
+            # of FakeSwift
+            '/v1/AUTH_test/checktest/man_3?multipart-manifest=put',
+            environ={'REQUEST_METHOD': 'GET'})
+        status, headers, body = self.call_app(req)
+        manifest_data = json.loads(body)
+        self.assertEquals(1, manifest_data[0]['bytes'])
+        self.assertEquals(2, manifest_data[1]['bytes'])
+
+    def test_handle_multipart_put_skip_size_check_still_uses_min_size(self):
+        with patch.object(self.slo, 'min_segment_size', 50):
+            test_json_data = json.dumps([{'path': '/cont/small_object',
+                                          'etag': 'etagoftheobjectsegment',
+                                          'size_bytes': None},
+                                         {'path': '/cont/small_object',
+                                          'etag': 'etagoftheobjectsegment',
+                                          'size_bytes': 100}])
+            req = Request.blank('/v1/AUTH_test/c/o', body=test_json_data)
+            with self.assertRaises(HTTPException) as cm:
+                self.slo.handle_multipart_put(req, fake_start_response)
+            self.assertEquals(cm.exception.status_int, 400)
+
+    def test_handle_multipart_put_skip_etag_check(self):
+        good_data = json.dumps(
+            [{'path': '/checktest/a_1', 'etag': None, 'size_bytes': 1},
+             {'path': '/checktest/b_2', 'etag': None, 'size_bytes': 2}])
+        req = Request.blank(
+            '/v1/AUTH_test/checktest/man_3?multipart-manifest=put',
+            environ={'REQUEST_METHOD': 'PUT'}, body=good_data)
+        status, headers, body = self.call_slo(req)
+        self.assertEquals(self.app.call_count, 3)
+
+        # Check that we still populated the manifest properly from our HEADs
+        req = Request.blank(
+            # this string looks weird, but it's just an artifact
+            # of FakeSwift
+            '/v1/AUTH_test/checktest/man_3?multipart-manifest=put',
+            environ={'REQUEST_METHOD': 'GET'})
+        status, headers, body = self.call_app(req)
+        manifest_data = json.loads(body)
+        self.assertEquals('a', manifest_data[0]['hash'])
+        self.assertEquals('b', manifest_data[1]['hash'])
+
 
 class TestSloDeleteManifest(SloTestCase):