#!/usr/bin/env python
# Copyright (c) 2010-2012 OpenStack Foundation
|
2013-06-13 11:24:29 -07:00
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import email.parser
import itertools
import random
import time
import unittest
from collections import defaultdict
from contextlib import contextmanager
import json
from hashlib import md5

import mock
from eventlet import Timeout
from six import BytesIO
from six.moves import range

import swift
from swift.common import utils, swob, exceptions
from swift.common.header_key_dict import HeaderKeyDict
from swift.proxy import server as proxy_server
from swift.proxy.controllers import obj
from swift.proxy.controllers.base import \
    get_container_info as _real_get_container_info
from swift.common.storage_policy import POLICIES, ECDriverError, StoragePolicy

from test.unit import FakeRing, FakeMemcache, fake_http_connect, \
    debug_logger, patch_policies, SlowBody, FakeStatus
from test.unit.proxy.test_server import node_error_count


def unchunk_body(chunked_body):
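    r"""
    Expand an HTTP chunked-transfer-encoded body into its raw payload.

    Each chunk is a hex length, CRLF, payload, CRLF, and a zero-length
    chunk ends the body; e.g. unchunk_body('3\r\nabc\r\n0\r\n\r\n')
    returns 'abc'.
    """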
    body = ''
    remaining = chunked_body
    while remaining:
        hex_length, remaining = remaining.split('\r\n', 1)
        length = int(hex_length, 16)
        body += remaining[:length]
        remaining = remaining[length + 2:]
    return body


@contextmanager
def set_http_connect(*args, **kwargs):
    old_connect = swift.proxy.controllers.base.http_connect
    new_connect = fake_http_connect(*args, **kwargs)
    try:
        swift.proxy.controllers.base.http_connect = new_connect
        swift.proxy.controllers.obj.http_connect = new_connect
        swift.proxy.controllers.account.http_connect = new_connect
        swift.proxy.controllers.container.http_connect = new_connect
        yield new_connect
        left_over_status = list(new_connect.code_iter)
        if left_over_status:
            raise AssertionError('left over status %r' % left_over_status)
    finally:
        swift.proxy.controllers.base.http_connect = old_connect
        swift.proxy.controllers.obj.http_connect = old_connect
        swift.proxy.controllers.account.http_connect = old_connect
        swift.proxy.controllers.container.http_connect = old_connect
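
# A minimal usage sketch (assumed, for illustration): each status passed in
# is consumed by one expected backend request, and any leftover status makes
# the context manager raise AssertionError on exit:
#
#   with set_http_connect(200, 200, 201, 201, 201):
#       resp = req.get_response(app)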


class PatchedObjControllerApp(proxy_server.Application):
"""
|
Foundational support for PUT and GET of erasure-coded objects
This commit makes it possible to PUT an object into Swift and have it
stored using erasure coding instead of replication, and also to GET
the object back from Swift at a later time.
This works by splitting the incoming object into a number of segments,
erasure-coding each segment in turn to get fragments, then
concatenating the fragments into fragment archives. Segments are 1 MiB
in size, except the last, which is between 1 B and 1 MiB.
+====================================================================+
| object data |
+====================================================================+
|
+------------------------+----------------------+
| | |
v v v
+===================+ +===================+ +==============+
| segment 1 | | segment 2 | ... | segment N |
+===================+ +===================+ +==============+
| |
| |
v v
/=========\ /=========\
| pyeclib | | pyeclib | ...
\=========/ \=========/
| |
| |
+--> fragment A-1 +--> fragment A-2
| |
| |
| |
| |
| |
+--> fragment B-1 +--> fragment B-2
| |
| |
... ...
Then, object server A gets the concatenation of fragment A-1, A-2,
..., A-N, so its .data file looks like this (called a "fragment archive"):
+=====================================================================+
| fragment A-1 | fragment A-2 | ... | fragment A-N |
+=====================================================================+
Since this means that the object server never sees the object data as
the client sent it, we have to do a few things to ensure data
integrity.
First, the proxy has to check the Etag if the client provided it; the
object server can't do it since the object server doesn't see the raw
data.
Second, if the client does not provide an Etag, the proxy computes it
and uses the MIME-PUT mechanism to provide it to the object servers
after the object body. Otherwise, the object would not have an Etag at
all.
Third, the proxy computes the MD5 of each fragment archive and sends
it to the object server using the MIME-PUT mechanism. With replicated
objects, the proxy checks that the Etags from all the object servers
match, and if they don't, returns a 500 to the client. This mitigates
the risk of data corruption in one of the proxy --> object connections,
and signals to the client when it happens. With EC objects, we can't
use that same mechanism, so we must send the checksum with each
fragment archive to get comparable protection.
On the GET path, the inverse happens: the proxy connects to a bunch of
object servers (M of them, for an M+K scheme), reads one fragment at a
time from each fragment archive, decodes those fragments into a
segment, and serves the segment to the client.
When an object server dies partway through a GET response, any
partially-fetched fragment is discarded, the resumption point is wound
back to the nearest fragment boundary, and the GET is retried with the
next object server.
GET requests for a single byterange work; GET requests for multiple
byteranges do not.
There are a number of things _not_ included in this commit. Some of
them are listed here:
* multi-range GET
* deferred cleanup of old .data files
* durability (daemon to reconstruct missing archives)
Co-Authored-By: Alistair Coles <alistair.coles@hp.com>
Co-Authored-By: Thiago da Silva <thiago@redhat.com>
Co-Authored-By: John Dickinson <me@not.mn>
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Co-Authored-By: Tushar Gohad <tushar.gohad@intel.com>
Co-Authored-By: Paul Luse <paul.e.luse@intel.com>
Co-Authored-By: Christian Schwede <christian.schwede@enovance.com>
Co-Authored-By: Yuan Zhou <yuan.zhou@intel.com>
Change-Id: I9c13c03616489f8eab7dcd7c5f21237ed4cb6fd2
2014-10-22 13:18:34 -07:00
|
|
|
This patch is just a hook over the proxy server's __call__ to ensure
|
Fix up get_account_info and get_container_info
get_account_info used to work like this:
* make an account HEAD request
* ignore the response
* get the account info by digging around in the request environment,
where it had been deposited by elves or something
Not actually elves, but the proxy's GETorHEAD_base method would take
the HEAD response and cache it in the response environment, which was
the same object as the request environment, thus enabling
get_account_info to find it.
This was extraordinarily brittle. If a WSGI middleware were to
shallow-copy the request environment, then any middlewares to its left
could not use get_account_info, as the left middleware's request
environment would no longer be identical to the response environment
down in GETorHEAD_base.
Now, get_account_info works like this:
* make an account HEAD request.
* if the account info is in the request environment, return it. This
is an optimization to avoid a double-set in memcached.
* else, compute the account info from the response headers, store it
in caches, and return it.
This is much easier to think about; get_account_info can get and cache
account info all on its own; the cache check and cache set are right
next to each other.
All the above is true for get_container_info as well.
get_info() is still around, but it's just a shim. It was trying to
unify get_account_info and get_container_info to exploit the
commonalities, but the number of times that "if container:" showed up
in get_info and its helpers really indicated that something was
wrong. I'd rather have two functions with some duplication than one
function with no duplication but a bunch of "if container:" branches.
Other things of note:
* a HEAD request to a deleted account returns 410, but
get_account_info would return 404 since the 410 came from the
account controller *after* GETorHEAD_base ran. Now
get_account_info returns 410 as well.
* cache validity period (recheck_account_existence and
recheck_container_existence) is now communicated to
get_account_info via an X-Backend header. This way,
get_account_info doesn't need a reference to the
swift.proxy.server.Application object.
* both logged swift_source values are now correct for
get_container_info calls; before, on a cold cache,
get_container_info would call get_account_info but not pass along
swift_source, resulting in get_account_info logging "GET_INFO" as
the source. Amusingly, there was a unit test asserting this bogus
behavior.
* callers that modify the return value of get_account_info or of
get_container_info don't modify what's stored in swift.infocache.
* get_account_info on an account that *can* be autocreated but has
not been will return a 200, same as a HEAD request. The old
behavior was a 404 from get_account_info but a 200 from
HEAD. Callers can tell the difference by looking at
info['account_really_exists'] if they need to know the difference
(there is one call site that needs to know, in container
PUT). Note: this is for all accounts when the proxy's
"account_autocreate" setting is on.
Change-Id: I5167714025ec7237f7e6dd4759c2c6eb959b3fca
2016-02-11 15:51:45 -08:00
|
|
|
that calls to get_container_info will return the stubbed value for
|
Foundational support for PUT and GET of erasure-coded objects
This commit makes it possible to PUT an object into Swift and have it
stored using erasure coding instead of replication, and also to GET
the object back from Swift at a later time.
This works by splitting the incoming object into a number of segments,
erasure-coding each segment in turn to get fragments, then
concatenating the fragments into fragment archives. Segments are 1 MiB
in size, except the last, which is between 1 B and 1 MiB.
+====================================================================+
| object data |
+====================================================================+
|
+------------------------+----------------------+
| | |
v v v
+===================+ +===================+ +==============+
| segment 1 | | segment 2 | ... | segment N |
+===================+ +===================+ +==============+
| |
| |
v v
/=========\ /=========\
| pyeclib | | pyeclib | ...
\=========/ \=========/
| |
| |
+--> fragment A-1 +--> fragment A-2
| |
| |
| |
| |
| |
+--> fragment B-1 +--> fragment B-2
| |
| |
... ...
Then, object server A gets the concatenation of fragment A-1, A-2,
..., A-N, so its .data file looks like this (called a "fragment archive"):
+=====================================================================+
| fragment A-1 | fragment A-2 | ... | fragment A-N |
+=====================================================================+
Since this means that the object server never sees the object data as
the client sent it, we have to do a few things to ensure data
integrity.
First, the proxy has to check the Etag if the client provided it; the
object server can't do it since the object server doesn't see the raw
data.
Second, if the client does not provide an Etag, the proxy computes it
and uses the MIME-PUT mechanism to provide it to the object servers
after the object body. Otherwise, the object would not have an Etag at
all.
Third, the proxy computes the MD5 of each fragment archive and sends
it to the object server using the MIME-PUT mechanism. With replicated
objects, the proxy checks that the Etags from all the object servers
match, and if they don't, returns a 500 to the client. This mitigates
the risk of data corruption in one of the proxy --> object connections,
and signals to the client when it happens. With EC objects, we can't
use that same mechanism, so we must send the checksum with each
fragment archive to get comparable protection.
On the GET path, the inverse happens: the proxy connects to a bunch of
object servers (M of them, for an M+K scheme), reads one fragment at a
time from each fragment archive, decodes those fragments into a
segment, and serves the segment to the client.
When an object server dies partway through a GET response, any
partially-fetched fragment is discarded, the resumption point is wound
back to the nearest fragment boundary, and the GET is retried with the
next object server.
GET requests for a single byterange work; GET requests for multiple
byteranges do not.
There are a number of things _not_ included in this commit. Some of
them are listed here:
* multi-range GET
* deferred cleanup of old .data files
* durability (daemon to reconstruct missing archives)
Co-Authored-By: Alistair Coles <alistair.coles@hp.com>
Co-Authored-By: Thiago da Silva <thiago@redhat.com>
Co-Authored-By: John Dickinson <me@not.mn>
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Co-Authored-By: Tushar Gohad <tushar.gohad@intel.com>
Co-Authored-By: Paul Luse <paul.e.luse@intel.com>
Co-Authored-By: Christian Schwede <christian.schwede@enovance.com>
Co-Authored-By: Yuan Zhou <yuan.zhou@intel.com>
Change-Id: I9c13c03616489f8eab7dcd7c5f21237ed4cb6fd2
2014-10-22 13:18:34 -07:00
|
|
|
container_info if it's a container info call.
|
2014-11-17 20:29:45 -08:00
|
|
|
"""

    container_info = {}
    per_container_info = {}
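
    # A usage sketch (assumed, for illustration): a test assigns these stubs
    # on its app instance before issuing requests, e.g.
    #
    #   app.container_info = {'status': 200, 'storage_policy': '1'}
    #   app.per_container_info = {'c1': {'status': 404}}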

    def __call__(self, *args, **kwargs):
        def _fake_get_container_info(env, app, swift_source=None):
            _vrs, account, container, _junk = utils.split_path(
                env['PATH_INFO'], 3, 4)

            # Seed the cache with our container info so that the real
            # get_container_info finds it.
            ic = env.setdefault('swift.infocache', {})
            cache_key = "container/%s/%s" % (account, container)

            old_value = ic.get(cache_key)

            # Copy the container info so we don't hand out a reference to a
            # mutable thing that's set up only once at compile time. Nothing
            # *should* mutate it, but it's better to be paranoid than wrong.
            if container in self.per_container_info:
                ic[cache_key] = self.per_container_info[container].copy()
            else:
                ic[cache_key] = self.container_info.copy()

            real_info = _real_get_container_info(env, app, swift_source)
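
            # Put the cache back the way it was so each request re-seeds it:
            # drop the entry if the key was absent before, else restore the
            # caller's original value.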
            if old_value is None:
                del ic[cache_key]
            else:
                ic[cache_key] = old_value
|
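
A minimal sketch of the new flow described above. The helper names
(make_account_head_request, headers_to_account_info, set_info_caches)
and the cache-key shape are placeholders invented for illustration,
not Swift's actual internals:

    def get_account_info(env, app, swift_source=None):
        # 1. make an account HEAD request (placeholder helper)
        resp = make_account_head_request(env, swift_source).get_response(app)

        account = env['PATH_INFO'].split('/')[2]
        cache_key = 'account/%s' % account
        infocache = env.setdefault('swift.infocache', {})

        # 2. if the info is already in this request's environment,
        #    return it and skip a redundant memcached set
        if cache_key in infocache:
            return infocache[cache_key]

        # 3. else, compute the info from the response headers (including
        #    410 for deleted accounts), cache it, and return it; cache
        #    validity arrives via an X-Backend header, not app config
        info = headers_to_account_info(resp.headers, resp.status_int)
        set_info_caches(env, cache_key, info)
        infocache[cache_key] = info
        return info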

            # tail of _fake_get_container_info: put back whatever was in
            # swift.infocache before the fake seeded it, then return the
            # info computed by the real get_container_info
            else:
                ic[cache_key] = old_value

            return real_info

        # patch both import sites so the proxy app and the controllers
        # each see the fake
        with mock.patch('swift.proxy.server.get_container_info',
                        new=_fake_get_container_info), \
                mock.patch('swift.proxy.controllers.base.get_container_info',
                           new=_fake_get_container_info):
            return super(
                PatchedObjControllerApp, self).__call__(*args, **kwargs)
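For readers without the full file: a hedged reconstruction of the
elided first half of _fake_get_container_info, inferred from the names
visible above. This is an illustration of the seed-and-restore idea,
not the original code:

    def make_fake_get_container_info(canned_info, real_get_container_info):
        # Build a fake that seeds the per-request cache with canned
        # container info, lets the real function pick it up, then
        # restores whatever the cache held before.
        def _fake_get_container_info(env, app, swift_source=None):
            ic = env.setdefault('swift.infocache', {})
            cache_key = 'container/a/c'   # the real code derives this
            old_value = ic.get(cache_key)
            ic[cache_key] = dict(canned_info)
            real_info = real_get_container_info(env, app, swift_source)
            if old_value is None:
                ic.pop(cache_key, None)
            else:
                ic[cache_key] = old_value
            return real_info
        return _fake_get_container_info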

class BaseObjectControllerMixin(object):
    container_info = {
        'status': 200,
        'write_acl': None,
        'read_acl': None,
        'storage_policy': None,
        'sync_key': None,
        'versions': None,
    }

    # this needs to be set on the test case
    controller_cls = None
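For context, a concrete test case wires the mixin up roughly like
this; the subclass and controller names below are assumptions based on
the controller_cls hook, not lines from this file:

    class TestReplicatedObjController(BaseObjectControllerMixin,
                                      unittest.TestCase):
        # each concrete TestCase picks the proxy controller under test
        controller_cls = obj.ReplicatedObjectController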

    def setUp(self):
        # set up fake rings with handoffs
        for policy in POLICIES:
            policy.object_ring.max_more_nodes = policy.object_ring.replicas

        self.logger = debug_logger('proxy-server')
        self.logger.thread_locals = ('txn1', '127.0.0.2')
        self.app = PatchedObjControllerApp(
            None, FakeMemcache(), account_ring=FakeRing(),
            container_ring=FakeRing(), logger=self.logger)
        # you can override the container_info just by setting it on the app
        # (see PatchedObjControllerApp for details)
        self.app.container_info = dict(self.container_info)
        # default policy and ring references
        self.policy = POLICIES.default
        self.obj_ring = self.policy.object_ring
        self._ts_iter = (utils.Timestamp(t) for t in
                         itertools.count(int(time.time())))
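Because setUp installs a fresh dict(self.container_info) on the app,
each test can tweak its own copy without leaking into the class-level
defaults. A hypothetical example (the test name and value are
invented):

    def test_versioned_container_put(self):
        # affects only this test's copy, not BaseObjectControllerMixin
        self.app.container_info['versions'] = 'versions-container'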

    def ts(self):
        return next(self._ts_iter)
    def replicas(self, policy=None):
        policy = policy or POLICIES.default
        return policy.object_ring.replicas

    def quorum(self, policy=None):
        policy = policy or POLICIES.default
        return policy.quorum
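These two helpers keep shared assertions policy-aware. The exact
numbers depend on the policy under test, so the following is an
illustrative assumption rather than a rule from this file:

    # Illustrative only: for a 3-replica policy, replicas() == 3 and
    # quorum() is the simple majority, 2. For an EC policy, replicas()
    # counts fragment archives (e.g. 14 for a 10+4 scheme), and
    # quorum() is tied to how many fragments are needed to
    # reconstruct the data.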

    def test_iter_nodes_local_first_noops_when_no_affinity(self):
        # this test needs a stable node order - most don't
        self.app.sort_nodes = lambda l: l
        controller = self.controller_cls(
            self.app, 'a', 'c', 'o')
        self.app.write_affinity_is_local_fn = None
        object_ring = self.app.get_object_ring(None)
        all_nodes = object_ring.get_part_nodes(1)
        all_nodes.extend(object_ring.get_more_nodes(1))

        local_first_nodes = list(controller.iter_nodes_local_first(
            object_ring, 1))

        self.maxDiff = None
        self.assertEqual(all_nodes, local_first_nodes)
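What these two affinity tests exercise, in miniature: with no affinity
function the iterator must preserve ring order, and with one it must
front-load local nodes. A toy sketch of that contract, not Swift's
implementation:

    def local_first(nodes, is_local=None, local_count=0):
        # No affinity configured: pass nodes through untouched.
        if is_local is None:
            return list(nodes)
        nodes = list(nodes)
        # Pull up to local_count local nodes to the front, keeping the
        # relative order of everything else.
        front = [n for n in nodes if is_local(n)][:local_count]
        rest = [n for n in nodes if n not in front]
        return front + rest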
def test_iter_nodes_local_first_moves_locals_first(self):
|
Foundational support for PUT and GET of erasure-coded objects
This commit makes it possible to PUT an object into Swift and have it
stored using erasure coding instead of replication, and also to GET
the object back from Swift at a later time.
This works by splitting the incoming object into a number of segments,
erasure-coding each segment in turn to get fragments, then
concatenating the fragments into fragment archives. Segments are 1 MiB
in size, except the last, which is between 1 B and 1 MiB.
+====================================================================+
| object data |
+====================================================================+
|
+------------------------+----------------------+
| | |
v v v
+===================+ +===================+ +==============+
| segment 1 | | segment 2 | ... | segment N |
+===================+ +===================+ +==============+
| |
| |
v v
/=========\ /=========\
| pyeclib | | pyeclib | ...
\=========/ \=========/
| |
| |
+--> fragment A-1 +--> fragment A-2
| |
| |
| |
| |
| |
+--> fragment B-1 +--> fragment B-2
| |
| |
... ...
Then, object server A gets the concatenation of fragment A-1, A-2,
..., A-N, so its .data file looks like this (called a "fragment archive"):
+=====================================================================+
| fragment A-1 | fragment A-2 | ... | fragment A-N |
+=====================================================================+
Since this means that the object server never sees the object data as
the client sent it, we have to do a few things to ensure data
integrity.
First, the proxy has to check the Etag if the client provided it; the
object server can't do it since the object server doesn't see the raw
data.
Second, if the client does not provide an Etag, the proxy computes it
and uses the MIME-PUT mechanism to provide it to the object servers
after the object body. Otherwise, the object would not have an Etag at
all.
Third, the proxy computes the MD5 of each fragment archive and sends
it to the object server using the MIME-PUT mechanism. With replicated
objects, the proxy checks that the Etags from all the object servers
match, and if they don't, returns a 500 to the client. This mitigates
the risk of data corruption in one of the proxy --> object connections,
and signals to the client when it happens. With EC objects, we can't
use that same mechanism, so we must send the checksum with each
fragment archive to get comparable protection.
On the GET path, the inverse happens: the proxy connects to a bunch of
object servers (M of them, for an M+K scheme), reads one fragment at a
time from each fragment archive, decodes those fragments into a
segment, and serves the segment to the client.
When an object server dies partway through a GET response, any
partially-fetched fragment is discarded, the resumption point is wound
back to the nearest fragment boundary, and the GET is retried with the
next object server.
GET requests for a single byterange work; GET requests for multiple
byteranges do not.
There are a number of things _not_ included in this commit. Some of
them are listed here:
* multi-range GET
* deferred cleanup of old .data files
* durability (daemon to reconstruct missing archives)
Co-Authored-By: Alistair Coles <alistair.coles@hp.com>
Co-Authored-By: Thiago da Silva <thiago@redhat.com>
Co-Authored-By: John Dickinson <me@not.mn>
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Co-Authored-By: Tushar Gohad <tushar.gohad@intel.com>
Co-Authored-By: Paul Luse <paul.e.luse@intel.com>
Co-Authored-By: Christian Schwede <christian.schwede@enovance.com>
Co-Authored-By: Yuan Zhou <yuan.zhou@intel.com>
Change-Id: I9c13c03616489f8eab7dcd7c5f21237ed4cb6fd2
2014-10-22 13:18:34 -07:00
|
|
|
controller = self.controller_cls(
|
|
|
|
self.app, 'a', 'c', 'o')
|
2013-08-31 23:13:15 -04:00
|
|
|
self.app.write_affinity_is_local_fn = (
|
|
|
|
lambda node: node['region'] == 1)
|
        # we'll write to one more than replica count local nodes
        self.app.write_affinity_node_count = lambda r: r + 1

        object_ring = self.app.get_object_ring(None)
        # make our fake ring have plenty of nodes, and not get limited
        # artificially by the proxy max request node count
        object_ring.max_more_nodes = 100000
Fix missing container update
On an object PUT request, the proxy server builds backend headers (e.g.
X-Container-Partition) that tell the object servers which container
server they should update. One set of backend headers is created per
container replica (i.e. with 3 replicas in the container ring, 3 sets
of backend headers are created).
In the EC case, Swift fans out fragment archives to the backend object
servers. The number of fragment archives is generally larger than the
number of container replicas, and the proxy server treats a request as
successful once a quorum of object servers has stored its fragment.
That can leave an orphaned object: one that is stored but whose
container is never updated.
For example, assume k=10, m=4, and 3 container replicas:
The proxy server attempts to open 14 backend streams, but
unfortunately the first 3 nodes return 507 (disk failure) and Swift
has no other disks.
The proxy then keeps 11 backend streams to store, and current Swift
considers that sufficient because it meets or exceeds the quorum
(right now k+1 suffices, i.e. 11 backend streams are enough to store).
However, none of those 11 streams carries the container update header,
so the request succeeds but the container is never updated.
This patch spreads container updates over up to quorum_size + 1 nodes
to ensure the updates happen. The approach adds a little container
update cost, since duplicated updates will occur, but quorum_size + 1
seems a reasonable price to pay (even in the replicated case) compared
with including the update headers on every object stream.
Swift will now work as follows:
For example:
k=10, m=4, quorum_size=11 (k+1), 3 replicas for the container.
CU: container update
CA: commit ack
The result looks like:
CU CU CU CU CU CU CU CU CU CU CU CU
[507, 507, 507, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201]
CA CA CA CA CA
In this case, at least 3 container updates are saved.
For another example:
7 replicated objects, quorum_size=4 (7//2+1), 3 replicas for the container.
CU: container update
CA: commit ack (201s for a successful PUT on replicated)
CU CU CU CU CU
[507, 507, 507, 201, 201, 201, 201]
CA CA CA CA
In this replicated case, at least 2 container updates are saved.
Cleaned up some unit tests so that modifying policies doesn't leak
between tests.
Co-Authored-By: John Dickinson <me@not.mn>
Co-Authored-By: Sam Merritt <sam@swiftstack.com>
Closes-Bug: #1460920
Change-Id: I04132858f44b42ee7ecf3b7994cb22a19d001d70
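To make the header spreading concrete, here is a minimal, hypothetical
sketch of the idea (the helper names and header contents are
illustrative, not Swift's actual API): per-container-replica update
headers are cycled over the first quorum_size + 1 backend object
requests so a quorum-successful PUT still updates the container.

import itertools


def quorum_size(num_streams, ec_k=None):
    # EC: k + 1 successful fragment archives suffice; replicated: majority.
    if ec_k is not None:
        return ec_k + 1
    return num_streams // 2 + 1


def spread_container_updates(backend_headers, container_update_headers,
                             quorum):
    # backend_headers: one dict per backend object request (14 for k=10, m=4).
    # container_update_headers: one dict per container replica (3 here).
    # Reuse the per-replica update headers round-robin over the first
    # quorum + 1 requests; duplicated updates are the accepted cost.
    updates = itertools.cycle(container_update_headers)
    for headers in backend_headers[:quorum + 1]:
        headers.update(next(updates))


# k=10, m=4: 14 backend requests, quorum_size = 11, so 12 of them carry a
# container update; even if the first three nodes fail with 507, updates
# for every container replica can still land on successful nodes.
backend = [{} for _ in range(14)]
replica_headers = [{'X-Container-Host': 'container-%d' % i} for i in range(3)]
spread_container_updates(backend, replica_headers, quorum_size(14, ec_k=10))
assert sum(1 for h in backend if h) == 12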
        # nothing magic about * 2 + 3, just a way to make it bigger
        self.app.request_node_count = lambda r: r * 2 + 3
        all_nodes = object_ring.get_part_nodes(1)
        all_nodes.extend(object_ring.get_more_nodes(1))
        # limit to the number we're going to look at in this request
        nodes_requested = self.app.request_node_count(object_ring.replicas)
        all_nodes = all_nodes[:nodes_requested]
        # make sure we have enough local nodes (sanity)
        all_local_nodes = [n for n in all_nodes if
                           self.app.write_affinity_is_local_fn(n)]
        self.assertTrue(len(all_local_nodes) >= self.replicas() + 1)

        # finally, create the local_first_nodes iter and flatten it out
        local_first_nodes = list(controller.iter_nodes_local_first(
            object_ring, 1))

        # the local nodes move up in the ordering
        self.assertEqual([1] * (self.replicas() + 1), [
            node['region'] for node in local_first_nodes[
                :self.replicas() + 1]])

        # we don't skip any nodes
        self.assertEqual(len(all_nodes), len(local_first_nodes))
        self.assertEqual(sorted(all_nodes), sorted(local_first_nodes))
    def test_iter_nodes_local_first_best_effort(self):
        controller = self.controller_cls(
            self.app, 'a', 'c', 'o')
        self.app.write_affinity_is_local_fn = (
            lambda node: node['region'] == 1)

        object_ring = self.app.get_object_ring(None)
        all_nodes = object_ring.get_part_nodes(1)
        all_nodes.extend(object_ring.get_more_nodes(1))

        local_first_nodes = list(controller.iter_nodes_local_first(
            object_ring, 1))

        # we won't have quite enough local nodes...
        self.assertEqual(len(all_nodes), self.replicas() +
                         POLICIES.default.object_ring.max_more_nodes)
        all_local_nodes = [n for n in all_nodes if
                           self.app.write_affinity_is_local_fn(n)]
        self.assertEqual(len(all_local_nodes), self.replicas())

        # but the local nodes we do have are at the front of the local iter
        first_n_local_first_nodes = local_first_nodes[:len(all_local_nodes)]
        self.assertEqual(sorted(all_local_nodes),
                         sorted(first_n_local_first_nodes))

        # but we *still* don't *skip* any nodes
        self.assertEqual(len(all_nodes), len(local_first_nodes))
        self.assertEqual(sorted(all_nodes), sorted(local_first_nodes))
    def test_connect_put_node_timeout(self):
        controller = self.controller_cls(
            self.app, 'a', 'c', 'o')
        self.app.conn_timeout = 0.05
        with set_http_connect(slow_connect=True):
            nodes = [dict(ip='', port='', device='')]
            res = controller._connect_put_node(nodes, '', '', {}, ('', ''))
        self.assertTrue(res is None)
    def test_DELETE_simple(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='DELETE')
        codes = [204] * self.replicas()
        with set_http_connect(*codes):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 204)
    def test_DELETE_missing_one(self):
        # Obviously this test doesn't work if we're testing 1 replica.
        # In that case, we don't have any failovers to check.
        if self.replicas() == 1:
            return
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='DELETE')
        codes = [404] + [204] * (self.replicas() - 1)
        random.shuffle(codes)
        with set_http_connect(*codes):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 204)
    def test_DELETE_not_found(self):
        # Obviously this test doesn't work if we're testing 1 replica.
        # In that case, we don't have any failovers to check.
        if self.replicas() == 1:
            return
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='DELETE')
        codes = [404] * (self.replicas() - 1) + [204]
        with set_http_connect(*codes):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 404)

    def test_DELETE_mostly_found(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='DELETE')
        mostly_204s = [204] * self.quorum()
        codes = mostly_204s + [404] * (self.replicas() - len(mostly_204s))
        self.assertEqual(len(codes), self.replicas())
        with set_http_connect(*codes):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 204)

    def test_DELETE_mostly_not_found(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='DELETE')
        mostly_404s = [404] * self.quorum()
        codes = mostly_404s + [204] * (self.replicas() - len(mostly_404s))
        self.assertEqual(len(codes), self.replicas())
        with set_http_connect(*codes):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 404)
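The two tests above pin down the proxy's quorum behaviour for DELETE:
whichever status a quorum of backends agrees on becomes the client response.
A rough sketch of that selection follows; it captures only the intuition the
tests encode (the proxy's real best_response logic also weighs status
classes), and the helper name and 503 fallback are illustrative assumptions.

from collections import Counter

def pick_status(statuses, quorum):
    # Return the status most backends agree on, provided it reaches
    # quorum; otherwise fall back to 503 (sketch only).
    status, count = Counter(statuses).most_common(1)[0]
    return status if count >= quorum else 503

# The tests above in miniature: a quorum of 204s wins, a quorum of 404s wins.
assert pick_status([204] * 4 + [404] * 3, quorum=4) == 204
assert pick_status([404] * 4 + [204] * 3, quorum=4) == 404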

    def test_DELETE_half_not_found_statuses(self):
        self.obj_ring.set_replicas(4)

        req = swift.common.swob.Request.blank('/v1/a/c/o', method='DELETE')
        with set_http_connect(404, 204, 404, 204):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 204)

    def test_DELETE_half_not_found_headers_and_body(self):
        # Transformed responses have bogus bodies and headers, so make sure we
        # send the client headers and body from a real node's response.
        self.obj_ring.set_replicas(4)

        status_codes = (404, 404, 204, 204)
        bodies = ('not found', 'not found', '', '')
        headers = [{}, {}, {'Pick-Me': 'yes'}, {'Pick-Me': 'yes'}]

        req = swift.common.swob.Request.blank('/v1/a/c/o', method='DELETE')
        with set_http_connect(*status_codes, body_iter=bodies,
                              headers=headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 204)
        self.assertEqual(resp.headers.get('Pick-Me'), 'yes')
        self.assertEqual(resp.body, '')

    def test_DELETE_handoff(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='DELETE')
        codes = [204] * self.replicas()
        with set_http_connect(507, *codes):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 204)

    def test_POST_non_int_delete_after(self):
        t = str(int(time.time() + 100)) + '.1'
        req = swob.Request.blank('/v1/a/c/o', method='POST',
                                 headers={'Content-Type': 'foo/bar',
                                          'X-Delete-After': t})
        resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 400)
        self.assertEqual('Non-integer X-Delete-After', resp.body)

    def test_PUT_non_int_delete_after(self):
        t = str(int(time.time() + 100)) + '.1'
        req = swob.Request.blank('/v1/a/c/o', method='PUT', body='',
                                 headers={'Content-Type': 'foo/bar',
                                          'X-Delete-After': t})
        with set_http_connect():
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 400)
        self.assertEqual('Non-integer X-Delete-After', resp.body)

    def test_POST_negative_delete_after(self):
        req = swob.Request.blank('/v1/a/c/o', method='POST',
                                 headers={'Content-Type': 'foo/bar',
                                          'X-Delete-After': '-60'})
        resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 400)
        self.assertEqual('X-Delete-After in past', resp.body)

    def test_PUT_negative_delete_after(self):
        req = swob.Request.blank('/v1/a/c/o', method='PUT', body='',
                                 headers={'Content-Type': 'foo/bar',
                                          'X-Delete-After': '-60'})
        with set_http_connect():
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 400)
        self.assertEqual('X-Delete-After in past', resp.body)

    def test_POST_delete_at_non_integer(self):
        t = str(int(time.time() + 100)) + '.1'
        req = swob.Request.blank('/v1/a/c/o', method='POST',
                                 headers={'Content-Type': 'foo/bar',
                                          'X-Delete-At': t})
        resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 400)
        self.assertEqual('Non-integer X-Delete-At', resp.body)

    def test_PUT_delete_at_non_integer(self):
        t = str(int(time.time() - 100)) + '.1'
        req = swob.Request.blank('/v1/a/c/o', method='PUT', body='',
                                 headers={'Content-Type': 'foo/bar',
                                          'X-Delete-At': t})
        with set_http_connect():
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 400)
        self.assertEqual('Non-integer X-Delete-At', resp.body)

    def test_POST_delete_at_in_past(self):
        t = str(int(time.time() - 100))
        req = swob.Request.blank('/v1/a/c/o', method='POST',
                                 headers={'Content-Type': 'foo/bar',
                                          'X-Delete-At': t})
        resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 400)
        self.assertEqual('X-Delete-At in past', resp.body)

    def test_PUT_delete_at_in_past(self):
        t = str(int(time.time() - 100))
        req = swob.Request.blank('/v1/a/c/o', method='PUT', body='',
                                 headers={'Content-Type': 'foo/bar',
                                          'X-Delete-At': t})
        with set_http_connect():
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 400)
        self.assertEqual('X-Delete-At in past', resp.body)
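The eight tests above all exercise the same input validation: X-Delete-After
must be a non-negative integer number of seconds, and X-Delete-At an integer
Unix timestamp that is not in the past. A condensed sketch of a check
matching the asserted error strings follows; it is an assumption for
illustration, not the proxy's exact code path.

import time

def validate_delete_headers(headers):
    # Mirrors the 400 responses asserted above (sketch, not Swift's code).
    if 'X-Delete-After' in headers:
        try:
            delta = int(headers['X-Delete-After'])
        except ValueError:
            return 'Non-integer X-Delete-After'   # e.g. '100.1'
        if delta < 0:
            return 'X-Delete-After in past'       # e.g. '-60'
    if 'X-Delete-At' in headers:
        try:
            when = int(headers['X-Delete-At'])
        except ValueError:
            return 'Non-integer X-Delete-At'
        if when < time.time():
            return 'X-Delete-At in past'
    return None  # header values are acceptable

assert validate_delete_headers({'X-Delete-After': '100.1'}) == \
    'Non-integer X-Delete-After'
assert validate_delete_headers({'X-Delete-At': str(int(time.time() - 100))}) \
    == 'X-Delete-At in past'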

    def test_HEAD_simple(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='HEAD')
        with set_http_connect(200):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)
        self.assertIn('Accept-Ranges', resp.headers)

    def test_HEAD_x_newest(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='HEAD',
                                              headers={'X-Newest': 'true'})
        with set_http_connect(*([200] * self.replicas())):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)

    def test_HEAD_x_newest_different_timestamps(self):
        req = swob.Request.blank('/v1/a/c/o', method='HEAD',
                                 headers={'X-Newest': 'true'})
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        timestamps = [next(ts) for i in range(self.replicas())]
        newest_timestamp = timestamps[-1]
        random.shuffle(timestamps)
        backend_response_headers = [{
            'X-Backend-Timestamp': t.internal,
            'X-Timestamp': t.normal
        } for t in timestamps]
        with set_http_connect(*([200] * self.replicas()),
|
|
|
headers=backend_response_headers):
|
|
|
|
resp = req.get_response(self.app)
|
|
|
|
self.assertEqual(resp.status_int, 200)
|
|
|
|
self.assertEqual(resp.headers['x-timestamp'], newest_timestamp.normal)
|
|
|
|
|
|
|
|
    def test_HEAD_x_newest_with_two_vector_timestamps(self):
        req = swob.Request.blank('/v1/a/c/o', method='HEAD',
                                 headers={'X-Newest': 'true'})
        ts = (utils.Timestamp(time.time(), offset=offset)
              for offset in itertools.count())
        timestamps = [next(ts) for i in range(self.replicas())]
        newest_timestamp = timestamps[-1]
        random.shuffle(timestamps)
        backend_response_headers = [{
            'X-Backend-Timestamp': t.internal,
            'X-Timestamp': t.normal
        } for t in timestamps]
        with set_http_connect(*([200] * self.replicas()),
                              headers=backend_response_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)
        self.assertEqual(resp.headers['x-backend-timestamp'],
                         newest_timestamp.internal)

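A note on the two-vector timestamps the test above shuffles: swift.common.utils.Timestamp carries an optional offset alongside the wall-clock time; .normal is the client-visible form, while .internal also encodes the offset so that two timestamps with the same wall-clock time still order deterministically. A minimal illustrative snippet (the printed values are assumptions for illustration, not captured Swift output):

    from swift.common.utils import Timestamp

    t = Timestamp(1234567890.12345, offset=3)
    # .normal: the plain wall-clock form shown to clients
    print(t.normal)    # e.g. '1234567890.12345'
    # .internal: the normal form plus a fixed-width '_<offset>' suffix
    print(t.internal)  # e.g. '1234567890.12345_0000000000000003'

This is why the test asserts x-backend-timestamp against newest_timestamp.internal rather than against the normal form.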
    def test_HEAD_x_newest_with_some_missing(self):
        req = swob.Request.blank('/v1/a/c/o', method='HEAD',
                                 headers={'X-Newest': 'true'})
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        request_count = self.app.request_node_count(self.obj_ring.replicas)
        backend_response_headers = [{
            'x-timestamp': next(ts).normal,
        } for i in range(request_count)]
        responses = [404] * (request_count - 1)
        responses.append(200)
        request_log = []

        def capture_requests(ip, port, device, part, method, path,
                             headers=None, **kwargs):
            req = {
                'ip': ip,
                'port': port,
                'device': device,
                'part': part,
                'method': method,
                'path': path,
                'headers': headers,
            }
            request_log.append(req)
        with set_http_connect(*responses,
                              headers=backend_response_headers,
                              give_connect=capture_requests):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)
        for req in request_log:
            self.assertEqual(req['method'], 'HEAD')
            self.assertEqual(req['path'], '/a/c/o')

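The X-Newest handling these HEAD tests exercise reduces to: ask every node (including the extra ones allowed by request_node_count), then keep the response with the largest backend timestamp, treating 404s as older than everything. A minimal sketch of that selection in plain Python (pick_newest is an illustrative stand-in, not the proxy's actual code):

    def pick_newest(responses):
        # responses: (status, x_backend_timestamp) pairs; internal-form
        # timestamps are fixed-width, so string comparison orders them.
        ok = [r for r in responses if r[0] == 200]
        return max(ok, key=lambda r: r[1]) if ok else (404, None)

    # One 200 among many 404s still yields a 200, as the test asserts.
    assert pick_newest([(404, None), (200, '1234567890.00000')])[0] == 200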
    def test_container_sync_delete(self):
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        test_indexes = [None] + [int(p) for p in POLICIES]
        for policy_index in test_indexes:
            req = swob.Request.blank(
                '/v1/a/c/o', method='DELETE', headers={
                    'X-Timestamp': next(ts).internal})
            codes = [409] * self.obj_ring.replicas
            ts_iter = itertools.repeat(next(ts).internal)
            with set_http_connect(*codes, timestamps=ts_iter):
                resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 409)

    def test_PUT_requires_length(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT')
        resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 411)

Fix missing container update

On an object PUT request, the proxy server builds backend headers (e.g.
X-Container-Partition) that tell each object server which container
server it should update. One set of backend headers is created per
container replica (i.e. with 3 replicas in the container ring, 3
backend headers are created).

In the EC case, Swift fans out fragment archives to the backend object
servers. The number of fragment archives is generally larger than the
number of container replicas, and the proxy server treats a request as
a success once a quorum of object servers has stored its data. That can
produce an orphaned object: one that is stored but whose container is
never updated.

For example, assume k=10, m=4, and 3 container replicas. The proxy
server attempts to open 14 backend streams, but the first 3 nodes
return 507 (disk failure) and Swift has no other disks. The proxy then
keeps 11 backend streams, and current Swift considers that sufficient
because it meets the quorum (right now k+1, i.e. 11 backend streams are
enough to store the object). However, none of those 11 streams carries
a container update header, so the request succeeds but the container is
never updated.

This patch spreads the container update headers across up to object
quorum_size + 1 nodes to ensure the updates happen. It costs a little
more on container updates, because some of them will be duplicates, but
quorum_size + 1 (even in the replicated case) seems a reasonable price
to pay for that guarantee, compared with attaching the update headers
to every object node.

Now Swift works as follows.

For example: k=10, m=4, quorum_size=11 (k+1), 3 replicas for the
container.
CU: container update
CA: commit ack
That results in something like:
  CU   CU   CU   CU   CU   CU   CU   CU   CU   CU   CU   CU
[507, 507, 507, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201]
                 CA   CA   CA   CA   CA
In this case, at least 3 container updates are saved.

Another example: 7 replicated objects, quorum_size=4 (7//2+1), 3
replicas for the container.
CU: container update
CA: commit ack (201s for a successful PUT on replicated)
  CU   CU   CU   CU   CU
[507, 507, 507, 201, 201, 201, 201]
                 CA   CA   CA   CA
In this replicated case, at least 2 container updates are saved.

Cleaned up some unit tests so that modifying policies doesn't leak
between tests.

Co-Authored-By: John Dickinson <me@not.mn>
Co-Authored-By: Sam Merritt <sam@swiftstack.com>
Closes-Bug: #1460920
Change-Id: I04132858f44b42ee7ecf3b7994cb22a19d001d70
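To make the expected header distribution concrete before the test below, here is a minimal standalone sketch of the quorum_size + 1 spreading rule; spread_container_updates and its round-robin assignment are illustrative assumptions, not Swift's actual helper (the real logic lives in the controller's _backend_requests, which the next test exercises):

    import itertools

    def spread_container_updates(num_obj_nodes, quorum_size, num_containers):
        # Container-update headers go to min(quorum_size + 1, num_obj_nodes)
        # object nodes, or to min(num_containers, num_obj_nodes) when there
        # are more container replicas than quorum_size + 1.
        n_updates = min(max(num_containers, quorum_size + 1), num_obj_nodes)
        replicas = itertools.cycle(range(num_containers))
        # Round-robin the container replicas over the update-bearing nodes.
        return [next(replicas) if i < n_updates else None
                for i in range(num_obj_nodes)]

    # The EC example above: k=10, m=4 gives 14 nodes with quorum_size 11.
    assignment = spread_container_updates(14, 11, 3)
    assert sum(a is not None for a in assignment) == 12  # quorum_size + 1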
    def test_container_update_backend_requests(self):
        for policy in POLICIES:
            req = swift.common.swob.Request.blank(
                '/v1/a/c/o', method='PUT',
                headers={'Content-Length': '0',
                         'X-Backend-Storage-Policy-Index': int(policy)})
            controller = self.controller_cls(self.app, 'a', 'c', 'o')

            # This is the number of container updates we're doing, simulating
            # 1 to 15 container replicas.
            for num_containers in range(1, 16):
                containers = [{'ip': '1.0.0.%s' % i,
                               'port': '60%s' % str(i).zfill(2),
                               'device': 'sdb'} for i in range(num_containers)]

                backend_headers = controller._backend_requests(
                    req, self.replicas(policy), 1, containers)

                # how many of the backend headers have a container update
                container_updates = len(
                    [headers for headers in backend_headers
                     if 'X-Container-Partition' in headers])

                if num_containers <= self.quorum(policy):
                    # filling case
                    expected = min(self.quorum(policy) + 1,
                                   self.replicas(policy))
                else:
                    # container updates >= object replicas
                    expected = min(num_containers,
                                   self.replicas(policy))

                self.assertEqual(container_updates, expected)

# end of BaseObjectControllerMixin


@patch_policies()
class TestReplicatedObjController(BaseObjectControllerMixin,
                                  unittest.TestCase):
    controller_cls = obj.ReplicatedObjectController

    def test_PUT_simple(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT')
        req.headers['content-length'] = '0'
        with set_http_connect(201, 201, 201):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)

    def test_txn_id_logging_on_PUT(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT')
        self.app.logger.txn_id = req.environ['swift.trans_id'] = 'test-txn-id'
        req.headers['content-length'] = '0'
        # we capture stdout since the debug log formatter prints the formatted
        # message to stdout
        stdout = BytesIO()
        with set_http_connect((100, Timeout()), 503, 503), \
                mock.patch('sys.stdout', stdout):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 503)
        for line in stdout.getvalue().splitlines():
            self.assertIn('test-txn-id', line)
        self.assertIn('Trying to get final status of PUT to',
                      stdout.getvalue())

    def test_PUT_empty_bad_etag(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT')
        req.headers['Content-Length'] = '0'
        req.headers['Etag'] = '"catbus"'

        # The 2-tuple here makes getexpect() return 422, not 100. For
        # objects that are >0 bytes, you get a 100 Continue and then a 422
        # Unprocessable Entity after sending the body. For zero-byte
        # objects, though, you get the 422 right away.
        codes = [FakeStatus((422, 422))
                 for _junk in range(self.replicas())]

        with set_http_connect(*codes):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 422)

    def test_PUT_if_none_match(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT')
        req.headers['if-none-match'] = '*'
        req.headers['content-length'] = '0'
        with set_http_connect(201, 201, 201):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)

    def test_PUT_if_none_match_denied(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT')
        req.headers['if-none-match'] = '*'
        req.headers['content-length'] = '0'
        with set_http_connect(201, 412, 201):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 412)

    def test_PUT_if_none_match_not_star(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT')
        req.headers['if-none-match'] = 'somethingelse'
        req.headers['content-length'] = '0'
        with set_http_connect():
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 400)

    def test_PUT_connect_exceptions(self):
        object_ring = self.app.get_object_ring(None)
        self.app.sort_nodes = lambda n: n  # disable shuffle

        def test_status_map(statuses, expected):
            self.app._error_limiting = {}
            req = swob.Request.blank('/v1/a/c/o.jpg', method='PUT',
                                     body='test body')
            with set_http_connect(*statuses):
                resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, expected)

        base_status = [201] * 3
        # test happy path
        test_status_map(list(base_status), 201)
        for i in range(3):
            self.assertEqual(node_error_count(
                self.app, object_ring.devs[i]), 0)
        # single node errors and test isolation
        for i in range(3):
            status_list = list(base_status)
            status_list[i] = 503
            test_status_map(status_list, 201)
            for j in range(3):
                self.assertEqual(node_error_count(
                    self.app, object_ring.devs[j]), 1 if j == i else 0)
        # connect errors
        test_status_map((201, Timeout(), 201, 201), 201)
        self.assertEqual(node_error_count(
            self.app, object_ring.devs[1]), 1)
        test_status_map((Exception('kaboom!'), 201, 201, 201), 201)
        self.assertEqual(node_error_count(
            self.app, object_ring.devs[0]), 1)
        # expect errors
        test_status_map((201, 201, (503, None), 201), 201)
        self.assertEqual(node_error_count(
            self.app, object_ring.devs[2]), 1)
        test_status_map(((507, None), 201, 201, 201), 201)
        self.assertEqual(
            node_error_count(self.app, object_ring.devs[0]),
            self.app.error_suppression_limit + 1)
        # response errors
        test_status_map(((100, Timeout()), 201, 201), 201)
        self.assertEqual(
            node_error_count(self.app, object_ring.devs[0]), 1)
        test_status_map((201, 201, (100, Exception())), 201)
        self.assertEqual(
            node_error_count(self.app, object_ring.devs[2]), 1)
        test_status_map((201, (100, 507), 201), 201)
        self.assertEqual(
            node_error_count(self.app, object_ring.devs[1]),
            self.app.error_suppression_limit + 1)

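A note on the error-limiting assertions above: most failures bump a node's error counter by one, but a 507 means the disk itself is unusable, so the proxy error-limits that node immediately by jumping its counter past error_suppression_limit. A minimal sketch of that bookkeeping under those assumptions (record_error is a hypothetical stand-in, not the proxy's real method):

    ERROR_SUPPRESSION_LIMIT = 10  # Swift's default error_suppression_limit

    def record_error(counts, node_key, insufficient_storage=False):
        # A 507 marks the disk as failed, so the node is error-limited
        # right away by jumping its counter past the suppression limit.
        if insufficient_storage:
            counts[node_key] = ERROR_SUPPRESSION_LIMIT + 1
        else:
            # Any other failure just increments the per-node counter.
            counts[node_key] = counts.get(node_key, 0) + 1

    counts = {}
    record_error(counts, 'node-0', insufficient_storage=True)
    assert counts['node-0'] == ERROR_SUPPRESSION_LIMIT + 1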
    def test_PUT_error_during_transfer_data(self):
        class FakeReader(object):
            def read(self, size):
                raise IOError('error message')

        req = swob.Request.blank('/v1/a/c/o.jpg', method='PUT',
                                 body='test body')

        req.environ['wsgi.input'] = FakeReader()
        req.headers['content-length'] = '6'
        with set_http_connect(201, 201, 201):
            resp = req.get_response(self.app)

        self.assertEqual(resp.status_int, 499)

    def test_PUT_chunkreadtimeout_during_transfer_data(self):
        class FakeReader(object):
            def read(self, size):
                raise exceptions.ChunkReadTimeout()

        req = swob.Request.blank('/v1/a/c/o.jpg', method='PUT',
                                 body='test body')

        req.environ['wsgi.input'] = FakeReader()
        req.headers['content-length'] = '6'
        with set_http_connect(201, 201, 201):
            resp = req.get_response(self.app)

        self.assertEqual(resp.status_int, 408)

    def test_PUT_timeout_during_transfer_data(self):
        class FakeReader(object):
            def read(self, size):
                raise Timeout()

        req = swob.Request.blank('/v1/a/c/o.jpg', method='PUT',
                                 body='test body')

        req.environ['wsgi.input'] = FakeReader()
        req.headers['content-length'] = '6'
        with set_http_connect(201, 201, 201):
            resp = req.get_response(self.app)

        self.assertEqual(resp.status_int, 499)

    def test_PUT_exception_during_transfer_data(self):
        class FakeReader(object):
            def read(self, size):
                raise Exception('exception message')

        req = swob.Request.blank('/v1/a/c/o.jpg', method='PUT',
                                 body='test body')

        req.environ['wsgi.input'] = FakeReader()
        req.headers['content-length'] = '6'
        with set_http_connect(201, 201, 201):
            resp = req.get_response(self.app)

        self.assertEqual(resp.status_int, 500)

    def test_GET_simple(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o')
        with set_http_connect(200):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)
        self.assertIn('Accept-Ranges', resp.headers)

    def test_GET_transfer_encoding_chunked(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o')
        with set_http_connect(200, headers={'transfer-encoding': 'chunked'}):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)
        self.assertEqual(resp.headers['Transfer-Encoding'], 'chunked')

    def test_GET_error(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o')
        self.app.logger.txn_id = req.environ['swift.trans_id'] = 'my-txn-id'
        stdout = BytesIO()
        with set_http_connect(503, 200), \
                mock.patch('sys.stdout', stdout):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)
        for line in stdout.getvalue().splitlines():
            self.assertIn('my-txn-id', line)
        self.assertIn('From Object Server', stdout.getvalue())

    def test_GET_handoff(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o')
        codes = [503] * self.obj_ring.replicas + [200]
        with set_http_connect(*codes):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)

    def test_GET_not_found(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o')
        codes = [404] * (self.obj_ring.replicas +
                         self.obj_ring.max_more_nodes)
        with set_http_connect(*codes):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 404)

    def test_PUT_delete_at(self):
        t = str(int(time.time() + 100))
        req = swob.Request.blank('/v1/a/c/o', method='PUT', body='',
                                 headers={'Content-Type': 'foo/bar',
                                          'X-Delete-At': t})
        put_headers = []

        def capture_headers(ip, port, device, part, method, path, headers,
                            **kwargs):
            if method == 'PUT':
                put_headers.append(headers)

        codes = [201] * self.obj_ring.replicas
        with set_http_connect(*codes, give_connect=capture_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)
        for given_headers in put_headers:
            self.assertEqual(given_headers.get('X-Delete-At'), t)
            self.assertTrue('X-Delete-At-Host' in given_headers)
            self.assertTrue('X-Delete-At-Device' in given_headers)
            self.assertTrue('X-Delete-At-Partition' in given_headers)
            self.assertTrue('X-Delete-At-Container' in given_headers)

    def test_PUT_converts_delete_after_to_delete_at(self):
        req = swob.Request.blank('/v1/a/c/o', method='PUT', body='',
                                 headers={'Content-Type': 'foo/bar',
                                          'X-Delete-After': '60'})
        put_headers = []

        def capture_headers(ip, port, device, part, method, path, headers,
                            **kwargs):
            if method == 'PUT':
                put_headers.append(headers)

        codes = [201] * self.obj_ring.replicas
        t = time.time()
        with set_http_connect(*codes, give_connect=capture_headers):
            with mock.patch('time.time', lambda: t):
                resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)
        expected_delete_at = str(int(t) + 60)
        for given_headers in put_headers:
            self.assertEqual(given_headers.get('X-Delete-At'),
                             expected_delete_at)
            self.assertTrue('X-Delete-At-Host' in given_headers)
            self.assertTrue('X-Delete-At-Device' in given_headers)
            self.assertTrue('X-Delete-At-Partition' in given_headers)
            self.assertTrue('X-Delete-At-Container' in given_headers)

    def test_container_sync_put_x_timestamp_not_found(self):
        test_indexes = [None] + [int(p) for p in POLICIES]
        for policy_index in test_indexes:
            self.app.container_info['storage_policy'] = policy_index
            put_timestamp = utils.Timestamp(time.time()).normal
            req = swob.Request.blank(
                '/v1/a/c/o', method='PUT', headers={
                    'Content-Length': 0,
                    'X-Timestamp': put_timestamp})
            codes = [201] * self.obj_ring.replicas
            with set_http_connect(*codes):
                resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 201)

    def test_container_sync_put_x_timestamp_match(self):
        test_indexes = [None] + [int(p) for p in POLICIES]
        for policy_index in test_indexes:
            self.app.container_info['storage_policy'] = policy_index
            put_timestamp = utils.Timestamp(time.time()).normal
            req = swob.Request.blank(
                '/v1/a/c/o', method='PUT', headers={
                    'Content-Length': 0,
                    'X-Timestamp': put_timestamp})
            ts_iter = itertools.repeat(put_timestamp)
            codes = [409] * self.obj_ring.replicas
            with set_http_connect(*codes, timestamps=ts_iter):
                resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 202)

    def test_container_sync_put_x_timestamp_older(self):
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        test_indexes = [None] + [int(p) for p in POLICIES]
        for policy_index in test_indexes:
            self.app.container_info['storage_policy'] = policy_index
            req = swob.Request.blank(
                '/v1/a/c/o', method='PUT', headers={
                    'Content-Length': 0,
                    'X-Timestamp': next(ts).internal})
            ts_iter = itertools.repeat(next(ts).internal)
            codes = [409] * self.obj_ring.replicas
            with set_http_connect(*codes, timestamps=ts_iter):
                resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 202)

    def test_container_sync_put_x_timestamp_newer(self):
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        test_indexes = [None] + [int(p) for p in POLICIES]
        for policy_index in test_indexes:
            orig_timestamp = next(ts).internal
            req = swob.Request.blank(
                '/v1/a/c/o', method='PUT', headers={
                    'Content-Length': 0,
                    'X-Timestamp': next(ts).internal})
            ts_iter = itertools.repeat(orig_timestamp)
            codes = [201] * self.obj_ring.replicas
            with set_http_connect(*codes, timestamps=ts_iter):
                resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 201)

    def test_put_x_timestamp_conflict(self):
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        req = swob.Request.blank(
            '/v1/a/c/o', method='PUT', headers={
                'Content-Length': 0,
                'X-Timestamp': next(ts).internal})
        ts_iter = iter([next(ts).internal, None, None])
        codes = [409] + [201] * (self.obj_ring.replicas - 1)
        with set_http_connect(*codes, timestamps=ts_iter):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 202)

    def test_put_x_timestamp_conflict_with_missing_backend_timestamp(self):
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        req = swob.Request.blank(
            '/v1/a/c/o', method='PUT', headers={
                'Content-Length': 0,
                'X-Timestamp': next(ts).internal})
        ts_iter = iter([None, None, None])
        codes = [409] * self.obj_ring.replicas
        with set_http_connect(*codes, timestamps=ts_iter):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 202)

    def test_put_x_timestamp_conflict_with_other_weird_success_response(self):
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        req = swob.Request.blank(
            '/v1/a/c/o', method='PUT', headers={
                'Content-Length': 0,
                'X-Timestamp': next(ts).internal})
        ts_iter = iter([next(ts).internal, None, None])
        codes = [409] + [(201, 'notused')] * (self.obj_ring.replicas - 1)
        with set_http_connect(*codes, timestamps=ts_iter):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 202)

    def test_put_x_timestamp_conflict_with_if_none_match(self):
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        req = swob.Request.blank(
            '/v1/a/c/o', method='PUT', headers={
                'Content-Length': 0,
                'If-None-Match': '*',
                'X-Timestamp': next(ts).internal})
        ts_iter = iter([next(ts).internal, None, None])
        codes = [409] + [(412, 'notused')] * (self.obj_ring.replicas - 1)
        with set_http_connect(*codes, timestamps=ts_iter):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 412)

    def test_container_sync_put_x_timestamp_race(self):
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        test_indexes = [None] + [int(p) for p in POLICIES]
        for policy_index in test_indexes:
            put_timestamp = next(ts).internal
            req = swob.Request.blank(
                '/v1/a/c/o', method='PUT', headers={
                    'Content-Length': 0,
                    'X-Timestamp': put_timestamp})

            # the object nodes respond 409 because another in-flight request
            # finished and now the on-disk timestamp is equal to the request.
            put_ts = [put_timestamp] * self.obj_ring.replicas
            codes = [409] * self.obj_ring.replicas

            ts_iter = iter(put_ts)
            with set_http_connect(*codes, timestamps=ts_iter):
                resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 202)

    def test_container_sync_put_x_timestamp_unsynced_race(self):
        ts = (utils.Timestamp(t) for t in itertools.count(int(time.time())))
        test_indexes = [None] + [int(p) for p in POLICIES]
        for policy_index in test_indexes:
            put_timestamp = next(ts).internal
            req = swob.Request.blank(
                '/v1/a/c/o', method='PUT', headers={
                    'Content-Length': 0,
                    'X-Timestamp': put_timestamp})

            # only one in-flight request finished
            put_ts = [None] * (self.obj_ring.replicas - 1)
            put_resp = [201] * (self.obj_ring.replicas - 1)
            put_ts += [put_timestamp]
            put_resp += [409]

            ts_iter = iter(put_ts)
            codes = put_resp
            with set_http_connect(*codes, timestamps=ts_iter):
                resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 202)
Fix missing container update

On an object PUT, the proxy server builds backend headers (e.g.
X-Container-Partition) that tell the object servers which
container server they should update. One set of backend headers is
created per container replica (i.e. with 3 replicas in the container
ring, 3 sets of backend headers are created).

In the EC case, Swift fans out fragment archives to the backend
object servers. The number of fragment archives is generally larger
than the number of container replicas, and the proxy treats a request
as successful once a quorum of object servers has stored its stream.
That can produce an orphaned object: one that is stored, but whose
container is never updated.

For example, assuming k=10, m=4, and container replica=3: the proxy
attempts to open 14 backend streams, but the first 3 nodes return 507
(disk failure) and Swift has no other disks to fall back on. The
proxy keeps the remaining 11 backend streams, and current Swift
treats that as sufficient because it meets or exceeds the quorum
(right now k+1 is sufficient, i.e. 11 backend streams are enough to
store). However, none of those 11 streams carries the container
update headers, so the request succeeds but the container is never
updated.

This patch extracts container updates onto up to quorum_size + 1
nodes to ensure the updates happen. The approach costs a bit more in
container updates, because duplicated updates will occur, but
quorum_size + 1 seems a reasonable price to pay (even in the
replicated case) compared to putting the update headers on every
object stream.

Swift will now work as follows. For example, with k=10, m=4,
quorum_size=11 (k+1), and 3 container replicas (CU: container update,
CA: commit ack):

CU CU CU CU CU CU CU CU CU CU CU CU
[507, 507, 507, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201]
CA CA CA CA CA

In this case, at least 3 container updates are saved.

For another example, with 7 replicated objects, quorum_size=4
(7//2+1), and 3 container replicas (CU: container update, CA: commit
ack; 201s for a successful PUT on replicated):

CU CU CU CU CU
[507, 507, 507, 201, 201, 201, 201]
CA CA CA CA

In this replicated case, at least 2 container updates are saved.

Also cleaned up some unit tests so that modifying policies doesn't
leak between tests.

Co-Authored-By: John Dickinson <me@not.mn>
Co-Authored-By: Sam Merritt <sam@swiftstack.com>
Closes-Bug: #1460920
Change-Id: I04132858f44b42ee7ecf3b7994cb22a19d001d70
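To make the quorum_size + 1 scheme concrete, here is a minimal sketch under
stated assumptions: spread_container_updates, node_headers, and the
container node dicts are hypothetical stand-ins for illustration, not
Swift's actual implementation.

import itertools

def spread_container_updates(node_headers, container_nodes, quorum_size):
    # Attach container-update info to the first quorum_size + 1 backend
    # PUT requests, cycling over the container replicas so each replica
    # is named by at least one request that is likely to survive.
    for headers, container in zip(node_headers[:quorum_size + 1],
                                  itertools.cycle(container_nodes)):
        headers['X-Container-Host'] = '%s:%s' % (container['ip'],
                                                 container['port'])
        headers['X-Container-Device'] = container['device']
    return node_headers

# Example: k=10, m=4 (14 fragment archives), 3 container replicas.
node_headers = [{} for _ in range(14)]
container_nodes = [{'ip': '10.0.0.%d' % i, 'port': 6201, 'device': 'sda'}
                   for i in range(1, 4)]
spread_container_updates(node_headers, container_nodes, quorum_size=11)
# The first 12 requests now carry update headers; even if the first three
# nodes fail with 507, every container replica is still told about the
# object by at least one of the remaining streams.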

@patch_policies(
    [StoragePolicy(0, '1-replica', True),
     StoragePolicy(1, '5-replica', False),
     StoragePolicy(2, '8-replica', False),
     StoragePolicy(3, '15-replica', False)],
    fake_ring_args=[
        {'replicas': 1}, {'replicas': 5}, {'replicas': 8}, {'replicas': 15}])
class TestReplicatedObjControllerVariousReplicas(BaseObjectControllerMixin,
                                                 unittest.TestCase):
    controller_cls = obj.ReplicatedObjectController

class StubResponse(object):

    def __init__(self, status, body='', headers=None):
        self.status = status
        self.body = body
        self.readable = BytesIO(body)
        self.headers = HeaderKeyDict(headers)
        fake_reason = ('Fake', 'This response is a lie.')
        self.reason = swob.RESPONSE_REASONS.get(status, fake_reason)[0]

    def getheader(self, header_name, default=None):
        return self.headers.get(header_name, default)

    def getheaders(self):
        if 'Content-Length' not in self.headers:
            self.headers['Content-Length'] = len(self.body)
        return self.headers.items()

    def read(self, amt=0):
        return self.readable.read(amt)

@contextmanager
def capture_http_requests(get_response):

    class FakeConn(object):

        def __init__(self, req):
            self.req = req
            self.resp = None

        def getresponse(self):
            self.resp = get_response(self.req)
            return self.resp

    class ConnectionLog(object):

        def __init__(self):
            self.connections = []

        def __len__(self):
            return len(self.connections)

        def __getitem__(self, i):
            return self.connections[i]

        def __iter__(self):
            return iter(self.connections)

        def __call__(self, ip, port, method, path, headers, qs, ssl):
            req = {
                'ip': ip,
                'port': port,
                'method': method,
                'path': path,
                'headers': headers,
                'qs': qs,
                'ssl': ssl,
            }
            conn = FakeConn(req)
            self.connections.append(conn)
            return conn

    fake_conn = ConnectionLog()

    with mock.patch('swift.common.bufferedhttp.http_connect_raw',
                    new=fake_conn):
        yield fake_conn
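As a rough usage sketch (a hypothetical test body, not from the original
file: fake_response, req, self.app, and expected_request_count are assumed
fixtures), capture_http_requests intercepts http_connect_raw so a test can
serve canned StubResponse objects and then inspect what the proxy sent:

def fake_response(req):
    # every backend request gets a canned 200
    return StubResponse(200, body='hello')

with capture_http_requests(fake_response) as log:
    resp = req.get_response(self.app)

self.assertEqual(len(log), expected_request_count)
for conn in log:
    self.assertEqual('GET', conn.req['method'])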
@patch_policies(with_ec_default=True)
class TestECObjController(BaseObjectControllerMixin, unittest.TestCase):
    container_info = {
        'status': 200,
        'read_acl': None,
        'write_acl': None,
        'sync_key': None,
        'versions': None,
        'storage_policy': '0',
    }

    controller_cls = obj.ECObjectController
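    # BaseObjectControllerMixin is assumed to supply the shared fixtures
    # used below, e.g. self.app, self.policy and self.replicas().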
    def test_determine_chunk_destinations(self):
        class FakePutter(object):
            def __init__(self, index):
                self.node_index = index

        controller = self.controller_cls(
            self.app, 'a', 'c', 'o')

        # create a dummy list of putters, check no handoffs
        putters = []
        for index in range(0, 4):
            putters.append(FakePutter(index))
        got = controller._determine_chunk_destinations(putters)
        expected = {}
        for i, p in enumerate(putters):
            expected[p] = i
        self.assertEqual(got, expected)
        # now let's make a handoff at the end
        putters[3].node_index = None
        got = controller._determine_chunk_destinations(putters)
        self.assertEqual(got, expected)
        putters[3].node_index = 3

        # now let's make a handoff at the start
        putters[0].node_index = None
        got = controller._determine_chunk_destinations(putters)
        self.assertEqual(got, expected)
        putters[0].node_index = 0

        # now let's make a handoff in the middle
        putters[2].node_index = None
        got = controller._determine_chunk_destinations(putters)
        self.assertEqual(got, expected)
        putters[2].node_index = 0

        # now let's make all of them handoffs
        for index in range(0, 4):
            putters[index].node_index = None
        got = controller._determine_chunk_destinations(putters)
        self.assertEqual(got, expected)
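    # A minimal sketch of the assignment the assertions above expect
    # (hypothetical pseudocode, not the controller's actual code):
    # putters that kept a node_index map to that index, and handoff
    # putters (node_index is None) fill the remaining holes in order.
    #
    #   def determine_chunk_destinations(putters):
    #       mapping = {p: p.node_index for p in putters
    #                  if p.node_index is not None}
    #       holes = [i for i in range(len(putters))
    #                if i not in set(mapping.values())]
    #       for p in putters:
    #           if p.node_index is None:
    #               mapping[p] = holes.pop(0)
    #       return mapping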
    def test_GET_simple(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o')
        get_resp = [200] * self.policy.ec_ndata
        with set_http_connect(*get_resp):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)
        self.assertIn('Accept-Ranges', resp.headers)
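    # Note: an EC GET needs only ec_ndata successful backend responses;
    # the proxy can decode the object from the data fragments alone,
    # which is why the happy-path tests feed exactly ec_ndata 200s into
    # set_http_connect().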
def test_GET_simple_x_newest(self):
|
|
|
|
req = swift.common.swob.Request.blank('/v1/a/c/o',
|
|
|
|
headers={'X-Newest': 'true'})
|
        codes = [200] * self.policy.ec_ndata
        with set_http_connect(*codes):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)
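    # (X-Newest normally makes the proxy query more backends and return
    # the freshest copy; with every backend healthy it should not change
    # the result, hence the assertions mirror test_GET_simple.)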
    def test_GET_error(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o')
        get_resp = [503] + [200] * self.policy.ec_ndata
        with set_http_connect(*get_resp):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)
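    # The leading 503 is absorbed: the proxy just moves on to another
    # node until it has ec_ndata good responses, so the client still
    # gets a 200.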
    def test_GET_with_body(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o')
        # turn a real body into fragments
        segment_size = self.policy.ec_segment_size
        real_body = ('asdf' * segment_size)[:-10]
        # split it up into chunks
        chunks = [real_body[x:x + segment_size]
                  for x in range(0, len(real_body), segment_size)]
        fragment_payloads = []
        for chunk in chunks:
            fragments = self.policy.pyeclib_driver.encode(chunk)
            if not fragments:
                break
            fragment_payloads.append(fragments)
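        # (Illustrative arithmetic, assuming a hypothetical 4+2 scheme:
        # each encode() call would return ec_ndata + ec_nparity = 6
        # fragments per segment, four data plus two parity.)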
        # sanity
        sanity_body = ''
        for fragment_payload in fragment_payloads:
            sanity_body += self.policy.pyeclib_driver.decode(
                fragment_payload)
        self.assertEqual(len(real_body), len(sanity_body))
        self.assertEqual(real_body, sanity_body)

        # list(zip(...)) for py3 compatibility (zip is lazy there)
        node_fragments = list(zip(*fragment_payloads))
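        # zip(*fragment_payloads) transposes [segment][frag_index] into
        # [frag_index][segment]: entry i holds the fragments that,
        # concatenated, form node i's fragment archive.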
        self.assertEqual(len(node_fragments), self.replicas())  # sanity
        headers = {'X-Object-Sysmeta-Ec-Content-Length': str(len(real_body))}
        responses = [(200, ''.join(node_fragments[i]), headers)
for i in range(POLICIES.default.ec_ndata)]
|
|
|
|
status_codes, body_iter, headers = zip(*responses)
|
|
|
|
with set_http_connect(*status_codes, body_iter=body_iter,
|
|
|
|
headers=headers):
|
|
|
|
resp = req.get_response(self.app)
|
2015-08-06 10:01:17 -05:00
|
|
|
self.assertEqual(resp.status_int, 200)
|
Foundational support for PUT and GET of erasure-coded objects
This commit makes it possible to PUT an object into Swift and have it
stored using erasure coding instead of replication, and also to GET
the object back from Swift at a later time.
This works by splitting the incoming object into a number of segments,
erasure-coding each segment in turn to get fragments, then
concatenating the fragments into fragment archives. Segments are 1 MiB
in size, except the last, which is between 1 B and 1 MiB.
+====================================================================+
| object data |
+====================================================================+
|
+------------------------+----------------------+
| | |
v v v
+===================+ +===================+ +==============+
| segment 1 | | segment 2 | ... | segment N |
+===================+ +===================+ +==============+
| |
| |
v v
/=========\ /=========\
| pyeclib | | pyeclib | ...
\=========/ \=========/
| |
| |
+--> fragment A-1 +--> fragment A-2
| |
| |
| |
| |
| |
+--> fragment B-1 +--> fragment B-2
| |
| |
... ...
Then, object server A gets the concatenation of fragment A-1, A-2,
..., A-N, so its .data file looks like this (called a "fragment archive"):
+=====================================================================+
| fragment A-1 | fragment A-2 | ... | fragment A-N |
+=====================================================================+
Since this means that the object server never sees the object data as
the client sent it, we have to do a few things to ensure data
integrity.
First, the proxy has to check the Etag if the client provided it; the
object server can't do it since the object server doesn't see the raw
data.
Second, if the client does not provide an Etag, the proxy computes it
and uses the MIME-PUT mechanism to provide it to the object servers
after the object body. Otherwise, the object would not have an Etag at
all.
Third, the proxy computes the MD5 of each fragment archive and sends
it to the object server using the MIME-PUT mechanism. With replicated
objects, the proxy checks that the Etags from all the object servers
match, and if they don't, returns a 500 to the client. This mitigates
the risk of data corruption in one of the proxy --> object connections,
and signals to the client when it happens. With EC objects, we can't
use that same mechanism, so we must send the checksum with each
fragment archive to get comparable protection.
On the GET path, the inverse happens: the proxy connects to a bunch of
object servers (M of them, for an M+K scheme), reads one fragment at a
time from each fragment archive, decodes those fragments into a
segment, and serves the segment to the client.
When an object server dies partway through a GET response, any
partially-fetched fragment is discarded, the resumption point is wound
back to the nearest fragment boundary, and the GET is retried with the
next object server.
GET requests for a single byterange work; GET requests for multiple
byteranges do not.
There are a number of things _not_ included in this commit. Some of
them are listed here:
* multi-range GET
* deferred cleanup of old .data files
* durability (daemon to reconstruct missing archives)
Co-Authored-By: Alistair Coles <alistair.coles@hp.com>
Co-Authored-By: Thiago da Silva <thiago@redhat.com>
Co-Authored-By: John Dickinson <me@not.mn>
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Co-Authored-By: Tushar Gohad <tushar.gohad@intel.com>
Co-Authored-By: Paul Luse <paul.e.luse@intel.com>
Co-Authored-By: Christian Schwede <christian.schwede@enovance.com>
Co-Authored-By: Yuan Zhou <yuan.zhou@intel.com>
Change-Id: I9c13c03616489f8eab7dcd7c5f21237ed4cb6fd2
2014-10-22 13:18:34 -07:00
|
|
|
self.assertEqual(len(real_body), len(resp.body))
|
|
|
|
self.assertEqual(real_body, resp.body)
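    # Happy path: every object server accepts the upload and the commit
    # phase, so the client sees a plain 201.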
    def test_PUT_simple(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        codes = [201] * self.replicas()
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)
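    # On a failed EC PUT, every line the proxy logs (captured here by
    # patching sys.stdout) should carry the request's transaction id.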
    def test_txn_id_logging_ECPUT(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        self.app.logger.txn_id = req.environ['swift.trans_id'] = 'test-txn-id'
        codes = [(100, Timeout(), 503, 503)] * self.replicas()
        stdout = BytesIO()
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers), \
                mock.patch('sys.stdout', stdout):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 503)
        for line in stdout.getvalue().splitlines():
            self.assertIn('test-txn-id', line)
        self.assertIn('Trying to get ',
                      stdout.getvalue())
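    # Each connection is scripted as (100, 100, 201), which reads as: a 100
    # Continue for the expect, another 100 acknowledging the first phase,
    # then an explicit 201 for the commit.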
    def test_PUT_with_explicit_commit_status(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        codes = [(100, 100, 201)] * self.replicas()
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)
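    # Uniform failure: with a 503 from every backend, the PUT itself is a
    # 503.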
    def test_PUT_error(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        codes = [503] * self.replicas()
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 503)
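    # A bare quorum of 201s, 503s everywhere else, shuffled so position
    # doesn't matter: quorum is enough for the proxy to report success.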
    def test_PUT_mostly_success(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        codes = [201] * self.quorum()
        codes += [503] * (self.replicas() - len(codes))
        random.shuffle(codes)
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)
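    # Backends take the data (100) but fail the commit phase with 503; the
    # trailing Exception('not used') should never be reached.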
    def test_PUT_error_commit(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        codes = [(100, 503, Exception('not used'))] * self.replicas()
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 503)
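    # A quorum of clean commits with the rest failing at commit time still
    # yields a 201.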
    def test_PUT_mostly_success_commit(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        codes = [201] * self.quorum()
        codes += [(100, 503, Exception('not used'))] * (
            self.replicas() - len(codes))
        random.shuffle(codes)
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)
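    # The inverse: commit failures on a quorum's worth of nodes leave too
    # few successful writes, so the client gets a 503.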
    def test_PUT_mostly_error_commit(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        codes = [(100, 503, Exception('not used'))] * self.quorum()
        codes += [201] * (self.replicas() - len(codes))
        random.shuffle(codes)
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 503)
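    # A single backend timing out during commit is survivable once the
    # other replicas have committed: still a 201.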
    def test_PUT_commit_timeout(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        codes = [201] * (self.replicas() - 1)
        codes.append((100, Timeout(), Exception('not used')))
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)
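    # Same shape as the timeout case, but with a hard exception from one
    # backend at commit time; the PUT still succeeds.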
    def test_PUT_commit_exception(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        codes = [201] * (self.replicas() - 1)
        codes.append((100, Exception('kaboom!'), Exception('not used')))
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)
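    # The next four tests replace wsgi.input with a reader that blows up
    # mid-transfer and check how each failure maps to a client-facing
    # status; here an IOError reads as a client disconnect, hence 499.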
    def test_PUT_ec_error_during_transfer_data(self):
        class FakeReader(object):
            def read(self, size):
                raise IOError('error message')

        req = swob.Request.blank('/v1/a/c/o.jpg', method='PUT',
                                 body='test body')

        req.environ['wsgi.input'] = FakeReader()
        req.headers['content-length'] = '6'
        codes = [201] * self.replicas()
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)

        self.assertEqual(resp.status_int, 499)
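    # A ChunkReadTimeout while reading the client's body maps to 408.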
    def test_PUT_ec_chunkreadtimeout_during_transfer_data(self):
        class FakeReader(object):
            def read(self, size):
                raise exceptions.ChunkReadTimeout()

        req = swob.Request.blank('/v1/a/c/o.jpg', method='PUT',
                                 body='test body')

        req.environ['wsgi.input'] = FakeReader()
        req.headers['content-length'] = '6'
        codes = [201] * self.replicas()
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)

        self.assertEqual(resp.status_int, 408)
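    # A plain Timeout is treated like a client disconnect: 499.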
    def test_PUT_ec_timeout_during_transfer_data(self):
        class FakeReader(object):
            def read(self, size):
                raise exceptions.Timeout()

        req = swob.Request.blank('/v1/a/c/o.jpg', method='PUT',
                                 body='test body')

        req.environ['wsgi.input'] = FakeReader()
        req.headers['content-length'] = '6'
        codes = [201] * self.replicas()
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)

        self.assertEqual(resp.status_int, 499)
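    # Any other exception while transferring data is a proxy-side 500.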
    def test_PUT_ec_exception_during_transfer_data(self):
        class FakeReader(object):
            def read(self, size):
                raise Exception('exception message')

        req = swob.Request.blank('/v1/a/c/o.jpg', method='PUT',
                                 body='test body')

        req.environ['wsgi.input'] = FakeReader()
        req.headers['content-length'] = '6'
        codes = [201] * self.replicas()
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)

        self.assertEqual(resp.status_int, 500)
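    # End-to-end capture of what the proxy sends downstream: capture_body
    # and capture_headers record each connection's body chunks, MIME
    # boundary and X-Backend-Obj-Content-Length, presumably so the rest of
    # the test can reassemble and verify the fragment archives.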
    def test_PUT_with_body(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT')
        segment_size = self.policy.ec_segment_size
        test_body = ('asdf' * segment_size)[:-10]
        etag = md5(test_body).hexdigest()
        size = len(test_body)
        req.body = test_body
        codes = [201] * self.replicas()
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }

        put_requests = defaultdict(lambda: {'boundary': None, 'chunks': []})

        def capture_body(conn_id, chunk):
            put_requests[conn_id]['chunks'].append(chunk)

        def capture_headers(ip, port, device, part, method, path, headers,
                            **kwargs):
            conn_id = kwargs['connection_id']
            put_requests[conn_id]['boundary'] = headers[
                'X-Backend-Obj-Multipart-Mime-Boundary']
            put_requests[conn_id]['backend-content-length'] = headers[
                'X-Backend-Obj-Content-Length']
    with set_http_connect(*codes, expect_headers=expect_headers,
                          give_send=capture_body,
                          give_connect=capture_headers):
        resp = req.get_response(self.app)

    self.assertEqual(resp.status_int, 201)
    frag_archives = []
    for connection_id, info in put_requests.items():
        body = unchunk_body(''.join(info['chunks']))
        self.assertTrue(info['boundary'] is not None,
                        "didn't get boundary for conn %r" % (
                            connection_id,))
        self.assertTrue(size > int(info['backend-content-length']) > 0,
                        "invalid backend-content-length for conn %r" % (
                            connection_id,))
        # email.parser.FeedParser doesn't know how to take a multipart
        # message and a boundary together and parse it; it only knows
        # how to take a string, parse the headers, and figure out the
        # boundary on its own.
        parser = email.parser.FeedParser()
        parser.feed(
            "Content-Type: multipart/nobodycares; boundary=%s\r\n\r\n" %
            info['boundary'])
        parser.feed(body)
        message = parser.close()

        self.assertTrue(message.is_multipart())  # sanity check
        mime_parts = message.get_payload()
        self.assertEqual(len(mime_parts), 3)
        obj_part, footer_part, commit_part = mime_parts

        # attach the object body to the frag_archives list
        self.assertEqual(obj_part['X-Document'], 'object body')
        frag_archives.append(obj_part.get_payload())

        # assert length was correct for this connection
        self.assertEqual(int(info['backend-content-length']),
                         len(frag_archives[-1]))
        # assert length was the same for all connections
        self.assertEqual(int(info['backend-content-length']),
                         len(frag_archives[0]))
        # validate some footer metadata
        self.assertEqual(footer_part['X-Document'], 'object metadata')
        footer_metadata = json.loads(footer_part.get_payload())
        self.assertTrue(footer_metadata)
        expected = {
            'X-Object-Sysmeta-EC-Content-Length': str(size),
            'X-Backend-Container-Update-Override-Size': str(size),
            'X-Object-Sysmeta-EC-Etag': etag,
            'X-Backend-Container-Update-Override-Etag': etag,
            'X-Object-Sysmeta-EC-Segment-Size': str(segment_size),
        }
        for header, value in expected.items():
            self.assertEqual(footer_metadata[header], value)

        # sanity on commit message
        self.assertEqual(commit_part['X-Document'], 'put commit')

    self.assertEqual(len(frag_archives), self.replicas())
    fragment_size = self.policy.fragment_size
    node_payloads = []
    for fa in frag_archives:
        payload = [fa[x:x + fragment_size]
                   for x in range(0, len(fa), fragment_size)]
        node_payloads.append(payload)
    fragment_payloads = zip(*node_payloads)

    expected_body = ''
    for fragment_payload in fragment_payloads:
        self.assertEqual(len(fragment_payload), self.replicas())
        fragment_payload = list(fragment_payload)
        expected_body += self.policy.pyeclib_driver.decode(
            fragment_payload)

    self.assertEqual(len(test_body), len(expected_body))
    self.assertEqual(test_body, expected_body)
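
# A minimal standalone sketch (not part of the test above) of the
# FeedParser trick it uses: synthesize a Content-Type header carrying a
# known boundary, then feed the raw MIME body. `raw_body` and `boundary`
# here are hypothetical inputs.
def _example_parse_mime_parts(raw_body, boundary):
    parser = email.parser.FeedParser()
    parser.feed("Content-Type: multipart/mixed; boundary=%s\r\n\r\n" %
                boundary)
    parser.feed(raw_body)
    message = parser.close()
    return message.get_payload()  # a list of sub-messages when multipart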

def test_PUT_old_obj_server(self):
    req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                          body='')
    responses = [
        # one server will respond 100-continue but not include the
        # needed expect headers, and the connection will be dropped
        ((100, Exception('not used')), {}),
    ] + [
        # and plenty of successful responses too
        (201, {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes',
        }),
    ] * self.replicas()
    random.shuffle(responses)
    if responses[-1][0] != 201:
        # the single failure landed last; rotate so a 201 ends up last
        responses = responses[1:] + [responses[0]]
    codes, expect_headers = zip(*responses)
    with set_http_connect(*codes, expect_headers=expect_headers):
        resp = req.get_response(self.app)
    self.assertEqual(resp.status_int, 201)
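
# Standalone sketch (illustrative only) of the shuffle-then-rotate trick
# above: with exactly one failure entry in the list, moving the head to
# the tail whenever the failure lands last guarantees the final entry is
# a success.
def _example_keep_success_last():
    entries = ['fail'] + ['ok'] * 5
    random.shuffle(entries)
    if entries[-1] != 'ok':
        # only one 'fail' exists and it is last, so the head must be 'ok'
        entries = entries[1:] + [entries[0]]
    assert entries[-1] == 'ok'
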
def _make_ec_archive_bodies(self, test_body, policy=None):
    policy = policy or self.policy
    segment_size = policy.ec_segment_size
    # split up the body into buffers
    chunks = [test_body[x:x + segment_size]
              for x in range(0, len(test_body), segment_size)]
    # encode the buffers into fragment payloads
    fragment_payloads = []
    for chunk in chunks:
        fragments = policy.pyeclib_driver.encode(chunk)
        if not fragments:
            break
        fragment_payloads.append(fragments)

    # join up the fragment payloads per node
    ec_archive_bodies = [''.join(fragments)
                         for fragments in zip(*fragment_payloads)]
    return ec_archive_bodies
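
# A hedged sketch of the pyeclib round trip this helper builds on; the
# k/m values and ec_type below are illustrative assumptions, not the
# policy configured by these test fixtures.
def _example_pyeclib_round_trip():
    from pyeclib.ec_iface import ECDriver
    driver = ECDriver(k=10, m=4, ec_type='liberasurecode_rs_vand')
    data = 'x' * 1024
    fragments = driver.encode(data)  # k + m fragments, one per node
    # Reed-Solomon can rebuild the original from any k fragments
    assert driver.decode(fragments[:10]) == data
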
def _make_ec_object_stub(self, test_body=None, policy=None):
    policy = policy or self.policy
    segment_size = policy.ec_segment_size
    test_body = test_body or (
        'test' * segment_size)[:-random.randint(1, 1000)]
    etag = md5(test_body).hexdigest()
    ec_archive_bodies = self._make_ec_archive_bodies(test_body,
                                                     policy=policy)
    return {
        'body': test_body,
        'etag': etag,
        'frags': ec_archive_bodies,
    }
def _fake_ec_node_response(self, node_frags):
    """
    Given a list of entries for each node in ring order, where each
    entry is a dict (or list of dicts) describing the fragment(s) on
    that node, create a function suitable for use with
    capture_http_requests that will accept a req object and return a
    response faking the behavior of an object server that had the
    given fragments on disk at the time.
    """
    node_map = {}
    all_nodes = []

    def _build_node_map(req):
        node_key = lambda n: (n['ip'], n['port'])
        part = utils.split_path(req['path'], 5, 5, True)[1]
        policy = POLICIES[int(
            req['headers']['X-Backend-Storage-Policy-Index'])]
        all_nodes.extend(policy.object_ring.get_part_nodes(part))
        all_nodes.extend(policy.object_ring.get_more_nodes(part))
        for i, node in enumerate(all_nodes):
            node_map[node_key(node)] = i

    # normalize node_frags to a list of fragments for each node even
    # if there's only one fragment in the dataset provided.
    for i, frags in enumerate(node_frags):
        if isinstance(frags, dict):
            node_frags[i] = [frags]

    def get_response(req):
        if not node_map:
            _build_node_map(req)

        try:
            node_index = node_map[(req['ip'], req['port'])]
        except KeyError:
            raise Exception("Couldn't find node %s:%s in %r" % (
                req['ip'], req['port'], all_nodes))

        try:
            frags = node_frags[node_index]
        except IndexError:
            raise Exception('Found node %r:%r at index %s - '
                            'but only got %s stub response nodes' % (
                                req['ip'], req['port'], node_index,
                                len(node_frags)))

        try:
            stub = random.choice(frags)
        except IndexError:
            stub = None
        if stub:
            body = stub['obj']['frags'][stub['frag']]
            headers = {
                'X-Object-Sysmeta-Ec-Content-Length': len(
                    stub['obj']['body']),
                'X-Object-Sysmeta-Ec-Etag': stub['obj']['etag'],
                'X-Object-Sysmeta-Ec-Frag-Index': stub['frag'],
            }
            resp = StubResponse(200, body, headers)
        else:
            resp = StubResponse(404)
        return resp

    return get_response
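
# Illustrative node_frags shape (the obj stubs are hypothetical): one
# entry per node in ring order; a plain dict is a single fragment, a
# list offers several candidates, and an empty entry makes the node 404:
#
#   node_frags = [
#       {'obj': obj_a, 'frag': 0},
#       [{'obj': obj_a, 'frag': 1}, {'obj': obj_b, 'frag': 1}],
#       {},
#   ]
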
def test_GET_with_frags_swapped_around(self):
    segment_size = self.policy.ec_segment_size
    test_data = ('test' * segment_size)[:-657]
    etag = md5(test_data).hexdigest()
    ec_archive_bodies = self._make_ec_archive_bodies(test_data)

    _part, primary_nodes = self.obj_ring.get_nodes('a', 'c', 'o')

    node_key = lambda n: (n['ip'], n['port'])
    response_map = {
        node_key(n): StubResponse(200, ec_archive_bodies[i], {
            'X-Object-Sysmeta-Ec-Content-Length': len(test_data),
            'X-Object-Sysmeta-Ec-Etag': etag,
            'X-Object-Sysmeta-Ec-Frag-Index': i,
        }) for i, n in enumerate(primary_nodes)
    }

    # swap a parity response into a data node
    data_node = random.choice(primary_nodes[:self.policy.ec_ndata])
    parity_node = random.choice(primary_nodes[self.policy.ec_ndata:])
    (response_map[node_key(data_node)],
     response_map[node_key(parity_node)]) = \
        (response_map[node_key(parity_node)],
         response_map[node_key(data_node)])

    def get_response(req):
        req_key = (req['ip'], req['port'])
        return response_map.pop(req_key)

    req = swob.Request.blank('/v1/a/c/o')
    with capture_http_requests(get_response) as log:
        resp = req.get_response(self.app)

    self.assertEqual(resp.status_int, 200)
    self.assertEqual(len(log), self.policy.ec_ndata)
    self.assertEqual(len(response_map),
                     len(primary_nodes) - self.policy.ec_ndata)
def test_GET_with_single_missed_overwrite_does_not_need_handoff(self):
    obj1 = self._make_ec_object_stub()
    obj2 = self._make_ec_object_stub()

    node_frags = [
        {'obj': obj2, 'frag': 0},
        {'obj': obj2, 'frag': 1},
        {'obj': obj1, 'frag': 2},  # missed overwrite
        {'obj': obj2, 'frag': 3},
        {'obj': obj2, 'frag': 4},
        {'obj': obj2, 'frag': 5},
        {'obj': obj2, 'frag': 6},
        {'obj': obj2, 'frag': 7},
        {'obj': obj2, 'frag': 8},
        {'obj': obj2, 'frag': 9},
        {'obj': obj2, 'frag': 10},  # parity
        {'obj': obj2, 'frag': 11},  # parity
        {'obj': obj2, 'frag': 12},  # parity
        {'obj': obj2, 'frag': 13},  # parity
        # {'obj': obj2, 'frag': 2},  # handoff (not used in this test)
    ]

    fake_response = self._fake_ec_node_response(node_frags)

    req = swob.Request.blank('/v1/a/c/o')
    with capture_http_requests(fake_response) as log:
        resp = req.get_response(self.app)

    self.assertEqual(resp.status_int, 200)
    self.assertEqual(resp.headers['etag'], obj2['etag'])
    self.assertEqual(md5(resp.body).hexdigest(), obj2['etag'])

    collected_responses = defaultdict(set)
    for conn in log:
        etag = conn.resp.headers['X-Object-Sysmeta-Ec-Etag']
        index = conn.resp.headers['X-Object-Sysmeta-Ec-Frag-Index']
        collected_responses[etag].add(index)

    # because the primary nodes are shuffled, it's possible the proxy
    # didn't even notice the missed overwrite frag - but it might have
    self.assertLessEqual(len(log), self.policy.ec_ndata + 1)
    self.assertLessEqual(len(collected_responses), 2)

    # ... regardless, we should never need to fetch more than ec_ndata
    # frags for any given etag
    for etag, frags in collected_responses.items():
        self.assertTrue(len(frags) <= self.policy.ec_ndata,
                        'collected %s frags for etag %s' % (
                            len(frags), etag))
def test_GET_with_many_missed_overwrite_will_need_handoff(self):
    obj1 = self._make_ec_object_stub()
    obj2 = self._make_ec_object_stub()

    node_frags = [
        {'obj': obj2, 'frag': 0},
        {'obj': obj2, 'frag': 1},
        {'obj': obj1, 'frag': 2},  # missed
        {'obj': obj2, 'frag': 3},
        {'obj': obj2, 'frag': 4},
        {'obj': obj2, 'frag': 5},
        {'obj': obj1, 'frag': 6},  # missed
        {'obj': obj2, 'frag': 7},
        {'obj': obj2, 'frag': 8},
        {'obj': obj1, 'frag': 9},  # missed
        {'obj': obj1, 'frag': 10},  # missed
        {'obj': obj1, 'frag': 11},  # missed
        {'obj': obj2, 'frag': 12},
        {'obj': obj2, 'frag': 13},
        {'obj': obj2, 'frag': 6},  # handoff
    ]

    fake_response = self._fake_ec_node_response(node_frags)

    req = swob.Request.blank('/v1/a/c/o')
    with capture_http_requests(fake_response) as log:
        resp = req.get_response(self.app)

    self.assertEqual(resp.status_int, 200)
    self.assertEqual(resp.headers['etag'], obj2['etag'])
    self.assertEqual(md5(resp.body).hexdigest(), obj2['etag'])

    collected_responses = defaultdict(set)
    for conn in log:
        etag = conn.resp.headers['X-Object-Sysmeta-Ec-Etag']
        index = conn.resp.headers['X-Object-Sysmeta-Ec-Frag-Index']
        collected_responses[etag].add(index)

    # since there's not enough of the obj2 etag on the primaries, we
    # will have collected responses for both etags, and made one more
    # request to the handoff node
    self.assertEqual(len(log), self.replicas() + 1)
    self.assertEqual(len(collected_responses), 2)

    # ... regardless, we should never need to fetch more than ec_ndata
    # frags for any given etag
    for etag, frags in collected_responses.items():
        self.assertTrue(len(frags) <= self.policy.ec_ndata,
                        'collected %s frags for etag %s' % (
                            len(frags), etag))
def test_GET_with_missing_and_mixed_frags_will_dig_deep_but_succeed(self):
    obj1 = self._make_ec_object_stub()
    obj2 = self._make_ec_object_stub()

    node_frags = [
        {'obj': obj1, 'frag': 0},
        {'obj': obj2, 'frag': 0},
        {},
        {'obj': obj1, 'frag': 1},
        {'obj': obj2, 'frag': 1},
        {},
        {'obj': obj1, 'frag': 2},
        {'obj': obj2, 'frag': 2},
        {},
        {'obj': obj1, 'frag': 3},
        {'obj': obj2, 'frag': 3},
        {},
        {'obj': obj1, 'frag': 4},
        {'obj': obj2, 'frag': 4},
        {},
        {'obj': obj1, 'frag': 5},
        {'obj': obj2, 'frag': 5},
        {},
        {'obj': obj1, 'frag': 6},
        {'obj': obj2, 'frag': 6},
        {},
        {'obj': obj1, 'frag': 7},
        {'obj': obj2, 'frag': 7},
        {},
        {'obj': obj1, 'frag': 8},
        {'obj': obj2, 'frag': 8},
        {},
        {'obj': obj2, 'frag': 9},
    ]

    fake_response = self._fake_ec_node_response(node_frags)

    req = swob.Request.blank('/v1/a/c/o')
    with capture_http_requests(fake_response) as log:
        resp = req.get_response(self.app)

    self.assertEqual(resp.status_int, 200)
    self.assertEqual(resp.headers['etag'], obj2['etag'])
    self.assertEqual(md5(resp.body).hexdigest(), obj2['etag'])

    collected_responses = defaultdict(set)
    for conn in log:
        etag = conn.resp.headers['X-Object-Sysmeta-Ec-Etag']
        index = conn.resp.headers['X-Object-Sysmeta-Ec-Frag-Index']
        collected_responses[etag].add(index)

    # we go exactly as long as we have to, finding two different
    # etags and some 404's (i.e. collected_responses[None])
    self.assertEqual(len(log), len(node_frags))
    self.assertEqual(len(collected_responses), 3)

    # ... regardless, we should never need to fetch more than ec_ndata
    # frags for any given etag
    for etag, frags in collected_responses.items():
        self.assertTrue(len(frags) <= self.policy.ec_ndata,
                        'collected %s frags for etag %s' % (
                            len(frags), etag))
def test_GET_with_missing_and_mixed_frags_will_dig_deep_but_stop(self):
    obj1 = self._make_ec_object_stub()
    obj2 = self._make_ec_object_stub()

    node_frags = [
        {'obj': obj1, 'frag': 0},
        {'obj': obj2, 'frag': 0},
        {},
        {'obj': obj1, 'frag': 1},
        {'obj': obj2, 'frag': 1},
        {},
        {'obj': obj1, 'frag': 2},
        {'obj': obj2, 'frag': 2},
        {},
        {'obj': obj1, 'frag': 3},
        {'obj': obj2, 'frag': 3},
        {},
        {'obj': obj1, 'frag': 4},
        {'obj': obj2, 'frag': 4},
        {},
        {'obj': obj1, 'frag': 5},
        {'obj': obj2, 'frag': 5},
        {},
        {'obj': obj1, 'frag': 6},
        {'obj': obj2, 'frag': 6},
        {},
        {'obj': obj1, 'frag': 7},
        {'obj': obj2, 'frag': 7},
        {},
        {'obj': obj1, 'frag': 8},
        {'obj': obj2, 'frag': 8},
        {},
        {},
    ]

    fake_response = self._fake_ec_node_response(node_frags)

    req = swob.Request.blank('/v1/a/c/o')
    with capture_http_requests(fake_response) as log:
        resp = req.get_response(self.app)

    self.assertEqual(resp.status_int, 404)

    collected_responses = defaultdict(set)
    for conn in log:
        etag = conn.resp.headers['X-Object-Sysmeta-Ec-Etag']
        index = conn.resp.headers['X-Object-Sysmeta-Ec-Frag-Index']
        collected_responses[etag].add(index)

    # default node_iter will exhaust at 2 * replicas
    self.assertEqual(len(log), 2 * self.replicas())
    self.assertEqual(len(collected_responses), 3)

    # ... regardless, we should never need to fetch more than ec_ndata
    # frags for any given etag
    for etag, frags in collected_responses.items():
        self.assertTrue(len(frags) <= self.policy.ec_ndata,
                        'collected %s frags for etag %s' % (
                            len(frags), etag))
def test_GET_mixed_success_with_range(self):
    fragment_size = self.policy.fragment_size

    ec_stub = self._make_ec_object_stub()
    frag_archives = ec_stub['frags']
    frag_archive_size = len(ec_stub['frags'][0])

    headers = {
        'Content-Type': 'text/plain',
        'Content-Length': fragment_size,
        'Content-Range': 'bytes 0-%s/%s' % (fragment_size - 1,
                                            frag_archive_size),
        'X-Object-Sysmeta-Ec-Content-Length': len(ec_stub['body']),
        'X-Object-Sysmeta-Ec-Etag': ec_stub['etag'],
    }
    responses = [
        StubResponse(206, frag_archives[0][:fragment_size], headers),
        StubResponse(206, frag_archives[1][:fragment_size], headers),
        StubResponse(206, frag_archives[2][:fragment_size], headers),
        StubResponse(206, frag_archives[3][:fragment_size], headers),
        StubResponse(206, frag_archives[4][:fragment_size], headers),
        # data nodes with old frag
        StubResponse(416),
        StubResponse(416),
        StubResponse(206, frag_archives[7][:fragment_size], headers),
        StubResponse(206, frag_archives[8][:fragment_size], headers),
        StubResponse(206, frag_archives[9][:fragment_size], headers),
        # the proxy should ask two more nodes to make up for the 416s
        StubResponse(206, frag_archives[10][:fragment_size], headers),
        StubResponse(206, frag_archives[11][:fragment_size], headers),
    ]

    def get_response(req):
        return responses.pop(0) if responses else StubResponse(404)

    req = swob.Request.blank('/v1/a/c/o', headers={'Range': 'bytes=0-3'})
    with capture_http_requests(get_response) as log:
        resp = req.get_response(self.app)

    self.assertEqual(resp.status_int, 206)
    self.assertEqual(resp.body, 'test')
    self.assertEqual(len(log), self.policy.ec_ndata + 2)
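
# Worked count for the assertion above, assuming the 10+4 policy used by
# these tests: the proxy needs ec_ndata (10) usable 206s, and each of
# the two 416s from stale data nodes costs one extra request, giving
# ec_ndata + 2 connections in the log.
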
def test_GET_with_range_unsatisfiable_mixed_success(self):
    responses = [
        StubResponse(416),
        StubResponse(416),
        StubResponse(416),
        StubResponse(416),
        StubResponse(416),
        StubResponse(416),
        StubResponse(416),
        # sneak in bogus extra responses
        StubResponse(404),
        StubResponse(206),
        # and then just "enough" more 416's
        StubResponse(416),
        StubResponse(416),
        StubResponse(416),
    ]

    def get_response(req):
        return responses.pop(0) if responses else StubResponse(404)

    req = swob.Request.blank('/v1/a/c/o', headers={
        'Range': 'bytes=%s-' % 100000000000000})
    with capture_http_requests(get_response) as log:
        resp = req.get_response(self.app)

    self.assertEqual(resp.status_int, 416)
    # ec_ndata responses that must agree, plus the bogus extras
    self.assertEqual(len(log), self.policy.ec_ndata + 2)
def test_GET_mixed_ranged_responses_success(self):
    segment_size = self.policy.ec_segment_size
    fragment_size = self.policy.fragment_size
    new_data = ('test' * segment_size)[:-492]
    new_etag = md5(new_data).hexdigest()
    new_archives = self._make_ec_archive_bodies(new_data)
    old_data = ('junk' * segment_size)[:-492]
    old_etag = md5(old_data).hexdigest()
    old_archives = self._make_ec_archive_bodies(old_data)
    frag_archive_size = len(new_archives[0])

    new_headers = {
        'Content-Type': 'text/plain',
        'Content-Length': fragment_size,
        'Content-Range': 'bytes 0-%s/%s' % (fragment_size - 1,
                                            frag_archive_size),
        'X-Object-Sysmeta-Ec-Content-Length': len(new_data),
        'X-Object-Sysmeta-Ec-Etag': new_etag,
    }
    old_headers = {
        'Content-Type': 'text/plain',
        'Content-Length': fragment_size,
        'Content-Range': 'bytes 0-%s/%s' % (fragment_size - 1,
                                            frag_archive_size),
        'X-Object-Sysmeta-Ec-Content-Length': len(old_data),
        'X-Object-Sysmeta-Ec-Etag': old_etag,
    }
    # 7 primaries with stale frags, 3 handoffs failed to get new frags
    responses = [
        StubResponse(206, old_archives[0][:fragment_size], old_headers),
        StubResponse(206, new_archives[1][:fragment_size], new_headers),
        StubResponse(206, old_archives[2][:fragment_size], old_headers),
        StubResponse(206, new_archives[3][:fragment_size], new_headers),
        StubResponse(206, old_archives[4][:fragment_size], old_headers),
        StubResponse(206, new_archives[5][:fragment_size], new_headers),
        StubResponse(206, old_archives[6][:fragment_size], old_headers),
        StubResponse(206, new_archives[7][:fragment_size], new_headers),
        StubResponse(206, old_archives[8][:fragment_size], old_headers),
        StubResponse(206, new_archives[9][:fragment_size], new_headers),
        StubResponse(206, old_archives[10][:fragment_size], old_headers),
        StubResponse(206, new_archives[11][:fragment_size], new_headers),
        StubResponse(206, old_archives[12][:fragment_size], old_headers),
        StubResponse(206, new_archives[13][:fragment_size], new_headers),
        StubResponse(206, new_archives[0][:fragment_size], new_headers),
        StubResponse(404),
        StubResponse(404),
        StubResponse(206, new_archives[6][:fragment_size], new_headers),
        StubResponse(404),
        StubResponse(206, new_archives[10][:fragment_size], new_headers),
        StubResponse(206, new_archives[12][:fragment_size], new_headers),
    ]

    def get_response(req):
        return responses.pop(0) if responses else StubResponse(404)

    req = swob.Request.blank('/v1/a/c/o')
    with capture_http_requests(get_response) as log:
        resp = req.get_response(self.app)

    self.assertEqual(resp.status_int, 200)
    self.assertEqual(resp.body, new_data[:segment_size])
    self.assertEqual(len(log), self.policy.ec_ndata + 10)
def test_GET_mismatched_fragment_archives(self):
    segment_size = self.policy.ec_segment_size
    test_data1 = ('test' * segment_size)[:-333]
    # N.B. the object data *length* here is different
    test_data2 = ('blah1' * segment_size)[:-333]

    etag1 = md5(test_data1).hexdigest()
    etag2 = md5(test_data2).hexdigest()

    ec_archive_bodies1 = self._make_ec_archive_bodies(test_data1)
    ec_archive_bodies2 = self._make_ec_archive_bodies(test_data2)
    headers1 = {'X-Object-Sysmeta-Ec-Etag': etag1,
                'X-Object-Sysmeta-Ec-Content-Length': '333'}
    # here we're going to *lie* and say the etag here matches
    headers2 = {'X-Object-Sysmeta-Ec-Etag': etag1,
                'X-Object-Sysmeta-Ec-Content-Length': '333'}
        responses1 = [(200, body, headers1)
                      for body in ec_archive_bodies1]
        responses2 = [(200, body, headers2)
                      for body in ec_archive_bodies2]

        req = swob.Request.blank('/v1/a/c/o')

        # sanity check responses1
        responses = responses1[:self.policy.ec_ndata]
        status_codes, body_iter, headers = zip(*responses)
        with set_http_connect(*status_codes, body_iter=body_iter,
                              headers=headers):
            resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 200)
            self.assertEqual(md5(resp.body).hexdigest(), etag1)

        # sanity check responses2
        responses = responses2[:self.policy.ec_ndata]
        status_codes, body_iter, headers = zip(*responses)
        with set_http_connect(*status_codes, body_iter=body_iter,
                              headers=headers):
            resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 200)
            self.assertEqual(md5(resp.body).hexdigest(), etag2)

        # now mix the responses a bit
        mix_index = random.randint(0, self.policy.ec_ndata - 1)
        mixed_responses = responses1[:self.policy.ec_ndata]
        mixed_responses[mix_index] = responses2[mix_index]

        status_codes, body_iter, headers = zip(*mixed_responses)
        with set_http_connect(*status_codes, body_iter=body_iter,
                              headers=headers):
            resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 200)
            try:
                resp.body
            except ECDriverError:
                resp._app_iter.close()
            else:
                self.fail('invalid ec fragment response body did not '
                          'blow up!')
        error_lines = self.logger.get_lines_for_level('error')
        self.assertEqual(1, len(error_lines))
        msg = error_lines[0]
        self.assertTrue('Error decoding fragments' in msg)
        self.assertTrue('/a/c/o' in msg)
        log_msg_args, log_msg_kwargs = self.logger.log_dict['error'][0]
        self.assertEqual(log_msg_kwargs['exc_info'][0], ECDriverError)
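        # NOTE: a minimal sketch of the decode failure this test drives,
        # not part of the original assertions; it assumes self.policy is
        # an EC policy wired to a pyeclib driver, as elsewhere in this
        # test class:
        #
        #   frags1 = self.policy.pyeclib_driver.encode('x' * 1024)
        #   frags2 = self.policy.pyeclib_driver.encode('y' * 1024)
        #   mixed = frags1[:self.policy.ec_ndata]
        #   mixed[0] = frags2[0]  # one fragment from the wrong object
        #   # decoding the mixed set is what raises the ECDriverError
        #   # that the proxy logs above:
        #   self.policy.pyeclib_driver.decode(mixed)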
    def test_GET_read_timeout(self):
        segment_size = self.policy.ec_segment_size
        test_data = ('test' * segment_size)[:-333]
        etag = md5(test_data).hexdigest()
        ec_archive_bodies = self._make_ec_archive_bodies(test_data)
        headers = {'X-Object-Sysmeta-Ec-Etag': etag}
        self.app.recoverable_node_timeout = 0.01
        responses = [(200, SlowBody(body, 0.1), headers)
                     for body in ec_archive_bodies]

        req = swob.Request.blank('/v1/a/c/o')

        status_codes, body_iter, headers = zip(*responses + [
            (404, '', {}) for i in range(
                self.policy.object_ring.max_more_nodes)])
        with set_http_connect(*status_codes, body_iter=body_iter,
                              headers=headers):
            resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 200)
            # do this inside the fake http context manager, it'll try to
            # resume but won't be able to give us all the right bytes
            self.assertNotEqual(md5(resp.body).hexdigest(), etag)
        error_lines = self.logger.get_lines_for_level('error')
        self.assertEqual(self.replicas(), len(error_lines))
        nparity = self.policy.ec_nparity
        for line in error_lines[:nparity]:
            self.assertTrue('retrying' in line)
        for line in error_lines[nparity:]:
            self.assertTrue('ChunkReadTimeout (0.01s)' in line)
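        # NOTE: a plausible reading of the error-line arithmetic above:
        # all replicas() nodes time out, the first ec_nparity timeouts can
        # still be swapped for another node (hence 'retrying'), and once
        # the spares are exhausted the remaining timeouts are terminal and
        # log the ChunkReadTimeout itself.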
    def test_GET_read_timeout_resume(self):
        segment_size = self.policy.ec_segment_size
        test_data = ('test' * segment_size)[:-333]
        etag = md5(test_data).hexdigest()
        ec_archive_bodies = self._make_ec_archive_bodies(test_data)
        headers = {'X-Object-Sysmeta-Ec-Etag': etag}
        self.app.recoverable_node_timeout = 0.05
        # first one is slow
        responses = [(200, SlowBody(ec_archive_bodies[0], 0.1), headers)]
        # ... the rest are fine
        responses += [(200, body, headers)
                      for body in ec_archive_bodies[1:]]

        req = swob.Request.blank('/v1/a/c/o')

        status_codes, body_iter, headers = zip(
            *responses[:self.policy.ec_ndata + 1])
        with set_http_connect(*status_codes, body_iter=body_iter,
                              headers=headers):
            resp = req.get_response(self.app)
            self.assertEqual(resp.status_int, 200)
            self.assertEqual(md5(resp.body).hexdigest(), etag)
        error_lines = self.logger.get_lines_for_level('error')
        self.assertEqual(1, len(error_lines))
        self.assertTrue('retrying' in error_lines[0])
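        # NOTE: only one error line is expected because only the first
        # (slow) node trips the 0.05s timeout; the single spare response
        # primed above (ec_ndata + 1 in total) lets the proxy resume and
        # still decode the complete, correct body.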
    def test_fix_response_HEAD(self):
        headers = {'X-Object-Sysmeta-Ec-Content-Length': '10',
                   'X-Object-Sysmeta-Ec-Etag': 'foo'}

        # successful HEAD
        responses = [(200, '', headers)]
        status_codes, body_iter, headers = zip(*responses)
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='HEAD')
        with set_http_connect(*status_codes, body_iter=body_iter,
                              headers=headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 200)
        self.assertEqual(resp.body, '')
        # 200 OK shows the original object's content length
        self.assertEqual(resp.headers['Content-Length'], '10')
        self.assertEqual(resp.headers['Etag'], 'foo')

        # not found HEAD
        responses = [(404, '', {})] * self.replicas() * 2
        status_codes, body_iter, headers = zip(*responses)
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='HEAD')
        with set_http_connect(*status_codes, body_iter=body_iter,
                              headers=headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 404)
        # 404 shows actual response body size (i.e. 0 for HEAD)
        self.assertEqual(resp.headers['Content-Length'], '0')
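        # NOTE: the asymmetry is the point here. On a 200 the stored
        # fragment archive's size would be meaningless to the client, so
        # the proxy rewrites Content-Length and Etag from the
        # X-Object-Sysmeta-Ec-* values; on a 404 there is no object, so
        # the actual (empty) response body size stands.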
    def test_PUT_with_slow_commits(self):
        # It's important that this timeout be much less than the delay in
        # the slow commit responses so that the slow commits are not waited
        # for.
        self.app.post_quorum_timeout = 0.01
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')
        # plenty of slow commits
        response_sleep = 5.0
        codes = [FakeStatus(201, response_sleep=response_sleep)
                 for i in range(self.replicas())]
        # swap out some with regular fast responses
        number_of_fast_responses_needed_to_be_quick_enough = \
            self.policy.quorum
        fast_indexes = random.sample(
            range(self.replicas()),
            number_of_fast_responses_needed_to_be_quick_enough)
        for i in fast_indexes:
            codes[i] = 201
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            start = time.time()
            resp = req.get_response(self.app)
            response_time = time.time() - start
        self.assertEqual(resp.status_int, 201)
        self.assertTrue(response_time < response_sleep)
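        # NOTE: the timing argument made concrete: once self.policy.quorum
        # fast 201s arrive, the proxy waits at most post_quorum_timeout
        # (0.01s) for stragglers, so response_time should land near 0.01s
        # rather than near the 5.0s the slow commits would take.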
    def test_PUT_with_just_enough_durable_responses(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')

        codes = [201] * (self.policy.ec_ndata + 1)
        codes += [503] * (self.policy.ec_nparity - 1)
        self.assertEqual(len(codes), self.replicas())
        random.shuffle(codes)
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 201)
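        # NOTE: worked example, assuming a 10 + 4 policy: 11 successes and
        # 3 failures is just enough for a durable EC PUT, hence the 201.
        # test_PUT_with_less_durable_responses below drops to 10 successes,
        # one short of that, and expects 503.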
    def test_PUT_with_less_durable_responses(self):
        req = swift.common.swob.Request.blank('/v1/a/c/o', method='PUT',
                                              body='')

        codes = [201] * (self.policy.ec_ndata)
        codes += [503] * (self.policy.ec_nparity)
        self.assertEqual(len(codes), self.replicas())
        random.shuffle(codes)
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*codes, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 503)
    def test_GET_with_invalid_ranges(self):
        # real body size is segment_size - 10 (just 1 segment)
        segment_size = self.policy.ec_segment_size
        real_body = ('a' * segment_size)[:-10]

        # range starts past the real body but within the segment size
        self._test_invalid_ranges('GET', real_body,
                                  segment_size, '%s-' % (segment_size - 10))
        # range starts past both the real body and the segment size
        self._test_invalid_ranges('GET', real_body,
                                  segment_size, '%s-' % (segment_size + 10))
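    # Helper for test_GET_with_invalid_ranges above: primes object servers
    # with either 416s (range beyond even the fragment archives) or 200s
    # carrying whole fragments (range beyond the real body but within a
    # segment), and verifies the client sees a well-formed 416 either way.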
    def _test_invalid_ranges(self, method, real_body, segment_size,
                             req_range):
        # make a request whose range starts beyond the real body size.
        body_etag = md5(real_body).hexdigest()
        req = swift.common.swob.Request.blank(
            '/v1/a/c/o', method=method,
            headers={'Destination': 'c1/o',
                     'Range': 'bytes=%s' % (req_range)})

        fragments = self.policy.pyeclib_driver.encode(real_body)
        fragment_payloads = [fragments]

        node_fragments = zip(*fragment_payloads)
        self.assertEqual(len(node_fragments), self.replicas())  # sanity
        headers = {'X-Object-Sysmeta-Ec-Content-Length': str(len(real_body)),
                   'X-Object-Sysmeta-Ec-Etag': body_etag}
        start = int(req_range.split('-')[0])
        self.assertTrue(start >= 0)  # sanity
        title, exp = swob.RESPONSE_REASONS[416]
        range_not_satisfiable_body = \
            '<html><h1>%s</h1><p>%s</p></html>' % (title, exp)
        if start >= segment_size:
            responses = [(416, range_not_satisfiable_body, headers)
                         for i in range(POLICIES.default.ec_ndata)]
        else:
            responses = [(200, ''.join(node_fragments[i]), headers)
                         for i in range(POLICIES.default.ec_ndata)]
        status_codes, body_iter, headers = zip(*responses)
        expect_headers = {
            'X-Obj-Metadata-Footer': 'yes',
            'X-Obj-Multiphase-Commit': 'yes'
        }
        with set_http_connect(*status_codes, body_iter=body_iter,
                              headers=headers, expect_headers=expect_headers):
            resp = req.get_response(self.app)
        self.assertEqual(resp.status_int, 416)
        self.assertEqual(resp.content_length, len(range_not_satisfiable_body))
        self.assertEqual(resp.body, range_not_satisfiable_body)
        self.assertEqual(resp.etag, body_etag)
        self.assertEqual(resp.headers['Accept-Ranges'], 'bytes')


if __name__ == '__main__':
    unittest.main()