2013-09-20 01:00:54 +08:00
|
|
|
# Copyright (c) 2010-2012 OpenStack Foundation
|
2013-09-07 16:29:15 +02:00
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
|
|
# implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
""" Swift tests """
|
|
|
|
|
2015-07-28 21:03:05 +05:30
|
|
|
from __future__ import print_function
|
2010-11-11 16:41:07 -06:00
|
|
|
import os
|
2012-04-30 16:38:15 -04:00
|
|
|
import copy
|
Adding StatsD logging to Swift.
Documentation, including a list of metrics reported and their semantics,
is in the Admin Guide in a new section, "Reporting Metrics to StatsD".
An optional "metric prefix" may be configured which will be prepended to
every metric name sent to StatsD.
Here is the rationale for doing a deep integration like this versus only
sending metrics to StatsD in middleware. It's the only way to report
some internal activities of Swift in a real-time manner. So to have one
way of reporting to StatsD and one place/style of configuration, even
some things (like, say, timing of PUT requests into the proxy-server)
which could be logged via middleware are consistently logged the same
way (deep integration via the logger delegate methods).
When log_statsd_host is configured, get_logger() injects a
swift.common.utils.StatsdClient object into the logger as
logger.statsd_client. Then a set of delegate methods on LogAdapter
either pass through to the StatsdClient object or become no-ops. This
allows StatsD logging to look like:
self.logger.increment('some.metric.here')
and do the right thing in all cases and with no messy conditional logic.
I wanted to use the pystatsd module for the StatsD client, but the
version on PyPi is lagging the git repo (and is missing both the prefix
functionality and timing_since() method). So I wrote my
swift.common.utils.StatsdClient. The interface is the same as
pystatsd.Client, but the code was written from scratch. It's pretty
simple, and the tests I added cover it. This also frees Swift from an
optional dependency on the pystatsd module, making this feature easier
to enable.
There's test coverage for the new code and all existing tests continue
to pass.
Refactored out _one_audit_pass() method in swift/account/auditor.py and
swift/container/auditor.py.
Fixed some misc. PEP8 violations.
Misc test cleanups and refactorings (particularly the way "fake logging"
is handled).
Change-Id: Ie968a9ae8771f59ee7591e2ae11999c44bfe33b2
2012-04-01 16:47:08 -07:00
|
|
|
import logging
|
2018-05-22 16:17:12 -05:00
|
|
|
import logging.handlers
|
2013-11-18 11:41:58 -08:00
|
|
|
import sys
|
2014-04-28 19:22:51 -07:00
|
|
|
from contextlib import contextmanager, closing
|
2014-03-17 12:18:25 -07:00
|
|
|
from collections import defaultdict, Iterable
|
2018-05-22 16:17:12 -05:00
|
|
|
from hashlib import md5
|
2015-03-31 22:35:37 -07:00
|
|
|
import itertools
|
2014-05-27 16:57:25 -07:00
|
|
|
from numbers import Number
|
2010-11-11 16:41:07 -06:00
|
|
|
from tempfile import NamedTemporaryFile
|
2013-08-30 21:37:07 -07:00
|
|
|
import time
|
2015-03-31 22:35:37 -07:00
|
|
|
import eventlet
|
2017-06-02 17:47:25 -07:00
|
|
|
from eventlet import greenpool, debug as eventlet_debug
|
2010-07-12 17:03:45 -05:00
|
|
|
from eventlet.green import socket
|
Add checksum to object extended attributes
Currently, our integrity checking for objects is pretty weak when it
comes to object metadata. If the extended attributes on a .data or
.meta file get corrupted in such a way that we can still unpickle it,
we don't have anything that detects that.
This could be especially bad with encrypted etags; if the encrypted
etag (X-Object-Sysmeta-Crypto-Etag or whatever it is) gets some bits
flipped, then we'll cheerfully decrypt the cipherjunk into plainjunk,
then send it to the client. Net effect is that the client sees a GET
response with an ETag that doesn't match the MD5 of the object *and*
Swift has no way of detecting and quarantining this object.
Note that, with an unencrypted object, if the ETag metadatum gets
mangled, then the object will be quarantined by the object server or
auditor, whichever notices first.
As part of this commit, I also ripped out some mocking of
getxattr/setxattr in tests. It appears to be there to allow unit tests
to run on systems where /tmp doesn't support xattrs. However, since
the mock is keyed off of inode number and inode numbers get re-used,
there's lots of leakage between different test runs. On a real FS,
unlinking a file and then creating a new one of the same name will
also reset the xattrs; this isn't the case with the mock.
The mock was pretty old; Ubuntu 12.04 and up all support xattrs in
/tmp, and recent Red Hat / CentOS releases do too. The xattr mock was
added in 2011; maybe it was to support Ubuntu Lucid Lynx?
Bonus: now you can pause a test with the debugger, inspect its files
in /tmp, and actually see the xattrs along with the data.
Since this patch now uses a real filesystem for testing filesystem
operations, tests are skipped if the underlying filesystem does not
support setting xattrs (eg tmpfs or more than 4k of xattrs on ext4).
References to "/tmp" have been replaced with calls to
tempfile.gettempdir(). This will allow setting the TMPDIR envvar in
test setup and getting an XFS filesystem instead of ext4 or tmpfs.
THIS PATCH SIGNIFICANTLY CHANGES TESTING ENVIRONMENTS
With this patch, every test environment will require TMPDIR to be
using a filesystem that supports at least 4k of extended attributes.
Neither ext4 nor tmpfs supports this. XFS is recommended.
So why all the SkipTests? Why not simply raise an error? We still need
the tests to run on the base image for OpenStack's CI system. Since
we were previously mocking out xattr, there wasn't a problem, but we
also weren't actually testing anything. This patch adds functionality
to validate xattr data, so we need to drop the mock.
`test.unit.skip_if_no_xattrs()` is also imported into `test.functional`
so that functional tests can import it from the functional test
namespace.
The related OpenStack CI infrastructure changes are made in
https://review.openstack.org/#/c/394600/.
Co-Authored-By: John Dickinson <me@not.mn>
Change-Id: I98a37c0d451f4960b7a12f648e4405c6c6716808
2016-06-30 16:52:58 -07:00
|
|
|
from tempfile import mkdtemp, mkstemp, gettempdir
|
2011-02-11 13:18:19 -06:00
|
|
|
from shutil import rmtree
|
2016-06-07 10:35:18 +00:00
|
|
|
import signal
|
2016-06-29 03:32:09 -05:00
|
|
|
import json
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficient PUT/GET for global distributed cluster[1].
Problem:
Erasure coding has the capability to decrease the amount of actual stored
data to less than the replicated model. For example, an ec_k=6, ec_m=3 policy
stores 1.5x the original data, which is smaller than 3x replication.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of mathematical calculation for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incoming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch includes the following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will include unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
import random
|
Add checksum to object extended attributes
Currently, our integrity checking for objects is pretty weak when it
comes to object metadata. If the extended attributes on a .data or
.meta file get corrupted in such a way that we can still unpickle it,
we don't have anything that detects that.
This could be especially bad with encrypted etags; if the encrypted
etag (X-Object-Sysmeta-Crypto-Etag or whatever it is) gets some bits
flipped, then we'll cheerfully decrypt the cipherjunk into plainjunk,
then send it to the client. Net effect is that the client sees a GET
response with an ETag that doesn't match the MD5 of the object *and*
Swift has no way of detecting and quarantining this object.
Note that, with an unencrypted object, if the ETag metadatum gets
mangled, then the object will be quarantined by the object server or
auditor, whichever notices first.
As part of this commit, I also ripped out some mocking of
getxattr/setxattr in tests. It appears to be there to allow unit tests
to run on systems where /tmp doesn't support xattrs. However, since
the mock is keyed off of inode number and inode numbers get re-used,
there's lots of leakage between different test runs. On a real FS,
unlinking a file and then creating a new one of the same name will
also reset the xattrs; this isn't the case with the mock.
The mock was pretty old; Ubuntu 12.04 and up all support xattrs in
/tmp, and recent Red Hat / CentOS releases do too. The xattr mock was
added in 2011; maybe it was to support Ubuntu Lucid Lynx?
Bonus: now you can pause a test with the debugger, inspect its files
in /tmp, and actually see the xattrs along with the data.
Since this patch now uses a real filesystem for testing filesystem
operations, tests are skipped if the underlying filesystem does not
support setting xattrs (eg tmpfs or more than 4k of xattrs on ext4).
References to "/tmp" have been replaced with calls to
tempfile.gettempdir(). This will allow setting the TMPDIR envvar in
test setup and getting an XFS filesystem instead of ext4 or tmpfs.
THIS PATCH SIGNIFICANTLY CHANGES TESTING ENVIRONMENTS
With this patch, every test environment will require TMPDIR to be
using a filesystem that supports at least 4k of extended attributes.
Neither ext4 nor tmpfs supports this. XFS is recommended.
So why all the SkipTests? Why not simply raise an error? We still need
the tests to run on the base image for OpenStack's CI system. Since
we were previously mocking out xattr, there wasn't a problem, but we
also weren't actually testing anything. This patch adds functionality
to validate xattr data, so we need to drop the mock.
`test.unit.skip_if_no_xattrs()` is also imported into `test.functional`
so that functional tests can import it from the functional test
namespace.
The related OpenStack CI infrastructure changes are made in
https://review.openstack.org/#/c/394600/.
Co-Authored-By: John Dickinson <me@not.mn>
Change-Id: I98a37c0d451f4960b7a12f648e4405c6c6716808
2016-06-30 16:52:58 -07:00
|
|
|
import errno
|
|
|
|
import xattr
|
2019-10-14 14:03:16 -07:00
|
|
|
from io import BytesIO
|
Add checksum to object extended attributes
Currently, our integrity checking for objects is pretty weak when it
comes to object metadata. If the extended attributes on a .data or
.meta file get corrupted in such a way that we can still unpickle it,
we don't have anything that detects that.
This could be especially bad with encrypted etags; if the encrypted
etag (X-Object-Sysmeta-Crypto-Etag or whatever it is) gets some bits
flipped, then we'll cheerfully decrypt the cipherjunk into plainjunk,
then send it to the client. Net effect is that the client sees a GET
response with an ETag that doesn't match the MD5 of the object *and*
Swift has no way of detecting and quarantining this object.
Note that, with an unencrypted object, if the ETag metadatum gets
mangled, then the object will be quarantined by the object server or
auditor, whichever notices first.
As part of this commit, I also ripped out some mocking of
getxattr/setxattr in tests. It appears to be there to allow unit tests
to run on systems where /tmp doesn't support xattrs. However, since
the mock is keyed off of inode number and inode numbers get re-used,
there's lots of leakage between different test runs. On a real FS,
unlinking a file and then creating a new one of the same name will
also reset the xattrs; this isn't the case with the mock.
The mock was pretty old; Ubuntu 12.04 and up all support xattrs in
/tmp, and recent Red Hat / CentOS releases do too. The xattr mock was
added in 2011; maybe it was to support Ubuntu Lucid Lynx?
Bonus: now you can pause a test with the debugger, inspect its files
in /tmp, and actually see the xattrs along with the data.
Since this patch now uses a real filesystem for testing filesystem
operations, tests are skipped if the underlying filesystem does not
support setting xattrs (eg tmpfs or more than 4k of xattrs on ext4).
References to "/tmp" have been replaced with calls to
tempfile.gettempdir(). This will allow setting the TMPDIR envvar in
test setup and getting an XFS filesystem instead of ext4 or tmpfs.
THIS PATCH SIGNIFICANTLY CHANGES TESTING ENVIRONMENTS
With this patch, every test environment will require TMPDIR to be
using a filesystem that supports at least 4k of extended attributes.
Neither ext4 nor tmpfs supports this. XFS is recommended.
So why all the SkipTests? Why not simply raise an error? We still need
the tests to run on the base image for OpenStack's CI system. Since
we were previously mocking out xattr, there wasn't a problem, but we
also weren't actually testing anything. This patch adds functionality
to validate xattr data, so we need to drop the mock.
`test.unit.skip_if_no_xattrs()` is also imported into `test.functional`
so that functional tests can import it from the functional test
namespace.
The related OpenStack CI infrastructure changes are made in
https://review.openstack.org/#/c/394600/.
Co-Authored-By: John Dickinson <me@not.mn>
Change-Id: I98a37c0d451f4960b7a12f648e4405c6c6716808
2016-06-30 16:52:58 -07:00
|
|
|
|
2019-04-15 21:44:18 -07:00
|
|
|
import six
|
2018-05-22 16:17:12 -05:00
|
|
|
import six.moves.cPickle as pickle
|
|
|
|
from six.moves import range
|
|
|
|
from six.moves.http_client import HTTPException
|
|
|
|
|
2018-11-29 01:31:13 -06:00
|
|
|
from swift.common import storage_policy, swob, utils
|
|
|
|
from swift.common.storage_policy import (StoragePolicy, ECStoragePolicy,
|
|
|
|
VALID_EC_TYPES)
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explicit "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
from swift.common.utils import Timestamp, NOTICE
|
2012-04-30 16:38:15 -04:00
|
|
|
from test import get_config
|
2016-03-02 10:28:51 +00:00
|
|
|
from swift.common.header_key_dict import HeaderKeyDict
|
2017-06-20 11:17:33 +01:00
|
|
|
from swift.common.ring import Ring, RingData, RingBuilder
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficient PUT/GET for global distributed cluster[1].
Problem:
Erasure coding has the capability to decrease the amount of actual stored
data to less than the replicated model. For example, an ec_k=6, ec_m=3 policy
stores 1.5x the original data, which is smaller than 3x replication.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of mathematical calculation for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incoming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch includes the following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will include unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
from swift.obj import server
|
2018-11-29 01:31:13 -06:00
|
|
|
|
2014-03-17 12:18:25 -07:00
|
|
|
import functools
|
2014-04-28 19:22:51 -07:00
|
|
|
from gzip import GzipFile
|
|
|
|
import mock as mocklib
|
2015-03-31 22:35:37 -07:00
|
|
|
import inspect
|
Add checksum to object extended attributes
Currently, our integrity checking for objects is pretty weak when it
comes to object metadata. If the extended attributes on a .data or
.meta file get corrupted in such a way that we can still unpickle it,
we don't have anything that detects that.
This could be especially bad with encrypted etags; if the encrypted
etag (X-Object-Sysmeta-Crypto-Etag or whatever it is) gets some bits
flipped, then we'll cheerfully decrypt the cipherjunk into plainjunk,
then send it to the client. Net effect is that the client sees a GET
response with an ETag that doesn't match the MD5 of the object *and*
Swift has no way of detecting and quarantining this object.
Note that, with an unencrypted object, if the ETag metadatum gets
mangled, then the object will be quarantined by the object server or
auditor, whichever notices first.
As part of this commit, I also ripped out some mocking of
getxattr/setxattr in tests. It appears to be there to allow unit tests
to run on systems where /tmp doesn't support xattrs. However, since
the mock is keyed off of inode number and inode numbers get re-used,
there's lots of leakage between different test runs. On a real FS,
unlinking a file and then creating a new one of the same name will
also reset the xattrs; this isn't the case with the mock.
The mock was pretty old; Ubuntu 12.04 and up all support xattrs in
/tmp, and recent Red Hat / CentOS releases do too. The xattr mock was
added in 2011; maybe it was to support Ubuntu Lucid Lynx?
Bonus: now you can pause a test with the debugger, inspect its files
in /tmp, and actually see the xattrs along with the data.
Since this patch now uses a real filesystem for testing filesystem
operations, tests are skipped if the underlying filesystem does not
support setting xattrs (eg tmpfs or more than 4k of xattrs on ext4).
References to "/tmp" have been replaced with calls to
tempfile.gettempdir(). This will allow setting the TMPDIR envvar in
test setup and getting an XFS filesystem instead of ext4 or tmpfs.
THIS PATCH SIGNIFICANTLY CHANGES TESTING ENVIRONMENTS
With this patch, every test environment will require TMPDIR to be
using a filesystem that supports at least 4k of extended attributes.
Neither ext4 nor tmpfs supports this. XFS is recommended.
So why all the SkipTests? Why not simply raise an error? We still need
the tests to run on the base image for OpenStack's CI system. Since
we were previously mocking out xattr, there wasn't a problem, but we
also weren't actually testing anything. This patch adds functionality
to validate xattr data, so we need to drop the mock.
`test.unit.skip_if_no_xattrs()` is also imported into `test.functional`
so that functional tests can import it from the functional test
namespace.
The related OpenStack CI infrastructure changes are made in
https://review.openstack.org/#/c/394600/.
Co-Authored-By: John Dickinson <me@not.mn>
Change-Id: I98a37c0d451f4960b7a12f648e4405c6c6716808
2016-06-30 16:52:58 -07:00
|
|
|
import unittest
|
|
|
|
|
|
|
|
|
2020-01-12 03:10:25 -06:00
|
|
|
class SkipTest(unittest.SkipTest):
|
Add checksum to object extended attributes
Currently, our integrity checking for objects is pretty weak when it
comes to object metadata. If the extended attributes on a .data or
.meta file get corrupted in such a way that we can still unpickle it,
we don't have anything that detects that.
This could be especially bad with encrypted etags; if the encrypted
etag (X-Object-Sysmeta-Crypto-Etag or whatever it is) gets some bits
flipped, then we'll cheerfully decrypt the cipherjunk into plainjunk,
then send it to the client. Net effect is that the client sees a GET
response with an ETag that doesn't match the MD5 of the object *and*
Swift has no way of detecting and quarantining this object.
Note that, with an unencrypted object, if the ETag metadatum gets
mangled, then the object will be quarantined by the object server or
auditor, whichever notices first.
As part of this commit, I also ripped out some mocking of
getxattr/setxattr in tests. It appears to be there to allow unit tests
to run on systems where /tmp doesn't support xattrs. However, since
the mock is keyed off of inode number and inode numbers get re-used,
there's lots of leakage between different test runs. On a real FS,
unlinking a file and then creating a new one of the same name will
also reset the xattrs; this isn't the case with the mock.
The mock was pretty old; Ubuntu 12.04 and up all support xattrs in
/tmp, and recent Red Hat / CentOS releases do too. The xattr mock was
added in 2011; maybe it was to support Ubuntu Lucid Lynx?
Bonus: now you can pause a test with the debugger, inspect its files
in /tmp, and actually see the xattrs along with the data.
Since this patch now uses a real filesystem for testing filesystem
operations, tests are skipped if the underlying filesystem does not
support setting xattrs (eg tmpfs or more than 4k of xattrs on ext4).
References to "/tmp" have been replaced with calls to
tempfile.gettempdir(). This will allow setting the TMPDIR envvar in
test setup and getting an XFS filesystem instead of ext4 or tmpfs.
THIS PATCH SIGNIFICANTLY CHANGES TESTING ENVIRONMENTS
With this patch, every test environment will require TMPDIR to be
using a filesystem that supports at least 4k of extended attributes.
Neither ext4 nor tmpfs support this. XFS is recommended.
So why all the SkipTests? Why not simply raise an error? We still need
the tests to run on the base image for OpenStack's CI system. Since
we were previously mocking out xattr, there wasn't a problem, but we
also weren't actually testing anything. This patch adds functionality
to validate xattr data, so we need to drop the mock.
`test.unit.skip_if_no_xattrs()` is also imported into `test.functional`
so that functional tests can import it from the functional test
namespace.
The related OpenStack CI infrastructure changes are made in
https://review.openstack.org/#/c/394600/.
Co-Authored-By: John Dickinson <me@not.mn>
Change-Id: I98a37c0d451f4960b7a12f648e4405c6c6716808
2016-06-30 16:52:58 -07:00
|
|
|
pass
|
2015-03-31 22:35:37 -07:00
|
|
|
|
2020-04-03 10:44:25 -07:00
|
|
|
|
2015-03-31 22:35:37 -07:00
|
|
|
# Hex digest of an md5 object that has been fed no data, i.e. the digest of
# empty input.  NOTE(review): presumably the ETag expected for zero-byte
# bodies in tests -- confirm against callers.
EMPTY_ETAG = md5().hexdigest()
|
2014-03-17 12:18:25 -07:00
|
|
|
|
2015-01-08 20:29:47 -08:00
|
|
|
# try not to import this module from swift
# The override below is applied only when the basename of sys.argv[0] does
# not start with 'swift'.
if not os.path.basename(sys.argv[0]).startswith('swift'):
    # never patch HASH_PATH_SUFFIX AGAIN!
    # Replace the hash path suffix on the utils module with a fixed bytes
    # value at import time.
    utils.HASH_PATH_SUFFIX = b'endcap'
|
2015-01-08 20:29:47 -08:00
|
|
|
|
2014-03-17 12:18:25 -07:00
|
|
|
|
2015-10-23 13:16:33 +00:00
|
|
|
# Erasure-coding backends the tests are willing to use, most preferred first.
EC_TYPE_PREFERENCE = [
    'liberasurecode_rs_vand',
    'jerasure_rs_vand',
]

# Take the first preferred backend that is listed in VALID_EC_TYPES; abort
# the whole run with an explanatory message when none of them is.
eclib_name = next(
    (candidate for candidate in EC_TYPE_PREFERENCE
     if candidate in VALID_EC_TYPES),
    None)
if eclib_name is None:
    raise SystemExit('ERROR: unable to find suitable PyECLib type'
                     ' (none of %r found in %r)' % (
                         EC_TYPE_PREFERENCE,
                         VALID_EC_TYPES,
                     ))
DEFAULT_TEST_EC_TYPE = eclib_name
|
|
|
|
|
|
|
|
|
2015-03-31 22:35:37 -07:00
|
|
|
def patch_policies(thing_or_policies=None, legacy_only=False,
                   with_ec_default=False, fake_ring_args=None):
    """
    Build a :class:`PatchPolicies` patcher, or apply one to a test.

    :param thing_or_policies: one of

        * an iterable of policies or a ``StoragePolicyCollection`` - a
          patcher for exactly those policies is returned;
        * a falsy value (e.g. ``None``) - a patcher for one of the default
          policy sets below is returned;
        * anything else (the "thing" being decorated) - the thing is
          wrapped with a default patcher and the wrapped thing is returned.
    :param legacy_only: use a single default policy named 'legacy'; takes
                        precedence over ``with_ec_default``.
    :param with_ec_default: use an EC default policy 'ec' (10 data + 4
                            parity fragments, 4096 byte segments) plus a
                            second policy 'unu'; the EC policy's ring
                            defaults to 14 replicas.
    :param fake_ring_args: optional list of FakeRing keyword-argument dicts,
                           one per policy; when falsy the defaults matching
                           the selected policy set are used.
    :returns: a PatchPolicies instance, or the wrapped thing.
    """
    explicit_policy_types = (
        Iterable, storage_policy.StoragePolicyCollection)
    if isinstance(thing_or_policies, explicit_policy_types):
        # Caller supplied the policies themselves; nothing to default.
        return PatchPolicies(thing_or_policies, fake_ring_args=fake_ring_args)

    if legacy_only:
        policies = [StoragePolicy(0, name='legacy', is_default=True)]
        ring_args = [{}]
    elif with_ec_default:
        ec_policy = ECStoragePolicy(
            0, name='ec', is_default=True,
            ec_type=DEFAULT_TEST_EC_TYPE, ec_ndata=10,
            ec_nparity=4, ec_segment_size=4096)
        policies = [ec_policy, StoragePolicy(1, name='unu')]
        ring_args = [{'replicas': 14}, {}]
    else:
        policies = [
            StoragePolicy(0, name='nulo', is_default=True),
            StoragePolicy(1, name='unu'),
        ]
        ring_args = [{}, {}]

    if not fake_ring_args:
        fake_ring_args = ring_args
    patcher = PatchPolicies(policies, fake_ring_args=fake_ring_args)

    if thing_or_policies:
        # it's a thing, hand back the wrapped thing rather than the patcher
        return patcher(thing_or_policies)
    return patcher
|
2014-03-17 12:18:25 -07:00
|
|
|
|
|
|
|
|
|
|
|
class PatchPolicies(object):
    """
    Temporarily replace ``storage_policy._POLICIES`` with a given collection.

    Usable as a context manager, as a decorator on a callable, or as a
    decorator on a class (in which case the class' ``setUp`` is wrapped).

    Why not mock.patch?  When used as a decorator on the class it seemed to
    patch setUp at the wrong time (i.e. in setUp the global wasn't patched
    yet).

    Note that the previous value of ``_POLICIES`` is kept on the patcher
    instance itself, so entering the same instance again before exiting
    overwrites the saved value.
    """

    def __init__(self, policies, fake_ring_args=None):
        """
        :param policies: a StoragePolicyCollection, or something from which
                         one can be built.
        :param fake_ring_args: list of FakeRing keyword-argument dicts (or
                               None entries), matched up with the policies;
                               when falsy, every policy gets None.
        """
        collection_cls = storage_policy.StoragePolicyCollection
        if not isinstance(policies, collection_cls):
            policies = collection_cls(policies)
        self.policies = policies
        if not fake_ring_args:
            fake_ring_args = [None] * len(self.policies)
        self.fake_ring_args = fake_ring_args

    def _setup_rings(self):
        """
        Give each policy a freshly built FakeRing.

        Tests tend to treat the policies' rings as their own personal
        playground.  For a patched TestCase class that is a problem: the
        FakeRing objects would be scoped to the patch_policies call outside
        of the TestCase instance, so state can bleed between tests.

        To get better isolation without having to think about it, only the
        args needed to *build* a FakeRing are kept, and a new ring is built
        from them every time the patch is entered.

        A TestCase can still tweak the fresh rings in setUp - or, to get the
        same reset behaviour with custom rings, pass its own fake_ring_args
        to patch_policies instead of setting object_ring on the policy
        definitions.  Policies whose ring args are None are left untouched.
        """
        for policy, ring_kwargs in zip(self.policies, self.fake_ring_args):
            if ring_kwargs is None:
                continue
            policy.object_ring = FakeRing(**ring_kwargs)

    def __call__(self, thing):
        """Decorate ``thing``: classes get setUp patched, others wrapped."""
        if isinstance(thing, type):
            patch = self._patch_class
        else:
            patch = self._patch_method
        return patch(thing)

    def _patch_class(self, cls):
        """
        Wrap ``cls.setUp`` in place so the policies are patched before it
        runs, and return the same class.

        Creating a new class that inherits from the decorated class is the
        more common way to write class decorators - but it seems to cause
        infinite recursion when super is called from inside methods of the
        decorated class.
        """
        wrapped_setUp = cls.setUp

        def unpatch_cleanup(test_case):
            # Undo the patch at most once per test instance.
            if not test_case._policies_patched:
                return
            self.__exit__()
            test_case._policies_patched = False

        def setUp(test_case):
            already_patched = getattr(test_case, '_policies_patched', False)
            if not already_patched:
                # Order matters: patch, record it, then register the undo.
                self.__enter__()
                test_case._policies_patched = True
                test_case.addCleanup(unpatch_cleanup, test_case)
            wrapped_setUp(test_case)

        cls.setUp = setUp
        return cls

    def _patch_method(self, f):
        """Return a wrapper that calls ``f`` with the policies patched."""
        @functools.wraps(f)
        def patched(*args, **kwargs):
            with self:
                return f(*args, **kwargs)

        return patched

    def __enter__(self):
        """Install ``self.policies`` globally and rebuild the fake rings."""
        self._orig_POLICIES = storage_policy._POLICIES
        storage_policy._POLICIES = self.policies
        try:
            self._setup_rings()
        except:  # noqa
            # Whatever went wrong, don't leave the global patched.
            self.__exit__()
            raise

    def __exit__(self, *exc_info):
        """Restore the policies that were in place when we entered."""
        storage_policy._POLICIES = self._orig_POLICIES
|
2010-07-12 17:03:45 -05:00
|
|
|
|
2013-08-31 22:36:58 -04:00
|
|
|
|
2014-04-28 19:22:51 -07:00
|
|
|
class FakeRing(Ring):
|
2013-03-30 15:55:29 +03:00
|
|
|
|
2014-11-13 16:40:05 -08:00
|
|
|
def __init__(self, replicas=3, max_more_nodes=0, part_power=0,
|
|
|
|
base_port=1000):
|
2020-03-26 15:32:42 -04:00
|
|
|
self.serialized_path = '/foo/bar/object.ring.gz'
|
2014-11-13 16:40:05 -08:00
|
|
|
self._base_port = base_port
|
|
|
|
self.max_more_nodes = max_more_nodes
|
|
|
|
self._part_shift = 32 - part_power
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficient PUT/GET for global distributed cluster[1].
Problem:
Erasure coding has the capability to decrease the amount of actual stored
data to less than the replicated model. For example, ec_k=6, ec_m=3 parameter
can be 1.5x of the original data which is smaller than 3x replicated.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of mathematical calculation for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incoming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch includes the following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will include unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
self._init_device_char()
|
2013-03-30 15:55:29 +03:00
|
|
|
# 9 total nodes (6 more past the initial 3) is the cap, no matter if
|
|
|
|
# this is set higher, or R^2 for R replicas
|
2014-04-28 19:22:51 -07:00
|
|
|
self.set_replicas(replicas)
|
|
|
|
self._reload()
|
|
|
|
|
2017-06-02 17:47:25 -07:00
|
|
|
def has_changed(self):
|
|
|
|
"""
|
|
|
|
The real implementation uses getmtime on the serialized_path attribute,
|
|
|
|
which doesn't exist on our fake and relies on the implementation of
|
|
|
|
_reload which we override. So ... just NOOPE.
|
|
|
|
"""
|
|
|
|
return False
|
|
|
|
|
2014-04-28 19:22:51 -07:00
|
|
|
def _reload(self):
|
|
|
|
self._rtime = time.time()
|
|
|
|
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficient PUT/GET for global distributed cluster[1].
Problem:
Erasure coding has the capability to decrease the amount of actual stored
data to less than the replicated model. For example, ec_k=6, ec_m=3 parameter
can be 1.5x of the original data which is smaller than 3x replicated.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of mathematical calculation for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incoming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch includes the following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will include unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
@property
|
|
|
|
def device_char(self):
|
|
|
|
return next(self._device_char_iter)
|
|
|
|
|
|
|
|
def _init_device_char(self):
|
|
|
|
self._device_char_iter = itertools.cycle(
|
|
|
|
['sd%s' % chr(ord('a') + x) for x in range(26)])
|
|
|
|
|
2018-12-03 14:22:59 -08:00
|
|
|
def add_node(self, dev):
|
|
|
|
# round trip through json to ensure unicode like real rings
|
|
|
|
self._devs.append(json.loads(json.dumps(dev)))
|
|
|
|
|
2013-03-30 15:55:29 +03:00
|
|
|
def set_replicas(self, replicas):
    """
    Reset the ring to ``replicas`` freshly generated devices.

    The device list is emptied, the device name iterator is restarted and
    one device per replica is added through ``add_node``.

    :param replicas: the number of devices to generate
    """
    self.replicas = replicas
    self._devs = []
    self._init_device_char()
    for node_id in range(self.replicas):
        address = '10.0.0.%s' % node_id
        node_port = self._base_port + node_id
        self.add_node({
            'ip': address,
            'replication_ip': address,
            'port': node_port,
            'replication_port': node_port,
            'device': self.device_char,
            'zone': node_id % 3,
            'region': node_id % 2,
            'id': node_id,
        })
|
2013-03-30 15:55:29 +03:00
|
|
|
|
|
|
|
@property
|
|
|
|
def replica_count(self):
    """
    Return the value of the ring's ``replicas`` attribute.
    """
    return self.replicas
|
|
|
|
|
2014-04-28 19:22:51 -07:00
|
|
|
def _get_part_nodes(self, part):
    """
    Return the nodes for a partition.

    ``part`` is not consulted: every device of the ring is returned. Each
    node is a copy of the device dict with an added ``index`` key holding
    the device's position in the device list.
    """
    part_nodes = []
    for position, device in enumerate(list(self._devs)):
        indexed = dict(device, index=position)
        part_nodes.append(indexed)
    return part_nodes
|
2013-03-30 15:55:29 +03:00
|
|
|
|
2013-06-13 11:24:29 -07:00
|
|
|
def get_more_nodes(self, part):
    """
    Generate ``max_more_nodes`` handoff node dicts.

    ``part`` is not consulted. Node ids continue on from the ring's
    ``replicas`` count and each node carries a ``handoff_index`` counting
    up from zero.
    """
    first_id = self.replicas
    end_id = self.replicas + self.max_more_nodes
    for handoff_index, node_id in enumerate(range(first_id, end_id)):
        address = '10.0.0.%s' % node_id
        yield {'ip': address,
               'replication_ip': '10.0.0.%s' % node_id,
               'port': self._base_port + node_id,
               'replication_port': self._base_port + node_id,
               'device': 'sda',
               'zone': node_id % 3,
               'region': node_id % 2,
               'id': node_id,
               'handoff_index': handoff_index}
|
2013-03-30 15:55:29 +03:00
|
|
|
|
|
|
|
|
2014-04-28 19:22:51 -07:00
|
|
|
def write_fake_ring(path, *devs):
    """
    Pretty much just a two node, two replica, 2 part power ring...

    :param path: where the gzipped, pickled ring data is written
    :param devs: optionally, exactly two dicts whose items override the
                 defaults of the first and second device respectively
    """
    first_dev = {'id': 0, 'zone': 0, 'device': 'sda1', 'ip': '127.0.0.1',
                 'port': 6200}
    second_dev = {'id': 1, 'zone': 0, 'device': 'sdb1', 'ip': '127.0.0.1',
                  'port': 6200}

    if devs:
        first_updates, second_updates = devs
    else:
        first_updates, second_updates = {}, {}
    first_dev.update(first_updates)
    second_dev.update(second_updates)

    # one row per replica, one column per partition
    assignments = [[0, 1, 0, 1], [1, 0, 1, 0]]
    ring_devs = [first_dev, second_dev]
    with closing(GzipFile(path, 'wb')) as ring_file:
        ring_data = RingData(assignments, ring_devs, 30)
        pickle.dump(ring_data, ring_file)
|
|
|
|
|
|
|
|
|
2017-06-20 11:17:33 +01:00
|
|
|
def write_stub_builder(tmpdir, region=1, name=''):
    """
    Pretty much just a three node, three replica, 8 part power builder...

    :param tmpdir: a place to write the builder, be sure to clean it up!
    :param region: an integer, fills in region and ip
    :param name: the name of the builder (i.e. <name>.builder)

    :returns: a tuple of (builder, path of the saved builder file)
    """
    if not name:
        name = str(region)
    replicas = 3
    builder = RingBuilder(8, replicas, 1)
    for dev_index in range(replicas):
        builder.add_dev({
            'weight': 100,
            'region': '%d' % region,
            'zone': '1',
            'ip': '10.0.0.%d' % region,
            'port': '3600',
            'device': 'sdb%d' % dev_index,
        })
    builder.rebalance()
    file_name = '%s.builder' % name
    builder_file = os.path.join(tmpdir, file_name)
    builder.save(builder_file)
    return builder, builder_file
|
|
|
|
|
|
|
|
|
2015-03-31 22:35:37 -07:00
|
|
|
class FabricatedRing(Ring):
    """
    When a FakeRing just won't do - you can fabricate one to meet
    your tests needs.

    Devices are generated in memory and assigned to every replica of every
    partition in round-robin order.
    """

    def __init__(self, replicas=6, devices=8, nodes=4, port=6200,
                 part_power=4):
        self.replicas = replicas
        self.devices = devices
        self.nodes = nodes
        self.port = port
        self._part_shift = 32 - part_power
        self._reload()

    def has_changed(self):
        """Always report the ring as unchanged."""
        return False

    def _reload(self, *args, **kwargs):
        """
        Fabricate the devices and the replica/partition assignment table.

        Only the first call builds anything; later calls just refresh
        ``_rtime`` and return.
        """
        # NOTE(review): presumably doubling "now" keeps the base class from
        # deciding a reload is due - confirm against Ring
        self._rtime = time.time() * 2
        if hasattr(self, '_replica2part2dev_id'):
            return

        fabricated = []
        for dev_id in range(self.devices):
            address = '10.0.0.%d' % (dev_id % self.nodes)
            fabricated.append({
                'region': 1,
                'zone': 1,
                'weight': 1.0,
                'id': dev_id,
                'device': 'sda%d' % dev_id,
                'ip': address,
                'replication_ip': address,
                'port': self.port,
                'replication_port': self.port,
            })
        self._devs = fabricated

        part_count = 2 ** self.part_power
        self._replica2part2dev_id = [
            [None] * part_count for _ in range(self.replicas)]
        # hand out device ids partition by partition, replica by replica
        next_dev_id = itertools.cycle(range(self.devices))
        for part in range(part_count):
            for replica_row in self._replica2part2dev_id:
                replica_row[part] = next(next_dev_id)
        self._update_bookkeeping()
|
2015-03-31 22:35:37 -07:00
|
|
|
|
|
|
|
|
2013-03-30 15:55:29 +03:00
|
|
|
class FakeMemcache(object):
    """
    A minimal in-memory memcache stand-in backed by a plain dict.

    The ``time``, ``timeout`` and ``retries`` arguments are accepted but
    not used.
    """

    def __init__(self):
        self.store = {}

    def get(self, key):
        """Return the stored value for ``key``, or None if it is not set."""
        return self.store.get(key)

    def keys(self):
        """Return the keys currently held in the store."""
        return self.store.keys()

    def set(self, key, value, time=0):
        """Store ``value`` under ``key``; always returns True."""
        self.store[key] = value
        return True

    def incr(self, key, time=0):
        """Add one to the value under ``key`` (0 if unset) and return it."""
        incremented = self.store.get(key, 0) + 1
        self.store[key] = incremented
        return incremented

    @contextmanager
    def soft_lock(self, key, timeout=0, retries=5):
        """Context manager that yields True without doing any locking."""
        yield True

    def delete(self, key):
        """Drop ``key`` if possible; errors are ignored. Returns True."""
        try:
            self.store.pop(key)
        except Exception:
            # best effort: a key that is not there is not an error
            pass
        return True
|
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
|
|
|
|
def readuntil2crlfs(fd):
    """
    Read ``fd`` one byte at a time until two CRLF pairs have been counted.

    The count starts over whenever a CR arrives that does not directly
    follow an LF.

    :param fd: an object whose ``read(1)`` returns bytes
    :returns: everything read, including the terminating CRLFs
    :raises ValueError: if ``fd`` runs dry before two CRLFs were counted
    """
    received = b''
    previous = b''
    crlf_count = 0
    while crlf_count < 2:
        current = fd.read(1)
        if not current:
            raise ValueError(
                "didn't get two CRLFs; just got %r" % received)
        received += current
        if current == b'\r' and previous != b'\n':
            crlf_count = 0
        elif previous == b'\r' and current == b'\n':
            crlf_count += 1
        previous = current
    return received
|
|
|
|
|
|
|
|
|
|
|
|
def connect_tcp(hostport):
    """
    Open a socket and connect it to ``hostport``.

    :param hostport: the address handed straight to ``socket.connect``
    :returns: the connected socket
    """
    sock = socket.socket()
    sock.connect(hostport)
    return sock
|
2010-07-29 13:30:16 -05:00
|
|
|
|
2010-11-11 16:41:07 -06:00
|
|
|
|
|
|
|
@contextmanager
def tmpfile(content):
    """
    Context manager yielding the name of a temporary file holding
    ``str(content)``; the file is removed when the context exits.
    """
    with NamedTemporaryFile('w', delete=False) as scratch:
        scratch.write(str(content))
        file_name = scratch.name
    # the file is closed (and so flushed) before its name is handed out
    try:
        yield file_name
    finally:
        os.unlink(file_name)
|
|
|
|
|
|
|
|
|
2011-02-11 13:18:19 -06:00
|
|
|
@contextmanager
def temptree(files, contents=''):
    """
    Context manager that builds a temporary directory tree, yields its
    root and removes the whole tree on exit.

    The tree is also removed if creating one of the files fails.

    :param files: a sequence of file paths to create below the temporary
                  root; absolute paths are re-rooted below it
    :param contents: a sequence of file contents, matched to ``files`` by
                     position; files without a matching entry are created
                     empty. Note that a plain string is split into single
                     characters.
    """
    # generate enough contents to fill the files
    count = len(files)
    contents = (list(contents) + [''] * count)[:count]
    tempdir = mkdtemp()
    try:
        # populate inside the try so a failure can't leak the tempdir
        for path, content in zip(files, contents):
            if os.path.isabs(path):
                path = '.' + path
            new_path = os.path.join(tempdir, path)
            subdir = os.path.dirname(new_path)
            if not os.path.exists(subdir):
                os.makedirs(subdir)
            with open(new_path, 'w') as f:
                f.write(str(content))
        yield tempdir
    finally:
        rmtree(tempdir)
|
|
|
|
|
|
|
|
|
2014-05-27 16:57:25 -07:00
|
|
|
def with_tempdir(f):
    """
    Decorator to give a single test a tempdir as argument to test method.

    The tempdir is passed as an extra, last positional argument and is
    removed once the decorated callable returns or raises.
    """
    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        tempdir = mkdtemp()
        try:
            return f(*(args + (tempdir,)), **kwargs)
        finally:
            rmtree(tempdir)
    return wrapped
|
|
|
|
|
|
|
|
|
Adding StatsD logging to Swift.
Documentation, including a list of metrics reported and their semantics,
is in the Admin Guide in a new section, "Reporting Metrics to StatsD".
An optional "metric prefix" may be configured which will be prepended to
every metric name sent to StatsD.
Here is the rationale for doing a deep integration like this versus only
sending metrics to StatsD in middleware. It's the only way to report
some internal activities of Swift in a real-time manner. So to have one
way of reporting to StatsD and one place/style of configuration, even
some things (like, say, timing of PUT requests into the proxy-server)
which could be logged via middleware are consistently logged the same
way (deep integration via the logger delegate methods).
When log_statsd_host is configured, get_logger() injects a
swift.common.utils.StatsdClient object into the logger as
logger.statsd_client. Then a set of delegate methods on LogAdapter
either pass through to the StatsdClient object or become no-ops. This
allows StatsD logging to look like:
self.logger.increment('some.metric.here')
and do the right thing in all cases and with no messy conditional logic.
I wanted to use the pystatsd module for the StatsD client, but the
version on PyPi is lagging the git repo (and is missing both the prefix
functionality and timing_since() method). So I wrote my
swift.common.utils.StatsdClient. The interface is the same as
pystatsd.Client, but the code was written from scratch. It's pretty
simple, and the tests I added cover it. This also frees Swift from an
optional dependency on the pystatsd module, making this feature easier
to enable.
There's test coverage for the new code and all existing tests continue
to pass.
Refactored out _one_audit_pass() method in swift/account/auditor.py and
swift/container/auditor.py.
Fixed some misc. PEP8 violations.
Misc test cleanups and refactorings (particularly the way "fake logging"
is handled).
Change-Id: Ie968a9ae8771f59ee7591e2ae11999c44bfe33b2
2012-04-01 16:47:08 -07:00
|
|
|
class NullLoggingHandler(logging.Handler):
    """A logging handler that discards every record it is given."""

    def emit(self, record):
        # intentionally drop the record
        return None
|
|
|
|
|
2013-09-01 15:10:39 -04:00
|
|
|
|
2013-08-30 21:37:07 -07:00
|
|
|
class UnmockTimeModule(object):
    """
    Even if a test mocks time.time, you can restore the unmocked behavior
    in another module that imports time directly by monkey patching its
    imported reference to the module with an instance of this class.
    """

    # the time.time callable as it was when this class was defined
    _orig_time = time.time

    def __getattribute__(self, name):
        # everything except ``time`` is looked up on the live time module
        if name != 'time':
            return getattr(time, name)
        return UnmockTimeModule._orig_time
|
|
|
|
|
|
|
|
|
|
|
|
# logging.LogRecord.__init__ calls time.time, so give the logging module an
# UnmockTimeModule in place of its ``time`` reference; log records then get
# their timestamps from the time.time captured in UnmockTimeModule._orig_time.
logging.time = UnmockTimeModule()
|
|
|
|
|
Adding StatsD logging to Swift.
Documentation, including a list of metrics reported and their semantics,
is in the Admin Guide in a new section, "Reporting Metrics to StatsD".
An optional "metric prefix" may be configured which will be prepended to
every metric name sent to StatsD.
Here is the rationale for doing a deep integration like this versus only
sending metrics to StatsD in middleware. It's the only way to report
some internal activities of Swift in a real-time manner. So to have one
way of reporting to StatsD and one place/style of configuration, even
some things (like, say, timing of PUT requests into the proxy-server)
which could be logged via middleware are consistently logged the same
way (deep integration via the logger delegate methods).
When log_statsd_host is configured, get_logger() injects a
swift.common.utils.StatsdClient object into the logger as
logger.statsd_client. Then a set of delegate methods on LogAdapter
either pass through to the StatsdClient object or become no-ops. This
allows StatsD logging to look like:
self.logger.increment('some.metric.here')
and do the right thing in all cases and with no messy conditional logic.
I wanted to use the pystatsd module for the StatsD client, but the
version on PyPi is lagging the git repo (and is missing both the prefix
functionality and timing_since() method). So I wrote my
swift.common.utils.StatsdClient. The interface is the same as
pystatsd.Client, but the code was written from scratch. It's pretty
simple, and the tests I added cover it. This also frees Swift from an
optional dependency on the pystatsd module, making this feature easier
to enable.
There's test coverage for the new code and all existing tests continue
to pass.
Refactored out _one_audit_pass() method in swift/account/auditor.py and
swift/container/auditor.py.
Fixed some misc. PEP8 violations.
Misc test cleanups and refactorings (particularly the way "fake logging"
is handled).
Change-Id: Ie968a9ae8771f59ee7591e2ae11999c44bfe33b2
2012-04-01 16:47:08 -07:00
|
|
|
|
2015-08-02 22:47:42 +05:30
|
|
|
class WARN_DEPRECATED(Exception):
    """Exception that prints its message as soon as it is constructed."""

    def __init__(self, msg):
        self.msg = msg
        print(msg)
|
|
|
|
|
|
|
|
|
2015-03-31 22:35:37 -07:00
|
|
|
class FakeLogger(logging.Logger, object):
|
|
|
|
# a thread safe fake logger
|
2011-03-15 22:12:03 -07:00
|
|
|
|
2011-05-10 15:36:01 -07:00
|
|
|
def __init__(self, *args, **kwargs):
|
Adding StatsD logging to Swift.
Documentation, including a list of metrics reported and their semantics,
is in the Admin Guide in a new section, "Reporting Metrics to StatsD".
An optional "metric prefix" may be configured which will be prepended to
every metric name sent to StatsD.
Here is the rationale for doing a deep integration like this versus only
sending metrics to StatsD in middleware. It's the only way to report
some internal activities of Swift in a real-time manner. So to have one
way of reporting to StatsD and one place/style of configuration, even
some things (like, say, timing of PUT requests into the proxy-server)
which could be logged via middleware are consistently logged the same
way (deep integration via the logger delegate methods).
When log_statsd_host is configured, get_logger() injects a
swift.common.utils.StatsdClient object into the logger as
logger.statsd_client. Then a set of delegate methods on LogAdapter
either pass through to the StatsdClient object or become no-ops. This
allows StatsD logging to look like:
self.logger.increment('some.metric.here')
and do the right thing in all cases and with no messy conditional logic.
I wanted to use the pystatsd module for the StatsD client, but the
version on PyPi is lagging the git repo (and is missing both the prefix
functionality and timing_since() method). So I wrote my
swift.common.utils.StatsdClient. The interface is the same as
pystatsd.Client, but the code was written from scratch. It's pretty
simple, and the tests I added cover it. This also frees Swift from an
optional dependency on the pystatsd module, making this feature easier
to enable.
There's test coverage for the new code and all existing tests continue
to pass.
Refactored out _one_audit_pass() method in swift/account/auditor.py and
swift/container/auditor.py.
Fixed some misc. PEP8 violations.
Misc test cleanups and refactorings (particularly the way "fake logging"
is handled).
Change-Id: Ie968a9ae8771f59ee7591e2ae11999c44bfe33b2
2012-04-01 16:47:08 -07:00
|
|
|
self._clear()
|
2013-08-30 21:01:12 -07:00
|
|
|
self.name = 'swift.unit.fake_logger'
|
2012-04-30 16:38:15 -04:00
|
|
|
self.level = logging.NOTSET
|
|
|
|
if 'facility' in kwargs:
|
|
|
|
self.facility = kwargs['facility']
|
2013-11-18 11:41:58 -08:00
|
|
|
self.statsd_client = None
|
2014-03-17 20:18:42 -07:00
|
|
|
self.thread_locals = None
|
2014-04-28 19:22:51 -07:00
|
|
|
self.parent = None
|
2011-03-15 22:12:03 -07:00
|
|
|
|
2015-03-31 22:35:37 -07:00
|
|
|
# Maps each numeric logging level to the name of the ``log_dict`` bucket
# under which ``_log`` records calls made at that level. A level that is
# missing from this table makes ``_log`` raise KeyError.
store_in = {
    logging.ERROR: 'error',
    logging.WARNING: 'warning',
    logging.INFO: 'info',
    logging.DEBUG: 'debug',
    logging.CRITICAL: 'critical',
    NOTICE: 'notice',
}
|
|
|
|
|
2015-08-02 22:47:42 +05:30
|
|
|
def warn(self, *args, **kwargs):
    """Reject use of the deprecated ``warn`` method.

    Arguments are ignored; nothing is logged.

    :raises WARN_DEPRECATED: always
    """
    message = "Deprecated Method warn use warning instead"
    raise WARN_DEPRECATED(message)
|
|
|
|
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explicit "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
def notice(self, msg, *args, **kwargs):
    """
    Log ``msg`` at the NOTICE level (syslog priority LOG_NOTICE).

    NOTICE is python logging level 25, just above INFO; SysLogHandler is
    monkey patched to map that level to the LOG_NOTICE syslog priority.

    :param msg: the message (format string) to log
    :param args: extra positional arguments, passed through to ``self.log``
    :param kwargs: extra keyword arguments, passed through to ``self.log``
    """
    self.log(NOTICE, msg, *args, **kwargs)
|
|
|
|
|
2015-03-31 22:35:37 -07:00
|
|
|
def _log(self, level, msg, *args, **kwargs):
    """Record the call in ``log_dict`` and then delegate to the parent.

    The call is stored as ``(positional_args, keyword_args)`` in the
    ``log_dict`` bucket that ``store_in`` names for ``level``.

    :param level: numeric logging level; must be a key of ``store_in``
    :param msg: the log message
    :raises KeyError: if ``level`` has no entry in ``store_in``
    """
    bucket_name = self.store_in[level]
    # The extra positionals are kept only when at least one of them is
    # truthy; otherwise just the message is recorded.
    if any(args):
        call_args = (msg,) + args
    else:
        call_args = (msg,)
    call_kwargs = dict(kwargs)
    if 'exc_info' in kwargs:
        if not isinstance(kwargs['exc_info'], tuple):
            # Anything other than an explicit tuple is recorded as the
            # exception info that is current at the time of the call.
            call_kwargs['exc_info'] = sys.exc_info()
    self.log_dict[bucket_name].append((call_args, call_kwargs))
    super(FakeLogger, self)._log(level, msg, *args, **kwargs)
|
|
|
|
|
Adding StatsD logging to Swift.
Documentation, including a list of metrics reported and their semantics,
is in the Admin Guide in a new section, "Reporting Metrics to StatsD".
An optional "metric prefix" may be configured which will be prepended to
every metric name sent to StatsD.
Here is the rationale for doing a deep integration like this versus only
sending metrics to StatsD in middleware. It's the only way to report
some internal activities of Swift in a real-time manner. So to have one
way of reporting to StatsD and one place/style of configuration, even
some things (like, say, timing of PUT requests into the proxy-server)
which could be logged via middleware are consistently logged the same
way (deep integration via the logger delegate methods).
When log_statsd_host is configured, get_logger() injects a
swift.common.utils.StatsdClient object into the logger as
logger.statsd_client. Then a set of delegate methods on LogAdapter
either pass through to the StatsdClient object or become no-ops. This
allows StatsD logging to look like:
self.logger.increment('some.metric.here')
and do the right thing in all cases and with no messy conditional logic.
I wanted to use the pystatsd module for the StatsD client, but the
version on PyPi is lagging the git repo (and is missing both the prefix
functionality and timing_since() method). So I wrote my
swift.common.utils.StatsdClient. The interface is the same as
pystatsd.Client, but the code was written from scratch. It's pretty
simple, and the tests I added cover it. This also frees Swift from an
optional dependency on the pystatsd module, making this feature easier
to enable.
There's test coverage for the new code and all existing tests continue
to pass.
Refactored out _one_audit_pass() method in swift/account/auditor.py and
swift/container/auditor.py.
Fixed some misc. PEP8 violations.
Misc test cleanups and refactorings (particularly the way "fake logging"
is handled).
Change-Id: Ie968a9ae8771f59ee7591e2ae11999c44bfe33b2
2012-04-01 16:47:08 -07:00
|
|
|
def _clear(self):
|
2012-08-17 17:00:50 -07:00
|
|
|
self.log_dict = defaultdict(list)
|
Zero-copy object-server GET responses with splice()
This commit lets the object server use splice() and tee() to move data
from disk to the network without ever copying it into user space.
Requires Linux. Sorry, FreeBSD folks. You still have the old
mechanism, as does anyone who doesn't want to use splice. This
requires a relatively recent kernel (2.6.38+) to work, which includes
the two most recent Ubuntu LTS releases (Precise and Trusty) as well
as RHEL 7. However, it excludes Lucid and RHEL 6. On those systems,
setting "splice = on" will result in warnings in the logs but no
actual use of splice.
Note that this only applies to GET responses without Range headers. It
can easily be extended to single-range GET requests, but this commit
leaves that for future work. Same goes for PUT requests, or at least
non-chunked ones.
On some real hardware I had laying around (not a VM), this produced a
37% reduction in CPU usage for GETs made directly to the object
server. Measurements were done by looking at /proc/<pid>/stat,
specifically the utime and stime fields (user and kernel CPU jiffies,
respectively).
Note: There is a Python module called "splicetee" available on PyPi,
but it's licensed under the GPL, so it cannot easily be added to
OpenStack's requirements. That's why this patch uses ctypes instead.
Also fixed a long-standing annoyance in FakeLogger:
>>> fake_logger.warn('stuff')
>>> fake_logger.get_lines_for_level('warn')
[]
>>>
This, of course, is because the correct log level is 'warning'. Now
you get a KeyError if you call get_lines_for_level with a bogus log
level.
Change-Id: Ic6d6b833a5b04ca2019be94b1b90d941929d21c8
2014-06-10 14:15:27 -07:00
|
|
|
self.lines_dict = {'critical': [], 'error': [], 'info': [],
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
'warning': [], 'debug': [], 'notice': []}
|
Adding StatsD logging to Swift.
Documentation, including a list of metrics reported and their semantics,
is in the Admin Guide in a new section, "Reporting Metrics to StatsD".
An optional "metric prefix" may be configured which will be prepended to
every metric name sent to StatsD.
Here is the rationale for doing a deep integration like this versus only
sending metrics to StatsD in middleware. It's the only way to report
some internal activities of Swift in a real-time manner. So to have one
way of reporting to StatsD and one place/style of configuration, even
some things (like, say, timing of PUT requests into the proxy-server)
which could be logged via middleware are consistently logged the same
way (deep integration via the logger delegate methods).
When log_statsd_host is configured, get_logger() injects a
swift.common.utils.StatsdClient object into the logger as
logger.statsd_client. Then a set of delegate methods on LogAdapter
either pass through to the StatsdClient object or become no-ops. This
allows StatsD logging to look like:
self.logger.increment('some.metric.here')
and do the right thing in all cases and with no messy conditional logic.
I wanted to use the pystatsd module for the StatsD client, but the
version on PyPi is lagging the git repo (and is missing both the prefix
functionality and timing_since() method). So I wrote my
swift.common.utils.StatsdClient. The interface is the same as
pystatsd.Client, but the code was written from scratch. It's pretty
simple, and the tests I added cover it. This also frees Swift from an
optional dependency on the pystatsd module, making this feature easier
to enable.
There's test coverage for the new code and all existing tests continue
to pass.
Refactored out _one_audit_pass() method in swift/account/auditor.py and
swift/container/auditor.py.
Fixed some misc. PEP8 violations.
Misc test cleanups and refactorings (particularly the way "fake logging"
is handled).
Change-Id: Ie968a9ae8771f59ee7591e2ae11999c44bfe33b2
2012-04-01 16:47:08 -07:00
|
|
|
|
2015-08-21 14:04:46 -07:00
|
|
|
clear = _clear # public name for _clear(): resets log_dict and lines_dict
|
|
|
|
|
2013-08-30 21:01:12 -07:00
|
|
|
def get_lines_for_level(self, level):
    """Return the list of lines stored in ``lines_dict`` for ``level``.

    The list itself is returned, not a copy.

    :param level: a level name that is a key of ``self.lines_dict``
    :raises KeyError: if ``level`` is not one of those keys; the message
                      lists the valid level names
    """
    known = self.lines_dict
    if level in known:
        return known[level]
    valid_names = ', '.join("'%s'" % lvl for lvl in sorted(known))
    raise KeyError(
        "Invalid log level '%s'; valid levels are %s" % (level, valid_names))
|
|
|
|
|
Zero-copy object-server GET responses with splice()
This commit lets the object server use splice() and tee() to move data
from disk to the network without ever copying it into user space.
Requires Linux. Sorry, FreeBSD folks. You still have the old
mechanism, as does anyone who doesn't want to use splice. This
requires a relatively recent kernel (2.6.38+) to work, which includes
the two most recent Ubuntu LTS releases (Precise and Trusty) as well
as RHEL 7. However, it excludes Lucid and RHEL 6. On those systems,
setting "splice = on" will result in warnings in the logs but no
actual use of splice.
Note that this only applies to GET responses without Range headers. It
can easily be extended to single-range GET requests, but this commit
leaves that for future work. Same goes for PUT requests, or at least
non-chunked ones.
On some real hardware I had laying around (not a VM), this produced a
37% reduction in CPU usage for GETs made directly to the object
server. Measurements were done by looking at /proc/<pid>/stat,
specifically the utime and stime fields (user and kernel CPU jiffies,
respectively).
Note: There is a Python module called "splicetee" available on PyPi,
but it's licensed under the GPL, so it cannot easily be added to
OpenStack's requirements. That's why this patch uses ctypes instead.
Also fixed a long-standing annoyance in FakeLogger:
>>> fake_logger.warn('stuff')
>>> fake_logger.get_lines_for_level('warn')
[]
>>>
This, of course, is because the correct log level is 'warning'. Now
you get a KeyError if you call get_lines_for_level with a bogus log
level.
Change-Id: Ic6d6b833a5b04ca2019be94b1b90d941929d21c8
2014-06-10 14:15:27 -07:00
|
|
|
def all_log_lines(self):
    """Return a new dict of the levels that have at least one line.

    Keys are level names from ``lines_dict``; values are the very same
    list objects. Levels whose list is empty are left out.
    """
    populated = {}
    for level, msgs in self.lines_dict.items():
        if len(msgs) > 0:
            populated[level] = msgs
    return populated
|
|
|
|
|
2015-03-31 22:35:37 -07:00
|
|
|
def _store_in(store_name):
|
|
|
|
def stub_fn(self, *args, **kwargs):
|
|
|
|
self.log_dict[store_name].append((args, kwargs))
|
|
|
|
return stub_fn
|
Adding StatsD logging to Swift.
Documentation, including a list of metrics reported and their semantics,
is in the Admin Guide in a new section, "Reporting Metrics to StatsD".
An optional "metric prefix" may be configured which will be prepended to
every metric name sent to StatsD.
Here is the rationale for doing a deep integration like this versus only
sending metrics to StatsD in middleware. It's the only way to report
some internal activities of Swift in a real-time manner. So to have one
way of reporting to StatsD and one place/style of configuration, even
some things (like, say, timing of PUT requests into the proxy-server)
which could be logged via middleware are consistently logged the same
way (deep integration via the logger delegate methods).
When log_statsd_host is configured, get_logger() injects a
swift.common.utils.StatsdClient object into the logger as
logger.statsd_client. Then a set of delegate methods on LogAdapter
either pass through to the StatsdClient object or become no-ops. This
allows StatsD logging to look like:
self.logger.increment('some.metric.here')
and do the right thing in all cases and with no messy conditional logic.
I wanted to use the pystatsd module for the StatsD client, but the
version on PyPi is lagging the git repo (and is missing both the prefix
functionality and timing_since() method). So I wrote my
swift.common.utils.StatsdClient. The interface is the same as
pystatsd.Client, but the code was written from scratch. It's pretty
simple, and the tests I added cover it. This also frees Swift from an
optional dependency on the pystatsd module, making this feature easier
to enable.
There's test coverage for the new code and all existing tests continue
to pass.
Refactored out _one_audit_pass() method in swift/account/auditor.py and
swift/container/auditor.py.
Fixed some misc. PEP8 violations.
Misc test cleanups and refactorings (particularly the way "fake logging"
is handled).
Change-Id: Ie968a9ae8771f59ee7591e2ae11999c44bfe33b2
2012-04-01 16:47:08 -07:00
|
|
|
|
|
|
|
# Stand-ins for the StatsD logging methods. Each one is built by
# _store_in(), so calling it only appends (args, kwargs) to
# self.log_dict under the method's own name.
update_stats = _store_in('update_stats')
increment = _store_in('increment')
decrement = _store_in('decrement')
timing = _store_in('timing')
timing_since = _store_in('timing_since')
transfer_rate = _store_in('transfer_rate')
set_statsd_prefix = _store_in('set_statsd_prefix')
|
Adding StatsD logging to Swift.
Documentation, including a list of metrics reported and their semantics,
is in the Admin Guide in a new section, "Reporting Metrics to StatsD".
An optional "metric prefix" may be configured which will be prepended to
every metric name sent to StatsD.
Here is the rationale for doing a deep integration like this versus only
sending metrics to StatsD in middleware. It's the only way to report
some internal activities of Swift in a real-time manner. So to have one
way of reporting to StatsD and one place/style of configuration, even
some things (like, say, timing of PUT requests into the proxy-server)
which could be logged via middleware are consistently logged the same
way (deep integration via the logger delegate methods).
When log_statsd_host is configured, get_logger() injects a
swift.common.utils.StatsdClient object into the logger as
logger.statsd_client. Then a set of delegate methods on LogAdapter
either pass through to the StatsdClient object or become no-ops. This
allows StatsD logging to look like:
self.logger.increment('some.metric.here')
and do the right thing in all cases and with no messy conditional logic.
I wanted to use the pystatsd module for the StatsD client, but the
version on PyPi is lagging the git repo (and is missing both the prefix
functionality and timing_since() method). So I wrote my
swift.common.utils.StatsdClient. The interface is the same as
pystatsd.Client, but the code was written from scratch. It's pretty
simple, and the tests I added cover it. This also frees Swift from an
optional dependency on the pystatsd module, making this feature easier
to enable.
There's test coverage for the new code and all existing tests continue
to pass.
Refactored out _one_audit_pass() method in swift/account/auditor.py and
swift/container/auditor.py.
Fixed some misc. PEP8 violations.
Misc test cleanups and refactorings (particularly the way "fake logging"
is handled).
Change-Id: Ie968a9ae8771f59ee7591e2ae11999c44bfe33b2
2012-04-01 16:47:08 -07:00
|
|
|
|
2012-10-18 14:49:46 -07:00
|
|
|
def get_increments(self):
    """Return the first positional item of every recorded increment call.

    Walks ``self.log_dict['increment']`` in order and collects
    ``call[0][0]`` from each entry.
    """
    metrics = []
    for recorded in self.log_dict['increment']:
        positional = recorded[0]
        metrics.append(positional[0])
    return metrics
|
|
|
|
|
|
|
|
def get_increment_counts(self):
    """Count how often each metric occurs in ``self.get_increments()``.

    :returns: a plain dict mapping each metric to its number of occurrences
    """
    tally = {}
    for name in self.get_increments():
        tally[name] = tally.get(name, 0) + 1
    return tally
|
|
|
|
|
2012-04-30 16:38:15 -04:00
|
|
|
def setFormatter(self, obj):
    """Store *obj* on ``self.formatter``; nothing is returned."""
    setattr(self, 'formatter', obj)
|
|
|
|
|
|
|
|
def close(self):
    """Close the logger by delegating to ``self._clear()``.

    The result of ``_clear()`` is discarded; this method returns None.
    """
    self._clear()
|
2012-04-30 16:38:15 -04:00
|
|
|
|
|
|
|
def set_name(self, name):
    """Record *name* on ``self._name``.

    Only the private name attribute is written; ``_handlers`` is
    deliberately left untouched.
    """
    setattr(self, '_name', name)
|
|
|
|
|
|
|
|
def acquire(self):
    """No-op: nothing is locked and None is returned."""
    return None
|
|
|
|
|
|
|
|
def release(self):
    """No-op: nothing is unlocked and None is returned."""
    return None
|
|
|
|
|
|
|
|
def createLock(self):
    """No-op: no lock object is created and None is returned."""
    return None
|
|
|
|
|
|
|
|
def emit(self, record):
    """No-op: *record* is ignored and None is returned."""
    return None
|
|
|
|
|
2013-11-18 11:41:58 -08:00
|
|
|
def _handle(self, record):
|
2013-08-30 21:37:07 -07:00
|
|
|
try:
|
|
|
|
line = record.getMessage()
|
|
|
|
except TypeError:
|
2015-07-28 21:03:05 +05:30
|
|
|
print('WARNING: unable to format log message %r %% %r' % (
|
|
|
|
record.msg, record.args))
|
2013-08-30 21:37:07 -07:00
|
|
|
raise
|
2014-04-28 19:22:51 -07:00
|
|
|
self.lines_dict[record.levelname.lower()].append(line)
|
2012-04-30 16:38:15 -04:00
|
|
|
|
2013-11-18 11:41:58 -08:00
|
|
|
def handle(self, record):
    """Process *record* by passing it to ``self._handle``; returns None."""
    self._handle(record)
|
|
|
|
|
2012-04-30 16:38:15 -04:00
|
|
|
def flush(self):
    """No-op: there is nothing to flush; returns None."""
    return None
|
|
|
|
|
|
|
|
def handleError(self, record):
    """No-op: *record* is ignored and None is returned."""
    return None
|
|
|
|
|
2018-11-07 17:35:23 -06:00
|
|
|
def isEnabledFor(self, level):
    """Report every *level* as enabled; the argument is not inspected."""
    return True
|
|
|
|
|
Adding StatsD logging to Swift.
Documentation, including a list of metrics reported and their semantics,
is in the Admin Guide in a new section, "Reporting Metrics to StatsD".
An optional "metric prefix" may be configured which will be prepended to
every metric name sent to StatsD.
Here is the rationale for doing a deep integration like this versus only
sending metrics to StatsD in middleware. It's the only way to report
some internal activities of Swift in a real-time manner. So to have one
way of reporting to StatsD and one place/style of configuration, even
some things (like, say, timing of PUT requests into the proxy-server)
which could be logged via middleware are consistently logged the same
way (deep integration via the logger delegate methods).
When log_statsd_host is configured, get_logger() injects a
swift.common.utils.StatsdClient object into the logger as
logger.statsd_client. Then a set of delegate methods on LogAdapter
either pass through to the StatsdClient object or become no-ops. This
allows StatsD logging to look like:
self.logger.increment('some.metric.here')
and do the right thing in all cases and with no messy conditional logic.
I wanted to use the pystatsd module for the StatsD client, but the
version on PyPi is lagging the git repo (and is missing both the prefix
functionality and timing_since() method). So I wrote my
swift.common.utils.StatsdClient. The interface is the same as
pystatsd.Client, but the code was written from scratch. It's pretty
simple, and the tests I added cover it. This also frees Swift from an
optional dependency on the pystatsd module, making this feature easier
to enable.
There's test coverage for the new code and all existing tests continue
to pass.
Refactored out _one_audit_pass() method in swift/account/auditor.py and
swift/container/auditor.py.
Fixed some misc. PEP8 violations.
Misc test cleanups and refactorings (particularly the way "fake logging"
is handled).
Change-Id: Ie968a9ae8771f59ee7591e2ae11999c44bfe33b2
2012-04-01 16:47:08 -07:00
|
|
|
|
2016-01-12 12:50:43 +05:30
|
|
|
class DebugSwiftLogFormatter(utils.SwiftLogFormatter):
    """SwiftLogFormatter variant that expands ``#012`` into newlines."""

    def format(self, record):
        """Format *record* with the parent class, then expand ``#012``.

        :returns: the parent's output with every ``#012`` replaced by a
                  newline character
        """
        formatted = super(DebugSwiftLogFormatter, self).format(record)
        readable = formatted.replace('#012', '\n')
        return readable
|
|
|
|
|
|
|
|
|
2013-11-18 11:41:58 -08:00
|
|
|
class DebugLogger(FakeLogger):
    """A FakeLogger that also prints every handled record to stdout."""

    def __init__(self, *args, **kwargs):
        """Initialise the FakeLogger base, then attach a debug formatter."""
        FakeLogger.__init__(self, *args, **kwargs)
        log_format = "%(server)s %(levelname)s: %(message)s"
        self.formatter = DebugSwiftLogFormatter(log_format)

    def handle(self, record):
        """Capture *record* via ``_handle`` and print its formatted text."""
        self._handle(record)
        rendered = self.formatter.format(record)
        print(rendered)
|
2013-11-18 11:41:58 -08:00
|
|
|
|
|
|
|
|
2015-01-08 20:29:47 -08:00
|
|
|
class DebugLogAdapter(utils.LogAdapter):
    """LogAdapter that forwards stats calls and unknown attributes to the
    wrapped logger."""

    def _send_to_logger(name):
        # Class-body factory (takes no ``self``): builds a method that
        # forwards its call to the same-named attribute of ``self.logger``.
        def stub_fn(self, *args, **kwargs):
            target = getattr(self.logger, name)
            return target(*args, **kwargs)
        return stub_fn

    # delegate to FakeLogger's mocks
    update_stats = _send_to_logger('update_stats')
    increment = _send_to_logger('increment')
    decrement = _send_to_logger('decrement')
    timing = _send_to_logger('timing')
    timing_since = _send_to_logger('timing_since')
    transfer_rate = _send_to_logger('transfer_rate')
    set_statsd_prefix = _send_to_logger('set_statsd_prefix')

    def __getattribute__(self, name):
        """Look *name* up normally; on AttributeError ask the wrapped logger.

        The fallback reads the logger straight out of the instance
        ``__dict__`` before calling ``getattr`` on it.
        """
        try:
            return object.__getattribute__(self, name)
        except AttributeError:
            wrapped = self.__dict__['logger']
            return getattr(wrapped, name)
|
|
|
|
|
|
|
|
|
2013-11-18 11:41:58 -08:00
|
|
|
def debug_logger(name='test'):
    """Build a DebugLogAdapter called *name* around a fresh DebugLogger.

    :param name: second argument handed to DebugLogAdapter, default 'test'
    :returns: the new DebugLogAdapter instance
    """
    backing_logger = DebugLogger()
    return DebugLogAdapter(backing_logger, name)
|
2013-11-18 11:41:58 -08:00
|
|
|
|
|
|
|
|
2012-04-30 16:38:15 -04:00
|
|
|
# Keep a reference to the SysLogHandler class as it is at import time;
# fake_syslog_handler() later rebinds logging.handlers.SysLogHandler.
original_syslog_handler = logging.handlers.SysLogHandler
|
|
|
|
|
|
|
|
|
|
|
|
def fake_syslog_handler():
    """Replace ``logging.handlers.SysLogHandler`` with ``FakeLogger``.

    Before the swap, every attribute of the saved handler class whose name
    starts with ``LOG`` is shallow-copied onto ``FakeLogger``, and its
    ``priority_map`` is deep-copied across.

    The values are always read from ``original_syslog_handler`` (the class
    captured at import time). Reading them from
    ``logging.handlers.SysLogHandler`` would, on any call after the first,
    copy from ``FakeLogger`` itself rather than from the real handler.
    """
    for attr in dir(original_syslog_handler):
        if attr.startswith('LOG'):
            setattr(FakeLogger, attr,
                    copy.copy(getattr(original_syslog_handler, attr)))
    FakeLogger.priority_map = copy.deepcopy(
        original_syslog_handler.priority_map)

    logging.handlers.SysLogHandler = FakeLogger
|
|
|
|
|
|
|
|
|
2015-01-08 20:29:47 -08:00
|
|
|
# Install the fake syslog handler at import time when the 'unit_test'
# config section sets fake_syslog to a true value ('False' is the fallback
# used when the option is absent).
if utils.config_true_value(
        get_config('unit_test').get('fake_syslog', 'False')):
    fake_syslog_handler()
|
|
|
|
|
2011-03-15 22:12:03 -07:00
|
|
|
|
2017-06-02 17:47:25 -07:00
|
|
|
@contextmanager
def quiet_eventlet_exceptions():
    """Run the managed block with ``hub_exceptions(False)`` in effect.

    On entry the current ``greenpool.DEBUG`` value is remembered and
    ``eventlet_debug.hub_exceptions(False)`` is called. On exit, including
    exit through an exception, ``hub_exceptions`` is called again with the
    remembered value.
    """
    previous_debug = greenpool.DEBUG
    eventlet_debug.hub_exceptions(False)
    try:
        yield
    finally:
        eventlet_debug.hub_exceptions(previous_debug)
|
|
|
|
|
|
|
|
|
2017-04-19 15:09:40 +02:00
|
|
|
@contextmanager
def mock_check_drive(isdir=False, ismount=False):
    """
    Patch the os-level calls behind the constraints module's drive checks.

    All device/drive/mount checking should go through the constraints
    module; keeping the mocking inside that module keeps tests robust to
    further rework of that interface.

    While the context is active, ``swift.common.constraints.isdir`` and
    ``swift.common.constraints.utils.ismount`` are replaced with mocks.

    :param isdir: return value of constraints isdir calls, default False
    :param ismount: return value of constraints ismount calls, default False
    :returns: a dict of constraint module mocks, keyed 'isdir' and 'ismount'
    """
    mock_base = 'swift.common.constraints.'
    isdir_target = mock_base + 'isdir'
    ismount_target = mock_base + 'utils.ismount'
    with mocklib.patch(isdir_target) as mock_isdir:
        with mocklib.patch(ismount_target) as mock_ismount:
            mock_isdir.return_value = isdir
            mock_ismount.return_value = ismount
            mocks = {
                'isdir': mock_isdir,
                'ismount': mock_ismount,
            }
            yield mocks
|
2012-08-21 12:51:59 -07:00
|
|
|
|
|
|
|
|
|
|
|
@contextmanager
def mock(update):
    """
    Temporarily set attributes addressed by dotted paths.

    :param update: dict mapping a dotted path such as ``'pkg.mod.attr'`` to
                   the value to install; the last component is the attribute
                   name and the preceding components locate its owner (the
                   first is imported, the rest are resolved with getattr)
    :returns: context manager yielding True

    On exit every pre-existing attribute gets its previous value back, and
    every attribute that did not exist beforehand is deleted again.

    Patching happens inside the ``try`` so that a failure part-way through
    (for example an unresolvable path) still undoes the patches that were
    already applied. Previous values are restored in reverse order, so two
    paths that alias the same attribute end up with the true original.
    """
    returns = []
    deletes = []
    try:
        for key, value in update.items():
            imports = key.split('.')
            attr = imports.pop(-1)
            module = __import__(imports[0], fromlist=imports[1:])
            for modname in imports[1:]:
                module = getattr(module, modname)
            existed = hasattr(module, attr)
            previous = getattr(module, attr) if existed else None
            setattr(module, attr, value)
            # record the undo step only once the patch is really in place
            if existed:
                returns.append((module, attr, previous))
            else:
                deletes.append((module, attr))
        yield True
    finally:
        for module, attr, value in reversed(returns):
            setattr(module, attr, value)
        for module, attr in deletes:
            delattr(module, attr)
|
2013-03-20 19:26:45 -07:00
|
|
|
|
|
|
|
|
2015-05-06 16:29:06 -07:00
|
|
|
class FakeStatus(object):
    """
    This will work with our fake_http_connect, if you hand in one of these
    instead of a status int or status int tuple to the "codes" iter you can
    add some eventlet sleep to the expect and response stages of the
    connection.
    """

    def __init__(self, status, expect_sleep=None, response_sleep=None):
        """
        :param status: the response status int, or a tuple of
                       ([expect_status, ...], response_status)
        :param expect_sleep: float, time to eventlet sleep during expect, can
                             be a list or tuple of floats (one per expect
                             status; missing entries are padded with None)
        :param response_sleep: float, time to eventlet sleep during response
        :raises Exception: if ``status`` is an Exception subclass (it is
                           instantiated with 'FakeStatus Error') or an
                           Exception / eventlet.Timeout instance (it is
                           raised as-is)
        """
        # connect exception
        if inspect.isclass(status) and issubclass(status, Exception):
            raise status('FakeStatus Error')
        if isinstance(status, (Exception, eventlet.Timeout)):
            raise status
        if isinstance(status, tuple):
            self.expect_status = list(status[:-1])
            self.status = status[-1]
            self.explicit_expect_list = True
        else:
            self.expect_status, self.status = ([], status)
            self.explicit_expect_list = False
        if not self.expect_status:
            # when a swift backend service returns a status before reading
            # from the body (mostly an error response) eventlet.wsgi will
            # respond with that status line immediately instead of 100
            # Continue, even if the client sent the Expect 100 header.
            # BufferedHttp and the proxy both see these error statuses
            # when they call getexpect, so our FakeConn tries to act like
            # our backend services and return certain types of responses
            # as expect statuses just like a real backend server would do.
            if self.status in (507, 412, 409):
                # use the unpacked response status: ``status`` itself may
                # be a one-element tuple here
                self.expect_status = [self.status]
            else:
                self.expect_status = [100, 100]

        # setup sleep attributes
        if not isinstance(expect_sleep, (list, tuple)):
            expect_sleep = [expect_sleep] * len(self.expect_status)
        self.expect_sleep_list = list(expect_sleep)
        # pad so there is one sleep entry per expect status
        while len(self.expect_sleep_list) < len(self.expect_status):
            self.expect_sleep_list.append(None)
        self.response_sleep = response_sleep

    def get_response_status(self):
        """
        Sleep for ``response_sleep`` (if set), then return the final status.

        :raises Exception: if an explicitly given expect list still has
                           unconsumed entries, or if the stored status is
                           itself an Exception / eventlet.Timeout instance
        """
        if self.response_sleep is not None:
            eventlet.sleep(self.response_sleep)
        if self.expect_status and self.explicit_expect_list:
            raise Exception('Test did not consume all fake '
                            'expect status: %r' % (self.expect_status,))
        if isinstance(self.status, (Exception, eventlet.Timeout)):
            raise self.status
        return self.status

    def get_expect_status(self):
        """
        Pop and return the next expect status, sleeping first if a sleep
        value was configured for it.

        :raises Exception: if the popped expect status is an Exception /
                           eventlet.Timeout instance
        """
        expect_sleep = self.expect_sleep_list.pop(0)
        if expect_sleep is not None:
            eventlet.sleep(expect_sleep)
        expect_status = self.expect_status.pop(0)
        if isinstance(expect_status, (Exception, eventlet.Timeout)):
            raise expect_status
        return expect_status
|
|
|
|
|
|
|
|
|
2015-03-31 22:35:37 -07:00
|
|
|
class SlowBody(object):
    """
    Body wrapper for fake_http_connect: hand in one of these instead of a
    string and reads will take longer by the given amount. It is meant to be
    a little easier to extend than the current slow kwarg (which inserts
    whitespace in the response), and easy to detect inside FakeConn (this
    class or a subclass) should something smarter than duck-typing the
    str/buffer api be wanted.
    """

    def __init__(self, body, slowness):
        """Keep the wrapped *body* and the *slowness* passed to sleep."""
        self.body, self.slowness = body, slowness

    def slowdown(self):
        """Sleep via eventlet for ``self.slowness``."""
        delay = self.slowness
        eventlet.sleep(delay)

    def __getitem__(self, s):
        """Return a new SlowBody around ``body[s]`` with the same slowness."""
        piece = self.body[s]
        return SlowBody(piece, self.slowness)

    def __len__(self):
        """Return the length of the wrapped body."""
        wrapped = self.body
        return len(wrapped)

    def __radd__(self, other):
        """Sleep, then return ``other + body`` (the raw body, unwrapped)."""
        self.slowdown()
        combined = other + self.body
        return combined
|
|
|
|
|
|
|
|
|
2013-03-20 19:26:45 -07:00
|
|
|
def fake_http_connect(*code_iter, **kwargs):
|
|
|
|
|
|
|
|
class FakeConn(object):
|
|
|
|
|
2018-06-13 14:28:28 -07:00
|
|
|
SLOW_READS = 4
|
|
|
|
SLOW_WRITES = 4
|
|
|
|
|
2019-06-25 11:53:32 -05:00
|
|
|
def __init__(self, status, etag=None, body=b'', timestamp=-1,
|
2015-03-31 22:35:37 -07:00
|
|
|
headers=None, expect_headers=None, connection_id=None,
|
Support for http footers - Replication and EC
Before this patch, the proxy ObjectController supported sending
metadata from the proxy server to object servers in "footers" that
trail the body of HTTP PUT requests, but this support was for EC
policies only. The encryption feature requires that footers are sent
with both EC and replicated policy requests in order to persist
encryption specific sysmeta, and to override container update headers
with an encrypted Etag value.
This patch:
- Moves most of the functionality of ECPutter into a generic Putter
class that is used for replicated object PUTs without footers.
- Creates a MIMEPutter subclass to support multipart and multiphase
behaviour required for any replicated object PUT with footers and
all EC PUTs.
- Modifies ReplicatedObjectController to use Putter objects in place
of raw connection objects.
- Refactors the _get_put_connections method and _put_connect_node methods
so that more code is in the BaseObjectController class and therefore
shared by [EC|Replicated]ObjectController classes.
- Adds support to call a callback that middleware may have placed
in the environ, so the callback can set footers. The
x-object-sysmeta-ec- namespace is reserved and any footer values
set by middleware in that namespace will not be forwarded to
object servers.
In addition this patch enables more than one value to be added to the
X-Backend-Etag-Is-At header. This header is used to point to an
(optional) alternative sysmeta header whose value should be used when
evaluating conditional requests with If-[None-]Match headers. This is
already used with EC policies when the ECObjectController has
calculated the actual body Etag and sent it using a footer
(X-Object-Sysmeta-EC-Etag). X-Backend-Etag-Is-At is in that case set
to X-Object-Sysmeta-Ec-Etag so as to point to the actual body Etag
value rather than the EC fragment Etag.
Encryption will also need to add a pointer to an encrypted Etag value.
However, the referenced sysmeta may not exist, for example if the
object was created before encryption was enabled. The
X-Backend-Etag-Is-At value is therefore changed to support a list of
possible locations for alternate Etag values. Encryption will place
its expected alternative Etag location on this list, as will the
ECObjectController, and the object server will look for the first
object metadata to match an entry on the list when matching
conditional requests. That way, if the object was not encrypted then
the object server will fall through to using the EC Etag value, or in
the case of a replicated policy will fall through to using the normal
Etag metadata.
If your proxy has a third-party middleware that uses X-Backend-Etag-Is-At
and it upgrades before an object server it's talking to then conditional
requests may be broken.
UpgradeImpact
Co-Authored-By: Alistair Coles <alistair.coles@hpe.com>
Co-Authored-By: Thiago da Silva <thiago@redhat.com>
Co-Authored-By: Samuel Merritt <sam@swiftstack.com>
Co-Authored-By: Kota Tsuyuzaki <tsuyuzaki.kota@lab.ntt.co.jp>
Closes-Bug: #1594739
Change-Id: I12a6e41150f90de746ce03623032b83ed1987ee1
2016-06-06 17:19:48 +01:00
|
|
|
give_send=None, give_expect=None):
|
2015-05-06 16:29:06 -07:00
|
|
|
if not isinstance(status, FakeStatus):
|
|
|
|
status = FakeStatus(status)
|
|
|
|
self._status = status
|
2013-03-20 19:26:45 -07:00
|
|
|
self.reason = 'Fake'
|
|
|
|
self.host = '1.2.3.4'
|
|
|
|
self.port = '1234'
|
|
|
|
self.sent = 0
|
|
|
|
self.received = 0
|
|
|
|
self.etag = etag
|
|
|
|
self.body = body
|
2013-04-11 12:52:33 -07:00
|
|
|
self.headers = headers or {}
|
2015-03-31 22:35:37 -07:00
|
|
|
self.expect_headers = expect_headers or {}
|
2019-06-25 11:53:32 -05:00
|
|
|
if timestamp == -1:
|
|
|
|
# -1 is reserved to mean "magic default"
|
|
|
|
if status.status != 404:
|
|
|
|
self.timestamp = '1'
|
|
|
|
else:
|
|
|
|
self.timestamp = '0'
|
|
|
|
else:
|
|
|
|
# tests may specify int, string, Timestamp or None
|
|
|
|
self.timestamp = timestamp
|
2015-03-31 22:35:37 -07:00
|
|
|
self.connection_id = connection_id
|
|
|
|
self.give_send = give_send
|
Support for http footers - Replication and EC
Before this patch, the proxy ObjectController supported sending
metadata from the proxy server to object servers in "footers" that
trail the body of HTTP PUT requests, but this support was for EC
policies only. The encryption feature requires that footers are sent
with both EC and replicated policy requests in order to persist
encryption specific sysmeta, and to override container update headers
with an encrypted Etag value.
This patch:
- Moves most of the functionality of ECPutter into a generic Putter
class that is used for replicated object PUTs without footers.
- Creates a MIMEPutter subclass to support multipart and multiphase
behaviour required for any replicated object PUT with footers and
all EC PUTs.
- Modifies ReplicatedObjectController to use Putter objects in place
of raw connection objects.
- Refactors the _get_put_connections method and _put_connect_node methods
so that more code is in the BaseObjectController class and therefore
shared by [EC|Replicated]ObjectController classes.
- Adds support to call a callback that middleware may have placed
in the environ, so the callback can set footers. The
x-object-sysmeta-ec- namespace is reserved and any footer values
set by middleware in that namespace will not be forwarded to
object servers.
In addition this patch enables more than one value to be added to the
X-Backend-Etag-Is-At header. This header is used to point to an
(optional) alternative sysmeta header whose value should be used when
evaluating conditional requests with If-[None-]Match headers. This is
already used with EC policies when the ECObjectController has
calculated the actual body Etag and sent it using a footer
(X-Object-Sysmeta-EC-Etag). X-Backend-Etag-Is-At is in that case set
to X-Object-Sysmeta-Ec-Etag so as to point to the actual body Etag
value rather than the EC fragment Etag.
Encryption will also need to add a pointer to an encrypted Etag value.
However, the referenced sysmeta may not exist, for example if the
object was created before encryption was enabled. The
X-Backend-Etag-Is-At value is therefore changed to support a list of
possible locations for alternate Etag values. Encryption will place
its expected alternative Etag location on this list, as will the
ECObjectController, and the object server will look for the first
object metadata to match an entry on the list when matching
conditional requests. That way, if the object was not encrypted then
the object server will fall through to using the EC Etag value, or in
the case of a replicated policy will fall through to using the normal
Etag metadata.
If your proxy has a third-party middleware that uses X-Backend-Etag-Is-At
and it upgrades before an object server it's talking to then conditional
requests may be broken.
UpgradeImpact
Co-Authored-By: Alistair Coles <alistair.coles@hpe.com>
Co-Authored-By: Thiago da Silva <thiago@redhat.com>
Co-Authored-By: Samuel Merritt <sam@swiftstack.com>
Co-Authored-By: Kota Tsuyuzaki <tsuyuzaki.kota@lab.ntt.co.jp>
Closes-Bug: #1594739
Change-Id: I12a6e41150f90de746ce03623032b83ed1987ee1
2016-06-06 17:19:48 +01:00
|
|
|
self.give_expect = give_expect
|
|
|
|
self.closed = False
|
2014-03-24 16:10:18 -07:00
|
|
|
if 'slow' in kwargs and isinstance(kwargs['slow'], list):
|
|
|
|
try:
|
|
|
|
self._next_sleep = kwargs['slow'].pop(0)
|
|
|
|
except IndexError:
|
|
|
|
self._next_sleep = None
|
2018-06-13 14:28:28 -07:00
|
|
|
|
|
|
|
# if we're going to be slow, we need a body to send slowly
|
|
|
|
am_slow, _junk = self.get_slow()
|
|
|
|
if am_slow and len(self.body) < self.SLOW_READS:
|
2019-05-04 16:33:41 -07:00
|
|
|
self.body += b" " * (self.SLOW_READS - len(self.body))
|
2018-06-13 14:28:28 -07:00
|
|
|
|
2015-03-31 22:35:37 -07:00
|
|
|
# be nice to trixy bits with node_iter's
|
|
|
|
eventlet.sleep()
|
2013-03-20 19:26:45 -07:00
|
|
|
|
|
|
|
def getresponse(self):
|
2014-04-28 20:03:48 -07:00
|
|
|
exc = kwargs.get('raise_exc')
|
|
|
|
if exc:
|
2015-03-31 22:35:37 -07:00
|
|
|
if isinstance(exc, (Exception, eventlet.Timeout)):
|
2014-04-28 20:03:48 -07:00
|
|
|
raise exc
|
2013-03-20 19:26:45 -07:00
|
|
|
raise Exception('test')
|
|
|
|
if kwargs.get('raise_timeout_exc'):
|
2015-03-31 22:35:37 -07:00
|
|
|
raise eventlet.Timeout()
|
2015-05-06 16:29:06 -07:00
|
|
|
self.status = self._status.get_response_status()
|
2013-03-20 19:26:45 -07:00
|
|
|
return self
|
|
|
|
|
|
|
|
def getexpect(self):
|
Support for http footers - Replication and EC
Before this patch, the proxy ObjectController supported sending
metadata from the proxy server to object servers in "footers" that
trail the body of HTTP PUT requests, but this support was for EC
policies only. The encryption feature requires that footers are sent
with both EC and replicated policy requests in order to persist
encryption specific sysmeta, and to override container update headers
with an encrypted Etag value.
This patch:
- Moves most of the functionality of ECPutter into a generic Putter
class that is used for replicated object PUTs without footers.
- Creates a MIMEPutter subclass to support multipart and multiphase
behaviour required for any replicated object PUT with footers and
all EC PUTs.
- Modifies ReplicatedObjectController to use Putter objects in place
of raw connection objects.
- Refactors the _get_put_connections method and _put_connect_node methods
so that more code is in the BaseObjectController class and therefore
shared by [EC|Replicated]ObjectController classes.
- Adds support to call a callback that middleware may have placed
in the environ, so the callback can set footers. The
x-object-sysmeta-ec- namespace is reserved and any footer values
set by middleware in that namespace will not be forwarded to
object servers.
In addition this patch enables more than one value to be added to the
X-Backend-Etag-Is-At header. This header is used to point to an
(optional) alternative sysmeta header whose value should be used when
evaluating conditional requests with If-[None-]Match headers. This is
already used with EC policies when the ECObjectController has
calculated the actual body Etag and sent it using a footer
(X-Object-Sysmeta-EC-Etag). X-Backend-Etag-Is-At is in that case set
to X-Object-Sysmeta-Ec-Etag so as to point to the actual body Etag
value rather than the EC fragment Etag.
Encryption will also need to add a pointer to an encrypted Etag value.
However, the referenced sysmeta may not exist, for example if the
object was created before encryption was enabled. The
X-Backend-Etag-Is-At value is therefore changed to support a list of
possible locations for alternate Etag values. Encryption will place
its expected alternative Etag location on this list, as will the
ECObjectController, and the object server will look for the first
object metadata to match an entry on the list when matching
conditional requests. That way, if the object was not encrypted then
the object server will fall through to using the EC Etag value, or in
the case of a replicated policy will fall through to using the normal
Etag metadata.
If your proxy has a third-party middleware that uses X-Backend-Etag-Is-At
and it upgrades before an object server it's talking to then conditional
requests may be broken.
UpgradeImpact
Co-Authored-By: Alistair Coles <alistair.coles@hpe.com>
Co-Authored-By: Thiago da Silva <thiago@redhat.com>
Co-Authored-By: Samuel Merritt <sam@swiftstack.com>
Co-Authored-By: Kota Tsuyuzaki <tsuyuzaki.kota@lab.ntt.co.jp>
Closes-Bug: #1594739
Change-Id: I12a6e41150f90de746ce03623032b83ed1987ee1
2016-06-06 17:19:48 +01:00
|
|
|
if self.give_expect:
|
|
|
|
self.give_expect(self)
|
2015-05-06 16:29:06 -07:00
|
|
|
expect_status = self._status.get_expect_status()
|
2015-03-31 22:35:37 -07:00
|
|
|
headers = dict(self.expect_headers)
|
|
|
|
if expect_status == 409:
|
2014-06-25 20:34:39 -07:00
|
|
|
headers['X-Backend-Timestamp'] = self.timestamp
|
2015-08-04 23:15:37 -07:00
|
|
|
response = FakeConn(expect_status,
|
|
|
|
timestamp=self.timestamp,
|
|
|
|
headers=headers)
|
2015-05-06 16:29:06 -07:00
|
|
|
response.status = expect_status
|
|
|
|
return response
|
2013-03-20 19:26:45 -07:00
|
|
|
|
|
|
|
def getheaders(self):
|
|
|
|
etag = self.etag
|
|
|
|
if not etag:
|
2019-01-14 22:32:31 +00:00
|
|
|
if isinstance(self.body, bytes):
|
2013-03-20 19:26:45 -07:00
|
|
|
etag = '"' + md5(self.body).hexdigest() + '"'
|
|
|
|
else:
|
|
|
|
etag = '"68b329da9893e34099c7d8ad5cb9c940"'
|
|
|
|
|
2018-06-13 14:28:28 -07:00
|
|
|
am_slow, _junk = self.get_slow()
|
2016-03-02 10:28:51 +00:00
|
|
|
headers = HeaderKeyDict({
|
2014-11-11 17:03:29 -08:00
|
|
|
'content-length': len(self.body),
|
|
|
|
'content-type': 'x-application/test',
|
|
|
|
'x-timestamp': self.timestamp,
|
|
|
|
'x-backend-timestamp': self.timestamp,
|
|
|
|
'last-modified': self.timestamp,
|
|
|
|
'x-object-meta-test': 'testing',
|
|
|
|
'x-delete-at': '9876543210',
|
|
|
|
'etag': etag,
|
|
|
|
'x-works': 'yes',
|
|
|
|
})
|
2013-06-26 08:23:00 +03:00
|
|
|
if self.status // 100 == 2:
|
|
|
|
headers['x-account-container-count'] = \
|
|
|
|
kwargs.get('count', 12345)
|
2013-03-20 19:26:45 -07:00
|
|
|
if not self.timestamp:
|
2014-11-11 17:03:29 -08:00
|
|
|
# when timestamp is None, HeaderKeyDict raises KeyError
|
|
|
|
headers.pop('x-timestamp', None)
|
2013-03-20 19:26:45 -07:00
|
|
|
try:
|
2015-06-15 22:10:45 +05:30
|
|
|
if next(container_ts_iter) is False:
|
2013-03-20 19:26:45 -07:00
|
|
|
headers['x-container-timestamp'] = '1'
|
|
|
|
except StopIteration:
|
|
|
|
pass
|
2013-04-11 12:52:33 -07:00
|
|
|
headers.update(self.headers)
|
2013-03-20 19:26:45 -07:00
|
|
|
return headers.items()
|
|
|
|
|
2014-03-24 16:10:18 -07:00
|
|
|
def get_slow(self):
|
|
|
|
if 'slow' in kwargs and isinstance(kwargs['slow'], list):
|
|
|
|
if self._next_sleep is not None:
|
|
|
|
return True, self._next_sleep
|
|
|
|
else:
|
|
|
|
return False, 0.01
|
|
|
|
if kwargs.get('slow') and isinstance(kwargs['slow'], Number):
|
|
|
|
return True, kwargs['slow']
|
|
|
|
return bool(kwargs.get('slow')), 0.1
|
2013-11-04 17:06:06 +00:00
|
|
|
|
2013-03-20 19:26:45 -07:00
|
|
|
def read(self, amt=None):
|
2014-03-24 16:10:18 -07:00
|
|
|
am_slow, value = self.get_slow()
|
|
|
|
if am_slow:
|
2018-06-13 14:28:28 -07:00
|
|
|
if self.sent < self.SLOW_READS:
|
2019-05-04 16:33:41 -07:00
|
|
|
slowly_read_byte = self.body[self.sent:self.sent + 1]
|
2013-03-20 19:26:45 -07:00
|
|
|
self.sent += 1
|
2015-03-31 22:35:37 -07:00
|
|
|
eventlet.sleep(value)
|
2018-06-13 14:28:28 -07:00
|
|
|
return slowly_read_byte
|
|
|
|
if amt is None:
|
|
|
|
rv = self.body[self.sent:]
|
|
|
|
else:
|
|
|
|
rv = self.body[self.sent:self.sent + amt]
|
|
|
|
self.sent += len(rv)
|
2013-03-20 19:26:45 -07:00
|
|
|
return rv
|
|
|
|
|
2016-07-07 11:31:31 +01:00
|
|
|
def send(self, data=None):
|
2015-03-31 22:35:37 -07:00
|
|
|
if self.give_send:
|
2016-07-07 11:31:31 +01:00
|
|
|
self.give_send(self, data)
|
2014-03-24 16:10:18 -07:00
|
|
|
am_slow, value = self.get_slow()
|
|
|
|
if am_slow:
|
2018-06-13 14:28:28 -07:00
|
|
|
if self.received < self.SLOW_WRITES:
|
2013-03-20 19:26:45 -07:00
|
|
|
self.received += 1
|
2015-03-31 22:35:37 -07:00
|
|
|
eventlet.sleep(value)
|
2013-03-20 19:26:45 -07:00
|
|
|
|
|
|
|
def getheader(self, name, default=None):
|
2016-03-02 10:28:51 +00:00
|
|
|
return HeaderKeyDict(self.getheaders()).get(name, default)
|
2013-03-20 19:26:45 -07:00
|
|
|
|
2015-02-17 16:55:34 -05:00
|
|
|
def close(self):
    """
    Mark this fake connection as closed by setting ``self.closed``.
    """
    self.closed = True
|
2015-02-17 16:55:34 -05:00
|
|
|
|
2019-06-25 11:53:32 -05:00
|
|
|
# unless tests provide timestamps we use the "magic default"
|
|
|
|
timestamps_iter = iter(kwargs.get('timestamps') or [-1] * len(code_iter))
|
2013-03-20 19:26:45 -07:00
|
|
|
etag_iter = iter(kwargs.get('etags') or [None] * len(code_iter))
|
2015-03-31 22:35:37 -07:00
|
|
|
if isinstance(kwargs.get('headers'), (list, tuple)):
|
2013-04-11 12:52:33 -07:00
|
|
|
headers_iter = iter(kwargs['headers'])
|
|
|
|
else:
|
|
|
|
headers_iter = iter([kwargs.get('headers', {})] * len(code_iter))
|
2015-03-31 22:35:37 -07:00
|
|
|
if isinstance(kwargs.get('expect_headers'), (list, tuple)):
|
|
|
|
expect_headers_iter = iter(kwargs['expect_headers'])
|
|
|
|
else:
|
|
|
|
expect_headers_iter = iter([kwargs.get('expect_headers', {})] *
|
|
|
|
len(code_iter))
|
2013-04-11 12:52:33 -07:00
|
|
|
|
2013-03-20 19:26:45 -07:00
|
|
|
x = kwargs.get('missing_container', [False] * len(code_iter))
|
|
|
|
if not isinstance(x, (tuple, list)):
|
|
|
|
x = [x] * len(code_iter)
|
|
|
|
container_ts_iter = iter(x)
|
|
|
|
code_iter = iter(code_iter)
|
2015-03-31 22:35:37 -07:00
|
|
|
conn_id_and_code_iter = enumerate(code_iter)
|
2013-03-20 19:26:45 -07:00
|
|
|
static_body = kwargs.get('body', None)
|
|
|
|
body_iter = kwargs.get('body_iter', None)
|
|
|
|
if body_iter:
|
|
|
|
body_iter = iter(body_iter)
|
2017-02-16 14:14:09 -08:00
|
|
|
unexpected_requests = []
|
2013-03-20 19:26:45 -07:00
|
|
|
|
|
|
|
def connect(*args, **ckwargs):
    """
    Hand out the next canned ``FakeConn`` for a connection attempt.

    Each call consumes one entry from every per-connection iterator set up
    by the enclosing factory (status, etag, headers, expect headers,
    timestamp and, when ``body_iter`` was given, body).

    :param args: positional arguments of the connection call; the seventh
                 one is checked for a ``Content-Type`` key when the
                 ``give_content_type`` hook is configured (presumably it
                 is the request headers mapping -- confirm against the
                 patched function's signature)
    :param ckwargs: keyword arguments of the connection call; forwarded to
                    the ``give_connect`` hook
    :returns: a ``FakeConn`` configured from the consumed values
    :raises StopIteration: when more connections are attempted than
                           statuses were configured; the call is recorded
                           in ``unexpected_requests`` first
    :raises HTTPException: when the configured status is an int <= 0
    """
    if kwargs.get('slow_connect', False):
        eventlet.sleep(0.1)
    if 'give_content_type' in kwargs:
        if len(args) >= 7 and 'Content-Type' in args[6]:
            kwargs['give_content_type'](args[6]['Content-Type'])
        else:
            kwargs['give_content_type']('')
    try:
        i, status = next(conn_id_and_code_iter)
    except StopIteration:
        # the code under test may swallow the StopIteration, so by logging
        # unexpected requests here we allow the test framework to check for
        # them after the connect function has been used.
        # Record this call's own keyword arguments (not the factory's
        # configuration kwargs) so the failure report shows the request.
        unexpected_requests.append((args, ckwargs))
        raise

    if 'give_connect' in kwargs:
        give_conn_fn = kwargs['give_connect']
        # getargspec is gone in newer Python 3 releases and rejects
        # keyword-only arguments; prefer getfullargspec where it exists.
        if hasattr(inspect, 'getfullargspec'):
            argspec = inspect.getfullargspec(give_conn_fn)
            wants_conn_id = (argspec.varkw or
                             'connection_id' in argspec.args or
                             'connection_id' in argspec.kwonlyargs)
        else:
            argspec = inspect.getargspec(give_conn_fn)
            wants_conn_id = (argspec.keywords or
                             'connection_id' in argspec.args)
        if wants_conn_id:
            ckwargs['connection_id'] = i
        give_conn_fn(*args, **ckwargs)
    etag = next(etag_iter)
    headers = next(headers_iter)
    expect_headers = next(expect_headers_iter)
    timestamp = next(timestamps_iter)

    # non-positive integer statuses simulate a failed connection
    if isinstance(status, int) and status <= 0:
        raise HTTPException()
    if body_iter is None:
        body = static_body or b''
    else:
        body = next(body_iter)
    return FakeConn(status, etag, body=body, timestamp=timestamp,
                    headers=headers, expect_headers=expect_headers,
                    connection_id=i, give_send=kwargs.get('give_send'),
                    give_expect=kwargs.get('give_expect'))
|
2013-03-20 19:26:45 -07:00
|
|
|
|
2017-02-16 14:14:09 -08:00
|
|
|
connect.unexpected_requests = unexpected_requests
|
2014-03-03 09:08:43 -08:00
|
|
|
connect.code_iter = code_iter
|
|
|
|
|
2013-03-20 19:26:45 -07:00
|
|
|
return connect
|
2014-04-28 19:22:51 -07:00
|
|
|
|
|
|
|
|
|
|
|
@contextmanager
def mocked_http_conn(*args, **kwargs):
    """
    Context manager that patches
    ``swift.common.bufferedhttp.http_connect_raw`` with a fake built by
    ``fake_http_connect(*args, **kwargs)``.

    Unless the caller supplies its own ``give_connect`` hook, every
    connection attempt is recorded as a dict (ip, port, method, path,
    headers, qs, ssl) in the ``requests`` list attached to the yielded
    fake.

    When the managed block finishes, an ``AssertionError`` is raised if
    any configured statuses were not consumed or if any unexpected
    requests were recorded.

    :returns: yields the fake connect callable
    """
    captured = []

    def capture_requests(ip, port, method, path, headers, qs, ssl):
        if six.PY2 and not isinstance(ip, bytes):
            ip = ip.encode('ascii')
        captured.append({
            'ip': ip,
            'port': port,
            'method': method,
            'path': path,
            'headers': headers,
            'qs': qs,
            'ssl': ssl,
        })

    kwargs.setdefault('give_connect', capture_requests)
    fake_conn = fake_http_connect(*args, **kwargs)
    fake_conn.requests = captured
    patcher = mocklib.patch('swift.common.bufferedhttp.http_connect_raw',
                            new=fake_conn)
    with patcher:
        yield fake_conn
        # only reached when the managed block completed without raising
        unused_statuses = list(fake_conn.code_iter)
        if unused_statuses:
            raise AssertionError('left over status %r' % unused_statuses)
        if fake_conn.unexpected_requests:
            raise AssertionError('unexpected requests %r' %
                                 fake_conn.unexpected_requests)
|
2015-03-31 22:35:37 -07:00
|
|
|
|
|
|
|
|
Fix sporadic failure in test/unit/obj/test_server.py
In particular, in TestObjectController.test_object_delete_at_async_update
Rarely (<0.1% of the time?), it would fail with:
======================================================================
FAIL: test_object_delete_at_async_update
(test.unit.obj.test_server.TestObjectController)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/vagrant/swift/test/unit/obj/test_server.py", line 4826, in
test_object_delete_at_async_update
resp = req.get_response(self.object_controller)
File "/usr/lib/python2.7/contextlib.py", line 24, in __exit__
self.gen.next()
File "/vagrant/swift/test/unit/__init__.py", line 1075, in
mocked_http_conn
raise AssertionError('left over status %r' % left_over_status)
AssertionError: left over status [500, 500]
-------------------- >> begin captured stdout << ---------------------
test INFO: None - - [26/Apr/2017:22:32:13 +0000] "PUT /sda1/p/a/c/o" 400
19 "-" "-" "-" 0.0003 "-" 23801 0
--------------------- >> end captured stdout << ----------------------
>> raise AssertionError('left over status %r' % [500, 500])
----------------------------------------------------------------------
Related-Bug: 1514111
Change-Id: I1af4a291fb67cf4b1829f167998a08644117a800
2017-04-26 15:50:59 -07:00
|
|
|
def make_timestamp_iter(offset=0):
    """
    Build an endless iterator of ``Timestamp`` objects one second apart.

    The first value is the current time truncated to whole seconds plus
    ``offset``; the starting point is fixed when this function is called.

    :param offset: number added to the current integer time for the start
    :returns: an iterator yielding ``Timestamp`` instances
    """
    first_tick = int(time.time()) + offset
    ticks = itertools.count(first_tick)
    return iter(Timestamp(tick) for tick in ticks)
|
2016-06-07 10:35:18 +00:00
|
|
|
|
|
|
|
|
2018-05-01 15:12:05 +01:00
|
|
|
@contextmanager
def mock_timestamp_now(now=None):
    """
    Context manager that pins ``swift.common.utils.Timestamp.now``.

    While the block is active the patched classmethod always returns the
    same value.

    :param now: the value to return; when None, ``Timestamp.now()`` is
                sampled once before patching
    :returns: yields the pinned value
    """
    frozen = Timestamp.now() if now is None else now

    def _fixed_now(cls):
        return frozen

    with mocklib.patch('swift.common.utils.Timestamp.now',
                       classmethod(_fixed_now)):
        yield frozen
|
|
|
|
|
|
|
|
|
2016-06-07 10:35:18 +00:00
|
|
|
class Timeout(object):
    """
    Context manager that interrupts its body with an exception after a
    number of seconds, using ``SIGALRM``.

    On entry the alarm handler is replaced and an alarm is scheduled; on
    exit the alarm is cancelled and the handler that was installed before
    entry is put back, so using this helper does not leak its handler into
    later code.

    :param seconds: delay passed to ``signal.alarm``
    """

    class TimeoutException(Exception):
        """Raised from the alarm handler when the time limit expires."""

    def __init__(self, seconds):
        self.seconds = seconds
        self._previous_handler = None

    def __enter__(self):
        # signal.signal returns the handler being replaced; keep it so that
        # __exit__ can restore it
        self._previous_handler = signal.signal(signal.SIGALRM, self._exit)
        signal.alarm(self.seconds)
        return self

    def __exit__(self, type, value, traceback):
        signal.alarm(0)
        # a previous handler of None means it was not installed from
        # Python and cannot be re-installed through signal.signal
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)

    def _exit(self, signum, frame):
        raise self.TimeoutException
|
2015-03-05 18:18:25 +05:30
|
|
|
|
|
|
|
|
2018-03-12 18:07:37 +01:00
|
|
|
def requires_o_tmpfile_support_in_tmp(func):
    """
    Decorator that skips the wrapped test unless
    ``utils.o_tmpfile_in_tmpdir_supported()`` returns a true value.

    The check runs each time the wrapped function is called.

    :param func: the test callable to guard
    :returns: a wrapper that raises ``SkipTest`` or delegates to ``func``
    """
    @functools.wraps(func)
    def skip_unless_supported(*args, **kwargs):
        if utils.o_tmpfile_in_tmpdir_supported():
            return func(*args, **kwargs)
        raise SkipTest('Requires O_TMPFILE support in TMPDIR')

    return skip_unless_supported
|
|
|
|
|
|
|
|
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficient PUT/GET for globally distributed clusters[1].
Problem:
Erasure coding can reduce the amount of data actually stored compared with
the replicated model. For example, with ec_k=6, ec_m=3 the stored data
is 1.5x the original size, which is smaller than 3x for replication.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of the mathematical calculations for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incoming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch includes the following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will include unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
class StubResponse(object):
    """
    Minimal canned backend response for tests.

    :param status: the response status code
    :param body: the response body as bytes
    :param headers: initial headers, passed to ``HeaderKeyDict``
    :param frag_index: when not None, stored in the
                       ``X-Object-Sysmeta-Ec-Frag-Index`` header
    """

    def __init__(self, status, body=b'', headers=None, frag_index=None):
        self.status = status
        self.body = body
        self.readable = BytesIO(body)
        self.headers = HeaderKeyDict(headers)
        if frag_index is not None:
            self.headers['X-Object-Sysmeta-Ec-Frag-Index'] = frag_index
        # statuses missing from swob's table get a placeholder reason
        fake_reason = ('Fake', 'This response is a lie.')
        reason_entry = swob.RESPONSE_REASONS.get(status, fake_reason)
        self.reason = reason_entry[0]

    def getheader(self, header_name, default=None):
        """Return one header value, or ``default`` when it is absent."""
        return self.headers.get(header_name, default)

    def getheaders(self):
        """
        Return the header items, first filling in ``Content-Length`` from
        the body length if no such header is present.
        """
        stored = self.headers
        if 'Content-Length' not in stored:
            stored['Content-Length'] = len(self.body)
        return stored.items()

    def read(self, amt=0):
        """
        Read up to ``amt`` bytes from the body stream.

        Note that the default of 0 is passed straight to the underlying
        stream, so calling ``read()`` without an argument returns an
        empty bytes object.
        """
        return self.readable.read(amt)

    def __repr__(self):
        parts = ['Status: %s' % self.status]
        if self.headers:
            parts.append('Headers: %r' % dict(self.headers))
        if self.body:
            parts.append('Body: %r' % self.body)
        summary = ', '.join(parts)
        return '<StubResponse %s>' % summary
|
|
|
|
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficient PUT/GET for globally distributed clusters[1].
Problem:
Erasure coding can reduce the amount of data actually stored compared with
the replicated model. For example, with ec_k=6, ec_m=3 the stored data
is 1.5x the original size, which is smaller than 3x for replication.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of the mathematical calculations for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incoming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch includes the following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will include unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
|
2016-10-17 20:38:52 +01:00
|
|
|
def encode_frag_archive_bodies(policy, body):
    """
    Given a stub body produce a list of complete frag_archive bodies, in
    frag_index order.

    The body is cut into pieces of ``policy.ec_segment_size``; each piece
    is encoded with ``policy.pyeclib_driver`` and the resulting fragment
    list is repeated ``policy.ec_duplication_factor`` times.  The n-th
    fragments of all pieces are then concatenated to form the n-th
    archive body.

    :param policy: a StoragePolicy instance, with policy_type EC_POLICY
    :param body: bytes, the body to encode into frag archives

    :returns: list of bytes, the complete frag_archive bodies for the given
              plaintext
    """
    step = policy.ec_segment_size
    per_segment = []
    for offset in range(0, len(body), step):
        segment = body[offset:offset + step]
        encoded = policy.pyeclib_driver.encode(segment)
        encoded = encoded * policy.ec_duplication_factor
        if not encoded:
            # an empty encode result ends the encoding early
            break
        per_segment.append(encoded)

    # transpose: one joined archive per fragment position
    return [b''.join(column) for column in zip(*per_segment)]
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficient PUT/GET for globally distributed clusters[1].
Problem:
Erasure coding can reduce the amount of data actually stored compared with
the replicated model. For example, with ec_k=6, ec_m=3 the stored data
is 1.5x the original size, which is smaller than 3x for replication.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of the mathematical calculations for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incoming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch includes the following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will include unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
|
|
|
|
|
|
|
|
def make_ec_object_stub(test_body, policy, timestamp):
    """
    Build a stub describing an erasure-coded object and its fragments.

    :param test_body: the object body as bytes; when falsy, a body is
        generated from ``b'test'`` repeated ``policy.ec_segment_size``
        times with a random tail of 1 to 1000 bytes trimmed off
    :param policy: the storage policy; it is passed through to
        encode_frag_archive_bodies to produce the fragment archives
    :param timestamp: the timestamp to record in the stub; when falsy,
        ``utils.Timestamp.now()`` is used
    :returns: a dict with the keys 'body', 'etag', 'frags' and 'timestamp'
    """
    segment_size = policy.ec_segment_size
    if not test_body:
        filler = b'test' * segment_size
        # trim a random tail so the generated body length varies
        test_body = filler[:-random.randint(1, 1000)]
    if not timestamp:
        timestamp = utils.Timestamp.now()

    return {
        'body': test_body,
        'etag': md5(test_body).hexdigest(),
        'frags': encode_frag_archive_bodies(policy, test_body),
        'timestamp': timestamp,
    }
|
|
|
|
|
|
|
|
|
|
|
|
def fake_ec_node_response(node_frags, policy):
    """
    Given a list of entries for each node in ring order, where the entries
    are a dict (or list of dicts) which describes the fragment (or
    fragments) that are on the node; create a function suitable for use
    with capture_http_requests that will accept a req object and return a
    response that will suitably fake the behavior of an object server who
    had the given fragments on disk at the time.

    Note that ``node_frags`` is normalized in place: any item that is a
    single dict is replaced by a one-element list containing that dict.

    :param node_frags: a list. Each item in the list describes the
        fragments that are on a node; each item is a dict or list of dicts,
        each dict describing a single fragment; where the item is a list,
        repeated calls to get_response will return fragments in the order
        of the list; each dict has keys:
            - obj: an object stub, as generated by make_ec_object_stub,
              that defines all of the fragments that compose an object
              at a specific timestamp.
            - frag: the index of a fragment to be selected from the object
              stub
            - durable (optional): True if the selected fragment is durable
    :param policy: storage policy to return
    :returns: a ``get_response(req)`` callable; it raises AssertionError
        when the request's X-Backend-Storage-Policy-Index does not match
        ``policy``, and Exception when the requesting node or the
        requested fragment cannot be found in the stub data
    """
    node_map = {}  # maps node ip and port to node index
    all_nodes = []
    call_count = {}  # maps node index to get_response call count for node

    def _build_node_map(req, policy):
        # The node map is built lazily from the first request seen, because
        # the partition is only known once a request path is available.
        part = utils.split_path(req['path'], 5, 5, True)[1]
        all_nodes.extend(policy.object_ring.get_part_nodes(part))
        all_nodes.extend(policy.object_ring.get_more_nodes(part))
        for i, node in enumerate(all_nodes):
            node_map[(node['ip'], node['port'])] = i
            call_count[i] = 0

    # normalize node_frags to a list of fragments for each node even
    # if there's only one fragment in the dataset provided.
    for i, frags in enumerate(node_frags):
        if isinstance(frags, dict):
            node_frags[i] = [frags]

    def get_response(req):
        requested_policy = int(
            req['headers']['X-Backend-Storage-Policy-Index'])
        if int(policy) != requested_policy:
            # This exception used to be constructed but never raised, which
            # silently disabled the policy sanity check.
            raise AssertionError(
                "Requested polciy doesn't fit the fake response policy")
        if not node_map:
            _build_node_map(req, policy)

        try:
            node_index = node_map[(req['ip'], req['port'])]
        except KeyError:
            raise Exception("Couldn't find node %s:%s in %r" % (
                req['ip'], req['port'], all_nodes))
        try:
            frags = node_frags[node_index]
        except IndexError:
            raise Exception('Found node %r:%r at index %s - '
                            'but only got %s stub response nodes' % (
                                req['ip'], req['port'], node_index,
                                len(node_frags)))

        if not frags:
            return StubResponse(404)

        # determine response fragment (if any) for this call
        resp_frag = frags[call_count[node_index]]
        call_count[node_index] += 1
        frag_prefs = req['headers'].get('X-Backend-Fragment-Preferences')
        # a non-durable fragment is only served when the request carries
        # fragment preferences
        if not (frag_prefs or resp_frag.get('durable', True)):
            return StubResponse(404)

        # prepare durable timestamp and backend frags header for this node
        obj_stub = resp_frag['obj']
        ts2frags = defaultdict(list)
        durable_timestamp = None
        for frag in frags:
            ts_frag = frag['obj']['timestamp']
            if frag.get('durable', True):
                durable_timestamp = ts_frag.internal
            ts2frags[ts_frag].append(frag['frag'])

        try:
            body = obj_stub['frags'][resp_frag['frag']]
        except IndexError as err:
            raise Exception(
                'Frag index %s not defined: node index %s, frags %r\n%s' %
                (resp_frag['frag'], node_index, [f['frag'] for f in frags],
                 err))
        headers = {
            'X-Object-Sysmeta-Ec-Content-Length': len(obj_stub['body']),
            'X-Object-Sysmeta-Ec-Etag': obj_stub['etag'],
            'X-Object-Sysmeta-Ec-Frag-Index':
                policy.get_backend_index(resp_frag['frag']),
            'X-Backend-Timestamp': obj_stub['timestamp'].internal,
            'X-Timestamp': obj_stub['timestamp'].normal,
            'X-Backend-Data-Timestamp': obj_stub['timestamp'].internal,
            'X-Backend-Fragments':
                server._make_backend_fragments_header(ts2frags)
        }
        if durable_timestamp:
            headers['X-Backend-Durable-Timestamp'] = durable_timestamp

        return StubResponse(200, body, headers)

    return get_response
|
Add checksum to object extended attributes
Currently, our integrity checking for objects is pretty weak when it
comes to object metadata. If the extended attributes on a .data or
.meta file get corrupted in such a way that we can still unpickle it,
we don't have anything that detects that.
This could be especially bad with encrypted etags; if the encrypted
etag (X-Object-Sysmeta-Crypto-Etag or whatever it is) gets some bits
flipped, then we'll cheerfully decrypt the cipherjunk into plainjunk,
then send it to the client. Net effect is that the client sees a GET
response with an ETag that doesn't match the MD5 of the object *and*
Swift has no way of detecting and quarantining this object.
Note that, with an unencrypted object, if the ETag metadatum gets
mangled, then the object will be quarantined by the object server or
auditor, whichever notices first.
As part of this commit, I also ripped out some mocking of
getxattr/setxattr in tests. It appears to be there to allow unit tests
to run on systems where /tmp doesn't support xattrs. However, since
the mock is keyed off of inode number and inode numbers get re-used,
there's lots of leakage between different test runs. On a real FS,
unlinking a file and then creating a new one of the same name will
also reset the xattrs; this isn't the case with the mock.
The mock was pretty old; Ubuntu 12.04 and up all support xattrs in
/tmp, and recent Red Hat / CentOS releases do too. The xattr mock was
added in 2011; maybe it was to support Ubuntu Lucid Lynx?
Bonus: now you can pause a test with the debugger, inspect its files
in /tmp, and actually see the xattrs along with the data.
Since this patch now uses a real filesystem for testing filesystem
operations, tests are skipped if the underlying filesystem does not
support setting xattrs (eg tmpfs or more than 4k of xattrs on ext4).
References to "/tmp" have been replaced with calls to
tempfile.gettempdir(). This will allow setting the TMPDIR envvar in
test setup and getting an XFS filesystem instead of ext4 or tmpfs.
THIS PATCH SIGNIFICANTLY CHANGES TESTING ENVIRONMENTS
With this patch, every test environment will require TMPDIR to be
using a filesystem that supports at least 4k of extended attributes.
Neither ext4 nor tmpfs support this. XFS is recommended.
So why all the SkipTests? Why not simply raise an error? We still need
the tests to run on the base image for OpenStack's CI system. Since
we were previously mocking out xattr, there wasn't a problem, but we
also weren't actually testing anything. This patch adds functionality
to validate xattr data, so we need to drop the mock.
`test.unit.skip_if_no_xattrs()` is also imported into `test.functional`
so that functional tests can import it from the functional test
namespace.
The related OpenStack CI infrastructure changes are made in
https://review.openstack.org/#/c/394600/.
Co-Authored-By: John Dickinson <me@not.mn>
Change-Id: I98a37c0d451f4960b7a12f648e4405c6c6716808
2016-06-30 16:52:58 -07:00
|
|
|
|
|
|
|
|
|
|
|
# Cached result of xattr_supported_check(); None means "not probed yet".
supports_xattr_cached_val = None


def xattr_supported_check():
    """
    This check simply sets more than 4k of metadata on a tempfile and
    returns True if it worked and False if not.

    We want to use *more* than 4k of metadata in this check because
    some filesystems (eg ext4) only allow one blocksize worth of
    metadata. The XFS filesystem doesn't have this limit, and so this
    check returns True when TMPDIR is XFS. This check will return
    False under ext4 (which supports xattrs <= 4k) and tmpfs (which
    doesn't support xattrs at all).

    The result is cached in the module-level ``supports_xattr_cached_val``
    so the filesystem is probed at most once. The cache is set to False
    before probing, so if the probe raises an unexpected error, later
    calls return False.

    :returns: True if large xattrs can be set on a tempfile, else False
    :raises IOError: if setting the xattr fails with an errno other than
        ENOSPC, ENOTSUP, EOPNOTSUPP or ERANGE
    """
    global supports_xattr_cached_val

    if supports_xattr_cached_val is not None:
        return supports_xattr_cached_val

    # assume the worst -- xattrs aren't supported
    supports_xattr_cached_val = False

    big_val = b'x' * (4096 + 1)  # more than 4k of metadata
    # Create the tempfile *before* entering the try block: if mkstemp()
    # itself fails there is nothing to clean up, and the finally clause
    # must not touch fd/tmppath while they are still unbound (that would
    # mask the real error with an UnboundLocalError).
    fd, tmppath = mkstemp()
    try:
        xattr.setxattr(fd, 'user.swift.testing_key', big_val)
    except IOError as e:
        if errno.errorcode.get(e.errno) in ('ENOSPC', 'ENOTSUP', 'EOPNOTSUPP',
                                            'ERANGE'):
            # filesystem does not support xattr of this size
            return False
        raise
    else:
        supports_xattr_cached_val = True
        return True
    finally:
        # clean up the tmpfile
        os.close(fd)
        os.unlink(tmppath)
|
|
|
|
|
|
|
|
|
|
|
|
def skip_if_no_xattrs():
    """
    Skip the calling test unless xattr_supported_check() reports success.

    :raises SkipTest: if xattr_supported_check() returns a falsy value;
        the message names the directory returned by gettempdir()
    """
    if xattr_supported_check():
        return
    raise SkipTest('Large xattrs not supported in `%s`. Skipping test' %
                   gettempdir())
|
2018-05-01 15:12:05 +01:00
|
|
|
|
|
|
|
|
|
|
|
def unlink_files(paths):
    """
    Remove each of the given files, ignoring any that do not exist.

    :param paths: an iterable of file paths to unlink
    :raises OSError: if unlinking fails with any errno other than ENOENT
    """
    for target in paths:
        try:
            os.unlink(target)
        except OSError as exc:
            if exc.errno == errno.ENOENT:
                # already gone, which is fine
                continue
            raise
|
2018-05-02 10:47:51 +01:00
|
|
|
|
|
|
|
|
|
|
|
class FakeHTTPResponse(object):
    """
    Thin wrapper exposing ``status`` and ``data`` attributes on top of a
    response object that provides ``status_int`` and ``body``.
    """

    def __init__(self, resp):
        # the wrapped response; both properties read from it on each access
        self.resp = resp

    @property
    def data(self):
        """The wrapped response's ``body``."""
        wrapped = self.resp
        return wrapped.body

    @property
    def status(self):
        """The wrapped response's ``status_int``."""
        wrapped = self.resp
        return wrapped.status_int
|
|
|
|
|
|
|
|
|
|
|
|
def attach_fake_replication_rpc(rpc, replicate_hook=None, errors=None):
    """
    Build a fake replication connection class bound to the given rpc.

    :param rpc: object whose ``dispatch(replicate_args, args)`` handles
        replicate calls and whose ``mount_check`` attribute selects how
        mock_check_drive is configured
    :param replicate_hook: optional callable invoked as
        ``replicate_hook(op, *sync_args)`` after every replicate call
    :param errors: optional mapping of op to a list of canned responses;
        while the list for an op is non-empty its first item is popped and
        returned instead of dispatching to ``rpc``
    :returns: the FakeReplConnection class
    """
    class FakeReplConnection(object):

        def __init__(self, node, partition, hash_, logger):
            self.logger = logger
            self.node = node
            self.partition = partition
            self.path = '/%s/%s/%s' % (node['device'], partition, hash_)
            self.host = node['replication_ip']

        def replicate(self, op, *sync_args):
            print('REPLICATE: %s, %s, %r' % (self.path, op, sync_args))
            canned = None
            if errors and op in errors and errors[op]:
                canned = errors[op].pop(0)
            if canned:
                result = canned
            else:
                # no (truthy) canned response: dispatch to the real rpc
                path_parts = self.path.lstrip('/').split('/')
                # deep copy so the rpc cannot mutate the caller's arguments
                rpc_args = [op] + copy.deepcopy(list(sync_args))
                use_mount = rpc.mount_check
                with mock_check_drive(isdir=not use_mount,
                                      ismount=use_mount):
                    swob_response = rpc.dispatch(path_parts, rpc_args)
                result = FakeHTTPResponse(swob_response)
            if replicate_hook:
                replicate_hook(op, *sync_args)
            return result

    return FakeReplConnection
|
2019-05-30 11:55:58 -07:00
|
|
|
|
|
|
|
|
|
|
|
def group_by_byte(contents):
    """
    Run-length encode a byte string.

    :param contents: a byte string
    :returns: a list of ``(byte, count)`` tuples, one per run of identical
        consecutive bytes, where ``byte`` is a length-1 slice of
        ``contents``
    """
    # Slice rather than iterate: on py3 iterating a byte string yields
    # ints, whereas slicing yields single-byte byte strings on both py2
    # and py3.
    single_bytes = [contents[pos:pos + 1] for pos in range(len(contents))]
    runs = []
    for value, members in itertools.groupby(single_bytes):
        runs.append((value, len(list(members))))
    return runs
|