# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" Tests for swift.common.storage_policies """
|
2017-01-26 02:03:28 +00:00
|
|
|
import contextlib
|
2015-05-27 17:27:47 +02:00
|
|
|
import six
|
2017-01-26 02:03:28 +00:00
|
|
|
import logging
|
2014-03-17 12:18:25 -07:00
|
|
|
import unittest
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
import os
|
2014-03-17 12:18:25 -07:00
|
|
|
import mock
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
from functools import partial
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficent PUT/GET for global distributed cluster[1].
Problem:
Erasure coding has the capability to decrease the amout of actual stored
data less then replicated model. For example, ec_k=6, ec_m=3 parameter
can be 1.5x of the original data which is smaller than 3x replicated.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of mathmetical caluculation for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incomming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch inclues following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will includes unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
|
2015-05-25 18:26:38 +02:00
|
|
|
from six.moves.configparser import ConfigParser
|
2014-03-17 12:18:25 -07:00
|
|
|
from tempfile import NamedTemporaryFile
|
2015-10-23 13:16:33 +00:00
|
|
|
from test.unit import patch_policies, FakeRing, temptree, DEFAULT_TEST_EC_TYPE
|
2017-01-26 02:03:28 +00:00
|
|
|
import swift.common.storage_policy
|
2014-03-17 12:18:25 -07:00
|
|
|
from swift.common.storage_policy import (
|
2014-06-30 11:14:28 -07:00
|
|
|
StoragePolicyCollection, POLICIES, PolicyError, parse_storage_policies,
|
|
|
|
reload_storage_policies, get_policy_string, split_policy_string,
|
|
|
|
BaseStoragePolicy, StoragePolicy, ECStoragePolicy, REPL_POLICY, EC_POLICY,
|
Allow 1+ object-servers-per-disk deployment
Enabled by a new > 0 integer config value, "servers_per_port" in the
[DEFAULT] config section for object-server and/or replication server
configs. The setting's integer value determines how many different
object-server workers handle requests for any single unique local port
in the ring. In this mode, the parent swift-object-server process
continues to run as the original user (i.e. root if low-port binding
is required), binds to all ports as defined in the ring, and forks off
the specified number of workers per listen socket. The child, per-port
servers drop privileges and behave pretty much how object-server workers
always have, except that because the ring has unique ports per disk, the
object-servers will only be handling requests for a single disk. The
parent process detects dead servers and restarts them (with the correct
listen socket), starts missing servers when an updated ring file is
found with a device on the server with a new port, and kills extraneous
servers when their port is found to no longer be in the ring. The ring
files are stat'ed at most every "ring_check_interval" seconds, as
configured in the object-server config (same default of 15s).
Immediately stopping all swift-object-worker processes still works by
sending the parent a SIGTERM. Likewise, a SIGHUP to the parent process
still causes the parent process to close all listen sockets and exit,
allowing existing children to finish serving their existing requests.
The drop_privileges helper function now has an optional param to
suppress the setsid() call, which otherwise screws up the child workers'
process management.
The class method RingData.load() can be told to only load the ring
metadata (i.e. everything except replica2part2dev_id) with the optional
kwarg, header_only=True. This is used to keep the parent and all
forked off workers from unnecessarily having full copies of all storage
policy rings in memory.
A new helper class, swift.common.storage_policy.BindPortsCache,
provides a method to return a set of all device ports in all rings for
the server on which it is instantiated (identified by its set of IP
addresses). The BindPortsCache instance will track mtimes of ring
files, so they are not opened more frequently than necessary.
This patch includes enhancements to the probe tests and
object-replicator/object-reconstructor config plumbing to allow the
probe tests to work correctly both in the "normal" config (same IP but
unique ports for each SAIO "server") and a server-per-port setup where
each SAIO "server" must have a unique IP address and unique port per
disk within each "server". The main probe tests only work with 4
servers and 4 disks, but you can see the difference in the rings for the
EC probe tests where there are 2 disks per server for a total of 8
disks. Specifically, swift.common.ring.utils.is_local_device() will
ignore the ports when the "my_port" argument is None. Then,
object-replicator and object-reconstructor both set self.bind_port to
None if server_per_port is enabled. Bonus improvement for IPv6
addresses in is_local_device().
This PR for vagrant-swift-all-in-one will aid in testing this patch:
https://github.com/swiftstack/vagrant-swift-all-in-one/pull/16/
Also allow SAIO to answer is_local_device() better; common SAIO setups
have multiple "servers" all on the same host with different ports for
the different "servers" (which happen to match the IPs specified in the
rings for the devices on each of those "servers").
However, you can configure the SAIO to have different localhost IP
addresses (e.g. 127.0.0.1, 127.0.0.2, etc.) in the ring and in the
servers' config files' bind_ip setting.
This new whataremyips() implementation combined with a little plumbing
allows is_local_device() to accurately answer, even on an SAIO.
In the default case (an unspecified bind_ip defaults to '0.0.0.0') as
well as an explict "bind to everything" like '0.0.0.0' or '::',
whataremyips() behaves as it always has, returning all IP addresses for
the server.
Also updated probe tests to handle each "server" in the SAIO having a
unique IP address.
For some (noisy) benchmarks that show servers_per_port=X is at least as
good as the same number of "normal" workers:
https://gist.github.com/dbishop/c214f89ca708a6b1624a#file-summary-md
Benchmarks showing the benefits of I/O isolation with a small number of
slow disks:
https://gist.github.com/dbishop/fd0ab067babdecfb07ca#file-results-md
If you were wondering what the overhead of threads_per_disk looks like:
https://gist.github.com/dbishop/1d14755fedc86a161718#file-tabular_results-md
DocImpact
Change-Id: I2239a4000b41a7e7cc53465ce794af49d44796c6
2015-05-14 22:14:15 -07:00
|
|
|
VALID_EC_TYPES, DEFAULT_EC_OBJECT_SEGMENT_SIZE, BindPortsCache)
|
|
|
|
from swift.common.ring import RingData
|
2016-01-15 03:53:01 -08:00
|
|
|
from swift.common.exceptions import RingLoadError
|
2016-07-20 18:16:27 -07:00
|
|
|
from pyeclib.ec_iface import ECDriver
|
2014-06-30 11:14:28 -07:00
|
|
|
|
|
|
|
|
2017-01-26 02:03:28 +00:00
|
|
|
class CapturingHandler(logging.Handler):
    def __init__(self):
        super(CapturingHandler, self).__init__()
        self._records = []

    def emit(self, record):
        self._records.append(record)


@contextlib.contextmanager
def capture_logging(log_name):
    captured = CapturingHandler()
    logger = logging.getLogger(log_name)
    logger.addHandler(captured)
    try:
        yield captured._records
    finally:
        logger.removeHandler(captured)

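# Illustrative sketch of how capture_logging() above can be used: collect
# the log records emitted under a given logger name while exercising the
# code under test, e.g.
#
#     with capture_logging('swift.common.storage_policy') as records:
#         ...  # run code expected to emit log records
#     messages = [record.getMessage() for record in records]
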
@BaseStoragePolicy.register('fake')
class FakeStoragePolicy(BaseStoragePolicy):
    """
    Test StoragePolicy class - the only user at the moment is
    test_validate_policies_type_invalid()
    """

    def __init__(self, idx, name='', is_default=False, is_deprecated=False,
                 object_ring=None):
        super(FakeStoragePolicy, self).__init__(
            idx, name, is_default, is_deprecated, object_ring)


class TestStoragePolicies(unittest.TestCase):

    def _conf(self, conf_str):
        conf_str = "\n".join(line.strip() for line in conf_str.split("\n"))
        if six.PY2:
            conf = ConfigParser()
        else:
            conf = ConfigParser(strict=False)
        conf.readfp(six.StringIO(conf_str))
        return conf

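    # _conf() is fed swift.conf-style snippets inline; an illustrative
    # sketch of the shape the tests below use:
    #     conf = self._conf("""
    #     [storage-policy:0]
    #     name = zero
    #     default = yes
    #     """)
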
    def assertRaisesWithMessage(self, exc_class, message, f, *args, **kwargs):
        try:
            f(*args, **kwargs)
        except exc_class as err:
            err_msg = str(err)
            self.assertTrue(message in err_msg, 'Error message %r did not '
                            'have expected substring %r' % (err_msg, message))
        else:
            self.fail('%r did not raise %s' % (message, exc_class.__name__))

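    # Illustrative sketch: the helper above asserts both the exception type
    # and a substring of its message, e.g.
    #     self.assertRaisesWithMessage(PolicyError, 'Invalid name',
    #                                  FakeStoragePolicy, 1, 'bad name')
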
    def test_policy_baseclass_instantiate(self):
        self.assertRaisesWithMessage(TypeError,
                                     "Can't instantiate BaseStoragePolicy",
                                     BaseStoragePolicy, 1, 'one')

    @patch_policies([
        StoragePolicy(0, 'zero', is_default=True),
        StoragePolicy(1, 'one'),
        StoragePolicy(2, 'two'),
        StoragePolicy(3, 'three', is_deprecated=True),
        ECStoragePolicy(10, 'ten', ec_type=DEFAULT_TEST_EC_TYPE,
                        ec_ndata=10, ec_nparity=4),
    ])
    def test_swift_info(self):
        # the deprecated 'three' should not exist in expect
        expect = [{'aliases': 'zero', 'default': True, 'name': 'zero'},
                  {'aliases': 'two', 'name': 'two'},
                  {'aliases': 'one', 'name': 'one'},
                  {'aliases': 'ten', 'name': 'ten'}]
        swift_info = POLICIES.get_policy_info()
        self.assertEqual(sorted(expect, key=lambda k: k['name']),
                         sorted(swift_info, key=lambda k: k['name']))

    @patch_policies
    def test_get_policy_string(self):
        self.assertEqual(get_policy_string('something', 0), 'something')
        self.assertEqual(get_policy_string('something', None), 'something')
        self.assertEqual(get_policy_string('something', ''), 'something')
        self.assertEqual(get_policy_string('something', 1),
                         'something' + '-1')
        self.assertRaises(PolicyError, get_policy_string, 'something', 99)

    @patch_policies
    def test_split_policy_string(self):
        expectations = {
            'something': ('something', POLICIES[0]),
            'something-1': ('something', POLICIES[1]),
            'tmp': ('tmp', POLICIES[0]),
            'objects': ('objects', POLICIES[0]),
            'tmp-1': ('tmp', POLICIES[1]),
            'objects-1': ('objects', POLICIES[1]),
            'objects-': PolicyError,
            'objects-0': PolicyError,
            'objects--1': ('objects-', POLICIES[1]),
            'objects-+1': PolicyError,
            'objects--': PolicyError,
            'objects-foo': PolicyError,
            'objects--bar': PolicyError,
            'objects-+bar': PolicyError,
            # questionable, demonstrated as inverse of get_policy_string
            'objects+0': ('objects+0', POLICIES[0]),
            '': ('', POLICIES[0]),
            '0': ('0', POLICIES[0]),
            '-1': ('', POLICIES[1]),
        }
        for policy_string, expected in expectations.items():
            if expected == PolicyError:
                try:
                    invalid = split_policy_string(policy_string)
                except PolicyError:
                    continue  # good
                else:
                    self.fail('The string %r returned %r '
                              'instead of raising a PolicyError' %
                              (policy_string, invalid))
            self.assertEqual(expected, split_policy_string(policy_string))
            # should be inverse of get_policy_string
            self.assertEqual(policy_string, get_policy_string(*expected))

    def test_defaults(self):
        self.assertGreater(len(POLICIES), 0)

        # test class functions
        default_policy = POLICIES.default
        self.assertTrue(default_policy.is_default)
        zero_policy = POLICIES.get_by_index(0)
        self.assertTrue(zero_policy.idx == 0)
        zero_policy_by_name = POLICIES.get_by_name(zero_policy.name)
        self.assertTrue(zero_policy_by_name.idx == 0)

    def test_storage_policy_repr(self):
        test_policies = [StoragePolicy(0, 'aay', True),
                         StoragePolicy(1, 'bee', False),
                         StoragePolicy(2, 'cee', False),
                         ECStoragePolicy(10, 'ten',
                                         ec_type=DEFAULT_TEST_EC_TYPE,
                                         ec_ndata=10, ec_nparity=3),
                         ECStoragePolicy(11, 'eleven',
                                         ec_type=DEFAULT_TEST_EC_TYPE,
                                         ec_ndata=10, ec_nparity=3,
                                         ec_duplication_factor=2)]
        policies = StoragePolicyCollection(test_policies)
        for policy in policies:
            policy_repr = repr(policy)
            self.assertTrue(policy.__class__.__name__ in policy_repr)
            self.assertTrue('is_default=%s' % policy.is_default in policy_repr)
            self.assertTrue('is_deprecated=%s' % policy.is_deprecated in
                            policy_repr)
            self.assertTrue(policy.name in policy_repr)
            if policy.policy_type == EC_POLICY:
                self.assertTrue('ec_type=%s' % policy.ec_type in policy_repr)
                self.assertTrue('ec_ndata=%s' % policy.ec_ndata in policy_repr)
                self.assertTrue('ec_nparity=%s' %
                                policy.ec_nparity in policy_repr)
                self.assertTrue('ec_segment_size=%s' %
                                policy.ec_segment_size in policy_repr)
                if policy.ec_duplication_factor > 1:
                    self.assertTrue('ec_duplication_factor=%s' %
                                    policy.ec_duplication_factor in
                                    policy_repr)
        collection_repr = repr(policies)
        collection_repr_lines = collection_repr.splitlines()
        self.assertTrue(
            policies.__class__.__name__ in collection_repr_lines[0])
        self.assertEqual(len(policies), len(collection_repr_lines[1:-1]))
        for policy, line in zip(policies, collection_repr_lines[1:-1]):
            self.assertTrue(repr(policy) in line)
        with patch_policies(policies):
            self.assertEqual(repr(POLICIES), collection_repr)

    def test_validate_policies_defaults(self):
        # 0 explicit default
        test_policies = [StoragePolicy(0, 'zero', True),
                         StoragePolicy(1, 'one', False),
                         StoragePolicy(2, 'two', False)]
        policies = StoragePolicyCollection(test_policies)
        self.assertEqual(policies.default, test_policies[0])
        self.assertEqual(policies.default.name, 'zero')

        # non-zero explicit default
        test_policies = [StoragePolicy(0, 'zero', False),
                         StoragePolicy(1, 'one', False),
                         StoragePolicy(2, 'two', True)]
        policies = StoragePolicyCollection(test_policies)
        self.assertEqual(policies.default, test_policies[2])
        self.assertEqual(policies.default.name, 'two')

        # multiple defaults
        test_policies = [StoragePolicy(0, 'zero', False),
                         StoragePolicy(1, 'one', True),
                         StoragePolicy(2, 'two', True)]
        self.assertRaisesWithMessage(
            PolicyError, 'Duplicate default', StoragePolicyCollection,
            test_policies)

        # nothing specified
        test_policies = []
        policies = StoragePolicyCollection(test_policies)
        self.assertEqual(policies.default, policies[0])
        self.assertEqual(policies.default.name, 'Policy-0')

        # no default specified with only policy index 0
        test_policies = [StoragePolicy(0, 'zero')]
        policies = StoragePolicyCollection(test_policies)
        self.assertEqual(policies.default, policies[0])

        # no default specified with multiple policies
        test_policies = [StoragePolicy(0, 'zero', False),
                         StoragePolicy(1, 'one', False),
                         StoragePolicy(2, 'two', False)]
        self.assertRaisesWithMessage(
            PolicyError, 'Unable to find default policy',
            StoragePolicyCollection, test_policies)

    def test_deprecate_policies(self):
        # deprecation specified
        test_policies = [StoragePolicy(0, 'zero', True),
                         StoragePolicy(1, 'one', False),
                         StoragePolicy(2, 'two', False, is_deprecated=True)]
        policies = StoragePolicyCollection(test_policies)
        self.assertEqual(policies.default, test_policies[0])
        self.assertEqual(policies.default.name, 'zero')
        self.assertEqual(len(policies), 3)

        # multiple policies require a default
        test_policies = [StoragePolicy(0, 'zero', False),
                         StoragePolicy(1, 'one', False, is_deprecated=True),
                         StoragePolicy(2, 'two', False)]
        self.assertRaisesWithMessage(
            PolicyError, 'Unable to find default policy',
            StoragePolicyCollection, test_policies)

    def test_validate_policies_indexes(self):
        # duplicate indexes
        test_policies = [StoragePolicy(0, 'zero', True),
                         StoragePolicy(1, 'one', False),
                         StoragePolicy(1, 'two', False)]
        self.assertRaises(PolicyError, StoragePolicyCollection,
                          test_policies)

    def test_validate_policy_params(self):
        StoragePolicy(0, 'name')  # sanity
        # bogus indexes
        self.assertRaises(PolicyError, FakeStoragePolicy, 'x', 'name')
        self.assertRaises(PolicyError, FakeStoragePolicy, -1, 'name')

        # non-zero Policy-0
        self.assertRaisesWithMessage(PolicyError, 'reserved',
                                     FakeStoragePolicy, 1, 'policy-0')
        # deprecate default
        self.assertRaisesWithMessage(
            PolicyError, 'Deprecated policy can not be default',
            FakeStoragePolicy, 1, 'Policy-1', is_default=True,
            is_deprecated=True)
        # weird names
        names = (
            '',
            'name_foo',
            'name\nfoo',
            'name foo',
            u'name \u062a',
            'name \xd8\xaa',
        )
        for name in names:
            self.assertRaisesWithMessage(PolicyError, 'Invalid name',
                                         FakeStoragePolicy, 1, name)

    def test_validate_policies_names(self):
        # duplicate names
        test_policies = [StoragePolicy(0, 'zero', True),
                         StoragePolicy(1, 'zero', False),
                         StoragePolicy(2, 'two', False)]
        self.assertRaises(PolicyError, StoragePolicyCollection,
                          test_policies)

    def test_validate_policies_type_default(self):
        # no type specified - make sure the policy is initialized to
        # DEFAULT_POLICY_TYPE
        test_policy = FakeStoragePolicy(0, 'zero', True)
        self.assertEqual(test_policy.policy_type, 'fake')

    def test_validate_policies_type_invalid(self):
        class BogusStoragePolicy(FakeStoragePolicy):
            policy_type = 'bogus'

        # unsupported policy type - initialization with FakeStoragePolicy
        self.assertRaisesWithMessage(PolicyError, 'Invalid type',
                                     BogusStoragePolicy, 1, 'one')

    def test_policies_type_attribute(self):
        test_policies = [
            StoragePolicy(0, 'zero', is_default=True),
            StoragePolicy(1, 'one'),
            StoragePolicy(2, 'two'),
            StoragePolicy(3, 'three', is_deprecated=True),
            ECStoragePolicy(10, 'ten', ec_type=DEFAULT_TEST_EC_TYPE,
                            ec_ndata=10, ec_nparity=3),
        ]
        policies = StoragePolicyCollection(test_policies)
        self.assertEqual(policies.get_by_index(0).policy_type,
                         REPL_POLICY)
        self.assertEqual(policies.get_by_index(1).policy_type,
                         REPL_POLICY)
        self.assertEqual(policies.get_by_index(2).policy_type,
                         REPL_POLICY)
        self.assertEqual(policies.get_by_index(3).policy_type,
                         REPL_POLICY)
        self.assertEqual(policies.get_by_index(10).policy_type,
                         EC_POLICY)

    def test_names_are_normalized(self):
        test_policies = [StoragePolicy(0, 'zero', True),
                         StoragePolicy(1, 'ZERO', False)]
        self.assertRaises(PolicyError, StoragePolicyCollection,
                          test_policies)

        policies = StoragePolicyCollection([StoragePolicy(0, 'zEro', True),
                                            StoragePolicy(1, 'One', False)])

        pol0 = policies[0]
        pol1 = policies[1]

        for name in ('zero', 'ZERO', 'zErO', 'ZeRo'):
            self.assertEqual(pol0, policies.get_by_name(name))
            self.assertEqual(policies.get_by_name(name).name, 'zEro')
        for name in ('one', 'ONE', 'oNe', 'OnE'):
            self.assertEqual(pol1, policies.get_by_name(name))
            self.assertEqual(policies.get_by_name(name).name, 'One')

    def test_multiple_names(self):
        # checking duplicate on insert
        test_policies = [StoragePolicy(0, 'zero', True),
                         StoragePolicy(1, 'one', False, aliases='zero')]
        self.assertRaises(PolicyError, StoragePolicyCollection,
                          test_policies)

        # checking correct retrieval using other names
        test_policies = [StoragePolicy(0, 'zero', True, aliases='cero, kore'),
                         StoragePolicy(1, 'one', False, aliases='uno, tahi'),
                         StoragePolicy(2, 'two', False, aliases='dos, rua')]

        policies = StoragePolicyCollection(test_policies)

        for name in ('zero', 'cero', 'kore'):
            self.assertEqual(policies.get_by_name(name), test_policies[0])
        for name in ('two', 'dos', 'rua'):
            self.assertEqual(policies.get_by_name(name), test_policies[2])

        # Testing parsing of conf files/text
        good_conf = self._conf("""
        [storage-policy:0]
        name = one
        aliases = uno, tahi
        default = yes
        """)

        policies = parse_storage_policies(good_conf)
        self.assertEqual(policies.get_by_name('one'),
                         policies[0])
        self.assertEqual(policies.get_by_name('one'),
                         policies.get_by_name('tahi'))

        name_repeat_conf = self._conf("""
        [storage-policy:0]
        name = one
        aliases = one
        default = yes
        """)
        # Test on line below should not generate errors. Repeat of main
        # name under aliases is permitted during construction
        # but only because automated testing requires it.
        policies = parse_storage_policies(name_repeat_conf)

        extra_commas_conf = self._conf("""
        [storage-policy:0]
        name = one
        aliases = ,,one, ,
        default = yes
        """)
        # Extra blank entries should be silently dropped
        policies = parse_storage_policies(extra_commas_conf)

        bad_conf = self._conf("""
        [storage-policy:0]
        name = one
        aliases = uno, uno
        default = yes
        """)

        self.assertRaisesWithMessage(PolicyError,
                                     'is already assigned to this policy',
                                     parse_storage_policies, bad_conf)

def test_multiple_names_EC(self):
|
|
|
|
# checking duplicate names on insert
|
|
|
|
test_policies_ec = [
|
|
|
|
ECStoragePolicy(
|
|
|
|
0, 'ec8-2',
|
|
|
|
aliases='zeus, jupiter',
|
|
|
|
ec_type=DEFAULT_TEST_EC_TYPE,
|
|
|
|
ec_ndata=8, ec_nparity=2,
|
|
|
|
object_ring=FakeRing(replicas=8),
|
|
|
|
is_default=True),
|
|
|
|
ECStoragePolicy(
|
|
|
|
1, 'ec10-4',
|
|
|
|
aliases='ec8-2',
|
|
|
|
ec_type=DEFAULT_TEST_EC_TYPE,
|
|
|
|
ec_ndata=10, ec_nparity=4,
|
|
|
|
object_ring=FakeRing(replicas=10))]
|
|
|
|
|
|
|
|
self.assertRaises(PolicyError, StoragePolicyCollection,
|
|
|
|
test_policies_ec)
|
|
|
|
|
|
|
|
# checking correct retrival using other names
|
|
|
|
good_test_policies_EC = [
|
|
|
|
ECStoragePolicy(0, 'ec8-2', aliases='zeus, jupiter',
|
|
|
|
ec_type=DEFAULT_TEST_EC_TYPE,
|
|
|
|
ec_ndata=8, ec_nparity=2,
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficent PUT/GET for global distributed cluster[1].
Problem:
Erasure coding has the capability to decrease the amout of actual stored
data less then replicated model. For example, ec_k=6, ec_m=3 parameter
can be 1.5x of the original data which is smaller than 3x replicated.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of mathmetical caluculation for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incomming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch inclues following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will includes unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
object_ring=FakeRing(replicas=10),
|
2015-11-05 23:04:14 +13:00
|
|
|
is_default=True),
|
|
|
|
ECStoragePolicy(1, 'ec10-4', aliases='athena, minerva',
|
|
|
|
ec_type=DEFAULT_TEST_EC_TYPE,
|
|
|
|
ec_ndata=10, ec_nparity=4,
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficent PUT/GET for global distributed cluster[1].
Problem:
Erasure coding has the capability to decrease the amout of actual stored
data less then replicated model. For example, ec_k=6, ec_m=3 parameter
can be 1.5x of the original data which is smaller than 3x replicated.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of mathmetical caluculation for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incomming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch inclues following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will includes unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
object_ring=FakeRing(replicas=14)),
|
2015-11-05 23:04:14 +13:00
|
|
|
ECStoragePolicy(2, 'ec4-2', aliases='poseidon, neptune',
|
|
|
|
ec_type=DEFAULT_TEST_EC_TYPE,
|
|
|
|
ec_ndata=4, ec_nparity=2,
|
EC Fragment Duplication - Foundational Global EC Cluster Support
This patch enables efficent PUT/GET for global distributed cluster[1].
Problem:
Erasure coding has the capability to decrease the amout of actual stored
data less then replicated model. For example, ec_k=6, ec_m=3 parameter
can be 1.5x of the original data which is smaller than 3x replicated.
However, unlike replication, erasure coding requires availability of at
least some ec_k fragments of the total ec_k + ec_m fragments to service
read (e.g. 6 of 9 in the case above). As such, if we stored the
EC object into a swift cluster on 2 geographically distributed data
centers which have the same volume of disks, it is likely the fragments
will be stored evenly (about 4 and 5) so we still need to access a
faraway data center to decode the original object. In addition, if one
of the data centers was lost in a disaster, the stored objects will be
lost forever, and we have to cry a lot. To ensure highly durable
storage, you would think of making *more* parity fragments (e.g.
ec_k=6, ec_m=10), unfortunately this causes *significant* performance
degradation due to the cost of mathmetical caluculation for erasure
coding encode/decode.
How this resolves the problem:
EC Fragment Duplication extends on the initial solution to add *more*
fragments from which to rebuild an object similar to the solution
described above. The difference is making *copies* of encoded fragments.
With experimental results[1][2], employing small ec_k and ec_m shows
enough performance to store/retrieve objects.
On PUT:
- Encode incomming object with small ec_k and ec_m <- faster!
- Make duplicated copies of the encoded fragments. The # of copies
are determined by 'ec_duplication_factor' in swift.conf
- Store all fragments in Swift Global EC Cluster
The duplicated fragments increase pressure on existing requirements
when decoding objects in service to a read request. All fragments are
stored with their X-Object-Sysmeta-Ec-Frag-Index. In this change, the
X-Object-Sysmeta-Ec-Frag-Index represents the actual fragment index
encoded by PyECLib, there *will* be duplicates. Anytime we must decode
the original object data, we must only consider the ec_k fragments as
unique according to their X-Object-Sysmeta-Ec-Frag-Index. On decode no
duplicate X-Object-Sysmeta-Ec-Frag-Index may be used when decoding an
object, duplicate X-Object-Sysmeta-Ec-Frag-Index should be expected and
avoided if possible.
On GET:
This patch inclues following changes:
- Change GET Path to sort primary nodes grouping as subsets, so that
each subset will includes unique fragments
- Change Reconstructor to be more aware of possibly duplicate fragments
For example, with this change, a policy could be configured such that
swift.conf:
ec_num_data_fragments = 2
ec_num_parity_fragments = 1
ec_duplication_factor = 2
(object ring must have 6 replicas)
At Object-Server:
node index (from object ring): 0 1 2 3 4 5 <- keep node index for
reconstruct decision
X-Object-Sysmeta-Ec-Frag-Index: 0 1 2 0 1 2 <- each object keeps actual
fragment index for
backend (PyEClib)
Additional improvements to Global EC Cluster Support will require
features such as Composite Rings, and more efficient fragment
rebalance/reconstruction.
1: http://goo.gl/IYiNPk (Swift Design Spec Repository)
2: http://goo.gl/frgj6w (Slide Share for OpenStack Summit Tokyo)
Doc-Impact
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Change-Id: Idd155401982a2c48110c30b480966a863f6bd305
2015-08-06 01:06:47 -07:00
|
|
|
object_ring=FakeRing(replicas=6)),
|
|
|
|
ECStoragePolicy(3, 'ec4-2-dup', aliases='uzuki, rin',
|
|
|
|
ec_type=DEFAULT_TEST_EC_TYPE,
|
|
|
|
ec_ndata=4, ec_nparity=2,
|
|
|
|
ec_duplication_factor=2,
|
|
|
|
object_ring=FakeRing(replicas=12)),
|
2015-11-05 23:04:14 +13:00
|
|
|
]
|
|
|
|
ec_policies = StoragePolicyCollection(good_test_policies_EC)
|
|
|
|
|
|
|
|
for name in ('ec8-2', 'zeus', 'jupiter'):
|
|
|
|
self.assertEqual(ec_policies.get_by_name(name), ec_policies[0])
|
|
|
|
for name in ('ec10-4', 'athena', 'minerva'):
|
|
|
|
self.assertEqual(ec_policies.get_by_name(name), ec_policies[1])
|
2015-08-06 01:06:47 -07:00
|
|
|
for name in ('ec4-2', 'poseidon', 'neptune'):
|
|
|
|
self.assertEqual(ec_policies.get_by_name(name), ec_policies[2])
|
|
|
|
for name in ('ec4-2-dup', 'uzuki', 'rin'):
|
|
|
|
self.assertEqual(ec_policies.get_by_name(name), ec_policies[3])
|
2015-11-05 23:04:14 +13:00
|
|
|
|
|
|
|
# Testing parsing of conf files/text
|
|
|
|
good_ec_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = ec8-2
|
|
|
|
aliases = zeus, jupiter
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = %(ec_type)s
|
|
|
|
default = yes
|
|
|
|
ec_num_data_fragments = 8
|
|
|
|
ec_num_parity_fragments = 2
|
|
|
|
[storage-policy:1]
|
|
|
|
name = ec10-4
|
|
|
|
aliases = poseidon, neptune
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = %(ec_type)s
|
|
|
|
ec_num_data_fragments = 10
|
|
|
|
ec_num_parity_fragments = 4
|
2015-08-06 01:06:47 -07:00
|
|
|
[storage-policy:2]
|
|
|
|
name = ec4-2-dup
|
|
|
|
aliases = uzuki, rin
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = %(ec_type)s
|
|
|
|
ec_num_data_fragments = 4
|
|
|
|
ec_num_parity_fragments = 2
|
|
|
|
ec_duplication_factor = 2
|
2015-11-05 23:04:14 +13:00
|
|
|
""" % {'ec_type': DEFAULT_TEST_EC_TYPE})
|
|
|
|
|
|
|
|
ec_policies = parse_storage_policies(good_ec_conf)
|
|
|
|
self.assertEqual(ec_policies.get_by_name('ec8-2'),
|
|
|
|
ec_policies[0])
|
|
|
|
self.assertEqual(ec_policies.get_by_name('ec10-4'),
|
|
|
|
ec_policies.get_by_name('poseidon'))
|
2015-08-06 01:06:47 -07:00
|
|
|
self.assertEqual(ec_policies.get_by_name('ec4-2-dup'),
|
|
|
|
ec_policies.get_by_name('uzuki'))
|
2015-11-05 23:04:14 +13:00
|
|
|
|
|
|
|
name_repeat_ec_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = ec8-2
|
|
|
|
aliases = ec8-2
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = %(ec_type)s
|
|
|
|
default = yes
|
|
|
|
ec_num_data_fragments = 8
|
|
|
|
ec_num_parity_fragments = 2
|
|
|
|
""" % {'ec_type': DEFAULT_TEST_EC_TYPE})
|
|
|
|
# Test on line below should not generate errors. Repeat of main
|
|
|
|
# name under aliases is permitted during construction
|
|
|
|
# but only because automated testing requires it.
|
|
|
|
ec_policies = parse_storage_policies(name_repeat_ec_conf)
|
|
|
|
|
|
|
|
bad_ec_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = ec8-2
|
|
|
|
aliases = zeus, zeus
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = %(ec_type)s
|
|
|
|
default = yes
|
|
|
|
ec_num_data_fragments = 8
|
|
|
|
ec_num_parity_fragments = 2
|
|
|
|
""" % {'ec_type': DEFAULT_TEST_EC_TYPE})
|
|
|
|
self.assertRaisesWithMessage(PolicyError,
|
|
|
|
'is already assigned to this policy',
|
|
|
|
parse_storage_policies, bad_ec_conf)
|
|
|
|
|
|
|
|
def test_add_remove_names(self):
|
|
|
|
test_policies = [StoragePolicy(0, 'zero', True),
|
|
|
|
StoragePolicy(1, 'one', False),
|
|
|
|
StoragePolicy(2, 'two', False)]
|
|
|
|
policies = StoragePolicyCollection(test_policies)
|
|
|
|
|
|
|
|
# add names
|
|
|
|
policies.add_policy_alias(1, 'tahi')
|
|
|
|
self.assertEqual(policies.get_by_name('tahi'), test_policies[1])
|
|
|
|
|
|
|
|
policies.add_policy_alias(2, 'rua', 'dos')
|
|
|
|
self.assertEqual(policies.get_by_name('rua'), test_policies[2])
|
|
|
|
self.assertEqual(policies.get_by_name('dos'), test_policies[2])
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid name',
|
|
|
|
policies.add_policy_alias, 2, 'double\n')
|
|
|
|
|
2016-02-11 16:00:38 -08:00
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid name',
|
|
|
|
policies.add_policy_alias, 2, '')
|
|
|
|
|
2015-11-05 23:04:14 +13:00
|
|
|
# try to add existing name
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Duplicate name',
|
|
|
|
policies.add_policy_alias, 2, 'two')
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Duplicate name',
|
|
|
|
policies.add_policy_alias, 1, 'two')
|
|
|
|
|
|
|
|
# remove name
|
|
|
|
policies.remove_policy_alias('tahi')
|
2017-06-25 10:29:23 +08:00
|
|
|
self.assertIsNone(policies.get_by_name('tahi'))
|
2015-11-05 23:04:14 +13:00
|
|
|
|
|
|
|
# remove only name
|
|
|
|
self.assertRaisesWithMessage(PolicyError,
|
|
|
|
'Policies must have at least one name.',
|
|
|
|
policies.remove_policy_alias, 'zero')
|
|
|
|
|
|
|
|
# remove non-existent name
|
|
|
|
self.assertRaisesWithMessage(PolicyError,
|
|
|
|
'No policy with name',
|
|
|
|
policies.remove_policy_alias, 'three')
|
|
|
|
|
|
|
|
# remove default name
|
|
|
|
policies.remove_policy_alias('two')
|
2017-06-25 10:29:23 +08:00
|
|
|
self.assertIsNone(policies.get_by_name('two'))
|
2015-11-05 23:04:14 +13:00
|
|
|
self.assertEqual(policies.get_by_index(2).name, 'rua')
|
|
|
|
|
|
|
|
# change default name to a new name
|
|
|
|
policies.change_policy_primary_name(2, 'two')
|
|
|
|
self.assertEqual(policies.get_by_name('two'), test_policies[2])
|
|
|
|
self.assertEqual(policies.get_by_index(2).name, 'two')
|
|
|
|
|
|
|
|
# change default name to an existing alias
|
|
|
|
policies.change_policy_primary_name(2, 'dos')
|
|
|
|
self.assertEqual(policies.get_by_index(2).name, 'dos')
|
|
|
|
|
|
|
|
# change default name to a bad new name
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid name',
|
|
|
|
policies.change_policy_primary_name,
|
|
|
|
2, 'bad\nname')
|
|
|
|
|
|
|
|
# change default name to a name belonging to another policy
|
|
|
|
self.assertRaisesWithMessage(PolicyError,
|
|
|
|
'Other policy',
|
|
|
|
policies.change_policy_primary_name,
|
|
|
|
1, 'dos')
|
|
|
|
|
2014-03-17 12:18:25 -07:00
|
|
|
def test_deprecated_default(self):
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:1]
|
|
|
|
name = one
|
|
|
|
deprecated = yes
|
|
|
|
default = yes
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(
|
|
|
|
PolicyError, "Deprecated policy can not be default",
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
def test_multiple_policies_with_no_policy_index_zero(self):
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:1]
|
|
|
|
name = one
|
|
|
|
default = yes
|
|
|
|
""")
|
|
|
|
|
|
|
|
# Policy-0 will not be implicitly added if other policies are defined
|
|
|
|
self.assertRaisesWithMessage(
|
|
|
|
PolicyError, "must specify a storage policy section "
|
|
|
|
"for policy index 0", parse_storage_policies, bad_conf)
|
|
|
|
|
2017-01-26 02:03:28 +00:00
|
|
|
@mock.patch.object(swift.common.storage_policy, 'VALID_EC_TYPES',
|
|
|
|
['isa_l_rs_vand', 'isa_l_rs_cauchy'])
|
|
|
|
@mock.patch('swift.common.storage_policy.ECDriver')
|
|
|
|
def test_known_bad_ec_config(self, mock_driver):
|
|
|
|
good_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = bad-policy
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = isa_l_rs_cauchy
|
|
|
|
ec_num_data_fragments = 10
|
|
|
|
ec_num_parity_fragments = 5
|
|
|
|
""")
|
|
|
|
|
|
|
|
with capture_logging('swift.common.storage_policy') as records:
|
|
|
|
parse_storage_policies(good_conf)
|
|
|
|
mock_driver.assert_called_once()
|
|
|
|
mock_driver.reset_mock()
|
2017-04-28 15:56:21 -07:00
|
|
|
self.assertFalse([(r.levelname, r.msg) for r in records])
|
2017-01-26 02:03:28 +00:00
|
|
|
|
|
|
|
good_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = bad-policy
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = isa_l_rs_vand
|
|
|
|
ec_num_data_fragments = 10
|
|
|
|
ec_num_parity_fragments = 4
|
|
|
|
""")
|
|
|
|
|
|
|
|
with capture_logging('swift.common.storage_policy') as records:
|
|
|
|
parse_storage_policies(good_conf)
|
|
|
|
mock_driver.assert_called_once()
|
|
|
|
mock_driver.reset_mock()
|
2017-04-28 15:56:21 -07:00
|
|
|
self.assertFalse([(r.levelname, r.msg) for r in records])
|
2017-01-26 02:03:28 +00:00
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = bad-policy
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = isa_l_rs_vand
|
|
|
|
ec_num_data_fragments = 10
|
|
|
|
ec_num_parity_fragments = 5
|
|
|
|
""")
|
|
|
|
|
2017-05-25 09:40:47 -07:00
|
|
|
with capture_logging('swift.common.storage_policy') as records, \
|
|
|
|
self.assertRaises(PolicyError) as exc_mgr:
|
2017-01-26 02:03:28 +00:00
|
|
|
parse_storage_policies(bad_conf)
|
2018-02-01 14:30:19 -08:00
|
|
|
self.assertEqual(exc_mgr.exception.args[0],
|
2017-05-25 09:40:47 -07:00
|
|
|
'Storage policy bad-policy uses an EC '
|
|
|
|
'configuration known to harm data durability. This '
|
|
|
|
'policy MUST be deprecated.')
|
|
|
|
mock_driver.assert_not_called()
|
2017-01-26 02:03:28 +00:00
|
|
|
mock_driver.reset_mock()
|
|
|
|
self.assertEqual([r.levelname for r in records],
|
2017-05-25 09:40:47 -07:00
|
|
|
['WARNING'])
|
2017-01-26 02:03:28 +00:00
|
|
|
for msg in ('known to harm data durability',
|
|
|
|
'Any data in this policy should be migrated',
|
|
|
|
'https://bugs.launchpad.net/swift/+bug/1639691'):
|
2017-04-28 15:56:21 -07:00
|
|
|
self.assertIn(msg, records[0].msg)
|
2017-01-26 02:03:28 +00:00
|
|
|
|
|
|
|
slightly_less_bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = bad-policy
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = isa_l_rs_vand
|
|
|
|
ec_num_data_fragments = 10
|
|
|
|
ec_num_parity_fragments = 5
|
|
|
|
deprecated = true
|
|
|
|
|
|
|
|
[storage-policy:1]
|
|
|
|
name = good-policy
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = isa_l_rs_cauchy
|
|
|
|
ec_num_data_fragments = 10
|
|
|
|
ec_num_parity_fragments = 5
|
|
|
|
default = true
|
|
|
|
""")
|
|
|
|
|
|
|
|
with capture_logging('swift.common.storage_policy') as records:
|
|
|
|
parse_storage_policies(slightly_less_bad_conf)
|
|
|
|
self.assertEqual(2, mock_driver.call_count)
|
|
|
|
mock_driver.reset_mock()
|
|
|
|
self.assertEqual([r.levelname for r in records],
|
|
|
|
['WARNING'])
|
|
|
|
for msg in ('known to harm data durability',
|
|
|
|
'Any data in this policy should be migrated',
|
|
|
|
'https://bugs.launchpad.net/swift/+bug/1639691'):
|
2017-04-28 15:56:21 -07:00
|
|
|
self.assertIn(msg, records[0].msg)
|
2017-01-26 02:03:28 +00:00
|
|
|
|
2014-03-17 12:18:25 -07:00
|
|
|
def test_no_default(self):
|
|
|
|
orig_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:1]
|
|
|
|
name = one
|
|
|
|
default = yes
|
|
|
|
""")
|
|
|
|
|
|
|
|
policies = parse_storage_policies(orig_conf)
|
|
|
|
self.assertEqual(policies.default, policies[1])
|
2015-07-21 19:23:00 +05:30
|
|
|
self.assertEqual(policies[0].name, 'Policy-0')
|
2014-03-17 12:18:25 -07:00
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:1]
|
|
|
|
name = one
|
|
|
|
deprecated = yes
|
|
|
|
""")
|
|
|
|
|
|
|
|
# multiple polices and no explicit default
|
|
|
|
self.assertRaisesWithMessage(
|
|
|
|
PolicyError, "Unable to find default",
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
good_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = Policy-0
|
|
|
|
default = yes
|
|
|
|
[storage-policy:1]
|
|
|
|
name = one
|
|
|
|
deprecated = yes
|
|
|
|
""")
|
|
|
|
|
|
|
|
policies = parse_storage_policies(good_conf)
|
|
|
|
self.assertEqual(policies.default, policies[0])
|
2015-07-21 19:23:00 +05:30
|
|
|
self.assertTrue(policies[1].is_deprecated)
|
2014-03-17 12:18:25 -07:00
|
|
|
|
|
|
|
def test_parse_storage_policies(self):
|
|
|
|
# ValueError when deprecating policy 0
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
deprecated = yes
|
|
|
|
|
|
|
|
[storage-policy:1]
|
|
|
|
name = one
|
|
|
|
deprecated = yes
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(
|
|
|
|
PolicyError, "Unable to find policy that's not deprecated",
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:]
|
|
|
|
name = zero
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid index',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:-1]
|
|
|
|
name = zero
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid index',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:x]
|
|
|
|
name = zero
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid index',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:x-1]
|
|
|
|
name = zero
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid index',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:x]
|
|
|
|
name = zero
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid index',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:x:1]
|
|
|
|
name = zero
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid index',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:1]
|
|
|
|
name = zero
|
|
|
|
boo = berries
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid option',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name =
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid name',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:3]
|
|
|
|
name = Policy-0
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid name',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:1]
|
|
|
|
name = policY-0
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid name',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = one
|
|
|
|
[storage-policy:1]
|
|
|
|
name = ONE
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Duplicate name',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = good_stuff
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Invalid name',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
2014-06-30 11:14:28 -07:00
|
|
|
# policy_type = erasure_coding
|
|
|
|
|
|
|
|
# missing ec_type, ec_num_data_fragments and ec_num_parity_fragments
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:1]
|
|
|
|
name = ec10-4
|
|
|
|
policy_type = erasure_coding
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Missing ec_type',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
# missing ec_type, but other options valid...
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:1]
|
|
|
|
name = ec10-4
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_num_data_fragments = 10
|
|
|
|
ec_num_parity_fragments = 4
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError, 'Missing ec_type',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
# ec_type specified, but invalid...
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
default = yes
|
|
|
|
[storage-policy:1]
|
|
|
|
name = ec10-4
|
|
|
|
policy_type = erasure_coding
|
|
|
|
ec_type = garbage_alg
|
|
|
|
ec_num_data_fragments = 10
|
|
|
|
ec_num_parity_fragments = 4
|
|
|
|
""")
|
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError,
|
|
|
|
'Wrong ec_type garbage_alg for policy '
|
|
|
|
'ec10-4, should be one of "%s"' %
|
|
|
|
(', '.join(VALID_EC_TYPES)),
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
# missing and invalid ec_num_parity_fragments
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:1]
|
|
|
|
name = ec10-4
|
|
|
|
policy_type = erasure_coding
|
2015-10-23 13:16:33 +00:00
|
|
|
ec_type = %(ec_type)s
|
2014-06-30 11:14:28 -07:00
|
|
|
ec_num_data_fragments = 10
|
2015-10-23 13:16:33 +00:00
|
|
|
""" % {'ec_type': DEFAULT_TEST_EC_TYPE})
|
2014-06-30 11:14:28 -07:00
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError,
|
|
|
|
'Invalid ec_num_parity_fragments',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
for num_parity in ('-4', '0', 'x'):
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:1]
|
|
|
|
name = ec10-4
|
|
|
|
policy_type = erasure_coding
|
2015-10-23 13:16:33 +00:00
|
|
|
ec_type = %(ec_type)s
|
2014-06-30 11:14:28 -07:00
|
|
|
ec_num_data_fragments = 10
|
2015-10-23 13:16:33 +00:00
|
|
|
ec_num_parity_fragments = %(num_parity)s
|
|
|
|
""" % {'ec_type': DEFAULT_TEST_EC_TYPE,
|
|
|
|
'num_parity': num_parity})
|
2014-06-30 11:14:28 -07:00
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError,
|
|
|
|
'Invalid ec_num_parity_fragments',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
# missing and invalid ec_num_data_fragments
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:1]
|
|
|
|
name = ec10-4
|
|
|
|
policy_type = erasure_coding
|
2015-10-23 13:16:33 +00:00
|
|
|
ec_type = %(ec_type)s
|
2014-06-30 11:14:28 -07:00
|
|
|
ec_num_parity_fragments = 4
|
2015-10-23 13:16:33 +00:00
|
|
|
""" % {'ec_type': DEFAULT_TEST_EC_TYPE})
|
2014-06-30 11:14:28 -07:00
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError,
|
|
|
|
'Invalid ec_num_data_fragments',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
for num_data in ('-10', '0', 'x'):
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:1]
|
|
|
|
name = ec10-4
|
|
|
|
policy_type = erasure_coding
|
2015-10-23 13:16:33 +00:00
|
|
|
ec_type = %(ec_type)s
|
|
|
|
ec_num_data_fragments = %(num_data)s
|
2014-06-30 11:14:28 -07:00
|
|
|
ec_num_parity_fragments = 4
|
2015-10-23 13:16:33 +00:00
|
|
|
""" % {'num_data': num_data, 'ec_type': DEFAULT_TEST_EC_TYPE})
|
2014-06-30 11:14:28 -07:00
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError,
|
|
|
|
'Invalid ec_num_data_fragments',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
|
|
|
# invalid ec_object_segment_size
|
|
|
|
for segment_size in ('-4', '0', 'x'):
|
|
|
|
bad_conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:1]
|
|
|
|
name = ec10-4
|
|
|
|
policy_type = erasure_coding
|
2015-10-23 13:16:33 +00:00
|
|
|
ec_object_segment_size = %(segment_size)s
|
|
|
|
ec_type = %(ec_type)s
|
2014-06-30 11:14:28 -07:00
|
|
|
ec_num_data_fragments = 10
|
|
|
|
ec_num_parity_fragments = 4
|
2015-10-23 13:16:33 +00:00
|
|
|
""" % {'segment_size': segment_size,
|
|
|
|
'ec_type': DEFAULT_TEST_EC_TYPE})
|
2014-06-30 11:14:28 -07:00
|
|
|
|
|
|
|
self.assertRaisesWithMessage(PolicyError,
|
|
|
|
'Invalid ec_object_segment_size',
|
|
|
|
parse_storage_policies, bad_conf)
|
|
|
|
|
2014-03-17 12:18:25 -07:00
|
|
|
# Additional section added to ensure parser ignores other sections
|
|
|
|
conf = self._conf("""
|
|
|
|
[some-other-section]
|
|
|
|
foo = bar
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:5]
|
|
|
|
name = one
|
|
|
|
default = yes
|
|
|
|
[storage-policy:6]
|
|
|
|
name = duplicate-sections-are-ignored
|
|
|
|
[storage-policy:6]
|
|
|
|
name = apple
|
|
|
|
""")
|
|
|
|
policies = parse_storage_policies(conf)
|
|
|
|
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual(True, policies.get_by_index(5).is_default)
|
|
|
|
self.assertEqual(False, policies.get_by_index(0).is_default)
|
|
|
|
self.assertEqual(False, policies.get_by_index(6).is_default)
|
2014-03-17 12:18:25 -07:00
|
|
|
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual("object", policies.get_by_name("zero").ring_name)
|
|
|
|
self.assertEqual("object-5", policies.get_by_name("one").ring_name)
|
|
|
|
self.assertEqual("object-6", policies.get_by_name("apple").ring_name)
|
2014-03-17 12:18:25 -07:00
|
|
|
|
|
|
|
self.assertEqual(0, int(policies.get_by_name('zero')))
|
|
|
|
self.assertEqual(5, int(policies.get_by_name('one')))
|
|
|
|
self.assertEqual(6, int(policies.get_by_name('apple')))
|
|
|
|
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual("zero", policies.get_by_index(0).name)
|
|
|
|
self.assertEqual("zero", policies.get_by_index("0").name)
|
|
|
|
self.assertEqual("one", policies.get_by_index(5).name)
|
|
|
|
self.assertEqual("apple", policies.get_by_index(6).name)
|
|
|
|
self.assertEqual("zero", policies.get_by_index(None).name)
|
|
|
|
self.assertEqual("zero", policies.get_by_index('').name)
|
2014-03-17 12:18:25 -07:00
|
|
|
|
2014-06-30 11:14:28 -07:00
|
|
|
self.assertEqual(policies.get_by_index(0), policies.legacy)
|
|
|
|
|
2014-03-17 12:18:25 -07:00
|
|
|
def test_reload_invalid_storage_policies(self):
|
|
|
|
conf = self._conf("""
|
|
|
|
[storage-policy:0]
|
|
|
|
name = zero
|
|
|
|
[storage-policy:00]
|
|
|
|
name = double-zero
|
|
|
|
""")
|
2018-02-01 14:30:19 -08:00
|
|
|
with NamedTemporaryFile(mode='w+t') as f:
|
2014-03-17 12:18:25 -07:00
|
|
|
conf.write(f)
|
|
|
|
f.flush()
|
2016-07-20 09:51:24 +00:00
|
|
|
with mock.patch('swift.common.utils.SWIFT_CONF_FILE',
|
2014-03-17 12:18:25 -07:00
|
|
|
new=f.name):
|
|
|
|
try:
|
|
|
|
reload_storage_policies()
|
|
|
|
except SystemExit as e:
|
|
|
|
err_msg = str(e)
|
|
|
|
else:
|
|
|
|
self.fail('SystemExit not raised')
|
|
|
|
parts = [
|
|
|
|
'Invalid Storage Policy Configuration',
|
|
|
|
'Duplicate index',
|
|
|
|
]
|
|
|
|
for expected in parts:
|
2015-07-21 19:23:00 +05:30
|
|
|
self.assertTrue(
|
|
|
|
expected in err_msg, '%s was not in %s' % (expected,
|
|
|
|
err_msg))
|
2014-03-17 12:18:25 -07:00
|
|
|
|
|
|
|
def test_storage_policy_ordering(self):
|
|
|
|
test_policies = StoragePolicyCollection([
|
|
|
|
StoragePolicy(0, 'zero', is_default=True),
|
|
|
|
StoragePolicy(503, 'error'),
|
|
|
|
StoragePolicy(204, 'empty'),
|
|
|
|
StoragePolicy(404, 'missing'),
|
|
|
|
])
|
|
|
|
self.assertEqual([0, 204, 404, 503], [int(p) for p in
|
|
|
|
sorted(list(test_policies))])
|
|
|
|
|
|
|
|
p503 = test_policies[503]
|
|
|
|
self.assertTrue(501 < p503 < 507)
|
|
|
|
|
|
|
|
def test_get_object_ring(self):
|
|
|
|
test_policies = [StoragePolicy(0, 'aay', True),
|
|
|
|
StoragePolicy(1, 'bee', False),
|
|
|
|
StoragePolicy(2, 'cee', False)]
|
|
|
|
policies = StoragePolicyCollection(test_policies)
|
|
|
|
|
|
|
|
class NamedFakeRing(FakeRing):
|
|
|
|
|
|
|
|
def __init__(self, swift_dir, ring_name=None):
|
|
|
|
self.ring_name = ring_name
|
|
|
|
super(NamedFakeRing, self).__init__()
|
|
|
|
|
|
|
|
with mock.patch('swift.common.storage_policy.Ring',
|
|
|
|
new=NamedFakeRing):
|
|
|
|
for policy in policies:
|
|
|
|
self.assertFalse(policy.object_ring)
|
|
|
|
ring = policies.get_object_ring(int(policy), '/path/not/used')
|
|
|
|
self.assertEqual(ring.ring_name, policy.ring_name)
|
|
|
|
self.assertTrue(policy.object_ring)
|
2015-07-21 19:23:00 +05:30
|
|
|
self.assertTrue(isinstance(policy.object_ring, NamedFakeRing))
|
2014-03-17 12:18:25 -07:00
|
|
|
|
|
|
|
def blow_up(*args, **kwargs):
|
|
|
|
raise Exception('kaboom!')
|
|
|
|
|
|
|
|
with mock.patch('swift.common.storage_policy.Ring', new=blow_up):
|
|
|
|
for policy in policies:
|
|
|
|
policy.load_ring('/path/not/used')
|
|
|
|
expected = policies.get_object_ring(int(policy),
|
|
|
|
'/path/not/used')
|
|
|
|
self.assertEqual(policy.object_ring, expected)
|
|
|
|
|
|
|
|
# bad policy index
|
|
|
|
self.assertRaises(PolicyError, policies.get_object_ring, 99,
|
|
|
|
'/path/not/used')
|
|
|
|
|
2015-05-14 22:14:15 -07:00
|
|
|
def test_bind_ports_cache(self):
|
|
|
|
test_policies = [StoragePolicy(0, 'aay', True),
|
|
|
|
StoragePolicy(1, 'bee', False),
|
|
|
|
StoragePolicy(2, 'cee', False)]
|
|
|
|
|
|
|
|
my_ips = ['1.2.3.4', '2.3.4.5']
|
|
|
|
other_ips = ['3.4.5.6', '4.5.6.7']
|
|
|
|
bind_ip = my_ips[1]
|
|
|
|
devs_by_ring_name1 = {
|
|
|
|
'object': [ # 'aay'
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': my_ips[0],
|
|
|
|
'port': 6006},
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': other_ips[0],
|
|
|
|
'port': 6007},
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': my_ips[1],
|
|
|
|
'port': 6008},
|
|
|
|
None,
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': other_ips[1],
|
|
|
|
'port': 6009}],
|
|
|
|
'object-1': [ # 'bee'
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': my_ips[1],
|
|
|
|
'port': 6006}, # dupe
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': other_ips[0],
|
|
|
|
'port': 6010},
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': my_ips[1],
|
|
|
|
'port': 6011},
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': other_ips[1],
|
|
|
|
'port': 6012}],
|
|
|
|
'object-2': [ # 'cee'
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': my_ips[0],
|
|
|
|
'port': 6010}, # on our IP and a not-us IP
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': other_ips[0],
|
|
|
|
'port': 6013},
|
|
|
|
None,
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': my_ips[1],
|
|
|
|
'port': 6014},
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': other_ips[1],
|
|
|
|
'port': 6015}],
|
|
|
|
}
|
|
|
|
devs_by_ring_name2 = {
|
|
|
|
'object': [ # 'aay'
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': my_ips[0],
|
|
|
|
'port': 6016},
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': other_ips[1],
|
|
|
|
'port': 6019}],
|
|
|
|
'object-1': [ # 'bee'
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': my_ips[1],
|
|
|
|
'port': 6016}, # dupe
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': other_ips[1],
|
|
|
|
'port': 6022}],
|
|
|
|
'object-2': [ # 'cee'
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': my_ips[0],
|
|
|
|
'port': 6020},
|
|
|
|
{'id': 0, 'zone': 0, 'region': 1, 'ip': other_ips[1],
|
|
|
|
'port': 6025}],
|
|
|
|
}
|
|
|
|
ring_files = [ring_name + '.ring.gz'
|
|
|
|
for ring_name in sorted(devs_by_ring_name1)]
|
|
|
|
|
|
|
|
def _fake_load(gz_path, stub_objs, metadata_only=False):
|
|
|
|
return RingData(
|
|
|
|
devs=stub_objs[os.path.basename(gz_path)[:-8]],
|
|
|
|
replica2part2dev_id=[],
|
|
|
|
part_shift=24)
|
|
|
|
|
|
|
|
with mock.patch(
|
2015-11-05 23:04:14 +13:00
|
|
|
'swift.common.storage_policy.RingData.load'
|
2015-05-14 22:14:15 -07:00
|
|
|
) as mock_ld, \
|
|
|
|
patch_policies(test_policies), \
|
|
|
|
mock.patch('swift.common.storage_policy.whataremyips') \
|
|
|
|
as mock_whataremyips, \
|
|
|
|
temptree(ring_files) as tempdir:
|
|
|
|
mock_whataremyips.return_value = my_ips
|
|
|
|
|
|
|
|
cache = BindPortsCache(tempdir, bind_ip)
|
|
|
|
|
|
|
|
self.assertEqual([
|
|
|
|
mock.call(bind_ip),
|
|
|
|
], mock_whataremyips.mock_calls)
|
|
|
|
mock_whataremyips.reset_mock()
|
|
|
|
|
|
|
|
mock_ld.side_effect = partial(_fake_load,
|
|
|
|
stub_objs=devs_by_ring_name1)
|
|
|
|
self.assertEqual(set([
|
|
|
|
6006, 6008, 6011, 6010, 6014,
|
|
|
|
]), cache.all_bind_ports_for_node())
|
|
|
|
self.assertEqual([
|
|
|
|
mock.call(os.path.join(tempdir, ring_files[0]),
|
|
|
|
metadata_only=True),
|
|
|
|
mock.call(os.path.join(tempdir, ring_files[1]),
|
|
|
|
metadata_only=True),
|
|
|
|
mock.call(os.path.join(tempdir, ring_files[2]),
|
|
|
|
metadata_only=True),
|
|
|
|
], mock_ld.mock_calls)
|
|
|
|
mock_ld.reset_mock()
|
|
|
|
|
|
|
|
mock_ld.side_effect = partial(_fake_load,
|
|
|
|
stub_objs=devs_by_ring_name2)
|
|
|
|
self.assertEqual(set([
|
|
|
|
6006, 6008, 6011, 6010, 6014,
|
|
|
|
]), cache.all_bind_ports_for_node())
|
|
|
|
self.assertEqual([], mock_ld.mock_calls)
|
|
|
|
|
|
|
|
# but when all the file mtimes are made different, it'll
|
|
|
|
# reload
|
|
|
|
for gz_file in [os.path.join(tempdir, n)
|
|
|
|
for n in ring_files]:
|
|
|
|
os.utime(gz_file, (88, 88))
|
|
|
|
|
|
|
|
self.assertEqual(set([
|
|
|
|
6016, 6020,
|
|
|
|
]), cache.all_bind_ports_for_node())
|
|
|
|
self.assertEqual([
|
|
|
|
mock.call(os.path.join(tempdir, ring_files[0]),
|
|
|
|
metadata_only=True),
|
|
|
|
mock.call(os.path.join(tempdir, ring_files[1]),
|
|
|
|
metadata_only=True),
|
|
|
|
mock.call(os.path.join(tempdir, ring_files[2]),
|
|
|
|
metadata_only=True),
|
|
|
|
], mock_ld.mock_calls)
|
|
|
|
mock_ld.reset_mock()
|
|
|
|
|
|
|
|
# Don't do something stupid like crash if a ring file is missing.
|
|
|
|
os.unlink(os.path.join(tempdir, 'object-2.ring.gz'))
|
|
|
|
|
|
|
|
self.assertEqual(set([
|
|
|
|
6016, 6020,
|
|
|
|
]), cache.all_bind_ports_for_node())
|
|
|
|
self.assertEqual([], mock_ld.mock_calls)
|
|
|
|
|
|
|
|
# whataremyips() is only called in the constructor
|
|
|
|
self.assertEqual([], mock_whataremyips.mock_calls)
|
|
|
|
|
2014-03-17 12:18:25 -07:00
|
|
|
def test_singleton_passthrough(self):
|
|
|
|
test_policies = [StoragePolicy(0, 'aay', True),
|
|
|
|
StoragePolicy(1, 'bee', False),
|
|
|
|
StoragePolicy(2, 'cee', False)]
|
|
|
|
with patch_policies(test_policies):
|
|
|
|
for policy in POLICIES:
|
|
|
|
self.assertEqual(POLICIES[int(policy)], policy)
|
|
|
|
|
2014-06-30 11:14:28 -07:00
|
|
|
def test_quorum_size_replication(self):
|
|
|
|
expected_sizes = {1: 1,
|
2016-04-27 16:59:00 -05:00
|
|
|
2: 1,
|
2014-06-30 11:14:28 -07:00
|
|
|
3: 2,
|
2016-04-27 16:59:00 -05:00
|
|
|
4: 2,
|
2014-06-30 11:14:28 -07:00
|
|
|
5: 3}
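# note (added for clarity, not asserted by the test): these expected
# values match a simple majority quorum, i.e. quorum == (n + 1) // 2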
|
|
|
|
for n, expected in expected_sizes.items():
|
|
|
|
policy = StoragePolicy(0, 'zero',
|
|
|
|
object_ring=FakeRing(replicas=n))
|
|
|
|
self.assertEqual(policy.quorum, expected)
|
|
|
|
|
|
|
|
def test_quorum_size_erasure_coding(self):
|
|
|
|
test_ec_policies = [
|
2015-10-23 13:16:33 +00:00
|
|
|
ECStoragePolicy(10, 'ec8-2', ec_type=DEFAULT_TEST_EC_TYPE,
|
2014-06-30 11:14:28 -07:00
|
|
|
ec_ndata=8, ec_nparity=2),
|
|
|
|
ECStoragePolicy(11, 'df10-6', ec_type='flat_xor_hd_4',
|
|
|
|
ec_ndata=10, ec_nparity=6),
|
2015-08-06 01:06:47 -07:00
|
|
|
ECStoragePolicy(12, 'ec4-2-dup', ec_type=DEFAULT_TEST_EC_TYPE,
|
|
|
|
ec_ndata=4, ec_nparity=2, ec_duplication_factor=2),
|
2014-06-30 11:14:28 -07:00
|
|
|
]
|
|
|
|
for ec_policy in test_ec_policies:
|
|
|
|
k = ec_policy.ec_ndata
|
2015-08-06 01:06:47 -07:00
|
|
|
expected_size = (
|
|
|
|
(k + ec_policy.pyeclib_driver.min_parity_fragments_needed())
|
|
|
|
* ec_policy.ec_duplication_factor
|
|
|
|
)
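# worked example (illustrative assumption, not asserted here): for the
# ec4-2-dup policy above, a backend whose min_parity_fragments_needed()
# returns 1 would give expected_size == (4 + 1) * 2 == 10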
|
|
|
|
|
2014-06-30 11:14:28 -07:00
|
|
|
self.assertEqual(expected_size, ec_policy.quorum)
|
|
|
|
|
|
|
|
def test_validate_ring(self):
|
|
|
|
test_policies = [
|
2015-10-23 13:16:33 +00:00
|
|
|
ECStoragePolicy(0, 'ec8-2', ec_type=DEFAULT_TEST_EC_TYPE,
|
2014-06-30 11:14:28 -07:00
|
|
|
ec_ndata=8, ec_nparity=2,
|
|
|
|
is_default=True),
|
2015-10-23 13:16:33 +00:00
|
|
|
ECStoragePolicy(1, 'ec10-4', ec_type=DEFAULT_TEST_EC_TYPE,
|
2016-01-15 03:53:01 -08:00
|
|
|
ec_ndata=10, ec_nparity=4),
|
2015-10-23 13:16:33 +00:00
|
|
|
ECStoragePolicy(2, 'ec4-2', ec_type=DEFAULT_TEST_EC_TYPE,
|
2016-01-15 03:53:01 -08:00
|
|
|
ec_ndata=4, ec_nparity=2),
|
2015-08-06 01:06:47 -07:00
|
|
|
ECStoragePolicy(3, 'ec4-2-2dup', ec_type=DEFAULT_TEST_EC_TYPE,
|
|
|
|
ec_ndata=4, ec_nparity=2,
|
|
|
|
ec_duplication_factor=2)
|
2014-06-30 11:14:28 -07:00
|
|
|
]
|
|
|
|
policies = StoragePolicyCollection(test_policies)
|
|
|
|
|
2015-08-06 01:06:47 -07:00
|
|
|
class MockRingData(object):
|
|
|
|
def __init__(self, num_replica):
|
2017-09-12 22:46:14 +00:00
|
|
|
self.replica_count = num_replica
|
|
|
|
|
|
|
|
def do_test(actual_load_ring_replicas):
|
|
|
|
for policy, ring_replicas in zip(policies,
|
|
|
|
actual_load_ring_replicas):
|
|
|
|
with mock.patch('swift.common.ring.ring.RingData.load',
|
|
|
|
return_value=MockRingData(ring_replicas)):
|
|
|
|
necessary_replica_num = (policy.ec_n_unique_fragments *
|
|
|
|
policy.ec_duplication_factor)
|
|
|
|
with mock.patch(
|
|
|
|
'swift.common.ring.ring.validate_configuration'):
|
|
|
|
msg = 'EC ring for policy %s needs to be configured ' \
|
|
|
|
'with exactly %d replicas.' % \
|
|
|
|
(policy.name, necessary_replica_num)
|
|
|
|
self.assertRaisesWithMessage(RingLoadError, msg,
|
|
|
|
policy.load_ring, 'mock')
|
|
|
|
|
|
|
|
# first, do something completely different
|
|
|
|
do_test([8, 10, 7, 11])
|
|
|
|
# then again, closer to true, but fractional
|
|
|
|
do_test([9.9, 14.1, 5.99999, 12.000000001])
|
2014-06-30 11:14:28 -07:00
|
|
|
|
|
|
|
def test_storage_policy_get_info(self):
|
|
|
|
test_policies = [
|
|
|
|
StoragePolicy(0, 'zero', is_default=True),
|
2015-11-05 23:04:14 +13:00
|
|
|
StoragePolicy(1, 'one', is_deprecated=True,
|
|
|
|
aliases='tahi, uno'),
|
2014-06-30 11:14:28 -07:00
|
|
|
ECStoragePolicy(10, 'ten',
|
2015-10-23 13:16:33 +00:00
|
|
|
ec_type=DEFAULT_TEST_EC_TYPE,
|
2014-06-30 11:14:28 -07:00
|
|
|
ec_ndata=10, ec_nparity=3),
|
|
|
|
ECStoragePolicy(11, 'done', is_deprecated=True,
|
2015-10-23 13:16:33 +00:00
|
|
|
ec_type=DEFAULT_TEST_EC_TYPE,
|
2014-06-30 11:14:28 -07:00
|
|
|
ec_ndata=10, ec_nparity=3),
|
|
|
|
]
|
|
|
|
policies = StoragePolicyCollection(test_policies)
|
|
|
|
expected = {
|
|
|
|
# default replication
|
|
|
|
(0, True): {
|
|
|
|
'name': 'zero',
|
2015-11-05 23:04:14 +13:00
|
|
|
'aliases': 'zero',
|
2014-06-30 11:14:28 -07:00
|
|
|
'default': True,
|
|
|
|
'deprecated': False,
|
|
|
|
'policy_type': REPL_POLICY
|
|
|
|
},
|
|
|
|
(0, False): {
|
|
|
|
'name': 'zero',
|
2015-11-05 23:04:14 +13:00
|
|
|
'aliases': 'zero',
|
2014-06-30 11:14:28 -07:00
|
|
|
'default': True,
|
|
|
|
},
|
|
|
|
# deprecated replication
|
|
|
|
(1, True): {
|
|
|
|
'name': 'one',
|
2015-11-05 23:04:14 +13:00
|
|
|
'aliases': 'one, tahi, uno',
|
2014-06-30 11:14:28 -07:00
|
|
|
'default': False,
|
|
|
|
'deprecated': True,
|
|
|
|
'policy_type': REPL_POLICY
|
|
|
|
},
|
|
|
|
(1, False): {
|
|
|
|
'name': 'one',
|
2015-11-05 23:04:14 +13:00
|
|
|
'aliases': 'one, tahi, uno',
|
2014-06-30 11:14:28 -07:00
|
|
|
'deprecated': True,
|
|
|
|
},
|
|
|
|
# enabled ec
|
|
|
|
(10, True): {
|
|
|
|
'name': 'ten',
|
2015-11-05 23:04:14 +13:00
|
|
|
'aliases': 'ten',
|
2014-06-30 11:14:28 -07:00
|
|
|
'default': False,
|
|
|
|
'deprecated': False,
|
|
|
|
'policy_type': EC_POLICY,
|
2015-10-23 13:16:33 +00:00
|
|
|
'ec_type': DEFAULT_TEST_EC_TYPE,
|
2014-06-30 11:14:28 -07:00
|
|
|
'ec_num_data_fragments': 10,
|
|
|
|
'ec_num_parity_fragments': 3,
|
|
|
|
'ec_object_segment_size': DEFAULT_EC_OBJECT_SEGMENT_SIZE,
|
2015-08-06 01:06:47 -07:00
|
|
|
'ec_duplication_factor': 1,
|
2014-06-30 11:14:28 -07:00
|
|
|
},
|
|
|
|
(10, False): {
|
|
|
|
'name': 'ten',
|
2015-11-05 23:04:14 +13:00
|
|
|
'aliases': 'ten',
|
2014-06-30 11:14:28 -07:00
|
|
|
},
|
|
|
|
# deprecated ec
|
|
|
|
(11, True): {
|
|
|
|
'name': 'done',
|
2015-11-05 23:04:14 +13:00
|
|
|
'aliases': 'done',
|
2014-06-30 11:14:28 -07:00
|
|
|
'default': False,
|
|
|
|
'deprecated': True,
|
|
|
|
'policy_type': EC_POLICY,
|
2015-10-23 13:16:33 +00:00
|
|
|
'ec_type': DEFAULT_TEST_EC_TYPE,
|
2014-06-30 11:14:28 -07:00
|
|
|
'ec_num_data_fragments': 10,
|
|
|
|
'ec_num_parity_fragments': 3,
|
|
|
|
'ec_object_segment_size': DEFAULT_EC_OBJECT_SEGMENT_SIZE,
|
2015-08-06 01:06:47 -07:00
|
|
|
'ec_duplication_factor': 1,
|
2014-06-30 11:14:28 -07:00
|
|
|
},
|
|
|
|
(11, False): {
|
|
|
|
'name': 'done',
|
2015-11-05 23:04:14 +13:00
|
|
|
'aliases': 'done',
|
2014-06-30 11:14:28 -07:00
|
|
|
'deprecated': True,
|
|
|
|
},
|
2015-08-06 01:06:47 -07:00
|
|
|
# enabled ec with ec_duplication
|
|
|
|
(12, True): {
|
|
|
|
'name': 'twelve',
|
|
|
|
'aliases': 'twelve',
|
|
|
|
'default': False,
|
|
|
|
'deprecated': False,
|
|
|
|
'policy_type': EC_POLICY,
|
|
|
|
'ec_type': DEFAULT_TEST_EC_TYPE,
|
|
|
|
'ec_num_data_fragments': 10,
|
|
|
|
'ec_num_parity_fragments': 3,
|
|
|
|
'ec_object_segment_size': DEFAULT_EC_OBJECT_SEGMENT_SIZE,
|
|
|
|
'ec_duplication_factor': 2,
|
|
|
|
},
|
|
|
|
(12, False): {
|
|
|
|
'name': 'twelve',
|
|
|
|
'aliases': 'twelve',
|
|
|
|
},
|
2014-06-30 11:14:28 -07:00
|
|
|
}
|
|
|
|
self.maxDiff = None
|
|
|
|
for policy in policies:
|
|
|
|
expected_info = expected[(int(policy), True)]
|
|
|
|
self.assertEqual(policy.get_info(config=True), expected_info)
|
|
|
|
expected_info = expected[(int(policy), False)]
|
|
|
|
self.assertEqual(policy.get_info(config=False), expected_info)
|
2015-02-25 17:33:44 +00:00
|
|
|
|
2016-07-20 18:16:27 -07:00
|
|
|
def test_ec_fragment_size_cached(self):
|
|
|
|
policy = ECStoragePolicy(
|
|
|
|
0, 'ec2-1', ec_type=DEFAULT_TEST_EC_TYPE,
|
|
|
|
ec_ndata=2, ec_nparity=1, object_ring=FakeRing(replicas=3),
|
|
|
|
ec_segment_size=DEFAULT_EC_OBJECT_SEGMENT_SIZE, is_default=True)
|
|
|
|
|
|
|
|
ec_driver = ECDriver(ec_type=DEFAULT_TEST_EC_TYPE,
|
|
|
|
k=2, m=1)
|
|
|
|
expected_fragment_size = ec_driver.get_segment_info(
|
|
|
|
DEFAULT_EC_OBJECT_SEGMENT_SIZE,
|
|
|
|
DEFAULT_EC_OBJECT_SEGMENT_SIZE)['fragment_size']
|
|
|
|
|
|
|
|
with mock.patch.object(
|
|
|
|
policy.pyeclib_driver, 'get_segment_info') as fake:
|
|
|
|
fake.return_value = {
|
|
|
|
'fragment_size': expected_fragment_size}
|
|
|
|
|
|
|
|
for x in range(10):
|
|
|
|
self.assertEqual(expected_fragment_size,
|
|
|
|
policy.fragment_size)
|
|
|
|
# pyeclib_driver.get_segment_info is called only once
|
|
|
|
self.assertEqual(1, fake.call_count)
|
|
|
|
|
2014-03-17 12:18:25 -07:00
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
unittest.main()
|