# Copyright (c) 2010-2012 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import array
import collections
import six.moves.cPickle as pickle
import os
import unittest
import stat
from contextlib import closing
from gzip import GzipFile
from tempfile import mkdtemp
from shutil import rmtree
from time import sleep, time
import sys
import copy
import mock

from six.moves import range

from swift.common import ring, utils
from swift.common.ring import utils as ring_utils

class TestRingBase(unittest.TestCase):
    longMessage = True

    def setUp(self):
        self._orig_hash_suffix = utils.HASH_PATH_SUFFIX
        self._orig_hash_prefix = utils.HASH_PATH_PREFIX
        utils.HASH_PATH_SUFFIX = b'endcap'
        utils.HASH_PATH_PREFIX = b''

    def tearDown(self):
        utils.HASH_PATH_SUFFIX = self._orig_hash_suffix
        utils.HASH_PATH_PREFIX = self._orig_hash_prefix


class TestRingData(unittest.TestCase):

    def setUp(self):
        self.testdir = os.path.join(os.path.dirname(__file__), 'ring_data')
        rmtree(self.testdir, ignore_errors=1)
        os.mkdir(self.testdir)

    def tearDown(self):
        rmtree(self.testdir, ignore_errors=1)

    def assert_ring_data_equal(self, rd_expected, rd_got):
        self.assertEqual(rd_expected._replica2part2dev_id,
                         rd_got._replica2part2dev_id)
        self.assertEqual(rd_expected.devs, rd_got.devs)
        self.assertEqual(rd_expected._part_shift, rd_got._part_shift)

    def test_attrs(self):
        r2p2d = [[0, 1, 0, 1], [0, 1, 0, 1]]
        d = [{'id': 0, 'zone': 0, 'region': 0, 'ip': '10.1.1.0', 'port': 7000},
             {'id': 1, 'zone': 1, 'region': 1, 'ip': '10.1.1.1', 'port': 7000}]
        s = 30
        rd = ring.RingData(r2p2d, d, s)
        self.assertEqual(rd._replica2part2dev_id, r2p2d)
        self.assertEqual(rd.devs, d)
        self.assertEqual(rd._part_shift, s)

    def test_can_load_pickled_ring_data(self):
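        # Older ring files were just a pickled RingData inside the gzip;
        # exercise loading those across pickle protocols. The assertions
        # below also show load() backfilling a default 'region': 1 on
        # devices that were saved without a region key.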
        rd = ring.RingData(
            [[0, 1, 0, 1], [0, 1, 0, 1]],
            [{'id': 0, 'zone': 0, 'ip': '10.1.1.0', 'port': 7000},
             {'id': 1, 'zone': 1, 'ip': '10.1.1.1', 'port': 7000}],
            30)
        ring_fname = os.path.join(self.testdir, 'foo.ring.gz')
        for p in range(pickle.HIGHEST_PROTOCOL):
            with closing(GzipFile(ring_fname, 'wb')) as f:
                pickle.dump(rd, f, protocol=p)
            meta_only = ring.RingData.load(ring_fname, metadata_only=True)
            self.assertEqual([
                {'id': 0, 'zone': 0, 'region': 1, 'ip': '10.1.1.0',
                 'port': 7000},
                {'id': 1, 'zone': 1, 'region': 1, 'ip': '10.1.1.1',
                 'port': 7000},
            ], meta_only.devs)
            # Pickled rings can't load only metadata, so you get it all
            self.assert_ring_data_equal(rd, meta_only)
            ring_data = ring.RingData.load(ring_fname)
            self.assert_ring_data_equal(rd, ring_data)

    def test_roundtrip_serialization(self):
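        # save()/load() round trip. metadata_only=True still returns the
        # devs (with the default region backfilled) but leaves the
        # replica-to-device table empty.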
        ring_fname = os.path.join(self.testdir, 'foo.ring.gz')
        rd = ring.RingData(
            [array.array('H', [0, 1, 0, 1]), array.array('H', [0, 1, 0, 1])],
            [{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
        rd.save(ring_fname)
        meta_only = ring.RingData.load(ring_fname, metadata_only=True)
        self.assertEqual([
            {'id': 0, 'zone': 0, 'region': 1},
            {'id': 1, 'zone': 1, 'region': 1},
        ], meta_only.devs)
        self.assertEqual([], meta_only._replica2part2dev_id)
        rd2 = ring.RingData.load(ring_fname)
        self.assert_ring_data_equal(rd, rd2)

    def test_byteswapped_serialization(self):
        # Manually byte swap a ring and write it out, claiming it was written
        # on a different endian machine. Then read it back in and see if it's
        # the same as the non-byte swapped original.

        ring_fname = os.path.join(self.testdir, 'foo.ring.gz')
        data = [array.array('H', [0, 1, 0, 1]), array.array('H', [0, 1, 0, 1])]
        swapped_data = copy.deepcopy(data)
        for x in swapped_data:
            x.byteswap()

        with mock.patch.object(sys, 'byteorder',
                               'big' if sys.byteorder == 'little'
                               else 'little'):
            rds = ring.RingData(swapped_data,
                                [{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}],
                                30)
            rds.save(ring_fname)

        rd1 = ring.RingData(data, [{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}],
                            30)
        rd2 = ring.RingData.load(ring_fname)
        self.assert_ring_data_equal(rd1, rd2)

    def test_deterministic_serialization(self):
        """
        Two identical rings should produce identical .gz files on disk.
        """
        os.mkdir(os.path.join(self.testdir, '1'))
        os.mkdir(os.path.join(self.testdir, '2'))
        # These have to have the same filename (not full path,
        # obviously) since the filename gets encoded in the gzip data.
        ring_fname1 = os.path.join(self.testdir, '1', 'the.ring.gz')
        ring_fname2 = os.path.join(self.testdir, '2', 'the.ring.gz')
        rd = ring.RingData(
            [array.array('H', [0, 1, 0, 1]), array.array('H', [0, 1, 0, 1])],
            [{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
        rd.save(ring_fname1)
        rd.save(ring_fname2)
        with open(ring_fname1, 'rb') as ring1:
            with open(ring_fname2, 'rb') as ring2:
                self.assertEqual(ring1.read(), ring2.read())

    def test_permissions(self):
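        # rd.save() should write the ring file with mode 0644
        # (owner read/write, group and other read).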
        ring_fname = os.path.join(self.testdir, 'stat.ring.gz')
        rd = ring.RingData(
            [array.array('H', [0, 1, 0, 1]), array.array('H', [0, 1, 0, 1])],
            [{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
        rd.save(ring_fname)
        ring_mode = stat.S_IMODE(os.stat(ring_fname).st_mode)
        expected_mode = (stat.S_IRUSR | stat.S_IWUSR |
                         stat.S_IRGRP | stat.S_IROTH)
        self.assertEqual(
            ring_mode, expected_mode,
            'Ring has mode 0%o, expected 0%o' % (ring_mode, expected_mode))

    def test_replica_count(self):
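        # replica_count is the average number of device assignments per
        # partition: two full rows of 4 give 8 / 4 = 2, while a full row
        # plus a 3-entry row gives (4 + 3) / 4 = 1.75.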
        rd = ring.RingData(
            [[0, 1, 0, 1], [0, 1, 0, 1]],
            [{'id': 0, 'zone': 0, 'ip': '10.1.1.0', 'port': 7000},
             {'id': 1, 'zone': 1, 'ip': '10.1.1.1', 'port': 7000}],
            30)
        self.assertEqual(rd.replica_count, 2)

        rd = ring.RingData(
            [[0, 1, 0, 1], [0, 1, 0]],
            [{'id': 0, 'zone': 0, 'ip': '10.1.1.0', 'port': 7000},
             {'id': 1, 'zone': 1, 'ip': '10.1.1.1', 'port': 7000}],
            30)
        self.assertEqual(rd.replica_count, 1.75)


class TestRing(TestRingBase):

    def setUp(self):
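        # Fixture: three replica rows of four partitions each (a part shift
        # of 30 over 32-bit hashes leaves 2 ** (32 - 30) = 4 partitions),
        # with device id 2 left as a None hole to exercise sparse dev lists.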
        super(TestRing, self).setUp()
        self.testdir = mkdtemp()
        self.testgz = os.path.join(self.testdir, 'whatever.ring.gz')
        self.intended_replica2part2dev_id = [
            array.array('H', [0, 1, 0, 1]),
            array.array('H', [0, 1, 0, 1]),
            array.array('H', [3, 4, 3, 4])]
        self.intended_devs = [{'id': 0, 'region': 0, 'zone': 0, 'weight': 1.0,
                               'ip': '10.1.1.1', 'port': 6200,
                               'replication_ip': '10.1.0.1',
                               'replication_port': 6066},
                              {'id': 1, 'region': 0, 'zone': 0, 'weight': 1.0,
                               'ip': '10.1.1.1', 'port': 6200,
                               'replication_ip': '10.1.0.2',
                               'replication_port': 6066},
                              None,
                              {'id': 3, 'region': 0, 'zone': 2, 'weight': 1.0,
                               'ip': '10.1.2.1', 'port': 6200,
                               'replication_ip': '10.2.0.1',
                               'replication_port': 6066},
                              {'id': 4, 'region': 0, 'zone': 2, 'weight': 1.0,
                               'ip': '10.1.2.2', 'port': 6200,
                               'replication_ip': '10.2.0.1',
                               'replication_port': 6066}]
        self.intended_part_shift = 30
        self.intended_reload_time = 15
        ring.RingData(
            self.intended_replica2part2dev_id,
            self.intended_devs, self.intended_part_shift).save(self.testgz)
        self.ring = ring.Ring(
            self.testdir,
            reload_time=self.intended_reload_time, ring_name='whatever')

    def tearDown(self):
        super(TestRing, self).tearDown()
        rmtree(self.testdir, ignore_errors=1)

    def test_creation(self):
        self.assertEqual(self.ring._replica2part2dev_id,
                         self.intended_replica2part2dev_id)
        self.assertEqual(self.ring._part_shift, self.intended_part_shift)
        self.assertEqual(self.ring.devs, self.intended_devs)
        self.assertEqual(self.ring.reload_time, self.intended_reload_time)
        self.assertEqual(self.ring.serialized_path, self.testgz)
        # test invalid endcap
        with mock.patch.object(utils, 'HASH_PATH_SUFFIX', b''), \
                mock.patch.object(utils, 'HASH_PATH_PREFIX', b''), \
                mock.patch.object(utils, 'SWIFT_CONF_FILE', ''):
            self.assertRaises(SystemExit, ring.Ring, self.testdir, 'whatever')

    def test_replica_count(self):
        self.assertEqual(self.ring.replica_count, 3)
        self.ring._replica2part2dev_id.append([0])
        self.assertEqual(self.ring.replica_count, 3.25)

    def test_has_changed(self):
        self.assertFalse(self.ring.has_changed())
        os.utime(self.testgz, (time() + 60, time() + 60))
        self.assertTrue(self.ring.has_changed())

    def test_reload(self):
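        # The ring file is re-read lazily: each block below backdates the
        # file's mtime, saves a ring with one more device, waits past
        # reload_time, and checks that a lookup (get_nodes, get_part_nodes,
        # get_more_nodes) or simply touching .devs picks up the new device.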
        os.utime(self.testgz, (time() - 300, time() - 300))
        self.ring = ring.Ring(self.testdir, reload_time=0.001,
                              ring_name='whatever')
        orig_mtime = self.ring._mtime
        self.assertEqual(len(self.ring.devs), 5)
        self.intended_devs.append(
            {'id': 3, 'region': 0, 'zone': 3, 'weight': 1.0,
             'ip': '10.1.1.1', 'port': 9876})
        ring.RingData(
            self.intended_replica2part2dev_id,
            self.intended_devs, self.intended_part_shift).save(self.testgz)
        sleep(0.1)
        self.ring.get_nodes('a')
        self.assertEqual(len(self.ring.devs), 6)
        self.assertNotEqual(self.ring._mtime, orig_mtime)

        os.utime(self.testgz, (time() - 300, time() - 300))
        self.ring = ring.Ring(self.testdir, reload_time=0.001,
                              ring_name='whatever')
        orig_mtime = self.ring._mtime
        self.assertEqual(len(self.ring.devs), 6)
        self.intended_devs.append(
            {'id': 5, 'region': 0, 'zone': 4, 'weight': 1.0,
             'ip': '10.5.5.5', 'port': 9876})
        ring.RingData(
            self.intended_replica2part2dev_id,
            self.intended_devs, self.intended_part_shift).save(self.testgz)
        sleep(0.1)
        self.ring.get_part_nodes(0)
        self.assertEqual(len(self.ring.devs), 7)
        self.assertNotEqual(self.ring._mtime, orig_mtime)

        os.utime(self.testgz, (time() - 300, time() - 300))
        self.ring = ring.Ring(self.testdir, reload_time=0.001,
                              ring_name='whatever')
        orig_mtime = self.ring._mtime
        part, nodes = self.ring.get_nodes('a')
        self.assertEqual(len(self.ring.devs), 7)
        self.intended_devs.append(
            {'id': 6, 'region': 0, 'zone': 5, 'weight': 1.0,
             'ip': '10.6.6.6', 'port': 6200})
        ring.RingData(
            self.intended_replica2part2dev_id,
            self.intended_devs, self.intended_part_shift).save(self.testgz)
        sleep(0.1)
        next(self.ring.get_more_nodes(part))
        self.assertEqual(len(self.ring.devs), 8)
        self.assertNotEqual(self.ring._mtime, orig_mtime)

        os.utime(self.testgz, (time() - 300, time() - 300))
        self.ring = ring.Ring(self.testdir, reload_time=0.001,
                              ring_name='whatever')
        orig_mtime = self.ring._mtime
        self.assertEqual(len(self.ring.devs), 8)
        self.intended_devs.append(
            {'id': 5, 'region': 0, 'zone': 4, 'weight': 1.0,
             'ip': '10.5.5.5', 'port': 6200})
        ring.RingData(
            self.intended_replica2part2dev_id,
            self.intended_devs, self.intended_part_shift).save(self.testgz)
        sleep(0.1)
        self.assertEqual(len(self.ring.devs), 9)
        self.assertNotEqual(self.ring._mtime, orig_mtime)

    def test_reload_without_replication(self):
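        # Devices saved without replication_ip/replication_port should come
        # back with those fields defaulted to the device's own ip and port.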
        replication_less_devs = [{'id': 0, 'region': 0, 'zone': 0,
                                  'weight': 1.0, 'ip': '10.1.1.1',
                                  'port': 6200},
                                 {'id': 1, 'region': 0, 'zone': 0,
                                  'weight': 1.0, 'ip': '10.1.1.1',
                                  'port': 6200},
                                 None,
                                 {'id': 3, 'region': 0, 'zone': 2,
                                  'weight': 1.0, 'ip': '10.1.2.1',
                                  'port': 6200},
                                 {'id': 4, 'region': 0, 'zone': 2,
                                  'weight': 1.0, 'ip': '10.1.2.2',
                                  'port': 6200}]
        intended_devs = [{'id': 0, 'region': 0, 'zone': 0, 'weight': 1.0,
                          'ip': '10.1.1.1', 'port': 6200,
                          'replication_ip': '10.1.1.1',
                          'replication_port': 6200},
                         {'id': 1, 'region': 0, 'zone': 0, 'weight': 1.0,
                          'ip': '10.1.1.1', 'port': 6200,
                          'replication_ip': '10.1.1.1',
                          'replication_port': 6200},
                         None,
                         {'id': 3, 'region': 0, 'zone': 2, 'weight': 1.0,
                          'ip': '10.1.2.1', 'port': 6200,
                          'replication_ip': '10.1.2.1',
                          'replication_port': 6200},
                         {'id': 4, 'region': 0, 'zone': 2, 'weight': 1.0,
                          'ip': '10.1.2.2', 'port': 6200,
                          'replication_ip': '10.1.2.2',
                          'replication_port': 6200}]
        testgz = os.path.join(self.testdir, 'without_replication.ring.gz')
        ring.RingData(
            self.intended_replica2part2dev_id,
            replication_less_devs, self.intended_part_shift).save(testgz)
        self.ring = ring.Ring(
            self.testdir,
            reload_time=self.intended_reload_time,
            ring_name='without_replication')
        self.assertEqual(self.ring.devs, intended_devs)

    def test_reload_old_style_pickled_ring(self):
        devs = [{'id': 0, 'zone': 0,
                 'weight': 1.0, 'ip': '10.1.1.1',
                 'port': 6200},
                {'id': 1, 'zone': 0,
                 'weight': 1.0, 'ip': '10.1.1.1',
                 'port': 6200},
                None,
                {'id': 3, 'zone': 2,
                 'weight': 1.0, 'ip': '10.1.2.1',
                 'port': 6200},
                {'id': 4, 'zone': 2,
                 'weight': 1.0, 'ip': '10.1.2.2',
                 'port': 6200}]
        intended_devs = [{'id': 0, 'region': 1, 'zone': 0, 'weight': 1.0,
                          'ip': '10.1.1.1', 'port': 6200,
                          'replication_ip': '10.1.1.1',
                          'replication_port': 6200},
                         {'id': 1, 'region': 1, 'zone': 0, 'weight': 1.0,
                          'ip': '10.1.1.1', 'port': 6200,
                          'replication_ip': '10.1.1.1',
                          'replication_port': 6200},
                         None,
                         {'id': 3, 'region': 1, 'zone': 2, 'weight': 1.0,
                          'ip': '10.1.2.1', 'port': 6200,
                          'replication_ip': '10.1.2.1',
                          'replication_port': 6200},
                         {'id': 4, 'region': 1, 'zone': 2, 'weight': 1.0,
                          'ip': '10.1.2.2', 'port': 6200,
                          'replication_ip': '10.1.2.2',
                          'replication_port': 6200}]

        # simulate an old-style pickled ring
        testgz = os.path.join(self.testdir,
                              'without_replication_or_region.ring.gz')
        ring_data = ring.RingData(self.intended_replica2part2dev_id,
                                  devs,
                                  self.intended_part_shift)
        # an old-style pickled ring won't have region data
        for dev in ring_data.devs:
            if dev:
                del dev["region"]
        gz_file = GzipFile(testgz, 'wb')
        pickle.dump(ring_data, gz_file, protocol=2)
        gz_file.close()

        self.ring = ring.Ring(
            self.testdir,
            reload_time=self.intended_reload_time,
            ring_name='without_replication_or_region')
        self.assertEqual(self.ring.devs, intended_devs)

    def test_get_part(self):
        part1 = self.ring.get_part('a')
        nodes1 = self.ring.get_part_nodes(part1)
        part2, nodes2 = self.ring.get_nodes('a')
        self.assertEqual(part1, part2)
        self.assertEqual(nodes1, nodes2)

    def test_get_part_nodes(self):
        part, nodes = self.ring.get_nodes('a')
        self.assertEqual(nodes, self.ring.get_part_nodes(part))

    def test_get_nodes(self):
|
|
|
|
# Yes, these tests are deliberately very fragile. We want to make sure
|
|
|
|
# that if someones changes the results the ring produces, they know it.
|
|
|
|
self.assertRaises(TypeError, self.ring.get_nodes)
|
|
|
|
part, nodes = self.ring.get_nodes('a')
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual(part, 0)
|
|
|
|
self.assertEqual(nodes, [dict(node, index=i) for i, node in
|
|
|
|
enumerate([self.intended_devs[0],
|
|
|
|
self.intended_devs[3]])])
|
As-unique-as-possible partition replica placement.
This commit introduces a new algorithm for assigning partition
replicas to devices. Basically, the ring builder organizes the devices
into tiers (first zone, then IP/port, then device ID). When placing a
replica, the ring builder looks for the emptiest device (biggest
parts_wanted) in the furthest-away tier.
In the case where zone-count >= replica-count, the new algorithm will
give the same results as the one it replaces. Thus, no migration is
needed.
In the case where zone-count < replica-count, the new algorithm
behaves differently from the old algorithm. The new algorithm will
distribute things evenly at each tier so that the replication is as
high-quality as possible, given the circumstances. The old algorithm
would just crash, so again, no migration is needed.
Handoffs have also been updated to use the new algorithm. When
generating handoff nodes, first the ring looks for nodes in other
zones, then other ips/ports, then any other drive. The first handoff
nodes (the ones in other zones) will be the same as before; this
commit just extends the list of handoff nodes.
The proxy server and replicators have been altered to avoid looking at
the ring's replica count directly. Previously, with a replica count of
C, RingData.get_nodes() and RingData.get_part_nodes() would return
lists of length C, so some other code used the replica count when it
needed the number of nodes. If two of a partition's replicas are on
the same device (e.g. with 3 replicas, 2 devices), then that
assumption is no longer true. Fortunately, all the proxy server and
replicators really needed was the number of nodes returned, which they
already had. (Bonus: now the only code that mentions replica_count
directly is in the ring and the ring builder.)
Change-Id: Iba2929edfc6ece89791890d0635d4763d821a3aa
2012-04-23 10:41:44 -07:00
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
part, nodes = self.ring.get_nodes('a1')
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual(part, 0)
|
|
|
|
self.assertEqual(nodes, [dict(node, index=i) for i, node in
|
|
|
|
enumerate([self.intended_devs[0],
|
|
|
|
self.intended_devs[3]])])
|
As-unique-as-possible partition replica placement.
This commit introduces a new algorithm for assigning partition
replicas to devices. Basically, the ring builder organizes the devices
into tiers (first zone, then IP/port, then device ID). When placing a
replica, the ring builder looks for the emptiest device (biggest
parts_wanted) in the furthest-away tier.
In the case where zone-count >= replica-count, the new algorithm will
give the same results as the one it replaces. Thus, no migration is
needed.
In the case where zone-count < replica-count, the new algorithm
behaves differently from the old algorithm. The new algorithm will
distribute things evenly at each tier so that the replication is as
high-quality as possible, given the circumstances. The old algorithm
would just crash, so again, no migration is needed.
Handoffs have also been updated to use the new algorithm. When
generating handoff nodes, first the ring looks for nodes in other
zones, then other ips/ports, then any other drive. The first handoff
nodes (the ones in other zones) will be the same as before; this
commit just extends the list of handoff nodes.
The proxy server and replicators have been altered to avoid looking at
the ring's replica count directly. Previously, with a replica count of
C, RingData.get_nodes() and RingData.get_part_nodes() would return
lists of length C, so some other code used the replica count when it
needed the number of nodes. If two of a partition's replicas are on
the same device (e.g. with 3 replicas, 2 devices), then that
assumption is no longer true. Fortunately, all the proxy server and
replicators really needed was the number of nodes returned, which they
already had. (Bonus: now the only code that mentions replica_count
directly is in the ring and the ring builder.)
Change-Id: Iba2929edfc6ece89791890d0635d4763d821a3aa
2012-04-23 10:41:44 -07:00
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
part, nodes = self.ring.get_nodes('a4')
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual(part, 1)
|
|
|
|
self.assertEqual(nodes, [dict(node, index=i) for i, node in
|
|
|
|
enumerate([self.intended_devs[1],
|
|
|
|
self.intended_devs[4]])])
|
As-unique-as-possible partition replica placement.
This commit introduces a new algorithm for assigning partition
replicas to devices. Basically, the ring builder organizes the devices
into tiers (first zone, then IP/port, then device ID). When placing a
replica, the ring builder looks for the emptiest device (biggest
parts_wanted) in the furthest-away tier.
In the case where zone-count >= replica-count, the new algorithm will
give the same results as the one it replaces. Thus, no migration is
needed.
In the case where zone-count < replica-count, the new algorithm
behaves differently from the old algorithm. The new algorithm will
distribute things evenly at each tier so that the replication is as
high-quality as possible, given the circumstances. The old algorithm
would just crash, so again, no migration is needed.
Handoffs have also been updated to use the new algorithm. When
generating handoff nodes, first the ring looks for nodes in other
zones, then other ips/ports, then any other drive. The first handoff
nodes (the ones in other zones) will be the same as before; this
commit just extends the list of handoff nodes.
The proxy server and replicators have been altered to avoid looking at
the ring's replica count directly. Previously, with a replica count of
C, RingData.get_nodes() and RingData.get_part_nodes() would return
lists of length C, so some other code used the replica count when it
needed the number of nodes. If two of a partition's replicas are on
the same device (e.g. with 3 replicas, 2 devices), then that
assumption is no longer true. Fortunately, all the proxy server and
replicators really needed was the number of nodes returned, which they
already had. (Bonus: now the only code that mentions replica_count
directly is in the ring and the ring builder.)
Change-Id: Iba2929edfc6ece89791890d0635d4763d821a3aa
2012-04-23 10:41:44 -07:00
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
part, nodes = self.ring.get_nodes('aa')
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual(part, 1)
|
|
|
|
self.assertEqual(nodes, [dict(node, index=i) for i, node in
|
|
|
|
enumerate([self.intended_devs[1],
|
|
|
|
self.intended_devs[4]])])
|
2010-07-12 17:03:45 -05:00
|
|
|
|
|
|
|
part, nodes = self.ring.get_nodes('a', 'c1')
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual(part, 0)
|
|
|
|
self.assertEqual(nodes, [dict(node, index=i) for i, node in
|
|
|
|
enumerate([self.intended_devs[0],
|
|
|
|
self.intended_devs[3]])])
|
As-unique-as-possible partition replica placement.
This commit introduces a new algorithm for assigning partition
replicas to devices. Basically, the ring builder organizes the devices
into tiers (first zone, then IP/port, then device ID). When placing a
replica, the ring builder looks for the emptiest device (biggest
parts_wanted) in the furthest-away tier.
In the case where zone-count >= replica-count, the new algorithm will
give the same results as the one it replaces. Thus, no migration is
needed.
In the case where zone-count < replica-count, the new algorithm
behaves differently from the old algorithm. The new algorithm will
distribute things evenly at each tier so that the replication is as
high-quality as possible, given the circumstances. The old algorithm
would just crash, so again, no migration is needed.
Handoffs have also been updated to use the new algorithm. When
generating handoff nodes, first the ring looks for nodes in other
zones, then other ips/ports, then any other drive. The first handoff
nodes (the ones in other zones) will be the same as before; this
commit just extends the list of handoff nodes.
The proxy server and replicators have been altered to avoid looking at
the ring's replica count directly. Previously, with a replica count of
C, RingData.get_nodes() and RingData.get_part_nodes() would return
lists of length C, so some other code used the replica count when it
needed the number of nodes. If two of a partition's replicas are on
the same device (e.g. with 3 replicas, 2 devices), then that
assumption is no longer true. Fortunately, all the proxy server and
replicators really needed was the number of nodes returned, which they
already had. (Bonus: now the only code that mentions replica_count
directly is in the ring and the ring builder.)
Change-Id: Iba2929edfc6ece89791890d0635d4763d821a3aa
2012-04-23 10:41:44 -07:00
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
part, nodes = self.ring.get_nodes('a', 'c0')
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual(part, 3)
|
|
|
|
self.assertEqual(nodes, [dict(node, index=i) for i, node in
|
|
|
|
enumerate([self.intended_devs[1],
|
|
|
|
self.intended_devs[4]])])
|
As-unique-as-possible partition replica placement.
This commit introduces a new algorithm for assigning partition
replicas to devices. Basically, the ring builder organizes the devices
into tiers (first zone, then IP/port, then device ID). When placing a
replica, the ring builder looks for the emptiest device (biggest
parts_wanted) in the furthest-away tier.
In the case where zone-count >= replica-count, the new algorithm will
give the same results as the one it replaces. Thus, no migration is
needed.
In the case where zone-count < replica-count, the new algorithm
behaves differently from the old algorithm. The new algorithm will
distribute things evenly at each tier so that the replication is as
high-quality as possible, given the circumstances. The old algorithm
would just crash, so again, no migration is needed.
Handoffs have also been updated to use the new algorithm. When
generating handoff nodes, first the ring looks for nodes in other
zones, then other ips/ports, then any other drive. The first handoff
nodes (the ones in other zones) will be the same as before; this
commit just extends the list of handoff nodes.
The proxy server and replicators have been altered to avoid looking at
the ring's replica count directly. Previously, with a replica count of
C, RingData.get_nodes() and RingData.get_part_nodes() would return
lists of length C, so some other code used the replica count when it
needed the number of nodes. If two of a partition's replicas are on
the same device (e.g. with 3 replicas, 2 devices), then that
assumption is no longer true. Fortunately, all the proxy server and
replicators really needed was the number of nodes returned, which they
already had. (Bonus: now the only code that mentions replica_count
directly is in the ring and the ring builder.)
Change-Id: Iba2929edfc6ece89791890d0635d4763d821a3aa
2012-04-23 10:41:44 -07:00
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
part, nodes = self.ring.get_nodes('a', 'c3')
|
2015-08-05 23:58:14 +05:30
|
|
|
self.assertEqual(part, 2)
|
|
|
|
self.assertEqual(nodes, [dict(node, index=i) for i, node in
|
|
|
|
enumerate([self.intended_devs[0],
|
|
|
|
self.intended_devs[3]])])
|
As-unique-as-possible partition replica placement.
This commit introduces a new algorithm for assigning partition
replicas to devices. Basically, the ring builder organizes the devices
into tiers (first zone, then IP/port, then device ID). When placing a
replica, the ring builder looks for the emptiest device (biggest
parts_wanted) in the furthest-away tier.
In the case where zone-count >= replica-count, the new algorithm will
give the same results as the one it replaces. Thus, no migration is
needed.
In the case where zone-count < replica-count, the new algorithm
behaves differently from the old algorithm. The new algorithm will
distribute things evenly at each tier so that the replication is as
high-quality as possible, given the circumstances. The old algorithm
would just crash, so again, no migration is needed.
Handoffs have also been updated to use the new algorithm. When
generating handoff nodes, first the ring looks for nodes in other
zones, then other ips/ports, then any other drive. The first handoff
nodes (the ones in other zones) will be the same as before; this
commit just extends the list of handoff nodes.
The proxy server and replicators have been altered to avoid looking at
the ring's replica count directly. Previously, with a replica count of
C, RingData.get_nodes() and RingData.get_part_nodes() would return
lists of length C, so some other code used the replica count when it
needed the number of nodes. If two of a partition's replicas are on
the same device (e.g. with 3 replicas, 2 devices), then that
assumption is no longer true. Fortunately, all the proxy server and
replicators really needed was the number of nodes returned, which they
already had. (Bonus: now the only code that mentions replica_count
directly is in the ring and the ring builder.)
Change-Id: Iba2929edfc6ece89791890d0635d4763d821a3aa
2012-04-23 10:41:44 -07:00
|
|
|
|
2010-07-12 17:03:45 -05:00
|
|
|
        part, nodes = self.ring.get_nodes('a', 'c2')
        self.assertEqual(nodes, [dict(node, index=i) for i, node in
                                 enumerate([self.intended_devs[0],
                                            self.intended_devs[3]])])

        part, nodes = self.ring.get_nodes('a', 'c', 'o1')
        self.assertEqual(part, 1)
        self.assertEqual(nodes, [dict(node, index=i) for i, node in
                                 enumerate([self.intended_devs[1],
                                            self.intended_devs[4]])])

        part, nodes = self.ring.get_nodes('a', 'c', 'o5')
        self.assertEqual(part, 0)
        self.assertEqual(nodes, [dict(node, index=i) for i, node in
                                 enumerate([self.intended_devs[0],
                                            self.intended_devs[3]])])

        part, nodes = self.ring.get_nodes('a', 'c', 'o0')
        self.assertEqual(part, 0)
        self.assertEqual(nodes, [dict(node, index=i) for i, node in
                                 enumerate([self.intended_devs[0],
                                            self.intended_devs[3]])])

        part, nodes = self.ring.get_nodes('a', 'c', 'o2')
        self.assertEqual(part, 2)
        self.assertEqual(nodes, [dict(node, index=i) for i, node in
                                 enumerate([self.intended_devs[0],
                                            self.intended_devs[3]])])

    def add_dev_to_ring(self, new_dev):
        self.ring.devs.append(new_dev)
        self.ring._rebuild_tier_data()

    @unittest.skipIf(sys.version_info >= (3,),
                     "Seed-specific tests don't work well on py3")
    def test_get_more_nodes(self):
        # Yes, these tests are deliberately very fragile. We want to make sure
        # that if someone changes the results the ring produces, they know it.
        exp_part = 6
        exp_devs = [71, 77, 30]
        exp_zones = set([6, 3, 7])

        exp_handoffs = [99, 43, 94, 13, 1, 49, 60, 72, 27, 68, 78, 26, 21, 9,
                        51, 105, 47, 89, 65, 82, 34, 98, 38, 85, 16, 4, 59,
                        102, 40, 90, 20, 8, 54, 66, 80, 25, 14, 2, 50, 12, 0,
                        48, 70, 76, 32, 107, 45, 87, 101, 44, 93, 100, 42, 95,
                        106, 46, 88, 97, 37, 86, 96, 36, 84, 17, 5, 57, 63,
                        81, 33, 67, 79, 24, 15, 3, 58, 69, 75, 31, 61, 74, 29,
                        23, 10, 52, 22, 11, 53, 64, 83, 35, 62, 73, 28, 18, 6,
                        56, 104, 39, 91, 103, 41, 92, 19, 7, 55]

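        # One entry per partition: the device id of the first handoff that
        # get_more_nodes() is expected to yield for that partition.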
        exp_first_handoffs = [23, 64, 105, 102, 67, 17, 99, 65, 69, 97, 15,
                              17, 24, 98, 66, 65, 69, 18, 104, 105, 16, 107,
                              100, 15, 14, 19, 102, 105, 63, 104, 99, 12, 107,
                              99, 16, 105, 71, 15, 15, 63, 63, 99, 21, 68, 20,
                              64, 96, 21, 98, 19, 68, 99, 15, 69, 62, 100, 96,
                              102, 17, 62, 13, 61, 102, 105, 22, 16, 21, 18,
                              21, 100, 20, 16, 21, 106, 66, 106, 16, 99, 16,
                              22, 62, 60, 99, 69, 18, 23, 104, 98, 106, 61,
                              21, 23, 23, 16, 67, 71, 101, 16, 64, 66, 70, 15,
                              102, 63, 19, 98, 18, 106, 101, 100, 62, 63, 98,
                              18, 13, 97, 23, 22, 100, 13, 14, 67, 96, 14,
                              105, 97, 71, 64, 96, 22, 65, 66, 98, 19, 105,
                              98, 97, 21, 15, 69, 100, 98, 106, 65, 66, 97,
                              62, 22, 68, 63, 61, 67, 67, 20, 105, 106, 105,
                              18, 71, 100, 17, 62, 60, 13, 103, 99, 101, 96,
                              97, 16, 60, 21, 14, 20, 12, 60, 69, 104, 65, 65,
                              17, 16, 67, 13, 64, 15, 16, 68, 96, 21, 104, 66,
                              96, 105, 58, 105, 103, 21, 96, 60, 16, 96, 21,
                              71, 16, 99, 101, 63, 62, 103, 18, 102, 60, 17,
                              19, 106, 97, 14, 99, 68, 102, 13, 70, 103, 21,
                              22, 19, 61, 103, 23, 104, 65, 62, 68, 16, 65,
                              15, 102, 102, 71, 99, 63, 67, 19, 23, 15, 69,
                              107, 14, 13, 64, 13, 105, 15, 98, 69]

        rb = ring.RingBuilder(8, 3, 1)
        next_dev_id = 0
        for zone in range(1, 10):
            for server in range(1, 5):
                for device in range(1, 4):
                    rb.add_dev({'id': next_dev_id,
                                'ip': '1.2.%d.%d' % (zone, server),
                                'port': 1234 + device,
                                'zone': zone, 'region': 0,
                                'weight': 1.0})
                    next_dev_id += 1
        rb.rebalance(seed=2)
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')

        # every part has the same number of handoffs
        part_handoff_counts = set()
        for part in range(r.partition_count):
            part_handoff_counts.add(len(list(r.get_more_nodes(part))))
        self.assertEqual(part_handoff_counts, {105})
        # which less the primaries - is every device in the ring
        self.assertEqual(len(list(rb._iter_devs())) - rb.replicas, 105)
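        # (Sanity on those numbers: 9 zones * 4 servers * 3 devices gives 108
        # devices, and 108 less the 3 primaries leaves the 105 handoffs
        # asserted above.)
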
        part, devs = r.get_nodes('a', 'c', 'o')
        primary_zones = set([d['zone'] for d in devs])
        self.assertEqual(part, exp_part)
        self.assertEqual([d['id'] for d in devs], exp_devs)
        self.assertEqual(primary_zones, exp_zones)
        devs = list(r.get_more_nodes(part))
        self.assertEqual(len(devs), len(exp_handoffs))
        dev_ids = [d['id'] for d in devs]
        self.assertEqual(dev_ids, exp_handoffs)

        # The first 6 replicas plus the 3 primary nodes should cover all 9
        # zones in this test
        seen_zones = set(primary_zones)
        seen_zones.update([d['zone'] for d in devs[:6]])
        self.assertEqual(seen_zones, set(range(1, 10)))

        # The first handoff nodes for each partition in the ring
        devs = []
        for part in range(r.partition_count):
            devs.append(next(r.get_more_nodes(part))['id'])
        self.assertEqual(devs, exp_first_handoffs)

        # Add a new device we can handoff to.
        zone = 5
        server = 0
        rb.add_dev({'id': next_dev_id,
                    'ip': '1.2.%d.%d' % (zone, server),
                    'port': 1234, 'zone': zone, 'region': 0, 'weight': 1.0})
        next_dev_id += 1
        rb.pretend_min_part_hours_passed()
        num_parts_changed, _balance, _removed_dev = rb.rebalance(seed=2)
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')

        # so now we expect the device list to be longer by one device
        part_handoff_counts = set()
        for part in range(r.partition_count):
            part_handoff_counts.add(len(list(r.get_more_nodes(part))))
        self.assertEqual(part_handoff_counts, {106})
        self.assertEqual(len(list(rb._iter_devs())) - rb.replicas, 106)
        # I don't think there's any special reason this dev goes at this index
        exp_handoffs.insert(27, rb.devs[-1]['id'])

        # We would change expectations here, but in this part only the added
        # device changed at all.
        part, devs = r.get_nodes('a', 'c', 'o')
        primary_zones = set([d['zone'] for d in devs])
        self.assertEqual(part, exp_part)
        self.assertEqual([d['id'] for d in devs], exp_devs)
        self.assertEqual(primary_zones, exp_zones)
        devs = list(r.get_more_nodes(part))
        dev_ids = [d['id'] for d in devs]
        self.assertEqual(len(dev_ids), len(exp_handoffs))
        for index, dev in enumerate(dev_ids):
            self.assertEqual(
                dev, exp_handoffs[index],
                'handoff differs at position %d\n%s\n%s' % (
                    index, dev_ids[index:], exp_handoffs[index:]))

        # The handoffs still cover all the non-primary zones first
        seen_zones = set(primary_zones)
        seen_zones.update([d['zone'] for d in devs[:6]])
        self.assertEqual(seen_zones, set(range(1, 10)))

        # Change expectations for the rest of the parts
        devs = []
        for part in range(r.partition_count):
            devs.append(next(r.get_more_nodes(part))['id'])
        changed_first_handoff = 0
        for part in range(r.partition_count):
            if devs[part] != exp_first_handoffs[part]:
                changed_first_handoff += 1
                exp_first_handoffs[part] = devs[part]
        self.assertEqual(devs, exp_first_handoffs)
        self.assertEqual(changed_first_handoff, num_parts_changed)

        # Remove a device - no need to fluff min_part_hours.
        rb.remove_dev(0)
        num_parts_changed, _balance, _removed_dev = rb.rebalance(seed=1)
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')

        # so now we expect the device list to be shorter by one device
        part_handoff_counts = set()
        for part in range(r.partition_count):
            part_handoff_counts.add(len(list(r.get_more_nodes(part))))
        self.assertEqual(part_handoff_counts, {105})
        self.assertEqual(len(list(rb._iter_devs())) - rb.replicas, 105)

        # Change expectations for our part
        exp_handoffs.remove(0)
        first_matches = 0
        total_changed = 0
        devs = list(d['id'] for d in r.get_more_nodes(exp_part))
        for i, part in enumerate(devs):
            if exp_handoffs[i] != devs[i]:
                total_changed += 1
                exp_handoffs[i] = devs[i]
            if not total_changed:
                first_matches += 1
        self.assertEqual(devs, exp_handoffs)
        # the first 21 handoffs were the same across the rebalance
        self.assertEqual(first_matches, 21)
        # but as you dig deeper some of the differences show up
        self.assertEqual(total_changed, 41)

        # Change expectations for the rest of the parts
        devs = []
        for part in range(r.partition_count):
            devs.append(next(r.get_more_nodes(part))['id'])
        changed_first_handoff = 0
        for part in range(r.partition_count):
            if devs[part] != exp_first_handoffs[part]:
                changed_first_handoff += 1
                exp_first_handoffs[part] = devs[part]
        self.assertEqual(devs, exp_first_handoffs)
        self.assertEqual(changed_first_handoff, num_parts_changed)

        # Test
        part, devs = r.get_nodes('a', 'c', 'o')
        primary_zones = set([d['zone'] for d in devs])
        self.assertEqual(part, exp_part)
        self.assertEqual([d['id'] for d in devs], exp_devs)
        self.assertEqual(primary_zones, exp_zones)
        devs = list(r.get_more_nodes(part))
        dev_ids = [d['id'] for d in devs]
        self.assertEqual(len(dev_ids), len(exp_handoffs))
        for index, dev in enumerate(dev_ids):
            self.assertEqual(
                dev, exp_handoffs[index],
                'handoff differs at position %d\n%s\n%s' % (
                    index, dev_ids[index:], exp_handoffs[index:]))

        seen_zones = set(primary_zones)
        seen_zones.update([d['zone'] for d in devs[:6]])
        self.assertEqual(seen_zones, set(range(1, 10)))

        devs = []
        for part in range(r.partition_count):
            devs.append(next(r.get_more_nodes(part))['id'])
        for part in range(r.partition_count):
            self.assertEqual(
                devs[part], exp_first_handoffs[part],
                'handoff for partition %d is now device id %d' % (
                    part, devs[part]))

        # Add a partial replica
        rb.set_replicas(3.5)
        num_parts_changed, _balance, _removed_dev = rb.rebalance(seed=164)
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')

        # Change expectations
        # We have another replica now
        exp_devs.append(90)
        exp_zones.add(8)
        # and therefore one less handoff
        exp_handoffs = exp_handoffs[:-1]
        # Caused some major changes in the sequence of handoffs for our test
        # partition, but at least the first stayed the same.
        devs = list(d['id'] for d in r.get_more_nodes(exp_part))
        first_matches = 0
        total_changed = 0
        for i, part in enumerate(devs):
            if exp_handoffs[i] != devs[i]:
                total_changed += 1
                exp_handoffs[i] = devs[i]
            if not total_changed:
                first_matches += 1
        # most seeds seem to throw out first handoff stabilization with
        # replica_count change
        self.assertEqual(first_matches, 2)
        # and lots of other handoff changes...
        self.assertEqual(total_changed, 95)

        self.assertEqual(devs, exp_handoffs)

        # Change expectations for the rest of the parts
        devs = []
        for part in range(r.partition_count):
            devs.append(next(r.get_more_nodes(part))['id'])
        changed_first_handoff = 0
        for part in range(r.partition_count):
            if devs[part] != exp_first_handoffs[part]:
                changed_first_handoff += 1
                exp_first_handoffs[part] = devs[part]
        self.assertEqual(devs, exp_first_handoffs)
        self.assertLessEqual(changed_first_handoff, num_parts_changed)

        # Test
        part, devs = r.get_nodes('a', 'c', 'o')
        primary_zones = set([d['zone'] for d in devs])
        self.assertEqual(part, exp_part)
        self.assertEqual([d['id'] for d in devs], exp_devs)
        self.assertEqual(primary_zones, exp_zones)
        devs = list(r.get_more_nodes(part))
        dev_ids = [d['id'] for d in devs]
        self.assertEqual(len(dev_ids), len(exp_handoffs))

        for index, dev in enumerate(dev_ids):
            self.assertEqual(
                dev, exp_handoffs[index],
                'handoff differs at position %d\n%s\n%s' % (
                    index, dev_ids[index:], exp_handoffs[index:]))

        seen_zones = set(primary_zones)
        seen_zones.update([d['zone'] for d in devs[:6]])
        self.assertEqual(seen_zones, set(range(1, 10)))

        devs = []
        for part in range(r.partition_count):
            devs.append(next(r.get_more_nodes(part))['id'])
        for part in range(r.partition_count):
            self.assertEqual(
                devs[part], exp_first_handoffs[part],
                'handoff for partition %d is now device id %d' % (
                    part, devs[part]))

        # One last test of a partial replica partition
        exp_part2 = 136
        exp_devs2 = [70, 76, 32]
        exp_zones2 = set([3, 6, 7])
        exp_handoffs2 = [89, 97, 37, 53, 20, 1, 86, 64, 102, 40, 90, 60, 72,
                         27, 99, 68, 78, 26, 105, 45, 42, 95, 22, 13, 49, 55,
                         11, 8, 83, 16, 4, 59, 33, 108, 61, 74, 29, 88, 66,
                         80, 25, 100, 39, 67, 79, 24, 65, 96, 36, 84, 54, 21,
                         63, 81, 56, 71, 77, 30, 48, 23, 10, 52, 82, 34, 17,
                         107, 87, 104, 5, 35, 2, 50, 43, 62, 73, 28, 18, 14,
                         98, 38, 85, 15, 57, 9, 51, 12, 6, 91, 3, 103, 41, 92,
                         47, 75, 44, 69, 101, 93, 106, 46, 94, 31, 19, 7, 58]

        part2, devs2 = r.get_nodes('a', 'c', 'o2')
        primary_zones2 = set([d['zone'] for d in devs2])
        self.assertEqual(part2, exp_part2)
        self.assertEqual([d['id'] for d in devs2], exp_devs2)
        self.assertEqual(primary_zones2, exp_zones2)
        devs2 = list(r.get_more_nodes(part2))
        dev_ids2 = [d['id'] for d in devs2]

        self.assertEqual(len(dev_ids2), len(exp_handoffs2))
        for index, dev in enumerate(dev_ids2):
            self.assertEqual(
                dev, exp_handoffs2[index],
                'handoff differs at position %d\n%s\n%s' % (
                    index, dev_ids2[index:], exp_handoffs2[index:]))

        seen_zones = set(primary_zones2)
        seen_zones.update([d['zone'] for d in devs2[:6]])
        self.assertEqual(seen_zones, set(range(1, 10)))

        # Test distribution across regions
        rb.set_replicas(3)
        for region in range(1, 5):
            rb.add_dev({'id': next_dev_id,
                        'ip': '1.%d.1.%d' % (region, server), 'port': 1234,
                        # 108.0 is the weight of all devices created prior to
                        # this test in region 0; this way all regions have
                        # equal combined weight
                        'zone': 1, 'region': region, 'weight': 108.0})
            next_dev_id += 1
        rb.pretend_min_part_hours_passed()
        rb.rebalance(seed=1)
        rb.pretend_min_part_hours_passed()
        rb.rebalance(seed=1)
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')

        # There's 5 regions now, so the primary nodes + first 2 handoffs
        # should span all 5 regions
        part, devs = r.get_nodes('a1', 'c1', 'o1')
        primary_regions = set([d['region'] for d in devs])
        primary_zones = set([(d['region'], d['zone']) for d in devs])
        more_devs = list(r.get_more_nodes(part))

        seen_regions = set(primary_regions)
        seen_regions.update([d['region'] for d in more_devs[:2]])
        self.assertEqual(seen_regions, set(range(0, 5)))

        # There are 13 zones now, so the first 13 nodes should all have
        # distinct zones (that's r0z1, ..., r0z9, r1z1, r2z1, r3z1, and
        # r4z1).
        seen_zones = set(primary_zones)
        seen_zones.update([(d['region'], d['zone']) for d in more_devs[:10]])
        self.assertEqual(13, len(seen_zones))

        # Here's a brittle canary-in-the-coalmine test to make sure the region
        # handoff computation didn't change accidentally
        exp_handoffs = [111, 112, 35, 58, 62, 74, 20, 105, 41, 90, 53, 6, 3,
                        67, 55, 76, 108, 32, 12, 80, 38, 85, 94, 42, 27, 99,
                        50, 47, 70, 87, 26, 9, 15, 97, 102, 81, 23, 65, 33,
                        77, 34, 4, 75, 8, 5, 30, 13, 73, 36, 92, 54, 51, 72,
                        78, 66, 1, 48, 14, 93, 95, 88, 86, 84, 106, 60, 101,
                        57, 43, 89, 59, 79, 46, 61, 52, 44, 45, 37, 68, 25,
                        100, 49, 24, 16, 71, 96, 21, 107, 98, 64, 39, 18, 29,
                        103, 91, 22, 63, 69, 28, 56, 11, 82, 10, 17, 19, 7,
                        40, 83, 104, 31]
        dev_ids = [d['id'] for d in more_devs]

        self.assertEqual(len(dev_ids), len(exp_handoffs))
        for index, dev_id in enumerate(dev_ids):
            self.assertEqual(
                dev_id, exp_handoffs[index],
                'handoff differs at position %d\n%s\n%s' % (
                    index, dev_ids[index:], exp_handoffs[index:]))

    def test_get_more_nodes_with_zero_weight_region(self):
        rb = ring.RingBuilder(8, 3, 1)
        devs = [
            ring_utils.parse_add_value(v) for v in [
                'r1z1-127.0.0.1:6200/d1',
                'r1z1-127.0.0.1:6201/d2',
                'r1z1-127.0.0.1:6202/d3',
                'r1z1-127.0.0.1:6203/d4',
                'r1z2-127.0.0.2:6200/d1',
                'r1z2-127.0.0.2:6201/d2',
                'r1z2-127.0.0.2:6202/d3',
                'r1z2-127.0.0.2:6203/d4',
                'r2z1-127.0.1.1:6200/d1',
                'r2z1-127.0.1.1:6201/d2',
                'r2z1-127.0.1.1:6202/d3',
                'r2z1-127.0.1.1:6203/d4',
                'r2z2-127.0.1.2:6200/d1',
                'r2z2-127.0.1.2:6201/d2',
                'r2z2-127.0.1.2:6202/d3',
                'r2z2-127.0.1.2:6203/d4',
            ]
        ]
        for dev in devs:
            if dev['region'] == 2:
                dev['weight'] = 0.0
            else:
                dev['weight'] = 1.0
            rb.add_dev(dev)
        rb.rebalance()
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')

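        # CountingRingTable wraps the ring's replica-to-partition-to-device
        # table so the test can count how many entries get_more_nodes() walks
        # before yielding its first handoff.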
        class CountingRingTable(object):

            def __init__(self, table):
                self.table = table
                self.count = 0

            def __iter__(self):
                self._iter = iter(self.table)
                return self

            def __next__(self):
                self.count += 1
                return next(self._iter)

            # complete the api
            next = __next__

            def __getitem__(self, key):
                return self.table[key]

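        # Build a histogram: for each part, how many steps through the table
        # it takes before the first handoff comes out.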
        histogram = collections.defaultdict(int)
        for part in range(r.partition_count):
            counting_table = CountingRingTable(r._replica2part2dev_id)
            with mock.patch.object(r, '_replica2part2dev_id', counting_table):
                node_iter = r.get_more_nodes(part)
                next(node_iter)
            histogram[counting_table.count] += 1
        # Don't let our summing muddy our histogram
        histogram = dict(histogram)

        # sanity
        self.assertEqual(1, r._num_regions)
        self.assertEqual(2, r._num_zones)
        self.assertEqual(256, r.partition_count)

        # We always do one loop (including the StopIteration) while getting
        # primaries, so every part should hit next() at least 5 times
        self.assertEqual(sum(histogram.get(x, 0) for x in range(5)), 0,
                         histogram)

        # Most of the parts should find a handoff device in the next
        # partition, but because some of the primary devices may *also* be
        # used for that partition, that means 5, 6, or 7 calls to next().
        self.assertGreater(sum(histogram.get(x, 0) for x in range(8)), 160,
                           histogram)

        # Want 90% confidence that it'll happen within two partitions
        self.assertGreater(sum(histogram.get(x, 0) for x in range(12)), 230,
                           histogram)

        # Tail should fall off fairly quickly
        self.assertLess(sum(histogram.get(x, 0) for x in range(20, 100)), 5,
                        histogram)

        # Hard limit at 50 (we've seen as bad as 41, 45)
        self.assertEqual(sum(histogram.get(x, 0) for x in range(50, 100)), 0,
                         histogram)

if __name__ == '__main__':
    unittest.main()