# Copyright (c) 2010-2012 OpenStack, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import array
import cPickle as pickle
import os
import sys
import unittest
from contextlib import closing
from gzip import GzipFile
from shutil import rmtree
from time import sleep, time

from swift.common import ring, utils


class TestRingData(unittest.TestCase):

    def setUp(self):
        self.testdir = os.path.join(os.path.dirname(__file__), 'ring_data')
        rmtree(self.testdir, ignore_errors=1)
        os.mkdir(self.testdir)

    def tearDown(self):
        rmtree(self.testdir, ignore_errors=1)

    def assert_ring_data_equal(self, rd_expected, rd_got):
        self.assertEquals(rd_expected._replica2part2dev_id,
                          rd_got._replica2part2dev_id)
        self.assertEquals(rd_expected.devs, rd_got.devs)
        self.assertEquals(rd_expected._part_shift, rd_got._part_shift)

    def test_attrs(self):
        r2p2d = [[0, 1, 0, 1], [0, 1, 0, 1]]
        d = [{'id': 0, 'zone': 0, 'region': 0},
             {'id': 1, 'zone': 1, 'region': 1}]
        s = 30
        rd = ring.RingData(r2p2d, d, s)
        self.assertEquals(rd._replica2part2dev_id, r2p2d)
        self.assertEquals(rd.devs, d)
        self.assertEquals(rd._part_shift, s)

    def test_can_load_pickled_ring_data(self):
        rd = ring.RingData([[0, 1, 0, 1], [0, 1, 0, 1]],
                           [{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
        ring_fname = os.path.join(self.testdir, 'foo.ring.gz')
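        # Rings serialized as a bare pickle inside a gzip file (the older
        # on-disk format) should stay loadable regardless of which pickle
        # protocol produced them.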
        for p in xrange(pickle.HIGHEST_PROTOCOL):
            with closing(GzipFile(ring_fname, 'wb')) as f:
                pickle.dump(rd, f, protocol=p)
            ring_data = ring.RingData.load(ring_fname)
            self.assert_ring_data_equal(rd, ring_data)

    def test_roundtrip_serialization(self):
        ring_fname = os.path.join(self.testdir, 'foo.ring.gz')
        rd = ring.RingData(
            [array.array('H', [0, 1, 0, 1]), array.array('H', [0, 1, 0, 1])],
            [{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
        rd.save(ring_fname)
        rd2 = ring.RingData.load(ring_fname)
        self.assert_ring_data_equal(rd, rd2)

    def test_deterministic_serialization(self):
        """
        Two identical rings should produce identical .gz files on disk.

        Only true on Python 2.7 or greater.
        """
        if sys.version_info[0] == 2 and sys.version_info[1] < 7:
            return
        os.mkdir(os.path.join(self.testdir, '1'))
        os.mkdir(os.path.join(self.testdir, '2'))
        # These have to have the same filename (not full path,
        # obviously) since the filename gets encoded in the gzip data.
        ring_fname1 = os.path.join(self.testdir, '1', 'the.ring.gz')
        ring_fname2 = os.path.join(self.testdir, '2', 'the.ring.gz')
        rd = ring.RingData(
            [array.array('H', [0, 1, 0, 1]), array.array('H', [0, 1, 0, 1])],
            [{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
        rd.save(ring_fname1)
        rd.save(ring_fname2)
        with open(ring_fname1) as ring1:
            with open(ring_fname2) as ring2:
                self.assertEqual(ring1.read(), ring2.read())


class TestRing(unittest.TestCase):

    def setUp(self):
        utils.HASH_PATH_SUFFIX = 'endcap'
        utils.HASH_PATH_PREFIX = ''
        self.testdir = os.path.join(os.path.dirname(__file__), 'ring')
        rmtree(self.testdir, ignore_errors=1)
        os.mkdir(self.testdir)
        self.testgz = os.path.join(self.testdir, 'whatever.ring.gz')
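        # A small hand-built ring: three replica rows covering four
        # partitions, mapped onto devices 0-4 (index 2 is deliberately None).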
        self.intended_replica2part2dev_id = [
            array.array('H', [0, 1, 0, 1]),
            array.array('H', [0, 1, 0, 1]),
            array.array('H', [3, 4, 3, 4])]
        self.intended_devs = [{'id': 0, 'region': 0, 'zone': 0, 'weight': 1.0,
                               'ip': '10.1.1.1', 'port': 6000,
                               'replication_ip': '10.1.0.1',
                               'replication_port': 6066},
                              {'id': 1, 'region': 0, 'zone': 0, 'weight': 1.0,
                               'ip': '10.1.1.1', 'port': 6000,
                               'replication_ip': '10.1.0.2',
                               'replication_port': 6066},
                              None,
                              {'id': 3, 'region': 0, 'zone': 2, 'weight': 1.0,
                               'ip': '10.1.2.1', 'port': 6000,
                               'replication_ip': '10.2.0.1',
                               'replication_port': 6066},
                              {'id': 4, 'region': 0, 'zone': 2, 'weight': 1.0,
                               'ip': '10.1.2.2', 'port': 6000,
                               'replication_ip': '10.2.0.1',
                               'replication_port': 6066}]
        self.intended_part_shift = 30
        self.intended_reload_time = 15
        ring.RingData(self.intended_replica2part2dev_id,
            self.intended_devs, self.intended_part_shift).save(self.testgz)
        self.ring = ring.Ring(self.testdir,
            reload_time=self.intended_reload_time, ring_name='whatever')

    def tearDown(self):
        rmtree(self.testdir, ignore_errors=1)

    def test_creation(self):
        self.assertEquals(self.ring._replica2part2dev_id,
                          self.intended_replica2part2dev_id)
        self.assertEquals(self.ring._part_shift, self.intended_part_shift)
        self.assertEquals(self.ring.devs, self.intended_devs)
        self.assertEquals(self.ring.reload_time, self.intended_reload_time)
        self.assertEquals(self.ring.serialized_path, self.testgz)
        # test invalid endcap
        _orig_hash_path_suffix = utils.HASH_PATH_SUFFIX
        _orig_hash_path_prefix = utils.HASH_PATH_PREFIX
        try:
            utils.HASH_PATH_SUFFIX = ''
            utils.HASH_PATH_PREFIX = ''
            self.assertRaises(SystemExit, ring.Ring, self.testdir, 'whatever')
        finally:
            utils.HASH_PATH_SUFFIX = _orig_hash_path_suffix
            utils.HASH_PATH_PREFIX = _orig_hash_path_prefix

    def test_has_changed(self):
        self.assertEquals(self.ring.has_changed(), False)
        os.utime(self.testgz, (time() + 60, time() + 60))
        self.assertEquals(self.ring.has_changed(), True)

    def test_reload(self):
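        # Each block below creates a Ring with a tiny reload_time, rewrites
        # the ring file with one more device, and checks that a different
        # accessor (get_nodes, get_part_nodes, get_more_nodes, or the devs
        # property) picks up the new ring once reload_time has passed.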
        os.utime(self.testgz, (time() - 300, time() - 300))
        self.ring = ring.Ring(self.testdir, reload_time=0.001,
                              ring_name='whatever')
        orig_mtime = self.ring._mtime
        self.assertEquals(len(self.ring.devs), 5)
        self.intended_devs.append(
            {'id': 3, 'region': 0, 'zone': 3, 'weight': 1.0})
        ring.RingData(self.intended_replica2part2dev_id,
            self.intended_devs, self.intended_part_shift).save(self.testgz)
        sleep(0.1)
        self.ring.get_nodes('a')
        self.assertEquals(len(self.ring.devs), 6)
        self.assertNotEquals(self.ring._mtime, orig_mtime)

        os.utime(self.testgz, (time() - 300, time() - 300))
        self.ring = ring.Ring(self.testdir, reload_time=0.001,
                              ring_name='whatever')
        orig_mtime = self.ring._mtime
        self.assertEquals(len(self.ring.devs), 6)
        self.intended_devs.append(
            {'id': 5, 'region': 0, 'zone': 4, 'weight': 1.0})
        ring.RingData(self.intended_replica2part2dev_id,
            self.intended_devs, self.intended_part_shift).save(self.testgz)
        sleep(0.1)
        self.ring.get_part_nodes(0)
        self.assertEquals(len(self.ring.devs), 7)
        self.assertNotEquals(self.ring._mtime, orig_mtime)

        os.utime(self.testgz, (time() - 300, time() - 300))
        self.ring = ring.Ring(self.testdir, reload_time=0.001,
                              ring_name='whatever')
        orig_mtime = self.ring._mtime
        part, nodes = self.ring.get_nodes('a')
        self.assertEquals(len(self.ring.devs), 7)
        self.intended_devs.append(
            {'id': 6, 'region': 0, 'zone': 5, 'weight': 1.0})
        ring.RingData(self.intended_replica2part2dev_id,
            self.intended_devs, self.intended_part_shift).save(self.testgz)
        sleep(0.1)
        self.ring.get_more_nodes(part).next()
        self.assertEquals(len(self.ring.devs), 8)
        self.assertNotEquals(self.ring._mtime, orig_mtime)

        os.utime(self.testgz, (time() - 300, time() - 300))
        self.ring = ring.Ring(self.testdir, reload_time=0.001,
                              ring_name='whatever')
        orig_mtime = self.ring._mtime
        self.assertEquals(len(self.ring.devs), 8)
        self.intended_devs.append(
            {'id': 5, 'region': 0, 'zone': 4, 'weight': 1.0})
        ring.RingData(self.intended_replica2part2dev_id,
            self.intended_devs, self.intended_part_shift).save(self.testgz)
        sleep(0.1)
        self.assertEquals(len(self.ring.devs), 9)
        self.assertNotEquals(self.ring._mtime, orig_mtime)

    def test_reload_without_replication(self):
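        # Devices stored without replication_ip/replication_port should come
        # back from the loaded ring with those fields filled in from ip/port.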
        replication_less_devs = [{'id': 0, 'region': 0, 'zone': 0,
                                  'weight': 1.0, 'ip': '10.1.1.1',
                                  'port': 6000},
                                 {'id': 1, 'region': 0, 'zone': 0,
                                  'weight': 1.0, 'ip': '10.1.1.1',
                                  'port': 6000},
                                 None,
                                 {'id': 3, 'region': 0, 'zone': 2,
                                  'weight': 1.0, 'ip': '10.1.2.1',
                                  'port': 6000},
                                 {'id': 4, 'region': 0, 'zone': 2,
                                  'weight': 1.0, 'ip': '10.1.2.2',
                                  'port': 6000}]
        intended_devs = [{'id': 0, 'region': 0, 'zone': 0, 'weight': 1.0,
                          'ip': '10.1.1.1', 'port': 6000,
                          'replication_ip': '10.1.1.1',
                          'replication_port': 6000},
                         {'id': 1, 'region': 0, 'zone': 0, 'weight': 1.0,
                          'ip': '10.1.1.1', 'port': 6000,
                          'replication_ip': '10.1.1.1',
                          'replication_port': 6000},
                         None,
                         {'id': 3, 'region': 0, 'zone': 2, 'weight': 1.0,
                          'ip': '10.1.2.1', 'port': 6000,
                          'replication_ip': '10.1.2.1',
                          'replication_port': 6000},
                         {'id': 4, 'region': 0, 'zone': 2, 'weight': 1.0,
                          'ip': '10.1.2.2', 'port': 6000,
                          'replication_ip': '10.1.2.2',
                          'replication_port': 6000}]
        testgz = os.path.join(self.testdir, 'without_replication.ring.gz')
        ring.RingData(self.intended_replica2part2dev_id,
            replication_less_devs, self.intended_part_shift).save(testgz)
        self.ring = ring.Ring(self.testdir,
                              reload_time=self.intended_reload_time,
                              ring_name='without_replication')
        self.assertEquals(self.ring.devs, intended_devs)

    def test_get_part(self):
        part1 = self.ring.get_part('a')
        nodes1 = self.ring.get_part_nodes(part1)
        part2, nodes2 = self.ring.get_nodes('a')
        self.assertEquals(part1, part2)
        self.assertEquals(nodes1, nodes2)

    def test_get_part_nodes(self):
        part, nodes = self.ring.get_nodes('a')
        self.assertEquals(nodes, self.ring.get_part_nodes(part))

    def test_get_nodes(self):
        # Yes, these tests are deliberately very fragile. We want to make sure
        # that if someone changes the results the ring produces, they know it.
        self.assertRaises(TypeError, self.ring.get_nodes)
        part, nodes = self.ring.get_nodes('a')
        self.assertEquals(part, 0)
        self.assertEquals(nodes, [self.intended_devs[0],
                                  self.intended_devs[3]])

        part, nodes = self.ring.get_nodes('a1')
        self.assertEquals(part, 0)
        self.assertEquals(nodes, [self.intended_devs[0],
                                  self.intended_devs[3]])

        part, nodes = self.ring.get_nodes('a4')
        self.assertEquals(part, 1)
        self.assertEquals(nodes, [self.intended_devs[1],
                                  self.intended_devs[4]])

        part, nodes = self.ring.get_nodes('aa')
        self.assertEquals(part, 1)
        self.assertEquals(nodes, [self.intended_devs[1],
                                  self.intended_devs[4]])

        part, nodes = self.ring.get_nodes('a', 'c1')
        self.assertEquals(part, 0)
        self.assertEquals(nodes, [self.intended_devs[0],
                                  self.intended_devs[3]])

        part, nodes = self.ring.get_nodes('a', 'c0')
        self.assertEquals(part, 3)
        self.assertEquals(nodes, [self.intended_devs[1],
                                  self.intended_devs[4]])

        part, nodes = self.ring.get_nodes('a', 'c3')
        self.assertEquals(part, 2)
        self.assertEquals(nodes, [self.intended_devs[0],
                                  self.intended_devs[3]])

        part, nodes = self.ring.get_nodes('a', 'c2')
        self.assertEquals(part, 2)
        self.assertEquals(nodes, [self.intended_devs[0],
                                  self.intended_devs[3]])

        part, nodes = self.ring.get_nodes('a', 'c', 'o1')
        self.assertEquals(part, 1)
        self.assertEquals(nodes, [self.intended_devs[1],
                                  self.intended_devs[4]])

        part, nodes = self.ring.get_nodes('a', 'c', 'o5')
        self.assertEquals(part, 0)
        self.assertEquals(nodes, [self.intended_devs[0],
                                  self.intended_devs[3]])

        part, nodes = self.ring.get_nodes('a', 'c', 'o0')
        self.assertEquals(part, 0)
        self.assertEquals(nodes, [self.intended_devs[0],
                                  self.intended_devs[3]])

        part, nodes = self.ring.get_nodes('a', 'c', 'o2')
        self.assertEquals(part, 2)
        self.assertEquals(nodes, [self.intended_devs[0],
                                  self.intended_devs[3]])

    def add_dev_to_ring(self, new_dev):
        self.ring.devs.append(new_dev)
        self.ring._rebuild_tier_data()

    def test_get_more_nodes(self):
        # Yes, these tests are deliberately very fragile. We want to make sure
        # that if someone changes the results the ring produces, they know it.
        exp_part = 6
        exp_devs = [48, 93, 96]
        exp_zones = set([5, 8, 9])
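        # The expected device orderings below are tied to rebalance(seed=1);
        # any change in placement behavior will show up as a diff here.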
        exp_handoffs = [11, 47, 25, 76, 69, 23, 99, 59, 106, 64, 107, 43, 50,
                        34, 88, 3, 57, 30, 83, 31, 16, 27, 103, 39, 32, 60, 77,
                        24, 0, 42, 8, 100, 72, 56, 19, 71, 26, 9, 20, 35, 91,
                        13, 84, 5, 38, 14, 94, 28, 41, 18, 66, 102, 52, 101,
                        61, 95, 21, 81, 1, 78, 105, 58, 74, 90, 86, 46, 4, 68,
                        40, 80, 54, 75, 45, 79, 44, 49, 62, 29, 7, 15, 70, 87,
                        65, 12, 82, 17, 104, 97, 55, 22, 6, 89, 2, 67, 37, 63,
                        53, 92, 33, 85, 73, 51, 98, 36, 10]
        exp_first_handoffs = [1, 37, 48, 68, 84, 75, 11, 101, 14, 73, 100, 75,
                              29, 19, 18, 101, 15, 99, 95, 24, 46, 82, 73, 62,
                              24, 89, 9, 22, 107, 74, 54, 63, 40, 106, 99, 83,
                              64, 73, 73, 106, 106, 80, 6, 25, 20, 33, 6, 79,
                              59, 42, 62, 24, 14, 107, 28, 0, 85, 5, 4, 12, 58,
                              11, 92, 18, 36, 56, 86, 1, 21, 33, 80, 97, 4, 81,
                              79, 76, 89, 50, 75, 27, 7, 96, 47, 55, 81, 104,
                              12, 5, 18, 106, 27, 93, 39, 92, 42, 30, 20, 88,
                              58, 105, 65, 29, 17, 52, 11, 106, 7, 24, 21, 91,
                              62, 52, 50, 31, 77, 102, 19, 11, 8, 58, 53, 20,
                              26, 8, 18, 82, 48, 68, 82, 89, 101, 50, 3, 52,
                              46, 11, 2, 30, 79, 66, 4, 61, 3, 56, 45, 102, 73,
                              84, 36, 19, 34, 84, 49, 40, 103, 66, 31, 33, 93,
                              33, 4, 52, 26, 58, 30, 47, 100, 57, 40, 79, 33,
                              107, 24, 20, 44, 4, 7, 59, 83, 101, 1, 56, 20,
                              61, 33, 16, 5, 74, 98, 4, 80, 15, 104, 52, 73,
                              18, 67, 75, 98, 73, 79, 68, 75, 27, 91, 36, 100,
                              52, 95, 37, 46, 70, 14, 47, 3, 70, 23, 40, 105,
                              62, 86, 48, 22, 54, 4, 72, 81, 13, 0, 18, 98,
                              101, 36, 29, 24, 39, 79, 97, 105, 28, 107, 47,
                              52, 101, 20, 22, 29, 65, 27, 7, 33, 64, 101, 60,
                              19, 55]
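        # Build a ring with 2 ** 8 partitions and 3 replicas across 108
        # devices: zones 1-9, each with four servers of three drives.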
        rb = ring.RingBuilder(8, 3, 1)
        next_dev_id = 0
        for zone in xrange(1, 10):
            for server in xrange(1, 5):
                for device in xrange(1, 4):
                    rb.add_dev({'id': next_dev_id,
                                'ip': '1.2.%d.%d' % (zone, server),
                                'port': 1234, 'zone': zone, 'region': 0,
                                'weight': 1.0})
                    next_dev_id += 1
        rb.rebalance(seed=1)
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')
        part, devs = r.get_nodes('a', 'c', 'o')
        primary_zones = set(d['zone'] for d in devs)
        self.assertEquals(part, exp_part)
        self.assertEquals([d['id'] for d in devs], exp_devs)
        self.assertEquals(primary_zones, exp_zones)
        devs = list(r.get_more_nodes(part))
        self.assertEquals([d['id'] for d in devs], exp_handoffs)

        # The first 6 replicas plus the 3 primary nodes should cover all 9
        # zones in this test
        seen_zones = set(primary_zones)
        seen_zones.update(d['zone'] for d in devs[:6])
        self.assertEquals(seen_zones, set(range(1, 10)))

        # The first handoff nodes for each partition in the ring
        devs = []
        for part in xrange(r.partition_count):
            devs.append(r.get_more_nodes(part).next()['id'])
        self.assertEquals(devs, exp_first_handoffs)
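        # (partition_count is 2 ** 8 = 256 here, so exp_first_handoffs has
        # one entry per partition.)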

        # Add a new device we can handoff to.
        zone = 5
        server = 0
        rb.add_dev({'id': next_dev_id,
                    'ip': '1.2.%d.%d' % (zone, server),
                    'port': 1234, 'zone': zone, 'region': 0, 'weight': 1.0})
        next_dev_id += 1
        rb.rebalance(seed=1)
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')
        # We would change expectations here, but in this test no handoffs
        # changed at all.
        part, devs = r.get_nodes('a', 'c', 'o')
        primary_zones = set(d['zone'] for d in devs)
        self.assertEquals(part, exp_part)
        self.assertEquals([d['id'] for d in devs], exp_devs)
        self.assertEquals(primary_zones, exp_zones)
        devs = list(r.get_more_nodes(part))
        dev_ids = [d['id'] for d in devs]
        self.assertEquals(len(dev_ids), len(exp_handoffs))
        for index, dev in enumerate(dev_ids):
            self.assertEquals(
                dev, exp_handoffs[index],
                'handoff differs at position %d\n%s\n%s' % (
                    index, dev_ids[index:], exp_handoffs[index:]))

        # The handoffs still cover all the non-primary zones first
        seen_zones = set(primary_zones)
        seen_zones.update(d['zone'] for d in devs[:6])
        self.assertEquals(seen_zones, set(range(1, 10)))

        devs = []
        for part in xrange(r.partition_count):
            devs.append(r.get_more_nodes(part).next()['id'])
        for part in xrange(r.partition_count):
            self.assertEquals(
                devs[part], exp_first_handoffs[part],
                'handoff for partition %d is now device id %d' % (
                    part, devs[part]))

        # Remove a device.
        rb.remove_dev(0)
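        # Mostly just the partitions that lived on device 0 should have to
        # move, so the bulk of the handoff expectations below survive
        # unchanged.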
        rb.rebalance(seed=1)
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')
        # Change expectations
        # The long string of handoff nodes for the partition was the same for
        # the first 20, which is pretty good.
        exp_handoffs[20:] = [16, 27, 103, 39, 32, 60, 77, 24, 108, 42, 8, 100,
                             72, 56, 19, 71, 26, 9, 20, 35, 91, 13, 84, 5, 38,
                             14, 94, 28, 41, 18, 66, 102, 52, 101, 61, 95, 21,
                             81, 1, 78, 105, 58, 74, 90, 86, 46, 4, 68, 40, 80,
                             54, 75, 45, 79, 44, 49, 62, 29, 7, 15, 70, 87, 65,
                             12, 82, 17, 104, 97, 55, 22, 6, 89, 2, 67, 37, 63,
                             53, 92, 33, 85, 73, 51, 98, 36, 10]
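        # The only difference from the original chain is that the handoff
        # that used to be the removed device 0 is now device 108, the device
        # added earlier.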

        # Just a few of the first handoffs changed
        exp_first_handoffs[3] = 68
        exp_first_handoffs[55] = 104
        exp_first_handoffs[116] = 6
        exp_first_handoffs[181] = 15
        exp_first_handoffs[228] = 38

        # Test
        part, devs = r.get_nodes('a', 'c', 'o')
        primary_zones = set(d['zone'] for d in devs)
        self.assertEquals(part, exp_part)
        self.assertEquals([d['id'] for d in devs], exp_devs)
        self.assertEquals(primary_zones, exp_zones)
        devs = list(r.get_more_nodes(part))
        dev_ids = [d['id'] for d in devs]
        self.assertEquals(len(dev_ids), len(exp_handoffs))
        for index, dev in enumerate(dev_ids):
            self.assertEquals(
                dev, exp_handoffs[index],
                'handoff differs at position %d\n%s\n%s' % (
                    index, dev_ids[index:], exp_handoffs[index:]))

        seen_zones = set(primary_zones)
        seen_zones.update(d['zone'] for d in devs[:6])
        self.assertEquals(seen_zones, set(range(1, 10)))

        devs = []
        for part in xrange(r.partition_count):
            devs.append(r.get_more_nodes(part).next()['id'])
        for part in xrange(r.partition_count):
            self.assertEquals(
                devs[part], exp_first_handoffs[part],
                'handoff for partition %d is now device id %d' % (
                    part, devs[part]))

        # Add a partial replica
        rb.set_replicas(3.5)
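        # A fractional replica count like 3.5 should give roughly half of
        # the 256 partitions a fourth replica assignment while the rest keep
        # three.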
        rb.rebalance(seed=1)
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')
        # Change expectations
        # We have another replica now
        exp_devs.append(47)
        exp_zones.add(4)
        # Caused some major changes in the sequence of handoffs for our test
        # partition, but at least the first stayed the same.
        exp_handoffs[1:] = [81, 25, 69, 23, 99, 59, 76, 3, 106, 45, 64, 107,
                            43, 13, 50, 34, 88, 57, 30, 16, 83, 31, 46, 27,
                            103, 39, 74, 32, 60, 77, 24, 108, 42, 63, 8, 100,
                            72, 56, 19, 71, 7, 26, 9, 20, 35, 91, 52, 84, 5,
                            87, 38, 14, 94, 62, 28, 41, 90, 18, 66, 82, 102,
                            22, 101, 61, 85, 95, 21, 98, 1, 67, 78, 105, 58,
                            86, 4, 79, 68, 40, 80, 54, 75, 44, 49, 6, 29, 15,
                            70, 65, 12, 17, 104, 97, 55, 89, 2, 37, 53, 92,
                            33, 73, 51, 36, 10]

        # Lots of first handoffs changed, but 30 of 256 is still just 11.72%.
        exp_first_handoffs[1] = 6
        exp_first_handoffs[4] = 104
        exp_first_handoffs[11] = 106
        exp_first_handoffs[17] = 13
        exp_first_handoffs[21] = 77
        exp_first_handoffs[22] = 95
        exp_first_handoffs[27] = 46
        exp_first_handoffs[29] = 65
        exp_first_handoffs[30] = 3
        exp_first_handoffs[31] = 20
        exp_first_handoffs[51] = 50
        exp_first_handoffs[53] = 8
        exp_first_handoffs[54] = 2
        exp_first_handoffs[72] = 107
        exp_first_handoffs[79] = 72
        exp_first_handoffs[85] = 71
        exp_first_handoffs[88] = 66
        exp_first_handoffs[92] = 29
        exp_first_handoffs[93] = 46
        exp_first_handoffs[96] = 38
        exp_first_handoffs[101] = 57
        exp_first_handoffs[103] = 87
        exp_first_handoffs[104] = 28
        exp_first_handoffs[107] = 1
        exp_first_handoffs[109] = 69
        exp_first_handoffs[110] = 50
        exp_first_handoffs[111] = 76
        exp_first_handoffs[115] = 47
        exp_first_handoffs[117] = 48
        exp_first_handoffs[119] = 7

        # Test
        part, devs = r.get_nodes('a', 'c', 'o')
        primary_zones = set(d['zone'] for d in devs)
        self.assertEquals(part, exp_part)
        self.assertEquals([d['id'] for d in devs], exp_devs)
        self.assertEquals(primary_zones, exp_zones)
        devs = list(r.get_more_nodes(part))
        dev_ids = [d['id'] for d in devs]
        self.assertEquals(len(dev_ids), len(exp_handoffs))
        for index, dev in enumerate(dev_ids):
            self.assertEquals(
                dev, exp_handoffs[index],
                'handoff differs at position %d\n%s\n%s' % (
                    index, dev_ids[index:], exp_handoffs[index:]))

        seen_zones = set(primary_zones)
        seen_zones.update(d['zone'] for d in devs[:6])
        self.assertEquals(seen_zones, set(range(1, 10)))

        devs = []
        for part in xrange(r.partition_count):
            devs.append(r.get_more_nodes(part).next()['id'])
        for part in xrange(r.partition_count):
            self.assertEquals(
                devs[part], exp_first_handoffs[part],
                'handoff for partition %d is now device id %d' % (
                    part, devs[part]))

        # One last test of a partial replica partition
        exp_part2 = 136
        exp_devs2 = [52, 76, 97]
        exp_zones2 = set([9, 5, 7])
        exp_handoffs2 = [2, 67, 37, 92, 33, 23, 107, 96, 63, 53, 44, 103, 108,
                         85, 73, 51, 42, 98, 35, 36, 10, 89, 80, 84, 43, 4, 17,
                         49, 104, 32, 12, 41, 58, 31, 65, 20, 25, 61, 1, 40, 9,
                         94, 47, 69, 56, 74, 101, 95, 45, 5, 71, 86, 78, 30, 93,
                         48, 28, 91, 15, 88, 39, 18, 57, 83, 72, 70, 27, 54, 16,
                         24, 21, 14, 11, 8, 77, 62, 50, 6, 105, 26, 55, 29, 60,
                         34, 13, 87, 59, 38, 99, 75, 106, 3, 82, 66, 79, 7, 46,
                         64, 81, 22, 68, 19, 102, 90, 100]
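        # Again one entry per non-primary device: the ring now holds devices
        # 1-108 (0 removed, 108 added), so 108 - 3 primaries = 105 handoffs.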

        part2, devs2 = r.get_nodes('a', 'c', 'o2')
        primary_zones2 = set(d['zone'] for d in devs2)
        self.assertEquals(part2, exp_part2)
        self.assertEquals([d['id'] for d in devs2], exp_devs2)
        self.assertEquals(primary_zones2, exp_zones2)
        devs2 = list(r.get_more_nodes(part2))
        dev_ids2 = [d['id'] for d in devs2]
        self.assertEquals(len(dev_ids2), len(exp_handoffs2))
        for index, dev in enumerate(dev_ids2):
            self.assertEquals(
                dev, exp_handoffs2[index],
                'handoff differs at position %d\n%s\n%s' % (
                    index, dev_ids2[index:], exp_handoffs2[index:]))

        seen_zones = set(primary_zones2)
        seen_zones.update(d['zone'] for d in devs2[:6])
        self.assertEquals(seen_zones, set(range(1, 10)))

        # Test distribution across regions
        rb.set_replicas(3)
        for region in xrange(1, 5):
            rb.add_dev({'id': next_dev_id,
                        'ip': '1.%d.1.%d' % (region, server), 'port': 1234,
                        'zone': 1, 'region': region, 'weight': 1.0})
            next_dev_id += 1
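        # pretend_min_part_hours_passed() clears the builder's record of
        # recently moved partitions so the rebalances below can reassign
        # parts right away instead of waiting out min_part_hours.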
        rb.pretend_min_part_hours_passed()
        rb.rebalance(seed=1)
        rb.pretend_min_part_hours_passed()
        rb.rebalance(seed=1)
        rb.get_ring().save(self.testgz)
        r = ring.Ring(self.testdir, ring_name='whatever')

        # There are 5 regions now, so the primary nodes + first 2 handoffs
        # should span all 5 regions
        part, devs = r.get_nodes('a1', 'c1', 'o1')
        primary_regions = set(d['region'] for d in devs)
        primary_zones = set((d['region'], d['zone']) for d in devs)
        more_devs = list(r.get_more_nodes(part))

        seen_regions = set(primary_regions)
        seen_regions.update(d['region'] for d in more_devs[:2])
        self.assertEquals(seen_regions, set(range(0, 5)))

        # There are 13 zones now, so the first 13 nodes should all have
        # distinct zones (that's r0z1, r0z2, ..., r0z9, r1z1, r2z1, r3z1, and
        # r4z1).
        seen_zones = set(primary_zones)
        seen_zones.update((d['region'], d['zone']) for d in more_devs[:10])
        self.assertEquals(13, len(seen_zones))

        # Here's a brittle canary-in-the-coalmine test to make sure the region
        # handoff computation didn't change accidentally
        exp_handoffs = [111, 112, 74, 54, 93, 31, 2, 43, 100, 22, 71, 32, 92,
                        35, 9, 50, 41, 76, 80, 84, 88, 17, 94, 101, 1, 10, 96,
                        44, 73, 6, 75, 102, 37, 21, 97, 29, 105, 5, 28, 47,
                        106, 30, 16, 39, 77, 42, 72, 20, 13, 34, 99, 108, 14,
                        66, 61, 81, 90, 4, 40, 3, 45, 62, 7, 15, 87, 12, 83,
                        89, 53, 33, 98, 49, 65, 25, 107, 56, 58, 86, 48, 57,
                        24, 11, 23, 26, 46, 64, 69, 38, 36, 79, 63, 104, 51,
                        70, 82, 67, 68, 8, 95, 91, 55, 59, 85]
        dev_ids = [d['id'] for d in more_devs]

        self.assertEquals(len(dev_ids), len(exp_handoffs))
        for index, dev_id in enumerate(dev_ids):
            self.assertEquals(
                dev_id, exp_handoffs[index],
                'handoff differs at position %d\n%s\n%s' % (
                    index, dev_ids[index:], exp_handoffs[index:]))


if __name__ == '__main__':
    unittest.main()