# Copyright (c) 2010-2012 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import array
import cPickle as pickle
import os
import sys
import unittest
import stat
from contextlib import closing
from gzip import GzipFile
from tempfile import mkdtemp
from shutil import rmtree
from time import sleep, time
from swift.common import ring, utils


class TestRingBase(unittest.TestCase):
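    # Pins the hash path prefix/suffix to known values so ring lookups hash
    # deterministically in tests, then restores the originals in tearDown.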
def setUp(self):
self._orig_hash_suffix = utils.HASH_PATH_SUFFIX
self._orig_hash_prefix = utils.HASH_PATH_PREFIX
utils.HASH_PATH_SUFFIX = 'endcap'
utils.HASH_PATH_PREFIX = ''
def tearDown(self):
utils.HASH_PATH_SUFFIX = self._orig_hash_suffix
utils.HASH_PATH_PREFIX = self._orig_hash_prefix


class TestRingData(unittest.TestCase):
def setUp(self):
self.testdir = os.path.join(os.path.dirname(__file__), 'ring_data')
rmtree(self.testdir, ignore_errors=1)
os.mkdir(self.testdir)
def tearDown(self):
rmtree(self.testdir, ignore_errors=1)
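    # RingData is compared attribute by attribute; this helper checks the
    # three fields that fully describe a ring.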
def assert_ring_data_equal(self, rd_expected, rd_got):
self.assertEquals(rd_expected._replica2part2dev_id,
rd_got._replica2part2dev_id)
self.assertEquals(rd_expected.devs, rd_got.devs)
self.assertEquals(rd_expected._part_shift, rd_got._part_shift)
def test_attrs(self):
r2p2d = [[0, 1, 0, 1], [0, 1, 0, 1]]
d = [{'id': 0, 'zone': 0, 'region': 0, 'ip': '10.1.1.0', 'port': 7000},
{'id': 1, 'zone': 1, 'region': 1, 'ip': '10.1.1.1', 'port': 7000}]
s = 30
rd = ring.RingData(r2p2d, d, s)
self.assertEquals(rd._replica2part2dev_id, r2p2d)
self.assertEquals(rd.devs, d)
self.assertEquals(rd._part_shift, s)
def test_can_load_pickled_ring_data(self):
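        # Ring files written as plain pickles (the older on-disk format) must
        # still load through RingData.load, regardless of pickle protocol.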
rd = ring.RingData(
[[0, 1, 0, 1], [0, 1, 0, 1]],
[{'id': 0, 'zone': 0, 'ip': '10.1.1.0', 'port': 7000},
{'id': 1, 'zone': 1, 'ip': '10.1.1.1', 'port': 7000}],
30)
ring_fname = os.path.join(self.testdir, 'foo.ring.gz')
for p in xrange(pickle.HIGHEST_PROTOCOL):
with closing(GzipFile(ring_fname, 'wb')) as f:
pickle.dump(rd, f, protocol=p)
ring_data = ring.RingData.load(ring_fname)
self.assert_ring_data_equal(rd, ring_data)
def test_roundtrip_serialization(self):
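        # save() followed by load() must reproduce an equivalent RingData.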
ring_fname = os.path.join(self.testdir, 'foo.ring.gz')
rd = ring.RingData(
[array.array('H', [0, 1, 0, 1]), array.array('H', [0, 1, 0, 1])],
[{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
rd.save(ring_fname)
rd2 = ring.RingData.load(ring_fname)
self.assert_ring_data_equal(rd, rd2)
def test_deterministic_serialization(self):
"""
Two identical rings should produce identical .gz files on disk.
Only true on Python 2.7 or greater.
"""
if sys.version_info[0] == 2 and sys.version_info[1] < 7:
return
os.mkdir(os.path.join(self.testdir, '1'))
os.mkdir(os.path.join(self.testdir, '2'))
# These have to have the same filename (not full path,
# obviously) since the filename gets encoded in the gzip data.
ring_fname1 = os.path.join(self.testdir, '1', 'the.ring.gz')
ring_fname2 = os.path.join(self.testdir, '2', 'the.ring.gz')
rd = ring.RingData(
[array.array('H', [0, 1, 0, 1]), array.array('H', [0, 1, 0, 1])],
[{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
rd.save(ring_fname1)
rd.save(ring_fname2)
with open(ring_fname1) as ring1:
with open(ring_fname2) as ring2:
self.assertEqual(ring1.read(), ring2.read())
def test_permissions(self):
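        # RingData.save should leave the ring file world-readable (mode 0644).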
ring_fname = os.path.join(self.testdir, 'stat.ring.gz')
rd = ring.RingData(
[array.array('H', [0, 1, 0, 1]), array.array('H', [0, 1, 0, 1])],
[{'id': 0, 'zone': 0}, {'id': 1, 'zone': 1}], 30)
rd.save(ring_fname)
self.assertEqual(oct(stat.S_IMODE(os.stat(ring_fname).st_mode)),
'0644')


class TestRing(TestRingBase):
def setUp(self):
super(TestRing, self).setUp()
self.testdir = mkdtemp()
self.testgz = os.path.join(self.testdir, 'whatever.ring.gz')
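        # Three replica rows over four partitions: even partitions map to
        # devices 0, 0 and 3; odd partitions map to devices 1, 1 and 4.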
self.intended_replica2part2dev_id = [
array.array('H', [0, 1, 0, 1]),
array.array('H', [0, 1, 0, 1]),
array.array('H', [3, 4, 3, 4])]
self.intended_devs = [{'id': 0, 'region': 0, 'zone': 0, 'weight': 1.0,
'ip': '10.1.1.1', 'port': 6000,
'replication_ip': '10.1.0.1',
'replication_port': 6066},
{'id': 1, 'region': 0, 'zone': 0, 'weight': 1.0,
'ip': '10.1.1.1', 'port': 6000,
'replication_ip': '10.1.0.2',
'replication_port': 6066},
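                              # Device id 2 has been removed; a None
                              # placeholder keeps list indexes aligned with
                              # device ids.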
None,
{'id': 3, 'region': 0, 'zone': 2, 'weight': 1.0,
'ip': '10.1.2.1', 'port': 6000,
'replication_ip': '10.2.0.1',
'replication_port': 6066},
{'id': 4, 'region': 0, 'zone': 2, 'weight': 1.0,
'ip': '10.1.2.2', 'port': 6000,
'replication_ip': '10.2.0.1',
'replication_port': 6066}]
self.intended_part_shift = 30
self.intended_reload_time = 15
ring.RingData(
self.intended_replica2part2dev_id,
self.intended_devs, self.intended_part_shift).save(self.testgz)
self.ring = ring.Ring(
self.testdir,
reload_time=self.intended_reload_time, ring_name='whatever')
def tearDown(self):
super(TestRing, self).tearDown()
rmtree(self.testdir, ignore_errors=1)
def test_creation(self):
self.assertEquals(self.ring._replica2part2dev_id,
self.intended_replica2part2dev_id)
self.assertEquals(self.ring._part_shift, self.intended_part_shift)
self.assertEquals(self.ring.devs, self.intended_devs)
self.assertEquals(self.ring.reload_time, self.intended_reload_time)
self.assertEquals(self.ring.serialized_path, self.testgz)
        # test invalid endcap: with no hash prefix/suffix and no swift.conf
        # to fall back on, loading the ring must exit
_orig_hash_path_suffix = utils.HASH_PATH_SUFFIX
_orig_hash_path_prefix = utils.HASH_PATH_PREFIX
_orig_swift_conf_file = utils.SWIFT_CONF_FILE
try:
utils.HASH_PATH_SUFFIX = ''
utils.HASH_PATH_PREFIX = ''
utils.SWIFT_CONF_FILE = ''
self.assertRaises(SystemExit, ring.Ring, self.testdir, 'whatever')
finally:
utils.HASH_PATH_SUFFIX = _orig_hash_path_suffix
utils.HASH_PATH_PREFIX = _orig_hash_path_prefix
utils.SWIFT_CONF_FILE = _orig_swift_conf_file
def test_has_changed(self):
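        # has_changed() compares the ring file's current mtime with the mtime
        # recorded when the ring was last loaded.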
self.assertEquals(self.ring.has_changed(), False)
os.utime(self.testgz, (time() + 60, time() + 60))
self.assertEquals(self.ring.has_changed(), True)
def test_reload(self):
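        # Each block below makes the on-disk ring look stale, rewrites it with
        # one more device, and checks that a different accessor (get_nodes,
        # get_part_nodes, get_more_nodes, or the devs property) picks up the
        # change.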
os.utime(self.testgz, (time() - 300, time() - 300))
self.ring = ring.Ring(self.testdir, reload_time=0.001,
ring_name='whatever')
orig_mtime = self.ring._mtime
self.assertEquals(len(self.ring.devs), 5)
self.intended_devs.append(
{'id': 3, 'region': 0, 'zone': 3, 'weight': 1.0,
'ip': '10.1.1.1', 'port': 9876})
ring.RingData(
self.intended_replica2part2dev_id,
self.intended_devs, self.intended_part_shift).save(self.testgz)
sleep(0.1)
self.ring.get_nodes('a')
self.assertEquals(len(self.ring.devs), 6)
self.assertNotEquals(self.ring._mtime, orig_mtime)
os.utime(self.testgz, (time() - 300, time() - 300))
self.ring = ring.Ring(self.testdir, reload_time=0.001,
ring_name='whatever')
orig_mtime = self.ring._mtime
self.assertEquals(len(self.ring.devs), 6)
self.intended_devs.append(
{'id': 5, 'region': 0, 'zone': 4, 'weight': 1.0,
'ip': '10.5.5.5', 'port': 9876})
ring.RingData(
self.intended_replica2part2dev_id,
self.intended_devs, self.intended_part_shift).save(self.testgz)
sleep(0.1)
self.ring.get_part_nodes(0)
self.assertEquals(len(self.ring.devs), 7)
self.assertNotEquals(self.ring._mtime, orig_mtime)
os.utime(self.testgz, (time() - 300, time() - 300))
self.ring = ring.Ring(self.testdir, reload_time=0.001,
ring_name='whatever')
orig_mtime = self.ring._mtime
part, nodes = self.ring.get_nodes('a')
self.assertEquals(len(self.ring.devs), 7)
self.intended_devs.append(
{'id': 6, 'region': 0, 'zone': 5, 'weight': 1.0,
'ip': '10.6.6.6', 'port': 6000})
ring.RingData(
self.intended_replica2part2dev_id,
self.intended_devs, self.intended_part_shift).save(self.testgz)
sleep(0.1)
self.ring.get_more_nodes(part).next()
self.assertEquals(len(self.ring.devs), 8)
self.assertNotEquals(self.ring._mtime, orig_mtime)
os.utime(self.testgz, (time() - 300, time() - 300))
self.ring = ring.Ring(self.testdir, reload_time=0.001,
ring_name='whatever')
orig_mtime = self.ring._mtime
self.assertEquals(len(self.ring.devs), 8)
self.intended_devs.append(
{'id': 5, 'region': 0, 'zone': 4, 'weight': 1.0,
'ip': '10.5.5.5', 'port': 6000})
ring.RingData(
self.intended_replica2part2dev_id,
self.intended_devs, self.intended_part_shift).save(self.testgz)
sleep(0.1)
self.assertEquals(len(self.ring.devs), 9)
self.assertNotEquals(self.ring._mtime, orig_mtime)
def test_reload_without_replication(self):
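        # Devices saved without replication_ip/replication_port should load
        # with those fields defaulted to the device's own ip and port.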
replication_less_devs = [{'id': 0, 'region': 0, 'zone': 0,
'weight': 1.0, 'ip': '10.1.1.1',
'port': 6000},
{'id': 1, 'region': 0, 'zone': 0,
'weight': 1.0, 'ip': '10.1.1.1',
'port': 6000},
None,
{'id': 3, 'region': 0, 'zone': 2,
'weight': 1.0, 'ip': '10.1.2.1',
'port': 6000},
{'id': 4, 'region': 0, 'zone': 2,
'weight': 1.0, 'ip': '10.1.2.2',
'port': 6000}]
intended_devs = [{'id': 0, 'region': 0, 'zone': 0, 'weight': 1.0,
'ip': '10.1.1.1', 'port': 6000,
'replication_ip': '10.1.1.1',
'replication_port': 6000},
{'id': 1, 'region': 0, 'zone': 0, 'weight': 1.0,
'ip': '10.1.1.1', 'port': 6000,
'replication_ip': '10.1.1.1',
'replication_port': 6000},
None,
{'id': 3, 'region': 0, 'zone': 2, 'weight': 1.0,
'ip': '10.1.2.1', 'port': 6000,
'replication_ip': '10.1.2.1',
'replication_port': 6000},
{'id': 4, 'region': 0, 'zone': 2, 'weight': 1.0,
'ip': '10.1.2.2', 'port': 6000,
'replication_ip': '10.1.2.2',
'replication_port': 6000}]
testgz = os.path.join(self.testdir, 'without_replication.ring.gz')
ring.RingData(
self.intended_replica2part2dev_id,
replication_less_devs, self.intended_part_shift).save(testgz)
self.ring = ring.Ring(
self.testdir,
reload_time=self.intended_reload_time,
ring_name='without_replication')
self.assertEquals(self.ring.devs, intended_devs)
def test_reload_old_style_pickled_ring(self):
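        # Old pickled rings predate regions and the replication network; on
        # load, the missing fields should default to region 1 and the device's
        # own ip/port.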
devs = [{'id': 0, 'zone': 0,
'weight': 1.0, 'ip': '10.1.1.1',
'port': 6000},
{'id': 1, 'zone': 0,
'weight': 1.0, 'ip': '10.1.1.1',
'port': 6000},
None,
{'id': 3, 'zone': 2,
'weight': 1.0, 'ip': '10.1.2.1',
'port': 6000},
{'id': 4, 'zone': 2,
'weight': 1.0, 'ip': '10.1.2.2',
'port': 6000}]
intended_devs = [{'id': 0, 'region': 1, 'zone': 0, 'weight': 1.0,
'ip': '10.1.1.1', 'port': 6000,
'replication_ip': '10.1.1.1',
'replication_port': 6000},
{'id': 1, 'region': 1, 'zone': 0, 'weight': 1.0,
'ip': '10.1.1.1', 'port': 6000,
'replication_ip': '10.1.1.1',
'replication_port': 6000},
None,
{'id': 3, 'region': 1, 'zone': 2, 'weight': 1.0,
'ip': '10.1.2.1', 'port': 6000,
'replication_ip': '10.1.2.1',
'replication_port': 6000},
{'id': 4, 'region': 1, 'zone': 2, 'weight': 1.0,
'ip': '10.1.2.2', 'port': 6000,
'replication_ip': '10.1.2.2',
'replication_port': 6000}]
# simulate an old-style pickled ring
testgz = os.path.join(self.testdir,
'without_replication_or_region.ring.gz')
ring_data = ring.RingData(self.intended_replica2part2dev_id,
devs,
self.intended_part_shift)
# an old-style pickled ring won't have region data
for dev in ring_data.devs:
if dev:
del dev["region"]
gz_file = GzipFile(testgz, 'wb')
pickle.dump(ring_data, gz_file, protocol=2)
gz_file.close()
self.ring = ring.Ring(
self.testdir,
reload_time=self.intended_reload_time,
ring_name='without_replication_or_region')
self.assertEquals(self.ring.devs, intended_devs)
def test_get_part(self):
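        # get_part() plus get_part_nodes() must agree with get_nodes().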
part1 = self.ring.get_part('a')
nodes1 = self.ring.get_part_nodes(part1)
part2, nodes2 = self.ring.get_nodes('a')
self.assertEquals(part1, part2)
self.assertEquals(nodes1, nodes2)
def test_get_part_nodes(self):
part, nodes = self.ring.get_nodes('a')
self.assertEquals(nodes, self.ring.get_part_nodes(part))
def test_get_nodes(self):
# Yes, these tests are deliberately very fragile. We want to make sure
        # that if someone changes the results the ring produces, they know it.
self.assertRaises(TypeError, self.ring.get_nodes)
part, nodes = self.ring.get_nodes('a')
self.assertEquals(part, 0)
self.assertEquals(nodes, [self.intended_devs[0],
self.intended_devs[3]])
part, nodes = self.ring.get_nodes('a1')
self.assertEquals(part, 0)
self.assertEquals(nodes, [self.intended_devs[0],
self.intended_devs[3]])
part, nodes = self.ring.get_nodes('a4')
self.assertEquals(part, 1)
self.assertEquals(nodes, [self.intended_devs[1],
self.intended_devs[4]])
part, nodes = self.ring.get_nodes('aa')
self.assertEquals(part, 1)
self.assertEquals(nodes, [self.intended_devs[1],
self.intended_devs[4]])
part, nodes = self.ring.get_nodes('a', 'c1')
self.assertEquals(part, 0)
self.assertEquals(nodes, [self.intended_devs[0],
self.intended_devs[3]])
part, nodes = self.ring.get_nodes('a', 'c0')
self.assertEquals(part, 3)
self.assertEquals(nodes, [self.intended_devs[1],
self.intended_devs[4]])
part, nodes = self.ring.get_nodes('a', 'c3')
self.assertEquals(part, 2)
self.assertEquals(nodes, [self.intended_devs[0],
self.intended_devs[3]])
part, nodes = self.ring.get_nodes('a', 'c2')
self.assertEquals(part, 2)
self.assertEquals(nodes, [self.intended_devs[0],
self.intended_devs[3]])
part, nodes = self.ring.get_nodes('a', 'c', 'o1')
self.assertEquals(part, 1)
self.assertEquals(nodes, [self.intended_devs[1],
self.intended_devs[4]])
part, nodes = self.ring.get_nodes('a', 'c', 'o5')
self.assertEquals(part, 0)
self.assertEquals(nodes, [self.intended_devs[0],
self.intended_devs[3]])
part, nodes = self.ring.get_nodes('a', 'c', 'o0')
self.assertEquals(part, 0)
self.assertEquals(nodes, [self.intended_devs[0],
self.intended_devs[3]])
part, nodes = self.ring.get_nodes('a', 'c', 'o2')
self.assertEquals(part, 2)
self.assertEquals(nodes, [self.intended_devs[0],
self.intended_devs[3]])
def add_dev_to_ring(self, new_dev):
self.ring.devs.append(new_dev)
self.ring._rebuild_tier_data()
def test_get_more_nodes(self):
# Yes, these tests are deliberately very fragile. We want to make sure
# that if someone changes the results the ring produces, they know it.
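# For context: get_more_nodes() yields handoff devices one at a time (it
# is a generator), which is why the assertions below either wrap it in
# list() or pull single values with .next(). A typical (hypothetical)
# consumer would look like:
#
#     for node in r.get_more_nodes(part):
#         ...  # try this handoff device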
exp_part = 6
exp_devs = [48, 93, 96]
exp_zones = set([5, 8, 9])
exp_handoffs = [11, 47, 25, 76, 69, 23, 99, 59, 106, 64, 43, 34, 88, 3,
30, 83, 16, 27, 103, 39, 60, 0, 8, 72, 56, 19, 91, 13,
84, 38, 66, 52, 78, 107, 50, 57, 31, 32, 77, 24, 42,
100, 71, 26, 9, 20, 35, 5, 14, 94, 28, 41, 18, 102,
101, 61, 95, 21, 81, 1, 105, 58, 74, 90, 86, 46, 4, 68,
40, 80, 54, 75, 45, 79, 44, 49, 62, 29, 7, 15, 70, 87,
65, 12, 82, 17, 104, 97, 55, 22, 6, 89, 2, 67, 37, 63,
53, 92, 33, 85, 73, 51, 98, 36, 10]
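# 105 handoffs in all: every device in the ring except the 3 primaries.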
exp_first_handoffs = [1, 37, 48, 68, 84, 75, 11, 101, 14, 73, 100, 75,
29, 19, 18, 101, 15, 99, 95, 24, 46, 82, 73, 62,
24, 89, 9, 22, 107, 74, 54, 63, 40, 106, 99, 83,
64, 73, 73, 106, 106, 80, 6, 25, 20, 33, 6, 79,
59, 42, 62, 24, 14, 107, 28, 0, 85, 5, 4, 12, 58,
11, 92, 18, 36, 56, 86, 1, 21, 33, 80, 97, 4, 81,
79, 76, 89, 50, 75, 27, 7, 96, 47, 55, 81, 104,
12, 5, 18, 106, 27, 93, 39, 92, 42, 30, 20, 88,
58, 105, 65, 29, 17, 52, 11, 106, 7, 24, 21, 91,
62, 52, 50, 31, 77, 102, 19, 11, 8, 58, 53, 20,
26, 8, 18, 82, 48, 68, 82, 89, 101, 50, 3, 52,
46, 11, 2, 30, 79, 66, 4, 61, 3, 56, 45, 102, 73,
84, 36, 19, 34, 84, 49, 40, 103, 66, 31, 33, 93,
33, 4, 52, 26, 58, 30, 47, 100, 57, 40, 79, 33,
107, 24, 20, 44, 4, 7, 59, 83, 101, 1, 56, 20,
61, 33, 16, 5, 74, 98, 4, 80, 15, 104, 52, 73,
18, 67, 75, 98, 73, 79, 68, 75, 27, 91, 36, 100,
52, 95, 37, 46, 70, 14, 47, 3, 70, 23, 40, 105,
62, 86, 48, 22, 54, 4, 72, 81, 13, 0, 18, 98,
101, 36, 29, 24, 39, 79, 97, 105, 28, 107, 47,
52, 101, 20, 22, 29, 65, 27, 7, 33, 64, 101, 60,
19, 55]
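# exp_first_handoffs has one entry per partition: the builder below uses
# a part_power of 8, so there are 2 ** 8 == 256 partitions.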
rb = ring.RingBuilder(8, 3, 1)
next_dev_id = 0
for zone in xrange(1, 10):
for server in xrange(1, 5):
for device in xrange(1, 4):
rb.add_dev({'id': next_dev_id,
'ip': '1.2.%d.%d' % (zone, server),
'port': 1234, 'zone': zone, 'region': 0,
'weight': 1.0})
next_dev_id += 1
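# The ring now holds 9 zones * 4 servers per zone * 3 devices per server
# = 108 devices, with ids 0 through 107.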
rb.rebalance(seed=1)
rb.get_ring().save(self.testgz)
r = ring.Ring(self.testdir, ring_name='whatever')
part, devs = r.get_nodes('a', 'c', 'o')
primary_zones = set([d['zone'] for d in devs])
self.assertEquals(part, exp_part)
self.assertEquals([d['id'] for d in devs], exp_devs)
self.assertEquals(primary_zones, exp_zones)
devs = list(r.get_more_nodes(part))
self.assertEquals([d['id'] for d in devs], exp_handoffs)
# The first 6 handoff nodes plus the 3 primary nodes should cover all 9
# zones in this test
seen_zones = set(primary_zones)
seen_zones.update([d['zone'] for d in devs[:6]])
self.assertEquals(seen_zones, set(range(1, 10)))
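# (Handoff order is zone-first: devices in zones holding no primary are
# offered before devices that only differ by ip/port or device id, which
# is why the first 6 handoffs land in the 6 remaining zones.)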
# The first handoff nodes for each partition in the ring
devs = []
for part in xrange(r.partition_count):
devs.append(r.get_more_nodes(part).next()['id'])
self.assertEquals(devs, exp_first_handoffs)
# Add a new device we can handoff to.
zone = 5
server = 0
rb.add_dev({'id': next_dev_id,
'ip': '1.2.%d.%d' % (zone, server),
'port': 1234, 'zone': zone, 'region': 0, 'weight': 1.0})
next_dev_id += 1
rb.rebalance(seed=1)
rb.get_ring().save(self.testgz)
r = ring.Ring(self.testdir, ring_name='whatever')
# We would change expectations here, but in this test no handoffs
# changed at all.
part, devs = r.get_nodes('a', 'c', 'o')
primary_zones = set([d['zone'] for d in devs])
self.assertEquals(part, exp_part)
self.assertEquals([d['id'] for d in devs], exp_devs)
self.assertEquals(primary_zones, exp_zones)
devs = list(r.get_more_nodes(part))
dev_ids = [d['id'] for d in devs]
self.assertEquals(len(dev_ids), len(exp_handoffs))
for index, dev in enumerate(dev_ids):
self.assertEquals(
dev, exp_handoffs[index],
'handoff differs at position %d\n%s\n%s' % (
index, dev_ids[index:], exp_handoffs[index:]))
# The handoffs still cover all the non-primary zones first
seen_zones = set(primary_zones)
seen_zones.update([d['zone'] for d in devs[:6]])
self.assertEquals(seen_zones, set(range(1, 10)))
devs = []
for part in xrange(r.partition_count):
devs.append(r.get_more_nodes(part).next()['id'])
for part in xrange(r.partition_count):
self.assertEquals(
devs[part], exp_first_handoffs[part],
'handoff for partition %d is now device id %d' % (
part, devs[part]))
# Remove a device.
rb.remove_dev(0)
rb.rebalance(seed=1)
rb.get_ring().save(self.testgz)
r = ring.Ring(self.testdir, ring_name='whatever')
# Change expectations
# The long string of handoff nodes for the partition stayed the same
# for the first 20 entries, which is pretty good.
exp_handoffs[20:] = [60, 108, 8, 72, 56, 19, 91, 13, 84, 38, 66, 52,
1, 78, 107, 50, 57, 31, 32, 77, 24, 42, 100, 71,
26, 9, 20, 35, 5, 14, 94, 28, 41, 18, 102, 101,
61, 95, 21, 81, 105, 58, 74, 90, 86, 46, 4, 68,
40, 80, 54, 75, 45, 79, 44, 49, 62, 29, 7, 15, 70,
87, 65, 12, 82, 17, 104, 97, 55, 22, 6, 89, 2, 67,
37, 63, 53, 92, 33, 85, 73, 51, 98, 36, 10]
# Just a few of the first handoffs changed
exp_first_handoffs[3] = 68
exp_first_handoffs[55] = 104
exp_first_handoffs[116] = 6
exp_first_handoffs[181] = 15
exp_first_handoffs[228] = 38
# Test
part, devs = r.get_nodes('a', 'c', 'o')
primary_zones = set([d['zone'] for d in devs])
self.assertEquals(part, exp_part)
self.assertEquals([d['id'] for d in devs], exp_devs)
self.assertEquals(primary_zones, exp_zones)
devs = list(r.get_more_nodes(part))
dev_ids = [d['id'] for d in devs]
self.assertEquals(len(dev_ids), len(exp_handoffs))
for index, dev in enumerate(dev_ids):
self.assertEquals(
dev, exp_handoffs[index],
'handoff differs at position %d\n%s\n%s' % (
index, dev_ids[index:], exp_handoffs[index:]))
seen_zones = set(primary_zones)
seen_zones.update([d['zone'] for d in devs[:6]])
self.assertEquals(seen_zones, set(range(1, 10)))
devs = []
for part in xrange(r.partition_count):
devs.append(r.get_more_nodes(part).next()['id'])
for part in xrange(r.partition_count):
self.assertEquals(
devs[part], exp_first_handoffs[part],
'handoff for partition %d is now device id %d' % (
part, devs[part]))
# Add a partial replica
rb.set_replicas(3.5)
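# A fractional replica count means only some partitions get the extra
# replica: with 3.5 replicas, roughly half of the 256 partitions end up
# with 4 primary devices while the rest keep 3 (both cases are checked
# below).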
rb.rebalance(seed=1)
rb.get_ring().save(self.testgz)
r = ring.Ring(self.testdir, ring_name='whatever')
# Change expectations
# We have another replica now
exp_devs.append(47)
exp_zones.add(4)
# This caused some major changes in the sequence of handoffs for our
# test partition, but at least the first handoff stayed the same.
exp_handoffs[1:] = [81, 25, 69, 23, 99, 59, 76, 3, 106, 64, 43, 13, 34,
88, 30, 16, 27, 103, 39, 74, 60, 108, 8, 56, 19,
91, 52, 84, 38, 66, 1, 78, 45, 107, 50, 57, 83, 31,
46, 32, 77, 24, 42, 63, 100, 72, 71, 7, 26, 9, 20,
35, 5, 87, 14, 94, 62, 28, 41, 90, 18, 82, 102, 22,
101, 61, 85, 95, 21, 98, 67, 105, 58, 86, 4, 79,
68, 40, 80, 54, 75, 44, 49, 6, 29, 15, 70, 65, 12,
17, 104, 97, 55, 89, 2, 37, 53, 92, 33, 73, 51, 36,
10]
# Lots of first handoffs changed, but 30 of 256 is still just 11.72%.
exp_first_handoffs[1] = 6
exp_first_handoffs[4] = 104
exp_first_handoffs[11] = 106
exp_first_handoffs[17] = 13
exp_first_handoffs[21] = 77
exp_first_handoffs[22] = 95
exp_first_handoffs[27] = 46
exp_first_handoffs[29] = 65
exp_first_handoffs[30] = 3
exp_first_handoffs[31] = 20
exp_first_handoffs[51] = 50
exp_first_handoffs[53] = 8
exp_first_handoffs[54] = 2
exp_first_handoffs[72] = 107
exp_first_handoffs[79] = 72
exp_first_handoffs[85] = 71
exp_first_handoffs[88] = 66
exp_first_handoffs[92] = 29
exp_first_handoffs[93] = 46
exp_first_handoffs[96] = 38
exp_first_handoffs[101] = 57
exp_first_handoffs[103] = 87
exp_first_handoffs[104] = 28
exp_first_handoffs[107] = 1
exp_first_handoffs[109] = 69
exp_first_handoffs[110] = 50
exp_first_handoffs[111] = 76
exp_first_handoffs[115] = 47
exp_first_handoffs[117] = 48
exp_first_handoffs[119] = 7
# Test
part, devs = r.get_nodes('a', 'c', 'o')
primary_zones = set([d['zone'] for d in devs])
self.assertEquals(part, exp_part)
self.assertEquals([d['id'] for d in devs], exp_devs)
self.assertEquals(primary_zones, exp_zones)
devs = list(r.get_more_nodes(part))
dev_ids = [d['id'] for d in devs]
self.assertEquals(len(dev_ids), len(exp_handoffs))
for index, dev in enumerate(dev_ids):
self.assertEquals(
dev, exp_handoffs[index],
'handoff differs at position %d\n%s\n%s' % (
index, dev_ids[index:], exp_handoffs[index:]))
seen_zones = set(primary_zones)
seen_zones.update([d['zone'] for d in devs[:6]])
self.assertEquals(seen_zones, set(range(1, 10)))
devs = []
for part in xrange(r.partition_count):
devs.append(r.get_more_nodes(part).next()['id'])
for part in xrange(r.partition_count):
self.assertEquals(
devs[part], exp_first_handoffs[part],
'handoff for partition %d is now device id %d' % (
part, devs[part]))
# One last test of a partial replica partition
exp_part2 = 136
exp_devs2 = [52, 76, 97]
exp_zones2 = set([9, 5, 7])
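# This partition has only 3 primaries; it is one of the partitions that
# the fractional (0.5) replica does not cover.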
exp_handoffs2 = [2, 67, 37, 92, 33, 23, 107, 63, 44, 103, 108, 85,
73, 10, 89, 80, 4, 17, 49, 32, 12, 41, 58, 20, 25,
61, 94, 47, 69, 56, 101, 28, 83, 8, 96, 53, 51, 42,
98, 35, 36, 84, 43, 104, 31, 65, 1, 40, 9, 74, 95,
45, 5, 71, 86, 78, 30, 93, 48, 91, 15, 88, 39, 18,
57, 72, 70, 27, 54, 16, 24, 21, 14, 11, 77, 62, 50,
6, 105, 26, 55, 29, 60, 34, 13, 87, 59, 38, 99, 75,
106, 3, 82, 66, 79, 7, 46, 64, 81, 22, 68, 19, 102,
90, 100]
part2, devs2 = r.get_nodes('a', 'c', 'o2')
primary_zones2 = set([d['zone'] for d in devs2])
self.assertEquals(part2, exp_part2)
self.assertEquals([d['id'] for d in devs2], exp_devs2)
self.assertEquals(primary_zones2, exp_zones2)
devs2 = list(r.get_more_nodes(part2))
dev_ids2 = [d['id'] for d in devs2]
self.assertEquals(len(dev_ids2), len(exp_handoffs2))
for index, dev in enumerate(dev_ids2):
self.assertEquals(
dev, exp_handoffs2[index],
'handoff differs at position %d\n%s\n%s' % (
index, dev_ids2[index:], exp_handoffs2[index:]))
seen_zones = set(primary_zones2)
seen_zones.update([d['zone'] for d in devs2[:6]])
self.assertEquals(seen_zones, set(range(1, 10)))
# Test distribution across regions
rb.set_replicas(3)
for region in xrange(1, 5):
rb.add_dev({'id': next_dev_id,
'ip': '1.%d.1.%d' % (region, server), 'port': 1234,
'zone': 1, 'region': region, 'weight': 1.0})
next_dev_id += 1
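# One new device in each of regions 1 through 4; everything added
# earlier lives in region 0, so there are now 5 regions.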
rb.pretend_min_part_hours_passed()
rb.rebalance(seed=1)
rb.pretend_min_part_hours_passed()
rb.rebalance(seed=1)
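# Rebalance twice, faking the min_part_hours wait in between, so the
# newly added regions have a chance to take on partitions.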
rb.get_ring().save(self.testgz)
r = ring.Ring(self.testdir, ring_name='whatever')
# There are 5 regions now, so the primary nodes plus the first 2
# handoffs should span all 5 regions
part, devs = r.get_nodes('a1', 'c1', 'o1')
primary_regions = set([d['region'] for d in devs])
primary_zones = set([(d['region'], d['zone']) for d in devs])
more_devs = list(r.get_more_nodes(part))
seen_regions = set(primary_regions)
seen_regions.update([d['region'] for d in more_devs[:2]])
self.assertEquals(seen_regions, set(range(0, 5)))
# There are 13 zones now, so the first 13 nodes should all have
# distinct zones (that's r0z1, r0z2, ..., r0z9, r1z1, r2z1, r3z1, and
# r4z1).
seen_zones = set(primary_zones)
seen_zones.update([(d['region'], d['zone']) for d in more_devs[:10]])
self.assertEquals(13, len(seen_zones))
# Here's a brittle canary-in-the-coalmine test to make sure the region
# handoff computation didn't change accidentally
exp_handoffs = [111, 112, 74, 54, 93, 31, 2, 43, 100, 22, 71, 92, 35,
9, 50, 41, 76, 80, 84, 88, 17, 96, 6, 102, 37, 29,
105, 5, 47, 20, 13, 108, 66, 81, 53, 65, 25, 58, 32,
94, 101, 1, 10, 44, 73, 75, 21, 97, 28, 106, 30, 16,
39, 77, 42, 72, 34, 99, 14, 61, 90, 4, 40, 3, 45, 62,
7, 15, 87, 12, 83, 89, 33, 98, 49, 107, 56, 86, 48,
57, 24, 11, 23, 26, 46, 64, 69, 38, 36, 79, 63, 104,
51, 70, 82, 67, 68, 8, 95, 91, 55, 59, 85]
dev_ids = [d['id'] for d in more_devs]
self.assertEquals(len(dev_ids), len(exp_handoffs))
for index, dev_id in enumerate(dev_ids):
self.assertEquals(
dev_id, exp_handoffs[index],
'handoff differs at position %d\n%s\n%s' % (
index, dev_ids[index:], exp_handoffs[index:]))
if __name__ == '__main__':
unittest.main()