bb509dd863
This commit introduces a new algorithm for assigning partition replicas to devices. Basically, the ring builder organizes the devices into tiers (first zone, then IP/port, then device ID). When placing a replica, the ring builder looks for the emptiest device (biggest parts_wanted) in the furthest-away tier. In the case where zone-count >= replica-count, the new algorithm will give the same results as the one it replaces. Thus, no migration is needed. In the case where zone-count < replica-count, the new algorithm behaves differently from the old algorithm. The new algorithm will distribute things evenly at each tier so that the replication is as high-quality as possible, given the circumstances. The old algorithm would just crash, so again, no migration is needed. Handoffs have also been updated to use the new algorithm. When generating handoff nodes, first the ring looks for nodes in other zones, then other ips/ports, then any other drive. The first handoff nodes (the ones in other zones) will be the same as before; this commit just extends the list of handoff nodes. The proxy server and replicators have been altered to avoid looking at the ring's replica count directly. Previously, with a replica count of C, RingData.get_nodes() and RingData.get_part_nodes() would return lists of length C, so some other code used the replica count when it needed the number of nodes. If two of a partition's replicas are on the same device (e.g. with 3 replicas, 2 devices), then that assumption is no longer true. Fortunately, all the proxy server and replicators really needed was the number of nodes returned, which they already had. (Bonus: now the only code that mentions replica_count directly is in the ring and the ring builder.) Change-Id: Iba2929edfc6ece89791890d0635d4763d821a3aa
90 lines
2.8 KiB
Python
90 lines
2.8 KiB
Python
from collections import defaultdict
|
|
|
|
|
|
def tiers_for_dev(dev):
    """
    Build the chain of tiers a device belongs to, shortest tier first.

    The tiers are nested prefixes of one another: the zone alone, then the
    zone plus an "ip:port" string, then the zone plus "ip:port" plus the
    device id.

    :param dev: device dict; 'zone' and 'id' are looked up directly (a
                missing key raises KeyError), while 'ip' and 'port' are
                read with ``dict.get`` and therefore render as "None"
                when absent
    :returns: tuple of tiers
    """
    zone = dev['zone']
    ip_port = "{ip}:{port}".format(ip=dev.get('ip'), port=dev.get('port'))
    deepest_tier = (zone, ip_port, dev['id'])

    # Every tier is a prefix of the deepest (device-level) tier.
    return tuple(deepest_tier[:depth]
                 for depth in range(1, len(deepest_tier) + 1))
|
|
|
|
|
|
def build_tier_tree(devices):
    """
    Construct the tier tree from the zone layout.

    The tier tree is a dictionary that maps tiers to their child tiers.
    A synthetic root node of () is generated so that there's one tree,
    not a forest.

    Example:

    zone 1 -+---- 192.168.1.1:6000 -+---- device id 0
            |                       |
            |                       +---- device id 1
            |                       |
            |                       +---- device id 2
            |
            +---- 192.168.1.2:6000 -+---- device id 3
                                    |
                                    +---- device id 4
                                    |
                                    +---- device id 5


    zone 2 -+---- 192.168.2.1:6000 -+---- device id 6
            |                       |
            |                       +---- device id 7
            |                       |
            |                       +---- device id 8
            |
            +---- 192.168.2.2:6000 -+---- device id 9
                                    |
                                    +---- device id 10
                                    |
                                    +---- device id 11

    The tier tree would look like (each value is a set of child tiers):
    {
      (): {(1,), (2,)},

      (1,): {(1, 192.168.1.1:6000),
             (1, 192.168.1.2:6000)},
      (2,): {(2, 192.168.2.1:6000),
             (2, 192.168.2.2:6000)},

      (1, 192.168.1.1:6000): {(1, 192.168.1.1:6000, 0),
                              (1, 192.168.1.1:6000, 1),
                              (1, 192.168.1.1:6000, 2)},
      (1, 192.168.1.2:6000): {(1, 192.168.1.2:6000, 3),
                              (1, 192.168.1.2:6000, 4),
                              (1, 192.168.1.2:6000, 5)},
      (2, 192.168.2.1:6000): {(2, 192.168.2.1:6000, 6),
                              (2, 192.168.2.1:6000, 7),
                              (2, 192.168.2.1:6000, 8)},
      (2, 192.168.2.2:6000): {(2, 192.168.2.2:6000, 9),
                              (2, 192.168.2.2:6000, 10),
                              (2, 192.168.2.2:6000, 11)},
    }

    Device-level tiers (the three-element tuples) only ever appear as
    children; they are never added as keys by this function.

    :param devices: iterable of device dicts from which to generate the
                    tree
    :returns: tier tree, a ``defaultdict(set)`` mapping each tier to the
              set of its child tiers; because it is a defaultdict,
              indexing a tier that is not present inserts and returns an
              empty set rather than raising KeyError
    """
    tier2children = defaultdict(set)
    for dev in devices:
        for tier in tiers_for_dev(dev):
            # A tier's parent is the tier without its last element. For a
            # one-element (zone) tier that slice is (), the synthetic
            # root, so no special case is needed.
            tier2children[tier[:-1]].add(tier)
    return tier2children
|