Let admins add a region without melting their cluster
Prior to this commit, swift-ring-builder placed partitions on devices by first going for maximal dispersion and breaking ties with device weight. This commit flips the order so that device weight trumps dispersion.

Note: if your ring can be balanced, you won't see a behavior change. This commit changes anything only when device weights and maximal dispersion come into conflict.

Example: a cluster with two regions. Region 1 has a combined weight of 1000, while region 2 has a combined weight of only 400. The ring has 3 replicas and 2^16 partitions, so there are 3 * 2^16 replica assignments to hand out.

Prior to this commit, the balance would look like so:

    Region 1: 2 * 2^16 partition replicas
    Region 2: 2^16 partition replicas

After this commit, the balance will be:

    Region 1: 10/14 * 3 * 2^16 partition replicas (more than before)
    Region 2: 4/14 * 3 * 2^16 partition replicas (fewer than before)

One consequence of this is that some partitions will not have a replica in region 2, since it's not big enough to hold all of them.

This way, a cluster operator can add a new region to a single-region cluster gradually so as not to destroy their WAN link with replication traffic. As device weights are increased in the second region, more replicas will shift over to it. Once its weight is half that of the first region's, every partition will have a replica there.

DocImpact
Change-Id: I945abcc4a2917bb12be554b640f7507dd23cd0da
parent 8fecf490fe
commit 6d77c379bd
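To make the arithmetic in the example above concrete, here is a rough sketch (not part of this commit) that builds the two-region ring from the commit message with the post-change builder and tallies replica assignments per region, using the same RingBuilder calls that appear in the tests below. The device layout, IPs, and ports are invented purely for illustration, and exact totals will differ slightly because parts_wanted is rounded up per device.

    from collections import defaultdict

    from swift.common import ring

    # 2^16 partitions, 3 replicas, min_part_hours of 1
    rb = ring.RingBuilder(16, 3, 1)

    # Region 0: combined weight 1000; region 1: combined weight 400.
    # The split across devices is made up for this sketch.
    for dev_id, (region, weight) in enumerate([(0, 500), (0, 500),
                                               (1, 200), (1, 200)]):
        rb.add_dev({'id': dev_id, 'region': region, 'zone': dev_id,
                    'weight': weight, 'ip': '127.0.0.1',
                    'port': 10000 + dev_id, 'device': 'sda1'})
    rb.rebalance()

    # Tally replica assignments per region, the same way the new
    # test helper _get_population_by_region does.
    population_by_region = defaultdict(int)
    r = rb.get_ring()
    for part2dev_id in r._replica2part2dev_id:
        for dev_id in part2dev_id:
            population_by_region[r.devs[dev_id]['region']] += 1

    # With weight winning over dispersion, expect roughly:
    #   region 0: 10/14 * 3 * 2^16  (~140k replica assignments)
    #   region 1:  4/14 * 3 * 2^16  (~56k replica assignments)
    print(dict(population_by_region))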
@@ -326,8 +326,7 @@ class RingBuilder(object):
        that before). Because of this, it keeps rebalancing until the device
        skew (number of partitions a device wants compared to what it has) gets
        below 1% or doesn't change by more than 1% (only happens with ring that
-        can't be balanced no matter what -- like with 3 zones of differing
-        weights with replicas set to 3).
+        can't be balanced no matter what).

        :returns: (number_of_partitions_altered, resulting_balance)
        """
@@ -517,8 +516,18 @@ class RingBuilder(object):
                # indicate its strong desire to give up everything it has.
                dev['parts_wanted'] = -self.parts * self.replicas
            else:
-                dev['parts_wanted'] = \
-                    int(weight_of_one_part * dev['weight']) - dev['parts']
+                dev['parts_wanted'] = (
+                    # Round up here so that every partition ultimately ends up
+                    # with a placement.
+                    #
+                    # Imagine 5 partitions to be placed on 4 devices. If we
+                    # didn't use math.ceil() here, each device would have a
+                    # parts_wanted of 1, so 4 partitions would be placed but
+                    # the last would not, probably resulting in a crash. This
+                    # way, some devices end up with leftover parts_wanted, but
+                    # at least every partition ends up somewhere.
+                    int(math.ceil(weight_of_one_part * dev['weight'])) -
+                    dev['parts'])

    def _adjust_replica2part2dev_size(self):
        """
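A tiny standalone illustration (not part of the diff) of the rounding argument in the new comment: with 5 partitions, 1 replica, and 4 equal-weight devices, flooring leaves one partition with nowhere to go, while math.ceil leaves a little slack on each device instead. This assumes weight_of_one_part is simply parts * replicas divided by the total weight.

    import math

    parts, replicas = 5, 1
    weights = [1, 1, 1, 1]  # four equal devices
    weight_of_one_part = parts * replicas / float(sum(weights))  # 1.25

    floored = [int(weight_of_one_part * w) for w in weights]
    ceiled = [int(math.ceil(weight_of_one_part * w)) for w in weights]

    print(floored, sum(floored))  # [1, 1, 1, 1] 4 -> one partition is homeless
    print(ceiled, sum(ceiled))    # [2, 2, 2, 2] 8 -> slack, but every part fits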
@@ -754,9 +763,24 @@ class RingBuilder(object):
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
+        parts_available_in_tier = defaultdict(int)
        for dev in self._iter_devs():
            dev['sort_key'] = self._sort_key_for(dev)
-            dev['tiers'] = tiers_for_dev(dev)
+            tiers = tiers_for_dev(dev)
+            dev['tiers'] = tiers
+            for tier in tiers:
+                # Note: this represents how many partitions may be assigned to
+                # a given tier (region/zone/server/disk). It does not take
+                # into account how many partitions a given tier wants to shed.
+                #
+                # If we did not do this, we could have a zone where, at some
+                # point during assignment, number-of-parts-to-gain equals
+                # number-of-parts-to-shed. At that point, no further placement
+                # into that zone would occur since its parts_available_in_tier
+                # would be 0. This would happen any time a zone had any device
+                # with partitions to shed, which is any time a device is being
+                # removed, which is a pretty frequent operation.
+                parts_available_in_tier[tier] += max(dev['parts_wanted'], 0)

        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
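A simplified standalone sketch (toy tier tuples, not the builder's real data structures) of why the new code clamps at zero: a draining device must not subtract capacity from its zone, or the zone could refuse placements even though other devices in it still want parts.

    from collections import defaultdict

    # Toy devices: a ('region', 'zone') tier plus how many parts each wants.
    # A negative parts_wanted means the device is shedding (e.g. being removed).
    devs = [
        {'tier': (0, 0), 'parts_wanted': 10},
        {'tier': (0, 0), 'parts_wanted': -10},  # draining device, same zone
        {'tier': (0, 1), 'parts_wanted': 10},
    ]

    with_clamp = defaultdict(int)
    without_clamp = defaultdict(int)
    for dev in devs:
        with_clamp[dev['tier']] += max(dev['parts_wanted'], 0)
        without_clamp[dev['tier']] += dev['parts_wanted']

    print(dict(with_clamp))     # {(0, 0): 10, (0, 1): 10} -> zone 0 still accepts
    print(dict(without_clamp))  # {(0, 0): 0, (0, 1): 10}  -> zone 0 looks full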
@@ -795,23 +819,25 @@ class RingBuilder(object):
            # Gather up what other tiers (regions, zones, ip/ports, and
            # devices) the replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(int)
-            unique_tiers_by_tier_len = defaultdict(set)
+            occupied_tiers_by_tier_len = defaultdict(set)
            for replica in self._replicas_for_part(part):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in dev['tiers']:
                        other_replicas[tier] += 1
-                        unique_tiers_by_tier_len[len(tier)].add(tier)
+                        occupied_tiers_by_tier_len[len(tier)].add(tier)

            for replica in replace_replicas:
                # Find a new home for this replica
                tier = ()
                depth = 1
                while depth <= max_tier_depth:
                    # Order the tiers by how many replicas of this
                    # partition they already have. Then, of the ones
-                    # with the smallest number of replicas, pick the
-                    # tier with the hungriest drive and then continue
-                    # searching in that subtree.
+                    # with the smallest number of replicas and that have
+                    # room to accept more partitions, pick the tier with
+                    # the hungriest drive and then continue searching in
+                    # that subtree.
                    #
                    # There are other strategies we could use here,
                    # such as hungriest-tier (i.e. biggest
@@ -819,10 +845,11 @@ class RingBuilder(object):
                    # However, hungriest-drive is what was used here
                    # before, and it worked pretty well in practice.
                    #
-                    # Note that this allocator will balance things as
-                    # evenly as possible at each level of the device
-                    # layout. If your layout is extremely unbalanced,
-                    # this may produce poor results.
+                    # Note that this allocator prioritizes even device
+                    # filling over dispersion, so if your layout is
+                    # extremely unbalanced, you may not get the replica
+                    # dispersion that you expect, and your durability
+                    # may be lessened.
                    #
                    # This used to be a cute, recursive function, but it's been
                    # unrolled for performance.
@@ -834,18 +861,28 @@ class RingBuilder(object):
                    # short-circuit the search while still ensuring we get the
                    # right tier.
                    candidates_with_replicas = \
-                        unique_tiers_by_tier_len[len(tier) + 1]
-                    # Find a tier with the minimal replica count and the
-                    # hungriest drive among all the tiers with the minimal
-                    # replica count.
-                    if len(tier2children[tier]) > \
+                        occupied_tiers_by_tier_len[len(tier) + 1]
+
+                    # Among the tiers with room for more partitions,
+                    # find one with the smallest possible number of
+                    # replicas already in it, breaking ties by which one
+                    # has the hungriest drive.
+                    candidates_with_room = [
+                        t for t in tier2children[tier]
+                        if parts_available_in_tier[t] > 0]
+
+                    if len(candidates_with_room) > \
                            len(candidates_with_replicas):
-                        # There exists at least one tier with 0 other replicas
-                        tier = max((t for t in tier2children[tier]
+                        # There exists at least one tier with room for
+                        # another partition and 0 other replicas already
+                        # in it, so we can use a faster search. The else
+                        # branch's search would work here, but it's
+                        # significantly slower.
+                        tier = max((t for t in candidates_with_room
                                    if other_replicas[t] == 0),
                                   key=tier2sort_key.__getitem__)
                    else:
-                        tier = max(tier2children[tier],
+                        tier = max(candidates_with_room,
                                   key=lambda t: (-other_replicas[t],
                                                  tier2sort_key[t]))
                    depth += 1
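A standalone sketch (hypothetical tier names and sort keys, not the builder's real tuples) of the selection rule the hunk above implements: among child tiers that still have room, pick the one with the fewest replicas of this partition already in it, breaking ties by the hungriest drive, i.e. the largest sort key.

    # Child tiers of the current tier: remaining capacity, replicas of this
    # partition already placed there, and a stand-in "hungriest drive" key.
    parts_available_in_tier = {'z0': 5, 'z1': 0, 'z2': 3}
    other_replicas = {'z0': 1, 'z1': 0, 'z2': 0}
    tier2sort_key = {'z0': 40, 'z1': 99, 'z2': 25}

    candidates_with_room = [t for t in parts_available_in_tier
                            if parts_available_in_tier[t] > 0]

    # z1 is excluded despite having zero replicas, because it has no room.
    # Of z0 and z2, z2 has fewer replicas, so it wins regardless of sort key.
    chosen = max(candidates_with_room,
                 key=lambda t: (-other_replicas[t], tier2sort_key[t]))
    print(chosen)  # 'z2'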
@@ -855,8 +892,9 @@ class RingBuilder(object):
                old_sort_key = dev['sort_key']
                new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                for tier in dev['tiers']:
+                    parts_available_in_tier[tier] -= 1
                    other_replicas[tier] += 1
-                    unique_tiers_by_tier_len[len(tier)].add(tier)
+                    occupied_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2dev_sort_key[tier],
                                               old_sort_key)
@@ -35,6 +35,19 @@ class TestRingBuilder(unittest.TestCase):
    def tearDown(self):
        rmtree(self.testdir, ignore_errors=1)

+    def _get_population_by_region(self, builder):
+        """
+        Returns a dictionary mapping region to number of partitions in that
+        region.
+        """
+        population_by_region = defaultdict(int)
+        r = builder.get_ring()
+        for part2dev_id in r._replica2part2dev_id:
+            for dev_id in part2dev_id:
+                dev = r.devs[dev_id]
+                population_by_region[dev['region']] += 1
+        return dict(population_by_region.items())
+
    def test_init(self):
        rb = ring.RingBuilder(8, 3, 1)
        self.assertEquals(rb.part_power, 8)
@@ -641,6 +654,92 @@ class TestRingBuilder(unittest.TestCase):

        rb.rebalance()

+    def test_region_fullness_with_balanceable_ring(self):
+        rb = ring.RingBuilder(8, 3, 1)
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
+
+        rb.add_dev({'id': 2, 'region': 1, 'zone': 0, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'})
+        rb.add_dev({'id': 3, 'region': 1, 'zone': 1, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10004, 'device': 'sda1'})
+
+        rb.add_dev({'id': 4, 'region': 2, 'zone': 0, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10005, 'device': 'sda1'})
+        rb.add_dev({'id': 5, 'region': 2, 'zone': 1, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10006, 'device': 'sda1'})
+
+        rb.add_dev({'id': 6, 'region': 3, 'zone': 0, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10007, 'device': 'sda1'})
+        rb.add_dev({'id': 7, 'region': 3, 'zone': 1, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10008, 'device': 'sda1'})
+        rb.rebalance(seed=2)
+
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region,
+                          {0: 192, 1: 192, 2: 192, 3: 192})
+
+    def test_region_fullness_with_unbalanceable_ring(self):
+        rb = ring.RingBuilder(8, 3, 1)
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 2,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 2,
+                    'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
+
+        rb.add_dev({'id': 2, 'region': 1, 'zone': 0, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'})
+        rb.add_dev({'id': 3, 'region': 1, 'zone': 1, 'weight': 1,
+                    'ip': '127.0.0.1', 'port': 10004, 'device': 'sda1'})
+        rb.rebalance(seed=2)
+
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 512, 1: 256})
+
+    def test_adding_region_slowly_with_unbalanceable_ring(self):
+        rb = ring.RingBuilder(8, 3, 1)
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 2,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 2,
+                    'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
+        rb.rebalance()
+
+        rb.add_dev({'id': 2, 'region': 1, 'zone': 0, 'weight': 0.25,
+                    'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'})
+        rb.add_dev({'id': 3, 'region': 1, 'zone': 1, 'weight': 0.25,
+                    'ip': '127.0.0.1', 'port': 10004, 'device': 'sda1'})
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+
+        # there's not enough room in r1 for every partition to have a replica
+        # in it, so only 86 assignments occur in r1 (that's ~1/9 of the total,
+        # since r1 has 1/9 of the weight).
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 682, 1: 86})
+
+        # and since there's not enough room, subsequent rebalances will not
+        # cause additional assignments to r1
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 682, 1: 86})
+
+        # after you add more weight, more partition assignments move
+        rb.set_dev_weight(2, 0.5)
+        rb.set_dev_weight(3, 0.5)
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 614, 1: 154})
+
+        rb.set_dev_weight(2, 1.0)
+        rb.set_dev_weight(3, 1.0)
+        rb.pretend_min_part_hours_passed()
+        rb.rebalance(seed=2)
+        population_by_region = self._get_population_by_region(rb)
+        self.assertEquals(population_by_region, {0: 512, 1: 256})
+
    def test_set_replicas_increase(self):
        rb = ring.RingBuilder(8, 2, 0)
        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1,
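As a sanity check on the asserted populations in test_adding_region_slowly_with_unbalanceable_ring (not part of the tests), the expected split follows directly from the weights: part_power 8 and 3 replicas give 3 * 2^8 = 768 assignments, apportioned by each region's share of the total weight.

    # Region 0 has weight 4.0 throughout; region 1 grows from 0.5 to 2.0.
    total_assignments = 3 * 2 ** 8  # 768

    for r0_weight, r1_weight in [(4.0, 0.5), (4.0, 1.0), (4.0, 2.0)]:
        share = r1_weight / (r0_weight + r1_weight)
        print(r1_weight, round(total_assignments * share))
    # Prints roughly 85, 154, and 256 -- matching the asserted 86, 154, and
    # 256 (the off-by-one comes from math.ceil rounding parts_wanted up
    # per device).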
@@ -703,7 +703,10 @@ class TestRing(TestRingBase):
            for region in xrange(1, 5):
                rb.add_dev({'id': next_dev_id,
                            'ip': '1.%d.1.%d' % (region, server), 'port': 1234,
-                            'zone': 1, 'region': region, 'weight': 1.0})
+                            # 108.0 is the weight of all devices created prior
+                            # to this test in region 0; this way all regions
+                            # have equal combined weight
+                            'zone': 1, 'region': region, 'weight': 108.0})
                next_dev_id += 1
        rb.pretend_min_part_hours_passed()
        rb.rebalance(seed=1)