Optimize the ring builder's _reassign_parts() method.

Another ring builder optimization.  Profiling revealed hotspots in many
calls to min() and list.sort() in _reassign_parts().  That method didn't
get exercised in my last optimization pass because that pass targeted a
rebalance where nothing really moved around.

This time, I wrote a script which created a fresh ring, added a bunch of
devices, did the initial balance, deleted some devices, balanced, and
added some more back in.

Results from homebrew Python 2.7.3 on OS X 10.8.2 Macbook Pro
(bare-metal):

 BEFORE:
 Using part-power = 18, adding 600 devices, removing 100, then adding 300 more...
 NOT Profiling to 'initial_balance.prof'
   wall-time delta: 131.33s
 NOT Profiling to 'deleting_200_rebalance.prof'
   wall-time delta: 25.67s
 NOT Profiling to 'first_rebalance.prof'
   wall-time delta: 62.00s

 AFTER:
 Using part-power = 18, adding 600 devices, removing 100, then adding 300 more...
 NOT Profiling to 'initial_balance.prof'
   wall-time delta: 28.04s
 NOT Profiling to 'deleting_200_rebalance.prof'
   wall-time delta: 9.35s
 NOT Profiling to 'first_rebalance.prof'
   wall-time delta: 16.41s

The driver script I used is available here:
https://gist.github.com/adb982aec6f0709f1273

Change-Id: I17e270acb12b5e4d4bbb1e34d8867dea90678961
This commit is contained in:
Darrell Bishop 2013-01-15 08:32:07 -08:00
parent 62e71a2b1f
commit ec084de189
1 changed files with 84 additions and 36 deletions

View File

@ -641,65 +641,97 @@ class RingBuilder(object):
sorted((d for d in self._iter_devs() if d['weight']),
key=lambda x: x['sort_key'])
tier2children = build_tier_tree(available_devs)
tier2devs = defaultdict(list)
tier2sort_key = defaultdict(list)
tiers_by_depth = defaultdict(set)
max_tier_depth = 0
for dev in available_devs:
for tier in tiers_for_dev(dev):
tier2devs[tier].append(dev) # <-- starts out sorted!
tier2sort_key[tier].append(dev['sort_key'])
tiers_by_depth[len(tier)].add(tier)
if len(tier) > max_tier_depth:
max_tier_depth = len(tier)
tier2children_sets = build_tier_tree(available_devs)
tier2children = defaultdict(list)
tier2children_sort_key = {}
tiers_list = [()]
depth = 1
while depth <= max_tier_depth:
new_tiers_list = []
for tier in tiers_list:
child_tiers = list(tier2children_sets[tier])
child_tiers.sort(key=lambda t: tier2sort_key[t][-1])
tier2children[tier] = child_tiers
tier2children_sort_key[tier] = map(
lambda t: tier2sort_key[t][-1], child_tiers)
new_tiers_list.extend(child_tiers)
tiers_list = new_tiers_list
depth += 1
for part, replace_replicas in reassign_parts:
# Gather up what other tiers (zones, ip_ports, and devices) the
# replicas not-to-be-moved are in for this part.
other_replicas = defaultdict(lambda: 0)
other_replicas = defaultdict(int)
unique_tiers_by_tier_len = defaultdict(set)
for replica in xrange(self.replicas):
if replica not in replace_replicas:
dev = self.devs[self._replica2part2dev[replica][part]]
for tier in tiers_for_dev(dev):
other_replicas[tier] += 1
def find_home_for_replica(tier=(), depth=1):
# Order the tiers by how many replicas of this
# partition they already have. Then, of the ones
# with the smallest number of replicas, pick the
# tier with the hungriest drive and then continue
# searching in that subtree.
#
# There are other strategies we could use here,
# such as hungriest-tier (i.e. biggest
# sum-of-parts-wanted) or picking one at random.
# However, hungriest-drive is what was used here
# before, and it worked pretty well in practice.
#
# Note that this allocator will balance things as
# evenly as possible at each level of the device
# layout. If your layout is extremely unbalanced,
# this may produce poor results.
candidate_tiers = tier2children[tier]
min_count = min(other_replicas[t] for t in candidate_tiers)
candidate_tiers = [t for t in candidate_tiers
if other_replicas[t] == min_count]
candidate_tiers.sort(
key=lambda t: tier2sort_key[t][-1])
if depth == max(tiers_by_depth.keys()):
return tier2devs[candidate_tiers[-1]][-1]
return find_home_for_replica(tier=candidate_tiers[-1],
depth=depth + 1)
unique_tiers_by_tier_len[len(tier)].add(tier)
for replica in replace_replicas:
dev = find_home_for_replica()
tier = ()
depth = 1
while depth <= max_tier_depth:
# Order the tiers by how many replicas of this
# partition they already have. Then, of the ones
# with the smallest number of replicas, pick the
# tier with the hungriest drive and then continue
# searching in that subtree.
#
# There are other strategies we could use here,
# such as hungriest-tier (i.e. biggest
# sum-of-parts-wanted) or picking one at random.
# However, hungriest-drive is what was used here
# before, and it worked pretty well in practice.
#
# Note that this allocator will balance things as
# evenly as possible at each level of the device
# layout. If your layout is extremely unbalanced,
# this may produce poor results.
#
# This used to be a cute, recursive function, but it's been
# unrolled for performance.
candidate_tiers = tier2children[tier]
candidates_with_replicas = \
unique_tiers_by_tier_len[len(tier) + 1]
if len(candidate_tiers) > len(candidates_with_replicas):
# There exists at least one tier with 0 other replicas,
# so work backward among the candidates, accepting the
# first which isn't in other_replicas.
#
# This optimization is to avoid calling the min()
# below, which is expensive if you've got thousands of
# drives.
for t in reversed(candidate_tiers):
if other_replicas[t] == 0:
tier = t
break
else:
min_count = min(other_replicas[t]
for t in candidate_tiers)
tier = (t for t in reversed(candidate_tiers)
if other_replicas[t] == min_count).next()
depth += 1
dev = tier2devs[tier][-1]
dev['parts_wanted'] -= 1
dev['parts'] += 1
old_sort_key = dev['sort_key']
new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
for tier in tiers_for_dev(dev):
other_replicas[tier] += 1
unique_tiers_by_tier_len[len(tier)].add(tier)
index = bisect.bisect_left(tier2sort_key[tier],
old_sort_key)
@ -711,6 +743,22 @@ class RingBuilder(object):
tier2devs[tier].insert(new_index, dev)
tier2sort_key[tier].insert(new_index, new_sort_key)
# Now jiggle tier2children values to keep them sorted
new_last_sort_key = tier2sort_key[tier][-1]
parent_tier = tier[0:-1]
index = bisect.bisect_left(
tier2children_sort_key[parent_tier],
old_sort_key)
popped = tier2children[parent_tier].pop(index)
tier2children_sort_key[parent_tier].pop(index)
new_index = bisect.bisect_left(
tier2children_sort_key[parent_tier],
new_last_sort_key)
tier2children[parent_tier].insert(new_index, popped)
tier2children_sort_key[parent_tier].insert(
new_index, new_last_sort_key)
self._replica2part2dev[replica][part] = dev['id']
# Just to save memory and keep from accidental reuse.