From 7035639dfd239b52d4ed46aae50f78d16ec8cbfe Mon Sep 17 00:00:00 2001
From: Clay Gerrard
Date: Thu, 15 Oct 2015 16:20:58 -0700
Subject: [PATCH] Put part-replicas where they go

It's harder than it sounds.  There were really three challenges.

Challenge #1 Initial Assignment
===============================

Before starting to assign parts on this new shiny ring you've
constructed, maybe we'll pause for a moment up front and consider the
lay of the land.  This process is called the replica_plan.

The replica_plan approach separates part assignment failures into two
modes:

 1) we considered the cluster topology and its weights and came up with
    the wrong plan

 2) we failed to execute on the plan

I failed at both parts plenty of times before I got it this close.  I'm
sure a counter example still exists, but when we find it the new helper
methods will let us reason about where things went wrong.

Challenge #2 Fixing Placement
=============================

With a sound plan in hand, it's much easier to fail to execute on it the
less material you have to execute with - so we gather up as many parts
as we can - as long as we think we can find them a better home.

Picking the right parts to gather is a black art - when you notice a
rebalance is slow it's because it's spending so much time iterating over
replica2part2dev trying to decide just the right parts to gather.

The replica plan can help at least in the gross dispersion collection to
gather up the worst offenders first before considering balance.  I think
trying to avoid picking up parts that are stuck to the tier before
falling into a forced grab on anything over parts_wanted helps with
stability generally - but depending on where the parts_wanted are in
relation to the full devices it's pretty easy to pick up something that
will end up really close to where it started.

I tried to break the gather methods into smaller pieces so it looked
like I knew what I was doing.

Going with a MAXIMUM gather iteration instead of balance (which doesn't
reflect the replica_plan) doesn't seem to be costing me anything - most
of the time the exit condition is either solved or all the parts are
overly aggressively locked up on min_part_hours.  So far, it mostly
seems that if the thing is going to balance this round it'll get it in
the first couple of shakes.

Challenge #3 Crazy replica2part2dev tables
==========================================

I think there are lots of ways "scars" can build up in a ring which can
result in very particular replica2part2dev tables that are physically
difficult to dig out of.  It's repairing these scars that will take
multiple rebalances to resolve.

... but at this point ...

... lacking a counter example ...

I've been able to close up all the edge cases I was able to find.  It
may not be quick, but progress will be made.

Basically my strategy just required a better understanding of how
previous algorithms were able to *mostly* keep things moving by brute
forcing the whole mess with a bunch of randomness.  Then when we detect
our "elegant" careful part selection isn't making progress - we can fall
back to the same old tricks.

Validation
==========

We validate against duplicate part replica assignment after rebalance
and raise an ERROR if we detect more than one replica of a part assigned
to the same device.

In order to meet that requirement we have to have as many devices as
replicas, so attempting to rebalance with too few devices w/o changing
your replica_count is also an ERROR, not a warning.
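To make the plan concrete, here is a small illustrative sketch (not part
of the patch; the topology and numbers are made up) of the shape the
replica_plan takes - a map of tier tuple to the min/target/max
replicanths a rebalance will try to honor for every part:

    # Illustrative only: a 3-replica ring spread evenly across 2 regions.
    # Keys are tier tuples (region, zone, ip, dev_id) truncated to the
    # tier's depth; values bound how many replicas of any one part may
    # land inside that tier.
    replica_plan = {
        ():     {'min': 3, 'target': 3.0, 'max': 3},  # cluster
        (0,):   {'min': 1, 'target': 1.5, 'max': 2},  # region 0
        (1,):   {'min': 1, 'target': 1.5, 'max': 2},  # region 1
        (0, 0): {'min': 1, 'target': 1.5, 'max': 2},  # zone 0 in region 0
        # ... and so on down through servers and devices
    }
    # Failure mode 1: these targets don't actually reflect the cluster
    # topology and weights.  Failure mode 2: the targets are right, but
    # the assignment that comes out of rebalance violates them anyway.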
Random Thoughts =============== As usual with rings, the test diff can be hard to reason about - hopefully I've added enough comments to assure future me that these assertions make sense. Despite being a large rewrite of a lot of important code, the existing code is known to have failed us. This change fixes a critical bug that's trivial to reproduce in a critical component of the system. There's probably a bunch of error messages and exit status stuff that's not as helpful as it could be considering the new behaviors. Change-Id: I1bbe7be38806fc1c8b9181a722933c18a6c76e05 Closes-Bug: #1452431 --- swift/cli/ringbuilder.py | 21 +- swift/common/ring/builder.py | 1351 ++++++++++--------- test/unit/common/ring/test_builder.py | 1775 +++++++++++++++++++++++-- test/unit/common/ring/test_ring.py | 283 ++-- test/unit/common/ring/test_utils.py | 24 +- 5 files changed, 2542 insertions(+), 912 deletions(-) diff --git a/swift/cli/ringbuilder.py b/swift/cli/ringbuilder.py index 341d9983b0..25e608b3a1 100755 --- a/swift/cli/ringbuilder.py +++ b/swift/cli/ringbuilder.py @@ -448,29 +448,19 @@ swift-ring-builder print('The overload factor is %0.2f%% (%.6f)' % ( builder.overload * 100, builder.overload)) if builder.devs: + balance_per_dev = builder._build_balance_per_dev() print('Devices: id region zone ip address port ' 'replication ip replication port name ' 'weight partitions balance flags meta') - weighted_parts = builder.parts * builder.replicas / \ - sum(d['weight'] for d in builder.devs if d is not None) - for dev in builder.devs: - if dev is None: - continue - if not dev['weight']: - if dev['parts']: - balance = MAX_BALANCE - else: - balance = 0 - else: - balance = 100.0 * dev['parts'] / \ - (dev['weight'] * weighted_parts) - 100.0 + for dev in builder._iter_devs(): flags = 'DEL' if dev in builder._remove_devs else '' print(' %5d %7d %5d %15s %5d %15s %17d %9s %6.02f ' '%10s %7.02f %5s %s' % (dev['id'], dev['region'], dev['zone'], dev['ip'], dev['port'], dev['replication_ip'], dev['replication_port'], dev['device'], dev['weight'], - dev['parts'], balance, flags, dev['meta'])) + dev['parts'], balance_per_dev[dev['id']], flags, + dev['meta'])) exit(EXIT_SUCCESS) def search(): @@ -924,6 +914,8 @@ swift-ring-builder dispersion [options] verbose=options.verbose) print('Dispersion is %.06f, Balance is %.06f, Overload is %0.2f%%' % ( builder.dispersion, builder.get_balance(), builder.overload * 100)) + print('Required overload is %.6f%%' % ( + builder.get_required_overload() * 100)) if report['worst_tier']: status = EXIT_WARNING print('Worst tier is %.06f (%s)' % (report['max_dispersion'], @@ -1034,7 +1026,6 @@ swift-ring-builder write_builder [min_part_hours] for parts in builder._replica2part2dev: for dev_id in parts: builder.devs[dev_id]['parts'] += 1 - builder._set_parts_wanted() builder.save(builder_file) def pretend_min_part_hours_passed(): diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py index 830a381946..7629bbb900 100644 --- a/swift/common/ring/builder.py +++ b/swift/common/ring/builder.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import bisect import copy import errno import itertools @@ -23,7 +22,6 @@ import random import six.moves.cPickle as pickle from copy import deepcopy from contextlib import contextmanager -import warnings from array import array from collections import defaultdict @@ -36,7 +34,12 @@ from swift.common.ring import RingData from swift.common.ring.utils import tiers_for_dev, build_tier_tree, \ validate_and_normalize_address +# we can't store None's in the replica2part2dev array, so we high-jack +# the max value for magic to represent the part is not currently +# assigned to any device. +NONE_DEV = 2 ** 16 - 1 MAX_BALANCE = 999.99 +MAX_BALANCE_GATHER_COUNT = 3 class RingValidationWarning(Warning): @@ -88,7 +91,6 @@ class RingBuilder(object): self.devs_changed = False self.version = 0 self.overload = 0.0 - self._effective_overload = None # _replica2part2dev maps from replica number to partition number to # device id. So, for a three replica, 2**23 ring, it's an array of @@ -99,16 +101,16 @@ class RingBuilder(object): # a while ago, code-wise, when I last tried it). self._replica2part2dev = None - # _last_part_moves is a 2**23 array of unsigned bytes representing the - # number of hours since a given partition was last moved. This is used - # to guarantee we don't move a partition twice within a given number of - # hours (24 is my usual test). Removing a device or setting its weight - # to 0 overrides this behavior as it's assumed those actions are done - # because of device failure. + # _last_part_moves is an array of unsigned bytes representing + # the number of hours since a given partition was last moved. + # This is used to guarantee we don't move a partition twice + # within a given number of hours (24 is my usual test). Removing + # a device overrides this behavior as it's assumed that's only + # done because of device failure. + self._last_part_moves = None # _last_part_moves_epoch indicates the time the offsets in # _last_part_moves is based on. 
- self._last_part_moves_epoch = None - self._last_part_moves = None + self._last_part_moves_epoch = 0 self._last_part_gather_start = 0 @@ -204,6 +206,9 @@ class RingBuilder(object): for dev in self._iter_devs(): dev.setdefault("region", 1) + if not self._last_part_moves_epoch: + self._last_part_moves_epoch = 0 + def __deepcopy__(self, memo): return type(self).from_dict(deepcopy(self.to_dict(), memo)) @@ -341,7 +346,6 @@ class RingBuilder(object): dev['weight'] = float(dev['weight']) dev['parts'] = 0 self.devs[dev['id']] = dev - self._set_parts_wanted() self.devs_changed = True self.version += 1 return dev['id'] @@ -359,8 +363,10 @@ class RingBuilder(object): :param dev_id: device id :param weight: new weight for device """ + if any(dev_id == d['id'] for d in self._remove_devs): + raise ValueError("Can not set weight of dev_id %s because it " + "is marked for removal" % (dev_id,)) self.devs[dev_id]['weight'] = weight - self._set_parts_wanted() self.devs_changed = True self.version += 1 @@ -377,7 +383,6 @@ class RingBuilder(object): dev = self.devs[dev_id] dev['weight'] = 0 self._remove_devs.append(dev) - self._set_parts_wanted() self.devs_changed = True self.version += 1 @@ -399,68 +404,81 @@ class RingBuilder(object): :returns: (number_of_partitions_altered, resulting_balance, number_of_removed_devices) """ - num_devices = len([d for d in self._iter_devs() if d['weight'] > 0]) - removed_devs = 0 + # count up the devs, and cache some stuff + num_devices = 0 + for dev in self._iter_devs(): + dev['tiers'] = tiers_for_dev(dev) + if dev['weight'] > 0: + num_devices += 1 if num_devices < self.replicas: - warnings.warn(RingValidationWarning( + raise exceptions.RingValidationError( "Replica count of %(replicas)s requires more " "than %(num_devices)s devices" % { 'replicas': self.replicas, 'num_devices': num_devices, - })) - old_replica2part2dev = copy.deepcopy(self._replica2part2dev) + }) if seed is not None: random.seed(seed) - self._effective_overload = self.overload - if self.overload and self.dispersion <= 0: - # iff we're fully dispersed we want to bring in overload - self._effective_overload = min(self.overload, - self.get_required_overload()) - self.logger.debug("Using effective overload of %f", - self._effective_overload) - self._ring = None - if self._last_part_moves_epoch is None: + + old_replica2part2dev = copy.deepcopy(self._replica2part2dev) + + if self._last_part_moves is None: self.logger.debug("New builder; performing initial balance") - self._initial_balance() - self.devs_changed = False - self._build_dispersion_graph() - return self.parts, self.get_balance(), removed_devs - changed_parts = 0 + self._last_part_moves = array('B', itertools.repeat(0, self.parts)) self._update_last_part_moves() - last_balance = 0 - new_parts, removed_part_count = self._adjust_replica2part2dev_size() - self.logger.debug( - "%d new parts and %d removed parts from replica-count change", - len(new_parts), removed_part_count) - changed_parts += removed_part_count - self._set_parts_wanted() - self._reassign_parts(new_parts) - changed_parts += len(new_parts) - while True: - reassign_parts = self._gather_reassign_parts() - changed_parts += len(reassign_parts) - self.logger.debug("Gathered %d parts thus far (%d this pass)", - changed_parts, len(reassign_parts)) - self._reassign_parts(reassign_parts) - self.logger.debug("Assigned %d parts", changed_parts) - while self._remove_devs: - remove_dev_id = self._remove_devs.pop()['id'] - self.logger.debug("Removing dev %d", remove_dev_id) - self.devs[remove_dev_id] = 
None - removed_devs += 1 - balance = self.get_balance() - if balance < 1 or abs(last_balance - balance) < 1 or \ - changed_parts == self.parts: + + replica_plan = self._build_replica_plan() + self._set_parts_wanted(replica_plan) + + assign_parts = defaultdict(list) + # gather parts from failed devices + removed_devs = self._gather_parts_from_failed_devices(assign_parts) + # gather parts from replica count adjustment + self._adjust_replica2part2dev_size(assign_parts) + # gather parts for dispersion (N.B. this only picks up parts that + # *must* disperse according to the replica plan) + self._gather_parts_for_dispersion(assign_parts, replica_plan) + + # we'll gather a few times, or until we archive the plan + for gather_count in range(MAX_BALANCE_GATHER_COUNT): + self._gather_parts_for_balance(assign_parts, replica_plan) + if not assign_parts: + # most likely min part hours + finish_status = 'Unable to finish' break - last_balance = balance + assign_parts_list = list(assign_parts.items()) + # shuffle the parts to be reassigned, we have no preference on the + # order in which the replica plan is fulfilled. + random.shuffle(assign_parts_list) + # reset assign_parts map for next iteration + assign_parts = defaultdict(list) + + num_part_replicas = sum(len(r) for p, r in assign_parts_list) + self.logger.debug("Gathered %d parts", num_part_replicas) + self._reassign_parts(assign_parts_list, replica_plan) + self.logger.debug("Assigned %d parts", num_part_replicas) + + if not sum(d['parts_wanted'] < 0 for d in + self._iter_devs()): + finish_status = 'Finished' + break + else: + finish_status = 'Unable to finish' + self.logger.debug('%s rebalance plan after %s attempts' % ( + finish_status, gather_count + 1)) + self.devs_changed = False self.version += 1 - changed_parts = self._build_dispersion_graph(old_replica2part2dev) - return changed_parts, balance, removed_devs + + # clean up the cache + for dev in self._iter_devs(): + dev.pop('tiers', None) + + return changed_parts, self.get_balance(), removed_devs def _build_dispersion_graph(self, old_replica2part2dev=None): """ @@ -500,8 +518,6 @@ class RingBuilder(object): max_allowed_replicas = self._build_max_replicas_by_tier() parts_at_risk = 0 - tfd = {} - dispersion_graph = {} # go over all the devices holding each replica part by part for part_id, dev_ids in enumerate( @@ -511,9 +527,7 @@ class RingBuilder(object): replicas_at_tier = defaultdict(int) for rep_id, dev in enumerate(iter( self.devs[dev_id] for dev_id in dev_ids)): - if dev['id'] not in tfd: - tfd[dev['id']] = tiers_for_dev(dev) - for tier in tfd[dev['id']]: + for tier in (dev.get('tiers') or tiers_for_dev(dev)): replicas_at_tier[tier] += 1 # IndexErrors will be raised if the replicas are increased or # decreased, and that actually means the partition has changed @@ -616,10 +630,10 @@ class RingBuilder(object): (part, replica)) devs_for_part.append(dev_id) if len(devs_for_part) != len(set(devs_for_part)): - warnings.warn(RingValidationWarning( + raise exceptions.RingValidationError( "The partition %s has been assigned to " "duplicate devices %r" % ( - part, devs_for_part))) + part, devs_for_part)) if stats: weight_of_one_part = self.weight_of_one_part() @@ -641,6 +655,32 @@ class RingBuilder(object): return dev_usage, worst return None, None + def _build_balance_per_dev(self): + """ + Build a map of => where is a float + representing the percentage difference from the desired amount of + partitions a given device wants and the amount it has. + + N.B. 
this method only considers a device's weight and the parts + assigned, not the parts wanted according to the replica plan. + """ + weight_of_one_part = self.weight_of_one_part() + balance_per_dev = {} + for dev in self._iter_devs(): + if not dev['weight']: + if dev['parts']: + # If a device has no weight, but has partitions, then its + # overage is considered "infinity" and therefore always the + # worst possible. We show MAX_BALANCE for convenience. + balance = MAX_BALANCE + else: + balance = 0 + else: + balance = 100.0 * dev['parts'] / ( + dev['weight'] * weight_of_one_part) - 100.0 + balance_per_dev[dev['id']] = balance + return balance_per_dev + def get_balance(self): """ Get the balance of the ring. The balance value is the highest @@ -652,167 +692,39 @@ class RingBuilder(object): :returns: balance of the ring """ - balance = 0 - weight_of_one_part = self.weight_of_one_part() - for dev in self._iter_devs(): - if not dev['weight']: - if dev['parts']: - # If a device has no weight, but has partitions, then its - # overage is considered "infinity" and therefore always the - # worst possible. We show MAX_BALANCE for convenience. - balance = MAX_BALANCE - break - continue - dev_balance = abs(100.0 * dev['parts'] / - (dev['weight'] * weight_of_one_part) - 100.0) - if dev_balance > balance: - balance = dev_balance - return balance + balance_per_dev = self._build_balance_per_dev() + return max(abs(b) for b in balance_per_dev.values()) - def get_required_overload(self): + def get_required_overload(self, weighted=None, wanted=None): """ Returns the minimum overload value required to make the ring maximally dispersed. + + The required overload is the largest percentage change of any single + device from its weighted replicanth to its wanted replicanth (note + under weighted devices have a negative percentage change) to archive + dispersion - that is to say a single device that must be overloaded by + 5% is worse than 5 devices in a single tier overloaded by 1%. """ - self.logger.debug("computing required overload") - tfd, sibling_tiers = self._compute_sibling_tiers() - max_allowed_replicas = self._build_max_replicas_by_tier() - - # We're computing a bunch of different things here, but iterating - # over all the devs once is more efficient than doing it a bunch of - # times. - all_tiers = set([()]) - tier_weight = defaultdict(float) - total_weight = 0.0 - tier2children = defaultdict(set) + weighted = weighted or self._build_weighted_replicas_by_tier() + wanted = wanted or self._build_wanted_replicas_by_tier() + max_overload = 0.0 for dev in self._iter_devs(): - dev_weight = dev['weight'] - total_weight += dev_weight - for tier in tfd[dev['id']]: - all_tiers.add(tier) - tier_weight[tier] += dev_weight - tier2children[tier[:-1]].add(tier) - tier_weight[()] = total_weight - - max_required_overload = 0.0 - for tier in all_tiers: - if tier not in tier2children: - continue - if tier_weight[tier] <= 0: - continue - # Example 1: Consider a 3-replica cluster with 2 regions. If one - # region has more than 2/3 the total weight, then (ignoring - # overload) some partitions will reside entirely in the big - # region. - # - # Example 2: Consider a 3-replica cluster with 3 regions. If any - # region has more than 1/3 the total weight, some partitions will - # not have replicas spread across all regions. - # - # Example 3: Consider a 3-replica cluster with 4 regions. If any - # region has more than 1/3 the total weight, some partitions will - # not have replicas spread across all regions. 
- # - # Example 4: Consider a 3-replica cluster with 100 regions. If - # any region has more than 1/3 the total weight, some partitions - # will not have replicas spread across all regions. The fact - # that there's 100 regions doesn't matter; if one region is big - # enough, it'll get multiple replicas of some partitions. - # - # Example 5: Consider a 5-replica cluster with 2 regions. If the - # bigger region has more than 3/5 the weight, some partitions - # will have more than 3 replicas in the big region. (Optimal - # dispersion is 3 replicas in some region and 2 in the other; 4 - # and 1 is not good enough.) - # - # In general, what we do is split this tier's child tiers - # into two groups: "big" and "small". "Big" child tiers are - # ones whose weight exceeds their fraction of the replicas. - # For example, given 3 replicas and 4 zones of total weight - # 12,000, a zone with weight greater than 1/3 of 12,000 (= - # 4,000) would be considered big. "Small" child tiers are - # those which are not big. - # - # Once we've divided the child tiers into big and small, we - # figure out how many replicas should wind up on the small - # child tiers (all together), and then compute the needed - # overload factor to boost their weights so they can take - # that many replicas. - child_tiers = tier2children[tier] - tier_replicas = max_allowed_replicas[tier] - big_child_count = small_child_count = 0 - big_child_weight = small_child_weight = 0.0 - - max_child_replicas = math.ceil(tier_replicas / len(child_tiers)) - bigness_threshold = ( - max_child_replicas / tier_replicas * tier_weight[tier]) - - for child_tier in child_tiers: - child_weight = tier_weight[child_tier] - if child_weight == 0: - # If it's got 0 weight, it's not taking any - # partitions at all, so it doesn't count. + tier = (dev['region'], dev['zone'], dev['ip'], dev['id']) + if not dev['weight']: + if tier not in wanted or not wanted[tier]: continue - if child_weight >= bigness_threshold: - big_child_count += 1 - big_child_weight += child_weight - else: - small_child_count += 1 - small_child_weight += child_weight - - if big_child_count == 0 or small_child_count == 0: - # We only need overload if we have both big and small - # tiers. Usually, all small tiers means things can - # balance, while all big tiers means that we have - # exactly one child tier (e.g. a cluster with only one - # region). - continue - - # We assume each big child tier takes the maximum possible - # number of replicas for optimal dispersion, but no more. - # That leaves the remainder for the small child tiers. - big_child_replicas = max_child_replicas * big_child_count - small_child_replicas = tier_replicas - big_child_replicas - - if small_child_replicas == 0: - # If we're not putting any replicas on small child - # tiers, then there's no need for overload. This also - # avoids a division-by-zero below. - continue - - # We want the overloaded small tiers to take up their fair - # share of the replicas. We can express this as follows: - # - # Let Ws be the sum of the weights of the small child tiers. - # - # Let Wb be the sum of the weights of the big child tiers. - # - # Let Rt be the number of replicas at the current tier. - # - # Let Rs be the desired number of replicas for the small - # child tiers. - # - # Let L be the overload. 
- # - # Then, we have the following: - # - # (L * Ws) / (Wb + L * Ws) = Rs / Rt - # - # Solving for L, we get: - # - # L = 1 / (Ws / Wb * (Rt / Rs - 1)) - required_overload = 1.0 / ( - (small_child_weight / big_child_weight) - * (tier_replicas / small_child_replicas - 1)) - 1 - - if required_overload > max_required_overload: - self.logger.debug("Required overload for %r is %f [NEW HIGH]", - tier, required_overload) - max_required_overload = required_overload - else: - self.logger.debug("Required overload for %r is %f", - tier, required_overload) - return max_required_overload + raise exceptions.RingValidationError( + 'Device %s has zero weight and ' + 'should not want any replicas' % (tier,)) + required = (wanted[tier] - weighted[tier]) / weighted[tier] + self.logger.debug('%s wants %s and is weighted for %s so ' + 'therefore requires %s overload' % ( + tier, wanted[tier], weighted[tier], + required)) + if required > max_overload: + max_overload = required + return max_overload def pretend_min_part_hours_passed(self): """ @@ -848,7 +760,13 @@ class RingBuilder(object): if dev is not None: yield dev - def _set_parts_wanted(self): + def _build_tier2children(self): + """ + Wrap helper build_tier_tree so exclude zero-weight devices. + """ + return build_tier_tree(d for d in self._iter_devs() if d['weight']) + + def _set_parts_wanted(self, replica_plan): """ Sets the parts_wanted key for each of the devices to the number of partitions the device wants based on its relative weight. This key is @@ -856,9 +774,49 @@ class RingBuilder(object): to best distribute partitions. A negative parts_wanted indicates the device is "overweight" and wishes to give partitions away if possible. - Note: parts_wanted does *not* consider overload. + :param replica_plan: a dict of dicts, as returned from + _build_replica_plan, that that maps + each tier to it's target replicanths. """ - weight_of_one_part = self.weight_of_one_part() + tier2children = self._build_tier2children() + + parts_by_tier = defaultdict(int) + + def place_parts(tier, parts): + parts_by_tier[tier] = parts + sub_tiers = sorted(tier2children[tier]) + if not sub_tiers: + return + to_place = defaultdict(int) + for t in sub_tiers: + to_place[t] = int(math.floor( + replica_plan[t]['target'] * self.parts)) + parts -= to_place[t] + + # if there's some parts left over, just throw 'em about + sub_tier_gen = itertools.cycle(sorted( + sub_tiers, key=lambda t: replica_plan[t]['target'])) + while parts: + t = next(sub_tier_gen) + to_place[t] += 1 + parts -= 1 + + for t, p in to_place.items(): + place_parts(t, p) + + total_parts = int(self.replicas * self.parts) + place_parts((), total_parts) + + # belts & suspenders/paranoia - at every level, the sum of + # parts_by_tier should be total_parts for the ring + tiers = ['cluster', 'regions', 'zones', 'servers', 'devices'] + for i, tier_name in enumerate(tiers): + parts_at_tier = sum(parts_by_tier[t] for t in parts_by_tier + if len(t) == i) + if parts_at_tier != total_parts: + raise exceptions.RingValidationError( + '%s != %s at tier %s' % ( + parts_at_tier, total_parts, tier_name)) for dev in self._iter_devs(): if not dev['weight']: @@ -867,97 +825,8 @@ class RingBuilder(object): # indicate its strong desire to give up everything it has. dev['parts_wanted'] = -self.parts * self.replicas else: - dev['parts_wanted'] = ( - # Round up here so that every partition ultimately ends up - # with a placement. - # - # Imagine 5 partitions to be placed on 4 devices. 
If we - # didn't use math.ceil() here, each device would have a - # parts_wanted of 1, so 4 partitions would be placed but - # the last would not, probably resulting in a crash. This - # way, some devices end up with leftover parts_wanted, but - # at least every partition ends up somewhere. - int(math.ceil(weight_of_one_part * dev['weight'] - - dev['parts']))) - - def _adjust_replica2part2dev_size(self): - """ - Make sure that the lengths of the arrays in _replica2part2dev - are correct for the current value of self.replicas. - - Example: - self.part_power = 8 - self.replicas = 2.25 - - self._replica2part2dev will contain 3 arrays: the first 2 of - length 256 (2**8), and the last of length 64 (0.25 * 2**8). - - Returns a 2-tuple: the first element is a list of (partition, - replicas) tuples indicating which replicas need to be - (re)assigned to devices, and the second element is a count of - how many replicas were removed. - """ - removed_replicas = 0 - - fractional_replicas, whole_replicas = math.modf(self.replicas) - whole_replicas = int(whole_replicas) - - desired_lengths = [self.parts] * whole_replicas - if fractional_replicas: - desired_lengths.append(int(self.parts * fractional_replicas)) - - to_assign = defaultdict(list) - - if self._replica2part2dev is not None: - # If we crossed an integer threshold (say, 4.1 --> 4), - # we'll have a partial extra replica clinging on here. Clean - # up any such extra stuff. - for part2dev in self._replica2part2dev[len(desired_lengths):]: - for dev_id in part2dev: - dev_losing_part = self.devs[dev_id] - dev_losing_part['parts'] -= 1 - removed_replicas += 1 - self._replica2part2dev = \ - self._replica2part2dev[:len(desired_lengths)] - else: - self._replica2part2dev = [] - - for replica, desired_length in enumerate(desired_lengths): - if replica < len(self._replica2part2dev): - part2dev = self._replica2part2dev[replica] - if len(part2dev) < desired_length: - # Not long enough: needs to be extended and the - # newly-added pieces assigned to devices. - for part in range(len(part2dev), desired_length): - to_assign[part].append(replica) - part2dev.append(0) - elif len(part2dev) > desired_length: - # Too long: truncate this mapping. - for part in range(desired_length, len(part2dev)): - dev_losing_part = self.devs[part2dev[part]] - dev_losing_part['parts'] -= 1 - removed_replicas += 1 - self._replica2part2dev[replica] = part2dev[:desired_length] - else: - # Mapping not present at all: make one up and assign - # all of it. - for part in range(desired_length): - to_assign[part].append(replica) - self._replica2part2dev.append( - array('H', (0 for _junk in range(desired_length)))) - - return (to_assign.items(), removed_replicas) - - def _initial_balance(self): - """ - Initial partition assignment is the same as rebalancing an - existing ring, but with some initial setup beforehand. - """ - self._last_part_moves = array('B', (0 for _junk in range(self.parts))) - self._last_part_moves_epoch = int(time()) - self._set_parts_wanted() - - self._reassign_parts(self._adjust_replica2part2dev_size()[0]) + tier = (dev['region'], dev['zone'], dev['ip'], dev['id']) + dev['parts_wanted'] = parts_by_tier[tier] - dev['parts'] def _update_last_part_moves(self): """ @@ -976,231 +845,302 @@ class RingBuilder(object): self._last_part_moves[part] = 0xff self._last_part_moves_epoch = int(time()) - def _get_available_parts(self): + def _gather_parts_from_failed_devices(self, assign_parts): """ - Returns a dict of (tier: available parts in other tiers) for all tiers - in the ring. 
- - Devices that have too many partitions (negative parts_wanted plus - overload) are ignored, otherwise the sum of all returned values is 0 - +/- rounding errors. - - This takes overload into account. + Update the map of partition => [replicas] to be reassigned from + removed devices. """ - wanted_parts_for_tier = {} - for dev in self._iter_devs(): - extra_overload_parts = self._n_overload_parts(dev) - pw = max(dev['parts_wanted'] + extra_overload_parts, 0) - for tier in tiers_for_dev(dev): - wanted_parts_for_tier.setdefault(tier, 0) - wanted_parts_for_tier[tier] += pw - return wanted_parts_for_tier - - def _compute_sibling_tiers(self): - """ - Returns a 2-tuple; the first value is a dictionary mapping each - device's id to its tiers, and the second is a dictionary mapping - a-tier: list-of-sibling-tiers. - """ - # inline memoization of tiers_for_dev() results (profiling reveals it - # as a hot-spot). We also return it so callers don't have to - # rebuild it. - tfd = {} - - tiers_by_len = defaultdict(set) - for dev in self._iter_devs(): - tiers = tiers_for_dev(dev) - tfd[dev['id']] = tiers - for tier in tiers: - tiers_by_len[len(tier)].add(tier) - - tiers_by_len = dict((length, list(tiers)) - for length, tiers in tiers_by_len.items()) - - sibling_tiers = {} - for length, tiers in tiers_by_len.items(): - for i, tier in enumerate(tiers): - sibling_tiers[tier] = [t for t in (tiers[:i] + tiers[(i + 1):]) - if t[:-1] == tier[:-1]] - return (tfd, sibling_tiers) - - def _gather_reassign_parts(self): - """ - Returns a list of (partition, replicas) pairs to be reassigned by - gathering from removed devices, insufficiently-far-apart replicas, and - overweight drives. - """ - tfd, sibling_tiers = self._compute_sibling_tiers() - # First we gather partitions from removed devices. Since removed # devices usually indicate device failures, we have no choice but to # reassign these partitions. However, we mark them as moved so later # choices will skip other replicas of the same partition if possible. - removed_dev_parts = defaultdict(list) + if self._remove_devs: dev_ids = [d['id'] for d in self._remove_devs if d['parts']] if dev_ids: for part, replica in self._each_part_replica(): dev_id = self._replica2part2dev[replica][part] if dev_id in dev_ids: + self._replica2part2dev[replica][part] = NONE_DEV self._last_part_moves[part] = 0 - removed_dev_parts[part].append(replica) + assign_parts[part].append(replica) self.logger.debug( "Gathered %d/%d from dev %d [dev removed]", part, replica, dev_id) + removed_devs = 0 + while self._remove_devs: + remove_dev_id = self._remove_devs.pop()['id'] + self.logger.debug("Removing dev %d", remove_dev_id) + self.devs[remove_dev_id] = None + removed_devs += 1 + return removed_devs + def _adjust_replica2part2dev_size(self, to_assign): + """ + Make sure that the lengths of the arrays in _replica2part2dev + are correct for the current value of self.replicas. + + Example: + self.part_power = 8 + self.replicas = 2.25 + + self._replica2part2dev will contain 3 arrays: the first 2 of + length 256 (2**8), and the last of length 64 (0.25 * 2**8). + + Update the mapping of partition => [replicas] that need assignment. 
+ """ + fractional_replicas, whole_replicas = math.modf(self.replicas) + whole_replicas = int(whole_replicas) + removed_parts = 0 + new_parts = 0 + + desired_lengths = [self.parts] * whole_replicas + if fractional_replicas: + desired_lengths.append(int(self.parts * fractional_replicas)) + + if self._replica2part2dev is not None: + # If we crossed an integer threshold (say, 4.1 --> 4), + # we'll have a partial extra replica clinging on here. Clean + # up any such extra stuff. + for part2dev in self._replica2part2dev[len(desired_lengths):]: + for dev_id in part2dev: + dev_losing_part = self.devs[dev_id] + dev_losing_part['parts'] -= 1 + removed_parts -= 1 + self._replica2part2dev = \ + self._replica2part2dev[:len(desired_lengths)] + else: + self._replica2part2dev = [] + + for replica, desired_length in enumerate(desired_lengths): + if replica < len(self._replica2part2dev): + part2dev = self._replica2part2dev[replica] + if len(part2dev) < desired_length: + # Not long enough: needs to be extended and the + # newly-added pieces assigned to devices. + for part in range(len(part2dev), desired_length): + to_assign[part].append(replica) + part2dev.append(NONE_DEV) + new_parts += 1 + elif len(part2dev) > desired_length: + # Too long: truncate this mapping. + for part in range(desired_length, len(part2dev)): + dev_losing_part = self.devs[part2dev[part]] + dev_losing_part['parts'] -= 1 + removed_parts -= 1 + self._replica2part2dev[replica] = part2dev[:desired_length] + else: + # Mapping not present at all: make one up and assign + # all of it. + for part in range(desired_length): + to_assign[part].append(replica) + new_parts += 1 + self._replica2part2dev.append( + array('H', itertools.repeat(NONE_DEV, desired_length))) + + self.logger.debug( + "%d new parts and %d removed parts from replica-count change", + new_parts, removed_parts) + + def _gather_parts_for_dispersion(self, assign_parts, replica_plan): + """ + Update the map of partition => [replicas] to be reassigned from + insufficiently-far-apart replicas. + """ # Now we gather partitions that are "at risk" because they aren't # currently sufficient spread out across the cluster. - spread_out_parts = defaultdict(list) - max_allowed_replicas = self._build_max_replicas_by_tier() - wanted_parts_for_tier = self._get_available_parts() - moved_parts = 0 for part in range(self.parts): - # Only move one replica at a time if possible. - if part in removed_dev_parts: + if self._last_part_moves[part] < self.min_part_hours: continue - # First, add up the count of replicas at each tier for each # partition. - # replicas_at_tier was a "lambda: 0" defaultdict, but profiling - # revealed the lambda invocation as a significant cost. - replicas_at_tier = {} + replicas_at_tier = defaultdict(int) for dev in self._devs_for_part(part): - for tier in tfd[dev['id']]: - if tier not in replicas_at_tier: - replicas_at_tier[tier] = 1 - else: - replicas_at_tier[tier] += 1 + for tier in dev['tiers']: + replicas_at_tier[tier] += 1 - # Now, look for partitions not yet spread out enough and not - # recently moved. + # Now, look for partitions not yet spread out enough. 
+ undispersed_dev_replicas = [] for replica in self._replicas_for_part(part): - dev = self.devs[self._replica2part2dev[replica][part]] - removed_replica = False - for tier in tfd[dev['id']]: - rep_at_tier = replicas_at_tier.get(tier, 0) + dev_id = self._replica2part2dev[replica][part] + if dev_id == NONE_DEV: + continue + dev = self.devs[dev_id] + # the min part hour check is ignored iff a device has more + # than one replica of a part assigned to it - which would have + # only been possible on rings built with older version of code + if (self._last_part_moves[part] < self.min_part_hours and + not replicas_at_tier[dev['tiers'][-1]] > 1): + break + if all(replicas_at_tier[tier] <= + replica_plan[tier]['max'] + for tier in dev['tiers']): + continue + undispersed_dev_replicas.append((dev, replica)) - # If this tier's not overcrowded, there's nothing to - # gather, so we can avoid some calculation here as an - # optimization. - if rep_at_tier <= max_allowed_replicas[tier]: - continue + if not undispersed_dev_replicas: + continue - available_parts_for_tier = sum( - wanted_parts_for_tier[t] - for t in sibling_tiers[tier] - # If a sibling tier is "full" with respect to - # partition dispersion, but not "full" with respect - # to parts_wanted, we don't count it as a possible - # destination. - # - # Otherwise, we gather a partition from tier X - # (because its replicas are not spread out), and - # then we may place it right back in tier X or in - # another tier that already has replicas (because - # that tier has parts_wanted). Then, on the next - # rebalance, it'll happen again, and then again... - # - # Worse yet, this "dancing replica" immobilizes - # other replicas of the partition that want to move - # because they're on devices with negative - # parts_wanted. This can lead to a replica that - # sticks to a zero-weight device no matter how often - # the ring is rebalanced. - if (max_allowed_replicas[t] > - replicas_at_tier.get(t, 0)) - ) - moved_parts + undispersed_dev_replicas.sort( + key=lambda dr: dr[0]['parts_wanted']) + for dev, replica in undispersed_dev_replicas: + if self._last_part_moves[part] < self.min_part_hours: + break + dev['parts_wanted'] += 1 + dev['parts'] -= 1 + assign_parts[part].append(replica) + self.logger.debug( + "Gathered %d/%d from dev %d [dispersion]", + part, replica, dev['id']) + self._replica2part2dev[replica][part] = NONE_DEV + for tier in dev['tiers']: + replicas_at_tier[tier] -= 1 + self._last_part_moves[part] = 0 - # Only allow a part to be gathered if there are wanted - # parts on other tiers. - if (self._last_part_moves[part] >= self.min_part_hours - and available_parts_for_tier > 0): - self._last_part_moves[part] = 0 - spread_out_parts[part].append(replica) - dev['parts_wanted'] += 1 - dev['parts'] -= 1 - removed_replica = True - moved_parts += 1 - self.logger.debug( - "Gathered %d/%d from dev %d [dispersion]", - part, replica, dev['id']) - break - if removed_replica: - for tier in tfd[dev['id']]: - replicas_at_tier[tier] -= 1 + def _gather_parts_for_balance_can_disperse(self, assign_parts, start, + replica_plan): + """ + Update the map of partition => [replicas] to be reassigned from + overweight drives where the replicas can be better dispersed to + another failure domain. 
+ :param assign_parts: the map of partition => [replica] to update + :param start: offset into self.parts to begin search + :param replica_plan: replicanth targets for tiers + """ # Last, we gather partitions from devices that are "overweight" because # they have more partitions than their parts_wanted. - reassign_parts = defaultdict(list) + for offset in range(self.parts): + part = (start + offset) % self.parts + if self._last_part_moves[part] < self.min_part_hours: + continue + # For each part we'll look at the devices holding those parts and + # see if any are overweight, keeping track of replicas_at_tier as + # we go + overweight_dev_replica = [] + replicas_at_tier = defaultdict(int) + for replica in self._replicas_for_part(part): + dev_id = self._replica2part2dev[replica][part] + if dev_id == NONE_DEV: + continue + dev = self.devs[dev_id] + for tier in dev['tiers']: + replicas_at_tier[tier] += 1 + if dev['parts_wanted'] < 0: + overweight_dev_replica.append((dev, replica)) - # We randomly pick a new starting point in the "circular" ring of - # partitions to try to get a better rebalance when called multiple - # times. + if not overweight_dev_replica: + continue - start = self._last_part_gather_start / 4 - start += random.randint(0, self.parts / 2) # GRAH PEP8!!! + overweight_dev_replica.sort( + key=lambda dr: dr[0]['parts_wanted']) + for dev, replica in overweight_dev_replica: + if self._last_part_moves[part] < self.min_part_hours: + break + if any(replica_plan[tier]['min'] <= + replicas_at_tier[tier] < + replica_plan[tier]['max'] + for tier in dev['tiers']): + continue + # this is the most overweight_device holding a replica + # of this part that can shed it according to the plan + dev['parts_wanted'] += 1 + dev['parts'] -= 1 + assign_parts[part].append(replica) + self.logger.debug( + "Gathered %d/%d from dev %d [weight disperse]", + part, replica, dev['id']) + self._replica2part2dev[replica][part] = NONE_DEV + for tier in dev['tiers']: + replicas_at_tier[tier] -= 1 + self._last_part_moves[part] = 0 + + def _gather_parts_for_balance(self, assign_parts, replica_plan): + """ + Gather parts that look like they should move for balance reasons. + + A simple gather of parts that looks dispersible normally works out, + we'll switch strategies if things don't be seem to moving... + """ + # pick a random starting point on the other side of the ring + quarter_turn = (self.parts // 4) + random_half = random.randint(0, self.parts / 2) + start = (self._last_part_gather_start + quarter_turn + + random_half) % self.parts + self.logger.debug('Gather start is %s ' + '(Last start was %s)' % ( + start, self._last_part_gather_start)) self._last_part_gather_start = start - for replica, part2dev in enumerate(self._replica2part2dev): - # If we've got a partial replica, start may be out of - # range. Scale it down so that we get a similar movement - # pattern (but scaled down) on sequential runs. 
- this_start = int(float(start) * len(part2dev) / self.parts) + self._gather_parts_for_balance_can_disperse( + assign_parts, start, replica_plan) + if not assign_parts: + self._gather_parts_for_balance_forced(assign_parts, start) - for part in itertools.chain(range(this_start, len(part2dev)), - range(0, this_start)): + def _gather_parts_for_balance_forced(self, assign_parts, start, **kwargs): + """ + Update the map of partition => [replicas] to be reassigned from + overweight drives without restriction, parts gathered from this method + may be placed back onto devices that are no better (or worse) than the + device from which they are gathered. + + This method allows devices to flop around enough to unlock replicas + that would have otherwise potentially been locked because of + dispersion - it should be used as a last resort. + + :param assign_parts: the map of partition => [replica] to update + :param start: offset into self.parts to begin search + """ + for offset in range(self.parts): + part = (start + offset) % self.parts + if self._last_part_moves[part] < self.min_part_hours: + continue + overweight_dev_replica = [] + for replica in self._replicas_for_part(part): + dev_id = self._replica2part2dev[replica][part] + if dev_id == NONE_DEV: + continue + dev = self.devs[dev_id] + if dev['parts_wanted'] < 0: + overweight_dev_replica.append((dev, replica)) + + if not overweight_dev_replica: + continue + + overweight_dev_replica.sort( + key=lambda dr: dr[0]['parts_wanted']) + for dev, replica in overweight_dev_replica: if self._last_part_moves[part] < self.min_part_hours: - continue - if part in removed_dev_parts or part in spread_out_parts: - continue - dev = self.devs[part2dev[part]] - fudge = self._n_overload_parts(dev) - if dev['parts_wanted'] + fudge < 0: - self._last_part_moves[part] = 0 - dev['parts_wanted'] += 1 - dev['parts'] -= 1 - reassign_parts[part].append(replica) - self.logger.debug( - "Gathered %d/%d from dev %d [weight]", - part, replica, dev['id']) + break + # this is the most overweight_device holding a replica of this + # part we don't know where it's going to end up - but we'll + # pick it up and hope for the best. + dev['parts_wanted'] += 1 + dev['parts'] -= 1 + assign_parts[part].append(replica) + self.logger.debug( + "Gathered %d/%d from dev %d [weight forced]", + part, replica, dev['id']) + self._replica2part2dev[replica][part] = NONE_DEV + self._last_part_moves[part] = 0 - reassign_parts.update(spread_out_parts) - reassign_parts.update(removed_dev_parts) - - reassign_parts_list = list(reassign_parts.items()) - # We shuffle the partitions to reassign so we get a more even - # distribution later. There has been discussion of trying to distribute - # partitions more "regularly" because that would actually reduce risk - # but 1) it is really difficult to do this with uneven clusters and 2) - # it would concentrate load during failure recovery scenarios - # (increasing risk). The "right" answer has yet to be debated to - # conclusion, but working code wins for now. - random.shuffle(reassign_parts_list) - return reassign_parts_list - - def _n_overload_parts(self, dev): - """ - The number of extra partitions a device can take due to overload. - """ - return max(int(math.ceil( - (dev['parts_wanted'] + dev['parts']) - * self._effective_overload)), 0) - - def _reassign_parts(self, reassign_parts): + def _reassign_parts(self, reassign_parts, replica_plan): """ For an existing ring data set, partitions are reassigned similarly to - the initial assignment. 
The devices are ordered by how many partitions - they still want and kept in that order throughout the process. The - gathered partitions are iterated through, assigning them to devices - according to the "most wanted" while keeping the replicas as "far - apart" as possible. Two different regions are considered the - farthest-apart things, followed by zones, then different ip/port pairs - within a zone; the least-far-apart things are different devices with - the same ip/port pair in the same zone. + the initial assignment. - If you want more replicas than devices, you won't get all your - replicas. + The devices are ordered by how many partitions they still want and + kept in that order throughout the process. + + The gathered partitions are iterated through, assigning them to + devices according to the "most wanted" while keeping the replicas as + "far apart" as possible. + + Two different regions are considered the farthest-apart things, + followed by zones, then different ip within a zone; the + least-far-apart things are different devices with the same ip in the + same zone. :param reassign_parts: An iterable of (part, replicas_to_replace) pairs. replicas_to_replace is an iterable of the @@ -1208,12 +1148,9 @@ class RingBuilder(object): replicas_to_replace may be shared for multiple partitions, so be sure you do not modify it. """ - fudge_available_in_tier = defaultdict(int) parts_available_in_tier = defaultdict(int) for dev in self._iter_devs(): dev['sort_key'] = self._sort_key_for(dev) - tiers = tiers_for_dev(dev) - dev['tiers'] = tiers # Note: this represents how many partitions may be assigned to a # given tier (region/zone/server/disk). It does not take into # account how many partitions a given tier wants to shed. @@ -1226,9 +1163,7 @@ class RingBuilder(object): # with partitions to shed, which is any time a device is being # removed, which is a pretty frequent operation. wanted = max(dev['parts_wanted'], 0) - fudge = self._n_overload_parts(dev) - for tier in tiers: - fudge_available_in_tier[tier] += (wanted + fudge) + for tier in dev['tiers']: parts_available_in_tier[tier] += wanted available_devs = \ @@ -1265,153 +1200,45 @@ class RingBuilder(object): depth += 1 for part, replace_replicas in reassign_parts: - # Gather up what other tiers (regions, zones, ip/ports, and - # devices) the replicas not-to-be-moved are in for this part. - other_replicas = defaultdict(int) - occupied_tiers_by_tier_len = defaultdict(set) - for replica in self._replicas_for_part(part): - if replica not in replace_replicas: - dev = self.devs[self._replica2part2dev[replica][part]] - for tier in dev['tiers']: - other_replicas[tier] += 1 - occupied_tiers_by_tier_len[len(tier)].add(tier) + # always update part_moves for min_part_hours + self._last_part_moves[part] = 0 + # count up where these replicas be + replicas_at_tier = defaultdict(int) + for dev in self._devs_for_part(part): + for tier in dev['tiers']: + replicas_at_tier[tier] += 1 for replica in replace_replicas: # Find a new home for this replica tier = () + # This used to be a cute, recursive function, but it's been + # unrolled for performance. depth = 1 while depth <= max_tier_depth: - roomiest_tier = fudgiest_tier = None - # Order the tiers by how many replicas of this - # partition they already have. Then, of the ones - # with the smallest number of replicas and that have - # room to accept more partitions, pick the tier with - # the hungriest drive and then continue searching in - # that subtree. 
- # - # There are other strategies we could use here, - # such as hungriest-tier (i.e. biggest - # sum-of-parts-wanted) or picking one at random. - # However, hungriest-drive is what was used here - # before, and it worked pretty well in practice. - # - # Note that this allocator prioritizes even device - # filling over dispersion, so if your layout is - # extremely unbalanced, you may not get the replica - # dispersion that you expect, and your durability - # may be lessened. - # - # This used to be a cute, recursive function, but it's been - # unrolled for performance. + # Choose the roomiest tier among those that don't + # already have their max replicas assigned according + # to the replica_plan. + candidates = [t for t in tier2children[tier] if + replicas_at_tier[t] < + replica_plan[t]['max']] - # We sort the tiers here so that, when we look for a tier - # with the lowest number of replicas, the first one we - # find is the one with the hungriest drive (i.e. drive - # with the largest sort_key value). This lets us - # short-circuit the search while still ensuring we get the - # right tier. - candidates_with_replicas = \ - occupied_tiers_by_tier_len[len(tier) + 1] + if not candidates: + raise Exception('no home for %s/%s %s' % ( + part, replica, {t: ( + replicas_at_tier[t], + replica_plan[t]['max'], + ) for t in tier2children[tier]})) + tier = max(candidates, key=lambda t: + parts_available_in_tier[t]) - # Among the tiers with room for more partitions, - # find one with the smallest possible number of - # replicas already in it, breaking ties by which one - # has the hungriest drive. - candidates_with_room = [ - t for t in tier2children[tier] - if parts_available_in_tier[t] > 0] - candidates_with_fudge = set([ - t for t in tier2children[tier] - if fudge_available_in_tier[t] > 0]) - candidates_with_fudge.update(candidates_with_room) - - if candidates_with_room: - if len(candidates_with_room) > \ - len(candidates_with_replicas): - # There exists at least one tier with room for - # another partition and 0 other replicas already - # in it, so we can use a faster search. The else - # branch's search would work here, but it's - # significantly slower. 
- roomiest_tier = max( - (t for t in candidates_with_room - if other_replicas[t] == 0), - key=tier2sort_key.__getitem__) - else: - roomiest_tier = max( - candidates_with_room, - key=lambda t: (-other_replicas[t], - tier2sort_key[t])) - else: - roomiest_tier = None - - fudgiest_tier = max(candidates_with_fudge, - key=lambda t: (-other_replicas[t], - tier2sort_key[t])) - - if (roomiest_tier is None or - (other_replicas[roomiest_tier] > - other_replicas[fudgiest_tier])): - subtier = fudgiest_tier - else: - subtier = roomiest_tier - # no putting multiples on the same device - if len(subtier) == 4 and ( - subtier in occupied_tiers_by_tier_len[4]): - sibling_tiers = [ - (d['region'], d['zone'], d['ip'], d['id']) - for d in tier2devs[tier]] - unused_sibling_tiers = [ - t for t in sibling_tiers - if t not in occupied_tiers_by_tier_len[4]] - if unused_sibling_tiers: - # anything is better than the alternative - subtier = random.choice(unused_sibling_tiers) - else: - warnings.warn(RingValidationWarning( - "All devices in tier %r already " - "contain a replica" % (tier,))) - tier = subtier depth += 1 dev = tier2devs[tier][-1] dev['parts_wanted'] -= 1 dev['parts'] += 1 - old_sort_key = dev['sort_key'] - new_sort_key = dev['sort_key'] = self._sort_key_for(dev) for tier in dev['tiers']: parts_available_in_tier[tier] -= 1 - fudge_available_in_tier[tier] -= 1 - other_replicas[tier] += 1 - occupied_tiers_by_tier_len[len(tier)].add(tier) - - index = bisect.bisect_left(tier2dev_sort_key[tier], - old_sort_key) - tier2devs[tier].pop(index) - tier2dev_sort_key[tier].pop(index) - - new_index = bisect.bisect_left(tier2dev_sort_key[tier], - new_sort_key) - tier2devs[tier].insert(new_index, dev) - tier2dev_sort_key[tier].insert(new_index, new_sort_key) - - new_last_sort_key = tier2dev_sort_key[tier][-1] - tier2sort_key[tier] = new_last_sort_key - - # Now jiggle tier2children values to keep them sorted - parent_tier = tier[0:-1] - index = bisect.bisect_left( - tier2children_sort_key[parent_tier], - old_sort_key) - popped = tier2children[parent_tier].pop(index) - tier2children_sort_key[parent_tier].pop(index) - - new_index = bisect.bisect_left( - tier2children_sort_key[parent_tier], - new_last_sort_key) - tier2children[parent_tier].insert(new_index, popped) - tier2children_sort_key[parent_tier].insert( - new_index, new_last_sort_key) + replicas_at_tier[tier] += 1 self._replica2part2dev[replica][part] = dev['id'] self.logger.debug( @@ -1420,13 +1247,12 @@ class RingBuilder(object): # Just to save memory and keep from accidental reuse. for dev in self._iter_devs(): del dev['sort_key'] - del dev['tiers'] @staticmethod def _sort_key_for(dev): return (dev['parts_wanted'], random.randint(0, 0xFFFF), dev['id']) - def _build_max_replicas_by_tier(self): + def _build_max_replicas_by_tier(self, bound=math.ceil): """ Returns a defaultdict of (tier: replica_count) for all tiers in the ring excluding zero weight devices. @@ -1477,21 +1303,254 @@ class RingBuilder(object): """ # Used by walk_tree to know what entries to create for each recursive # call. 
- tier2children = build_tier_tree(d for d in self._iter_devs() if - d['weight']) + tier2children = self._build_tier2children() def walk_tree(tier, replica_count): + if len(tier) == 4: + # special case for device, it's not recursive + replica_count = min(1, replica_count) mr = {tier: replica_count} if tier in tier2children: subtiers = tier2children[tier] for subtier in subtiers: - submax = math.ceil(float(replica_count) / len(subtiers)) + submax = bound(float(replica_count) / len(subtiers)) mr.update(walk_tree(subtier, submax)) return mr mr = defaultdict(float) mr.update(walk_tree((), self.replicas)) return mr + def _build_weighted_replicas_by_tier(self): + """ + Returns a dict mapping => replicanths for all tiers in + the ring based on their weights. + """ + weight_of_one_part = self.weight_of_one_part() + + # assign each device some replicanths by weight (can't be > 1) + weighted_replicas_for_dev = {} + devices_with_room = [] + for dev in self._iter_devs(): + if not dev['weight']: + continue + weighted_replicas = ( + dev['weight'] * weight_of_one_part / self.parts) + if weighted_replicas < 1: + devices_with_room.append(dev['id']) + else: + weighted_replicas = 1 + weighted_replicas_for_dev[dev['id']] = weighted_replicas + + while True: + remaining = self.replicas - sum(weighted_replicas_for_dev.values()) + if remaining < 1e-10: + break + devices_with_room = [d for d in devices_with_room if + weighted_replicas_for_dev[d] < 1] + rel_weight = remaining / sum( + weighted_replicas_for_dev[d] for d in devices_with_room) + for d in devices_with_room: + weighted_replicas_for_dev[d] = min( + 1, weighted_replicas_for_dev[d] * (rel_weight + 1)) + + weighted_replicas_by_tier = defaultdict(float) + for dev in self._iter_devs(): + if not dev['weight']: + continue + assigned_replicanths = weighted_replicas_for_dev[dev['id']] + dev_tier = (dev['region'], dev['zone'], dev['ip'], dev['id']) + for i in range(len(dev_tier) + 1): + tier = dev_tier[:i] + weighted_replicas_by_tier[tier] += assigned_replicanths + + # belts & suspenders/paranoia - at every level, the sum of + # weighted_replicas should be very close to the total number of + # replicas for the ring + tiers = ['cluster', 'regions', 'zones', 'servers', 'devices'] + for i, tier_name in enumerate(tiers): + replicas_at_tier = sum(weighted_replicas_by_tier[t] for t in + weighted_replicas_by_tier if len(t) == i) + if abs(self.replicas - replicas_at_tier) > 1e-10: + raise exceptions.RingValidationError( + '%s != %s at tier %s' % ( + replicas_at_tier, self.replicas, tier_name)) + + return weighted_replicas_by_tier + + def _build_wanted_replicas_by_tier(self): + """ + Returns a defaultdict of (tier: replicanths) for all tiers in the ring + based on unique-as-possible (full dispersion) with respect to their + weights and device counts. + + N.B. _build_max_replicas_by_tier calculates the upper bound on the + replicanths each tier may hold irrespective of the weights of the + tier; this method will calculate the minimum replicanth <= + max_replicas[tier] that will still solve dispersion. However it is + not guaranteed to return a fully dispersed solution if failure domains + are over-weighted for their device count. 
+ """ + weighted_replicas = self._build_weighted_replicas_by_tier() + dispersed_replicas = { + t: { + 'min': math.floor(r), + 'max': math.ceil(r), + } for (t, r) in + self._build_max_replicas_by_tier(bound=float).items() + } + + # watch out for device limited tiers + num_devices = defaultdict(int) + for d in self._iter_devs(): + if d['weight'] <= 0: + continue + for t in (d.get('tiers') or tiers_for_dev(d)): + num_devices[t] += 1 + num_devices[()] += 1 + + tier2children = self._build_tier2children() + + wanted_replicas = defaultdict(float) + + def place_replicas(tier, replicanths): + if replicanths > num_devices[tier]: + raise exceptions.RingValidationError( + 'More than replicanths (%s) than devices (%s) ' + 'in tier (%s)' % (replicanths, num_devices[tier], tier)) + wanted_replicas[tier] = replicanths + sub_tiers = sorted(tier2children[tier]) + if not sub_tiers: + return + + to_place = defaultdict(float) + remaining = replicanths + tiers_to_spread = sub_tiers + device_limited = False + + while True: + rel_weight = remaining / sum(weighted_replicas[t] + for t in tiers_to_spread) + for t in tiers_to_spread: + replicas = to_place[t] + ( + weighted_replicas[t] * rel_weight) + if replicas < dispersed_replicas[t]['min']: + replicas = dispersed_replicas[t]['min'] + elif (replicas > dispersed_replicas[t]['max'] and + not device_limited): + replicas = dispersed_replicas[t]['max'] + if replicas > num_devices[t]: + replicas = num_devices[t] + to_place[t] = replicas + + remaining = replicanths - sum(to_place.values()) + + if remaining < -1e-10: + tiers_to_spread = [ + t for t in sub_tiers + if to_place[t] > dispersed_replicas[t]['min'] + ] + elif remaining > 1e-10: + tiers_to_spread = [ + t for t in sub_tiers + if (num_devices[t] > to_place[t] < + dispersed_replicas[t]['max']) + ] + if not tiers_to_spread: + device_limited = True + tiers_to_spread = [ + t for t in sub_tiers + if to_place[t] < num_devices[t] + ] + else: + # remaining is "empty" + break + + for t in sub_tiers: + self.logger.debug('Planning %s on %s', + to_place[t], t) + place_replicas(t, to_place[t]) + + # place all replicas in the cluster tier + place_replicas((), self.replicas) + + # belts & suspenders/paranoia - at every level, the sum of + # wanted_replicas should be very close to the total number of + # replicas for the ring + tiers = ['cluster', 'regions', 'zones', 'servers', 'devices'] + for i, tier_name in enumerate(tiers): + replicas_at_tier = sum(wanted_replicas[t] for t in + wanted_replicas if len(t) == i) + if abs(self.replicas - replicas_at_tier) > 1e-10: + raise exceptions.RingValidationError( + '%s != %s at tier %s' % ( + replicas_at_tier, self.replicas, tier_name)) + + return wanted_replicas + + def _build_target_replicas_by_tier(self): + """ + Build a map of => accounting for device + weights, unique-as-possible dispersion and overload. 
+ + - a tuple, describing each tier in the ring topology + - a float, the target replicanths at the tier + """ + weighted_replicas = self._build_weighted_replicas_by_tier() + wanted_replicas = self._build_wanted_replicas_by_tier() + max_overload = self.get_required_overload(weighted=weighted_replicas, + wanted=wanted_replicas) + if max_overload <= 0.0: + return wanted_replicas + else: + overload = min(self.overload, max_overload) + self.logger.debug("Using effective overload of %f", overload) + target_replicas = defaultdict(float) + for tier, weighted in weighted_replicas.items(): + m = (wanted_replicas[tier] - weighted) / max_overload + target_replicas[tier] = m * overload + weighted + + # belts & suspenders/paranoia - at every level, the sum of + # target_replicas should be very close to the total number + # of replicas for the ring + tiers = ['cluster', 'regions', 'zones', 'servers', 'devices'] + for i, tier_name in enumerate(tiers): + replicas_at_tier = sum(target_replicas[t] for t in + target_replicas if len(t) == i) + if abs(self.replicas - replicas_at_tier) > 1e-10: + raise exceptions.RingValidationError( + '%s != %s at tier %s' % ( + replicas_at_tier, self.replicas, tier_name)) + + return target_replicas + + def _build_replica_plan(self): + """ + Wraps return value of _build_target_replicas_by_tier to include + pre-calculated min and max values for each tier. + + :returns: a dict, mapping => , where + is itself a dict + + include at least the following keys: + + min - the minimum number of replicas at the tier + target - the target replicanths at the tier + max - the maximum number of replicas at the tier + """ + # replica part-y planner! + target_replicas = self._build_target_replicas_by_tier() + replica_plan = defaultdict( + lambda: {'min': 0, 'target': 0, 'max': 0}) + replica_plan.update({ + t: { + 'min': math.floor(r + 1e-10), + 'target': r, + 'max': math.ceil(r - 1e-10), + } for (t, r) in + target_replicas.items() + }) + return replica_plan + def _devs_for_part(self, part): """ Returns a list of devices for a specified partition. 
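As a rough sketch of how the plan above might be consumed (the builder
instance and the toy values here are assumptions for illustration, not
part of the patch):

    # illustrative only: a zone owed 1.5 replicanths is planned as
    # {'min': 1, 'target': 1.5, 'max': 2}
    replica_plan = builder._build_replica_plan()
    plan = replica_plan[(0, 1)]     # e.g. the tier for region 0, zone 1
    assert plan['min'] <= plan['target'] <= plan['max']
    # below 'min' is a dispersion problem to fix first; above 'max' is a
    # sure overage that can safely be gathered up and reassigned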
@@ -1500,9 +1559,15 @@ class RingBuilder(object): """ if self._replica2part2dev is None: return [] - return [self.devs[part2dev[part]] - for part2dev in self._replica2part2dev - if part < len(part2dev)] + devs = [] + for part2dev in self._replica2part2dev: + if part >= len(part2dev): + continue + dev_id = part2dev[part] + if dev_id == NONE_DEV: + continue + devs.append(self.devs[dev_id]) + return devs def _replicas_for_part(self, part): """ diff --git a/test/unit/common/ring/test_builder.py b/test/unit/common/ring/test_builder.py index f31e4ab747..99348d445e 100644 --- a/test/unit/common/ring/test_builder.py +++ b/test/unit/common/ring/test_builder.py @@ -25,13 +25,14 @@ from collections import defaultdict from math import ceil from tempfile import mkdtemp from shutil import rmtree -import warnings +import random from six.moves import range from swift.common import exceptions from swift.common import ring -from swift.common.ring.builder import MAX_BALANCE, RingValidationWarning +from swift.common.ring import utils +from swift.common.ring.builder import MAX_BALANCE class TestRingBuilder(unittest.TestCase): @@ -343,12 +344,16 @@ class TestRingBuilder(unittest.TestCase): rb.rebalance() rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 1, 'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'}) - rb.pretend_min_part_hours_passed() - parts = rb._gather_reassign_parts() + replica_plan = rb._build_replica_plan() + rb._set_parts_wanted(replica_plan) + for dev in rb._iter_devs(): + dev['tiers'] = utils.tiers_for_dev(dev) + assign_parts = defaultdict(list) + rb._gather_parts_for_balance(assign_parts, replica_plan) max_run = 0 run = 0 last_part = 0 - for part, _ in parts: + for part, _ in assign_parts.items(): if part > last_part: run += 1 else: @@ -358,7 +363,7 @@ class TestRingBuilder(unittest.TestCase): last_part = part if run > max_run: max_run = run - return max_run > len(parts) / 2 + return max_run > len(assign_parts) / 2 def test_initial_balance(self): # 2 boxes, 2 drives each in zone 1 @@ -652,7 +657,7 @@ class TestRingBuilder(unittest.TestCase): "Partition %d did not move (got %r)" % (part, devs)) def test_multitier_dont_move_too_many_replicas(self): - rb = ring.RingBuilder(8, 3, 0) + rb = ring.RingBuilder(8, 3, 1) # there'll be at least one replica in z0 and z1 rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 0.5, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'}) @@ -672,6 +677,7 @@ class TestRingBuilder(unittest.TestCase): 'ip': '127.0.0.1', 'port': 10000, 'device': 'sde1'}) rb.add_dev({'id': 4, 'region': 0, 'zone': 4, 'weight': 1, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdf1'}) + rb.pretend_min_part_hours_passed() rb.rebalance() rb.validate() @@ -688,6 +694,73 @@ class TestRingBuilder(unittest.TestCase): "Partition %d not in zones 0 and 1 (got %r)" % (part, zones)) + def test_min_part_hours_zero_will_move_whatever_it_takes(self): + rb = ring.RingBuilder(8, 3, 0) + # there'll be at least one replica in z0 and z1 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 0.5, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 0.5, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb1'}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 0, 'weight': 0.5, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'}) + rb.add_dev({'id': 6, 'region': 0, 'zone': 1, 'weight': 0.5, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb1'}) + rb.rebalance(seed=1) + rb.validate() + + rb.add_dev({'id': 2, 'region': 0, 'zone': 2, 'weight': 1, + 
'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd1'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sde1'}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 4, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdf1'}) + rb.rebalance(seed=3) + rb.validate() + + self.assertEqual(0, rb.dispersion) + # a balance of w/i a 1% isn't too bad for 3 replicas on 7 + # devices when part power is only 8 + self.assertAlmostEqual(rb.get_balance(), 0, delta=0.5) + + # every zone has either 153 or 154 parts + for zone, count in self._partition_counts( + rb, key='zone').items(): + self.assertAlmostEqual(153.5, count, delta=1) + + parts_with_moved_count = defaultdict(int) + for part in range(rb.parts): + zones = set() + for replica in range(rb.replicas): + zones.add(rb.devs[rb._replica2part2dev[replica][part]]['zone']) + moved_replicas = len(zones - {0, 1}) + parts_with_moved_count[moved_replicas] += 1 + + # as usual, the real numbers depend on the seed, but we want to + # validate a few things here: + # + # 1) every part had to move one replica to hit dispersion (so no + # one can have a moved count 0) + # + # 2) it's quite reasonable that some small percent of parts will + # have a replica in {0, 1, X} (meaning only one replica of the + # part moved) + # + # 3) when min_part_hours is 0, more than one replica of a part + # can move in a rebalance, and since that movement would get to + # better dispersion faster we expect to observe most parts in + # {[0,1], X, X} (meaning *two* replicas of the part moved) + # + # 4) there's plenty of weight in z0 & z1 to hold a whole + # replicanth, so there is no reason for any part to have to move + # all three replicas out of those zones (meaning no one can have + # a moved count 3) + # + expected = { + 1: 52, + 2: 204, + } + self.assertEqual(parts_with_moved_count, expected) + def test_rerebalance(self): rb = ring.RingBuilder(8, 3, 1) rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1, @@ -697,29 +770,17 @@ class TestRingBuilder(unittest.TestCase): rb.add_dev({'id': 2, 'region': 0, 'zone': 2, 'weight': 1, 'ip': '127.0.0.1', 'port': 10002, 'device': 'sda1'}) rb.rebalance() - r = rb.get_ring() - counts = {} - for part2dev_id in r._replica2part2dev_id: - for dev_id in part2dev_id: - counts[dev_id] = counts.get(dev_id, 0) + 1 + counts = self._partition_counts(rb) self.assertEqual(counts, {0: 256, 1: 256, 2: 256}) rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 1, 'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'}) rb.pretend_min_part_hours_passed() rb.rebalance() - r = rb.get_ring() - counts = {} - for part2dev_id in r._replica2part2dev_id: - for dev_id in part2dev_id: - counts[dev_id] = counts.get(dev_id, 0) + 1 + counts = self._partition_counts(rb) self.assertEqual(counts, {0: 192, 1: 192, 2: 192, 3: 192}) rb.set_dev_weight(3, 100) rb.rebalance() - r = rb.get_ring() - counts = {} - for part2dev_id in r._replica2part2dev_id: - for dev_id in part2dev_id: - counts[dev_id] = counts.get(dev_id, 0) + 1 + counts = self._partition_counts(rb) self.assertEqual(counts[3], 256) def test_add_rebalance_add_rebalance_delete_rebalance(self): @@ -771,12 +832,12 @@ class TestRingBuilder(unittest.TestCase): rb.add_dev({'id': 6, 'region': 0, 'zone': 3, 'weight': 1.0, 'ip': '127.0.0.3', 'port': 10000, 'device': 'sdc'}) - rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 0.5, + rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 0.4, 'ip': '127.0.0.3', 'port': 10001, 'device': 'zero'}) zero_weight_dev = 3 - 
rb.rebalance() + rb.rebalance(seed=1) # We want at least one partition with replicas only in zone 2 and 3 # due to device weights. It would *like* to spread out into zone 1, @@ -799,6 +860,14 @@ class TestRingBuilder(unittest.TestCase): array('H', [1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4]), array('H', [0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 5, 6, 2, 5, 6])] + # fix up bookkeeping + new_dev_parts = defaultdict(int) + for part2dev_id in rb._replica2part2dev: + for dev_id in part2dev_id: + new_dev_parts[dev_id] += 1 + for dev in rb._iter_devs(): + dev['parts'] = new_dev_parts[dev['id']] + rb.set_dev_weight(zero_weight_dev, 0.0) rb.pretend_min_part_hours_passed() rb.rebalance(seed=1) @@ -807,13 +876,243 @@ class TestRingBuilder(unittest.TestCase): for part2dev_id in rb._replica2part2dev: for dev_id in part2dev_id: node_counts[dev_id] += 1 + self.assertEqual(node_counts[zero_weight_dev], 0) # it's as balanced as it gets, so nothing moves anymore rb.pretend_min_part_hours_passed() parts_moved, _balance, _removed = rb.rebalance(seed=1) + + new_node_counts = defaultdict(int) + for part2dev_id in rb._replica2part2dev: + for dev_id in part2dev_id: + new_node_counts[dev_id] += 1 + + del node_counts[zero_weight_dev] + self.assertEqual(node_counts, new_node_counts) + self.assertEqual(parts_moved, 0) + def test_part_swapping_problem(self): + rb = ring.RingBuilder(4, 3, 1) + # 127.0.0.1 (2 devs) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + # 127.0.0.2 (3 devs) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdc'}) + + expected = { + '127.0.0.1': 1.2, + '127.0.0.2': 1.7999999999999998, + } + for wr in (rb._build_weighted_replicas_by_tier(), + rb._build_wanted_replicas_by_tier(), + rb._build_target_replicas_by_tier()): + self.assertEqual(expected, {t[-1]: r for (t, r) in + wr.items() if len(t) == 3}) + self.assertEqual(rb.get_required_overload(), 0) + rb.rebalance(seed=3) + # so 127.0.0.1 ended up with... + tier = (0, 0, '127.0.0.1') + # ... 6 parts with 1 replicas + self.assertEqual(rb._dispersion_graph[tier][1], 12) + # ... 4 parts with 2 replicas + self.assertEqual(rb._dispersion_graph[tier][2], 4) + # but since we only have two tiers, this is *totally* dispersed + self.assertEqual(0, rb.dispersion) + + # small rings are hard to balance... 
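        # (with part power 4 there are 2 ** 4 * 3 == 48 part-replicas to
        # split across 5 devices -- 9.6 apiece -- so someone always ends
        # up a whole part away from its fair share)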
+ expected = {0: 10, 1: 10, 2: 10, 3: 9, 4: 9} + self.assertEqual(expected, {d['id']: d['parts'] + for d in rb._iter_devs()}) + # everyone wants 9.6 parts + expected = { + 0: 4.166666666666671, + 1: 4.166666666666671, + 2: 4.166666666666671, + 3: -6.25, + 4: -6.25, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + # original sorted _replica2part2dev + """ + rb._replica2part2dev = [ + array('H', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]), + array('H', [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3]), + array('H', [2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4])] + """ + + # now imagine if we came along this _replica2part2dev through no + # fault of our own; if instead of the 12 parts with only one + # replica on 127.0.0.1 being split evenly (6 and 6) on device's + # 0 and 1 - device 1 inexplicitly had 3 extra parts + rb._replica2part2dev = [ + # these are the relevant one's here + # | | | + # v v v + array('H', [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + array('H', [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3]), + array('H', [2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4])] + + # fix up bookkeeping + new_dev_parts = defaultdict(int) + for part2dev_id in rb._replica2part2dev: + for dev_id in part2dev_id: + new_dev_parts[dev_id] += 1 + for dev in rb._iter_devs(): + dev['parts'] = new_dev_parts[dev['id']] + + rb.pretend_min_part_hours_passed() + rb.rebalance() + expected = { + 0: 4.166666666666671, + 1: 4.166666666666671, + 2: 4.166666666666671, + 3: -6.25, + 4: -6.25, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + self.assertEqual(rb.get_balance(), 6.25) + + def test_wrong_tier_with_no_where_to_go(self): + rb = ring.RingBuilder(4, 3, 1) + + # 127.0.0.1 (even devices) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 900, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 0, 'weight': 900, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 6, 'region': 0, 'zone': 0, 'weight': 900, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + # 127.0.0.2 (odd devices) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdd'}) + rb.add_dev({'id': 7, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdd'}) + + expected = { + '127.0.0.1': 1.75, + '127.0.0.2': 1.25, + } + for wr in (rb._build_weighted_replicas_by_tier(), + rb._build_wanted_replicas_by_tier(), + rb._build_target_replicas_by_tier()): + self.assertEqual(expected, {t[-1]: r for (t, r) in + wr.items() if len(t) == 3}) + self.assertEqual(rb.get_required_overload(), 0) + rb.rebalance(seed=3) + # so 127.0.0.1 ended up with... + tier = (0, 0, '127.0.0.1') + # ... 4 parts with 1 replicas + self.assertEqual(rb._dispersion_graph[tier][1], 4) + # ... 12 parts with 2 replicas + self.assertEqual(rb._dispersion_graph[tier][2], 12) + # ... 
and of course 0 parts with 3 replicas + self.assertEqual(rb._dispersion_graph[tier][3], 0) + # but since we only have two tiers, this is *totally* dispersed + self.assertEqual(0, rb.dispersion) + + # small rings are hard to balance, but it's possible when + # part-replicas (3 * 2 ** 4) can go evenly into device weights + # (4800) like we've done here + expected = { + 0: 1, + 2: 9, + 4: 9, + 6: 9, + 1: 5, + 3: 5, + 5: 5, + 7: 5, + } + self.assertEqual(expected, {d['id']: d['parts'] + for d in rb._iter_devs()}) + expected = { + 0: 0.0, + 1: 0.0, + 2: 0.0, + 3: 0.0, + 4: 0.0, + 5: 0.0, + 6: 0.0, + 7: 0.0, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + # all devices have exactly the # of parts they want + expected = { + 0: 0, + 2: 0, + 4: 0, + 6: 0, + 1: 0, + 3: 0, + 5: 0, + 7: 0, + } + self.assertEqual(expected, {d['id']: d['parts_wanted'] + for d in rb._iter_devs()}) + + # original sorted _replica2part2dev + """ + rb._replica2part2dev = [ + array('H', [0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, ]), + array('H', [4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1, 1, ]), + array('H', [1, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 7, 7, 7, 7, 7, ])] + """ + # now imagine if we came along this _replica2part2dev through no + # fault of our own; and device 0 had extra parts, but both + # copies of the other replicas were already in the other tier! + rb._replica2part2dev = [ + # these are the relevant one's here + # | | + # v v + array('H', [2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 0, 0]), + array('H', [4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1]), + array('H', [1, 1, 3, 3, 3, 3, 5, 5, 5, 5, 5, 7, 7, 7, 7, 7])] + + # fix up bookkeeping + new_dev_parts = defaultdict(int) + for part2dev_id in rb._replica2part2dev: + for dev_id in part2dev_id: + new_dev_parts[dev_id] += 1 + for dev in rb._iter_devs(): + dev['parts'] = new_dev_parts[dev['id']] + replica_plan = rb._build_replica_plan() + rb._set_parts_wanted(replica_plan) + + expected = { + 0: -1, # this device wants to shed + 2: 0, + 4: 0, + 6: 0, + 1: 0, + 3: 1, # there's devices with room on the other server + 5: 0, + 7: 0, + } + self.assertEqual(expected, {d['id']: d['parts_wanted'] + for d in rb._iter_devs()}) + + rb.pretend_min_part_hours_passed() + rb.rebalance() + self.assertEqual(rb.get_balance(), 0) + def test_region_fullness_with_balanceable_ring(self): rb = ring.RingBuilder(8, 3, 1) rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1, @@ -891,9 +1190,14 @@ class TestRingBuilder(unittest.TestCase): population_by_region = self._get_population_by_region(rb) self.assertEqual(population_by_region, {0: 682, 1: 86}) - # only 86 parts *should* move (to the new region) but randomly some - # parts will flop around devices in the original region too - self.assertEqual(90, changed_parts) + # really 86 parts *should* move (to the new region) but to avoid + # accidentally picking up too many and causing some parts to randomly + # flop around devices in the original region - our gather algorithm + # is conservative when picking up only from devices that are for sure + # holding more parts than they want (math.ceil() of the replica_plan) + # which guarantees any parts picked up will have new homes in a better + # tier or failure_domain. 
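        # (the 86 parts moved are exactly region 1's share from the
        # population check above: each of those parts gave up one replica
        # from region 0 and nothing else got shuffled along the way)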
+ self.assertEqual(86, changed_parts) # and since there's not enough room, subsequent rebalances will not # cause additional assignments to r1 @@ -933,6 +1237,7 @@ class TestRingBuilder(unittest.TestCase): # Increase the weight of region 1 slowly moved_partitions = [] + errors = [] for weight in range(0, 101, 10): rb.set_dev_weight(5, weight) rb.pretend_min_part_hours_passed() @@ -943,11 +1248,17 @@ class TestRingBuilder(unittest.TestCase): # Otherwise there will be replicas at risk min_parts_for_r1 = ceil(weight / (500.0 + weight) * 768) parts_for_r1 = self._get_population_by_region(rb).get(1, 0) - self.assertEqual(min_parts_for_r1, parts_for_r1) + try: + self.assertEqual(min_parts_for_r1, parts_for_r1) + except AssertionError: + errors.append('weight %s got %s parts but expected %s' % ( + weight, parts_for_r1, min_parts_for_r1)) + + self.assertFalse(errors) # Number of partitions moved on each rebalance # 10/510 * 768 ~ 15.06 -> move at least 15 partitions in first step - ref = [0, 17, 16, 17, 13, 15, 13, 12, 11, 13, 13] + ref = [0, 16, 14, 14, 13, 13, 13, 12, 11, 12, 10] self.assertEqual(ref, moved_partitions) def test_set_replicas_increase(self): @@ -1167,14 +1478,13 @@ class TestRingBuilder(unittest.TestCase): # Devices 0 and 1 take 10% more than their fair shares by weight since # overload is 10% (0.1). rb.set_overload(0.1) - for _ in range(2): - rb.pretend_min_part_hours_passed() - rb.rebalance(seed=12345) + rb.pretend_min_part_hours_passed() + rb.rebalance() part_counts = self._partition_counts(rb, key='zone') self.assertEqual(part_counts[0], 212) - self.assertEqual(part_counts[1], 212) - self.assertEqual(part_counts[2], 344) + self.assertEqual(part_counts[1], 211) + self.assertEqual(part_counts[2], 345) # Now, devices 0 and 1 take 50% more than their fair shares by # weight. 
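The overload behaviour these assertions exercise follows the interpolation
in _build_target_replicas_by_tier shown earlier; a minimal standalone
restatement (assumed equivalent, with made-up sample numbers) looks like:

    def interpolate_target(weighted, wanted, overload, required_overload):
        # with no required overload the fully dispersed ("wanted") share
        # is used as-is; otherwise move from the weighted share toward it
        # by the fraction of the required overload actually granted
        if required_overload <= 0.0:
            return wanted
        effective = min(overload, required_overload)
        return weighted + (wanted - weighted) * (
            effective / required_overload)

    # e.g. granting half the required overload gets a tier about half way
    # from its weighted share to its wanted share
    interpolate_target(0.6429, 0.6667, overload=0.0185,
                       required_overload=0.037)   # ~0.6548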
@@ -1244,14 +1554,14 @@ class TestRingBuilder(unittest.TestCase): # Add some weight: balance improves for dev in rb.devs: if dev['ip'] in ('127.0.0.1', '127.0.0.2'): - rb.set_dev_weight(dev['id'], 1.5) + rb.set_dev_weight(dev['id'], 1.22) rb.pretend_min_part_hours_passed() rb.rebalance(seed=12345) part_counts = self._partition_counts(rb, key='ip') - self.assertEqual(part_counts['127.0.0.1'], 236) - self.assertEqual(part_counts['127.0.0.2'], 236) - self.assertEqual(part_counts['127.0.0.3'], 296) + self.assertEqual(part_counts['127.0.0.1'], 238) + self.assertEqual(part_counts['127.0.0.2'], 237) + self.assertEqual(part_counts['127.0.0.3'], 293) # Even out the weights: balance becomes perfect for dev in rb.devs: @@ -1287,11 +1597,15 @@ class TestRingBuilder(unittest.TestCase): rb.pretend_min_part_hours_passed() rb.rebalance(seed=12345) + expected = { + '127.0.0.1': 192, + '127.0.0.2': 192, + '127.0.0.3': 192, + '127.0.0.4': 192, + } + part_counts = self._partition_counts(rb, key='ip') - self.assertEqual(part_counts['127.0.0.1'], 192) - self.assertEqual(part_counts['127.0.0.2'], 192) - self.assertEqual(part_counts['127.0.0.3'], 192) - self.assertEqual(part_counts['127.0.0.4'], 192) + self.assertEqual(part_counts, expected) def test_overload_keeps_balanceable_things_balanced_initially(self): rb = ring.RingBuilder(8, 3, 1) @@ -1705,6 +2019,8 @@ class TestRingBuilder(unittest.TestCase): ]) self.assertEqual(int(worst), 0) + # min part hours should pin all the parts assigned to this zero + # weight device onto it such that the balance will look horrible rb.set_dev_weight(2, 0) rb.rebalance() self.assertEqual(rb.validate(stats=True)[1], MAX_BALANCE) @@ -1789,24 +2105,11 @@ class TestRingBuilder(unittest.TestCase): def __eq__(self, other): return self.substr in other - with warnings.catch_warnings(): - # we're firing the warning twice in this test and resetwarnings - # doesn't work - https://bugs.python.org/issue4180 - warnings.simplefilter('always') + with self.assertRaises(exceptions.RingValidationError) as e: + rb.validate() - # by default things will work, but log a warning - with mock.patch('sys.stderr') as mock_stderr: - rb.validate() - expected = SubStringMatcher( - 'RingValidationWarning: The partition 200 has been assigned ' - 'to duplicate devices') - # ... but the warning is written to stderr - self.assertEqual(mock_stderr.method_calls, - [mock.call.write(expected)]) - # if you make warnings errors it blows up - with warnings.catch_warnings(): - warnings.filterwarnings('error') - self.assertRaises(RingValidationWarning, rb.validate) + expected = 'The partition 200 has been assigned to duplicate devices' + self.assertIn(expected, str(e.exception)) def test_get_part_devices(self): rb = ring.RingBuilder(8, 3, 1) @@ -1832,12 +2135,12 @@ class TestRingBuilder(unittest.TestCase): 'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'}) rb.add_dev({'id': 2, 'region': 0, 'zone': 2, 'weight': 1, 'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'}) - rb.rebalance(seed=9) + rb.rebalance(seed=4) # note: partition 255 will only have 2 replicas part_devs = sorted(rb.get_part_devices(255), key=operator.itemgetter('id')) - self.assertEqual(part_devs, [rb.devs[0], rb.devs[1]]) + self.assertEqual(part_devs, [rb.devs[1], rb.devs[2]]) def test_dispersion_with_zero_weight_devices(self): rb = ring.RingBuilder(8, 3.0, 0) @@ -1975,31 +2278,10 @@ class TestRingBuilder(unittest.TestCase): rb.rebalance(seed=7) rb.validate() - # ok, we didn't quite disperse - self.assertGreater(rb.dispersion, 0) - - # ... 
but let's unlock some parts - rb.pretend_min_part_hours_passed() - rb.rebalance(seed=7) - rb.validate() - - # ... and that got it! + # ... and that got it in one pass boo-yah! self.assertEqual(rb.dispersion, 0) - def strawman_test(self): - """ - This test demonstrates a trivial failure of part-replica placement. - - If you turn warnings into errors this will fail. - - i.e. - - export PYTHONWARNINGS=error:::swift.common.ring.builder - - N.B. try not to get *too* hung up on doing something silly to make - this particular case pass w/o warnings - it's trivial to write up a - dozen more. - """ + def zone_weights_over_device_count(self): rb = ring.RingBuilder(8, 3, 1) # z0 rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 100, @@ -2011,16 +2293,35 @@ class TestRingBuilder(unittest.TestCase): rb.add_dev({'id': 2, 'region': 0, 'zone': 2, 'weight': 200, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sda'}) - with warnings.catch_warnings(record=True) as w: - rb.rebalance(seed=7) - rb.validate() - self.assertEqual(len(w), 65) + rb.rebalance(seed=7) + rb.validate() + self.assertEqual(rb.dispersion, 0) + self.assertAlmostEqual(rb.get_balance(), (1.0 / 3.0) * 100) + + def test_more_devices_than_replicas_validation_when_removed_dev(self): + rb = ring.RingBuilder(8, 3, 1) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'weight': 1.0, 'device': 'sda'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'weight': 1.0, 'device': 'sdb'}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'weight': 1.0, 'device': 'sdc'}) + rb.rebalance() + rb.remove_dev(2) + with self.assertRaises(ValueError) as e: + rb.set_dev_weight(2, 1) + msg = "Can not set weight of dev_id 2 because it is marked " \ + "for removal" + self.assertIn(msg, str(e.exception)) + with self.assertRaises(exceptions.RingValidationError) as e: + rb.rebalance() + msg = 'Replica count of 3 requires more than 2 devices' + self.assertIn(msg, str(e.exception)) class TestGetRequiredOverload(unittest.TestCase): - def assertApproximately(self, a, b, error=1e-6): - self.assertTrue(abs(a - b) < error, - "%f and %f differ by more than %f" % (a, b, error)) + + maxDiff = None def test_none_needed(self): rb = ring.RingBuilder(8, 3, 1) @@ -2035,11 +2336,108 @@ class TestGetRequiredOverload(unittest.TestCase): # 4 equal-weight devs and 3 replicas: this can be balanced without # resorting to overload at all - self.assertApproximately(rb.get_required_overload(), 0) + self.assertAlmostEqual(rb.get_required_overload(), 0) + + expected = { + (0, 0, '127.0.0.1', 0): 0.75, + (0, 0, '127.0.0.1', 1): 0.75, + (0, 0, '127.0.0.1', 2): 0.75, + (0, 0, '127.0.0.1', 3): 0.75, + } + + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, { + tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 4}) + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 4}) + + # since no overload is needed, target_replicas is the same + rb.set_overload(0.10) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + # ... no matter how high you go! 
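        # (get_required_overload() is 0 for this layout, and in that case
        # _build_target_replicas_by_tier hands back the wanted replicanths
        # untouched, so the overload setting is never even consulted)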
+ rb.set_overload(100.0) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) # 3 equal-weight devs and 3 replicas: this can also be balanced rb.remove_dev(3) - self.assertApproximately(rb.get_required_overload(), 0) + self.assertAlmostEqual(rb.get_required_overload(), 0) + + expected = { + (0, 0, '127.0.0.1', 0): 1.0, + (0, 0, '127.0.0.1', 1): 1.0, + (0, 0, '127.0.0.1', 2): 1.0, + } + + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 4}) + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 4}) + + # ... still no overload + rb.set_overload(100.0) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + def test_equal_replica_and_devices_count_ignore_weights(self): + rb = ring.RingBuilder(8, 3, 1) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 7.47, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 5.91, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 6.44, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + expected = { + 0: 1.0, + 1: 1.0, + 2: 1.0, + } + # simplicity itself + self.assertEqual(expected, { + t[-1]: r for (t, r) in + rb._build_weighted_replicas_by_tier().items() + if len(t) == 4}) + self.assertEqual(expected, { + t[-1]: r for (t, r) in + rb._build_wanted_replicas_by_tier().items() + if len(t) == 4}) + self.assertEqual(expected, { + t[-1]: r for (t, r) in + rb._build_target_replicas_by_tier().items() + if len(t) == 4}) + # ... no overload required! + self.assertEqual(0, rb.get_required_overload()) + + rb.rebalance() + expected = { + 0: 256, + 1: 256, + 2: 256, + } + self.assertEqual(expected, {d['id']: d['parts'] for d in + rb._iter_devs()}) def test_small_zone(self): rb = ring.RingBuilder(8, 3, 1) @@ -2058,9 +2456,155 @@ class TestGetRequiredOverload(unittest.TestCase): rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'weight': 3, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) - # Zone 2 has 7/8 of the capacity of the other two zones, so an - # overload of 1/7 will allow things to balance out. 
- self.assertApproximately(rb.get_required_overload(), 1.0 / 7) + expected = { + (0, 0): 1.0434782608695652, + (0, 1): 1.0434782608695652, + (0, 2): 0.9130434782608695, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + expected = { + (0, 0): 1.0, + (0, 1): 1.0, + (0, 2): 1.0, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + # the device tier is interesting because one of the devices in zone + # two has a different weight + expected = { + 0: 0.5217391304347826, + 1: 0.5217391304347826, + 2: 0.5217391304347826, + 3: 0.5217391304347826, + 4: 0.5217391304347826, + 5: 0.3913043478260869, + } + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 4}) + + # ... but, each pair of devices still needs to hold a whole + # replicanth; which we'll try distribute fairly among devices in + # zone 2, so that they can share the burden and ultimately the + # required overload will be as small as possible. + expected = { + 0: 0.5, + 1: 0.5, + 2: 0.5, + 3: 0.5, + 4: 0.5714285714285715, + 5: 0.42857142857142855, + } + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 4}) + + # full dispersion requires zone two's devices to eat more than + # they're weighted for + self.assertAlmostEqual(rb.get_required_overload(), 0.095238, + delta=1e-5) + + # so... if we give it enough overload it we should get full dispersion + rb.set_overload(0.1) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + def test_multiple_small_zones(self): + rb = ring.RingBuilder(8, 3, 1) + + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 8, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 9, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + + rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'weight': 150, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'weight': 150, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) + rb.add_dev({'id': 10, 'region': 0, 'zone': 1, 'weight': 150, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) + + rb.add_dev({'id': 4, 'region': 0, 'zone': 2, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) + + rb.add_dev({'id': 6, 'region': 0, 'zone': 3, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 7, 'region': 0, 'zone': 3, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) + + expected = { + (0, 0): 2.1052631578947367, + (0, 1): 0.47368421052631576, + (0, 2): 0.21052631578947367, + (0, 3): 0.21052631578947367, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in 
weighted_replicas.items() + if len(tier) == 2}) + + # without any overload, we get weight + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: r + for (tier, r) in target_replicas.items() + if len(tier) == 2}) + + expected = { + (0, 0): 1.0, + (0, 1): 1.0, + (0, 2): 0.49999999999999994, + (0, 3): 0.49999999999999994, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {t: r + for (t, r) in wanted_replicas.items() + if len(t) == 2}) + + self.assertEqual(1.3750000000000002, rb.get_required_overload()) + + # with enough overload we get the full dispersion + rb.set_overload(1.5) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: r + for (tier, r) in target_replicas.items() + if len(tier) == 2}) + + # with not enough overload, we get somewhere in the middle + rb.set_overload(1.0) + expected = { + (0, 0): 1.3014354066985647, + (0, 1): 0.8564593301435406, + (0, 2): 0.4210526315789473, + (0, 3): 0.4210526315789473, + } + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: r + for (tier, r) in target_replicas.items() + if len(tier) == 2}) def test_big_zone(self): rb = ring.RingBuilder(8, 3, 1) @@ -2084,37 +2628,124 @@ class TestGetRequiredOverload(unittest.TestCase): rb.add_dev({'id': 7, 'region': 0, 'zone': 3, 'weight': 60, 'ip': '127.0.0.3', 'port': 10000, 'device': 'sdb'}) - # Zone 1 has weight 200, while zones 2, 3, and 4 together have only - # 360. The small zones would need to go from 360 to 400 to balance - # out zone 1, for an overload of 40/360 = 1/9. - self.assertApproximately(rb.get_required_overload(), 1.0 / 9) + expected = { + (0, 0): 1.0714285714285714, + (0, 1): 0.6428571428571429, + (0, 2): 0.6428571428571429, + (0, 3): 0.6428571428571429, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + expected = { + (0, 0): 1.0, + (0, 1): 0.6666666666666667, + (0, 2): 0.6666666666666667, + (0, 3): 0.6666666666666667, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + # when all the devices and servers in a zone are evenly weighted + # it will accurately proxy their required overload, all the + # zones besides 0 require the same overload + t = random.choice([t for t in weighted_replicas + if len(t) == 2 + and t[1] != 0]) + expected_overload = ((wanted_replicas[t] - weighted_replicas[t]) + / weighted_replicas[t]) + self.assertAlmostEqual(rb.get_required_overload(), + expected_overload) + + # but if you only give it out half of that + rb.set_overload(expected_overload / 2.0) + # ... 
you can expect it's not going to full disperse + expected = { + (0, 0): 1.0357142857142856, + (0, 1): 0.6547619047619049, + (0, 2): 0.6547619047619049, + (0, 3): 0.6547619047619049, + } + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 2}) def test_enormous_zone(self): rb = ring.RingBuilder(8, 3, 1) - rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1000, + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 500, 'ip': '127.0.0.0', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 1000, + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.0', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.0', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'weight': 500, 'ip': '127.0.0.0', 'port': 10000, 'device': 'sdb'}) - rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'weight': 60, + rb.add_dev({'id': 4, 'region': 0, 'zone': 1, 'weight': 60, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'weight': 60, + rb.add_dev({'id': 5, 'region': 0, 'zone': 1, 'weight': 60, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) - rb.add_dev({'id': 4, 'region': 0, 'zone': 2, 'weight': 60, + rb.add_dev({'id': 6, 'region': 0, 'zone': 2, 'weight': 60, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'weight': 60, + rb.add_dev({'id': 7, 'region': 0, 'zone': 2, 'weight': 60, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'}) - rb.add_dev({'id': 6, 'region': 0, 'zone': 3, 'weight': 60, + rb.add_dev({'id': 8, 'region': 0, 'zone': 3, 'weight': 60, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 7, 'region': 0, 'zone': 3, 'weight': 60, + rb.add_dev({'id': 9, 'region': 0, 'zone': 3, 'weight': 60, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'}) - # Zone 1 has weight 2000, while zones 2, 3, and 4 together have only - # 360. The small zones would need to go from 360 to 4000 to balance - # out zone 1, for an overload of 3640/360. - self.assertApproximately(rb.get_required_overload(), 3640.0 / 360) + expected = { + (0, 0): 2.542372881355932, + (0, 1): 0.15254237288135591, + (0, 2): 0.15254237288135591, + (0, 3): 0.15254237288135591, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + + expected = { + (0, 0): 1.0, + (0, 1): 0.6666666666666667, + (0, 2): 0.6666666666666667, + (0, 3): 0.6666666666666667, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + # ouch, those "tiny" devices need to hold 3x more than their + # weighted for! + self.assertAlmostEqual(rb.get_required_overload(), 3.370370, + delta=1e-5) + + # let's get a little crazy, and let devices eat up to 1x more than + # their capacity is weighted for - see how far that gets us... 
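        # (per _build_target_replicas_by_tier that is only 1 / 3.37 of
        # the required overload, so zone 0 moves roughly a third of the
        # way from its weighted 2.542 toward its wanted 1.0:
        # 2.542 + (1.0 - 2.542) / 3.370 ~= 2.085)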
+ rb.set_overload(1) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + (0, 0): 2.084745762711864, + (0, 1): 0.30508474576271183, + (0, 2): 0.30508474576271183, + (0, 3): 0.30508474576271183, + } + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 2}) def test_two_big_two_small(self): rb = ring.RingBuilder(8, 3, 1) @@ -2138,27 +2769,923 @@ class TestGetRequiredOverload(unittest.TestCase): rb.add_dev({'id': 7, 'region': 0, 'zone': 3, 'weight': 35, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'}) - # Zones 1 and 2 each have weight 200, while zones 3 and 4 together - # have only 160. The small zones would need to go from 160 to 200 to - # balance out the big zones, for an overload of 40/160 = 1/4. - self.assertApproximately(rb.get_required_overload(), 1.0 / 4) + expected = { + (0, 0): 1.0714285714285714, + (0, 1): 1.0714285714285714, + (0, 2): 0.48214285714285715, + (0, 3): 0.375, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + + expected = { + (0, 0): 1.0, + (0, 1): 1.0, + (0, 2): 0.5625, + (0, 3): 0.43749999999999994, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + # I'm not sure it's significant or coincidental that the devices + # in zone 2 & 3 who end up splitting the 3rd replica turn out to + # need to eat ~1/6th extra replicanths + self.assertAlmostEqual(rb.get_required_overload(), 1.0 / 6.0) + + # ... *so* 10% isn't *quite* enough + rb.set_overload(0.1) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + (0, 0): 1.0285714285714285, + (0, 1): 1.0285714285714285, + (0, 2): 0.5303571428571429, + (0, 3): 0.4125, + } + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 2}) + + # ... but 20% will do the trick! 
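        # (0.2 is past the required ~1/6, and the effective overload is
        # capped at the required value in _build_target_replicas_by_tier,
        # so the targets land exactly on the wanted replicanths)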
+ rb.set_overload(0.2) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + (0, 0): 1.0, + (0, 1): 1.0, + (0, 2): 0.5625, + (0, 3): 0.43749999999999994, + } + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 2}) def test_multiple_replicas_each(self): rb = ring.RingBuilder(8, 7, 1) - rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 100, + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 80, 'ip': '127.0.0.0', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 100, + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 80, 'ip': '127.0.0.0', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 80, + 'ip': '127.0.0.0', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'weight': 80, + 'ip': '127.0.0.0', 'port': 10000, 'device': 'sdd'}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 0, 'weight': 80, + 'ip': '127.0.0.0', 'port': 10000, 'device': 'sde'}) - rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'weight': 70, + rb.add_dev({'id': 5, 'region': 0, 'zone': 1, 'weight': 70, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'weight': 70, + rb.add_dev({'id': 6, 'region': 0, 'zone': 1, 'weight': 70, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 7, 'region': 0, 'zone': 1, 'weight': 70, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 8, 'region': 0, 'zone': 1, 'weight': 70, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) - # Zone 0 has more than 4/7 of the weight, so we'll need to bring - # zone 1 up to a total of 150 so it can take 3 replicas, so the - # overload should be 10/140. - self.assertApproximately(rb.get_required_overload(), 10.0 / 140) + expected = { + (0, 0): 4.117647058823529, + (0, 1): 2.8823529411764706, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + + expected = { + (0, 0): 4.0, + (0, 1): 3.0, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + # I guess 2.88 => 3.0 is about a 4% increase + self.assertAlmostEqual(rb.get_required_overload(), + 0.040816326530612256) + + # ... 
10% is plenty enough here + rb.set_overload(0.1) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 2}) + + def test_small_extra_server_in_zone_with_multiple_replicas(self): + rb = ring.RingBuilder(8, 5, 1) + + # z0 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 1000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdb', 'weight': 1000}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdc', 'weight': 1000}) + + # z1 + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 1000}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdb', 'weight': 1000}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdc', 'weight': 1000}) + + # z1 - extra small server + rb.add_dev({'id': 6, 'region': 0, 'zone': 1, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sda', 'weight': 50}) + + expected = { + (0, 0): 2.479338842975207, + (0, 1): 2.5206611570247937, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, {t: r for (t, r) in + weighted_replicas.items() + if len(t) == 2}) + + # dispersion is fine with this at the zone tier + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, {t: r for (t, r) in + wanted_replicas.items() + if len(t) == 2}) + + # ... but not ok with that tiny server + expected = { + '127.0.0.1': 2.479338842975207, + '127.0.0.2': 1.5206611570247937, + '127.0.0.3': 1.0, + } + self.assertEqual(expected, {t[-1]: r for (t, r) in + wanted_replicas.items() + if len(t) == 3}) + + self.assertAlmostEqual(23.2, rb.get_required_overload()) + + def test_multiple_replicas_in_zone_with_single_device(self): + rb = ring.RingBuilder(8, 5, 0) + # z0 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + # z1 + rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sdb', 'weight': 100}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sdc', 'weight': 100}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sdd', 'weight': 100}) + + # first things first, make sure we do this right + rb.rebalance() + + # each device get's a sing replica of every part + expected = { + 0: 256, + 1: 256, + 2: 256, + 3: 256, + 4: 256, + } + self.assertEqual(expected, {d['id']: d['parts'] + for d in rb._iter_devs()}) + + # but let's make sure we're thinking about it right too + expected = { + 0: 1.0, + 1: 1.0, + 2: 1.0, + 3: 1.0, + 4: 1.0, + } + + # by weight everyone is equal + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + weighted_replicas.items() + if len(t) == 4}) + + # wanted might have liked to have fewer replicas in z1, but the + # single device in z0 limits us one replica per device + with rb.debug(): + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + wanted_replicas.items() + if len(t) == 4}) + + # even with some overload - still one 
replica per device + rb.set_overload(1.0) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + + # when overload can not change the outcome none is required + self.assertEqual(0.0, rb.get_required_overload()) + # even though dispersion is terrible (in z1 particularly) + self.assertEqual(100.0, rb.dispersion) + + def test_one_big_guy_does_not_spoil_his_buddy(self): + rb = ring.RingBuilder(8, 3, 0) + + # z0 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 100}) + # z1 + rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sda', 'weight': 100}) + # z2 + rb.add_dev({'id': 4, 'region': 0, 'zone': 2, 'ip': '127.0.2.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'ip': '127.0.2.2', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + + # obviously d5 gets one whole replica; the other two replicas + # are split evenly among the five other devices + # (i.e. ~0.4 replicanths for each 100 units of weight) + expected = { + 0: 0.39999999999999997, + 1: 0.39999999999999997, + 2: 0.39999999999999997, + 3: 0.39999999999999997, + 4: 0.39999999999999997, + 5: 1.0, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + weighted_replicas.items() + if len(t) == 4}) + + # with no overload we get the "balanced" placement + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + + # but in reality, these devices having such disparate weights + # leads to a *terrible* balance even w/o overload! + rb.rebalance(seed=9) + self.assertEqual(rb.get_balance(), 1308.2031249999998) + + # even though part assignment is pretty reasonable + expected = { + 0: 103, + 1: 102, + 2: 103, + 3: 102, + 4: 102, + 5: 256, + } + self.assertEqual(expected, { + d['id']: d['parts'] for d in rb._iter_devs()}) + + # so whats happening is the small devices are holding *way* more + # *real* parts than their *relative* portion of the weight would + # like them too! + expected = { + 0: 1308.2031249999998, + 1: 1294.5312499999998, + 2: 1308.2031249999998, + 3: 1294.5312499999998, + 4: 1294.5312499999998, + 5: -65.0, + + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + # increasing overload moves towards one replica in each tier + rb.set_overload(0.20) + expected = { + 0: 0.48, + 1: 0.48, + 2: 0.48, + 3: 0.48, + 4: 0.30857142857142855, + 5: 0.7714285714285714, + } + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + + # ... 
and as always increasing overload makes balance *worse* + rb.rebalance(seed=17) + self.assertEqual(rb.get_balance(), 1581.6406249999998) + + # but despite the overall trend toward imbalance, in the tier + # with the huge device, the small device is trying to shed parts + # as effectively as it can (which would be useful if it was the + # only small device isolated in a tier with other huge devices + # trying to gobble up all the replicanths in the tier - see + # `test_one_small_guy_does_not_spoil_his_buddy`!) + expected = { + 0: 123, + 1: 123, + 2: 123, + 3: 123, + 4: 79, + 5: 197, + } + self.assertEqual(expected, { + d['id']: d['parts'] for d in rb._iter_devs()}) + + # *see*, at least *someones* balance is getting better! + expected = { + 0: 1581.6406249999998, + 1: 1581.6406249999998, + 2: 1581.6406249999998, + 3: 1581.6406249999998, + 4: 980.078125, + 5: -73.06640625, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + def test_one_small_guy_does_not_spoil_his_buddy(self): + rb = ring.RingBuilder(8, 3, 0) + + # z0 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + # z1 + rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + # z2 + rb.add_dev({'id': 4, 'region': 0, 'zone': 2, 'ip': '127.0.2.1', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'ip': '127.0.2.2', + 'port': 6000, 'device': 'sda', 'weight': 100}) + + # it's almost like 3.0 / 5 ~= 0.6, but that one little guy get's + # his fair share + expected = { + 0: 0.5988023952095808, + 1: 0.5988023952095808, + 2: 0.5988023952095808, + 3: 0.5988023952095808, + 4: 0.5988023952095808, + 5: 0.005988023952095809, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + weighted_replicas.items() + if len(t) == 4}) + + # with no overload we get a nice balanced placement + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + rb.rebalance(seed=9) + + # part placement looks goods + expected = { + 0: 154, + 1: 153, + 2: 153, + 3: 153, + 4: 153, + 5: 2, + } + self.assertEqual(expected, { + d['id']: d['parts'] for d in rb._iter_devs()}) + + # ... balance is a little lumpy on the small guy since he wants + # one and a half parts :\ + expected = { + 0: 0.4609375000000142, + 1: -0.1914062499999858, + 2: -0.1914062499999858, + 3: -0.1914062499999858, + 4: -0.1914062499999858, + 5: 30.46875, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + self.assertEqual(rb.get_balance(), 30.46875) + + # increasing overload moves towards one replica in each tier + rb.set_overload(0.5) + expected = { + 0: 0.5232035928143712, + 1: 0.5232035928143712, + 2: 0.5232035928143712, + 3: 0.5232035928143712, + 4: 0.8982035928143712, + 5: 0.008982035928143714, + } + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + + # ... 
and as always increasing overload makes balance *worse* + rb.rebalance(seed=17) + self.assertEqual(rb.get_balance(), 95.703125) + + # but despite the overall trend toward imbalance, the little guy + # isn't really taking on many new parts! + expected = { + 0: 134, + 1: 134, + 2: 134, + 3: 133, + 4: 230, + 5: 3, + } + self.assertEqual(expected, { + d['id']: d['parts'] for d in rb._iter_devs()}) + + # *see*, at everyone's balance is getting worse *together*! + expected = { + 0: -12.585937499999986, + 1: -12.585937499999986, + 2: -12.585937499999986, + 3: -13.238281249999986, + 4: 50.0390625, + 5: 95.703125, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + def test_two_servers_with_more_than_one_replica(self): + rb = ring.RingBuilder(8, 3, 0) + # z0 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 60}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 60}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sda', 'weight': 60}) + # z1 + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sda', 'weight': 80}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sda', 'weight': 128}) + # z2 + rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'ip': '127.0.2.1', + 'port': 6000, 'device': 'sda', 'weight': 80}) + rb.add_dev({'id': 6, 'region': 0, 'zone': 2, 'ip': '127.0.2.2', + 'port': 6000, 'device': 'sda', 'weight': 240}) + + rb.set_overload(0.1) + rb.rebalance() + self.assertEqual(12.161458333333343, rb.get_balance()) + + replica_plan = rb._build_target_replicas_by_tier() + for dev in rb._iter_devs(): + tier = (dev['region'], dev['zone'], dev['ip'], dev['id']) + expected_parts = replica_plan[tier] * rb.parts + self.assertAlmostEqual(dev['parts'], expected_parts, + delta=1) + + def test_multi_zone_with_failed_device(self): + rb = ring.RingBuilder(8, 3, 1) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + + rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + + rb.add_dev({'id': 4, 'region': 0, 'zone': 2, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + + # sanity, balanced and dispersed + expected = { + (0, 0): 1.0, + (0, 1): 1.0, + (0, 2): 1.0, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + self.assertEqual(rb.get_required_overload(), 0.0) + + # fail a device in zone 2 + rb.remove_dev(4) + + expected = { + 0: 0.6, + 1: 0.6, + 2: 0.6, + 3: 0.6, + 5: 0.6, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 4}) + + 
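        # for dispersion each zone still deserves a whole replica, but
        # zone 2 is down to a single device (id 5), so that one device
        # has to carry the entire replicanth while the four devices left
        # in zones 0 and 1 split the other two replicas between them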
expected = { + 0: 0.5, + 1: 0.5, + 2: 0.5, + 3: 0.5, + 5: 1.0, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 4}) + + # does this make sense? every zone was holding 1/3rd of the + # replicas, so each device was 1/6th, remove a device and + # suddenly it's holding *both* sixths which is 2/3rds? + self.assertAlmostEqual(rb.get_required_overload(), 2.0 / 3.0) + + # 10% isn't nearly enough + rb.set_overload(0.1) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + 0: 0.585, + 1: 0.585, + 2: 0.585, + 3: 0.585, + 5: 0.6599999999999999, + } + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + # 50% isn't even enough + rb.set_overload(0.5) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + 0: 0.525, + 1: 0.525, + 2: 0.525, + 3: 0.525, + 5: 0.8999999999999999, + } + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + # even 65% isn't enough (but it's getting closer) + rb.set_overload(0.65) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + 0: 0.5025000000000001, + 1: 0.5025000000000001, + 2: 0.5025000000000001, + 3: 0.5025000000000001, + 5: 0.99, + } + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + def test_balanced_zones_unbalanced_servers(self): + rb = ring.RingBuilder(8, 3, 1) + # zone 0 server 127.0.0.1 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 3000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdb', 'weight': 3000}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 3000}) + # zone 1 server 127.0.0.2 + rb.add_dev({'id': 4, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 4000}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdb', 'weight': 4000}) + # zone 1 (again) server 127.0.0.3 + rb.add_dev({'id': 6, 'region': 0, 'zone': 1, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sda', 'weight': 1000}) + + weighted_replicas = rb._build_weighted_replicas_by_tier() + + # zones are evenly weighted + expected = { + (0, 0): 1.5, + (0, 1): 1.5, + } + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + + # ... 
but servers are not + expected = { + '127.0.0.1': 1.5, + '127.0.0.2': 1.3333333333333333, + '127.0.0.3': 0.16666666666666666, + } + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 3}) + + # make sure wanted will even it out + expected = { + '127.0.0.1': 1.5, + '127.0.0.2': 1.0, + '127.0.0.3': 0.4999999999999999, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 3}) + + # so it wants 1/6th and eats 1/2 - that's 2/6ths more than it + # wants which is a 200% increase + self.assertAlmostEqual(rb.get_required_overload(), 2.0) + + # the overload doesn't effect the tiers that are already dispersed + rb.set_overload(1) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + '127.0.0.1': 1.5, + # notice with half the overload 1/6th replicanth swapped servers + '127.0.0.2': 1.1666666666666665, + '127.0.0.3': 0.3333333333333333, + } + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 3}) + + def test_adding_second_zone(self): + rb = ring.RingBuilder(3, 3, 1) + # zone 0 server 127.0.0.1 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + # zone 0 server 127.0.0.2 + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + # zone 0 server 127.0.0.3 + rb.add_dev({'id': 4, 'region': 0, 'zone': 0, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 0, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + + # sanity, balanced and dispersed + expected = { + '127.0.0.1': 1.0, + '127.0.0.2': 1.0, + '127.0.0.3': 1.0, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 3}) + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 3}) + + self.assertEqual(rb.get_required_overload(), 0) + + # start adding a second zone + + # zone 1 server 127.0.1.1 + rb.add_dev({'id': 6, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 7, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sdb', 'weight': 100}) + # zone 1 server 127.0.1.2 + rb.add_dev({'id': 8, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 9, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sdb', 'weight': 100}) + # zone 1 server 127.0.1.3 + rb.add_dev({'id': 10, 'region': 0, 'zone': 1, 'ip': '127.0.1.3', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 11, 'region': 0, 'zone': 1, 'ip': '127.0.1.3', + 'port': 6000, 'device': 'sdb', 'weight': 100}) + + # this messes things up pretty royally + expected = { + '127.0.0.1': 0.9523809523809523, + '127.0.0.2': 0.9523809523809523, + '127.0.0.3': 0.9523809523809523, + '127.0.1.1': 
0.047619047619047616, + '127.0.1.2': 0.047619047619047616, + '127.0.1.3': 0.047619047619047616, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 3}) + expected = { + '127.0.0.1': 0.6666666666666667, + '127.0.0.2': 0.6666666666666667, + '127.0.0.3': 0.6666666666666667, + '127.0.1.1': 0.3333333333333333, + '127.0.1.2': 0.3333333333333333, + '127.0.1.3': 0.3333333333333333, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 3}) + + # so dispersion would require these devices hold 6x more than + # prescribed by weight, defeating any attempt at gradually + # anything + self.assertAlmostEqual(rb.get_required_overload(), 6.0) + + # so let's suppose we only allow for 10% overload + rb.set_overload(0.10) + target_replicas = rb._build_target_replicas_by_tier() + + expected = { + # we expect servers in zone 0 to be between 0.952 and 0.666 + '127.0.0.1': 0.9476190476190476, + '127.0.0.2': 0.9476190476190476, + '127.0.0.3': 0.9476190476190476, + # we expect servers in zone 1 to be between 0.0476 and 0.333 + # and in fact its ~10% increase (very little compared to 6x!) + '127.0.1.1': 0.052380952380952375, + '127.0.1.2': 0.052380952380952375, + '127.0.1.3': 0.052380952380952375, + } + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 3}) + + def test_gradual_replica_count(self): + rb = ring.RingBuilder(3, 2.5, 1) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + + expected = { + 0: 0.625, + 1: 0.625, + 2: 0.625, + 3: 0.625, + } + + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, { + tier[3]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 4}) + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, { + tier[3]: wanted + for (tier, wanted) in wanted_replicas.items() + if len(tier) == 4}) + + self.assertEqual(rb.get_required_overload(), 0) + + # server 127.0.0.2 will have only one device + rb.remove_dev(2) + + # server 127.0.0.1 has twice the capacity of 127.0.0.2 + expected = { + '127.0.0.1': 1.6666666666666667, + '127.0.0.2': 0.8333333333333334, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, { + tier[2]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 3}) + + # dispersion requirements extend only to whole replicas + expected = { + '127.0.0.1': 1.4999999999999998, + '127.0.0.2': 1.0, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, { + tier[2]: wanted + for (tier, wanted) in wanted_replicas.items() + if len(tier) == 3}) + + # 5/6ths to a whole replicanth is a 20% increase + self.assertAlmostEqual(rb.get_required_overload(), 0.2) + + # so let's suppose we only allow for 10% overload + rb.set_overload(0.1) + target_replicas = rb._build_target_replicas_by_tier() + 
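The 20% figure above falls out of comparing the "wanted" and "weighted" plans, and the 10% cap lands the target in between them. Here is a rough model of that comparison - illustrative only, the real builder spreads the shortfall more carefully than this:

    # Illustrative only: a crude model of required overload and of what a
    # 10% cap does to the two-server layout above (2.5 replicas total).
    replicas = 2.5
    weighted = {'127.0.0.1': replicas * 4000 / 6000,   # ~1.6667
                '127.0.0.2': replicas * 2000 / 6000}   # ~0.8333
    wanted = {'127.0.0.1': 1.5, '127.0.0.2': 1.0}      # dispersion wants a
                                                       # whole replica here

    # required overload is the biggest relative bump any tier needs
    required = max((wanted[t] - weighted[t]) / weighted[t] for t in wanted)
    print(required)                                    # ~0.2, "a 20% increase"

    # with only 10% allowed, the small server is capped short of a whole
    # replica and the remainder stays on the big one
    capped = min(wanted['127.0.0.2'], weighted['127.0.0.2'] * 1.1)
    print(capped, replicas - capped)                   # ~0.9167 and ~1.5833

which is exactly what the next assertion pins down.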
expected = {
+            '127.0.0.1': 1.5833333333333333,
+            '127.0.0.2': 0.9166666666666667,
+        }
+        self.assertEqual(expected, {
+            tier[2]: wanted
+            for (tier, wanted) in target_replicas.items()
+            if len(tier) == 3})
+
+    def test_perfect_four_zone_four_replica_bad_placement(self):
+        rb = ring.RingBuilder(4, 4, 1)
+
+        # this weight is sorta nuts, but it's really just to help the
+        # weight_of_one_part hit a magic number where floats mess up
+        # like they would on a ring with a part power of 19 and 100's of
+        # 1000's of units of weight.
+        weight = 21739130434795e-11
+
+        # r0z0
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': weight,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': weight,
+                    'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'})
+        # r0z1
+        rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'weight': weight,
+                    'ip': '127.0.1.1', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'weight': weight,
+                    'ip': '127.0.1.2', 'port': 10000, 'device': 'sdb'})
+        # r1z0
+        rb.add_dev({'id': 4, 'region': 1, 'zone': 0, 'weight': weight,
+                    'ip': '127.1.0.1', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 5, 'region': 1, 'zone': 0, 'weight': weight,
+                    'ip': '127.1.0.2', 'port': 10000, 'device': 'sdb'})
+        # r1z1
+        rb.add_dev({'id': 6, 'region': 1, 'zone': 1, 'weight': weight,
+                    'ip': '127.1.1.1', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 7, 'region': 1, 'zone': 1, 'weight': weight,
+                    'ip': '127.1.1.2', 'port': 10000, 'device': 'sdb'})
+
+        # the replica plan is sound
+        expectations = {
+            # tier_len => expected replicas
+            1: {
+                (0,): 2.0,
+                (1,): 2.0,
+            },
+            2: {
+                (0, 0): 1.0,
+                (0, 1): 1.0,
+                (1, 0): 1.0,
+                (1, 1): 1.0,
+            }
+        }
+        wr = rb._build_replica_plan()
+        for tier_len, expected in expectations.items():
+            self.assertEqual(expected, {t: r['max'] for (t, r) in
+                                        wr.items() if len(t) == tier_len})
+
+        # even though a naive ceil of weights is surprisingly wrong
+        expectations = {
+            # tier_len => expected replicas
+            1: {
+                (0,): 3.0,
+                (1,): 3.0,
+            },
+            2: {
+                (0, 0): 2.0,
+                (0, 1): 2.0,
+                (1, 0): 2.0,
+                (1, 1): 2.0,
+            }
+        }
+        wr = rb._build_weighted_replicas_by_tier()
+        for tier_len, expected in expectations.items():
+            self.assertEqual(expected, {t: ceil(r) for (t, r) in
+                                        wr.items() if len(t) == tier_len})
 if __name__ == '__main__':
diff --git a/test/unit/common/ring/test_ring.py b/test/unit/common/ring/test_ring.py
index 90df6ce4ce..a492b44bd4 100644
--- a/test/unit/common/ring/test_ring.py
+++ b/test/unit/common/ring/test_ring.py
@@ -457,39 +457,42 @@ class TestRing(TestRingBase):
         # Yes, these tests are deliberately very fragile. We want to make sure
         # that if someone changes the results the ring produces, they know it.
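Since this change legitimately alters the ring's output, every hard-coded expectation below had to be regenerated rather than patched by hand. A helper along these lines - not part of the change, shown only as an illustration, and using only the calls the test itself already makes - is one way to dump fresh values for a populated, rebalanced builder:

    # Hypothetical helper for regenerating the fragile expectations below.
    import os
    import tempfile

    from swift.common import ring

    def dump_expectations(rb, account='a', container='c', obj='o'):
        # rb is an already-populated RingBuilder that has been rebalanced
        testdir = tempfile.mkdtemp()
        rb.get_ring().save(os.path.join(testdir, 'whatever.ring.gz'))
        r = ring.Ring(testdir, ring_name='whatever')
        part, devs = r.get_nodes(account, container, obj)
        print('exp_part =', part)
        print('exp_devs =', [d['id'] for d in devs])
        print('exp_zones =', set(d['zone'] for d in devs))
        print('exp_handoffs =', [d['id'] for d in r.get_more_nodes(part)])
        print('exp_first_handoffs =',
              [next(r.get_more_nodes(p))['id']
               for p in range(r.partition_count)])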
exp_part = 6 - exp_devs = [48, 93, 96] - exp_zones = set([5, 8, 9]) + exp_devs = [71, 77, 30] + exp_zones = set([6, 3, 7]) + + exp_handoffs = [99, 43, 94, 13, 1, 49, 60, 72, 27, 68, 78, 26, 21, 9, + 51, 105, 47, 89, 65, 82, 34, 98, 38, 85, 16, 4, 59, + 102, 40, 90, 20, 8, 54, 66, 80, 25, 14, 2, 50, 12, 0, + 48, 70, 76, 32, 107, 45, 87, 101, 44, 93, 100, 42, 95, + 106, 46, 88, 97, 37, 86, 96, 36, 84, 17, 5, 57, 63, + 81, 33, 67, 79, 24, 15, 3, 58, 69, 75, 31, 61, 74, 29, + 23, 10, 52, 22, 11, 53, 64, 83, 35, 62, 73, 28, 18, 6, + 56, 104, 39, 91, 103, 41, 92, 19, 7, 55] + + exp_first_handoffs = [23, 64, 105, 102, 67, 17, 99, 65, 69, 97, 15, + 17, 24, 98, 66, 65, 69, 18, 104, 105, 16, 107, + 100, 15, 14, 19, 102, 105, 63, 104, 99, 12, 107, + 99, 16, 105, 71, 15, 15, 63, 63, 99, 21, 68, 20, + 64, 96, 21, 98, 19, 68, 99, 15, 69, 62, 100, 96, + 102, 17, 62, 13, 61, 102, 105, 22, 16, 21, 18, + 21, 100, 20, 16, 21, 106, 66, 106, 16, 99, 16, + 22, 62, 60, 99, 69, 18, 23, 104, 98, 106, 61, + 21, 23, 23, 16, 67, 71, 101, 16, 64, 66, 70, 15, + 102, 63, 19, 98, 18, 106, 101, 100, 62, 63, 98, + 18, 13, 97, 23, 22, 100, 13, 14, 67, 96, 14, + 105, 97, 71, 64, 96, 22, 65, 66, 98, 19, 105, + 98, 97, 21, 15, 69, 100, 98, 106, 65, 66, 97, + 62, 22, 68, 63, 61, 67, 67, 20, 105, 106, 105, + 18, 71, 100, 17, 62, 60, 13, 103, 99, 101, 96, + 97, 16, 60, 21, 14, 20, 12, 60, 69, 104, 65, 65, + 17, 16, 67, 13, 64, 15, 16, 68, 96, 21, 104, 66, + 96, 105, 58, 105, 103, 21, 96, 60, 16, 96, 21, + 71, 16, 99, 101, 63, 62, 103, 18, 102, 60, 17, + 19, 106, 97, 14, 99, 68, 102, 13, 70, 103, 21, + 22, 19, 61, 103, 23, 104, 65, 62, 68, 16, 65, + 15, 102, 102, 71, 99, 63, 67, 19, 23, 15, 69, + 107, 14, 13, 64, 13, 105, 15, 98, 69] - exp_handoffs = [11, 47, 25, 76, 69, 23, 99, 59, 106, 64, 43, 34, 88, 3, - 30, 83, 16, 27, 103, 39, 60, 0, 8, 72, 56, 19, 91, 13, - 84, 38, 66, 52, 78, 107, 50, 57, 31, 32, 77, 24, 42, - 100, 71, 26, 9, 20, 35, 5, 14, 94, 28, 41, 18, 102, - 101, 61, 95, 21, 81, 1, 105, 58, 74, 90, 86, 46, 4, 68, - 40, 80, 54, 75, 45, 79, 44, 49, 62, 29, 7, 15, 70, 87, - 65, 12, 82, 17, 104, 97, 55, 22, 6, 89, 2, 67, 37, 63, - 53, 92, 33, 85, 73, 51, 98, 36, 10] - exp_first_handoffs = [1, 37, 48, 68, 84, 75, 11, 101, 14, 73, 100, 75, - 29, 19, 18, 101, 15, 99, 95, 24, 46, 82, 73, 62, - 24, 89, 9, 22, 107, 74, 54, 63, 40, 106, 99, 83, - 64, 73, 73, 106, 106, 80, 6, 25, 20, 33, 6, 79, - 59, 42, 62, 24, 14, 107, 28, 0, 85, 5, 4, 12, 58, - 11, 92, 18, 36, 56, 86, 1, 21, 33, 80, 97, 4, 81, - 79, 76, 89, 50, 75, 27, 7, 96, 47, 55, 81, 104, - 12, 5, 18, 106, 27, 93, 39, 92, 42, 30, 20, 88, - 58, 105, 65, 29, 17, 52, 11, 106, 7, 24, 21, 91, - 62, 52, 50, 31, 77, 102, 19, 11, 8, 58, 53, 20, - 26, 8, 18, 82, 48, 68, 82, 89, 101, 50, 3, 52, - 46, 11, 2, 30, 79, 66, 4, 61, 3, 56, 45, 102, 73, - 84, 36, 19, 34, 84, 49, 40, 103, 66, 31, 33, 93, - 33, 4, 52, 26, 58, 30, 47, 100, 57, 40, 79, 33, - 107, 24, 20, 44, 4, 7, 59, 83, 101, 1, 56, 20, - 61, 33, 16, 5, 74, 98, 4, 80, 15, 104, 52, 73, - 18, 67, 75, 98, 73, 79, 68, 75, 27, 91, 36, 100, - 52, 95, 37, 46, 70, 14, 47, 3, 70, 23, 40, 105, - 62, 86, 48, 22, 54, 4, 72, 81, 13, 0, 18, 98, - 101, 36, 29, 24, 39, 79, 97, 105, 28, 107, 47, - 52, 101, 20, 22, 29, 65, 27, 7, 33, 64, 101, 60, - 19, 55] rb = ring.RingBuilder(8, 3, 1) next_dev_id = 0 for zone in range(1, 10): @@ -501,16 +504,27 @@ class TestRing(TestRingBase): 'zone': zone, 'region': 0, 'weight': 1.0}) next_dev_id += 1 - rb.rebalance(seed=1) + rb.rebalance(seed=2) rb.get_ring().save(self.testgz) r = ring.Ring(self.testdir, 
ring_name='whatever') + + # every part has the same number of handoffs + part_handoff_counts = set() + for part in range(r.partition_count): + part_handoff_counts.add(len(list(r.get_more_nodes(part)))) + self.assertEqual(part_handoff_counts, {105}) + # which less the primaries - is every device in the ring + self.assertEqual(len(list(rb._iter_devs())) - rb.replicas, 105) + part, devs = r.get_nodes('a', 'c', 'o') primary_zones = set([d['zone'] for d in devs]) self.assertEqual(part, exp_part) self.assertEqual([d['id'] for d in devs], exp_devs) self.assertEqual(primary_zones, exp_zones) devs = list(r.get_more_nodes(part)) - self.assertEqual([d['id'] for d in devs], exp_handoffs) + self.assertEqual(len(devs), len(exp_handoffs)) + dev_ids = [d['id'] for d in devs] + self.assertEqual(dev_ids, exp_handoffs) # The first 6 replicas plus the 3 primary nodes should cover all 9 # zones in this test @@ -531,11 +545,22 @@ class TestRing(TestRingBase): 'ip': '1.2.%d.%d' % (zone, server), 'port': 1234, 'zone': zone, 'region': 0, 'weight': 1.0}) next_dev_id += 1 - rb.rebalance(seed=1) + rb.pretend_min_part_hours_passed() + num_parts_changed, _balance, _removed_dev = rb.rebalance(seed=2) rb.get_ring().save(self.testgz) r = ring.Ring(self.testdir, ring_name='whatever') - # We would change expectations here, but in this test no handoffs - # changed at all. + + # so now we expect the device list to be longer by one device + part_handoff_counts = set() + for part in range(r.partition_count): + part_handoff_counts.add(len(list(r.get_more_nodes(part)))) + self.assertEqual(part_handoff_counts, {106}) + self.assertEqual(len(list(rb._iter_devs())) - rb.replicas, 106) + # I don't think there's any special reason this dev goes at this index + exp_handoffs.insert(27, rb.devs[-1]['id']) + + # We would change expectations here, but in this part only the added + # device changed at all. part, devs = r.get_nodes('a', 'c', 'o') primary_zones = set([d['zone'] for d in devs]) self.assertEqual(part, exp_part) @@ -555,36 +580,60 @@ class TestRing(TestRingBase): seen_zones.update([d['zone'] for d in devs[:6]]) self.assertEqual(seen_zones, set(range(1, 10))) + # Change expectations for the rest of the parts devs = [] for part in range(r.partition_count): devs.append(next(r.get_more_nodes(part))['id']) + changed_first_handoff = 0 for part in range(r.partition_count): - self.assertEqual( - devs[part], exp_first_handoffs[part], - 'handoff for partitition %d is now device id %d' % ( - part, devs[part])) + if devs[part] != exp_first_handoffs[part]: + changed_first_handoff += 1 + exp_first_handoffs[part] = devs[part] + self.assertEqual(devs, exp_first_handoffs) + self.assertEqual(changed_first_handoff, num_parts_changed) - # Remove a device. + # Remove a device - no need to fluff min_part_hours. rb.remove_dev(0) - rb.rebalance(seed=1) + num_parts_changed, _balance, _removed_dev = rb.rebalance(seed=1) rb.get_ring().save(self.testgz) r = ring.Ring(self.testdir, ring_name='whatever') - # Change expectations - # The long string of handoff nodes for the partition were the same for - # the first 20, which is pretty good. 
- exp_handoffs[20:] = [60, 108, 8, 72, 56, 19, 91, 13, 84, 38, 66, 52, - 1, 78, 107, 50, 57, 31, 32, 77, 24, 42, 100, 71, - 26, 9, 20, 35, 5, 14, 94, 28, 41, 18, 102, 101, - 61, 95, 21, 81, 105, 58, 74, 90, 86, 46, 4, 68, - 40, 80, 54, 75, 45, 79, 44, 49, 62, 29, 7, 15, 70, - 87, 65, 12, 82, 17, 104, 97, 55, 22, 6, 89, 2, 67, - 37, 63, 53, 92, 33, 85, 73, 51, 98, 36, 10] - # Just a few of the first handoffs changed - exp_first_handoffs[3] = 68 - exp_first_handoffs[55] = 104 - exp_first_handoffs[116] = 6 - exp_first_handoffs[181] = 15 - exp_first_handoffs[228] = 38 + + # so now we expect the device list to be shorter by one device + part_handoff_counts = set() + for part in range(r.partition_count): + part_handoff_counts.add(len(list(r.get_more_nodes(part)))) + self.assertEqual(part_handoff_counts, {105}) + self.assertEqual(len(list(rb._iter_devs())) - rb.replicas, 105) + + # Change expectations for our part + exp_handoffs.remove(0) + first_matches = 0 + total_changed = 0 + devs = list(d['id'] for d in r.get_more_nodes(exp_part)) + for i, part in enumerate(devs): + if exp_handoffs[i] != devs[i]: + total_changed += 1 + exp_handoffs[i] = devs[i] + if not total_changed: + first_matches += 1 + self.assertEqual(devs, exp_handoffs) + # the first 21 handoffs were the same across the rebalance + self.assertEqual(first_matches, 21) + # but as you dig deeper some of the differences show up + self.assertEqual(total_changed, 41) + + # Change expectations for the rest of the parts + devs = [] + for part in range(r.partition_count): + devs.append(next(r.get_more_nodes(part))['id']) + changed_first_handoff = 0 + for part in range(r.partition_count): + if devs[part] != exp_first_handoffs[part]: + changed_first_handoff += 1 + exp_first_handoffs[part] = devs[part] + self.assertEqual(devs, exp_first_handoffs) + self.assertEqual(changed_first_handoff, num_parts_changed) + # Test part, devs = r.get_nodes('a', 'c', 'o') primary_zones = set([d['zone'] for d in devs]) @@ -615,56 +664,48 @@ class TestRing(TestRingBase): # Add a partial replica rb.set_replicas(3.5) - rb.rebalance(seed=1) + num_parts_changed, _balance, _removed_dev = rb.rebalance(seed=164) rb.get_ring().save(self.testgz) r = ring.Ring(self.testdir, ring_name='whatever') + # Change expectations + # We have another replica now - exp_devs.append(47) - exp_zones.add(4) + exp_devs.append(90) + exp_zones.add(8) + # and therefore one less handoff + exp_handoffs = exp_handoffs[:-1] # Caused some major changes in the sequence of handoffs for our test # partition, but at least the first stayed the same. - exp_handoffs[1:] = [81, 25, 69, 23, 99, 59, 76, 3, 106, 64, 43, 13, 34, - 88, 30, 16, 27, 103, 39, 74, 60, 108, 8, 56, 19, - 91, 52, 84, 38, 66, 1, 78, 45, 107, 50, 57, 83, 31, - 46, 32, 77, 24, 42, 63, 100, 72, 71, 7, 26, 9, 20, - 35, 5, 87, 14, 94, 62, 28, 41, 90, 18, 82, 102, 22, - 101, 61, 85, 95, 21, 98, 67, 105, 58, 86, 4, 79, - 68, 40, 80, 54, 75, 44, 49, 6, 29, 15, 70, 65, 12, - 17, 104, 97, 55, 89, 2, 37, 53, 92, 33, 73, 51, 36, - 10] + devs = list(d['id'] for d in r.get_more_nodes(exp_part)) + first_matches = 0 + total_changed = 0 + for i, part in enumerate(devs): + if exp_handoffs[i] != devs[i]: + total_changed += 1 + exp_handoffs[i] = devs[i] + if not total_changed: + first_matches += 1 + # most seeds seem to throw out first handoff stabilization with + # replica_count change + self.assertEqual(first_matches, 2) + # and lots of other handoff changes... 
+ self.assertEqual(total_changed, 95) + + self.assertEqual(devs, exp_handoffs) + + # Change expectations for the rest of the parts + devs = [] + for part in range(r.partition_count): + devs.append(next(r.get_more_nodes(part))['id']) + changed_first_handoff = 0 + for part in range(r.partition_count): + if devs[part] != exp_first_handoffs[part]: + changed_first_handoff += 1 + exp_first_handoffs[part] = devs[part] + self.assertEqual(devs, exp_first_handoffs) + self.assertLessEqual(changed_first_handoff, num_parts_changed) - # Lots of first handoffs changed, but 30 of 256 is still just 11.72%. - exp_first_handoffs[1] = 6 - exp_first_handoffs[4] = 104 - exp_first_handoffs[11] = 106 - exp_first_handoffs[17] = 13 - exp_first_handoffs[21] = 77 - exp_first_handoffs[22] = 95 - exp_first_handoffs[27] = 46 - exp_first_handoffs[29] = 65 - exp_first_handoffs[30] = 3 - exp_first_handoffs[31] = 20 - exp_first_handoffs[51] = 50 - exp_first_handoffs[53] = 8 - exp_first_handoffs[54] = 2 - exp_first_handoffs[72] = 107 - exp_first_handoffs[79] = 72 - exp_first_handoffs[85] = 71 - exp_first_handoffs[88] = 66 - exp_first_handoffs[92] = 29 - exp_first_handoffs[93] = 46 - exp_first_handoffs[96] = 38 - exp_first_handoffs[101] = 57 - exp_first_handoffs[103] = 87 - exp_first_handoffs[104] = 28 - exp_first_handoffs[107] = 1 - exp_first_handoffs[109] = 69 - exp_first_handoffs[110] = 50 - exp_first_handoffs[111] = 76 - exp_first_handoffs[115] = 47 - exp_first_handoffs[117] = 48 - exp_first_handoffs[119] = 7 # Test part, devs = r.get_nodes('a', 'c', 'o') primary_zones = set([d['zone'] for d in devs]) @@ -696,17 +737,16 @@ class TestRing(TestRingBase): # One last test of a partial replica partition exp_part2 = 136 - exp_devs2 = [52, 76, 97] - exp_zones2 = set([9, 5, 7]) - exp_handoffs2 = [2, 67, 37, 92, 33, 23, 107, 63, 44, 103, 108, 85, - 73, 10, 89, 80, 4, 17, 49, 32, 12, 41, 58, 20, 25, - 61, 94, 47, 69, 56, 101, 28, 83, 8, 96, 53, 51, 42, - 98, 35, 36, 84, 43, 104, 31, 65, 1, 40, 9, 74, 95, - 45, 5, 71, 86, 78, 30, 93, 48, 91, 15, 88, 39, 18, - 57, 72, 70, 27, 54, 16, 24, 21, 14, 11, 77, 62, 50, - 6, 105, 26, 55, 29, 60, 34, 13, 87, 59, 38, 99, 75, - 106, 3, 82, 66, 79, 7, 46, 64, 81, 22, 68, 19, 102, - 90, 100] + exp_devs2 = [70, 76, 32] + exp_zones2 = set([3, 6, 7]) + exp_handoffs2 = [89, 97, 37, 53, 20, 1, 86, 64, 102, 40, 90, 60, 72, + 27, 99, 68, 78, 26, 105, 45, 42, 95, 22, 13, 49, 55, + 11, 8, 83, 16, 4, 59, 33, 108, 61, 74, 29, 88, 66, + 80, 25, 100, 39, 67, 79, 24, 65, 96, 36, 84, 54, 21, + 63, 81, 56, 71, 77, 30, 48, 23, 10, 52, 82, 34, 17, + 107, 87, 104, 5, 35, 2, 50, 43, 62, 73, 28, 18, 14, + 98, 38, 85, 15, 57, 9, 51, 12, 6, 91, 3, 103, 41, 92, + 47, 75, 44, 69, 101, 93, 106, 46, 94, 31, 19, 7, 58] part2, devs2 = r.get_nodes('a', 'c', 'o2') primary_zones2 = set([d['zone'] for d in devs2]) @@ -764,14 +804,15 @@ class TestRing(TestRingBase): # Here's a brittle canary-in-the-coalmine test to make sure the region # handoff computation didn't change accidentally - exp_handoffs = [111, 112, 74, 54, 93, 31, 2, 43, 100, 22, 71, 92, 35, - 9, 50, 41, 76, 80, 84, 88, 17, 96, 6, 102, 37, 29, - 105, 5, 47, 20, 13, 108, 66, 81, 53, 65, 25, 58, 32, - 94, 101, 1, 10, 44, 73, 75, 21, 97, 28, 106, 30, 16, - 39, 77, 42, 72, 34, 99, 14, 61, 90, 4, 40, 3, 45, 62, - 7, 15, 87, 12, 83, 89, 33, 98, 49, 107, 56, 86, 48, - 57, 24, 11, 23, 26, 46, 64, 69, 38, 36, 79, 63, 104, - 51, 70, 82, 67, 68, 8, 95, 91, 55, 59, 85] + exp_handoffs = [111, 112, 35, 58, 62, 74, 20, 105, 41, 90, 53, 6, 3, + 67, 55, 76, 108, 32, 12, 80, 38, 85, 
94, 42, 27, 99, + 50, 47, 70, 87, 26, 9, 15, 97, 102, 81, 23, 65, 33, + 77, 34, 4, 75, 8, 5, 30, 13, 73, 36, 92, 54, 51, 72, + 78, 66, 1, 48, 14, 93, 95, 88, 86, 84, 106, 60, 101, + 57, 43, 89, 59, 79, 46, 61, 52, 44, 45, 37, 68, 25, + 100, 49, 24, 16, 71, 96, 21, 107, 98, 64, 39, 18, 29, + 103, 91, 22, 63, 69, 28, 56, 11, 82, 10, 17, 19, 7, + 40, 83, 104, 31] dev_ids = [d['id'] for d in more_devs] self.assertEqual(len(dev_ids), len(exp_handoffs)) diff --git a/test/unit/common/ring/test_utils.py b/test/unit/common/ring/test_utils.py index 3cba0fb512..705d619b9b 100644 --- a/test/unit/common/ring/test_utils.py +++ b/test/unit/common/ring/test_utils.py @@ -692,10 +692,10 @@ class TestUtils(unittest.TestCase): rb.rebalance(seed=100) rb.validate() - self.assertEqual(rb.dispersion, 39.0625) + self.assertEqual(rb.dispersion, 39.84375) report = dispersion_report(rb) self.assertEqual(report['worst_tier'], 'r1z1') - self.assertEqual(report['max_dispersion'], 39.0625) + self.assertEqual(report['max_dispersion'], 39.84375) def build_tier_report(max_replicas, placed_parts, dispersion, replicas): @@ -711,11 +711,11 @@ class TestUtils(unittest.TestCase): # zone 1 are stored at least twice on the nodes expected = [ ['r1z1', build_tier_report( - 2, 256, 39.0625, [0, 0, 156, 100])], + 2, 256, 39.84375, [0, 0, 154, 102])], ['r1z1-127.0.0.1', build_tier_report( - 1, 256, 19.53125, [0, 206, 50, 0])], + 1, 256, 19.921875, [0, 205, 51, 0])], ['r1z1-127.0.0.2', build_tier_report( - 1, 256, 19.53125, [0, 206, 50, 0])], + 1, 256, 19.921875, [0, 205, 51, 0])], ] report = dispersion_report(rb, 'r1z1[^/]*$', verbose=True) graph = report['graph'] @@ -735,12 +735,18 @@ class TestUtils(unittest.TestCase): 'ip': '127.0.0.3', 'port': 10003, 'device': 'sdc1'}) rb.add_dev({'id': 15, 'region': 1, 'zone': 0, 'weight': 500, 'ip': '127.0.0.3', 'port': 10003, 'device': 'sdd1'}) - rb.rebalance(seed=10) - report = dispersion_report(rb) - self.assertEqual(rb.dispersion, 44.53125) + # when the biggest tier has the smallest devices things get ugly + rb.rebalance(seed=100) + report = dispersion_report(rb, verbose=True) + self.assertEqual(rb.dispersion, 70.3125) self.assertEqual(report['worst_tier'], 'r1z0-127.0.0.3') - self.assertEqual(report['max_dispersion'], 32.520325203252035) + self.assertEqual(report['max_dispersion'], 88.23529411764706) + + # ... but overload can square it + rb.set_overload(rb.get_required_overload()) + rb.rebalance() + self.assertEqual(rb.dispersion, 0.0) def test_parse_address_old_format(self): # Test old format