diff --git a/swift/cli/ringbuilder.py b/swift/cli/ringbuilder.py index fc7ada82d8..192a788518 100755 --- a/swift/cli/ringbuilder.py +++ b/swift/cli/ringbuilder.py @@ -448,29 +448,19 @@ swift-ring-builder print('The overload factor is %0.2f%% (%.6f)' % ( builder.overload * 100, builder.overload)) if builder.devs: + balance_per_dev = builder._build_balance_per_dev() print('Devices: id region zone ip address port ' 'replication ip replication port name ' 'weight partitions balance flags meta') - weighted_parts = builder.parts * builder.replicas / \ - sum(d['weight'] for d in builder.devs if d is not None) - for dev in builder.devs: - if dev is None: - continue - if not dev['weight']: - if dev['parts']: - balance = MAX_BALANCE - else: - balance = 0 - else: - balance = 100.0 * dev['parts'] / \ - (dev['weight'] * weighted_parts) - 100.0 + for dev in builder._iter_devs(): flags = 'DEL' if dev in builder._remove_devs else '' print(' %5d %7d %5d %15s %5d %15s %17d %9s %6.02f ' '%10s %7.02f %5s %s' % (dev['id'], dev['region'], dev['zone'], dev['ip'], dev['port'], dev['replication_ip'], dev['replication_port'], dev['device'], dev['weight'], - dev['parts'], balance, flags, dev['meta'])) + dev['parts'], balance_per_dev[dev['id']], flags, + dev['meta'])) exit(EXIT_SUCCESS) def search(): @@ -924,6 +914,8 @@ swift-ring-builder dispersion [options] verbose=options.verbose) print('Dispersion is %.06f, Balance is %.06f, Overload is %0.2f%%' % ( builder.dispersion, builder.get_balance(), builder.overload * 100)) + print('Required overload is %.6f%%' % ( + builder.get_required_overload() * 100)) if report['worst_tier']: status = EXIT_WARNING print('Worst tier is %.06f (%s)' % (report['max_dispersion'], @@ -1034,7 +1026,6 @@ swift-ring-builder write_builder [min_part_hours] for parts in builder._replica2part2dev: for dev_id in parts: builder.devs[dev_id]['parts'] += 1 - builder._set_parts_wanted() builder.save(builder_file) def pretend_min_part_hours_passed(): diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py index 830a381946..7629bbb900 100644 --- a/swift/common/ring/builder.py +++ b/swift/common/ring/builder.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import bisect import copy import errno import itertools @@ -23,7 +22,6 @@ import random import six.moves.cPickle as pickle from copy import deepcopy from contextlib import contextmanager -import warnings from array import array from collections import defaultdict @@ -36,7 +34,12 @@ from swift.common.ring import RingData from swift.common.ring.utils import tiers_for_dev, build_tier_tree, \ validate_and_normalize_address +# we can't store None's in the replica2part2dev array, so we high-jack +# the max value for magic to represent the part is not currently +# assigned to any device. +NONE_DEV = 2 ** 16 - 1 MAX_BALANCE = 999.99 +MAX_BALANCE_GATHER_COUNT = 3 class RingValidationWarning(Warning): @@ -88,7 +91,6 @@ class RingBuilder(object): self.devs_changed = False self.version = 0 self.overload = 0.0 - self._effective_overload = None # _replica2part2dev maps from replica number to partition number to # device id. So, for a three replica, 2**23 ring, it's an array of @@ -99,16 +101,16 @@ class RingBuilder(object): # a while ago, code-wise, when I last tried it). self._replica2part2dev = None - # _last_part_moves is a 2**23 array of unsigned bytes representing the - # number of hours since a given partition was last moved. 
This is used - # to guarantee we don't move a partition twice within a given number of - # hours (24 is my usual test). Removing a device or setting its weight - # to 0 overrides this behavior as it's assumed those actions are done - # because of device failure. + # _last_part_moves is an array of unsigned bytes representing + # the number of hours since a given partition was last moved. + # This is used to guarantee we don't move a partition twice + # within a given number of hours (24 is my usual test). Removing + # a device overrides this behavior as it's assumed that's only + # done because of device failure. + self._last_part_moves = None # _last_part_moves_epoch indicates the time the offsets in # _last_part_moves is based on. - self._last_part_moves_epoch = None - self._last_part_moves = None + self._last_part_moves_epoch = 0 self._last_part_gather_start = 0 @@ -204,6 +206,9 @@ class RingBuilder(object): for dev in self._iter_devs(): dev.setdefault("region", 1) + if not self._last_part_moves_epoch: + self._last_part_moves_epoch = 0 + def __deepcopy__(self, memo): return type(self).from_dict(deepcopy(self.to_dict(), memo)) @@ -341,7 +346,6 @@ class RingBuilder(object): dev['weight'] = float(dev['weight']) dev['parts'] = 0 self.devs[dev['id']] = dev - self._set_parts_wanted() self.devs_changed = True self.version += 1 return dev['id'] @@ -359,8 +363,10 @@ class RingBuilder(object): :param dev_id: device id :param weight: new weight for device """ + if any(dev_id == d['id'] for d in self._remove_devs): + raise ValueError("Can not set weight of dev_id %s because it " + "is marked for removal" % (dev_id,)) self.devs[dev_id]['weight'] = weight - self._set_parts_wanted() self.devs_changed = True self.version += 1 @@ -377,7 +383,6 @@ class RingBuilder(object): dev = self.devs[dev_id] dev['weight'] = 0 self._remove_devs.append(dev) - self._set_parts_wanted() self.devs_changed = True self.version += 1 @@ -399,68 +404,81 @@ class RingBuilder(object): :returns: (number_of_partitions_altered, resulting_balance, number_of_removed_devices) """ - num_devices = len([d for d in self._iter_devs() if d['weight'] > 0]) - removed_devs = 0 + # count up the devs, and cache some stuff + num_devices = 0 + for dev in self._iter_devs(): + dev['tiers'] = tiers_for_dev(dev) + if dev['weight'] > 0: + num_devices += 1 if num_devices < self.replicas: - warnings.warn(RingValidationWarning( + raise exceptions.RingValidationError( "Replica count of %(replicas)s requires more " "than %(num_devices)s devices" % { 'replicas': self.replicas, 'num_devices': num_devices, - })) - old_replica2part2dev = copy.deepcopy(self._replica2part2dev) + }) if seed is not None: random.seed(seed) - self._effective_overload = self.overload - if self.overload and self.dispersion <= 0: - # iff we're fully dispersed we want to bring in overload - self._effective_overload = min(self.overload, - self.get_required_overload()) - self.logger.debug("Using effective overload of %f", - self._effective_overload) - self._ring = None - if self._last_part_moves_epoch is None: + + old_replica2part2dev = copy.deepcopy(self._replica2part2dev) + + if self._last_part_moves is None: self.logger.debug("New builder; performing initial balance") - self._initial_balance() - self.devs_changed = False - self._build_dispersion_graph() - return self.parts, self.get_balance(), removed_devs - changed_parts = 0 + self._last_part_moves = array('B', itertools.repeat(0, self.parts)) self._update_last_part_moves() - last_balance = 0 - new_parts, removed_part_count = 
self._adjust_replica2part2dev_size() - self.logger.debug( - "%d new parts and %d removed parts from replica-count change", - len(new_parts), removed_part_count) - changed_parts += removed_part_count - self._set_parts_wanted() - self._reassign_parts(new_parts) - changed_parts += len(new_parts) - while True: - reassign_parts = self._gather_reassign_parts() - changed_parts += len(reassign_parts) - self.logger.debug("Gathered %d parts thus far (%d this pass)", - changed_parts, len(reassign_parts)) - self._reassign_parts(reassign_parts) - self.logger.debug("Assigned %d parts", changed_parts) - while self._remove_devs: - remove_dev_id = self._remove_devs.pop()['id'] - self.logger.debug("Removing dev %d", remove_dev_id) - self.devs[remove_dev_id] = None - removed_devs += 1 - balance = self.get_balance() - if balance < 1 or abs(last_balance - balance) < 1 or \ - changed_parts == self.parts: + + replica_plan = self._build_replica_plan() + self._set_parts_wanted(replica_plan) + + assign_parts = defaultdict(list) + # gather parts from failed devices + removed_devs = self._gather_parts_from_failed_devices(assign_parts) + # gather parts from replica count adjustment + self._adjust_replica2part2dev_size(assign_parts) + # gather parts for dispersion (N.B. this only picks up parts that + # *must* disperse according to the replica plan) + self._gather_parts_for_dispersion(assign_parts, replica_plan) + + # we'll gather a few times, or until we archive the plan + for gather_count in range(MAX_BALANCE_GATHER_COUNT): + self._gather_parts_for_balance(assign_parts, replica_plan) + if not assign_parts: + # most likely min part hours + finish_status = 'Unable to finish' break - last_balance = balance + assign_parts_list = list(assign_parts.items()) + # shuffle the parts to be reassigned, we have no preference on the + # order in which the replica plan is fulfilled. 
+ random.shuffle(assign_parts_list) + # reset assign_parts map for next iteration + assign_parts = defaultdict(list) + + num_part_replicas = sum(len(r) for p, r in assign_parts_list) + self.logger.debug("Gathered %d parts", num_part_replicas) + self._reassign_parts(assign_parts_list, replica_plan) + self.logger.debug("Assigned %d parts", num_part_replicas) + + if not sum(d['parts_wanted'] < 0 for d in + self._iter_devs()): + finish_status = 'Finished' + break + else: + finish_status = 'Unable to finish' + self.logger.debug('%s rebalance plan after %s attempts' % ( + finish_status, gather_count + 1)) + self.devs_changed = False self.version += 1 - changed_parts = self._build_dispersion_graph(old_replica2part2dev) - return changed_parts, balance, removed_devs + + # clean up the cache + for dev in self._iter_devs(): + dev.pop('tiers', None) + + return changed_parts, self.get_balance(), removed_devs def _build_dispersion_graph(self, old_replica2part2dev=None): """ @@ -500,8 +518,6 @@ class RingBuilder(object): max_allowed_replicas = self._build_max_replicas_by_tier() parts_at_risk = 0 - tfd = {} - dispersion_graph = {} # go over all the devices holding each replica part by part for part_id, dev_ids in enumerate( @@ -511,9 +527,7 @@ class RingBuilder(object): replicas_at_tier = defaultdict(int) for rep_id, dev in enumerate(iter( self.devs[dev_id] for dev_id in dev_ids)): - if dev['id'] not in tfd: - tfd[dev['id']] = tiers_for_dev(dev) - for tier in tfd[dev['id']]: + for tier in (dev.get('tiers') or tiers_for_dev(dev)): replicas_at_tier[tier] += 1 # IndexErrors will be raised if the replicas are increased or # decreased, and that actually means the partition has changed @@ -616,10 +630,10 @@ class RingBuilder(object): (part, replica)) devs_for_part.append(dev_id) if len(devs_for_part) != len(set(devs_for_part)): - warnings.warn(RingValidationWarning( + raise exceptions.RingValidationError( "The partition %s has been assigned to " "duplicate devices %r" % ( - part, devs_for_part))) + part, devs_for_part)) if stats: weight_of_one_part = self.weight_of_one_part() @@ -641,6 +655,32 @@ class RingBuilder(object): return dev_usage, worst return None, None + def _build_balance_per_dev(self): + """ + Build a map of => where is a float + representing the percentage difference from the desired amount of + partitions a given device wants and the amount it has. + + N.B. this method only considers a device's weight and the parts + assigned, not the parts wanted according to the replica plan. + """ + weight_of_one_part = self.weight_of_one_part() + balance_per_dev = {} + for dev in self._iter_devs(): + if not dev['weight']: + if dev['parts']: + # If a device has no weight, but has partitions, then its + # overage is considered "infinity" and therefore always the + # worst possible. We show MAX_BALANCE for convenience. + balance = MAX_BALANCE + else: + balance = 0 + else: + balance = 100.0 * dev['parts'] / ( + dev['weight'] * weight_of_one_part) - 100.0 + balance_per_dev[dev['id']] = balance + return balance_per_dev + def get_balance(self): """ Get the balance of the ring. The balance value is the highest @@ -652,167 +692,39 @@ class RingBuilder(object): :returns: balance of the ring """ - balance = 0 - weight_of_one_part = self.weight_of_one_part() - for dev in self._iter_devs(): - if not dev['weight']: - if dev['parts']: - # If a device has no weight, but has partitions, then its - # overage is considered "infinity" and therefore always the - # worst possible. We show MAX_BALANCE for convenience. 
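For reference, the per-device balance that the new _build_balance_per_dev helper centralizes is simply the percentage deviation of a device's assigned partitions from its weight-proportional share. A standalone sketch of the same formula on a toy ring (the device dicts and the numbers below are illustrative, not taken from this patch):

    # Toy ring: 2**8 partitions, 3 replicas, two equally-weighted devices.
    parts, replicas = 2 ** 8, 3
    devs = [{'id': 0, 'weight': 100.0, 'parts': 400},
            {'id': 1, 'weight': 100.0, 'parts': 368}]

    # "weight of one part": total part-replicas divided by total weight
    weight_of_one_part = parts * replicas / sum(d['weight'] for d in devs)

    balance_per_dev = {
        d['id']: 100.0 * d['parts'] / (d['weight'] * weight_of_one_part) - 100.0
        for d in devs}
    # {0: 4.166..., 1: -4.166...}; get_balance() reports max(abs(...)) ~= 4.17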
- balance = MAX_BALANCE - break - continue - dev_balance = abs(100.0 * dev['parts'] / - (dev['weight'] * weight_of_one_part) - 100.0) - if dev_balance > balance: - balance = dev_balance - return balance + balance_per_dev = self._build_balance_per_dev() + return max(abs(b) for b in balance_per_dev.values()) - def get_required_overload(self): + def get_required_overload(self, weighted=None, wanted=None): """ Returns the minimum overload value required to make the ring maximally dispersed. + + The required overload is the largest percentage change of any single + device from its weighted replicanth to its wanted replicanth (note + under weighted devices have a negative percentage change) to archive + dispersion - that is to say a single device that must be overloaded by + 5% is worse than 5 devices in a single tier overloaded by 1%. """ - self.logger.debug("computing required overload") - tfd, sibling_tiers = self._compute_sibling_tiers() - max_allowed_replicas = self._build_max_replicas_by_tier() - - # We're computing a bunch of different things here, but iterating - # over all the devs once is more efficient than doing it a bunch of - # times. - all_tiers = set([()]) - tier_weight = defaultdict(float) - total_weight = 0.0 - tier2children = defaultdict(set) + weighted = weighted or self._build_weighted_replicas_by_tier() + wanted = wanted or self._build_wanted_replicas_by_tier() + max_overload = 0.0 for dev in self._iter_devs(): - dev_weight = dev['weight'] - total_weight += dev_weight - for tier in tfd[dev['id']]: - all_tiers.add(tier) - tier_weight[tier] += dev_weight - tier2children[tier[:-1]].add(tier) - tier_weight[()] = total_weight - - max_required_overload = 0.0 - for tier in all_tiers: - if tier not in tier2children: - continue - if tier_weight[tier] <= 0: - continue - # Example 1: Consider a 3-replica cluster with 2 regions. If one - # region has more than 2/3 the total weight, then (ignoring - # overload) some partitions will reside entirely in the big - # region. - # - # Example 2: Consider a 3-replica cluster with 3 regions. If any - # region has more than 1/3 the total weight, some partitions will - # not have replicas spread across all regions. - # - # Example 3: Consider a 3-replica cluster with 4 regions. If any - # region has more than 1/3 the total weight, some partitions will - # not have replicas spread across all regions. - # - # Example 4: Consider a 3-replica cluster with 100 regions. If - # any region has more than 1/3 the total weight, some partitions - # will not have replicas spread across all regions. The fact - # that there's 100 regions doesn't matter; if one region is big - # enough, it'll get multiple replicas of some partitions. - # - # Example 5: Consider a 5-replica cluster with 2 regions. If the - # bigger region has more than 3/5 the weight, some partitions - # will have more than 3 replicas in the big region. (Optimal - # dispersion is 3 replicas in some region and 2 in the other; 4 - # and 1 is not good enough.) - # - # In general, what we do is split this tier's child tiers - # into two groups: "big" and "small". "Big" child tiers are - # ones whose weight exceeds their fraction of the replicas. - # For example, given 3 replicas and 4 zones of total weight - # 12,000, a zone with weight greater than 1/3 of 12,000 (= - # 4,000) would be considered big. "Small" child tiers are - # those which are not big. 
- # - # Once we've divided the child tiers into big and small, we - # figure out how many replicas should wind up on the small - # child tiers (all together), and then compute the needed - # overload factor to boost their weights so they can take - # that many replicas. - child_tiers = tier2children[tier] - tier_replicas = max_allowed_replicas[tier] - big_child_count = small_child_count = 0 - big_child_weight = small_child_weight = 0.0 - - max_child_replicas = math.ceil(tier_replicas / len(child_tiers)) - bigness_threshold = ( - max_child_replicas / tier_replicas * tier_weight[tier]) - - for child_tier in child_tiers: - child_weight = tier_weight[child_tier] - if child_weight == 0: - # If it's got 0 weight, it's not taking any - # partitions at all, so it doesn't count. + tier = (dev['region'], dev['zone'], dev['ip'], dev['id']) + if not dev['weight']: + if tier not in wanted or not wanted[tier]: continue - if child_weight >= bigness_threshold: - big_child_count += 1 - big_child_weight += child_weight - else: - small_child_count += 1 - small_child_weight += child_weight - - if big_child_count == 0 or small_child_count == 0: - # We only need overload if we have both big and small - # tiers. Usually, all small tiers means things can - # balance, while all big tiers means that we have - # exactly one child tier (e.g. a cluster with only one - # region). - continue - - # We assume each big child tier takes the maximum possible - # number of replicas for optimal dispersion, but no more. - # That leaves the remainder for the small child tiers. - big_child_replicas = max_child_replicas * big_child_count - small_child_replicas = tier_replicas - big_child_replicas - - if small_child_replicas == 0: - # If we're not putting any replicas on small child - # tiers, then there's no need for overload. This also - # avoids a division-by-zero below. - continue - - # We want the overloaded small tiers to take up their fair - # share of the replicas. We can express this as follows: - # - # Let Ws be the sum of the weights of the small child tiers. - # - # Let Wb be the sum of the weights of the big child tiers. - # - # Let Rt be the number of replicas at the current tier. - # - # Let Rs be the desired number of replicas for the small - # child tiers. - # - # Let L be the overload. - # - # Then, we have the following: - # - # (L * Ws) / (Wb + L * Ws) = Rs / Rt - # - # Solving for L, we get: - # - # L = 1 / (Ws / Wb * (Rt / Rs - 1)) - required_overload = 1.0 / ( - (small_child_weight / big_child_weight) - * (tier_replicas / small_child_replicas - 1)) - 1 - - if required_overload > max_required_overload: - self.logger.debug("Required overload for %r is %f [NEW HIGH]", - tier, required_overload) - max_required_overload = required_overload - else: - self.logger.debug("Required overload for %r is %f", - tier, required_overload) - return max_required_overload + raise exceptions.RingValidationError( + 'Device %s has zero weight and ' + 'should not want any replicas' % (tier,)) + required = (wanted[tier] - weighted[tier]) / weighted[tier] + self.logger.debug('%s wants %s and is weighted for %s so ' + 'therefore requires %s overload' % ( + tier, wanted[tier], weighted[tier], + required)) + if required > max_overload: + max_overload = required + return max_overload def pretend_min_part_hours_passed(self): """ @@ -848,7 +760,13 @@ class RingBuilder(object): if dev is not None: yield dev - def _set_parts_wanted(self): + def _build_tier2children(self): + """ + Wrap helper build_tier_tree so exclude zero-weight devices. 
+ """ + return build_tier_tree(d for d in self._iter_devs() if d['weight']) + + def _set_parts_wanted(self, replica_plan): """ Sets the parts_wanted key for each of the devices to the number of partitions the device wants based on its relative weight. This key is @@ -856,9 +774,49 @@ class RingBuilder(object): to best distribute partitions. A negative parts_wanted indicates the device is "overweight" and wishes to give partitions away if possible. - Note: parts_wanted does *not* consider overload. + :param replica_plan: a dict of dicts, as returned from + _build_replica_plan, that that maps + each tier to it's target replicanths. """ - weight_of_one_part = self.weight_of_one_part() + tier2children = self._build_tier2children() + + parts_by_tier = defaultdict(int) + + def place_parts(tier, parts): + parts_by_tier[tier] = parts + sub_tiers = sorted(tier2children[tier]) + if not sub_tiers: + return + to_place = defaultdict(int) + for t in sub_tiers: + to_place[t] = int(math.floor( + replica_plan[t]['target'] * self.parts)) + parts -= to_place[t] + + # if there's some parts left over, just throw 'em about + sub_tier_gen = itertools.cycle(sorted( + sub_tiers, key=lambda t: replica_plan[t]['target'])) + while parts: + t = next(sub_tier_gen) + to_place[t] += 1 + parts -= 1 + + for t, p in to_place.items(): + place_parts(t, p) + + total_parts = int(self.replicas * self.parts) + place_parts((), total_parts) + + # belts & suspenders/paranoia - at every level, the sum of + # parts_by_tier should be total_parts for the ring + tiers = ['cluster', 'regions', 'zones', 'servers', 'devices'] + for i, tier_name in enumerate(tiers): + parts_at_tier = sum(parts_by_tier[t] for t in parts_by_tier + if len(t) == i) + if parts_at_tier != total_parts: + raise exceptions.RingValidationError( + '%s != %s at tier %s' % ( + parts_at_tier, total_parts, tier_name)) for dev in self._iter_devs(): if not dev['weight']: @@ -867,97 +825,8 @@ class RingBuilder(object): # indicate its strong desire to give up everything it has. dev['parts_wanted'] = -self.parts * self.replicas else: - dev['parts_wanted'] = ( - # Round up here so that every partition ultimately ends up - # with a placement. - # - # Imagine 5 partitions to be placed on 4 devices. If we - # didn't use math.ceil() here, each device would have a - # parts_wanted of 1, so 4 partitions would be placed but - # the last would not, probably resulting in a crash. This - # way, some devices end up with leftover parts_wanted, but - # at least every partition ends up somewhere. - int(math.ceil(weight_of_one_part * dev['weight'] - - dev['parts']))) - - def _adjust_replica2part2dev_size(self): - """ - Make sure that the lengths of the arrays in _replica2part2dev - are correct for the current value of self.replicas. - - Example: - self.part_power = 8 - self.replicas = 2.25 - - self._replica2part2dev will contain 3 arrays: the first 2 of - length 256 (2**8), and the last of length 64 (0.25 * 2**8). - - Returns a 2-tuple: the first element is a list of (partition, - replicas) tuples indicating which replicas need to be - (re)assigned to devices, and the second element is a count of - how many replicas were removed. 
- """ - removed_replicas = 0 - - fractional_replicas, whole_replicas = math.modf(self.replicas) - whole_replicas = int(whole_replicas) - - desired_lengths = [self.parts] * whole_replicas - if fractional_replicas: - desired_lengths.append(int(self.parts * fractional_replicas)) - - to_assign = defaultdict(list) - - if self._replica2part2dev is not None: - # If we crossed an integer threshold (say, 4.1 --> 4), - # we'll have a partial extra replica clinging on here. Clean - # up any such extra stuff. - for part2dev in self._replica2part2dev[len(desired_lengths):]: - for dev_id in part2dev: - dev_losing_part = self.devs[dev_id] - dev_losing_part['parts'] -= 1 - removed_replicas += 1 - self._replica2part2dev = \ - self._replica2part2dev[:len(desired_lengths)] - else: - self._replica2part2dev = [] - - for replica, desired_length in enumerate(desired_lengths): - if replica < len(self._replica2part2dev): - part2dev = self._replica2part2dev[replica] - if len(part2dev) < desired_length: - # Not long enough: needs to be extended and the - # newly-added pieces assigned to devices. - for part in range(len(part2dev), desired_length): - to_assign[part].append(replica) - part2dev.append(0) - elif len(part2dev) > desired_length: - # Too long: truncate this mapping. - for part in range(desired_length, len(part2dev)): - dev_losing_part = self.devs[part2dev[part]] - dev_losing_part['parts'] -= 1 - removed_replicas += 1 - self._replica2part2dev[replica] = part2dev[:desired_length] - else: - # Mapping not present at all: make one up and assign - # all of it. - for part in range(desired_length): - to_assign[part].append(replica) - self._replica2part2dev.append( - array('H', (0 for _junk in range(desired_length)))) - - return (to_assign.items(), removed_replicas) - - def _initial_balance(self): - """ - Initial partition assignment is the same as rebalancing an - existing ring, but with some initial setup beforehand. - """ - self._last_part_moves = array('B', (0 for _junk in range(self.parts))) - self._last_part_moves_epoch = int(time()) - self._set_parts_wanted() - - self._reassign_parts(self._adjust_replica2part2dev_size()[0]) + tier = (dev['region'], dev['zone'], dev['ip'], dev['id']) + dev['parts_wanted'] = parts_by_tier[tier] - dev['parts'] def _update_last_part_moves(self): """ @@ -976,231 +845,302 @@ class RingBuilder(object): self._last_part_moves[part] = 0xff self._last_part_moves_epoch = int(time()) - def _get_available_parts(self): + def _gather_parts_from_failed_devices(self, assign_parts): """ - Returns a dict of (tier: available parts in other tiers) for all tiers - in the ring. - - Devices that have too many partitions (negative parts_wanted plus - overload) are ignored, otherwise the sum of all returned values is 0 - +/- rounding errors. - - This takes overload into account. + Update the map of partition => [replicas] to be reassigned from + removed devices. """ - wanted_parts_for_tier = {} - for dev in self._iter_devs(): - extra_overload_parts = self._n_overload_parts(dev) - pw = max(dev['parts_wanted'] + extra_overload_parts, 0) - for tier in tiers_for_dev(dev): - wanted_parts_for_tier.setdefault(tier, 0) - wanted_parts_for_tier[tier] += pw - return wanted_parts_for_tier - - def _compute_sibling_tiers(self): - """ - Returns a 2-tuple; the first value is a dictionary mapping each - device's id to its tiers, and the second is a dictionary mapping - a-tier: list-of-sibling-tiers. - """ - # inline memoization of tiers_for_dev() results (profiling reveals it - # as a hot-spot). 
We also return it so callers don't have to - # rebuild it. - tfd = {} - - tiers_by_len = defaultdict(set) - for dev in self._iter_devs(): - tiers = tiers_for_dev(dev) - tfd[dev['id']] = tiers - for tier in tiers: - tiers_by_len[len(tier)].add(tier) - - tiers_by_len = dict((length, list(tiers)) - for length, tiers in tiers_by_len.items()) - - sibling_tiers = {} - for length, tiers in tiers_by_len.items(): - for i, tier in enumerate(tiers): - sibling_tiers[tier] = [t for t in (tiers[:i] + tiers[(i + 1):]) - if t[:-1] == tier[:-1]] - return (tfd, sibling_tiers) - - def _gather_reassign_parts(self): - """ - Returns a list of (partition, replicas) pairs to be reassigned by - gathering from removed devices, insufficiently-far-apart replicas, and - overweight drives. - """ - tfd, sibling_tiers = self._compute_sibling_tiers() - # First we gather partitions from removed devices. Since removed # devices usually indicate device failures, we have no choice but to # reassign these partitions. However, we mark them as moved so later # choices will skip other replicas of the same partition if possible. - removed_dev_parts = defaultdict(list) + if self._remove_devs: dev_ids = [d['id'] for d in self._remove_devs if d['parts']] if dev_ids: for part, replica in self._each_part_replica(): dev_id = self._replica2part2dev[replica][part] if dev_id in dev_ids: + self._replica2part2dev[replica][part] = NONE_DEV self._last_part_moves[part] = 0 - removed_dev_parts[part].append(replica) + assign_parts[part].append(replica) self.logger.debug( "Gathered %d/%d from dev %d [dev removed]", part, replica, dev_id) + removed_devs = 0 + while self._remove_devs: + remove_dev_id = self._remove_devs.pop()['id'] + self.logger.debug("Removing dev %d", remove_dev_id) + self.devs[remove_dev_id] = None + removed_devs += 1 + return removed_devs + def _adjust_replica2part2dev_size(self, to_assign): + """ + Make sure that the lengths of the arrays in _replica2part2dev + are correct for the current value of self.replicas. + + Example: + self.part_power = 8 + self.replicas = 2.25 + + self._replica2part2dev will contain 3 arrays: the first 2 of + length 256 (2**8), and the last of length 64 (0.25 * 2**8). + + Update the mapping of partition => [replicas] that need assignment. + """ + fractional_replicas, whole_replicas = math.modf(self.replicas) + whole_replicas = int(whole_replicas) + removed_parts = 0 + new_parts = 0 + + desired_lengths = [self.parts] * whole_replicas + if fractional_replicas: + desired_lengths.append(int(self.parts * fractional_replicas)) + + if self._replica2part2dev is not None: + # If we crossed an integer threshold (say, 4.1 --> 4), + # we'll have a partial extra replica clinging on here. Clean + # up any such extra stuff. + for part2dev in self._replica2part2dev[len(desired_lengths):]: + for dev_id in part2dev: + dev_losing_part = self.devs[dev_id] + dev_losing_part['parts'] -= 1 + removed_parts -= 1 + self._replica2part2dev = \ + self._replica2part2dev[:len(desired_lengths)] + else: + self._replica2part2dev = [] + + for replica, desired_length in enumerate(desired_lengths): + if replica < len(self._replica2part2dev): + part2dev = self._replica2part2dev[replica] + if len(part2dev) < desired_length: + # Not long enough: needs to be extended and the + # newly-added pieces assigned to devices. + for part in range(len(part2dev), desired_length): + to_assign[part].append(replica) + part2dev.append(NONE_DEV) + new_parts += 1 + elif len(part2dev) > desired_length: + # Too long: truncate this mapping. 
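A quick aside on the NONE_DEV sentinel used by the gather and adjust helpers: the 'H' (unsigned 16-bit) arrays in _replica2part2dev cannot hold None, so the maximum representable value is reserved to mean "this replica of the partition is currently unassigned". A tiny standalone illustration with toy sizes (not from the patch):

    from array import array
    import itertools

    NONE_DEV = 2 ** 16 - 1      # same magic value the builder reserves
    parts = 8                   # toy partition count

    # a freshly extended replica row starts out entirely unassigned
    part2dev = array('H', itertools.repeat(NONE_DEV, parts))
    part2dev[3] = 7             # partition 3 now lives on device id 7

    unassigned = [p for p, d in enumerate(part2dev) if d == NONE_DEV]
    # unassigned == [0, 1, 2, 4, 5, 6, 7]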
+ for part in range(desired_length, len(part2dev)): + dev_losing_part = self.devs[part2dev[part]] + dev_losing_part['parts'] -= 1 + removed_parts -= 1 + self._replica2part2dev[replica] = part2dev[:desired_length] + else: + # Mapping not present at all: make one up and assign + # all of it. + for part in range(desired_length): + to_assign[part].append(replica) + new_parts += 1 + self._replica2part2dev.append( + array('H', itertools.repeat(NONE_DEV, desired_length))) + + self.logger.debug( + "%d new parts and %d removed parts from replica-count change", + new_parts, removed_parts) + + def _gather_parts_for_dispersion(self, assign_parts, replica_plan): + """ + Update the map of partition => [replicas] to be reassigned from + insufficiently-far-apart replicas. + """ # Now we gather partitions that are "at risk" because they aren't # currently sufficient spread out across the cluster. - spread_out_parts = defaultdict(list) - max_allowed_replicas = self._build_max_replicas_by_tier() - wanted_parts_for_tier = self._get_available_parts() - moved_parts = 0 for part in range(self.parts): - # Only move one replica at a time if possible. - if part in removed_dev_parts: + if self._last_part_moves[part] < self.min_part_hours: continue - # First, add up the count of replicas at each tier for each # partition. - # replicas_at_tier was a "lambda: 0" defaultdict, but profiling - # revealed the lambda invocation as a significant cost. - replicas_at_tier = {} + replicas_at_tier = defaultdict(int) for dev in self._devs_for_part(part): - for tier in tfd[dev['id']]: - if tier not in replicas_at_tier: - replicas_at_tier[tier] = 1 - else: - replicas_at_tier[tier] += 1 + for tier in dev['tiers']: + replicas_at_tier[tier] += 1 - # Now, look for partitions not yet spread out enough and not - # recently moved. + # Now, look for partitions not yet spread out enough. + undispersed_dev_replicas = [] for replica in self._replicas_for_part(part): - dev = self.devs[self._replica2part2dev[replica][part]] - removed_replica = False - for tier in tfd[dev['id']]: - rep_at_tier = replicas_at_tier.get(tier, 0) + dev_id = self._replica2part2dev[replica][part] + if dev_id == NONE_DEV: + continue + dev = self.devs[dev_id] + # the min part hour check is ignored iff a device has more + # than one replica of a part assigned to it - which would have + # only been possible on rings built with older version of code + if (self._last_part_moves[part] < self.min_part_hours and + not replicas_at_tier[dev['tiers'][-1]] > 1): + break + if all(replicas_at_tier[tier] <= + replica_plan[tier]['max'] + for tier in dev['tiers']): + continue + undispersed_dev_replicas.append((dev, replica)) - # If this tier's not overcrowded, there's nothing to - # gather, so we can avoid some calculation here as an - # optimization. - if rep_at_tier <= max_allowed_replicas[tier]: - continue + if not undispersed_dev_replicas: + continue - available_parts_for_tier = sum( - wanted_parts_for_tier[t] - for t in sibling_tiers[tier] - # If a sibling tier is "full" with respect to - # partition dispersion, but not "full" with respect - # to parts_wanted, we don't count it as a possible - # destination. - # - # Otherwise, we gather a partition from tier X - # (because its replicas are not spread out), and - # then we may place it right back in tier X or in - # another tier that already has replicas (because - # that tier has parts_wanted). Then, on the next - # rebalance, it'll happen again, and then again... 
- # - # Worse yet, this "dancing replica" immobilizes - # other replicas of the partition that want to move - # because they're on devices with negative - # parts_wanted. This can lead to a replica that - # sticks to a zero-weight device no matter how often - # the ring is rebalanced. - if (max_allowed_replicas[t] > - replicas_at_tier.get(t, 0)) - ) - moved_parts + undispersed_dev_replicas.sort( + key=lambda dr: dr[0]['parts_wanted']) + for dev, replica in undispersed_dev_replicas: + if self._last_part_moves[part] < self.min_part_hours: + break + dev['parts_wanted'] += 1 + dev['parts'] -= 1 + assign_parts[part].append(replica) + self.logger.debug( + "Gathered %d/%d from dev %d [dispersion]", + part, replica, dev['id']) + self._replica2part2dev[replica][part] = NONE_DEV + for tier in dev['tiers']: + replicas_at_tier[tier] -= 1 + self._last_part_moves[part] = 0 - # Only allow a part to be gathered if there are wanted - # parts on other tiers. - if (self._last_part_moves[part] >= self.min_part_hours - and available_parts_for_tier > 0): - self._last_part_moves[part] = 0 - spread_out_parts[part].append(replica) - dev['parts_wanted'] += 1 - dev['parts'] -= 1 - removed_replica = True - moved_parts += 1 - self.logger.debug( - "Gathered %d/%d from dev %d [dispersion]", - part, replica, dev['id']) - break - if removed_replica: - for tier in tfd[dev['id']]: - replicas_at_tier[tier] -= 1 + def _gather_parts_for_balance_can_disperse(self, assign_parts, start, + replica_plan): + """ + Update the map of partition => [replicas] to be reassigned from + overweight drives where the replicas can be better dispersed to + another failure domain. + :param assign_parts: the map of partition => [replica] to update + :param start: offset into self.parts to begin search + :param replica_plan: replicanth targets for tiers + """ # Last, we gather partitions from devices that are "overweight" because # they have more partitions than their parts_wanted. - reassign_parts = defaultdict(list) + for offset in range(self.parts): + part = (start + offset) % self.parts + if self._last_part_moves[part] < self.min_part_hours: + continue + # For each part we'll look at the devices holding those parts and + # see if any are overweight, keeping track of replicas_at_tier as + # we go + overweight_dev_replica = [] + replicas_at_tier = defaultdict(int) + for replica in self._replicas_for_part(part): + dev_id = self._replica2part2dev[replica][part] + if dev_id == NONE_DEV: + continue + dev = self.devs[dev_id] + for tier in dev['tiers']: + replicas_at_tier[tier] += 1 + if dev['parts_wanted'] < 0: + overweight_dev_replica.append((dev, replica)) - # We randomly pick a new starting point in the "circular" ring of - # partitions to try to get a better rebalance when called multiple - # times. + if not overweight_dev_replica: + continue - start = self._last_part_gather_start / 4 - start += random.randint(0, self.parts / 2) # GRAH PEP8!!! 
+ overweight_dev_replica.sort( + key=lambda dr: dr[0]['parts_wanted']) + for dev, replica in overweight_dev_replica: + if self._last_part_moves[part] < self.min_part_hours: + break + if any(replica_plan[tier]['min'] <= + replicas_at_tier[tier] < + replica_plan[tier]['max'] + for tier in dev['tiers']): + continue + # this is the most overweight_device holding a replica + # of this part that can shed it according to the plan + dev['parts_wanted'] += 1 + dev['parts'] -= 1 + assign_parts[part].append(replica) + self.logger.debug( + "Gathered %d/%d from dev %d [weight disperse]", + part, replica, dev['id']) + self._replica2part2dev[replica][part] = NONE_DEV + for tier in dev['tiers']: + replicas_at_tier[tier] -= 1 + self._last_part_moves[part] = 0 + + def _gather_parts_for_balance(self, assign_parts, replica_plan): + """ + Gather parts that look like they should move for balance reasons. + + A simple gather of parts that looks dispersible normally works out, + we'll switch strategies if things don't be seem to moving... + """ + # pick a random starting point on the other side of the ring + quarter_turn = (self.parts // 4) + random_half = random.randint(0, self.parts / 2) + start = (self._last_part_gather_start + quarter_turn + + random_half) % self.parts + self.logger.debug('Gather start is %s ' + '(Last start was %s)' % ( + start, self._last_part_gather_start)) self._last_part_gather_start = start - for replica, part2dev in enumerate(self._replica2part2dev): - # If we've got a partial replica, start may be out of - # range. Scale it down so that we get a similar movement - # pattern (but scaled down) on sequential runs. - this_start = int(float(start) * len(part2dev) / self.parts) + self._gather_parts_for_balance_can_disperse( + assign_parts, start, replica_plan) + if not assign_parts: + self._gather_parts_for_balance_forced(assign_parts, start) - for part in itertools.chain(range(this_start, len(part2dev)), - range(0, this_start)): + def _gather_parts_for_balance_forced(self, assign_parts, start, **kwargs): + """ + Update the map of partition => [replicas] to be reassigned from + overweight drives without restriction, parts gathered from this method + may be placed back onto devices that are no better (or worse) than the + device from which they are gathered. + + This method allows devices to flop around enough to unlock replicas + that would have otherwise potentially been locked because of + dispersion - it should be used as a last resort. 
+ + :param assign_parts: the map of partition => [replica] to update + :param start: offset into self.parts to begin search + """ + for offset in range(self.parts): + part = (start + offset) % self.parts + if self._last_part_moves[part] < self.min_part_hours: + continue + overweight_dev_replica = [] + for replica in self._replicas_for_part(part): + dev_id = self._replica2part2dev[replica][part] + if dev_id == NONE_DEV: + continue + dev = self.devs[dev_id] + if dev['parts_wanted'] < 0: + overweight_dev_replica.append((dev, replica)) + + if not overweight_dev_replica: + continue + + overweight_dev_replica.sort( + key=lambda dr: dr[0]['parts_wanted']) + for dev, replica in overweight_dev_replica: if self._last_part_moves[part] < self.min_part_hours: - continue - if part in removed_dev_parts or part in spread_out_parts: - continue - dev = self.devs[part2dev[part]] - fudge = self._n_overload_parts(dev) - if dev['parts_wanted'] + fudge < 0: - self._last_part_moves[part] = 0 - dev['parts_wanted'] += 1 - dev['parts'] -= 1 - reassign_parts[part].append(replica) - self.logger.debug( - "Gathered %d/%d from dev %d [weight]", - part, replica, dev['id']) + break + # this is the most overweight_device holding a replica of this + # part we don't know where it's going to end up - but we'll + # pick it up and hope for the best. + dev['parts_wanted'] += 1 + dev['parts'] -= 1 + assign_parts[part].append(replica) + self.logger.debug( + "Gathered %d/%d from dev %d [weight forced]", + part, replica, dev['id']) + self._replica2part2dev[replica][part] = NONE_DEV + self._last_part_moves[part] = 0 - reassign_parts.update(spread_out_parts) - reassign_parts.update(removed_dev_parts) - - reassign_parts_list = list(reassign_parts.items()) - # We shuffle the partitions to reassign so we get a more even - # distribution later. There has been discussion of trying to distribute - # partitions more "regularly" because that would actually reduce risk - # but 1) it is really difficult to do this with uneven clusters and 2) - # it would concentrate load during failure recovery scenarios - # (increasing risk). The "right" answer has yet to be debated to - # conclusion, but working code wins for now. - random.shuffle(reassign_parts_list) - return reassign_parts_list - - def _n_overload_parts(self, dev): - """ - The number of extra partitions a device can take due to overload. - """ - return max(int(math.ceil( - (dev['parts_wanted'] + dev['parts']) - * self._effective_overload)), 0) - - def _reassign_parts(self, reassign_parts): + def _reassign_parts(self, reassign_parts, replica_plan): """ For an existing ring data set, partitions are reassigned similarly to - the initial assignment. The devices are ordered by how many partitions - they still want and kept in that order throughout the process. The - gathered partitions are iterated through, assigning them to devices - according to the "most wanted" while keeping the replicas as "far - apart" as possible. Two different regions are considered the - farthest-apart things, followed by zones, then different ip/port pairs - within a zone; the least-far-apart things are different devices with - the same ip/port pair in the same zone. + the initial assignment. - If you want more replicas than devices, you won't get all your - replicas. + The devices are ordered by how many partitions they still want and + kept in that order throughout the process. 
+ + The gathered partitions are iterated through, assigning them to + devices according to the "most wanted" while keeping the replicas as + "far apart" as possible. + + Two different regions are considered the farthest-apart things, + followed by zones, then different ip within a zone; the + least-far-apart things are different devices with the same ip in the + same zone. :param reassign_parts: An iterable of (part, replicas_to_replace) pairs. replicas_to_replace is an iterable of the @@ -1208,12 +1148,9 @@ class RingBuilder(object): replicas_to_replace may be shared for multiple partitions, so be sure you do not modify it. """ - fudge_available_in_tier = defaultdict(int) parts_available_in_tier = defaultdict(int) for dev in self._iter_devs(): dev['sort_key'] = self._sort_key_for(dev) - tiers = tiers_for_dev(dev) - dev['tiers'] = tiers # Note: this represents how many partitions may be assigned to a # given tier (region/zone/server/disk). It does not take into # account how many partitions a given tier wants to shed. @@ -1226,9 +1163,7 @@ class RingBuilder(object): # with partitions to shed, which is any time a device is being # removed, which is a pretty frequent operation. wanted = max(dev['parts_wanted'], 0) - fudge = self._n_overload_parts(dev) - for tier in tiers: - fudge_available_in_tier[tier] += (wanted + fudge) + for tier in dev['tiers']: parts_available_in_tier[tier] += wanted available_devs = \ @@ -1265,153 +1200,45 @@ class RingBuilder(object): depth += 1 for part, replace_replicas in reassign_parts: - # Gather up what other tiers (regions, zones, ip/ports, and - # devices) the replicas not-to-be-moved are in for this part. - other_replicas = defaultdict(int) - occupied_tiers_by_tier_len = defaultdict(set) - for replica in self._replicas_for_part(part): - if replica not in replace_replicas: - dev = self.devs[self._replica2part2dev[replica][part]] - for tier in dev['tiers']: - other_replicas[tier] += 1 - occupied_tiers_by_tier_len[len(tier)].add(tier) + # always update part_moves for min_part_hours + self._last_part_moves[part] = 0 + # count up where these replicas be + replicas_at_tier = defaultdict(int) + for dev in self._devs_for_part(part): + for tier in dev['tiers']: + replicas_at_tier[tier] += 1 for replica in replace_replicas: # Find a new home for this replica tier = () + # This used to be a cute, recursive function, but it's been + # unrolled for performance. depth = 1 while depth <= max_tier_depth: - roomiest_tier = fudgiest_tier = None - # Order the tiers by how many replicas of this - # partition they already have. Then, of the ones - # with the smallest number of replicas and that have - # room to accept more partitions, pick the tier with - # the hungriest drive and then continue searching in - # that subtree. - # - # There are other strategies we could use here, - # such as hungriest-tier (i.e. biggest - # sum-of-parts-wanted) or picking one at random. - # However, hungriest-drive is what was used here - # before, and it worked pretty well in practice. - # - # Note that this allocator prioritizes even device - # filling over dispersion, so if your layout is - # extremely unbalanced, you may not get the replica - # dispersion that you expect, and your durability - # may be lessened. - # - # This used to be a cute, recursive function, but it's been - # unrolled for performance. + # Choose the roomiest tier among those that don't + # already have their max replicas assigned according + # to the replica_plan. 
+ candidates = [t for t in tier2children[tier] if + replicas_at_tier[t] < + replica_plan[t]['max']] - # We sort the tiers here so that, when we look for a tier - # with the lowest number of replicas, the first one we - # find is the one with the hungriest drive (i.e. drive - # with the largest sort_key value). This lets us - # short-circuit the search while still ensuring we get the - # right tier. - candidates_with_replicas = \ - occupied_tiers_by_tier_len[len(tier) + 1] + if not candidates: + raise Exception('no home for %s/%s %s' % ( + part, replica, {t: ( + replicas_at_tier[t], + replica_plan[t]['max'], + ) for t in tier2children[tier]})) + tier = max(candidates, key=lambda t: + parts_available_in_tier[t]) - # Among the tiers with room for more partitions, - # find one with the smallest possible number of - # replicas already in it, breaking ties by which one - # has the hungriest drive. - candidates_with_room = [ - t for t in tier2children[tier] - if parts_available_in_tier[t] > 0] - candidates_with_fudge = set([ - t for t in tier2children[tier] - if fudge_available_in_tier[t] > 0]) - candidates_with_fudge.update(candidates_with_room) - - if candidates_with_room: - if len(candidates_with_room) > \ - len(candidates_with_replicas): - # There exists at least one tier with room for - # another partition and 0 other replicas already - # in it, so we can use a faster search. The else - # branch's search would work here, but it's - # significantly slower. - roomiest_tier = max( - (t for t in candidates_with_room - if other_replicas[t] == 0), - key=tier2sort_key.__getitem__) - else: - roomiest_tier = max( - candidates_with_room, - key=lambda t: (-other_replicas[t], - tier2sort_key[t])) - else: - roomiest_tier = None - - fudgiest_tier = max(candidates_with_fudge, - key=lambda t: (-other_replicas[t], - tier2sort_key[t])) - - if (roomiest_tier is None or - (other_replicas[roomiest_tier] > - other_replicas[fudgiest_tier])): - subtier = fudgiest_tier - else: - subtier = roomiest_tier - # no putting multiples on the same device - if len(subtier) == 4 and ( - subtier in occupied_tiers_by_tier_len[4]): - sibling_tiers = [ - (d['region'], d['zone'], d['ip'], d['id']) - for d in tier2devs[tier]] - unused_sibling_tiers = [ - t for t in sibling_tiers - if t not in occupied_tiers_by_tier_len[4]] - if unused_sibling_tiers: - # anything is better than the alternative - subtier = random.choice(unused_sibling_tiers) - else: - warnings.warn(RingValidationWarning( - "All devices in tier %r already " - "contain a replica" % (tier,))) - tier = subtier depth += 1 dev = tier2devs[tier][-1] dev['parts_wanted'] -= 1 dev['parts'] += 1 - old_sort_key = dev['sort_key'] - new_sort_key = dev['sort_key'] = self._sort_key_for(dev) for tier in dev['tiers']: parts_available_in_tier[tier] -= 1 - fudge_available_in_tier[tier] -= 1 - other_replicas[tier] += 1 - occupied_tiers_by_tier_len[len(tier)].add(tier) - - index = bisect.bisect_left(tier2dev_sort_key[tier], - old_sort_key) - tier2devs[tier].pop(index) - tier2dev_sort_key[tier].pop(index) - - new_index = bisect.bisect_left(tier2dev_sort_key[tier], - new_sort_key) - tier2devs[tier].insert(new_index, dev) - tier2dev_sort_key[tier].insert(new_index, new_sort_key) - - new_last_sort_key = tier2dev_sort_key[tier][-1] - tier2sort_key[tier] = new_last_sort_key - - # Now jiggle tier2children values to keep them sorted - parent_tier = tier[0:-1] - index = bisect.bisect_left( - tier2children_sort_key[parent_tier], - old_sort_key) - popped = tier2children[parent_tier].pop(index) - 
tier2children_sort_key[parent_tier].pop(index) - - new_index = bisect.bisect_left( - tier2children_sort_key[parent_tier], - new_last_sort_key) - tier2children[parent_tier].insert(new_index, popped) - tier2children_sort_key[parent_tier].insert( - new_index, new_last_sort_key) + replicas_at_tier[tier] += 1 self._replica2part2dev[replica][part] = dev['id'] self.logger.debug( @@ -1420,13 +1247,12 @@ class RingBuilder(object): # Just to save memory and keep from accidental reuse. for dev in self._iter_devs(): del dev['sort_key'] - del dev['tiers'] @staticmethod def _sort_key_for(dev): return (dev['parts_wanted'], random.randint(0, 0xFFFF), dev['id']) - def _build_max_replicas_by_tier(self): + def _build_max_replicas_by_tier(self, bound=math.ceil): """ Returns a defaultdict of (tier: replica_count) for all tiers in the ring excluding zero weight devices. @@ -1477,21 +1303,254 @@ class RingBuilder(object): """ # Used by walk_tree to know what entries to create for each recursive # call. - tier2children = build_tier_tree(d for d in self._iter_devs() if - d['weight']) + tier2children = self._build_tier2children() def walk_tree(tier, replica_count): + if len(tier) == 4: + # special case for device, it's not recursive + replica_count = min(1, replica_count) mr = {tier: replica_count} if tier in tier2children: subtiers = tier2children[tier] for subtier in subtiers: - submax = math.ceil(float(replica_count) / len(subtiers)) + submax = bound(float(replica_count) / len(subtiers)) mr.update(walk_tree(subtier, submax)) return mr mr = defaultdict(float) mr.update(walk_tree((), self.replicas)) return mr + def _build_weighted_replicas_by_tier(self): + """ + Returns a dict mapping => replicanths for all tiers in + the ring based on their weights. + """ + weight_of_one_part = self.weight_of_one_part() + + # assign each device some replicanths by weight (can't be > 1) + weighted_replicas_for_dev = {} + devices_with_room = [] + for dev in self._iter_devs(): + if not dev['weight']: + continue + weighted_replicas = ( + dev['weight'] * weight_of_one_part / self.parts) + if weighted_replicas < 1: + devices_with_room.append(dev['id']) + else: + weighted_replicas = 1 + weighted_replicas_for_dev[dev['id']] = weighted_replicas + + while True: + remaining = self.replicas - sum(weighted_replicas_for_dev.values()) + if remaining < 1e-10: + break + devices_with_room = [d for d in devices_with_room if + weighted_replicas_for_dev[d] < 1] + rel_weight = remaining / sum( + weighted_replicas_for_dev[d] for d in devices_with_room) + for d in devices_with_room: + weighted_replicas_for_dev[d] = min( + 1, weighted_replicas_for_dev[d] * (rel_weight + 1)) + + weighted_replicas_by_tier = defaultdict(float) + for dev in self._iter_devs(): + if not dev['weight']: + continue + assigned_replicanths = weighted_replicas_for_dev[dev['id']] + dev_tier = (dev['region'], dev['zone'], dev['ip'], dev['id']) + for i in range(len(dev_tier) + 1): + tier = dev_tier[:i] + weighted_replicas_by_tier[tier] += assigned_replicanths + + # belts & suspenders/paranoia - at every level, the sum of + # weighted_replicas should be very close to the total number of + # replicas for the ring + tiers = ['cluster', 'regions', 'zones', 'servers', 'devices'] + for i, tier_name in enumerate(tiers): + replicas_at_tier = sum(weighted_replicas_by_tier[t] for t in + weighted_replicas_by_tier if len(t) == i) + if abs(self.replicas - replicas_at_tier) > 1e-10: + raise exceptions.RingValidationError( + '%s != %s at tier %s' % ( + replicas_at_tier, self.replicas, tier_name)) + + 
return weighted_replicas_by_tier + + def _build_wanted_replicas_by_tier(self): + """ + Returns a defaultdict of (tier: replicanths) for all tiers in the ring + based on unique-as-possible (full dispersion) with respect to their + weights and device counts. + + N.B. _build_max_replicas_by_tier calculates the upper bound on the + replicanths each tier may hold irrespective of the weights of the + tier; this method will calculate the minimum replicanth <= + max_replicas[tier] that will still solve dispersion. However it is + not guaranteed to return a fully dispersed solution if failure domains + are over-weighted for their device count. + """ + weighted_replicas = self._build_weighted_replicas_by_tier() + dispersed_replicas = { + t: { + 'min': math.floor(r), + 'max': math.ceil(r), + } for (t, r) in + self._build_max_replicas_by_tier(bound=float).items() + } + + # watch out for device limited tiers + num_devices = defaultdict(int) + for d in self._iter_devs(): + if d['weight'] <= 0: + continue + for t in (d.get('tiers') or tiers_for_dev(d)): + num_devices[t] += 1 + num_devices[()] += 1 + + tier2children = self._build_tier2children() + + wanted_replicas = defaultdict(float) + + def place_replicas(tier, replicanths): + if replicanths > num_devices[tier]: + raise exceptions.RingValidationError( + 'More than replicanths (%s) than devices (%s) ' + 'in tier (%s)' % (replicanths, num_devices[tier], tier)) + wanted_replicas[tier] = replicanths + sub_tiers = sorted(tier2children[tier]) + if not sub_tiers: + return + + to_place = defaultdict(float) + remaining = replicanths + tiers_to_spread = sub_tiers + device_limited = False + + while True: + rel_weight = remaining / sum(weighted_replicas[t] + for t in tiers_to_spread) + for t in tiers_to_spread: + replicas = to_place[t] + ( + weighted_replicas[t] * rel_weight) + if replicas < dispersed_replicas[t]['min']: + replicas = dispersed_replicas[t]['min'] + elif (replicas > dispersed_replicas[t]['max'] and + not device_limited): + replicas = dispersed_replicas[t]['max'] + if replicas > num_devices[t]: + replicas = num_devices[t] + to_place[t] = replicas + + remaining = replicanths - sum(to_place.values()) + + if remaining < -1e-10: + tiers_to_spread = [ + t for t in sub_tiers + if to_place[t] > dispersed_replicas[t]['min'] + ] + elif remaining > 1e-10: + tiers_to_spread = [ + t for t in sub_tiers + if (num_devices[t] > to_place[t] < + dispersed_replicas[t]['max']) + ] + if not tiers_to_spread: + device_limited = True + tiers_to_spread = [ + t for t in sub_tiers + if to_place[t] < num_devices[t] + ] + else: + # remaining is "empty" + break + + for t in sub_tiers: + self.logger.debug('Planning %s on %s', + to_place[t], t) + place_replicas(t, to_place[t]) + + # place all replicas in the cluster tier + place_replicas((), self.replicas) + + # belts & suspenders/paranoia - at every level, the sum of + # wanted_replicas should be very close to the total number of + # replicas for the ring + tiers = ['cluster', 'regions', 'zones', 'servers', 'devices'] + for i, tier_name in enumerate(tiers): + replicas_at_tier = sum(wanted_replicas[t] for t in + wanted_replicas if len(t) == i) + if abs(self.replicas - replicas_at_tier) > 1e-10: + raise exceptions.RingValidationError( + '%s != %s at tier %s' % ( + replicas_at_tier, self.replicas, tier_name)) + + return wanted_replicas + + def _build_target_replicas_by_tier(self): + """ + Build a map of => accounting for device + weights, unique-as-possible dispersion and overload. 
+ + - a tuple, describing each tier in the ring topology + - a float, the target replicanths at the tier + """ + weighted_replicas = self._build_weighted_replicas_by_tier() + wanted_replicas = self._build_wanted_replicas_by_tier() + max_overload = self.get_required_overload(weighted=weighted_replicas, + wanted=wanted_replicas) + if max_overload <= 0.0: + return wanted_replicas + else: + overload = min(self.overload, max_overload) + self.logger.debug("Using effective overload of %f", overload) + target_replicas = defaultdict(float) + for tier, weighted in weighted_replicas.items(): + m = (wanted_replicas[tier] - weighted) / max_overload + target_replicas[tier] = m * overload + weighted + + # belts & suspenders/paranoia - at every level, the sum of + # target_replicas should be very close to the total number + # of replicas for the ring + tiers = ['cluster', 'regions', 'zones', 'servers', 'devices'] + for i, tier_name in enumerate(tiers): + replicas_at_tier = sum(target_replicas[t] for t in + target_replicas if len(t) == i) + if abs(self.replicas - replicas_at_tier) > 1e-10: + raise exceptions.RingValidationError( + '%s != %s at tier %s' % ( + replicas_at_tier, self.replicas, tier_name)) + + return target_replicas + + def _build_replica_plan(self): + """ + Wraps return value of _build_target_replicas_by_tier to include + pre-calculated min and max values for each tier. + + :returns: a dict, mapping => , where + is itself a dict + + include at least the following keys: + + min - the minimum number of replicas at the tier + target - the target replicanths at the tier + max - the maximum number of replicas at the tier + """ + # replica part-y planner! + target_replicas = self._build_target_replicas_by_tier() + replica_plan = defaultdict( + lambda: {'min': 0, 'target': 0, 'max': 0}) + replica_plan.update({ + t: { + 'min': math.floor(r + 1e-10), + 'target': r, + 'max': math.ceil(r - 1e-10), + } for (t, r) in + target_replicas.items() + }) + return replica_plan + def _devs_for_part(self, part): """ Returns a list of devices for a specified partition. 
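To make the replica plan structure described above concrete: the plan maps each tier tuple to a dict of 'min', 'target' and 'max' derived from that tier's target replicanths. A hypothetical plan for a 3-replica ring with one region and two equal zones might look like the sketch below (the tier tuples and target values are illustrative only):

    import math
    from collections import defaultdict

    # illustrative target replicanths keyed by tier tuple
    target_replicas = {
        (): 3.0,          # cluster
        (0,): 3.0,        # region 0
        (0, 0): 1.5,      # zone 0
        (0, 1): 1.5,      # zone 1
    }

    replica_plan = defaultdict(lambda: {'min': 0, 'target': 0, 'max': 0})
    replica_plan.update({
        t: {'min': math.floor(r + 1e-10),
            'target': r,
            'max': math.ceil(r - 1e-10)}
        for t, r in target_replicas.items()})
    # e.g. replica_plan[(0, 0)] -> min 1, target 1.5, max 2

With such a plan, the placement loop in _reassign_parts only considers a zone as a candidate while its replicas_at_tier count is still below that zone's 'max', so neither zone in this example could ever hold all three replicas of a partition.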
@@ -1500,9 +1559,15 @@ class RingBuilder(object): """ if self._replica2part2dev is None: return [] - return [self.devs[part2dev[part]] - for part2dev in self._replica2part2dev - if part < len(part2dev)] + devs = [] + for part2dev in self._replica2part2dev: + if part >= len(part2dev): + continue + dev_id = part2dev[part] + if dev_id == NONE_DEV: + continue + devs.append(self.devs[dev_id]) + return devs def _replicas_for_part(self, part): """ diff --git a/test/unit/common/ring/test_builder.py b/test/unit/common/ring/test_builder.py index f31e4ab747..99348d445e 100644 --- a/test/unit/common/ring/test_builder.py +++ b/test/unit/common/ring/test_builder.py @@ -25,13 +25,14 @@ from collections import defaultdict from math import ceil from tempfile import mkdtemp from shutil import rmtree -import warnings +import random from six.moves import range from swift.common import exceptions from swift.common import ring -from swift.common.ring.builder import MAX_BALANCE, RingValidationWarning +from swift.common.ring import utils +from swift.common.ring.builder import MAX_BALANCE class TestRingBuilder(unittest.TestCase): @@ -343,12 +344,16 @@ class TestRingBuilder(unittest.TestCase): rb.rebalance() rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 1, 'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'}) - rb.pretend_min_part_hours_passed() - parts = rb._gather_reassign_parts() + replica_plan = rb._build_replica_plan() + rb._set_parts_wanted(replica_plan) + for dev in rb._iter_devs(): + dev['tiers'] = utils.tiers_for_dev(dev) + assign_parts = defaultdict(list) + rb._gather_parts_for_balance(assign_parts, replica_plan) max_run = 0 run = 0 last_part = 0 - for part, _ in parts: + for part, _ in assign_parts.items(): if part > last_part: run += 1 else: @@ -358,7 +363,7 @@ class TestRingBuilder(unittest.TestCase): last_part = part if run > max_run: max_run = run - return max_run > len(parts) / 2 + return max_run > len(assign_parts) / 2 def test_initial_balance(self): # 2 boxes, 2 drives each in zone 1 @@ -652,7 +657,7 @@ class TestRingBuilder(unittest.TestCase): "Partition %d did not move (got %r)" % (part, devs)) def test_multitier_dont_move_too_many_replicas(self): - rb = ring.RingBuilder(8, 3, 0) + rb = ring.RingBuilder(8, 3, 1) # there'll be at least one replica in z0 and z1 rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 0.5, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'}) @@ -672,6 +677,7 @@ class TestRingBuilder(unittest.TestCase): 'ip': '127.0.0.1', 'port': 10000, 'device': 'sde1'}) rb.add_dev({'id': 4, 'region': 0, 'zone': 4, 'weight': 1, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdf1'}) + rb.pretend_min_part_hours_passed() rb.rebalance() rb.validate() @@ -688,6 +694,73 @@ class TestRingBuilder(unittest.TestCase): "Partition %d not in zones 0 and 1 (got %r)" % (part, zones)) + def test_min_part_hours_zero_will_move_whatever_it_takes(self): + rb = ring.RingBuilder(8, 3, 0) + # there'll be at least one replica in z0 and z1 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 0.5, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'weight': 0.5, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb1'}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 0, 'weight': 0.5, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'}) + rb.add_dev({'id': 6, 'region': 0, 'zone': 1, 'weight': 0.5, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb1'}) + rb.rebalance(seed=1) + rb.validate() + + rb.add_dev({'id': 2, 'region': 0, 'zone': 2, 'weight': 1, + 
'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd1'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sde1'}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 4, 'weight': 1, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdf1'}) + rb.rebalance(seed=3) + rb.validate() + + self.assertEqual(0, rb.dispersion) + # a balance of w/i a 1% isn't too bad for 3 replicas on 7 + # devices when part power is only 8 + self.assertAlmostEqual(rb.get_balance(), 0, delta=0.5) + + # every zone has either 153 or 154 parts + for zone, count in self._partition_counts( + rb, key='zone').items(): + self.assertAlmostEqual(153.5, count, delta=1) + + parts_with_moved_count = defaultdict(int) + for part in range(rb.parts): + zones = set() + for replica in range(rb.replicas): + zones.add(rb.devs[rb._replica2part2dev[replica][part]]['zone']) + moved_replicas = len(zones - {0, 1}) + parts_with_moved_count[moved_replicas] += 1 + + # as usual, the real numbers depend on the seed, but we want to + # validate a few things here: + # + # 1) every part had to move one replica to hit dispersion (so no + # one can have a moved count 0) + # + # 2) it's quite reasonable that some small percent of parts will + # have a replica in {0, 1, X} (meaning only one replica of the + # part moved) + # + # 3) when min_part_hours is 0, more than one replica of a part + # can move in a rebalance, and since that movement would get to + # better dispersion faster we expect to observe most parts in + # {[0,1], X, X} (meaning *two* replicas of the part moved) + # + # 4) there's plenty of weight in z0 & z1 to hold a whole + # replicanth, so there is no reason for any part to have to move + # all three replicas out of those zones (meaning no one can have + # a moved count 3) + # + expected = { + 1: 52, + 2: 204, + } + self.assertEqual(parts_with_moved_count, expected) + def test_rerebalance(self): rb = ring.RingBuilder(8, 3, 1) rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1, @@ -697,29 +770,17 @@ class TestRingBuilder(unittest.TestCase): rb.add_dev({'id': 2, 'region': 0, 'zone': 2, 'weight': 1, 'ip': '127.0.0.1', 'port': 10002, 'device': 'sda1'}) rb.rebalance() - r = rb.get_ring() - counts = {} - for part2dev_id in r._replica2part2dev_id: - for dev_id in part2dev_id: - counts[dev_id] = counts.get(dev_id, 0) + 1 + counts = self._partition_counts(rb) self.assertEqual(counts, {0: 256, 1: 256, 2: 256}) rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 1, 'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'}) rb.pretend_min_part_hours_passed() rb.rebalance() - r = rb.get_ring() - counts = {} - for part2dev_id in r._replica2part2dev_id: - for dev_id in part2dev_id: - counts[dev_id] = counts.get(dev_id, 0) + 1 + counts = self._partition_counts(rb) self.assertEqual(counts, {0: 192, 1: 192, 2: 192, 3: 192}) rb.set_dev_weight(3, 100) rb.rebalance() - r = rb.get_ring() - counts = {} - for part2dev_id in r._replica2part2dev_id: - for dev_id in part2dev_id: - counts[dev_id] = counts.get(dev_id, 0) + 1 + counts = self._partition_counts(rb) self.assertEqual(counts[3], 256) def test_add_rebalance_add_rebalance_delete_rebalance(self): @@ -771,12 +832,12 @@ class TestRingBuilder(unittest.TestCase): rb.add_dev({'id': 6, 'region': 0, 'zone': 3, 'weight': 1.0, 'ip': '127.0.0.3', 'port': 10000, 'device': 'sdc'}) - rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 0.5, + rb.add_dev({'id': 3, 'region': 0, 'zone': 3, 'weight': 0.4, 'ip': '127.0.0.3', 'port': 10001, 'device': 'zero'}) zero_weight_dev = 3 - 
rb.rebalance() + rb.rebalance(seed=1) # We want at least one partition with replicas only in zone 2 and 3 # due to device weights. It would *like* to spread out into zone 1, @@ -799,6 +860,14 @@ class TestRingBuilder(unittest.TestCase): array('H', [1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4]), array('H', [0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 5, 6, 2, 5, 6])] + # fix up bookkeeping + new_dev_parts = defaultdict(int) + for part2dev_id in rb._replica2part2dev: + for dev_id in part2dev_id: + new_dev_parts[dev_id] += 1 + for dev in rb._iter_devs(): + dev['parts'] = new_dev_parts[dev['id']] + rb.set_dev_weight(zero_weight_dev, 0.0) rb.pretend_min_part_hours_passed() rb.rebalance(seed=1) @@ -807,13 +876,243 @@ class TestRingBuilder(unittest.TestCase): for part2dev_id in rb._replica2part2dev: for dev_id in part2dev_id: node_counts[dev_id] += 1 + self.assertEqual(node_counts[zero_weight_dev], 0) # it's as balanced as it gets, so nothing moves anymore rb.pretend_min_part_hours_passed() parts_moved, _balance, _removed = rb.rebalance(seed=1) + + new_node_counts = defaultdict(int) + for part2dev_id in rb._replica2part2dev: + for dev_id in part2dev_id: + new_node_counts[dev_id] += 1 + + del node_counts[zero_weight_dev] + self.assertEqual(node_counts, new_node_counts) + self.assertEqual(parts_moved, 0) + def test_part_swapping_problem(self): + rb = ring.RingBuilder(4, 3, 1) + # 127.0.0.1 (2 devs) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + # 127.0.0.2 (3 devs) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdc'}) + + expected = { + '127.0.0.1': 1.2, + '127.0.0.2': 1.7999999999999998, + } + for wr in (rb._build_weighted_replicas_by_tier(), + rb._build_wanted_replicas_by_tier(), + rb._build_target_replicas_by_tier()): + self.assertEqual(expected, {t[-1]: r for (t, r) in + wr.items() if len(t) == 3}) + self.assertEqual(rb.get_required_overload(), 0) + rb.rebalance(seed=3) + # so 127.0.0.1 ended up with... + tier = (0, 0, '127.0.0.1') + # ... 6 parts with 1 replicas + self.assertEqual(rb._dispersion_graph[tier][1], 12) + # ... 4 parts with 2 replicas + self.assertEqual(rb._dispersion_graph[tier][2], 4) + # but since we only have two tiers, this is *totally* dispersed + self.assertEqual(0, rb.dispersion) + + # small rings are hard to balance... 
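+        # to spell out the arithmetic from the setup above: 3 replicas
+        # * 2 ** 4 parts is 48 part-replicas, which five equal-weight
+        # devices can't share evenly - the best we can do is three
+        # devices holding 10 parts and two holding 9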
+ expected = {0: 10, 1: 10, 2: 10, 3: 9, 4: 9} + self.assertEqual(expected, {d['id']: d['parts'] + for d in rb._iter_devs()}) + # everyone wants 9.6 parts + expected = { + 0: 4.166666666666671, + 1: 4.166666666666671, + 2: 4.166666666666671, + 3: -6.25, + 4: -6.25, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + # original sorted _replica2part2dev + """ + rb._replica2part2dev = [ + array('H', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]), + array('H', [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3]), + array('H', [2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4])] + """ + + # now imagine if we came along this _replica2part2dev through no + # fault of our own; if instead of the 12 parts with only one + # replica on 127.0.0.1 being split evenly (6 and 6) on device's + # 0 and 1 - device 1 inexplicitly had 3 extra parts + rb._replica2part2dev = [ + # these are the relevant one's here + # | | | + # v v v + array('H', [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + array('H', [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3]), + array('H', [2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4])] + + # fix up bookkeeping + new_dev_parts = defaultdict(int) + for part2dev_id in rb._replica2part2dev: + for dev_id in part2dev_id: + new_dev_parts[dev_id] += 1 + for dev in rb._iter_devs(): + dev['parts'] = new_dev_parts[dev['id']] + + rb.pretend_min_part_hours_passed() + rb.rebalance() + expected = { + 0: 4.166666666666671, + 1: 4.166666666666671, + 2: 4.166666666666671, + 3: -6.25, + 4: -6.25, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + self.assertEqual(rb.get_balance(), 6.25) + + def test_wrong_tier_with_no_where_to_go(self): + rb = ring.RingBuilder(4, 3, 1) + + # 127.0.0.1 (even devices) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 900, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 0, 'weight': 900, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 6, 'region': 0, 'zone': 0, 'weight': 900, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + # 127.0.0.2 (odd devices) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdd'}) + rb.add_dev({'id': 7, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdd'}) + + expected = { + '127.0.0.1': 1.75, + '127.0.0.2': 1.25, + } + for wr in (rb._build_weighted_replicas_by_tier(), + rb._build_wanted_replicas_by_tier(), + rb._build_target_replicas_by_tier()): + self.assertEqual(expected, {t[-1]: r for (t, r) in + wr.items() if len(t) == 3}) + self.assertEqual(rb.get_required_overload(), 0) + rb.rebalance(seed=3) + # so 127.0.0.1 ended up with... + tier = (0, 0, '127.0.0.1') + # ... 4 parts with 1 replicas + self.assertEqual(rb._dispersion_graph[tier][1], 4) + # ... 12 parts with 2 replicas + self.assertEqual(rb._dispersion_graph[tier][2], 12) + # ... 
and of course 0 parts with 3 replicas + self.assertEqual(rb._dispersion_graph[tier][3], 0) + # but since we only have two tiers, this is *totally* dispersed + self.assertEqual(0, rb.dispersion) + + # small rings are hard to balance, but it's possible when + # part-replicas (3 * 2 ** 4) can go evenly into device weights + # (4800) like we've done here + expected = { + 0: 1, + 2: 9, + 4: 9, + 6: 9, + 1: 5, + 3: 5, + 5: 5, + 7: 5, + } + self.assertEqual(expected, {d['id']: d['parts'] + for d in rb._iter_devs()}) + expected = { + 0: 0.0, + 1: 0.0, + 2: 0.0, + 3: 0.0, + 4: 0.0, + 5: 0.0, + 6: 0.0, + 7: 0.0, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + # all devices have exactly the # of parts they want + expected = { + 0: 0, + 2: 0, + 4: 0, + 6: 0, + 1: 0, + 3: 0, + 5: 0, + 7: 0, + } + self.assertEqual(expected, {d['id']: d['parts_wanted'] + for d in rb._iter_devs()}) + + # original sorted _replica2part2dev + """ + rb._replica2part2dev = [ + array('H', [0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, ]), + array('H', [4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1, 1, ]), + array('H', [1, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 7, 7, 7, 7, 7, ])] + """ + # now imagine if we came along this _replica2part2dev through no + # fault of our own; and device 0 had extra parts, but both + # copies of the other replicas were already in the other tier! + rb._replica2part2dev = [ + # these are the relevant one's here + # | | + # v v + array('H', [2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 0, 0]), + array('H', [4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1]), + array('H', [1, 1, 3, 3, 3, 3, 5, 5, 5, 5, 5, 7, 7, 7, 7, 7])] + + # fix up bookkeeping + new_dev_parts = defaultdict(int) + for part2dev_id in rb._replica2part2dev: + for dev_id in part2dev_id: + new_dev_parts[dev_id] += 1 + for dev in rb._iter_devs(): + dev['parts'] = new_dev_parts[dev['id']] + replica_plan = rb._build_replica_plan() + rb._set_parts_wanted(replica_plan) + + expected = { + 0: -1, # this device wants to shed + 2: 0, + 4: 0, + 6: 0, + 1: 0, + 3: 1, # there's devices with room on the other server + 5: 0, + 7: 0, + } + self.assertEqual(expected, {d['id']: d['parts_wanted'] + for d in rb._iter_devs()}) + + rb.pretend_min_part_hours_passed() + rb.rebalance() + self.assertEqual(rb.get_balance(), 0) + def test_region_fullness_with_balanceable_ring(self): rb = ring.RingBuilder(8, 3, 1) rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1, @@ -891,9 +1190,14 @@ class TestRingBuilder(unittest.TestCase): population_by_region = self._get_population_by_region(rb) self.assertEqual(population_by_region, {0: 682, 1: 86}) - # only 86 parts *should* move (to the new region) but randomly some - # parts will flop around devices in the original region too - self.assertEqual(90, changed_parts) + # really 86 parts *should* move (to the new region) but to avoid + # accidentally picking up too many and causing some parts to randomly + # flop around devices in the original region - our gather algorithm + # is conservative when picking up only from devices that are for sure + # holding more parts than they want (math.ceil() of the replica_plan) + # which guarantees any parts picked up will have new homes in a better + # tier or failure_domain. 
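+        # in other words, changed_parts should exactly match the 86
+        # parts counted in region 1 above (population_by_region[1]),
+        # with no extra churn inside region 0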
+ self.assertEqual(86, changed_parts) # and since there's not enough room, subsequent rebalances will not # cause additional assignments to r1 @@ -933,6 +1237,7 @@ class TestRingBuilder(unittest.TestCase): # Increase the weight of region 1 slowly moved_partitions = [] + errors = [] for weight in range(0, 101, 10): rb.set_dev_weight(5, weight) rb.pretend_min_part_hours_passed() @@ -943,11 +1248,17 @@ class TestRingBuilder(unittest.TestCase): # Otherwise there will be replicas at risk min_parts_for_r1 = ceil(weight / (500.0 + weight) * 768) parts_for_r1 = self._get_population_by_region(rb).get(1, 0) - self.assertEqual(min_parts_for_r1, parts_for_r1) + try: + self.assertEqual(min_parts_for_r1, parts_for_r1) + except AssertionError: + errors.append('weight %s got %s parts but expected %s' % ( + weight, parts_for_r1, min_parts_for_r1)) + + self.assertFalse(errors) # Number of partitions moved on each rebalance # 10/510 * 768 ~ 15.06 -> move at least 15 partitions in first step - ref = [0, 17, 16, 17, 13, 15, 13, 12, 11, 13, 13] + ref = [0, 16, 14, 14, 13, 13, 13, 12, 11, 12, 10] self.assertEqual(ref, moved_partitions) def test_set_replicas_increase(self): @@ -1167,14 +1478,13 @@ class TestRingBuilder(unittest.TestCase): # Devices 0 and 1 take 10% more than their fair shares by weight since # overload is 10% (0.1). rb.set_overload(0.1) - for _ in range(2): - rb.pretend_min_part_hours_passed() - rb.rebalance(seed=12345) + rb.pretend_min_part_hours_passed() + rb.rebalance() part_counts = self._partition_counts(rb, key='zone') self.assertEqual(part_counts[0], 212) - self.assertEqual(part_counts[1], 212) - self.assertEqual(part_counts[2], 344) + self.assertEqual(part_counts[1], 211) + self.assertEqual(part_counts[2], 345) # Now, devices 0 and 1 take 50% more than their fair shares by # weight. 
@@ -1244,14 +1554,14 @@ class TestRingBuilder(unittest.TestCase): # Add some weight: balance improves for dev in rb.devs: if dev['ip'] in ('127.0.0.1', '127.0.0.2'): - rb.set_dev_weight(dev['id'], 1.5) + rb.set_dev_weight(dev['id'], 1.22) rb.pretend_min_part_hours_passed() rb.rebalance(seed=12345) part_counts = self._partition_counts(rb, key='ip') - self.assertEqual(part_counts['127.0.0.1'], 236) - self.assertEqual(part_counts['127.0.0.2'], 236) - self.assertEqual(part_counts['127.0.0.3'], 296) + self.assertEqual(part_counts['127.0.0.1'], 238) + self.assertEqual(part_counts['127.0.0.2'], 237) + self.assertEqual(part_counts['127.0.0.3'], 293) # Even out the weights: balance becomes perfect for dev in rb.devs: @@ -1287,11 +1597,15 @@ class TestRingBuilder(unittest.TestCase): rb.pretend_min_part_hours_passed() rb.rebalance(seed=12345) + expected = { + '127.0.0.1': 192, + '127.0.0.2': 192, + '127.0.0.3': 192, + '127.0.0.4': 192, + } + part_counts = self._partition_counts(rb, key='ip') - self.assertEqual(part_counts['127.0.0.1'], 192) - self.assertEqual(part_counts['127.0.0.2'], 192) - self.assertEqual(part_counts['127.0.0.3'], 192) - self.assertEqual(part_counts['127.0.0.4'], 192) + self.assertEqual(part_counts, expected) def test_overload_keeps_balanceable_things_balanced_initially(self): rb = ring.RingBuilder(8, 3, 1) @@ -1705,6 +2019,8 @@ class TestRingBuilder(unittest.TestCase): ]) self.assertEqual(int(worst), 0) + # min part hours should pin all the parts assigned to this zero + # weight device onto it such that the balance will look horrible rb.set_dev_weight(2, 0) rb.rebalance() self.assertEqual(rb.validate(stats=True)[1], MAX_BALANCE) @@ -1789,24 +2105,11 @@ class TestRingBuilder(unittest.TestCase): def __eq__(self, other): return self.substr in other - with warnings.catch_warnings(): - # we're firing the warning twice in this test and resetwarnings - # doesn't work - https://bugs.python.org/issue4180 - warnings.simplefilter('always') + with self.assertRaises(exceptions.RingValidationError) as e: + rb.validate() - # by default things will work, but log a warning - with mock.patch('sys.stderr') as mock_stderr: - rb.validate() - expected = SubStringMatcher( - 'RingValidationWarning: The partition 200 has been assigned ' - 'to duplicate devices') - # ... but the warning is written to stderr - self.assertEqual(mock_stderr.method_calls, - [mock.call.write(expected)]) - # if you make warnings errors it blows up - with warnings.catch_warnings(): - warnings.filterwarnings('error') - self.assertRaises(RingValidationWarning, rb.validate) + expected = 'The partition 200 has been assigned to duplicate devices' + self.assertIn(expected, str(e.exception)) def test_get_part_devices(self): rb = ring.RingBuilder(8, 3, 1) @@ -1832,12 +2135,12 @@ class TestRingBuilder(unittest.TestCase): 'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'}) rb.add_dev({'id': 2, 'region': 0, 'zone': 2, 'weight': 1, 'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'}) - rb.rebalance(seed=9) + rb.rebalance(seed=4) # note: partition 255 will only have 2 replicas part_devs = sorted(rb.get_part_devices(255), key=operator.itemgetter('id')) - self.assertEqual(part_devs, [rb.devs[0], rb.devs[1]]) + self.assertEqual(part_devs, [rb.devs[1], rb.devs[2]]) def test_dispersion_with_zero_weight_devices(self): rb = ring.RingBuilder(8, 3.0, 0) @@ -1975,31 +2278,10 @@ class TestRingBuilder(unittest.TestCase): rb.rebalance(seed=7) rb.validate() - # ok, we didn't quite disperse - self.assertGreater(rb.dispersion, 0) - - # ... 
but let's unlock some parts - rb.pretend_min_part_hours_passed() - rb.rebalance(seed=7) - rb.validate() - - # ... and that got it! + # ... and that got it in one pass boo-yah! self.assertEqual(rb.dispersion, 0) - def strawman_test(self): - """ - This test demonstrates a trivial failure of part-replica placement. - - If you turn warnings into errors this will fail. - - i.e. - - export PYTHONWARNINGS=error:::swift.common.ring.builder - - N.B. try not to get *too* hung up on doing something silly to make - this particular case pass w/o warnings - it's trivial to write up a - dozen more. - """ + def zone_weights_over_device_count(self): rb = ring.RingBuilder(8, 3, 1) # z0 rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 100, @@ -2011,16 +2293,35 @@ class TestRingBuilder(unittest.TestCase): rb.add_dev({'id': 2, 'region': 0, 'zone': 2, 'weight': 200, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sda'}) - with warnings.catch_warnings(record=True) as w: - rb.rebalance(seed=7) - rb.validate() - self.assertEqual(len(w), 65) + rb.rebalance(seed=7) + rb.validate() + self.assertEqual(rb.dispersion, 0) + self.assertAlmostEqual(rb.get_balance(), (1.0 / 3.0) * 100) + + def test_more_devices_than_replicas_validation_when_removed_dev(self): + rb = ring.RingBuilder(8, 3, 1) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'weight': 1.0, 'device': 'sda'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'weight': 1.0, 'device': 'sdb'}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'weight': 1.0, 'device': 'sdc'}) + rb.rebalance() + rb.remove_dev(2) + with self.assertRaises(ValueError) as e: + rb.set_dev_weight(2, 1) + msg = "Can not set weight of dev_id 2 because it is marked " \ + "for removal" + self.assertIn(msg, str(e.exception)) + with self.assertRaises(exceptions.RingValidationError) as e: + rb.rebalance() + msg = 'Replica count of 3 requires more than 2 devices' + self.assertIn(msg, str(e.exception)) class TestGetRequiredOverload(unittest.TestCase): - def assertApproximately(self, a, b, error=1e-6): - self.assertTrue(abs(a - b) < error, - "%f and %f differ by more than %f" % (a, b, error)) + + maxDiff = None def test_none_needed(self): rb = ring.RingBuilder(8, 3, 1) @@ -2035,11 +2336,108 @@ class TestGetRequiredOverload(unittest.TestCase): # 4 equal-weight devs and 3 replicas: this can be balanced without # resorting to overload at all - self.assertApproximately(rb.get_required_overload(), 0) + self.assertAlmostEqual(rb.get_required_overload(), 0) + + expected = { + (0, 0, '127.0.0.1', 0): 0.75, + (0, 0, '127.0.0.1', 1): 0.75, + (0, 0, '127.0.0.1', 2): 0.75, + (0, 0, '127.0.0.1', 3): 0.75, + } + + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, { + tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 4}) + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 4}) + + # since no overload is needed, target_replicas is the same + rb.set_overload(0.10) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + # ... no matter how high you go! 
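+        # (per _build_target_replicas_by_tier: when
+        # get_required_overload() is zero the wanted replicas are
+        # returned untouched, so the configured overload never enters
+        # the calculation)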
+ rb.set_overload(100.0) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) # 3 equal-weight devs and 3 replicas: this can also be balanced rb.remove_dev(3) - self.assertApproximately(rb.get_required_overload(), 0) + self.assertAlmostEqual(rb.get_required_overload(), 0) + + expected = { + (0, 0, '127.0.0.1', 0): 1.0, + (0, 0, '127.0.0.1', 1): 1.0, + (0, 0, '127.0.0.1', 2): 1.0, + } + + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 4}) + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 4}) + + # ... still no overload + rb.set_overload(100.0) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + def test_equal_replica_and_devices_count_ignore_weights(self): + rb = ring.RingBuilder(8, 3, 1) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 7.47, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 5.91, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 6.44, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + expected = { + 0: 1.0, + 1: 1.0, + 2: 1.0, + } + # simplicity itself + self.assertEqual(expected, { + t[-1]: r for (t, r) in + rb._build_weighted_replicas_by_tier().items() + if len(t) == 4}) + self.assertEqual(expected, { + t[-1]: r for (t, r) in + rb._build_wanted_replicas_by_tier().items() + if len(t) == 4}) + self.assertEqual(expected, { + t[-1]: r for (t, r) in + rb._build_target_replicas_by_tier().items() + if len(t) == 4}) + # ... no overload required! + self.assertEqual(0, rb.get_required_overload()) + + rb.rebalance() + expected = { + 0: 256, + 1: 256, + 2: 256, + } + self.assertEqual(expected, {d['id']: d['parts'] for d in + rb._iter_devs()}) def test_small_zone(self): rb = ring.RingBuilder(8, 3, 1) @@ -2058,9 +2456,155 @@ class TestGetRequiredOverload(unittest.TestCase): rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'weight': 3, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) - # Zone 2 has 7/8 of the capacity of the other two zones, so an - # overload of 1/7 will allow things to balance out. 
- self.assertApproximately(rb.get_required_overload(), 1.0 / 7) + expected = { + (0, 0): 1.0434782608695652, + (0, 1): 1.0434782608695652, + (0, 2): 0.9130434782608695, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + expected = { + (0, 0): 1.0, + (0, 1): 1.0, + (0, 2): 1.0, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + # the device tier is interesting because one of the devices in zone + # two has a different weight + expected = { + 0: 0.5217391304347826, + 1: 0.5217391304347826, + 2: 0.5217391304347826, + 3: 0.5217391304347826, + 4: 0.5217391304347826, + 5: 0.3913043478260869, + } + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 4}) + + # ... but, each pair of devices still needs to hold a whole + # replicanth; which we'll try distribute fairly among devices in + # zone 2, so that they can share the burden and ultimately the + # required overload will be as small as possible. + expected = { + 0: 0.5, + 1: 0.5, + 2: 0.5, + 3: 0.5, + 4: 0.5714285714285715, + 5: 0.42857142857142855, + } + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 4}) + + # full dispersion requires zone two's devices to eat more than + # they're weighted for + self.assertAlmostEqual(rb.get_required_overload(), 0.095238, + delta=1e-5) + + # so... if we give it enough overload it we should get full dispersion + rb.set_overload(0.1) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + def test_multiple_small_zones(self): + rb = ring.RingBuilder(8, 3, 1) + + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 8, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 9, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + + rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'weight': 150, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'weight': 150, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) + rb.add_dev({'id': 10, 'region': 0, 'zone': 1, 'weight': 150, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) + + rb.add_dev({'id': 4, 'region': 0, 'zone': 2, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) + + rb.add_dev({'id': 6, 'region': 0, 'zone': 3, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 7, 'region': 0, 'zone': 3, 'weight': 100, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) + + expected = { + (0, 0): 2.1052631578947367, + (0, 1): 0.47368421052631576, + (0, 2): 0.21052631578947367, + (0, 3): 0.21052631578947367, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in 
weighted_replicas.items() + if len(tier) == 2}) + + # without any overload, we get weight + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: r + for (tier, r) in target_replicas.items() + if len(tier) == 2}) + + expected = { + (0, 0): 1.0, + (0, 1): 1.0, + (0, 2): 0.49999999999999994, + (0, 3): 0.49999999999999994, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {t: r + for (t, r) in wanted_replicas.items() + if len(t) == 2}) + + self.assertEqual(1.3750000000000002, rb.get_required_overload()) + + # with enough overload we get the full dispersion + rb.set_overload(1.5) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: r + for (tier, r) in target_replicas.items() + if len(tier) == 2}) + + # with not enough overload, we get somewhere in the middle + rb.set_overload(1.0) + expected = { + (0, 0): 1.3014354066985647, + (0, 1): 0.8564593301435406, + (0, 2): 0.4210526315789473, + (0, 3): 0.4210526315789473, + } + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: r + for (tier, r) in target_replicas.items() + if len(tier) == 2}) def test_big_zone(self): rb = ring.RingBuilder(8, 3, 1) @@ -2084,37 +2628,124 @@ class TestGetRequiredOverload(unittest.TestCase): rb.add_dev({'id': 7, 'region': 0, 'zone': 3, 'weight': 60, 'ip': '127.0.0.3', 'port': 10000, 'device': 'sdb'}) - # Zone 1 has weight 200, while zones 2, 3, and 4 together have only - # 360. The small zones would need to go from 360 to 400 to balance - # out zone 1, for an overload of 40/360 = 1/9. - self.assertApproximately(rb.get_required_overload(), 1.0 / 9) + expected = { + (0, 0): 1.0714285714285714, + (0, 1): 0.6428571428571429, + (0, 2): 0.6428571428571429, + (0, 3): 0.6428571428571429, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + expected = { + (0, 0): 1.0, + (0, 1): 0.6666666666666667, + (0, 2): 0.6666666666666667, + (0, 3): 0.6666666666666667, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + # when all the devices and servers in a zone are evenly weighted + # it will accurately proxy their required overload, all the + # zones besides 0 require the same overload + t = random.choice([t for t in weighted_replicas + if len(t) == 2 + and t[1] != 0]) + expected_overload = ((wanted_replicas[t] - weighted_replicas[t]) + / weighted_replicas[t]) + self.assertAlmostEqual(rb.get_required_overload(), + expected_overload) + + # but if you only give it out half of that + rb.set_overload(expected_overload / 2.0) + # ... 
you can expect it's not going to full disperse + expected = { + (0, 0): 1.0357142857142856, + (0, 1): 0.6547619047619049, + (0, 2): 0.6547619047619049, + (0, 3): 0.6547619047619049, + } + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 2}) def test_enormous_zone(self): rb = ring.RingBuilder(8, 3, 1) - rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1000, + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 500, 'ip': '127.0.0.0', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 1000, + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.0', 'port': 10000, 'device': 'sda'}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 500, + 'ip': '127.0.0.0', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'weight': 500, 'ip': '127.0.0.0', 'port': 10000, 'device': 'sdb'}) - rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'weight': 60, + rb.add_dev({'id': 4, 'region': 0, 'zone': 1, 'weight': 60, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'weight': 60, + rb.add_dev({'id': 5, 'region': 0, 'zone': 1, 'weight': 60, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) - rb.add_dev({'id': 4, 'region': 0, 'zone': 2, 'weight': 60, + rb.add_dev({'id': 6, 'region': 0, 'zone': 2, 'weight': 60, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'weight': 60, + rb.add_dev({'id': 7, 'region': 0, 'zone': 2, 'weight': 60, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'}) - rb.add_dev({'id': 6, 'region': 0, 'zone': 3, 'weight': 60, + rb.add_dev({'id': 8, 'region': 0, 'zone': 3, 'weight': 60, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 7, 'region': 0, 'zone': 3, 'weight': 60, + rb.add_dev({'id': 9, 'region': 0, 'zone': 3, 'weight': 60, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'}) - # Zone 1 has weight 2000, while zones 2, 3, and 4 together have only - # 360. The small zones would need to go from 360 to 4000 to balance - # out zone 1, for an overload of 3640/360. - self.assertApproximately(rb.get_required_overload(), 3640.0 / 360) + expected = { + (0, 0): 2.542372881355932, + (0, 1): 0.15254237288135591, + (0, 2): 0.15254237288135591, + (0, 3): 0.15254237288135591, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + + expected = { + (0, 0): 1.0, + (0, 1): 0.6666666666666667, + (0, 2): 0.6666666666666667, + (0, 3): 0.6666666666666667, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + # ouch, those "tiny" devices need to hold 3x more than their + # weighted for! + self.assertAlmostEqual(rb.get_required_overload(), 3.370370, + delta=1e-5) + + # let's get a little crazy, and let devices eat up to 1x more than + # their capacity is weighted for - see how far that gets us... 
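+        # roughly, _build_target_replicas_by_tier only closes
+        # overload / required_overload of the gap between weighted and
+        # wanted, e.g. for zone 0:
+        #   2.5424 + (1.0 - 2.5424) * 1.0 / 3.3704 ~= 2.0847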
+ rb.set_overload(1) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + (0, 0): 2.084745762711864, + (0, 1): 0.30508474576271183, + (0, 2): 0.30508474576271183, + (0, 3): 0.30508474576271183, + } + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 2}) def test_two_big_two_small(self): rb = ring.RingBuilder(8, 3, 1) @@ -2138,27 +2769,923 @@ class TestGetRequiredOverload(unittest.TestCase): rb.add_dev({'id': 7, 'region': 0, 'zone': 3, 'weight': 35, 'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'}) - # Zones 1 and 2 each have weight 200, while zones 3 and 4 together - # have only 160. The small zones would need to go from 160 to 200 to - # balance out the big zones, for an overload of 40/160 = 1/4. - self.assertApproximately(rb.get_required_overload(), 1.0 / 4) + expected = { + (0, 0): 1.0714285714285714, + (0, 1): 1.0714285714285714, + (0, 2): 0.48214285714285715, + (0, 3): 0.375, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + + expected = { + (0, 0): 1.0, + (0, 1): 1.0, + (0, 2): 0.5625, + (0, 3): 0.43749999999999994, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + # I'm not sure it's significant or coincidental that the devices + # in zone 2 & 3 who end up splitting the 3rd replica turn out to + # need to eat ~1/6th extra replicanths + self.assertAlmostEqual(rb.get_required_overload(), 1.0 / 6.0) + + # ... *so* 10% isn't *quite* enough + rb.set_overload(0.1) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + (0, 0): 1.0285714285714285, + (0, 1): 1.0285714285714285, + (0, 2): 0.5303571428571429, + (0, 3): 0.4125, + } + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 2}) + + # ... but 20% will do the trick! 
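+        # (the builder clips the setting - overload = min(self.overload,
+        # max_overload) - so anything at or above the required 1/6
+        # lands the targets exactly on the wanted replicas above)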
+ rb.set_overload(0.2) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + (0, 0): 1.0, + (0, 1): 1.0, + (0, 2): 0.5625, + (0, 3): 0.43749999999999994, + } + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 2}) def test_multiple_replicas_each(self): rb = ring.RingBuilder(8, 7, 1) - rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 100, + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 80, 'ip': '127.0.0.0', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 100, + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': 80, 'ip': '127.0.0.0', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'weight': 80, + 'ip': '127.0.0.0', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'weight': 80, + 'ip': '127.0.0.0', 'port': 10000, 'device': 'sdd'}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 0, 'weight': 80, + 'ip': '127.0.0.0', 'port': 10000, 'device': 'sde'}) - rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'weight': 70, + rb.add_dev({'id': 5, 'region': 0, 'zone': 1, 'weight': 70, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'}) - rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'weight': 70, + rb.add_dev({'id': 6, 'region': 0, 'zone': 1, 'weight': 70, 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdb'}) + rb.add_dev({'id': 7, 'region': 0, 'zone': 1, 'weight': 70, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdc'}) + rb.add_dev({'id': 8, 'region': 0, 'zone': 1, 'weight': 70, + 'ip': '127.0.0.1', 'port': 10000, 'device': 'sdd'}) - # Zone 0 has more than 4/7 of the weight, so we'll need to bring - # zone 1 up to a total of 150 so it can take 3 replicas, so the - # overload should be 10/140. - self.assertApproximately(rb.get_required_overload(), 10.0 / 140) + expected = { + (0, 0): 4.117647058823529, + (0, 1): 2.8823529411764706, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + + expected = { + (0, 0): 4.0, + (0, 1): 3.0, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + # I guess 2.88 => 3.0 is about a 4% increase + self.assertAlmostEqual(rb.get_required_overload(), + 0.040816326530612256) + + # ... 
10% is plenty enough here + rb.set_overload(0.1) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 2}) + + def test_small_extra_server_in_zone_with_multiple_replicas(self): + rb = ring.RingBuilder(8, 5, 1) + + # z0 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 1000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdb', 'weight': 1000}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdc', 'weight': 1000}) + + # z1 + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 1000}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdb', 'weight': 1000}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdc', 'weight': 1000}) + + # z1 - extra small server + rb.add_dev({'id': 6, 'region': 0, 'zone': 1, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sda', 'weight': 50}) + + expected = { + (0, 0): 2.479338842975207, + (0, 1): 2.5206611570247937, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, {t: r for (t, r) in + weighted_replicas.items() + if len(t) == 2}) + + # dispersion is fine with this at the zone tier + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, {t: r for (t, r) in + wanted_replicas.items() + if len(t) == 2}) + + # ... but not ok with that tiny server + expected = { + '127.0.0.1': 2.479338842975207, + '127.0.0.2': 1.5206611570247937, + '127.0.0.3': 1.0, + } + self.assertEqual(expected, {t[-1]: r for (t, r) in + wanted_replicas.items() + if len(t) == 3}) + + self.assertAlmostEqual(23.2, rb.get_required_overload()) + + def test_multiple_replicas_in_zone_with_single_device(self): + rb = ring.RingBuilder(8, 5, 0) + # z0 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + # z1 + rb.add_dev({'id': 1, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sdb', 'weight': 100}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sdc', 'weight': 100}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sdd', 'weight': 100}) + + # first things first, make sure we do this right + rb.rebalance() + + # each device get's a sing replica of every part + expected = { + 0: 256, + 1: 256, + 2: 256, + 3: 256, + 4: 256, + } + self.assertEqual(expected, {d['id']: d['parts'] + for d in rb._iter_devs()}) + + # but let's make sure we're thinking about it right too + expected = { + 0: 1.0, + 1: 1.0, + 2: 1.0, + 3: 1.0, + 4: 1.0, + } + + # by weight everyone is equal + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + weighted_replicas.items() + if len(t) == 4}) + + # wanted might have liked to have fewer replicas in z1, but the + # single device in z0 limits us one replica per device + with rb.debug(): + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + wanted_replicas.items() + if len(t) == 4}) + + # even with some overload - still one 
replica per device + rb.set_overload(1.0) + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + + # when overload can not change the outcome none is required + self.assertEqual(0.0, rb.get_required_overload()) + # even though dispersion is terrible (in z1 particularly) + self.assertEqual(100.0, rb.dispersion) + + def test_one_big_guy_does_not_spoil_his_buddy(self): + rb = ring.RingBuilder(8, 3, 0) + + # z0 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 100}) + # z1 + rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sda', 'weight': 100}) + # z2 + rb.add_dev({'id': 4, 'region': 0, 'zone': 2, 'ip': '127.0.2.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'ip': '127.0.2.2', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + + # obviously d5 gets one whole replica; the other two replicas + # are split evenly among the five other devices + # (i.e. ~0.4 replicanths for each 100 units of weight) + expected = { + 0: 0.39999999999999997, + 1: 0.39999999999999997, + 2: 0.39999999999999997, + 3: 0.39999999999999997, + 4: 0.39999999999999997, + 5: 1.0, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + weighted_replicas.items() + if len(t) == 4}) + + # with no overload we get the "balanced" placement + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + + # but in reality, these devices having such disparate weights + # leads to a *terrible* balance even w/o overload! + rb.rebalance(seed=9) + self.assertEqual(rb.get_balance(), 1308.2031249999998) + + # even though part assignment is pretty reasonable + expected = { + 0: 103, + 1: 102, + 2: 103, + 3: 102, + 4: 102, + 5: 256, + } + self.assertEqual(expected, { + d['id']: d['parts'] for d in rb._iter_devs()}) + + # so whats happening is the small devices are holding *way* more + # *real* parts than their *relative* portion of the weight would + # like them too! + expected = { + 0: 1308.2031249999998, + 1: 1294.5312499999998, + 2: 1308.2031249999998, + 3: 1294.5312499999998, + 4: 1294.5312499999998, + 5: -65.0, + + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + # increasing overload moves towards one replica in each tier + rb.set_overload(0.20) + expected = { + 0: 0.48, + 1: 0.48, + 2: 0.48, + 3: 0.48, + 4: 0.30857142857142855, + 5: 0.7714285714285714, + } + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + + # ... 
and as always increasing overload makes balance *worse* + rb.rebalance(seed=17) + self.assertEqual(rb.get_balance(), 1581.6406249999998) + + # but despite the overall trend toward imbalance, in the tier + # with the huge device, the small device is trying to shed parts + # as effectively as it can (which would be useful if it was the + # only small device isolated in a tier with other huge devices + # trying to gobble up all the replicanths in the tier - see + # `test_one_small_guy_does_not_spoil_his_buddy`!) + expected = { + 0: 123, + 1: 123, + 2: 123, + 3: 123, + 4: 79, + 5: 197, + } + self.assertEqual(expected, { + d['id']: d['parts'] for d in rb._iter_devs()}) + + # *see*, at least *someones* balance is getting better! + expected = { + 0: 1581.6406249999998, + 1: 1581.6406249999998, + 2: 1581.6406249999998, + 3: 1581.6406249999998, + 4: 980.078125, + 5: -73.06640625, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + def test_one_small_guy_does_not_spoil_his_buddy(self): + rb = ring.RingBuilder(8, 3, 0) + + # z0 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + # z1 + rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + # z2 + rb.add_dev({'id': 4, 'region': 0, 'zone': 2, 'ip': '127.0.2.1', + 'port': 6000, 'device': 'sda', 'weight': 10000}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'ip': '127.0.2.2', + 'port': 6000, 'device': 'sda', 'weight': 100}) + + # it's almost like 3.0 / 5 ~= 0.6, but that one little guy get's + # his fair share + expected = { + 0: 0.5988023952095808, + 1: 0.5988023952095808, + 2: 0.5988023952095808, + 3: 0.5988023952095808, + 4: 0.5988023952095808, + 5: 0.005988023952095809, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + weighted_replicas.items() + if len(t) == 4}) + + # with no overload we get a nice balanced placement + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + rb.rebalance(seed=9) + + # part placement looks goods + expected = { + 0: 154, + 1: 153, + 2: 153, + 3: 153, + 4: 153, + 5: 2, + } + self.assertEqual(expected, { + d['id']: d['parts'] for d in rb._iter_devs()}) + + # ... balance is a little lumpy on the small guy since he wants + # one and a half parts :\ + expected = { + 0: 0.4609375000000142, + 1: -0.1914062499999858, + 2: -0.1914062499999858, + 3: -0.1914062499999858, + 4: -0.1914062499999858, + 5: 30.46875, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + self.assertEqual(rb.get_balance(), 30.46875) + + # increasing overload moves towards one replica in each tier + rb.set_overload(0.5) + expected = { + 0: 0.5232035928143712, + 1: 0.5232035928143712, + 2: 0.5232035928143712, + 3: 0.5232035928143712, + 4: 0.8982035928143712, + 5: 0.008982035928143714, + } + target_replicas = rb._build_target_replicas_by_tier() + self.assertEqual(expected, {t[-1]: r for (t, r) in + target_replicas.items() + if len(t) == 4}) + + # ... 
and as always increasing overload makes balance *worse* + rb.rebalance(seed=17) + self.assertEqual(rb.get_balance(), 95.703125) + + # but despite the overall trend toward imbalance, the little guy + # isn't really taking on many new parts! + expected = { + 0: 134, + 1: 134, + 2: 134, + 3: 133, + 4: 230, + 5: 3, + } + self.assertEqual(expected, { + d['id']: d['parts'] for d in rb._iter_devs()}) + + # *see*, at everyone's balance is getting worse *together*! + expected = { + 0: -12.585937499999986, + 1: -12.585937499999986, + 2: -12.585937499999986, + 3: -13.238281249999986, + 4: 50.0390625, + 5: 95.703125, + } + self.assertEqual(expected, rb._build_balance_per_dev()) + + def test_two_servers_with_more_than_one_replica(self): + rb = ring.RingBuilder(8, 3, 0) + # z0 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 60}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 60}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sda', 'weight': 60}) + # z1 + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sda', 'weight': 80}) + rb.add_dev({'id': 4, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sda', 'weight': 128}) + # z2 + rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'ip': '127.0.2.1', + 'port': 6000, 'device': 'sda', 'weight': 80}) + rb.add_dev({'id': 6, 'region': 0, 'zone': 2, 'ip': '127.0.2.2', + 'port': 6000, 'device': 'sda', 'weight': 240}) + + rb.set_overload(0.1) + rb.rebalance() + self.assertEqual(12.161458333333343, rb.get_balance()) + + replica_plan = rb._build_target_replicas_by_tier() + for dev in rb._iter_devs(): + tier = (dev['region'], dev['zone'], dev['ip'], dev['id']) + expected_parts = replica_plan[tier] * rb.parts + self.assertAlmostEqual(dev['parts'], expected_parts, + delta=1) + + def test_multi_zone_with_failed_device(self): + rb = ring.RingBuilder(8, 3, 1) + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + + rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + + rb.add_dev({'id': 4, 'region': 0, 'zone': 2, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 2, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + + # sanity, balanced and dispersed + expected = { + (0, 0): 1.0, + (0, 1): 1.0, + (0, 2): 1.0, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 2}) + + self.assertEqual(rb.get_required_overload(), 0.0) + + # fail a device in zone 2 + rb.remove_dev(4) + + expected = { + 0: 0.6, + 1: 0.6, + 2: 0.6, + 3: 0.6, + 5: 0.6, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 4}) + + 
expected = { + 0: 0.5, + 1: 0.5, + 2: 0.5, + 3: 0.5, + 5: 1.0, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 4}) + + # does this make sense? every zone was holding 1/3rd of the + # replicas, so each device was 1/6th, remove a device and + # suddenly it's holding *both* sixths which is 2/3rds? + self.assertAlmostEqual(rb.get_required_overload(), 2.0 / 3.0) + + # 10% isn't nearly enough + rb.set_overload(0.1) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + 0: 0.585, + 1: 0.585, + 2: 0.585, + 3: 0.585, + 5: 0.6599999999999999, + } + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + # 50% isn't even enough + rb.set_overload(0.5) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + 0: 0.525, + 1: 0.525, + 2: 0.525, + 3: 0.525, + 5: 0.8999999999999999, + } + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + # even 65% isn't enough (but it's getting closer) + rb.set_overload(0.65) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + 0: 0.5025000000000001, + 1: 0.5025000000000001, + 2: 0.5025000000000001, + 3: 0.5025000000000001, + 5: 0.99, + } + self.assertEqual(expected, + {tier[3]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 4}) + + def test_balanced_zones_unbalanced_servers(self): + rb = ring.RingBuilder(8, 3, 1) + # zone 0 server 127.0.0.1 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 3000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdb', 'weight': 3000}) + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 3000}) + # zone 1 server 127.0.0.2 + rb.add_dev({'id': 4, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 4000}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 1, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdb', 'weight': 4000}) + # zone 1 (again) server 127.0.0.3 + rb.add_dev({'id': 6, 'region': 0, 'zone': 1, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sda', 'weight': 1000}) + + weighted_replicas = rb._build_weighted_replicas_by_tier() + + # zones are evenly weighted + expected = { + (0, 0): 1.5, + (0, 1): 1.5, + } + self.assertEqual(expected, + {tier: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 2}) + + # ... 
but servers are not + expected = { + '127.0.0.1': 1.5, + '127.0.0.2': 1.3333333333333333, + '127.0.0.3': 0.16666666666666666, + } + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 3}) + + # make sure wanted will even it out + expected = { + '127.0.0.1': 1.5, + '127.0.0.2': 1.0, + '127.0.0.3': 0.4999999999999999, + } + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 3}) + + # so it wants 1/6th and eats 1/2 - that's 2/6ths more than it + # wants which is a 200% increase + self.assertAlmostEqual(rb.get_required_overload(), 2.0) + + # the overload doesn't effect the tiers that are already dispersed + rb.set_overload(1) + target_replicas = rb._build_target_replicas_by_tier() + expected = { + '127.0.0.1': 1.5, + # notice with half the overload 1/6th replicanth swapped servers + '127.0.0.2': 1.1666666666666665, + '127.0.0.3': 0.3333333333333333, + } + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in target_replicas.items() + if len(tier) == 3}) + + def test_adding_second_zone(self): + rb = ring.RingBuilder(3, 3, 1) + # zone 0 server 127.0.0.1 + rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + # zone 0 server 127.0.0.2 + rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'ip': '127.0.0.2', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + # zone 0 server 127.0.0.3 + rb.add_dev({'id': 4, 'region': 0, 'zone': 0, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sda', 'weight': 2000}) + rb.add_dev({'id': 5, 'region': 0, 'zone': 0, 'ip': '127.0.0.3', + 'port': 6000, 'device': 'sdb', 'weight': 2000}) + + # sanity, balanced and dispersed + expected = { + '127.0.0.1': 1.0, + '127.0.0.2': 1.0, + '127.0.0.3': 1.0, + } + weighted_replicas = rb._build_weighted_replicas_by_tier() + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in weighted_replicas.items() + if len(tier) == 3}) + wanted_replicas = rb._build_wanted_replicas_by_tier() + self.assertEqual(expected, + {tier[2]: weighted + for (tier, weighted) in wanted_replicas.items() + if len(tier) == 3}) + + self.assertEqual(rb.get_required_overload(), 0) + + # start adding a second zone + + # zone 1 server 127.0.1.1 + rb.add_dev({'id': 6, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 7, 'region': 0, 'zone': 1, 'ip': '127.0.1.1', + 'port': 6000, 'device': 'sdb', 'weight': 100}) + # zone 1 server 127.0.1.2 + rb.add_dev({'id': 8, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 9, 'region': 0, 'zone': 1, 'ip': '127.0.1.2', + 'port': 6000, 'device': 'sdb', 'weight': 100}) + # zone 1 server 127.0.1.3 + rb.add_dev({'id': 10, 'region': 0, 'zone': 1, 'ip': '127.0.1.3', + 'port': 6000, 'device': 'sda', 'weight': 100}) + rb.add_dev({'id': 11, 'region': 0, 'zone': 1, 'ip': '127.0.1.3', + 'port': 6000, 'device': 'sdb', 'weight': 100}) + + # this messes things up pretty royally + expected = { + '127.0.0.1': 0.9523809523809523, + '127.0.0.2': 0.9523809523809523, + '127.0.0.3': 0.9523809523809523, + '127.0.1.1': 
0.047619047619047616,
+            '127.0.1.2': 0.047619047619047616,
+            '127.0.1.3': 0.047619047619047616,
+        }
+        weighted_replicas = rb._build_weighted_replicas_by_tier()
+        self.assertEqual(expected,
+                         {tier[2]: weighted
+                          for (tier, weighted) in weighted_replicas.items()
+                          if len(tier) == 3})
+        expected = {
+            '127.0.0.1': 0.6666666666666667,
+            '127.0.0.2': 0.6666666666666667,
+            '127.0.0.3': 0.6666666666666667,
+            '127.0.1.1': 0.3333333333333333,
+            '127.0.1.2': 0.3333333333333333,
+            '127.0.1.3': 0.3333333333333333,
+        }
+        wanted_replicas = rb._build_wanted_replicas_by_tier()
+        self.assertEqual(expected,
+                         {tier[2]: weighted
+                          for (tier, weighted) in wanted_replicas.items()
+                          if len(tier) == 3})
+
+        # so dispersion would require these devices hold 6x more than
+        # prescribed by weight, defeating any attempt at doing anything
+        # gradually
+        self.assertAlmostEqual(rb.get_required_overload(), 6.0)
+
+        # so let's suppose we only allow for 10% overload
+        rb.set_overload(0.10)
+        target_replicas = rb._build_target_replicas_by_tier()
+
+        expected = {
+            # we expect servers in zone 0 to be between 0.952 and 0.666
+            '127.0.0.1': 0.9476190476190476,
+            '127.0.0.2': 0.9476190476190476,
+            '127.0.0.3': 0.9476190476190476,
+            # we expect servers in zone 1 to be between 0.0476 and 0.333
+            # and in fact it's a ~10% increase (very little compared to 6x!)
+            '127.0.1.1': 0.052380952380952375,
+            '127.0.1.2': 0.052380952380952375,
+            '127.0.1.3': 0.052380952380952375,
+        }
+        self.assertEqual(expected,
+                         {tier[2]: weighted
+                          for (tier, weighted) in target_replicas.items()
+                          if len(tier) == 3})
+
+    def test_gradual_replica_count(self):
+        rb = ring.RingBuilder(3, 2.5, 1)
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'ip': '127.0.0.1',
+                    'port': 6000, 'device': 'sda', 'weight': 2000})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'ip': '127.0.0.1',
+                    'port': 6000, 'device': 'sdb', 'weight': 2000})
+        rb.add_dev({'id': 2, 'region': 0, 'zone': 0, 'ip': '127.0.0.2',
+                    'port': 6000, 'device': 'sda', 'weight': 2000})
+        rb.add_dev({'id': 3, 'region': 0, 'zone': 0, 'ip': '127.0.0.2',
+                    'port': 6000, 'device': 'sdb', 'weight': 2000})
+
+        expected = {
+            0: 0.625,
+            1: 0.625,
+            2: 0.625,
+            3: 0.625,
+        }
+
+        weighted_replicas = rb._build_weighted_replicas_by_tier()
+        self.assertEqual(expected, {
+            tier[3]: weighted
+            for (tier, weighted) in weighted_replicas.items()
+            if len(tier) == 4})
+        wanted_replicas = rb._build_wanted_replicas_by_tier()
+        self.assertEqual(expected, {
+            tier[3]: wanted
+            for (tier, wanted) in wanted_replicas.items()
+            if len(tier) == 4})
+
+        self.assertEqual(rb.get_required_overload(), 0)
+
+        # server 127.0.0.2 will have only one device
+        rb.remove_dev(2)
+
+        # server 127.0.0.1 has twice the capacity of 127.0.0.2
+        expected = {
+            '127.0.0.1': 1.6666666666666667,
+            '127.0.0.2': 0.8333333333333334,
+        }
+        weighted_replicas = rb._build_weighted_replicas_by_tier()
+        self.assertEqual(expected, {
+            tier[2]: weighted
+            for (tier, weighted) in weighted_replicas.items()
+            if len(tier) == 3})
+
+        # dispersion requirements extend only to whole replicas
+        expected = {
+            '127.0.0.1': 1.4999999999999998,
+            '127.0.0.2': 1.0,
+        }
+        wanted_replicas = rb._build_wanted_replicas_by_tier()
+        self.assertEqual(expected, {
+            tier[2]: wanted
+            for (tier, wanted) in wanted_replicas.items()
+            if len(tier) == 3})
+
+        # 5/6ths to a whole replicanth is a 20% increase
+        self.assertAlmostEqual(rb.get_required_overload(), 0.2)
+
+        # so let's suppose we only allow for 10% overload
+        rb.set_overload(0.1)
+        target_replicas = rb._build_target_replicas_by_tier()
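Where the 10% targets checked next come from can be seen with a back-of-the-envelope calculation. This is only a sketch of the idea (an assumed simplification, not the builder's actual code path): the under-weighted server may grow from its weighted share by at most the overload factor, capped by what dispersion wants, and whatever is left of the 2.5 replicas stays on the bigger server.

    replicas = 2.5
    overload = 0.1
    weighted = {'127.0.0.1': 5.0 / 3, '127.0.0.2': 5.0 / 6}  # ~1.667 / ~0.833
    wanted = {'127.0.0.1': 1.5, '127.0.0.2': 1.0}

    # let the smaller server grow by at most 10% over its weight share
    grown = weighted['127.0.0.2'] * (1 + overload)
    target_small = min(wanted['127.0.0.2'], grown)   # ~0.916667
    target_big = replicas - target_small             # ~1.583333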
+        expected = {
+            '127.0.0.1': 1.5833333333333333,
+            '127.0.0.2': 0.9166666666666667,
+        }
+        self.assertEqual(expected, {
+            tier[2]: wanted
+            for (tier, wanted) in target_replicas.items()
+            if len(tier) == 3})
+
+    def test_perfect_four_zone_four_replica_bad_placement(self):
+        rb = ring.RingBuilder(4, 4, 1)
+
+        # this weight is sorta nuts, but it's really just to help the
+        # weight_of_one_part hit a magic number where floats mess up
+        # like they would on a ring with a part power of 19 and 100's of
+        # 1000's of units of weight.
+        weight = 21739130434795e-11
+
+        # r0z0
+        rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': weight,
+                    'ip': '127.0.0.1', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 1, 'region': 0, 'zone': 0, 'weight': weight,
+                    'ip': '127.0.0.2', 'port': 10000, 'device': 'sdb'})
+        # r0z1
+        rb.add_dev({'id': 2, 'region': 0, 'zone': 1, 'weight': weight,
+                    'ip': '127.0.1.1', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 3, 'region': 0, 'zone': 1, 'weight': weight,
+                    'ip': '127.0.1.2', 'port': 10000, 'device': 'sdb'})
+        # r1z0
+        rb.add_dev({'id': 4, 'region': 1, 'zone': 0, 'weight': weight,
+                    'ip': '127.1.0.1', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 5, 'region': 1, 'zone': 0, 'weight': weight,
+                    'ip': '127.1.0.2', 'port': 10000, 'device': 'sdb'})
+        # r1z1
+        rb.add_dev({'id': 6, 'region': 1, 'zone': 1, 'weight': weight,
+                    'ip': '127.1.1.1', 'port': 10000, 'device': 'sda'})
+        rb.add_dev({'id': 7, 'region': 1, 'zone': 1, 'weight': weight,
+                    'ip': '127.1.1.2', 'port': 10000, 'device': 'sdb'})
+
+        # the replica plan is sound
+        expectations = {
+            # tier_len => expected replicas
+            1: {
+                (0,): 2.0,
+                (1,): 2.0,
+            },
+            2: {
+                (0, 0): 1.0,
+                (0, 1): 1.0,
+                (1, 0): 1.0,
+                (1, 1): 1.0,
+            }
+        }
+        wr = rb._build_replica_plan()
+        for tier_len, expected in expectations.items():
+            self.assertEqual(expected, {t: r['max'] for (t, r) in
+                                        wr.items() if len(t) == tier_len})
+
+        # even though a naive ceil of weights is surprisingly wrong
+        expectations = {
+            # tier_len => expected replicas
+            1: {
+                (0,): 3.0,
+                (1,): 3.0,
+            },
+            2: {
+                (0, 0): 2.0,
+                (0, 1): 2.0,
+                (1, 0): 2.0,
+                (1, 1): 2.0,
+            }
+        }
+        wr = rb._build_weighted_replicas_by_tier()
+        for tier_len, expected in expectations.items():
+            self.assertEqual(expected, {t: ceil(r) for (t, r) in
+                                        wr.items() if len(t) == tier_len})

 if __name__ == '__main__':
diff --git a/test/unit/common/ring/test_ring.py b/test/unit/common/ring/test_ring.py
index 90df6ce4ce..a492b44bd4 100644
--- a/test/unit/common/ring/test_ring.py
+++ b/test/unit/common/ring/test_ring.py
@@ -457,39 +457,42 @@ class TestRing(TestRingBase):
         # Yes, these tests are deliberately very fragile. We want to make sure
         # that if someone changes the results the ring produces, they know it.
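These canned values are meant to break loudly when placement changes; when such a change is intentional, it is easier to regenerate them from a freshly built ring than to patch them by hand. A hypothetical helper along these lines (the function name and output format are illustrative only, not part of this change):

    def dump_ring_expectations(r, part):
        # r: a swift.common.ring.Ring; part: the partition the test exercises
        primaries = r.get_part_nodes(part)
        print('exp_devs = %r' % [d['id'] for d in primaries])
        print('exp_zones = %r' % set(d['zone'] for d in primaries))
        print('exp_handoffs = %r' % [d['id'] for d in r.get_more_nodes(part)])
        print('exp_first_handoffs = %r' % [
            next(r.get_more_nodes(p))['id'] for p in range(r.partition_count)])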
exp_part = 6 - exp_devs = [48, 93, 96] - exp_zones = set([5, 8, 9]) + exp_devs = [71, 77, 30] + exp_zones = set([6, 3, 7]) + + exp_handoffs = [99, 43, 94, 13, 1, 49, 60, 72, 27, 68, 78, 26, 21, 9, + 51, 105, 47, 89, 65, 82, 34, 98, 38, 85, 16, 4, 59, + 102, 40, 90, 20, 8, 54, 66, 80, 25, 14, 2, 50, 12, 0, + 48, 70, 76, 32, 107, 45, 87, 101, 44, 93, 100, 42, 95, + 106, 46, 88, 97, 37, 86, 96, 36, 84, 17, 5, 57, 63, + 81, 33, 67, 79, 24, 15, 3, 58, 69, 75, 31, 61, 74, 29, + 23, 10, 52, 22, 11, 53, 64, 83, 35, 62, 73, 28, 18, 6, + 56, 104, 39, 91, 103, 41, 92, 19, 7, 55] + + exp_first_handoffs = [23, 64, 105, 102, 67, 17, 99, 65, 69, 97, 15, + 17, 24, 98, 66, 65, 69, 18, 104, 105, 16, 107, + 100, 15, 14, 19, 102, 105, 63, 104, 99, 12, 107, + 99, 16, 105, 71, 15, 15, 63, 63, 99, 21, 68, 20, + 64, 96, 21, 98, 19, 68, 99, 15, 69, 62, 100, 96, + 102, 17, 62, 13, 61, 102, 105, 22, 16, 21, 18, + 21, 100, 20, 16, 21, 106, 66, 106, 16, 99, 16, + 22, 62, 60, 99, 69, 18, 23, 104, 98, 106, 61, + 21, 23, 23, 16, 67, 71, 101, 16, 64, 66, 70, 15, + 102, 63, 19, 98, 18, 106, 101, 100, 62, 63, 98, + 18, 13, 97, 23, 22, 100, 13, 14, 67, 96, 14, + 105, 97, 71, 64, 96, 22, 65, 66, 98, 19, 105, + 98, 97, 21, 15, 69, 100, 98, 106, 65, 66, 97, + 62, 22, 68, 63, 61, 67, 67, 20, 105, 106, 105, + 18, 71, 100, 17, 62, 60, 13, 103, 99, 101, 96, + 97, 16, 60, 21, 14, 20, 12, 60, 69, 104, 65, 65, + 17, 16, 67, 13, 64, 15, 16, 68, 96, 21, 104, 66, + 96, 105, 58, 105, 103, 21, 96, 60, 16, 96, 21, + 71, 16, 99, 101, 63, 62, 103, 18, 102, 60, 17, + 19, 106, 97, 14, 99, 68, 102, 13, 70, 103, 21, + 22, 19, 61, 103, 23, 104, 65, 62, 68, 16, 65, + 15, 102, 102, 71, 99, 63, 67, 19, 23, 15, 69, + 107, 14, 13, 64, 13, 105, 15, 98, 69] - exp_handoffs = [11, 47, 25, 76, 69, 23, 99, 59, 106, 64, 43, 34, 88, 3, - 30, 83, 16, 27, 103, 39, 60, 0, 8, 72, 56, 19, 91, 13, - 84, 38, 66, 52, 78, 107, 50, 57, 31, 32, 77, 24, 42, - 100, 71, 26, 9, 20, 35, 5, 14, 94, 28, 41, 18, 102, - 101, 61, 95, 21, 81, 1, 105, 58, 74, 90, 86, 46, 4, 68, - 40, 80, 54, 75, 45, 79, 44, 49, 62, 29, 7, 15, 70, 87, - 65, 12, 82, 17, 104, 97, 55, 22, 6, 89, 2, 67, 37, 63, - 53, 92, 33, 85, 73, 51, 98, 36, 10] - exp_first_handoffs = [1, 37, 48, 68, 84, 75, 11, 101, 14, 73, 100, 75, - 29, 19, 18, 101, 15, 99, 95, 24, 46, 82, 73, 62, - 24, 89, 9, 22, 107, 74, 54, 63, 40, 106, 99, 83, - 64, 73, 73, 106, 106, 80, 6, 25, 20, 33, 6, 79, - 59, 42, 62, 24, 14, 107, 28, 0, 85, 5, 4, 12, 58, - 11, 92, 18, 36, 56, 86, 1, 21, 33, 80, 97, 4, 81, - 79, 76, 89, 50, 75, 27, 7, 96, 47, 55, 81, 104, - 12, 5, 18, 106, 27, 93, 39, 92, 42, 30, 20, 88, - 58, 105, 65, 29, 17, 52, 11, 106, 7, 24, 21, 91, - 62, 52, 50, 31, 77, 102, 19, 11, 8, 58, 53, 20, - 26, 8, 18, 82, 48, 68, 82, 89, 101, 50, 3, 52, - 46, 11, 2, 30, 79, 66, 4, 61, 3, 56, 45, 102, 73, - 84, 36, 19, 34, 84, 49, 40, 103, 66, 31, 33, 93, - 33, 4, 52, 26, 58, 30, 47, 100, 57, 40, 79, 33, - 107, 24, 20, 44, 4, 7, 59, 83, 101, 1, 56, 20, - 61, 33, 16, 5, 74, 98, 4, 80, 15, 104, 52, 73, - 18, 67, 75, 98, 73, 79, 68, 75, 27, 91, 36, 100, - 52, 95, 37, 46, 70, 14, 47, 3, 70, 23, 40, 105, - 62, 86, 48, 22, 54, 4, 72, 81, 13, 0, 18, 98, - 101, 36, 29, 24, 39, 79, 97, 105, 28, 107, 47, - 52, 101, 20, 22, 29, 65, 27, 7, 33, 64, 101, 60, - 19, 55] rb = ring.RingBuilder(8, 3, 1) next_dev_id = 0 for zone in range(1, 10): @@ -501,16 +504,27 @@ class TestRing(TestRingBase): 'zone': zone, 'region': 0, 'weight': 1.0}) next_dev_id += 1 - rb.rebalance(seed=1) + rb.rebalance(seed=2) rb.get_ring().save(self.testgz) r = ring.Ring(self.testdir, 
ring_name='whatever') + + # every part has the same number of handoffs + part_handoff_counts = set() + for part in range(r.partition_count): + part_handoff_counts.add(len(list(r.get_more_nodes(part)))) + self.assertEqual(part_handoff_counts, {105}) + # which less the primaries - is every device in the ring + self.assertEqual(len(list(rb._iter_devs())) - rb.replicas, 105) + part, devs = r.get_nodes('a', 'c', 'o') primary_zones = set([d['zone'] for d in devs]) self.assertEqual(part, exp_part) self.assertEqual([d['id'] for d in devs], exp_devs) self.assertEqual(primary_zones, exp_zones) devs = list(r.get_more_nodes(part)) - self.assertEqual([d['id'] for d in devs], exp_handoffs) + self.assertEqual(len(devs), len(exp_handoffs)) + dev_ids = [d['id'] for d in devs] + self.assertEqual(dev_ids, exp_handoffs) # The first 6 replicas plus the 3 primary nodes should cover all 9 # zones in this test @@ -531,11 +545,22 @@ class TestRing(TestRingBase): 'ip': '1.2.%d.%d' % (zone, server), 'port': 1234, 'zone': zone, 'region': 0, 'weight': 1.0}) next_dev_id += 1 - rb.rebalance(seed=1) + rb.pretend_min_part_hours_passed() + num_parts_changed, _balance, _removed_dev = rb.rebalance(seed=2) rb.get_ring().save(self.testgz) r = ring.Ring(self.testdir, ring_name='whatever') - # We would change expectations here, but in this test no handoffs - # changed at all. + + # so now we expect the device list to be longer by one device + part_handoff_counts = set() + for part in range(r.partition_count): + part_handoff_counts.add(len(list(r.get_more_nodes(part)))) + self.assertEqual(part_handoff_counts, {106}) + self.assertEqual(len(list(rb._iter_devs())) - rb.replicas, 106) + # I don't think there's any special reason this dev goes at this index + exp_handoffs.insert(27, rb.devs[-1]['id']) + + # We would change expectations here, but in this part only the added + # device changed at all. part, devs = r.get_nodes('a', 'c', 'o') primary_zones = set([d['zone'] for d in devs]) self.assertEqual(part, exp_part) @@ -555,36 +580,60 @@ class TestRing(TestRingBase): seen_zones.update([d['zone'] for d in devs[:6]]) self.assertEqual(seen_zones, set(range(1, 10))) + # Change expectations for the rest of the parts devs = [] for part in range(r.partition_count): devs.append(next(r.get_more_nodes(part))['id']) + changed_first_handoff = 0 for part in range(r.partition_count): - self.assertEqual( - devs[part], exp_first_handoffs[part], - 'handoff for partitition %d is now device id %d' % ( - part, devs[part])) + if devs[part] != exp_first_handoffs[part]: + changed_first_handoff += 1 + exp_first_handoffs[part] = devs[part] + self.assertEqual(devs, exp_first_handoffs) + self.assertEqual(changed_first_handoff, num_parts_changed) - # Remove a device. + # Remove a device - no need to fluff min_part_hours. rb.remove_dev(0) - rb.rebalance(seed=1) + num_parts_changed, _balance, _removed_dev = rb.rebalance(seed=1) rb.get_ring().save(self.testgz) r = ring.Ring(self.testdir, ring_name='whatever') - # Change expectations - # The long string of handoff nodes for the partition were the same for - # the first 20, which is pretty good. 
- exp_handoffs[20:] = [60, 108, 8, 72, 56, 19, 91, 13, 84, 38, 66, 52, - 1, 78, 107, 50, 57, 31, 32, 77, 24, 42, 100, 71, - 26, 9, 20, 35, 5, 14, 94, 28, 41, 18, 102, 101, - 61, 95, 21, 81, 105, 58, 74, 90, 86, 46, 4, 68, - 40, 80, 54, 75, 45, 79, 44, 49, 62, 29, 7, 15, 70, - 87, 65, 12, 82, 17, 104, 97, 55, 22, 6, 89, 2, 67, - 37, 63, 53, 92, 33, 85, 73, 51, 98, 36, 10] - # Just a few of the first handoffs changed - exp_first_handoffs[3] = 68 - exp_first_handoffs[55] = 104 - exp_first_handoffs[116] = 6 - exp_first_handoffs[181] = 15 - exp_first_handoffs[228] = 38 + + # so now we expect the device list to be shorter by one device + part_handoff_counts = set() + for part in range(r.partition_count): + part_handoff_counts.add(len(list(r.get_more_nodes(part)))) + self.assertEqual(part_handoff_counts, {105}) + self.assertEqual(len(list(rb._iter_devs())) - rb.replicas, 105) + + # Change expectations for our part + exp_handoffs.remove(0) + first_matches = 0 + total_changed = 0 + devs = list(d['id'] for d in r.get_more_nodes(exp_part)) + for i, part in enumerate(devs): + if exp_handoffs[i] != devs[i]: + total_changed += 1 + exp_handoffs[i] = devs[i] + if not total_changed: + first_matches += 1 + self.assertEqual(devs, exp_handoffs) + # the first 21 handoffs were the same across the rebalance + self.assertEqual(first_matches, 21) + # but as you dig deeper some of the differences show up + self.assertEqual(total_changed, 41) + + # Change expectations for the rest of the parts + devs = [] + for part in range(r.partition_count): + devs.append(next(r.get_more_nodes(part))['id']) + changed_first_handoff = 0 + for part in range(r.partition_count): + if devs[part] != exp_first_handoffs[part]: + changed_first_handoff += 1 + exp_first_handoffs[part] = devs[part] + self.assertEqual(devs, exp_first_handoffs) + self.assertEqual(changed_first_handoff, num_parts_changed) + # Test part, devs = r.get_nodes('a', 'c', 'o') primary_zones = set([d['zone'] for d in devs]) @@ -615,56 +664,48 @@ class TestRing(TestRingBase): # Add a partial replica rb.set_replicas(3.5) - rb.rebalance(seed=1) + num_parts_changed, _balance, _removed_dev = rb.rebalance(seed=164) rb.get_ring().save(self.testgz) r = ring.Ring(self.testdir, ring_name='whatever') + # Change expectations + # We have another replica now - exp_devs.append(47) - exp_zones.add(4) + exp_devs.append(90) + exp_zones.add(8) + # and therefore one less handoff + exp_handoffs = exp_handoffs[:-1] # Caused some major changes in the sequence of handoffs for our test # partition, but at least the first stayed the same. - exp_handoffs[1:] = [81, 25, 69, 23, 99, 59, 76, 3, 106, 64, 43, 13, 34, - 88, 30, 16, 27, 103, 39, 74, 60, 108, 8, 56, 19, - 91, 52, 84, 38, 66, 1, 78, 45, 107, 50, 57, 83, 31, - 46, 32, 77, 24, 42, 63, 100, 72, 71, 7, 26, 9, 20, - 35, 5, 87, 14, 94, 62, 28, 41, 90, 18, 82, 102, 22, - 101, 61, 85, 95, 21, 98, 67, 105, 58, 86, 4, 79, - 68, 40, 80, 54, 75, 44, 49, 6, 29, 15, 70, 65, 12, - 17, 104, 97, 55, 89, 2, 37, 53, 92, 33, 73, 51, 36, - 10] + devs = list(d['id'] for d in r.get_more_nodes(exp_part)) + first_matches = 0 + total_changed = 0 + for i, part in enumerate(devs): + if exp_handoffs[i] != devs[i]: + total_changed += 1 + exp_handoffs[i] = devs[i] + if not total_changed: + first_matches += 1 + # most seeds seem to throw out first handoff stabilization with + # replica_count change + self.assertEqual(first_matches, 2) + # and lots of other handoff changes... 
+ self.assertEqual(total_changed, 95) + + self.assertEqual(devs, exp_handoffs) + + # Change expectations for the rest of the parts + devs = [] + for part in range(r.partition_count): + devs.append(next(r.get_more_nodes(part))['id']) + changed_first_handoff = 0 + for part in range(r.partition_count): + if devs[part] != exp_first_handoffs[part]: + changed_first_handoff += 1 + exp_first_handoffs[part] = devs[part] + self.assertEqual(devs, exp_first_handoffs) + self.assertLessEqual(changed_first_handoff, num_parts_changed) - # Lots of first handoffs changed, but 30 of 256 is still just 11.72%. - exp_first_handoffs[1] = 6 - exp_first_handoffs[4] = 104 - exp_first_handoffs[11] = 106 - exp_first_handoffs[17] = 13 - exp_first_handoffs[21] = 77 - exp_first_handoffs[22] = 95 - exp_first_handoffs[27] = 46 - exp_first_handoffs[29] = 65 - exp_first_handoffs[30] = 3 - exp_first_handoffs[31] = 20 - exp_first_handoffs[51] = 50 - exp_first_handoffs[53] = 8 - exp_first_handoffs[54] = 2 - exp_first_handoffs[72] = 107 - exp_first_handoffs[79] = 72 - exp_first_handoffs[85] = 71 - exp_first_handoffs[88] = 66 - exp_first_handoffs[92] = 29 - exp_first_handoffs[93] = 46 - exp_first_handoffs[96] = 38 - exp_first_handoffs[101] = 57 - exp_first_handoffs[103] = 87 - exp_first_handoffs[104] = 28 - exp_first_handoffs[107] = 1 - exp_first_handoffs[109] = 69 - exp_first_handoffs[110] = 50 - exp_first_handoffs[111] = 76 - exp_first_handoffs[115] = 47 - exp_first_handoffs[117] = 48 - exp_first_handoffs[119] = 7 # Test part, devs = r.get_nodes('a', 'c', 'o') primary_zones = set([d['zone'] for d in devs]) @@ -696,17 +737,16 @@ class TestRing(TestRingBase): # One last test of a partial replica partition exp_part2 = 136 - exp_devs2 = [52, 76, 97] - exp_zones2 = set([9, 5, 7]) - exp_handoffs2 = [2, 67, 37, 92, 33, 23, 107, 63, 44, 103, 108, 85, - 73, 10, 89, 80, 4, 17, 49, 32, 12, 41, 58, 20, 25, - 61, 94, 47, 69, 56, 101, 28, 83, 8, 96, 53, 51, 42, - 98, 35, 36, 84, 43, 104, 31, 65, 1, 40, 9, 74, 95, - 45, 5, 71, 86, 78, 30, 93, 48, 91, 15, 88, 39, 18, - 57, 72, 70, 27, 54, 16, 24, 21, 14, 11, 77, 62, 50, - 6, 105, 26, 55, 29, 60, 34, 13, 87, 59, 38, 99, 75, - 106, 3, 82, 66, 79, 7, 46, 64, 81, 22, 68, 19, 102, - 90, 100] + exp_devs2 = [70, 76, 32] + exp_zones2 = set([3, 6, 7]) + exp_handoffs2 = [89, 97, 37, 53, 20, 1, 86, 64, 102, 40, 90, 60, 72, + 27, 99, 68, 78, 26, 105, 45, 42, 95, 22, 13, 49, 55, + 11, 8, 83, 16, 4, 59, 33, 108, 61, 74, 29, 88, 66, + 80, 25, 100, 39, 67, 79, 24, 65, 96, 36, 84, 54, 21, + 63, 81, 56, 71, 77, 30, 48, 23, 10, 52, 82, 34, 17, + 107, 87, 104, 5, 35, 2, 50, 43, 62, 73, 28, 18, 14, + 98, 38, 85, 15, 57, 9, 51, 12, 6, 91, 3, 103, 41, 92, + 47, 75, 44, 69, 101, 93, 106, 46, 94, 31, 19, 7, 58] part2, devs2 = r.get_nodes('a', 'c', 'o2') primary_zones2 = set([d['zone'] for d in devs2]) @@ -764,14 +804,15 @@ class TestRing(TestRingBase): # Here's a brittle canary-in-the-coalmine test to make sure the region # handoff computation didn't change accidentally - exp_handoffs = [111, 112, 74, 54, 93, 31, 2, 43, 100, 22, 71, 92, 35, - 9, 50, 41, 76, 80, 84, 88, 17, 96, 6, 102, 37, 29, - 105, 5, 47, 20, 13, 108, 66, 81, 53, 65, 25, 58, 32, - 94, 101, 1, 10, 44, 73, 75, 21, 97, 28, 106, 30, 16, - 39, 77, 42, 72, 34, 99, 14, 61, 90, 4, 40, 3, 45, 62, - 7, 15, 87, 12, 83, 89, 33, 98, 49, 107, 56, 86, 48, - 57, 24, 11, 23, 26, 46, 64, 69, 38, 36, 79, 63, 104, - 51, 70, 82, 67, 68, 8, 95, 91, 55, 59, 85] + exp_handoffs = [111, 112, 35, 58, 62, 74, 20, 105, 41, 90, 53, 6, 3, + 67, 55, 76, 108, 32, 12, 80, 38, 85, 
94, 42, 27, 99, + 50, 47, 70, 87, 26, 9, 15, 97, 102, 81, 23, 65, 33, + 77, 34, 4, 75, 8, 5, 30, 13, 73, 36, 92, 54, 51, 72, + 78, 66, 1, 48, 14, 93, 95, 88, 86, 84, 106, 60, 101, + 57, 43, 89, 59, 79, 46, 61, 52, 44, 45, 37, 68, 25, + 100, 49, 24, 16, 71, 96, 21, 107, 98, 64, 39, 18, 29, + 103, 91, 22, 63, 69, 28, 56, 11, 82, 10, 17, 19, 7, + 40, 83, 104, 31] dev_ids = [d['id'] for d in more_devs] self.assertEqual(len(dev_ids), len(exp_handoffs)) diff --git a/test/unit/common/ring/test_utils.py b/test/unit/common/ring/test_utils.py index 3cba0fb512..705d619b9b 100644 --- a/test/unit/common/ring/test_utils.py +++ b/test/unit/common/ring/test_utils.py @@ -692,10 +692,10 @@ class TestUtils(unittest.TestCase): rb.rebalance(seed=100) rb.validate() - self.assertEqual(rb.dispersion, 39.0625) + self.assertEqual(rb.dispersion, 39.84375) report = dispersion_report(rb) self.assertEqual(report['worst_tier'], 'r1z1') - self.assertEqual(report['max_dispersion'], 39.0625) + self.assertEqual(report['max_dispersion'], 39.84375) def build_tier_report(max_replicas, placed_parts, dispersion, replicas): @@ -711,11 +711,11 @@ class TestUtils(unittest.TestCase): # zone 1 are stored at least twice on the nodes expected = [ ['r1z1', build_tier_report( - 2, 256, 39.0625, [0, 0, 156, 100])], + 2, 256, 39.84375, [0, 0, 154, 102])], ['r1z1-127.0.0.1', build_tier_report( - 1, 256, 19.53125, [0, 206, 50, 0])], + 1, 256, 19.921875, [0, 205, 51, 0])], ['r1z1-127.0.0.2', build_tier_report( - 1, 256, 19.53125, [0, 206, 50, 0])], + 1, 256, 19.921875, [0, 205, 51, 0])], ] report = dispersion_report(rb, 'r1z1[^/]*$', verbose=True) graph = report['graph'] @@ -735,12 +735,18 @@ class TestUtils(unittest.TestCase): 'ip': '127.0.0.3', 'port': 10003, 'device': 'sdc1'}) rb.add_dev({'id': 15, 'region': 1, 'zone': 0, 'weight': 500, 'ip': '127.0.0.3', 'port': 10003, 'device': 'sdd1'}) - rb.rebalance(seed=10) - report = dispersion_report(rb) - self.assertEqual(rb.dispersion, 44.53125) + # when the biggest tier has the smallest devices things get ugly + rb.rebalance(seed=100) + report = dispersion_report(rb, verbose=True) + self.assertEqual(rb.dispersion, 70.3125) self.assertEqual(report['worst_tier'], 'r1z0-127.0.0.3') - self.assertEqual(report['max_dispersion'], 32.520325203252035) + self.assertEqual(report['max_dispersion'], 88.23529411764706) + + # ... but overload can square it + rb.set_overload(rb.get_required_overload()) + rb.rebalance() + self.assertEqual(rb.dispersion, 0.0) def test_parse_address_old_format(self): # Test old format