# Copyright (c) 2010-2012 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import bisect
import itertools
import math
import random
import cPickle as pickle

from array import array
from collections import defaultdict
from time import time

from swift.common import exceptions
from swift.common.ring import RingData
from swift.common.ring.utils import tiers_for_dev, build_tier_tree

MAX_BALANCE = 999.99

class RingBuilder(object):
    """
    Used to build swift.common.ring.RingData instances to be written to disk
    and used with swift.common.ring.Ring instances. See bin/swift-ring-builder
    for example usage.

    The instance variable devs_changed indicates if the device information
    has changed since the last balancing. This can be used by tools to know
    whether a rebalance request is an isolated request or due to added,
    changed, or removed devices.

    :param part_power: number of partitions = 2**part_power.
    :param replicas: number of replicas for each partition
    :param min_part_hours: minimum number of hours between partition changes
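
    Code example (an illustrative sketch; the device values are
    hypothetical)::

        b = RingBuilder(18, 3, 1)  # 2**18 partitions, 3 replicas
        b.add_dev({'id': 0, 'region': 1, 'zone': 1, 'weight': 100.0,
                   'ip': '10.0.0.1', 'port': 6000, 'device': 'sdb1',
                   'meta': ''})
        b.rebalance()
        ring_data = b.get_ring()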
    """

    def __init__(self, part_power, replicas, min_part_hours):
        if part_power > 32:
            raise ValueError("part_power must be at most 32 (was %d)"
                             % (part_power,))
        if replicas < 1:
            raise ValueError("replicas must be at least 1 (was %.6f)"
                             % (replicas,))
        if min_part_hours < 0:
            raise ValueError("min_part_hours must be non-negative (was %d)"
                             % (min_part_hours,))

        self.part_power = part_power
        self.replicas = replicas
        self.min_part_hours = min_part_hours
        self.parts = 2 ** self.part_power
        self.devs = []
        self.devs_changed = False
        self.version = 0

        # _replica2part2dev maps from replica number to partition number to
        # device id. So, for a three replica, 2**23 ring, it's an array of
        # three 2**23 arrays of device ids (unsigned shorts). This can work
        # a bit faster than the 2**23 array of triplet arrays of device ids
        # in many circumstances. Making one big 2**23 * 3 array didn't seem
        # to have any speed change; though you're welcome to try it again
        # (it was a while ago, code-wise, when I last tried it).
        self._replica2part2dev = None

        # _last_part_moves is a 2**23 array of unsigned bytes representing
        # the number of hours since a given partition was last moved. This
        # is used to guarantee we don't move a partition twice within a
        # given number of hours (24 is my usual test). Removing a device or
        # setting its weight to 0 overrides this behavior as it's assumed
        # those actions are done because of device failure.
        # _last_part_moves_epoch indicates the time the offsets in
        # _last_part_moves are based on.
        self._last_part_moves_epoch = None
        self._last_part_moves = None

        self._last_part_gather_start = 0
        self._remove_devs = []
        self._ring = None

    def weight_of_one_part(self):
        """
        Returns the weight of each partition as calculated from the
        total weight of all the devices.
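
        For example (hypothetical numbers): a ring with part_power 16 and
        3 replicas has 2**16 * 3 = 196608 replica slots; if the devices'
        weights sum to 600.0, each unit of weight "wants"
        196608 / 600.0 = 327.68 partition replicas.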
        """
        try:
            return self.parts * self.replicas / \
                sum(d['weight'] for d in self._iter_devs())
        except ZeroDivisionError:
            raise exceptions.EmptyRingError('There are no devices in this '
                                            'ring, or all devices have been '
                                            'deleted')

    def copy_from(self, builder):
        """
        Reinitializes this RingBuilder instance from data obtained from the
        builder dict given. Code example::

            b = RingBuilder(1, 1, 1)  # Dummy values
            b.copy_from(builder)

        This is used to restore a RingBuilder from a dict that was
        previously saved with b.to_dict().
        """
        if hasattr(builder, 'devs'):
            self.part_power = builder.part_power
            self.replicas = builder.replicas
            self.min_part_hours = builder.min_part_hours
            self.parts = builder.parts
            self.devs = builder.devs
            self.devs_changed = builder.devs_changed
            self.version = builder.version
            self._replica2part2dev = builder._replica2part2dev
            self._last_part_moves_epoch = builder._last_part_moves_epoch
            self._last_part_moves = builder._last_part_moves
            self._last_part_gather_start = builder._last_part_gather_start
            self._remove_devs = builder._remove_devs
        else:
            self.part_power = builder['part_power']
            self.replicas = builder['replicas']
            self.min_part_hours = builder['min_part_hours']
            self.parts = builder['parts']
            self.devs = builder['devs']
            self.devs_changed = builder['devs_changed']
            self.version = builder['version']
            self._replica2part2dev = builder['_replica2part2dev']
            self._last_part_moves_epoch = builder['_last_part_moves_epoch']
            self._last_part_moves = builder['_last_part_moves']
            self._last_part_gather_start = builder['_last_part_gather_start']
            self._remove_devs = builder['_remove_devs']
        self._ring = None

        # Old builders may not have a region defined for their devices, in
        # which case we default it to 1.
        for dev in self._iter_devs():
            dev.setdefault("region", 1)

    def to_dict(self):
        """
        Returns a dict that can be used later with copy_from to
        restore a RingBuilder. swift-ring-builder uses this to
        pickle.dump the dict to a file and later load that dict into
        copy_from.
        """
        return {'part_power': self.part_power,
                'replicas': self.replicas,
                'min_part_hours': self.min_part_hours,
                'parts': self.parts,
                'devs': self.devs,
                'devs_changed': self.devs_changed,
                'version': self.version,
                '_replica2part2dev': self._replica2part2dev,
                '_last_part_moves_epoch': self._last_part_moves_epoch,
                '_last_part_moves': self._last_part_moves,
                '_last_part_gather_start': self._last_part_gather_start,
                '_remove_devs': self._remove_devs}

    def change_min_part_hours(self, min_part_hours):
        """
        Changes the value used to decide if a given partition can be moved
        again. This restriction is to give the overall system enough time to
        settle a partition to its new location before moving it to yet
        another location. While no data would be lost if a partition is
        moved several times quickly, it could make that data unreachable for
        a short period of time.

        This should be set to at least the average full partition replication
        time. Starting it at 24 hours and then lowering it to what the
        replicator reports as the longest partition cycle is best.

        :param min_part_hours: new value for min_part_hours
        """
        self.min_part_hours = min_part_hours

    def set_replicas(self, new_replica_count):
        """
        Changes the number of replicas in this ring.

        If the new replica count is sufficiently different that
        self._replica2part2dev will change size, sets
        self.devs_changed. This is so tools like
        bin/swift-ring-builder can know to write out the new ring
        rather than bailing out due to lack of balance change.
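
        For example (hypothetical numbers): with 256 partitions, going
        from 3.0 to 3.25 replicas changes the slot count from
        int(256 * 3.0) = 768 to int(256 * 3.25) = 832, so devs_changed
        is set.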
        """
        old_slots_used = int(self.parts * self.replicas)
        new_slots_used = int(self.parts * new_replica_count)
        if old_slots_used != new_slots_used:
            self.devs_changed = True

        self.replicas = new_replica_count

    def get_ring(self):
        """
        Get the ring, or more specifically, the swift.common.ring.RingData.
        This ring data is the minimum required for use of the ring. The ring
        builder itself keeps additional data such as when partitions were
        last moved.
        """
        # We cache the self._ring value so multiple requests for it don't
        # build it multiple times. Be sure to set self._ring = None whenever
        # the ring will need to be rebuilt.
        if not self._ring:
            # Make devs list (with holes for deleted devices) and not
            # including builder-specific extra attributes.
            devs = [None] * len(self.devs)
            for dev in self._iter_devs():
                devs[dev['id']] = dict((k, v) for k, v in dev.items()
                                       if k not in ('parts', 'parts_wanted'))
            # Copy over the replica+partition->device assignments, the
            # device information, and the part_shift value (the number of
            # bits to shift an unsigned int >I right to obtain the partition
            # for the int).
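            # For example (illustrative): with part_power 18, part_shift is
            # 32 - 18 = 14, and a ring user would compute roughly
            #     part = unpack_from('>I', md5(path).digest())[0] >> 14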
            if not self._replica2part2dev:
                self._ring = RingData([], devs, 32 - self.part_power)
            else:
                self._ring = \
                    RingData([array('H', p2d) for p2d in
                              self._replica2part2dev],
                             devs, 32 - self.part_power)
        return self._ring

    def add_dev(self, dev):
        """
        Add a device to the ring. This device dict should have a minimum of
        the following keys:

        ====== ===============================================================
        id     unique integer identifier amongst devices. Defaults to the
               next id if the 'id' key is not provided in the dict
        weight a float of the relative weight of this device as compared to
               others; this indicates how many partitions the builder will
               try to assign to this device
        region integer indicating which region the device is in
        zone   integer indicating which zone the device is in; a given
               partition will not be assigned to multiple devices within
               the same (region, zone) pair if there is any alternative
        ip     the ip address of the device
        port   the tcp port of the device
        device the device's name on disk (sdb1, for example)
        meta   general use 'extra' field; for example: the online date, the
               hardware description
        ====== ===============================================================

        .. note::
            This will not rebalance the ring immediately as you may want to
            make multiple changes for a single rebalance.
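
        Example device dict (the values are hypothetical)::

            {'id': 1, 'region': 1, 'zone': 2, 'weight': 100.0,
             'ip': '10.0.0.2', 'port': 6000, 'device': 'sdb1',
             'meta': ''}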

        :param dev: device dict

        :returns: id of device
        """
        if 'id' not in dev:
            dev['id'] = 0
            if self.devs:
                dev['id'] = max(d['id'] for d in self.devs if d) + 1
        if dev['id'] < len(self.devs) and self.devs[dev['id']] is not None:
            raise exceptions.DuplicateDeviceError(
                'Duplicate device id: %d' % dev['id'])
        # Add holes to self.devs to ensure self.devs[dev['id']] will be
        # the dev.
        while dev['id'] >= len(self.devs):
            self.devs.append(None)
        dev['weight'] = float(dev['weight'])
        dev['parts'] = 0
        self.devs[dev['id']] = dev
        self._set_parts_wanted()
        self.devs_changed = True
        self.version += 1
        return dev['id']

    def set_dev_weight(self, dev_id, weight):
        """
        Set the weight of a device. This should be called rather than just
        altering the weight key in the device dict directly, as the builder
        will need to rebuild some internal state to reflect the change.

        .. note::
            This will not rebalance the ring immediately as you may want to
            make multiple changes for a single rebalance.

        :param dev_id: device id
        :param weight: new weight for device
        """
        self.devs[dev_id]['weight'] = weight
        self._set_parts_wanted()
        self.devs_changed = True
        self.version += 1

    def remove_dev(self, dev_id):
        """
        Remove a device from the ring.

        .. note::
            This will not rebalance the ring immediately as you may want to
            make multiple changes for a single rebalance.

        :param dev_id: device id
        """
        dev = self.devs[dev_id]
        dev['weight'] = 0
        self._remove_devs.append(dev)
        self._set_parts_wanted()
        self.devs_changed = True
        self.version += 1

    def rebalance(self, seed=None):
        """
        Rebalance the ring.

        This is the main work function of the builder, as it will assign and
        reassign partitions to devices in the ring based on weights, distinct
        zones, recent reassignments, etc.

        The process doesn't always perfectly assign partitions (that'd take
        a lot more analysis and therefore a lot more time -- I had code that
        did that before). Because of this, it keeps rebalancing until the
        device skew (number of partitions a device wants compared to what it
        has) gets below 1% or doesn't change by more than 1% (which only
        happens with a ring that can't be balanced no matter what -- like
        with 3 zones of differing weights with replicas set to 3).

        :param seed: if provided, seed the random number generator so the
                     rebalance is reproducible
        :returns: (number_of_partitions_altered, resulting_balance)
        """
        if seed is not None:
            random.seed(seed)

        self._ring = None
        if self._last_part_moves_epoch is None:
            self._initial_balance()
            self.devs_changed = False
            return self.parts, self.get_balance()
        retval = 0
        self._update_last_part_moves()
        last_balance = 0
        new_parts, removed_part_count = self._adjust_replica2part2dev_size()
        retval += removed_part_count
        self._reassign_parts(new_parts)
        retval += len(new_parts)
        while True:
            reassign_parts = self._gather_reassign_parts()
            self._reassign_parts(reassign_parts)
            retval += len(reassign_parts)
            while self._remove_devs:
                self.devs[self._remove_devs.pop()['id']] = None
            balance = self.get_balance()
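            # Stop when the ring is balanced to within 1%, when another
            # pass no longer changes the balance by at least 1%, or when
            # as many partitions as exist have been reassigned.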
            if balance < 1 or abs(last_balance - balance) < 1 or \
                    retval == self.parts:
                break
            last_balance = balance
        self.devs_changed = False
        self.version += 1
        return retval, balance

    def validate(self, stats=False):
        """
        Validate the ring.

        This is a safety function to try to catch any bugs in the building
        process. It ensures partitions have been assigned to real devices,
        aren't doubly assigned, etc. It can also optionally check the even
        distribution of partitions across devices.
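
        Code example (assuming a rebalanced builder)::

            dev_usage, worst = builder.validate(stats=True)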

        :param stats: if True, check distribution of partitions across
                      devices
        :returns: if stats is True, a tuple of (device_usage, worst_stat),
                  else (None, None). device_usage[dev_id] will equal the
                  number of partitions assigned to that device. worst_stat
                  will equal the percentage by which the worst device is
                  skewed from the number of partitions it should have.
        :raises RingValidationError: problem was found with the ring.
        """
        # "len" showed up in profiling, so it's just computed once.
        dev_len = len(self.devs)

        parts_on_devs = sum(d['parts'] for d in self._iter_devs())

        if not self._replica2part2dev:
            raise exceptions.RingValidationError(
                '_replica2part2dev empty; did you forget to rebalance?')

        parts_in_map = sum(len(p2d) for p2d in self._replica2part2dev)
        if parts_on_devs != parts_in_map:
            raise exceptions.RingValidationError(
                'All partitions are not double accounted for: %d != %d' %
                (parts_on_devs, parts_in_map))
        if stats:
            # dev_usage[dev_id] will equal the number of partitions assigned
            # to that device.
            dev_usage = array('I', (0 for _junk in xrange(dev_len)))
            for part2dev in self._replica2part2dev:
                for dev_id in part2dev:
                    dev_usage[dev_id] += 1

        for part, replica in self._each_part_replica():
            dev_id = self._replica2part2dev[replica][part]
            if dev_id >= dev_len or not self.devs[dev_id]:
                raise exceptions.RingValidationError(
                    "Partition %d, replica %d was not allocated "
                    "to a device." %
                    (part, replica))

        for dev in self._iter_devs():
            if not isinstance(dev['port'], int):
                raise exceptions.RingValidationError(
                    "Device %d has port %r, which is not an integer." %
                    (dev['id'], dev['port']))

        if stats:
            weight_of_one_part = self.weight_of_one_part()
            worst = 0
            for dev in self._iter_devs():
                if not dev['weight']:
                    if dev_usage[dev['id']]:
                        # If a device has no weight, but has partitions,
                        # then its overage is considered "infinity" and
                        # therefore always the worst possible. We show
                        # MAX_BALANCE for convenience.
                        worst = MAX_BALANCE
                        break
                    continue
                skew = abs(100.0 * dev_usage[dev['id']] /
                           (dev['weight'] * weight_of_one_part) - 100.0)
                if skew > worst:
                    worst = skew
            return dev_usage, worst
        return None, None

    def get_balance(self):
        """
        Get the balance of the ring. The balance value is the highest
        percentage off the desired amount of partitions a given device
        wants. For instance, if the "worst" device wants (based on its
        weight relative to the sum of all the devices' weights) 123
        partitions and it has 124 partitions, the balance value would
        be 0.83 (1 extra / 123 wanted * 100 for percentage).

        :returns: balance of the ring
        """
        balance = 0
        weight_of_one_part = self.weight_of_one_part()
        for dev in self._iter_devs():
            if not dev['weight']:
                if dev['parts']:
                    # If a device has no weight, but has partitions, then
                    # its overage is considered "infinity" and therefore
                    # always the worst possible. We show MAX_BALANCE for
                    # convenience.
                    balance = MAX_BALANCE
                    break
                continue
            dev_balance = abs(100.0 * dev['parts'] /
                              (dev['weight'] * weight_of_one_part) - 100.0)
            if dev_balance > balance:
                balance = dev_balance
        return balance

    def pretend_min_part_hours_passed(self):
        """
        Override min_part_hours by marking all partitions as having been
        moved 255 hours ago. This can be used to force a full rebalance on
        the next call to rebalance.
        """
        for part in xrange(self.parts):
            self._last_part_moves[part] = 0xff

    def get_part_devices(self, part):
        """
        Get the devices that are responsible for the partition,
        filtering out duplicates.

        :param part: partition to get devices for
        :returns: list of device dicts
        """
        devices = []
        for dev in self._devs_for_part(part):
            if dev not in devices:
                devices.append(dev)
        return devices

    def _iter_devs(self):
        """
        Returns an iterator over all the non-None devices in the ring. Note
        that this means list(b._iter_devs())[some_id] may not equal
        b.devs[some_id]; you will have to check the 'id' key of each device
        to obtain its dev_id.
        """
        for dev in self.devs:
            if dev is not None:
                yield dev

    def _set_parts_wanted(self):
        """
        Sets the parts_wanted key for each of the devices to the number of
        partitions the device wants based on its relative weight. This key
        is used to sort the devices according to "most wanted" during
        rebalancing to best distribute partitions. A negative parts_wanted
        indicates the device is "overweight" and wishes to give partitions
        away if possible.
        """
        weight_of_one_part = self.weight_of_one_part()

        for dev in self._iter_devs():
            if not dev['weight']:
                # With no weight, that means we wish to "drain" the device.
                # So we set the parts_wanted to a really large negative
                # number to indicate its strong desire to give up everything
                # it has.
                dev['parts_wanted'] = -self.parts * self.replicas
            else:
                dev['parts_wanted'] = \
                    int(weight_of_one_part * dev['weight']) - dev['parts']

    def _adjust_replica2part2dev_size(self):
        """
        Make sure that the lengths of the arrays in _replica2part2dev
        are correct for the current value of self.replicas.

        Example:
            self.part_power = 8
            self.replicas = 2.25

        self._replica2part2dev will contain 3 arrays: the first 2 of
        length 256 (2**8), and the last of length 64 (0.25 * 2**8).

        Returns a 2-tuple: the first element is a list of (partition,
        replicas) tuples indicating which replicas need to be
        (re)assigned to devices, and the second element is a count of
        how many replicas were removed.
        """
        removed_replicas = 0

        fractional_replicas, whole_replicas = math.modf(self.replicas)
        whole_replicas = int(whole_replicas)

        desired_lengths = [self.parts] * whole_replicas
        if fractional_replicas:
            desired_lengths.append(int(self.parts * fractional_replicas))

        to_assign = defaultdict(list)

        if self._replica2part2dev is not None:
            # If we crossed an integer threshold (say, 4.1 --> 4), we'll
            # have a partial extra replica clinging on here. Clean up any
            # such extra stuff.
            for part2dev in self._replica2part2dev[len(desired_lengths):]:
                for dev_id in part2dev:
                    dev_losing_part = self.devs[dev_id]
                    dev_losing_part['parts'] -= 1
                    removed_replicas += 1
            self._replica2part2dev = \
                self._replica2part2dev[:len(desired_lengths)]
        else:
            self._replica2part2dev = []

        for replica, desired_length in enumerate(desired_lengths):
            if replica < len(self._replica2part2dev):
                part2dev = self._replica2part2dev[replica]
                if len(part2dev) < desired_length:
                    # Not long enough: needs to be extended and the
                    # newly-added pieces assigned to devices.
                    for part in xrange(len(part2dev), desired_length):
                        to_assign[part].append(replica)
                        part2dev.append(0)
                elif len(part2dev) > desired_length:
                    # Too long: truncate this mapping.
                    for part in xrange(desired_length, len(part2dev)):
                        dev_losing_part = self.devs[part2dev[part]]
                        dev_losing_part['parts'] -= 1
                        removed_replicas += 1
                    self._replica2part2dev[replica] = \
                        part2dev[:desired_length]
            else:
                # Mapping not present at all: make one up and assign
                # all of it.
                for part in xrange(desired_length):
                    to_assign[part].append(replica)
                self._replica2part2dev.append(
                    array('H', (0 for _junk in xrange(desired_length))))

        return (list(to_assign.iteritems()), removed_replicas)

    def _initial_balance(self):
        """
        Initial partition assignment is the same as rebalancing an
        existing ring, but with some initial setup beforehand.
        """
        self._last_part_moves = array('B', (0 for _junk in
                                            xrange(self.parts)))
        self._last_part_moves_epoch = int(time())

        self._reassign_parts(self._adjust_replica2part2dev_size()[0])

    def _update_last_part_moves(self):
        """
        Updates how many hours ago each partition was moved based on the
        current time. The builder won't move a partition that has been moved
        more recently than min_part_hours.
        """
        elapsed_hours = int(time() - self._last_part_moves_epoch) / 3600
        for part in xrange(self.parts):
            # The "min(self._last_part_moves[part] + elapsed_hours, 0xff)"
            # which was here showed up in profiling, so it got inlined.
            last_plus_elapsed = self._last_part_moves[part] + elapsed_hours
            if last_plus_elapsed < 0xff:
                self._last_part_moves[part] = last_plus_elapsed
            else:
                self._last_part_moves[part] = 0xff
        self._last_part_moves_epoch = int(time())

    def _gather_reassign_parts(self):
        """
        Returns a list of (partition, replicas) pairs to be reassigned by
        gathering from removed devices, insufficiently-far-apart replicas,
        and overweight drives.
        """
        # inline memoization of tiers_for_dev() results (profiling reveals
        # it as a hot-spot).
        tfd = {}

        # First we gather partitions from removed devices. Since removed
        # devices usually indicate device failures, we have no choice but
        # to reassign these partitions. However, we mark them as moved so
        # later choices will skip other replicas of the same partition if
        # possible.
        removed_dev_parts = defaultdict(list)
        if self._remove_devs:
            dev_ids = [d['id'] for d in self._remove_devs if d['parts']]
            if dev_ids:
                for part, replica in self._each_part_replica():
                    dev_id = self._replica2part2dev[replica][part]
                    if dev_id in dev_ids:
                        self._last_part_moves[part] = 0
                        removed_dev_parts[part].append(replica)

        # Now we gather partitions that are "at risk" because they aren't
        # currently sufficiently spread out across the cluster.
        spread_out_parts = defaultdict(list)
        max_allowed_replicas = self._build_max_replicas_by_tier()
        for part in xrange(self.parts):
            # Only move one replica at a time if possible.
            if part in removed_dev_parts:
                continue

            # First, add up the count of replicas at each tier for each
            # partition.
            # replicas_at_tier was a "lambda: 0" defaultdict, but profiling
            # revealed the lambda invocation as a significant cost.
            replicas_at_tier = {}
            for dev in self._devs_for_part(part):
                if dev['id'] not in tfd:
                    tfd[dev['id']] = tiers_for_dev(dev)
                for tier in tfd[dev['id']]:
                    if tier not in replicas_at_tier:
                        replicas_at_tier[tier] = 1
                    else:
                        replicas_at_tier[tier] += 1

            # Now, look for partitions not yet spread out enough and not
            # recently moved.
            for replica in self._replicas_for_part(part):
                dev = self.devs[self._replica2part2dev[replica][part]]
                removed_replica = False
                if dev['id'] not in tfd:
                    tfd[dev['id']] = tiers_for_dev(dev)
                for tier in tfd[dev['id']]:
                    rep_at_tier = 0
                    if tier in replicas_at_tier:
                        rep_at_tier = replicas_at_tier[tier]
                    if (rep_at_tier > max_allowed_replicas[tier] and
                            self._last_part_moves[part] >=
                            self.min_part_hours):
                        self._last_part_moves[part] = 0
                        spread_out_parts[part].append(replica)
                        dev['parts_wanted'] += 1
                        dev['parts'] -= 1
                        removed_replica = True
                        break
                if removed_replica:
                    if dev['id'] not in tfd:
                        tfd[dev['id']] = tiers_for_dev(dev)
                    for tier in tfd[dev['id']]:
                        replicas_at_tier[tier] -= 1

        # Last, we gather partitions from devices that are "overweight"
        # because they have more partitions than their parts_wanted.
        reassign_parts = defaultdict(list)

        # We randomly pick a new starting point in the "circular" ring of
        # partitions to try to get a better rebalance when called multiple
        # times.
        start = self._last_part_gather_start / 4
        start += random.randint(0, self.parts / 2)  # GRAH PEP8!!!

        self._last_part_gather_start = start
        for replica, part2dev in enumerate(self._replica2part2dev):
            # If we've got a partial replica, start may be out of
            # range. Scale it down so that we get a similar movement
            # pattern (but scaled down) on sequential runs.
            this_start = int(float(start) * len(part2dev) / self.parts)

            for part in itertools.chain(xrange(this_start, len(part2dev)),
                                        xrange(0, this_start)):
                if self._last_part_moves[part] < self.min_part_hours:
                    continue
                if part in removed_dev_parts or part in spread_out_parts:
                    continue
                dev = self.devs[part2dev[part]]
                if dev['parts_wanted'] < 0:
                    self._last_part_moves[part] = 0
                    dev['parts_wanted'] += 1
                    dev['parts'] -= 1
                    reassign_parts[part].append(replica)

        reassign_parts.update(spread_out_parts)
        reassign_parts.update(removed_dev_parts)

        reassign_parts_list = list(reassign_parts.iteritems())
        # We shuffle the partitions to reassign so we get a more even
        # distribution later. There has been discussion of trying to
        # distribute partitions more "regularly" because that would actually
        # reduce risk but 1) it is really difficult to do this with uneven
        # clusters and 2) it would concentrate load during failure recovery
        # scenarios (increasing risk). The "right" answer has yet to be
        # debated to conclusion, but working code wins for now.
        random.shuffle(reassign_parts_list)
        return reassign_parts_list

    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly
        to the initial assignment. The devices are ordered by how many
        partitions they still want and kept in that order throughout the
        process. The gathered partitions are iterated through, assigning
        them to devices according to the "most wanted" while keeping the
        replicas as "far apart" as possible. Two different regions are
        considered the farthest-apart things, followed by zones, then
        different ip/port pairs within a zone; the least-far-apart things
        are different devices with the same ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of
                               the replica (an int) to replace for that
                               partition. replicas_to_replace may be shared
                               for multiple partitions, so be sure you do
                               not modify it.
        """
        for dev in self._iter_devs():
            dev['sort_key'] = self._sort_key_for(dev)
            dev['tiers'] = tiers_for_dev(dev)

        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
                   key=lambda x: x['sort_key'])

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(tuple)
        tier2dev_sort_key = defaultdict(list)
        max_tier_depth = 0
        for dev in available_devs:
            for tier in dev['tiers']:
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2dev_sort_key[tier].append(dev['sort_key'])
                tier2sort_key[tier] = dev['sort_key']
                if len(tier) > max_tier_depth:
                    max_tier_depth = len(tier)

        tier2children_sets = build_tier_tree(available_devs)
        tier2children = defaultdict(list)
        tier2children_sort_key = {}
        tiers_list = [()]
        depth = 1
        while depth <= max_tier_depth:
            new_tiers_list = []
            for tier in tiers_list:
                child_tiers = list(tier2children_sets[tier])
                child_tiers.sort(key=tier2sort_key.__getitem__)
                tier2children[tier] = child_tiers
                tier2children_sort_key[tier] = map(
                    tier2sort_key.__getitem__, child_tiers)
                new_tiers_list.extend(child_tiers)
            tiers_list = new_tiers_list
            depth += 1

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (regions, zones, ip/ports, and
            # devices) the replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(int)
            unique_tiers_by_tier_len = defaultdict(set)
            for replica in self._replicas_for_part(part):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in dev['tiers']:
                        other_replicas[tier] += 1
                        unique_tiers_by_tier_len[len(tier)].add(tier)

            for replica in replace_replicas:
                tier = ()
                depth = 1
                while depth <= max_tier_depth:
                    # Order the tiers by how many replicas of this
                    # partition they already have. Then, of the ones
                    # with the smallest number of replicas, pick the
                    # tier with the hungriest drive and then continue
                    # searching in that subtree.
                    #
                    # There are other strategies we could use here,
                    # such as hungriest-tier (i.e. biggest
                    # sum-of-parts-wanted) or picking one at random.
                    # However, hungriest-drive is what was used here
                    # before, and it worked pretty well in practice.
                    #
                    # Note that this allocator will balance things as
                    # evenly as possible at each level of the device
                    # layout. If your layout is extremely unbalanced,
                    # this may produce poor results.
                    #
                    # This used to be a cute, recursive function, but
                    # it's been unrolled for performance.

                    # We sort the tiers here so that, when we look for a
                    # tier with the lowest number of replicas, the first
                    # one we find is the one with the hungriest drive
                    # (i.e. drive with the largest sort_key value). This
                    # lets us short-circuit the search while still
                    # ensuring we get the right tier.
                    candidates_with_replicas = \
                        unique_tiers_by_tier_len[len(tier) + 1]
                    # Find a tier with the minimal replica count and the
                    # hungriest drive among all the tiers with the minimal
                    # replica count.
                    if len(tier2children[tier]) > \
                            len(candidates_with_replicas):
                        # There exists at least one tier with 0 other
                        # replicas.
                        tier = max((t for t in tier2children[tier]
                                    if other_replicas[t] == 0),
                                   key=tier2sort_key.__getitem__)
                    else:
                        tier = max(tier2children[tier],
                                   key=lambda t: (-other_replicas[t],
                                                  tier2sort_key[t]))
                    depth += 1
                dev = tier2devs[tier][-1]
                dev['parts_wanted'] -= 1
                dev['parts'] += 1
                old_sort_key = dev['sort_key']
                new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                for tier in dev['tiers']:
                    other_replicas[tier] += 1
                    unique_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2dev_sort_key[tier],
                                               old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2dev_sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2dev_sort_key[tier],
                                                   new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2dev_sort_key[tier].insert(new_index, new_sort_key)

                    new_last_sort_key = tier2dev_sort_key[tier][-1]
                    tier2sort_key[tier] = new_last_sort_key

                    # Now jiggle tier2children values to keep them sorted
                    parent_tier = tier[0:-1]
                    index = bisect.bisect_left(
                        tier2children_sort_key[parent_tier],
                        old_sort_key)
                    popped = tier2children[parent_tier].pop(index)
                    tier2children_sort_key[parent_tier].pop(index)

                    new_index = bisect.bisect_left(
                        tier2children_sort_key[parent_tier],
                        new_last_sort_key)
                    tier2children[parent_tier].insert(new_index, popped)
                    tier2children_sort_key[parent_tier].insert(
                        new_index, new_last_sort_key)

                self._replica2part2dev[replica][part] = dev['id']

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev['sort_key']
            del dev['tiers']

    def _sort_key_for(self, dev):
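        # Tuples compare element-wise, so devices with a larger parts_wanted
        # sort later ("hungrier"); the random value breaks ties among
        # equally hungry devices, and the id keeps the key unique.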
        return (dev['parts_wanted'], random.randint(0, 0xFFFF), dev['id'])

    def _build_max_replicas_by_tier(self):
        """
        Returns a dict of (tier: replica_count) for all tiers in the ring.

        There will always be a () entry as the root of the structure, whose
        replica_count will equal the ring's replica_count.

        Then there will be (zone, ip_port, dev_id) entries for each device,
        indicating the maximum number of replicas the device might have for
        any given partition. Anything greater than 1 indicates a partition
        at serious risk, as the data on that partition will not be stored
        distinctly at the ring's replica_count.

        Next there will be (zone, ip_port) entries for each ip_port,
        indicating the maximum number of replicas the devices on that
        ip_port might share for any given partition. Anything greater than 1
        indicates a partition at elevated risk, as if that ip_port were to
        fail multiple replicas of that partition would be unreachable.

        Last there will be (zone,) entries for each zone, indicating the
        maximum number of replicas devices within the same zone might share
        for any given partition. Anything greater than 1 indicates a
        partition at slightly elevated risk, as if that zone were to fail
        multiple replicas of that partition would be unreachable.

        Example return dict for the common SAIO setup::

            {(): 3,
             (1,): 1.0,
             (1, '127.0.0.1:6010'): 1.0,
             (1, '127.0.0.1:6010', 0): 1.0,
             (2,): 1.0,
             (2, '127.0.0.1:6020'): 1.0,
             (2, '127.0.0.1:6020', 1): 1.0,
             (3,): 1.0,
             (3, '127.0.0.1:6030'): 1.0,
             (3, '127.0.0.1:6030', 2): 1.0,
             (4,): 1.0,
             (4, '127.0.0.1:6040'): 1.0,
             (4, '127.0.0.1:6040', 3): 1.0}
        """
        # Used by walk_tree to know what entries to create for each
        # recursive call.
        tier2children = build_tier_tree(self._iter_devs())

        def walk_tree(tier, replica_count):
            mr = {tier: replica_count}
            if tier in tier2children:
                subtiers = tier2children[tier]
                for subtier in subtiers:
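                    # Each child tier may hold at most its fair share of the
                    # parent's replicas, rounded up; e.g. 3 replicas over 4
                    # zones gives ceil(3 / 4.0) = 1.0 replicas per zone.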
                    submax = math.ceil(float(replica_count) / len(subtiers))
                    mr.update(walk_tree(subtier, submax))
            return mr
        return walk_tree((), self.replicas)

    def _devs_for_part(self, part):
        """
        Returns a list of devices for a specified partition.

        Deliberately includes duplicates.
        """
        if self._replica2part2dev is None:
            return []
        return [self.devs[part2dev[part]]
                for part2dev in self._replica2part2dev
                if part < len(part2dev)]

    def _replicas_for_part(self, part):
        """
        Returns a list of replicas for a specified partition.

        These can be used as indices into self._replica2part2dev
        without worrying about IndexErrors.
        """
        return [replica for replica, part2dev
                in enumerate(self._replica2part2dev)
                if part < len(part2dev)]

    def _each_part_replica(self):
        """
        Generator yielding every (partition, replica) pair in the ring.
        """
        for replica, part2dev in enumerate(self._replica2part2dev):
            for part in xrange(len(part2dev)):
                yield (part, replica)

    @classmethod
    def load(cls, builder_file, open=open):
        """
        Obtain a RingBuilder instance from the provided builder file.
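
        Code example (the path is illustrative)::

            builder = RingBuilder.load('/etc/swift/object.builder')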

        :param builder_file: path to builder file to load
        :return: RingBuilder instance
        """
        builder = pickle.load(open(builder_file, 'rb'))
        if not hasattr(builder, 'devs'):
            builder_dict = builder
            builder = RingBuilder(1, 1, 1)
            builder.copy_from(builder_dict)
        for dev in builder.devs:
            # Really old rings didn't have meta keys.
            if dev and 'meta' not in dev:
                dev['meta'] = ''
            # NOTE(akscram): Old ring builder files don't contain
            # replication parameters.
            if dev:
                if 'ip' in dev:
                    dev.setdefault('replication_ip', dev['ip'])
                if 'port' in dev:
                    dev.setdefault('replication_port', dev['port'])
        return builder

    def save(self, builder_file):
        """Serialize this RingBuilder instance to disk.

        :param builder_file: path to builder file to save
        """
        with open(builder_file, 'wb') as f:
            pickle.dump(self.to_dict(), f, protocol=2)

    def search_devs(self, search_values):
        """Search devices by parameters.

        :param search_values: a dictionary with search values to filter
                              devices; supported parameters are id, region,
                              zone, ip, port, replication_ip,
                              replication_port, device, weight, meta

        :returns: list of device dicts
        """
        matched_devs = []
        for dev in self.devs:
            if not dev:
                continue
            matched = True
            for key in ('id', 'region', 'zone', 'ip', 'port',
                        'replication_ip', 'replication_port', 'device',
                        'weight', 'meta'):
                if key in search_values:
                    value = search_values.get(key)
                    if value is not None:
                        if key == 'meta':
                            if value not in dev.get(key):
                                matched = False
                        elif dev.get(key) != value:
                            matched = False
            if matched:
                matched_devs.append(dev)
        return matched_devs