Fix rebalancing when 2+ of a partition's replicas are on deleted devices.

RingBuilder._reassign_parts assumed that only one replica for a given
partition would move. This isn't necessarily true in the case where a
bunch of devices have been removed. This would leave invalid entries
in _replica2part2dev and also cause validation to fail.

One easy way to reproduce this is to create a 3-replica, 3-zone,
6-device ring with 2 drives per zone (all of equal weight), rebalance,
and then remove one drive from each zone and rebalance again.

Bug: 943493

Change-Id: I0d399bed5d733448ad877fa2823b542777d385a4
This commit is contained in:
Samuel Merritt 2012-02-28 18:25:14 -08:00
parent 9316a8f876
commit 7fe0c6c695
3 changed files with 55 additions and 20 deletions

View File

@ -38,6 +38,7 @@ Dragos Manolescu (dragosm@hp.com)
Juan J. Martinez (juan@memset.com) Juan J. Martinez (juan@memset.com)
Donagh McCabe (donagh.mccabe@hp.com) Donagh McCabe (donagh.mccabe@hp.com)
Ewan Mellor (ewan.mellor@citrix.com) Ewan Mellor (ewan.mellor@citrix.com)
Samuel Merritt (spam@andcheese.org)
Stephen Milton (milton@isomedia.com) Stephen Milton (milton@isomedia.com)
Russ Nelson (russ@crynwr.com) Russ Nelson (russ@crynwr.com)
Maru Newby (mnewby@internap.com) Maru Newby (mnewby@internap.com)

View File

@ -502,31 +502,35 @@ class RingBuilder(object):
key=lambda x: x['sort_key']) key=lambda x: x['sort_key'])
for part in reassign_parts: for part in reassign_parts:
other_zones = array('H') other_zones = array('H')
replace = None replace = []
for replica in xrange(self.replicas): for replica in xrange(self.replicas):
if self._replica2part2dev[replica][part] == 0xffff: if self._replica2part2dev[replica][part] == 0xffff:
replace = replica replace.append(replica)
else: else:
other_zones.append(self.devs[ other_zones.append(self.devs[
self._replica2part2dev[replica][part]]['zone']) self._replica2part2dev[replica][part]]['zone'])
index = len(available_devs) - 1
while available_devs[index]['zone'] in other_zones: for replica in replace:
index -= 1 index = len(available_devs) - 1
dev = available_devs.pop(index) while available_devs[index]['zone'] in other_zones:
self._replica2part2dev[replace][part] = dev['id'] index -= 1
dev['parts_wanted'] -= 1 dev = available_devs.pop(index)
dev['parts'] += 1 other_zones.append(dev['zone'])
dev['sort_key'] = '%08x.%04x' % (self.parts + dev['parts_wanted'], self._replica2part2dev[replica][part] = dev['id']
randint(0, 0xffff)) dev['parts_wanted'] -= 1
index = 0 dev['parts'] += 1
end = len(available_devs) dev['sort_key'] = \
while index < end: '%08x.%04x' % (self.parts + dev['parts_wanted'],
mid = (index + end) // 2 randint(0, 0xffff))
if dev['sort_key'] < available_devs[mid]['sort_key']: index = 0
end = mid end = len(available_devs)
else: while index < end:
index = mid + 1 mid = (index + end) // 2
available_devs.insert(index, dev) if dev['sort_key'] < available_devs[mid]['sort_key']:
end = mid
else:
index = mid + 1
available_devs.insert(index, dev)
for dev in self.devs: for dev in self.devs:
if dev is not None: if dev is not None:
del dev['sort_key'] del dev['sort_key']

View File

@ -126,6 +126,36 @@ class TestRingBuilder(unittest.TestCase):
counts[dev_id] = counts.get(dev_id, 0) + 1 counts[dev_id] = counts.get(dev_id, 0) + 1
self.assertEquals(counts, {0: 256, 2: 256, 3: 256}) self.assertEquals(counts, {0: 256, 2: 256, 3: 256})
def test_remove_a_lot(self):
rb = ring.RingBuilder(3, 3, 1)
rb.add_dev({'id': 0, 'device': 'd0', 'ip': '10.0.0.1',
'port': 6002, 'weight': 1000.0, 'zone': 1})
rb.add_dev({'id': 1, 'device': 'd1', 'ip': '10.0.0.2',
'port': 6002, 'weight': 1000.0, 'zone': 2})
rb.add_dev({'id': 2, 'device': 'd2', 'ip': '10.0.0.3',
'port': 6002, 'weight': 1000.0, 'zone': 3})
rb.add_dev({'id': 3, 'device': 'd3', 'ip': '10.0.0.1',
'port': 6002, 'weight': 1000.0, 'zone': 1})
rb.add_dev({'id': 4, 'device': 'd4', 'ip': '10.0.0.2',
'port': 6002, 'weight': 1000.0, 'zone': 2})
rb.add_dev({'id': 5, 'device': 'd5', 'ip': '10.0.0.3',
'port': 6002, 'weight': 1000.0, 'zone': 3})
rb.rebalance()
rb.validate()
# this has to put more than 1/3 of the partitions in the
# cluster on removed devices in order to ensure that at least
# one partition has multiple replicas that need to move.
#
# (for an N-replica ring, it's more than 1/N of the
# partitions, of course)
rb.remove_dev(3)
rb.remove_dev(4)
rb.remove_dev(5)
rb.rebalance()
rb.validate()
def test_shuffled_gather(self): def test_shuffled_gather(self):
if self._shuffled_gather_helper() and \ if self._shuffled_gather_helper() and \
self._shuffled_gather_helper(): self._shuffled_gather_helper():