Fix ring-builder crash

If you had a partition with 1 replica on a deleted device, 1 on a
zero-weight device, and 1 on a normal device, then rebalancing would
crash. This was because the ring builder was memoizing tiers_for_dev()
for all *available* devices (i.e. weight > 0 and not deleted), but
depended on it being present for all devices with partitions still on
them.

Since the builder moved the replica from the deleted device, it left
the one on the zero-weight device alone, so you had an unavailable
device with a partition replica still on it, triggering the crash.

Now we go ahead and memoize tiers_for_dev() for *all* devices, not
just available ones, thereby fixing the crash.

Change-Id: Ie0b58b65e0353732cf785ab772e95e699f3a5b5d
This commit is contained in:
Samuel Merritt 2014-03-11 11:25:04 -07:00
parent 86668aa1ac
commit e7fb089f59
2 changed files with 30 additions and 3 deletions

View File

@ -754,6 +754,7 @@ class RingBuilder(object):
"""
for dev in self._iter_devs():
dev['sort_key'] = self._sort_key_for(dev)
dev['tiers'] = tiers_for_dev(dev)
available_devs = \
sorted((d for d in self._iter_devs() if d['weight']),
@ -764,7 +765,6 @@ class RingBuilder(object):
tier2dev_sort_key = defaultdict(list)
max_tier_depth = 0
for dev in available_devs:
dev['tiers'] = tiers_for_dev(dev)
for tier in dev['tiers']:
tier2devs[tier].append(dev) # <-- starts out sorted!
tier2dev_sort_key[tier].append(dev['sort_key'])
@ -889,7 +889,7 @@ class RingBuilder(object):
# Just to save memory and keep from accidental reuse.
for dev in self._iter_devs():
del dev['sort_key']
dev.pop('tiers', None) # May be absent for devices w/o weight
del dev['tiers']
def _sort_key_for(self, dev):
return (dev['parts_wanted'], random.randint(0, 0xFFFF), dev['id'])

View File

@ -107,6 +107,33 @@ class TestRingBuilder(unittest.TestCase):
self.assertNotEquals(r0.to_dict(), r1.to_dict())
self.assertEquals(r1.to_dict(), r2.to_dict())
def test_rebalance_part_on_deleted_other_part_on_drained(self):
rb = ring.RingBuilder(8, 3, 1)
rb.add_dev({'id': 0, 'region': 1, 'zone': 1, 'weight': 1,
'ip': '127.0.0.1', 'port': 10000, 'device': 'sda1'})
rb.add_dev({'id': 1, 'region': 1, 'zone': 1, 'weight': 1,
'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
rb.add_dev({'id': 2, 'region': 1, 'zone': 1, 'weight': 1,
'ip': '127.0.0.1', 'port': 10002, 'device': 'sda1'})
rb.add_dev({'id': 3, 'region': 1, 'zone': 1, 'weight': 1,
'ip': '127.0.0.1', 'port': 10003, 'device': 'sda1'})
rb.add_dev({'id': 4, 'region': 1, 'zone': 1, 'weight': 1,
'ip': '127.0.0.1', 'port': 10004, 'device': 'sda1'})
rb.add_dev({'id': 5, 'region': 1, 'zone': 1, 'weight': 1,
'ip': '127.0.0.1', 'port': 10005, 'device': 'sda1'})
rb.rebalance(seed=1)
# We want a partition where 1 replica is on a removed device, 1
# replica is on a 0-weight device, and 1 on a normal device. To
# guarantee we have one, we see where partition 123 is, then
# manipulate its devices accordingly.
zero_weight_dev_id = rb._replica2part2dev[1][123]
delete_dev_id = rb._replica2part2dev[2][123]
rb.set_dev_weight(zero_weight_dev_id, 0.0)
rb.remove_dev(delete_dev_id)
rb.rebalance()
def test_set_replicas(self):
rb = ring.RingBuilder(8, 3.2, 1)
rb.devs_changed = False
@ -125,7 +152,7 @@ class TestRingBuilder(unittest.TestCase):
self.assertRaises(exceptions.DuplicateDeviceError, rb.add_dev, dev)
self.assertEqual(dev_id, 0)
rb = ring.RingBuilder(8, 3, 1)
#test add new dev with no id
# test add new dev with no id
dev_id = rb.add_dev({'zone': 0, 'region': 1, 'weight': 1,
'ip': '127.0.0.1', 'port': 6000})
self.assertEquals(rb.devs[0]['id'], 0)