Optimize ec duplication and its md5 hashing
Originally, ec duplication was designed to make a duplicated copy in the chunk_transformer function for each node. However, this can cause doubled (or more, depending on ec_duplication_factor) etag calculation, which is expensive for Swift proxy work. Since this patch, chunk_transformer keeps only a unique set of fragment slots to store, and then send_chunk in ECObjectController._transfer_data picks up the suitable chunk — by the unique backend fragment index assigned by the _determine_chunk_destinations function — to send to the object server. Note that Putter still keeps node_index, but the new putter_to_frag_index dict and frag_hashers (chunk_index and chunk_hashers in the old names) now refer to values by fragment index. Change-Id: Ib9e8a6f67c2985164dd20b049c7f144f19fd1822
This commit is contained in:
@@ -1747,7 +1747,12 @@ class MIMEPutter(Putter):
|
|||||||
mime_boundary, multiphase=need_multiphase)
|
mime_boundary, multiphase=need_multiphase)
|
||||||
|
|
||||||
|
|
||||||
def chunk_transformer(policy, nstreams):
|
def chunk_transformer(policy):
|
||||||
|
"""
|
||||||
|
A generator to transform a source chunk to erasure coded chunks for each
|
||||||
|
`send` call. The number of erasure coded chunks is as
|
||||||
|
policy.ec_n_unique_fragments.
|
||||||
|
"""
|
||||||
segment_size = policy.ec_segment_size
|
segment_size = policy.ec_segment_size
|
||||||
|
|
||||||
buf = collections.deque()
|
buf = collections.deque()
|
||||||
@@ -1775,9 +1780,8 @@ def chunk_transformer(policy, nstreams):
|
|||||||
|
|
||||||
frags_by_byte_order = []
|
frags_by_byte_order = []
|
||||||
for chunk_to_encode in chunks_to_encode:
|
for chunk_to_encode in chunks_to_encode:
|
||||||
encoded_chunks = policy.pyeclib_driver.encode(chunk_to_encode)
|
frags_by_byte_order.append(
|
||||||
send_chunks = encoded_chunks * policy.ec_duplication_factor
|
policy.pyeclib_driver.encode(chunk_to_encode))
|
||||||
frags_by_byte_order.append(send_chunks)
|
|
||||||
# Sequential calls to encode() have given us a list that
|
# Sequential calls to encode() have given us a list that
|
||||||
# looks like this:
|
# looks like this:
|
||||||
#
|
#
|
||||||
@@ -1802,9 +1806,9 @@ def chunk_transformer(policy, nstreams):
|
|||||||
last_bytes = ''.join(buf)
|
last_bytes = ''.join(buf)
|
||||||
if last_bytes:
|
if last_bytes:
|
||||||
last_frags = policy.pyeclib_driver.encode(last_bytes)
|
last_frags = policy.pyeclib_driver.encode(last_bytes)
|
||||||
yield last_frags * policy.ec_duplication_factor
|
yield last_frags
|
||||||
else:
|
else:
|
||||||
yield [''] * nstreams
|
yield [''] * policy.ec_n_unique_fragments
|
||||||
|
|
||||||
|
|
||||||
def trailing_metadata(policy, client_obj_hasher,
|
def trailing_metadata(policy, client_obj_hasher,
|
||||||
@@ -2330,28 +2334,29 @@ class ECObjectController(BaseObjectController):
|
|||||||
def _determine_chunk_destinations(self, putters, policy):
|
def _determine_chunk_destinations(self, putters, policy):
|
||||||
"""
|
"""
|
||||||
Given a list of putters, return a dict where the key is the putter
|
Given a list of putters, return a dict where the key is the putter
|
||||||
and the value is the node index to use.
|
and the value is the frag index to use.
|
||||||
|
|
||||||
This is done so that we line up handoffs using the same node index
|
This is done so that we line up handoffs using the same frag index
|
||||||
(in the primary part list) as the primary that the handoff is standing
|
(in the primary part list) as the primary that the handoff is standing
|
||||||
in for. This lets erasure-code fragment archives wind up on the
|
in for. This lets erasure-code fragment archives wind up on the
|
||||||
preferred local primary nodes when possible.
|
preferred local primary nodes when possible.
|
||||||
|
|
||||||
:param putters: a list of swift.proxy.controllers.obj.MIMEPutter
|
:param putters: a list of swift.proxy.controllers.obj.MIMEPutter
|
||||||
instance
|
instance
|
||||||
:param policy: A policy instance
|
:param policy: A policy instance which should be one of ECStoragePolicy
|
||||||
"""
|
"""
|
||||||
# Give each putter a "chunk index": the index of the
|
# Give each putter a "frag index": the index of the
|
||||||
# transformed chunk that we'll send to it.
|
# transformed chunk that we'll send to it.
|
||||||
#
|
#
|
||||||
# For primary nodes, that's just its index (primary 0 gets
|
# For primary nodes, that's just its index (primary 0 gets
|
||||||
# chunk 0, primary 1 gets chunk 1, and so on). For handoffs,
|
# chunk 0, primary 1 gets chunk 1, and so on). For handoffs,
|
||||||
# we assign the chunk index of a missing primary.
|
# we assign the chunk index of a missing primary.
|
||||||
handoff_conns = []
|
handoff_conns = []
|
||||||
chunk_index = {}
|
putter_to_frag_index = {}
|
||||||
for p in putters:
|
for p in putters:
|
||||||
if p.node_index is not None:
|
if p.node_index is not None:
|
||||||
chunk_index[p] = p.node_index
|
putter_to_frag_index[p] = policy.get_backend_index(
|
||||||
|
p.node_index)
|
||||||
else:
|
else:
|
||||||
handoff_conns.append(p)
|
handoff_conns.append(p)
|
||||||
|
|
||||||
@@ -2362,35 +2367,33 @@ class ECObjectController(BaseObjectController):
|
|||||||
# returns 507, in which case a handoff is used to replace it.
|
# returns 507, in which case a handoff is used to replace it.
|
||||||
|
|
||||||
# lack_list is a dict of list to keep hole indexes
|
# lack_list is a dict of list to keep hole indexes
|
||||||
# e.g. if we have 2 holes for index 0 with ec_duplication_factor=2
|
# e.g. if we have 2 holes for frag index 0 with ec_duplication_factor=2
|
||||||
# lack_list is like {0: [0], 1: [0]}, and then, if 1 hole found
|
# lack_list is like {0: [0], 1: [0]}, and then, if 1 hole found
|
||||||
# for index 1, lack_list will be {0: [0, 1], 1: [0]}.
|
# for frag index 1, lack_list will be {0: [0, 1], 1: [0]}.
|
||||||
# After that, holes will be filled from bigger key
|
# After that, holes will be filled from bigger key
|
||||||
# (i.e. 1:[0] at first)
|
# (i.e. 1:[0] at first)
|
||||||
|
|
||||||
# Grouping all missing fragment indexes for each unique_index
|
# Grouping all missing fragment indexes for each frag_index
|
||||||
unique_index_to_holes = collections.defaultdict(list)
|
available_indexes = putter_to_frag_index.values()
|
||||||
available_indexes = chunk_index.values()
|
|
||||||
for node_index in range(policy.object_ring.replica_count):
|
|
||||||
if node_index not in available_indexes:
|
|
||||||
unique_index = policy.get_backend_index(node_index)
|
|
||||||
unique_index_to_holes[unique_index].append(node_index)
|
|
||||||
|
|
||||||
# Set the missing index to lack_list
|
|
||||||
lack_list = collections.defaultdict(list)
|
lack_list = collections.defaultdict(list)
|
||||||
for unique_index, holes in unique_index_to_holes.items():
|
for frag_index in range(policy.ec_n_unique_fragments):
|
||||||
for lack_tier, hole_node_index in enumerate(holes):
|
# Set the missing index to lack_list
|
||||||
lack_list[lack_tier].append(hole_node_index)
|
available_count = available_indexes.count(frag_index)
|
||||||
|
# N.B. it should be duplication_factor >= lack >= 0
|
||||||
|
lack = policy.ec_duplication_factor - available_count
|
||||||
|
# now we are missing one or more nodes to store the frag index
|
||||||
|
for lack_tier in range(lack):
|
||||||
|
lack_list[lack_tier].append(frag_index)
|
||||||
|
|
||||||
# Extract the lack_list to a flat list
|
# Extract the lack_list to a flat list
|
||||||
holes = []
|
holes = []
|
||||||
for lack_tier, indexes in sorted(lack_list.items(), reverse=True):
|
for lack_tier, indexes in sorted(lack_list.items(), reverse=True):
|
||||||
holes.extend(indexes)
|
holes.extend(indexes)
|
||||||
|
|
||||||
# Fill chunk_index list with the hole list
|
# Fill putter_to_frag_index list with the hole list
|
||||||
for hole, p in zip(holes, handoff_conns):
|
for hole, p in zip(holes, handoff_conns):
|
||||||
chunk_index[p] = hole
|
putter_to_frag_index[p] = hole
|
||||||
return chunk_index
|
return putter_to_frag_index
|
||||||
|
|
||||||
def _transfer_data(self, req, policy, data_source, putters, nodes,
|
def _transfer_data(self, req, policy, data_source, putters, nodes,
|
||||||
min_conns, etag_hasher):
|
min_conns, etag_hasher):
|
||||||
@@ -2400,15 +2403,15 @@ class ECObjectController(BaseObjectController):
|
|||||||
This method was added in the PUT method extraction change
|
This method was added in the PUT method extraction change
|
||||||
"""
|
"""
|
||||||
bytes_transferred = 0
|
bytes_transferred = 0
|
||||||
chunk_transform = chunk_transformer(policy, len(nodes))
|
chunk_transform = chunk_transformer(policy)
|
||||||
chunk_transform.send(None)
|
chunk_transform.send(None)
|
||||||
chunk_hashers = collections.defaultdict(md5)
|
frag_hashers = collections.defaultdict(md5)
|
||||||
|
|
||||||
def send_chunk(chunk):
|
def send_chunk(chunk):
|
||||||
# Note: there's two different hashers in here. etag_hasher is
|
# Note: there's two different hashers in here. etag_hasher is
|
||||||
# hashing the original object so that we can validate the ETag
|
# hashing the original object so that we can validate the ETag
|
||||||
# that the client sent (and etag_hasher is None if the client
|
# that the client sent (and etag_hasher is None if the client
|
||||||
# didn't send one). The hasher in chunk_hashers is hashing the
|
# didn't send one). The hasher in frag_hashers is hashing the
|
||||||
# fragment archive being sent to the client; this lets us guard
|
# fragment archive being sent to the client; this lets us guard
|
||||||
# against data corruption on the network between proxy and
|
# against data corruption on the network between proxy and
|
||||||
# object server.
|
# object server.
|
||||||
@@ -2420,11 +2423,17 @@ class ECObjectController(BaseObjectController):
|
|||||||
# or whatever we're doing, the transform will give us None.
|
# or whatever we're doing, the transform will give us None.
|
||||||
return
|
return
|
||||||
|
|
||||||
|
updated_frag_indexes = set()
|
||||||
for putter in list(putters):
|
for putter in list(putters):
|
||||||
ci = chunk_index[putter]
|
frag_index = putter_to_frag_index[putter]
|
||||||
backend_chunk = backend_chunks[ci]
|
backend_chunk = backend_chunks[frag_index]
|
||||||
if not putter.failed:
|
if not putter.failed:
|
||||||
chunk_hashers[ci].update(backend_chunk)
|
# N.B. same frag_index will appear when using
|
||||||
|
# ec_duplication_factor >= 2. So skip to feed the chunk
|
||||||
|
# to hasher if the frag was updated already.
|
||||||
|
if frag_index not in updated_frag_indexes:
|
||||||
|
frag_hashers[frag_index].update(backend_chunk)
|
||||||
|
updated_frag_indexes.add(frag_index)
|
||||||
putter.send_chunk(backend_chunk)
|
putter.send_chunk(backend_chunk)
|
||||||
else:
|
else:
|
||||||
putter.close()
|
putter.close()
|
||||||
@@ -2437,9 +2446,9 @@ class ECObjectController(BaseObjectController):
|
|||||||
try:
|
try:
|
||||||
with ContextPool(len(putters)) as pool:
|
with ContextPool(len(putters)) as pool:
|
||||||
|
|
||||||
# build our chunk index dict to place handoffs in the
|
# build our putter_to_frag_index dict to place handoffs in the
|
||||||
# same part nodes index as the primaries they are covering
|
# same part nodes index as the primaries they are covering
|
||||||
chunk_index = self._determine_chunk_destinations(
|
putter_to_frag_index = self._determine_chunk_destinations(
|
||||||
putters, policy)
|
putters, policy)
|
||||||
|
|
||||||
for putter in putters:
|
for putter in putters:
|
||||||
@@ -2487,14 +2496,14 @@ class ECObjectController(BaseObjectController):
|
|||||||
footers = {(k, v) for k, v in footers.items()
|
footers = {(k, v) for k, v in footers.items()
|
||||||
if not k.lower().startswith('x-object-sysmeta-ec-')}
|
if not k.lower().startswith('x-object-sysmeta-ec-')}
|
||||||
for putter in putters:
|
for putter in putters:
|
||||||
ci = chunk_index[putter]
|
frag_index = putter_to_frag_index[putter]
|
||||||
# Update any footers set by middleware with EC footers
|
# Update any footers set by middleware with EC footers
|
||||||
trail_md = trailing_metadata(
|
trail_md = trailing_metadata(
|
||||||
policy, etag_hasher,
|
policy, etag_hasher,
|
||||||
bytes_transferred, policy.get_backend_index(ci))
|
bytes_transferred, frag_index)
|
||||||
trail_md.update(footers)
|
trail_md.update(footers)
|
||||||
# Etag footer must always be hash of what we sent
|
# Etag footer must always be hash of what we sent
|
||||||
trail_md['Etag'] = chunk_hashers[ci].hexdigest()
|
trail_md['Etag'] = frag_hashers[frag_index].hexdigest()
|
||||||
putter.end_of_object_data(footer_metadata=trail_md)
|
putter.end_of_object_data(footer_metadata=trail_md)
|
||||||
|
|
||||||
for putter in putters:
|
for putter in putters:
|
||||||
|
|||||||
@@ -3663,21 +3663,20 @@ class TestECFunctions(unittest.TestCase):
|
|||||||
ec_segment_size=segment_size,
|
ec_segment_size=segment_size,
|
||||||
ec_duplication_factor=dup)
|
ec_duplication_factor=dup)
|
||||||
expected = policy.pyeclib_driver.encode(orig_chunk)
|
expected = policy.pyeclib_driver.encode(orig_chunk)
|
||||||
transform = obj.chunk_transformer(
|
transform = obj.chunk_transformer(policy)
|
||||||
policy, policy.object_ring.replica_count)
|
|
||||||
transform.send(None)
|
transform.send(None)
|
||||||
|
|
||||||
backend_chunks = transform.send(orig_chunk)
|
backend_chunks = transform.send(orig_chunk)
|
||||||
self.assertIsNotNone(backend_chunks) # sanity
|
self.assertIsNotNone(backend_chunks) # sanity
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
len(backend_chunks), policy.object_ring.replica_count)
|
len(backend_chunks), policy.ec_n_unique_fragments)
|
||||||
self.assertEqual(expected * dup, backend_chunks)
|
self.assertEqual(expected, backend_chunks)
|
||||||
|
|
||||||
# flush out last chunk buffer
|
# flush out last chunk buffer
|
||||||
backend_chunks = transform.send('')
|
backend_chunks = transform.send('')
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
len(backend_chunks), policy.object_ring.replica_count)
|
len(backend_chunks), policy.ec_n_unique_fragments)
|
||||||
self.assertEqual([''] * policy.object_ring.replica_count,
|
self.assertEqual([''] * policy.ec_n_unique_fragments,
|
||||||
backend_chunks)
|
backend_chunks)
|
||||||
do_test(1)
|
do_test(1)
|
||||||
do_test(2)
|
do_test(2)
|
||||||
@@ -3693,8 +3692,7 @@ class TestECFunctions(unittest.TestCase):
|
|||||||
ec_segment_size=1024,
|
ec_segment_size=1024,
|
||||||
ec_duplication_factor=dup)
|
ec_duplication_factor=dup)
|
||||||
expected = policy.pyeclib_driver.encode(last_chunk)
|
expected = policy.pyeclib_driver.encode(last_chunk)
|
||||||
transform = obj.chunk_transformer(
|
transform = obj.chunk_transformer(policy)
|
||||||
policy, policy.object_ring.replica_count)
|
|
||||||
transform.send(None)
|
transform.send(None)
|
||||||
|
|
||||||
transform.send(last_chunk)
|
transform.send(last_chunk)
|
||||||
@@ -3702,8 +3700,9 @@ class TestECFunctions(unittest.TestCase):
|
|||||||
backend_chunks = transform.send('')
|
backend_chunks = transform.send('')
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
len(backend_chunks), policy.object_ring.replica_count)
|
len(backend_chunks), policy.ec_n_unique_fragments)
|
||||||
self.assertEqual(expected * dup, backend_chunks)
|
self.assertEqual(expected, backend_chunks)
|
||||||
|
|
||||||
do_test(1)
|
do_test(1)
|
||||||
do_test(2)
|
do_test(2)
|
||||||
|
|
||||||
@@ -4310,12 +4309,13 @@ class TestECDuplicationObjController(
|
|||||||
got = controller._determine_chunk_destinations(putters, self.policy)
|
got = controller._determine_chunk_destinations(putters, self.policy)
|
||||||
expected = {}
|
expected = {}
|
||||||
for i, p in enumerate(putters):
|
for i, p in enumerate(putters):
|
||||||
expected[p] = i
|
expected[p] = self.policy.get_backend_index(i)
|
||||||
# sanity
|
# sanity
|
||||||
self.assertEqual(got, expected)
|
self.assertEqual(got, expected)
|
||||||
|
|
||||||
# now lets make an unique fragment as handoffs
|
# now lets make an unique fragment as handoffs
|
||||||
putters[unique].node_index = None
|
handoff_putter = putters[unique]
|
||||||
|
handoff_putter.node_index = None
|
||||||
|
|
||||||
# and then, pop a fragment which has same fragment index with unique
|
# and then, pop a fragment which has same fragment index with unique
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@@ -4329,21 +4329,20 @@ class TestECDuplicationObjController(
|
|||||||
# index 0 missing 2 copies and unique frag index 1 missing 1 copy
|
# index 0 missing 2 copies and unique frag index 1 missing 1 copy
|
||||||
# i.e. the handoff node should be assigned to unique frag index 1
|
# i.e. the handoff node should be assigned to unique frag index 1
|
||||||
got = controller._determine_chunk_destinations(putters, self.policy)
|
got = controller._determine_chunk_destinations(putters, self.policy)
|
||||||
self.assertEqual(len(expected) - 2, len(got))
|
# N.B. len(putters) is now len(expected - 2) due to pop twice
|
||||||
|
self.assertEqual(len(putters), len(got))
|
||||||
# index one_more_missing should not be choosen
|
# sanity, no node index - for handoff putter
|
||||||
self.assertNotIn(one_more_missing, got.values())
|
self.assertIsNone(handoff_putter.node_index)
|
||||||
# either index unique or duplicated should be in the got dict
|
self.assertEqual(got[handoff_putter], unique)
|
||||||
self.assertTrue(
|
# sanity, other nodes execpt handoff_putter have node_index
|
||||||
any([unique in got.values(), duplicated in got.values()]))
|
self.assertTrue(all(
|
||||||
# but it's not both
|
[putter.node_index for putter in got if
|
||||||
self.assertFalse(
|
putter != handoff_putter]))
|
||||||
all([unique in got.values(), duplicated in got.values()]))
|
|
||||||
|
|
||||||
def test_determine_chunk_destinations_prioritize_more_missing(self):
|
def test_determine_chunk_destinations_prioritize_more_missing(self):
|
||||||
# drop 0, 14 and 1 should work
|
# drop node_index 0, 14 and 1 should work
|
||||||
self._test_determine_chunk_destinations_prioritize(0, 14, 1)
|
self._test_determine_chunk_destinations_prioritize(0, 14, 1)
|
||||||
# drop 1, 15 and 0 should work, too
|
# drop node_index 1, 15 and 0 should work, too
|
||||||
self._test_determine_chunk_destinations_prioritize(1, 15, 0)
|
self._test_determine_chunk_destinations_prioritize(1, 15, 0)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user