Use MurmurHash2 for key partition hashing
This commit is contained in:
		| @@ -1,6 +1,7 @@ | |||||||
| from .roundrobin import RoundRobinPartitioner | from .roundrobin import RoundRobinPartitioner | ||||||
| from .hashed import HashedPartitioner | from .hashed import HashedPartitioner, Murmur2Partitioner, LegacyPartitioner | ||||||
|  |  | ||||||
| __all__ = [ | __all__ = [ | ||||||
|     'RoundRobinPartitioner', 'HashedPartitioner' |     'RoundRobinPartitioner', 'HashedPartitioner', 'Murmur2Partitioner', | ||||||
|  |     'LegacyPartitioner' | ||||||
| ] | ] | ||||||
|   | |||||||
| @@ -1,7 +1,25 @@ | |||||||
| from .base import Partitioner | from .base import Partitioner | ||||||
|  |  | ||||||
| class HashedPartitioner(Partitioner): |  | ||||||
|  | class Murmur2Partitioner(Partitioner): | ||||||
|     """ |     """ | ||||||
|  |     Implements a partitioner which selects the target partition based on | ||||||
|  |     the hash of the key. Attempts to apply the same hashing | ||||||
|  |     function as mainline java client. | ||||||
|  |     """ | ||||||
|  |     def partition(self, key, partitions=None): | ||||||
|  |         if not partitions: | ||||||
|  |             partitions = self.partitions | ||||||
|  |  | ||||||
|  |         # https://github.com/apache/kafka/blob/0.8.2/clients/src/main/java/org/apache/kafka/clients/producer/internals/Partitioner.java#L69 | ||||||
|  |         idx = (murmur2(key) & 0x7fffffff) % len(partitions) | ||||||
|  |  | ||||||
|  |         return partitions[idx] | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class LegacyPartitioner(Partitioner): | ||||||
|  |     """DEPRECATED -- See Issue 374 | ||||||
|  |  | ||||||
|     Implements a partitioner which selects the target partition based on |     Implements a partitioner which selects the target partition based on | ||||||
|     the hash of the key |     the hash of the key | ||||||
|     """ |     """ | ||||||
| @@ -12,3 +30,79 @@ class HashedPartitioner(Partitioner): | |||||||
|         idx = hash(key) % size |         idx = hash(key) % size | ||||||
|  |  | ||||||
|         return partitions[idx] |         return partitions[idx] | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # Default will change to Murmur2 in 0.10 release | ||||||
|  | HashedPartitioner = LegacyPartitioner | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # https://github.com/apache/kafka/blob/0.8.2/clients/src/main/java/org/apache/kafka/common/utils/Utils.java#L244 | ||||||
|  | def murmur2(key): | ||||||
|  |     """Pure-python Murmur2 implementation. | ||||||
|  |  | ||||||
|  |     Based on java client, see org.apache.kafka.common.utils.Utils.murmur2 | ||||||
|  |  | ||||||
|  |     Args: | ||||||
|  |         key: if not a bytearray, converted via bytearray(str(key)) | ||||||
|  |  | ||||||
|  |     Returns: MurmurHash2 of key bytearray | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     # Convert key to a bytearray | ||||||
|  |     if not isinstance(key, bytearray): | ||||||
|  |         data = bytearray(str(key)) | ||||||
|  |  | ||||||
|  |     length = len(data) | ||||||
|  |     seed = 0x9747b28c | ||||||
|  |     # 'm' and 'r' are mixing constants generated offline. | ||||||
|  |     # They're not really 'magic', they just happen to work well. | ||||||
|  |     m = 0x5bd1e995 | ||||||
|  |     r = 24 | ||||||
|  |  | ||||||
|  |     # Initialize the hash to a random value | ||||||
|  |     h = seed ^ length | ||||||
|  |     length4 = length / 4 | ||||||
|  |  | ||||||
|  |     for i in range(length4): | ||||||
|  |         i4 = i * 4 | ||||||
|  |         k = ((data[i4 + 0] & 0xff) + | ||||||
|  |             ((data[i4 + 1] & 0xff) << 8) + | ||||||
|  |             ((data[i4 + 2] & 0xff) << 16) + | ||||||
|  |             ((data[i4 + 3] & 0xff) << 24)) | ||||||
|  |         k &= 0xffffffff | ||||||
|  |         k *= m | ||||||
|  |         k &= 0xffffffff | ||||||
|  |         k ^= (k % 0x100000000) >> r # k ^= k >>> r | ||||||
|  |         k &= 0xffffffff | ||||||
|  |         k *= m | ||||||
|  |         k &= 0xffffffff | ||||||
|  |  | ||||||
|  |         h *= m | ||||||
|  |         h &= 0xffffffff | ||||||
|  |         h ^= k | ||||||
|  |         h &= 0xffffffff | ||||||
|  |  | ||||||
|  |     # Handle the last few bytes of the input array | ||||||
|  |     extra_bytes = length % 4 | ||||||
|  |     if extra_bytes == 3: | ||||||
|  |         h ^= (data[(length & ~3) + 2] & 0xff) << 16 | ||||||
|  |         h &= 0xffffffff | ||||||
|  |  | ||||||
|  |     if extra_bytes == 2: | ||||||
|  |         h ^= (data[(length & ~3) + 1] & 0xff) << 8 | ||||||
|  |         h &= 0xffffffff | ||||||
|  |  | ||||||
|  |     if extra_bytes == 1: | ||||||
|  |         h ^= (data[length & ~3] & 0xff) | ||||||
|  |         h &= 0xffffffff | ||||||
|  |         h *= m | ||||||
|  |         h &= 0xffffffff | ||||||
|  |  | ||||||
|  |     h ^= (h % 0x100000000) >> 13 # h >>> 13; | ||||||
|  |     h &= 0xffffffff | ||||||
|  |     h *= m | ||||||
|  |     h &= 0xffffffff | ||||||
|  |     h ^= (h % 0x100000000) >> 15 # h >>> 15; | ||||||
|  |     h &= 0xffffffff | ||||||
|  |  | ||||||
|  |     return h | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Dana Powers
					Dana Powers