
Avoid further type changes when Murmur2Partitioner is passed a byte array. Change leftover-bytes logic to match the Java switch statement, and add tests to verify that partition selection matches Java implementation.
111 lines
2.9 KiB
Python
111 lines
2.9 KiB
Python
import six
|
|
|
|
from .base import Partitioner
|
|
|
|
|
|
class Murmur2Partitioner(Partitioner):
    """Key-hash partitioner built on :func:`murmur2`.

    The target partition is derived from the Murmur2 hash of the key, in an
    attempt to pick the same partition the mainline java client would pick
    for that key.
    """

    def partition(self, key, partitions=None):
        """Return the partition selected for ``key``.

        Args:
            key: value handed to :func:`murmur2` to obtain the hash.
            partitions: optional sequence of candidate partitions; when it
                is omitted or empty, ``self.partitions`` is used instead.

        Returns:
            The element of the candidate sequence chosen by the hash.
        """
        candidates = partitions or self.partitions

        # https://github.com/apache/kafka/blob/0.8.2/clients/src/main/java/org/apache/kafka/clients/producer/internals/Partitioner.java#L69
        # Clear the top bit so the 32-bit hash is treated as non-negative
        # before it is reduced modulo the number of candidates.
        positive_hash = murmur2(key) & 0x7fffffff
        return candidates[positive_hash % len(candidates)]
|
|
|
|
|
|
class LegacyPartitioner(Partitioner):
    """DEPRECATED -- See Issue 374

    Key-hash partitioner that selects the target partition from the value
    of the built-in ``hash()`` of the key.
    """

    def partition(self, key, partitions=None):
        """Return the partition selected for ``key``.

        Args:
            key: object passed to the built-in ``hash()``.
            partitions: optional sequence of candidate partitions; when it
                is omitted or empty, ``self.partitions`` is used instead.

        Returns:
            The element of the candidate sequence at index
            ``hash(key) % len(candidates)``.
        """
        candidates = partitions or self.partitions
        return candidates[hash(key) % len(candidates)]


# Default will change to Murmur2 in 0.10 release
HashedPartitioner = LegacyPartitioner
|
|
|
|
|
|
# https://github.com/apache/kafka/blob/0.8.2/clients/src/main/java/org/apache/kafka/common/utils/Utils.java#L244
|
|
def murmur2(key):
    """Pure-python Murmur2 implementation.

    Based on java client, see org.apache.kafka.common.utils.Utils.murmur2

    Args:
        key: the value to hash. ``bytes``, ``bytearray`` and ``memoryview``
            objects are hashed byte-for-byte; any other object is first
            converted with ``str(key).encode()`` (default encoding).

    Returns:
        int: the MurmurHash2 of the key bytes as an unsigned 32-bit value
        (``0 <= result <= 0xffffffff``).
    """
    # Normalise the key to a sequence whose items are ints in 0..255.
    # ``bytes is not str`` is only true on Python 3, where indexing bytes
    # already yields ints and the object can be used without copying.
    if isinstance(key, bytearray) or (bytes is not str and
                                      isinstance(key, bytes)):
        data = key
    elif isinstance(key, (bytes, memoryview)):
        # Python 2 native str (indexing yields 1-char strings) and
        # memoryview objects: copy the raw bytes instead of hashing the
        # result of str(key), which is wrong for these bytes-like types.
        data = bytearray(key)
    else:
        data = bytearray(str(key).encode())

    length = len(data)
    seed = 0x9747b28c
    # 'm' and 'r' are mixing constants generated offline.
    # They're not really 'magic', they just happen to work well.
    m = 0x5bd1e995
    r = 24
    # Python ints are unbounded, so every multiplication is truncated to
    # 32 bits to reproduce Java's int overflow behaviour.
    mask = 0xffffffff

    # Initialize the hash to a random value
    h = seed ^ length

    # Offset of the first byte that is not part of a complete 4-byte word.
    tail = length & ~3

    # Mix 4 bytes at a time into the hash (little-endian word).
    for offset in range(0, tail, 4):
        k = (data[offset] |
             (data[offset + 1] << 8) |
             (data[offset + 2] << 16) |
             (data[offset + 3] << 24))
        k = (k * m) & mask
        k ^= k >> r  # k ^= k >>> r
        k = (k * m) & mask

        h = (h * m) & mask
        h ^= k

    # Handle the last few bytes of the input array. This mirrors the
    # fall-through switch of the Java implementation: the final multiply
    # only happens when at least one leftover byte exists.
    extra_bytes = length % 4
    if extra_bytes >= 3:
        h ^= data[tail + 2] << 16
    if extra_bytes >= 2:
        h ^= data[tail + 1] << 8
    if extra_bytes >= 1:
        h ^= data[tail]
        h = (h * m) & mask

    # Final avalanche.
    h ^= h >> 13  # h >>> 13;
    h = (h * m) & mask
    h ^= h >> 15  # h >>> 15;

    return h
|