Merge pull request #381 from datastax/363

PYTHON-363 - Pure Python Murmur3 Implementation
This commit is contained in:
Adam Holmberg
2015-08-12 15:56:27 -05:00
5 changed files with 171 additions and 26 deletions

2
.gitignore vendored
View File

@@ -18,7 +18,7 @@ setuptools*.tar.gz
setuptools*.egg setuptools*.egg
cassandra/*.c cassandra/*.c
!cassandra/murmur3.c !cassandra/cmurmur3.c
# OSX # OSX
.DS_Store .DS_Store

View File

@@ -137,7 +137,6 @@ int64_t MurmurHash3_x64_128 (const void * key, const int len,
k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
} }
//---------- //----------
@@ -212,51 +211,51 @@ murmur3(PyObject *self, PyObject *args)
return (PyObject *) PyLong_FromLongLong(result); return (PyObject *) PyLong_FromLongLong(result);
} }
static PyMethodDef murmur3_methods[] = { static PyMethodDef cmurmur3_methods[] = {
{"murmur3", murmur3, METH_VARARGS, "Make an x64 murmur3 64-bit hash value"}, {"murmur3", murmur3, METH_VARARGS, "Make an x64 murmur3 64-bit hash value"},
{NULL, NULL, 0, NULL} {NULL, NULL, 0, NULL}
}; };
#if PY_MAJOR_VERSION >= 3 #if PY_MAJOR_VERSION >= 3
static int murmur3_traverse(PyObject *m, visitproc visit, void *arg) { static int cmurmur3_traverse(PyObject *m, visitproc visit, void *arg) {
Py_VISIT(GETSTATE(m)->error); Py_VISIT(GETSTATE(m)->error);
return 0; return 0;
} }
static int murmur3_clear(PyObject *m) { static int cmurmur3_clear(PyObject *m) {
Py_CLEAR(GETSTATE(m)->error); Py_CLEAR(GETSTATE(m)->error);
return 0; return 0;
} }
static struct PyModuleDef moduledef = { static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT, PyModuleDef_HEAD_INIT,
"murmur3", "cmurmur3",
NULL, NULL,
sizeof(struct module_state), sizeof(struct module_state),
murmur3_methods, cmurmur3_methods,
NULL, NULL,
murmur3_traverse, cmurmur3_traverse,
murmur3_clear, cmurmur3_clear,
NULL NULL
}; };
#define INITERROR return NULL #define INITERROR return NULL
PyObject * PyObject *
PyInit_murmur3(void) PyInit_cmurmur3(void)
#else #else
#define INITERROR return #define INITERROR return
void void
initmurmur3(void) initcmurmur3(void)
#endif #endif
{ {
#if PY_MAJOR_VERSION >= 3 #if PY_MAJOR_VERSION >= 3
PyObject *module = PyModule_Create(&moduledef); PyObject *module = PyModule_Create(&moduledef);
#else #else
PyObject *module = Py_InitModule("murmur3", murmur3_methods); PyObject *module = Py_InitModule("cmurmur3", cmurmur3_methods);
#endif #endif
struct module_state *st = NULL; struct module_state *st = NULL;
@@ -264,7 +263,7 @@ initmurmur3(void)
INITERROR; INITERROR;
st = GETSTATE(module); st = GETSTATE(module);
st->error = PyErr_NewException("murmur3.Error", NULL, NULL); st->error = PyErr_NewException("cmurmur3.Error", NULL, NULL);
if (st->error == NULL) { if (st->error == NULL) {
Py_DECREF(module); Py_DECREF(module);
INITERROR; INITERROR;

113
cassandra/murmur3.py Normal file
View File

@@ -0,0 +1,113 @@
from six.moves import range
import struct
def body_and_tail(data):
    """Split a key into 16-byte blocks and the leftover trailing bytes.

    :param data: bytes-like key to hash.
    :returns: a 3-tuple ``(body, tail, total_len)``. ``body`` is a flat tuple
        of signed 64-bit integers, two per complete 16-byte block (empty when
        the key is shorter than 16 bytes). ``tail`` is a tuple holding the
        remaining 0-15 bytes as signed 8-bit integers. ``total_len`` is
        ``len(data)``.
    """
    total_len = len(data)
    nblocks, tail_len = divmod(total_len, 16)
    if nblocks:
        # '<' pins little-endian, standard-size decoding so the block values
        # are the same on little- and big-endian hosts (the bare 'q' code
        # would use the native byte order of the running machine).
        body = struct.unpack_from('<' + 'qq' * nblocks, data)
    else:
        body = tuple()
    # A negative offset counts from the end of the buffer; with no tail the
    # format string is empty and the result is an empty tuple.
    tail = struct.unpack_from('b' * tail_len, data, -tail_len)
    return body, tail, total_len
def rotl64(x, r):
    """Rotate the low 64 bits of ``x`` left by ``r`` positions.

    Not a general-purpose rotate: the bits pushed above position 63 by the
    left shift are left in place rather than cleared. That is sufficient for
    the hashing code in this module and avoids an extra masking step.
    """
    # bits [64 - r, 64) of x wrap around into the low r positions
    wrapped = (x >> 64 - r) & (2 ** r - 1)
    return (x << r) | wrapped
def fmix(k):
    """Apply the finalization mix to one half of the hash state.

    Each right shift by 33 is masked to 31 bits, which is all that remains
    of a 64-bit value after such a shift; this keeps any bits above
    position 63 (or sign extension) out of the result's low 64 bits.
    """
    for multiplier in (0xff51afd7ed558ccd, 0xc4ceb9fe1a85ec53):
        k ^= (k >> 33) & 0x7fffffff
        k *= multiplier
    k ^= (k >> 33) & 0x7fffffff
    return k
# Bounds of a signed 64-bit integer, plus the offset and modulus used to wrap
# out-of-range values back into that range.
INT64_MAX = int(2 ** 63 - 1)
INT64_MIN = -INT64_MAX - 1
INT64_OVF_OFFSET = INT64_MAX + 1
INT64_OVF_DIV = 2 * INT64_OVF_OFFSET


def truncate_int64(x):
    """Wrap ``x`` into the signed 64-bit range, leaving in-range values as-is."""
    if INT64_MIN <= x <= INT64_MAX:
        return x
    # shift into [0, 2**64), reduce, then shift back to the signed range
    return (x + INT64_OVF_OFFSET) % INT64_OVF_DIV - INT64_OVF_OFFSET
def _murmur3(data):
    """Pure-Python murmur3 hash of ``data``, returned as a signed 64-bit int.

    :param data: bytes-like key to hash.
    :returns: the hash value wrapped into ``[-2**63, 2**63 - 1]`` by
        ``truncate_int64``.

    Python integers do not overflow, so intermediate values may carry bits
    above position 63. Those bits never influence the low 64 bits: the
    multiplications, additions, XORs and left shifts only move information
    upward, and the right shifts in ``rotl64``/``fmix`` are masked.
    """
    h1 = h2 = 0

    c1 = -8663945395140668459  # 0x87c37b91114253d5
    c2 = 0x4cf5ad432745937f
    mask64 = 0xffffffffffffffff

    body, tail, total_len = body_and_tail(data)

    # body: consume the unpacked values two at a time (one 16-byte block)
    values = iter(body)
    for k1, k2 in zip(values, values):
        k1 *= c1
        k1 = rotl64(k1, 31)
        k1 *= c2
        h1 ^= k1

        h1 = rotl64(h1, 27)
        h1 += h2
        h1 = h1 * 5 + 0x52dce729

        k2 *= c2
        k2 = rotl64(k2, 33)
        k2 *= c1
        h2 ^= k2

        h2 = rotl64(h2, 31)
        h2 += h1
        h2 = h2 * 5 + 0x38495ab5

        # Discard the bits above position 63. The result is unaffected, but
        # without this the accumulators grow with every block and hashing a
        # long key degrades to quadratic time.
        h1 &= mask64
        h2 &= mask64

    # tail: bytes 8..14 feed k2, bytes 0..7 feed k1 (tail bytes are signed)
    k1 = k2 = 0
    len_tail = len(tail)
    if len_tail > 8:
        for pos, byte in enumerate(tail[8:]):
            k2 ^= byte << (pos * 8)
        k2 *= c2
        k2 = rotl64(k2, 33)
        k2 *= c1
        h2 ^= k2

    if len_tail:
        for pos, byte in enumerate(tail[:8]):
            k1 ^= byte << (pos * 8)
        k1 *= c1
        k1 = rotl64(k1, 31)
        k1 *= c2
        h1 ^= k1

    # finalization
    h1 ^= total_len
    h2 ^= total_len

    h1 += h2
    h2 += h1

    h1 = fmix(h1)
    h2 = fmix(h2)

    h1 += h2

    return truncate_int64(h1)
# Prefer the compiled cmurmur3 extension; when it cannot be imported, expose
# the pure-Python implementation under the same public name.
try:
    from cassandra.cmurmur3 import murmur3
except ImportError:
    murmur3 = _murmur3

View File

@@ -121,8 +121,8 @@ class BuildFailed(Exception):
self.ext = ext self.ext = ext
murmur3_ext = Extension('cassandra.murmur3', murmur3_ext = Extension('cassandra.cmurmur3',
sources=['cassandra/murmur3.c']) sources=['cassandra/cmurmur3.c'])
libev_ext = Extension('cassandra.io.libevwrapper', libev_ext = Extension('cassandra.io.libevwrapper',
sources=['cassandra/io/libevwrapper.c'], sources=['cassandra/io/libevwrapper.c'],

View File

@@ -18,6 +18,9 @@ except ImportError:
import unittest # noqa import unittest # noqa
from mock import Mock from mock import Mock
import os
import random
import six
import cassandra import cassandra
from cassandra.cqltypes import IntegerType, AsciiType, TupleType from cassandra.cqltypes import IntegerType, AsciiType, TupleType
@@ -215,19 +218,46 @@ class NameEscapingTest(unittest.TestCase):
self.assertEqual(is_valid_name(keyword), False) self.assertEqual(is_valid_name(keyword), False)
class TokensTest(unittest.TestCase): class Murmur3TokensTest(unittest.TestCase):
def test_murmur3_tokens(self): def test_murmur3_init(self):
murmur3_token = Murmur3Token(cassandra.metadata.MIN_LONG - 1)
self.assertEqual(str(murmur3_token), '<Murmur3Token: -9223372036854775809>')
def test_python_vs_c(self):
from cassandra.murmur3 import _murmur3 as mm3_python
try: try:
murmur3_token = Murmur3Token(cassandra.metadata.MIN_LONG - 1) from cassandra.cmurmur3 import murmur3 as mm3_c
self.assertEqual(murmur3_token.hash_fn('123'), -7468325962851647638)
self.assertEqual(murmur3_token.hash_fn(b'\x00\xff\x10\xfa\x99' * 10), 5837342703291459765) iterations = 100
self.assertEqual(murmur3_token.hash_fn(b'\xfe' * 8), -8927430733708461935) for _ in range(iterations):
self.assertEqual(murmur3_token.hash_fn(b'\x10' * 8), 1446172840243228796) for len in range(0, 32): # zero to one block plus full range of tail lengths
self.assertEqual(murmur3_token.hash_fn(str(cassandra.metadata.MAX_LONG)), 7162290910810015547) key = os.urandom(len)
self.assertEqual(str(murmur3_token), '<Murmur3Token: -9223372036854775809>') self.assertEqual(mm3_python(key), mm3_c(key))
except NoMurmur3:
raise unittest.SkipTest('The murmur3 extension is not available') except ImportError:
raise unittest.SkipTest('The cmurmur3 extension is not available')
def test_murmur3_python(self):
    """Check the pure-Python murmur3 implementation against the known values."""
    from cassandra.murmur3 import _murmur3 as pure_python_murmur3
    self._verify_hash(pure_python_murmur3)
def test_murmur3_c(self):
    """Check the C murmur3 extension against the known values.

    Skipped when the ``cassandra.cmurmur3`` extension cannot be imported.
    """
    # Only the import is guarded, so an ImportError raised while verifying
    # the hashes is reported as an error rather than as a skip.
    try:
        from cassandra.cmurmur3 import murmur3
    except ImportError:
        raise unittest.SkipTest('The cmurmur3 extension is not available')
    self._verify_hash(murmur3)
def _verify_hash(self, fn):
    """Assert that ``fn`` returns the expected value for each fixed key."""
    expected_by_key = (
        (six.b('123'), -7468325962851647638),
        (b'\x00\xff\x10\xfa\x99' * 10, 5837342703291459765),
        (b'\xfe' * 8, -8927430733708461935),
        (b'\x10' * 8, 1446172840243228796),
        (six.b(str(cassandra.metadata.MAX_LONG)), 7162290910810015547),
    )
    for key, expected in expected_by_key:
        self.assertEqual(fn(key), expected)
class MD5TokensTest(unittest.TestCase):
def test_md5_tokens(self): def test_md5_tokens(self):
md5_token = MD5Token(cassandra.metadata.MIN_LONG - 1) md5_token = MD5Token(cassandra.metadata.MIN_LONG - 1)
@@ -235,6 +265,9 @@ class TokensTest(unittest.TestCase):
self.assertEqual(md5_token.hash_fn(str(cassandra.metadata.MAX_LONG)), 28528976619278518853815276204542453639) self.assertEqual(md5_token.hash_fn(str(cassandra.metadata.MAX_LONG)), 28528976619278518853815276204542453639)
self.assertEqual(str(md5_token), '<MD5Token: %s>' % -9223372036854775809) self.assertEqual(str(md5_token), '<MD5Token: %s>' % -9223372036854775809)
class BytesTokensTest(unittest.TestCase):
def test_bytes_tokens(self): def test_bytes_tokens(self):
bytes_token = BytesToken(str(cassandra.metadata.MIN_LONG - 1)) bytes_token = BytesToken(str(cassandra.metadata.MIN_LONG - 1))
self.assertEqual(bytes_token.hash_fn('123'), '123') self.assertEqual(bytes_token.hash_fn('123'), '123')