diff --git a/.gitignore b/.gitignore
index ee93232c..3bdda58a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,7 +18,7 @@ setuptools*.tar.gz
 setuptools*.egg
 
 cassandra/*.c
-!cassandra/murmur3.c
+!cassandra/cmurmur3.c
 
 # OSX
 .DS_Store
diff --git a/cassandra/murmur3.c b/cassandra/cmurmur3.c
similarity index 93%
rename from cassandra/murmur3.c
rename to cassandra/cmurmur3.c
index dc98b491..0ffc3ad5 100644
--- a/cassandra/murmur3.c
+++ b/cassandra/cmurmur3.c
@@ -137,7 +137,6 @@ int64_t MurmurHash3_x64_128 (const void * key, const int len,
     k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
 
     h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
-
   }
 
   //----------
@@ -212,51 +211,51 @@ murmur3(PyObject *self, PyObject *args)
     return (PyObject *) PyLong_FromLongLong(result);
 }
 
-static PyMethodDef murmur3_methods[] = {
+static PyMethodDef cmurmur3_methods[] = {
     {"murmur3", murmur3, METH_VARARGS, "Make an x64 murmur3 64-bit hash value"},
     {NULL, NULL, 0, NULL}
 };
 
 #if PY_MAJOR_VERSION >= 3
 
-static int murmur3_traverse(PyObject *m, visitproc visit, void *arg) {
+static int cmurmur3_traverse(PyObject *m, visitproc visit, void *arg) {
     Py_VISIT(GETSTATE(m)->error);
     return 0;
 }
 
-static int murmur3_clear(PyObject *m) {
+static int cmurmur3_clear(PyObject *m) {
     Py_CLEAR(GETSTATE(m)->error);
     return 0;
 }
 
 static struct PyModuleDef moduledef = {
     PyModuleDef_HEAD_INIT,
-    "murmur3",
+    "cmurmur3",
     NULL,
     sizeof(struct module_state),
-    murmur3_methods,
+    cmurmur3_methods,
     NULL,
-    murmur3_traverse,
-    murmur3_clear,
+    cmurmur3_traverse,
+    cmurmur3_clear,
     NULL
 };
 
 #define INITERROR return NULL
 
 PyObject *
-PyInit_murmur3(void)
+PyInit_cmurmur3(void)
 
 #else
 #define INITERROR return
 
 void
-initmurmur3(void)
+initcmurmur3(void)
 #endif
 {
 #if PY_MAJOR_VERSION >= 3
     PyObject *module = PyModule_Create(&moduledef);
 #else
-    PyObject *module = Py_InitModule("murmur3", murmur3_methods);
+    PyObject *module = Py_InitModule("cmurmur3", cmurmur3_methods);
 #endif
     struct module_state *st = NULL;
 
@@ -264,7 +263,7 @@ initmurmur3(void)
         INITERROR;
     st = GETSTATE(module);
 
-    st->error = PyErr_NewException("murmur3.Error", NULL, NULL);
+    st->error = PyErr_NewException("cmurmur3.Error", NULL, NULL);
     if (st->error == NULL) {
         Py_DECREF(module);
         INITERROR;
diff --git a/cassandra/murmur3.py b/cassandra/murmur3.py
new file mode 100644
index 00000000..61180c01
--- /dev/null
+++ b/cassandra/murmur3.py
@@ -0,0 +1,126 @@
+from six.moves import range
+import struct
+
+
+def body_and_tail(data):
+    """Split *data* into 64-bit body words and signed tail bytes.
+
+    Returns ``(body, tail, length)``: ``body`` holds two signed 64-bit
+    integers per complete 16-byte block, ``tail`` holds the remaining
+    ``length % 16`` bytes as signed 8-bit integers, and ``length`` is
+    ``len(data)``.
+    """
+    l = len(data)
+    nblocks = l // 16
+    tail = l % 16
+    if nblocks:
+        # '<' pins the 64-bit words to little-endian so the result does not
+        # depend on the byte order of the host
+        return struct.unpack_from('<' + 'qq' * nblocks, data), struct.unpack_from('b' * tail, data, -tail), l
+    else:
+        return tuple(), struct.unpack_from('b' * tail, data, -tail), l
+
+
+def rotl64(x, r):
+    """Rotate *x* left by *r* bits as a 64-bit value; bits above 64 are not cleared."""
+    # note: not a general-purpose function because it leaves the high-order bits intact
+    # suitable for this use case without wasting cycles
+    mask = 2 ** r - 1
+    rotated = (x << r) | ((x >> 64 - r) & mask)
+    return rotated
+
+
+def fmix(k):
+    """Apply the three xor-shift / multiply finalization rounds to *k*."""
+    # masking off the 31 bits that would be left over after >> 33 of a 64-bit number
+    k ^= (k >> 33) & 0x7fffffff
+    k *= 0xff51afd7ed558ccd
+    k ^= (k >> 33) & 0x7fffffff
+    k *= 0xc4ceb9fe1a85ec53
+    k ^= (k >> 33) & 0x7fffffff
+    return k
+
+
+INT64_MAX = int(2 ** 63 - 1)
+INT64_MIN = -INT64_MAX - 1
+INT64_OVF_OFFSET = INT64_MAX + 1
+INT64_OVF_DIV = 2 * INT64_OVF_OFFSET
+
+
+def truncate_int64(x):
+    """Wrap *x* into the signed 64-bit range [INT64_MIN, INT64_MAX]."""
+    if not INT64_MIN <= x <= INT64_MAX:
+        x = (x + INT64_OVF_OFFSET) % INT64_OVF_DIV - INT64_OVF_OFFSET
+    return x
+
+
+def _murmur3(data):
+    """Pure-Python murmur3 hash of the bytes *data*, returned as a signed 64-bit int."""
+
+    h1 = h2 = 0
+
+    c1 = -8663945395140668459  # 0x87c37b91114253d5
+    c2 = 0x4cf5ad432745937f
+
+    body, tail, total_len = body_and_tail(data)
+
+    # body
+    for i in range(0, len(body), 2):
+        k1 = body[i]
+        k2 = body[i + 1]
+
+        k1 *= c1
+        k1 = rotl64(k1, 31)
+        k1 *= c2
+        h1 ^= k1
+
+        h1 = rotl64(h1, 27)
+        h1 += h2
+        h1 = h1 * 5 + 0x52dce729
+
+        k2 *= c2
+        k2 = rotl64(k2, 33)
+        k2 *= c1
+        h2 ^= k2
+
+        h2 = rotl64(h2, 31)
+        h2 += h1
+        h2 = h2 * 5 + 0x38495ab5
+
+    # tail
+    k1 = k2 = 0
+    len_tail = len(tail)
+    if len_tail > 8:
+        for i in range(len_tail - 1, 7, -1):
+            k2 ^= tail[i] << (i - 8) * 8
+        k2 *= c2
+        k2 = rotl64(k2, 33)
+        k2 *= c1
+        h2 ^= k2
+
+    if len_tail:
+        for i in range(min(7, len_tail - 1), -1, -1):
+            k1 ^= tail[i] << i * 8
+        k1 *= c1
+        k1 = rotl64(k1, 31)
+        k1 *= c2
+        h1 ^= k1
+
+    # finalization
+    h1 ^= total_len
+    h2 ^= total_len
+
+    h1 += h2
+    h2 += h1
+
+    h1 = fmix(h1)
+    h2 = fmix(h2)
+
+    h1 += h2
+
+    return truncate_int64(h1)
+
+try:
+    from cassandra.cmurmur3 import murmur3
+except ImportError:
+    murmur3 = _murmur3
diff --git a/setup.py b/setup.py
index 67594036..d27cf165 100644
--- a/setup.py
+++ b/setup.py
@@ -121,8 +121,8 @@ class BuildFailed(Exception):
         self.ext = ext
 
 
-murmur3_ext = Extension('cassandra.murmur3',
-                        sources=['cassandra/murmur3.c'])
+murmur3_ext = Extension('cassandra.cmurmur3',
+                        sources=['cassandra/cmurmur3.c'])
 
 libev_ext = Extension('cassandra.io.libevwrapper',
                       sources=['cassandra/io/libevwrapper.c'],
diff --git a/tests/unit/test_metadata.py b/tests/unit/test_metadata.py
index 8bdd428c..8bfad5ff 100644
--- a/tests/unit/test_metadata.py
+++ b/tests/unit/test_metadata.py
@@ -18,6 +18,9 @@ except ImportError:
     import unittest  # noqa
 
 from mock import Mock
+import os
+import random
+import six
 
 import cassandra
 from cassandra.cqltypes import IntegerType, AsciiType, TupleType
@@ -215,19 +218,46 @@ class NameEscapingTest(unittest.TestCase):
             self.assertEqual(is_valid_name(keyword), False)
 
 
-class TokensTest(unittest.TestCase):
+class Murmur3TokensTest(unittest.TestCase):
 
-    def test_murmur3_tokens(self):
+    def test_murmur3_init(self):
+        murmur3_token = Murmur3Token(cassandra.metadata.MIN_LONG - 1)
+        self.assertEqual(str(murmur3_token), '<Murmur3Token: -9223372036854775809>')
+
+    def test_python_vs_c(self):
+        from cassandra.murmur3 import _murmur3 as mm3_python
         try:
-            murmur3_token = Murmur3Token(cassandra.metadata.MIN_LONG - 1)
-            self.assertEqual(murmur3_token.hash_fn('123'), -7468325962851647638)
-            self.assertEqual(murmur3_token.hash_fn(b'\x00\xff\x10\xfa\x99' * 10), 5837342703291459765)
-            self.assertEqual(murmur3_token.hash_fn(b'\xfe' * 8), -8927430733708461935)
-            self.assertEqual(murmur3_token.hash_fn(b'\x10' * 8), 1446172840243228796)
-            self.assertEqual(murmur3_token.hash_fn(str(cassandra.metadata.MAX_LONG)), 7162290910810015547)
-            self.assertEqual(str(murmur3_token), '<Murmur3Token: -9223372036854775809>')
-        except NoMurmur3:
-            raise unittest.SkipTest('The murmur3 extension is not available')
+            from cassandra.cmurmur3 import murmur3 as mm3_c
+
+            iterations = 100
+            for _ in range(iterations):
+                for length in range(0, 32):  # zero to one block plus full range of tail lengths
+                    key = os.urandom(length)
+                    self.assertEqual(mm3_python(key), mm3_c(key))
+
+        except ImportError:
+            raise unittest.SkipTest('The cmurmur3 extension is not available')
+
+    def test_murmur3_python(self):
+        from cassandra.murmur3 import _murmur3
+        self._verify_hash(_murmur3)
+
+    def test_murmur3_c(self):
+        try:
+            from cassandra.cmurmur3 import murmur3
+            self._verify_hash(murmur3)
+        except ImportError:
+            raise unittest.SkipTest('The cmurmur3 extension is not available')
+
+    def _verify_hash(self, fn):
+        self.assertEqual(fn(six.b('123')), -7468325962851647638)
+        self.assertEqual(fn(b'\x00\xff\x10\xfa\x99' * 10), 5837342703291459765)
+        self.assertEqual(fn(b'\xfe' * 8), -8927430733708461935)
+        self.assertEqual(fn(b'\x10' * 8), 1446172840243228796)
+        self.assertEqual(fn(six.b(str(cassandra.metadata.MAX_LONG))), 7162290910810015547)
+
+
+class MD5TokensTest(unittest.TestCase):
 
     def test_md5_tokens(self):
         md5_token = MD5Token(cassandra.metadata.MIN_LONG - 1)
@@ -235,6 +265,9 @@ class TokensTest(unittest.TestCase):
         self.assertEqual(md5_token.hash_fn(str(cassandra.metadata.MAX_LONG)), 28528976619278518853815276204542453639)
         self.assertEqual(str(md5_token), '<MD5Token: %s>' % -9223372036854775809)
 
+
+class BytesTokensTest(unittest.TestCase):
+
     def test_bytes_tokens(self):
         bytes_token = BytesToken(str(cassandra.metadata.MIN_LONG - 1))
         self.assertEqual(bytes_token.hash_fn('123'), '123')