From d97562d506b035e3072f83cbd8d88a4c73be54f7 Mon Sep 17 00:00:00 2001 From: Adam Holmberg Date: Wed, 5 Aug 2015 15:54:54 -0500 Subject: [PATCH] Pure python murmur3 fallback initial implementation TBD: - Python 3 - Possibly some profiling and optimization --- cassandra/murmur3.c | 276 ------------------------------------------- cassandra/murmur3.py | 119 +++++++++++++++++++ setup.py | 4 +- 3 files changed, 121 insertions(+), 278 deletions(-) delete mode 100644 cassandra/murmur3.c create mode 100644 cassandra/murmur3.py diff --git a/cassandra/murmur3.c b/cassandra/murmur3.c deleted file mode 100644 index dc98b491..00000000 --- a/cassandra/murmur3.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * The majority of this code was taken from the python-smhasher library, - * which can be found here: https://github.com/phensley/python-smhasher - * - * That library is under the MIT license with the following copyright: - * - * Copyright (c) 2011 Austin Appleby (Murmur3 routine) - * Copyright (c) 2011 Patrick Hensley (Python wrapper, packaging) - * Copyright 2013 DataStax (Minor modifications to match Cassandra's MM3 hashes) - * - */ - -#define PY_SSIZE_T_CLEAN 1 -#include -#include - -#if PY_VERSION_HEX < 0x02050000 -typedef int Py_ssize_t; -#define PY_SSIZE_T_MAX INT_MAX -#define PY_SSIZE_T_MIN INT_MIN -#endif - -#ifdef PYPY_VERSION -#define COMPILING_IN_PYPY 1 -#define COMPILING_IN_CPYTHON 0 -#else -#define COMPILING_IN_PYPY 0 -#define COMPILING_IN_CPYTHON 1 -#endif -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) - -typedef unsigned char uint8_t; -typedef unsigned long uint32_t; -typedef unsigned __int64 uint64_t; - -typedef char int8_t; -typedef long int32_t; -typedef __int64 int64_t; - -#define FORCE_INLINE __forceinline - -#include - -#define ROTL32(x,y) _rotl(x,y) -#define ROTL64(x,y) _rotl64(x,y) - -#define BIG_CONSTANT(x) (x) - -// Other compilers - -#else // defined(_MSC_VER) - -#include - -#define FORCE_INLINE inline __attribute__((always_inline)) - -inline uint32_t rotl32 ( int32_t x, int8_t r ) -{ - // cast to unsigned for logical right bitshift (to match C* MM3 implementation) - return (x << r) | ((int32_t) (((uint32_t) x) >> (32 - r))); -} - -inline int64_t rotl64 ( int64_t x, int8_t r ) -{ - // cast to unsigned for logical right bitshift (to match C* MM3 implementation) - return (x << r) | ((int64_t) (((uint64_t) x) >> (64 - r))); -} - -#define ROTL32(x,y) rotl32(x,y) -#define ROTL64(x,y) rotl64(x,y) - -#define BIG_CONSTANT(x) (x##LL) - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- -// Block read - if your platform needs to do endian-swapping or can only -// handle aligned reads, do the conversion here - -// TODO 32bit? - -FORCE_INLINE int64_t getblock ( const int64_t * p, int i ) -{ - return p[i]; -} - -//----------------------------------------------------------------------------- -// Finalization mix - force all bits of a hash block to avalanche - -FORCE_INLINE int64_t fmix ( int64_t k ) -{ - // cast to unsigned for logical right bitshift (to match C* MM3 implementation) - k ^= ((uint64_t) k) >> 33; - k *= BIG_CONSTANT(0xff51afd7ed558ccd); - k ^= ((uint64_t) k) >> 33; - k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); - k ^= ((uint64_t) k) >> 33; - - return k; -} - -int64_t MurmurHash3_x64_128 (const void * key, const int len, - const uint32_t seed) -{ - const int8_t * data = (const int8_t*)key; - const int nblocks = len / 16; - - int64_t h1 = seed; - int64_t h2 = seed; - - int64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); - int64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); - int64_t k1 = 0; - int64_t k2 = 0; - - const int64_t * blocks = (const int64_t *)(data); - const int8_t * tail = (const int8_t*)(data + nblocks*16); - - //---------- - // body - - int i; - for(i = 0; i < nblocks; i++) - { - int64_t k1 = getblock(blocks,i*2+0); - int64_t k2 = getblock(blocks,i*2+1); - - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - - h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; - - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; - - } - - //---------- - // tail - switch(len & 15) - { - case 15: k2 ^= ((int64_t) (tail[14])) << 48; - case 14: k2 ^= ((int64_t) (tail[13])) << 40; - case 13: k2 ^= ((int64_t) (tail[12])) << 32; - case 12: k2 ^= ((int64_t) (tail[11])) << 24; - case 11: k2 ^= ((int64_t) (tail[10])) << 16; - case 10: k2 ^= ((int64_t) (tail[ 9])) << 8; - case 9: k2 ^= ((int64_t) (tail[ 8])) << 0; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - case 8: k1 ^= ((int64_t) (tail[ 7])) << 56; - case 7: k1 ^= ((int64_t) (tail[ 6])) << 48; - case 6: k1 ^= ((int64_t) (tail[ 5])) << 40; - case 5: k1 ^= ((int64_t) (tail[ 4])) << 32; - case 4: k1 ^= ((int64_t) (tail[ 3])) << 24; - case 3: k1 ^= ((int64_t) (tail[ 2])) << 16; - case 2: k1 ^= ((int64_t) (tail[ 1])) << 8; - case 1: k1 ^= ((int64_t) (tail[ 0])) << 0; - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix(h1); - h2 = fmix(h2); - - h1 += h2; - h2 += h1; - - return h1; -} - - -struct module_state { - PyObject *error; -}; - -// pypy3 doesn't have GetState yet. -#if COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3 -#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) -#else -#define GETSTATE(m) (&_state) -static struct module_state _state; -#endif - -static PyObject * -murmur3(PyObject *self, PyObject *args) -{ - const char *key; - Py_ssize_t len; - uint32_t seed = 0; - int64_t result = 0; - - - if (!PyArg_ParseTuple(args, "s#|I", &key, &len, &seed)) { - return NULL; - } - - // TODO handle x86 version? - result = MurmurHash3_x64_128((void *)key, len, seed); - return (PyObject *) PyLong_FromLongLong(result); -} - -static PyMethodDef murmur3_methods[] = { - {"murmur3", murmur3, METH_VARARGS, "Make an x64 murmur3 64-bit hash value"}, - {NULL, NULL, 0, NULL} -}; - -#if PY_MAJOR_VERSION >= 3 - -static int murmur3_traverse(PyObject *m, visitproc visit, void *arg) { - Py_VISIT(GETSTATE(m)->error); - return 0; -} - -static int murmur3_clear(PyObject *m) { - Py_CLEAR(GETSTATE(m)->error); - return 0; -} - -static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - "murmur3", - NULL, - sizeof(struct module_state), - murmur3_methods, - NULL, - murmur3_traverse, - murmur3_clear, - NULL -}; - -#define INITERROR return NULL - -PyObject * -PyInit_murmur3(void) - -#else -#define INITERROR return - -void -initmurmur3(void) -#endif -{ -#if PY_MAJOR_VERSION >= 3 - PyObject *module = PyModule_Create(&moduledef); -#else - PyObject *module = Py_InitModule("murmur3", murmur3_methods); -#endif - struct module_state *st = NULL; - - if (module == NULL) - INITERROR; - st = GETSTATE(module); - - st->error = PyErr_NewException("murmur3.Error", NULL, NULL); - if (st->error == NULL) { - Py_DECREF(module); - INITERROR; - } - -#if PY_MAJOR_VERSION >= 3 - return module; -#endif -} diff --git a/cassandra/murmur3.py b/cassandra/murmur3.py new file mode 100644 index 00000000..8cbc7396 --- /dev/null +++ b/cassandra/murmur3.py @@ -0,0 +1,119 @@ +import six +from six.moves import range + +if six.PY3: + def blocks_and_tail(data, nblocks): + # use array 'q' here? + pass +else: + import struct + + def body_and_tail(data): + l = len(data) + nblocks = l // 16 + tail = l % 16 + if nblocks: + return struct.unpack_from('qq' * nblocks, data), struct.unpack_from('b' * tail, data, -tail), l + # TODO: try unpack generator vs built formatter + # TODO: try unpack pairs in generator ^^ + else: + return tuple(), struct.unpack_from('b' * tail, data, -tail), l + +INT64_MAX = 2 ** 63 - 1 +# TODO: better if these are int or long constants? +INT64_MIN = -INT64_MAX - 1 + +INT64_OVF_OFFSET = INT64_MAX + 1 +INT64_OVF_DIV = 2 * INT64_OVF_OFFSET + + +def rotl64(x, r): + mask = 2 ** r - 1 + rotated = (x << r) | ((x >> 64 - r) & mask) + return rotated + + +def fmix(k): + k ^= (k >> 33) & 0x7fffffff + k *= 0xff51afd7ed558ccd + k ^= (k >> 33) & 0x7fffffff + k *= 0xc4ceb9fe1a85ec53 + k ^= (k >> 33) & 0x7fffffff + return k + + +def truncate_int64(x): + if not INT64_MIN <= x <= INT64_MAX: + x = (x + INT64_OVF_OFFSET) % INT64_OVF_DIV - INT64_OVF_OFFSET + return x + + +def _murmur3(data): + + h1 = h2 = 0 + + c1 = -8663945395140668459 # 0x87c37b91114253d5 + c2 = 0x4cf5ad432745937f + + body, tail, total_len = body_and_tail(data) + + # body + for i in range(0, len(body), 2): + k1 = body[i] + k2 = body[i + 1] + + k1 *= c1 + k1 = rotl64(k1, 31) + k1 *= c2 + h1 ^= k1 + + h1 = rotl64(h1, 27) + h1 += h2 + h1 = h1 * 5 + 0x52dce729 + + k2 *= c2 + k2 = rotl64(k2, 33) + k2 *= c1 + h2 ^= k2 + + h2 = rotl64(h2, 31) + h2 += h1 + h2 = h2 * 5 + 0x38495ab5 + + # tail + k1 = k2 = 0 + len_tail = len(tail) + if len_tail > 8: + for i in range(len_tail - 1, 7, -1): + k2 ^= tail[i] << (i - 8) * 8 + k2 *= c2 + k2 = rotl64(k2, 33) + k2 *= c1 + h2 ^= k2 + + if len_tail: + for i in range(min(7, len_tail - 1), -1, -1): + k1 ^= tail[i] << i * 8 + k1 *= c1 + k1 = rotl64(k1, 31) + k1 *= c2 + h1 ^= k1 + + # finalization + h1 ^= total_len + h2 ^= total_len + + h1 += h2 + h2 += h1 + + h1 = fmix(h1) + h2 = fmix(h2) + + h1 += h2 + + return truncate_int64(h1) + +try: + from cassandra.cmurmur3 import murmur3 +except ImportError: + murmur3 = _murmur3 diff --git a/setup.py b/setup.py index 67594036..d27cf165 100644 --- a/setup.py +++ b/setup.py @@ -121,8 +121,8 @@ class BuildFailed(Exception): self.ext = ext -murmur3_ext = Extension('cassandra.murmur3', - sources=['cassandra/murmur3.c']) +murmur3_ext = Extension('cassandra.cmurmur3', + sources=['cassandra/cmurmur3.c']) libev_ext = Extension('cassandra.io.libevwrapper', sources=['cassandra/io/libevwrapper.c'],