(Optional) NumPy row parser

2015-07-30 17:02:27 +01:00
parent 81ff98efc2
commit 1a6534b575
5 changed files with 158 additions and 29 deletions
--- a/cassandra/datatypes.pxd
+++ b/cassandra/datatypes.pxd
@@ -1,11 +1,12 @@
-cdef class LLDataType:
-    """
-    Low-level Cassandra datatype
-    """
-
-    cdef Py_ssize_t size
-
-    cdef void deserialize_ptr(self, char *buf, Py_ssize_t size, void *out, protocol_version)
+# cdef class LLDataType:
+#     """
+#     Low-level Cassandra datatype
+#     """
+#
+#     cdef Py_ssize_t size
+#
+#     cdef void deserialize_ptr(self, char *buf, Py_ssize_t size,
+#                               Py_ssize_t index, void *out, protocol_version)

 cdef class DataType:
    cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version)
--- a/cassandra/datatypes.pyx
+++ b/cassandra/datatypes.pyx
@@ -1,29 +1,10 @@
 include 'marshal.pyx'

-from cassandra import cqltypes
-
-
-cdef class LLDataType:
-    cdef void deserialize_ptr(self, char *buf, Py_ssize_t size,
-                              void *out, protocol_version):
-        pass
-
-
 cdef class DataType:
    cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version):
        pass


-cdef class LLInt64(LLDataType):
-    """
-    Low-level Cassandra datatype
-    """
-
-    cdef void deserialize_ptr(self, char *buf, Py_ssize_t size, void *out, protocol_version):
-        cdef int64_t x = int64_unpack(buf)
-        (<int64_t *> out)[0] = x
-
-
 cdef class Int64(DataType):

    cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version):
--- a/cassandra/marshal.pyx
+++ b/cassandra/marshal.pyx
@@ -22,8 +22,8 @@ import math
 from libc.stdint cimport (int8_t, int16_t, int32_t, int64_t,
                          uint8_t, uint16_t, uint32_t, uint64_t)

-assert sys.byteorder in ('little', 'big')
-cdef bint is_little_endian = sys.byteorder == 'little'
+cdef bint is_little_endian
+from cassandra.util import is_little_endian

 # cdef extern from "marshal.h":
 #     cdef str c_string_to_python(char *p, Py_ssize_t len)
--- a/cassandra/numpyparser.pyx
+++ b/cassandra/numpyparser.pyx
@@ -0,0 +1,144 @@
+"""
+This module provider an optional protocol parser that returns
+NumPy arrays.
+
+=============================================================================
+This module should not be imported by any of the main python-driver modules,
+as numpy is an optional dependency.
+=============================================================================
+"""
+
+include "ioutils.pyx"
+
+from libc.stdint cimport uint64_t
+
+from cassandra.rowparser cimport RowParser
+from cassandra.bytesio cimport BytesIOReader
+from cassandra.datatypes cimport DataType
+from cassandra import cqltypes
+
+import numpy as np
+cimport numpy as np
+
+from cassandra.util import is_little_endian
+
+from cpython.ref cimport Py_INCREF, PyObject
+
+cdef extern from "Python.h":
+    # An integer type large enough to hold a pointer
+    ctypedef uint64_t Py_uintptr_t
+
+# ctypedef struct TypeRepr:
+#     Py_ssize_t size
+#     int is_object
+
+ctypedef struct ArrRepr:
+    # TypeRepr typ
+    Py_uintptr_t buf_ptr
+    Py_ssize_t stride
+    int is_object
+
+_cqltype_to_numpy = {
+    cqltypes.LongType:          np.dtype('>i8'),
+    cqltypes.CounterColumnType: np.dtype('>i8'),
+    cqltypes.Int32Type:         np.dtype('>i4'),
+    cqltypes.ShortType:         np.dtype('>i2'),
+    cqltypes.FloatType:         np.dtype('>f4'),
+    cqltypes.DoubleType:        np.dtype('>f8'),
+}
+
+
+# cdef type_repr(coltype):
+#     """
+#     Get a low-level type representation for the cqltype
+#     """
+#     cdef TypeRepr res
+#     if coltype in _cqltype_to_numpy:
+#         dtype = _cqltype_to_numpy[coltype]
+#         res.size = dtype.itemsize
+#         res.is_object = False
+#     else:
+#         res.size = sizeof(PyObject *)
+#         res.is_object = True
+#     return res
+
+
+cdef ArrRepr array_repr(np.ndarray arr, coltype):
+    """
+    Construct a low-level array representation
+    """
+    assert arr.ndim == 1, "Expected a one-dimensional array"
+
+    cdef ArrRepr res
+    # Get the data pointer to the underlying memory of the numpy array
+    res.buf_ptr = arr.ctypes.data
+    res.stride = arr.strides[0]
+    res.is_object = coltype in _cqltype_to_numpy
+    return res
+
+
+cdef class NativeRowParser(RowParser):
+    """
+    This is a row parser that copies bytes into arrays (e.g. NumPy arrays)
+    for types it recognizes, such as int64. Values of other types are
+    converted to objects.
+
+    NOTE: This class is stateful, in that every time unpack_row is called it
+          advanced the pointer into the array by updates the buf_ptr field
+          of self.arrays
+    """
+
+    # ArrRepr contains a 'buf_ptr' field, which is not supported as a memoryview dtype
+    cdef ArrRepr[::1] arrays
+    cdef DataType[::1] datatypes
+    cdef Py_ssize_t size
+
+    def __init__(self, ArrRepr[::1] arrays, DataType[::1] datatypes):
+        self.arrays = arrays
+        self.datatypes = datatypes
+        self.size = len(datatypes)
+
+    cpdef unpack_row(self, BytesIOReader reader, protocol_version):
+        cdef char *buf
+        cdef Py_ssize_t i, bufsize, rowsize = self.size
+        cdef ArrRepr arr
+
+        for i in range(rowsize):
+            buf = get_buf(reader, &bufsize)
+            if buf == NULL:
+                raise ValueError("Unexpected end of stream")
+
+            arr = self.arrays[i]
+
+            if arr.is_object:
+                dt = self.datatypes[i]
+                val = dt.deserialize(buf, bufsize, protocol_version)
+                Py_INCREF(val)
+                (<PyObject **> arr.buf_ptr)[0] = <PyObject *> val
+            else:
+                memcopy(buf, <char *> arr.buf_ptr, bufsize)
+
+            # Update the pointer into the array for the next time
+            self.arrays[i].buf_ptr += arr.stride
+
+
+cdef inline memcopy(char *src, char *dst, Py_ssize_t size):
+    """
+    Our own simple memcopy which can be inlined. This is useful because our data types
+    are only a few bytes.
+    """
+    cdef Py_ssize_t i
+    for i in range(size):
+        dst[i] = src[i]
+
+
+def make_native_byteorder(arr):
+    """
+    Make sure all values have a native endian in the NumPy arrays.
+    """
+    if is_little_endian:
+        # We have arrays in big-endian order. First swap the bytes
+        # into little endian order, and then update the numpy dtype
+        # accordingly (e.g. from '>i8' to '<i8')
+        return arr.byteswap().newbyteorder()
+    return arr
--- a/cassandra/util.py
+++ b/cassandra/util.py
@@ -4,9 +4,12 @@ import datetime
 import random
 import six
 import uuid
+import sys

 DATETIME_EPOC = datetime.datetime(1970, 1, 1)

+assert sys.byteorder in ('little', 'big')
+is_little_endian = sys.byteorder == 'little'

 def datetime_from_timestamp(timestamp):
    """