From 8afd853ae984bd5ef0fd2369dbd7d2a7c604e01f Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 23 Jul 2015 15:43:57 +0100 Subject: [PATCH 01/70] Add typecodes to module with Cython-compatible .pxd file --- cassandra/protocol.py | 34 ++-------------------- cassandra/typecodes.pxd | 28 +++++++++++++++++++ cassandra/typecodes.py | 62 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 31 deletions(-) create mode 100644 cassandra/typecodes.pxd create mode 100644 cassandra/typecodes.py diff --git a/cassandra/protocol.py b/cassandra/protocol.py index 41439334..a6ce22ec 100644 --- a/cassandra/protocol.py +++ b/cassandra/protocol.py @@ -22,6 +22,7 @@ import six from six.moves import range import io +from cassandra import typecodes from cassandra import (Unavailable, WriteTimeout, ReadTimeout, WriteFailure, ReadFailure, FunctionFailure, AlreadyExists, InvalidRequest, Unauthorized, @@ -35,7 +36,7 @@ from cassandra.cqltypes import (AsciiType, BytesType, BooleanType, DoubleType, FloatType, Int32Type, InetAddressType, IntegerType, ListType, LongType, MapType, SetType, TimeUUIDType, - UTF8Type, UUIDType, UserType, + UTF8Type, VarcharType, UUIDType, UserType, TupleType, lookup_casstype, SimpleDateType, TimeType, ByteType, ShortType) from cassandra.policies import WriteType @@ -531,35 +532,6 @@ RESULT_KIND_SET_KEYSPACE = 0x0003 RESULT_KIND_PREPARED = 0x0004 RESULT_KIND_SCHEMA_CHANGE = 0x0005 -class CassandraTypeCodes(object): - CUSTOM_TYPE = 0x0000 - AsciiType = 0x0001 - LongType = 0x0002 - BytesType = 0x0003 - BooleanType = 0x0004 - CounterColumnType = 0x0005 - DecimalType = 0x0006 - DoubleType = 0x0007 - FloatType = 0x0008 - Int32Type = 0x0009 - UTF8Type = 0x000A - DateType = 0x000B - UUIDType = 0x000C - UTF8Type = 0x000D - IntegerType = 0x000E - TimeUUIDType = 0x000F - InetAddressType = 0x0010 - SimpleDateType = 0x0011 - TimeType = 0x0012 - ShortType = 0x0013 - ByteType = 0x0014 - ListType = 0x0020 - MapType = 0x0021 - SetType = 0x0022 - UserType = 0x0030 - TupleType = 0x0031 - - class ResultMessage(_MessageType): opcode = 0x08 name = 'RESULT' @@ -569,7 +541,7 @@ class ResultMessage(_MessageType): paging_state = None # Names match type name in module scope. Most are imported from cassandra.cqltypes (except CUSTOM_TYPE) - type_codes = _cqltypes_by_code = dict((v, globals()[k]) for k, v in CassandraTypeCodes.__dict__.items() if not k.startswith('_')) + type_codes = _cqltypes_by_code = dict((v, globals()[k]) for k, v in typecodes.__dict__.items() if not k.startswith('_')) _FLAGS_GLOBAL_TABLES_SPEC = 0x0001 _HAS_MORE_PAGES_FLAG = 0x0002 diff --git a/cassandra/typecodes.pxd b/cassandra/typecodes.pxd new file mode 100644 index 00000000..b0405284 --- /dev/null +++ b/cassandra/typecodes.pxd @@ -0,0 +1,28 @@ +cdef enum: + CUSTOM_TYPE + AsciiType + LongType + BytesType + BooleanType + CounterColumnType + DecimalType + DoubleType + FloatType + Int32Type + UTF8Type + DateType + UUIDType + VarcharType + IntegerType + TimeUUIDType + InetAddressType + SimpleDateType + TimeType + ShortType + ByteType + ListType + MapType + SetType + UserType + TupleType + diff --git a/cassandra/typecodes.py b/cassandra/typecodes.py new file mode 100644 index 00000000..651c58d7 --- /dev/null +++ b/cassandra/typecodes.py @@ -0,0 +1,62 @@ +""" +Module with constants for Cassandra type codes. 
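+
+The constant names deliberately match the type class names in
+cassandra.cqltypes, so that cassandra/protocol.py can map each numeric code
+to its class simply by looking the name up in module scope.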
+ +These constants are useful for + + a) mapping messages to cqltypes (cassandra/cqltypes.py) + b) optimizezd dispatching for (de)serialization (cassandra/encoding.py) + +Type codes are repeated here from the Cassandra binary protocol specification: + + 0x0000 Custom: the value is a [string], see above. + 0x0001 Ascii + 0x0002 Bigint + 0x0003 Blob + 0x0004 Boolean + 0x0005 Counter + 0x0006 Decimal + 0x0007 Double + 0x0008 Float + 0x0009 Int + 0x000A Text + 0x000B Timestamp + 0x000C Uuid + 0x000D Varchar + 0x000E Varint + 0x000F Timeuuid + 0x0010 Inet + 0x0020 List: the value is an [option], representing the type + of the elements of the list. + 0x0021 Map: the value is two [option], representing the types of the + keys and values of the map + 0x0022 Set: the value is an [option], representing the type + of the elements of the set +""" + +CUSTOM_TYPE = 0x0000 +AsciiType = 0x0001 +LongType = 0x0002 +BytesType = 0x0003 +BooleanType = 0x0004 +CounterColumnType = 0x0005 +DecimalType = 0x0006 +DoubleType = 0x0007 +FloatType = 0x0008 +Int32Type = 0x0009 +UTF8Type = 0x000A +DateType = 0x000B +UUIDType = 0x000C +VarcharType = 0x000D +IntegerType = 0x000E +TimeUUIDType = 0x000F +InetAddressType = 0x0010 +SimpleDateType = 0x0011 +TimeType = 0x0012 +ShortType = 0x0013 +ByteType = 0x0014 +ListType = 0x0020 +MapType = 0x0021 +SetType = 0x0022 +UserType = 0x0030 +TupleType = 0x0031 + From f0b360a9c718b5d7c74604788a6092870242efcb Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 23 Jul 2015 15:45:28 +0100 Subject: [PATCH 02/70] Cythonize marshalling code --- cassandra/marshal.pxd | 29 ++++++ cassandra/marshal.pyx | 201 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 230 insertions(+) create mode 100644 cassandra/marshal.pxd create mode 100644 cassandra/marshal.pyx diff --git a/cassandra/marshal.pxd b/cassandra/marshal.pxd new file mode 100644 index 00000000..ef7d9858 --- /dev/null +++ b/cassandra/marshal.pxd @@ -0,0 +1,29 @@ +from libc.stdint cimport (int8_t, int16_t, int32_t, int64_t, + uint8_t, uint16_t, uint32_t, uint64_t) + +cpdef bytes int64_pack(int64_t x) +cpdef bytes int32_pack(int32_t x) +cpdef bytes int16_pack(int16_t x) +cpdef bytes int8_pack(int8_t x) + +cpdef int64_t int64_unpack(const char *buf) +cpdef int32_t int32_unpack(const char *buf) +cpdef int16_t int16_unpack(const char *buf) +cpdef int8_t int8_unpack(const char *buf) + +cpdef bytes uint64_pack(uint64_t x) +cpdef bytes uint32_pack(uint32_t x) +cpdef bytes uint16_pack(uint16_t x) +cpdef bytes uint8_pack(uint8_t x) + +cpdef uint64_t uint64_unpack(const char *buf) +cpdef uint32_t uint32_unpack(const char *buf) +cpdef uint16_t uint16_unpack(const char *buf) +cpdef uint8_t uint8_unpack(const char *buf) + +cpdef bytes double_pack(double x) +cpdef bytes float_pack(float x) + +cpdef double double_unpack(const char *buf) +cpdef float float_unpack(const char *buf) + diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx new file mode 100644 index 00000000..48036861 --- /dev/null +++ b/cassandra/marshal.pyx @@ -0,0 +1,201 @@ +# cython: profile=True +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six +import sys +import struct +import math + +from libc.stdint cimport (int8_t, int16_t, int32_t, int64_t, + uint8_t, uint16_t, uint32_t, uint64_t) + +assert sys.byteorder in ('little', 'big') +is_little_endian = sys.byteorder == 'little' + +# cdef extern from "marshal.h": +# cdef str c_string_to_python(char *p, Py_ssize_t len) + +def _make_packer(format_string): + packer = struct.Struct(format_string) + pack = packer.pack + unpack = lambda s: packer.unpack(s)[0] + return pack, unpack + + +cdef inline bytes pack(char *buf, Py_ssize_t size): + """ + Pack a buffer, given as a char *, into Python bytes in byte order. + """ + if is_little_endian: + swap_order(buf, size) + return buf[:size] + + +cdef inline swap_order(char *buf, Py_ssize_t size): + """ + Swap the byteorder of `buf` in-place (reverse all the bytes). + There are functions ntohl etc, but these may be POSIX-dependent. + """ + cdef Py_ssize_t start, end + cdef char c + for i in range(size/2): + end = size - i - 1 + c = buf[i] + buf[i] = buf[end] + buf[end] = c + +### Packing and unpacking of signed integers + +cpdef inline bytes int64_pack(int64_t x): + return pack( &x, 8) + +cpdef inline int64_t int64_unpack(const char *buf): + # The 'const' makes sure the buffer is not mutated in-place! + cdef int64_t x = ( buf)[0] + swap_order( &x, 8) + return x + +cpdef inline bytes int32_pack(int32_t x): + return pack( &x, 4) + +cpdef inline int32_t int32_unpack(const char *buf): + cdef int32_t x = ( buf)[0] + swap_order( &x, 4) + return x + +cpdef inline bytes int16_pack(int16_t x): + return pack( &x, 2) + +cpdef inline int16_t int16_unpack(const char *buf): + cdef int16_t x = ( buf)[0] + swap_order( &x, 2) + return x + +cpdef inline bytes int8_pack(int8_t x): + return ( &x)[:1] + +cpdef inline int8_t int8_unpack(const char *buf): + return ( buf)[0] + +cpdef inline bytes uint64_pack(uint64_t x): + return pack( &x, 8) + +cpdef inline uint64_t uint64_unpack(const char *buf): + cdef uint64_t x = ( buf)[0] + swap_order( &x, 8) + return x + +cpdef inline bytes uint32_pack(uint32_t x): + return pack( &x, 4) + +cpdef inline uint32_t uint32_unpack(const char *buf): + cdef uint32_t x = ( buf)[0] + swap_order( &x, 4) + return x + +cpdef inline bytes uint16_pack(uint16_t x): + return pack( &x, 2) + +cpdef inline uint16_t uint16_unpack(const char *buf): + cdef uint16_t x = ( buf)[0] + swap_order( &x, 2) + return x + +cpdef inline bytes uint8_pack(uint8_t x): + return pack( &x, 1) + +cpdef inline uint8_t uint8_unpack(const char *buf): + return ( buf)[0] + +cpdef inline bytes double_pack(double x): + return pack( &x, 8) + +cpdef inline double double_unpack(const char *buf): + cdef double x = ( buf)[0] + swap_order( &x, 8) + return x + +cpdef inline bytes float_pack(float x): + return pack( &x, 4) + +cpdef inline float float_unpack(const char *buf): + cdef float x = ( buf)[0] + swap_order( &x, 4) + return x + +# int64_pack, int64_unpack = _make_packer('>q') +# int32_pack, int32_unpack = _make_packer('>i') +# int16_pack, int16_unpack = _make_packer('>h') +# int8_pack, int8_unpack = _make_packer('>b') +# uint64_pack, uint64_unpack = 
_make_packer('>Q') +# uint32_pack, uint32_unpack = _make_packer('>I') +# uint16_pack, uint16_unpack = _make_packer('>H') +# uint8_pack, uint8_unpack = _make_packer('>B') +# float_pack, float_unpack = _make_packer('>f') +# double_pack, double_unpack = _make_packer('>d') + +# Special case for cassandra header +header_struct = struct.Struct('>BBbB') +header_pack = header_struct.pack +header_unpack = header_struct.unpack + +# in protocol version 3 and higher, the stream ID is two bytes +v3_header_struct = struct.Struct('>BBhB') +v3_header_pack = v3_header_struct.pack +v3_header_unpack = v3_header_struct.unpack + + +if six.PY3: + def varint_unpack(term): + val = int(''.join("%02x" % i for i in term), 16) + if (term[0] & 128) != 0: + # There is a bug in Cython (0.20 - 0.22), where if we do + # '1 << (len(term) * 8)' Cython generates '1' directly into the + # C code, causing integer overflows. Treat it as an object for now + val -= ( 1L) << (len(term) * 8) + return val +else: + def varint_unpack(term): # noqa + val = int(term.encode('hex'), 16) + if (ord(term[0]) & 128) != 0: + val = val - (1 << (len(term) * 8)) + return val + + +def bitlength(n): + # return int(math.log2(n)) + 1 + bitlen = 0 + while n > 0: + n >>= 1 + bitlen += 1 + return bitlen + + +def varint_pack(big): + pos = True + if big == 0: + return b'\x00' + if big < 0: + bytelength = bitlength(abs(big) - 1) // 8 + 1 + big = (1 << bytelength * 8) + big + pos = False + revbytes = bytearray() + while big > 0: + revbytes.append(big & 0xff) + big >>= 8 + if pos and revbytes[-1] & 0x80: + revbytes.append(0) + revbytes.reverse() + return six.binary_type(revbytes) From ad7e4e08481b8cd48c5724256c955604713101eb Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 23 Jul 2015 15:47:04 +0100 Subject: [PATCH 03/70] Start on Cython version of ProtocolHandler --- cassandra/bytesio.pxd | 7 ++ cassandra/bytesio.pyx | 56 ++++++++++ cassandra/cython_protocol_handler.pyx | 154 ++++++++++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 cassandra/bytesio.pxd create mode 100644 cassandra/bytesio.pyx create mode 100644 cassandra/cython_protocol_handler.pyx diff --git a/cassandra/bytesio.pxd b/cassandra/bytesio.pxd new file mode 100644 index 00000000..349fd600 --- /dev/null +++ b/cassandra/bytesio.pxd @@ -0,0 +1,7 @@ +cdef class BytesIOReader: + cdef bytes buf + cdef char *buf_ptr + cdef Py_ssize_t pos + cdef Py_ssize_t size + cdef char *read(self, Py_ssize_t n = ?) + diff --git a/cassandra/bytesio.pyx b/cassandra/bytesio.pyx new file mode 100644 index 00000000..505fe391 --- /dev/null +++ b/cassandra/bytesio.pyx @@ -0,0 +1,56 @@ +# ython profile=True + +cdef class BytesIOReader: + """ + This class provides efficient support for reading bytes from a 'bytes' buffer, + by returning char * values directly without allocating intermediate objects. + """ + + def __init__(self, bytes buf): + self.buf = buf + self.size = len(buf) + self.buf_ptr = self.buf + + cdef char *read(self, Py_ssize_t n = -1): + """Read at most size bytes from the file + (less if the read hits EOF before obtaining size bytes). + + If the size argument is negative or omitted, read all data until EOF + is reached. The bytes are returned as a string object. An empty + string is returned when EOF is encountered immediately. 
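+
+        Note: unlike the file-like read() this docstring describes, the cdef
+        version returns a raw 'char *' pointing into the internal buffer
+        rather than a new bytes object; the bytes-returning behaviour
+        described above is what PyBytesIOReader.read below provides.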
+ """ + cdef Py_ssize_t newpos = self.pos + n + cdef char *res + + if n < 0: + newpos = self.size + elif newpos > self.size: + self.pos = self.size + return b'' + else: + res = self.buf_ptr + self.pos + self.pos = newpos + return res + + +class PyBytesIOReader(BytesIOReader): + """ + Python-compatible BytesIOReader class + """ + + def read(self, n = -1): + """Read at most size bytes from the file + (less if the read hits EOF before obtaining size bytes). + + If the size argument is negative or omitted, read all data until EOF + is reached. The bytes are returned as a string object. An empty + string is returned when EOF is encountered immediately. + """ + if n is None or n < 0: + newpos = self.len + else: + newpos = min(self.pos+n, self.len) + r = self.buf[self.pos:newpos] + self.pos = newpos + return r + diff --git a/cassandra/cython_protocol_handler.pyx b/cassandra/cython_protocol_handler.pyx new file mode 100644 index 00000000..85a5945a --- /dev/null +++ b/cassandra/cython_protocol_handler.pyx @@ -0,0 +1,154 @@ +# ython: profile=True + +from libc.stdint cimport int64_t, int32_t + +# from cassandra.marshal cimport (int8_pack, int8_unpack, int16_pack, int16_unpack, +# uint16_pack, uint16_unpack, uint32_pack, uint32_unpack, +# int32_pack, int32_unpack, int64_pack, int64_unpack, float_pack, float_unpack, double_pack, double_unpack) + +from cassandra.marshal import varint_pack, varint_unpack +from cassandra import util +from cassandra.cqltypes import EMPTY +from cassandra.protocol import ResultMessage, ProtocolHandler + +from cassandra.bytesio cimport BytesIOReader +from cassandra cimport typecodes + +import numpy as np + +include "marshal.pyx" + +class FastResultMessage(ResultMessage): + """ + Cython version of Result Message that has a faster implementation of + recv_results_row. + """ + # type_codes = ResultMessage.type_codes.copy() + code_to_type = dict((v, k) for k, v in ResultMessage.type_codes.items()) + + @classmethod + def recv_results_rows(cls, f, protocol_version, user_type_map): + paging_state, column_metadata = cls.recv_results_metadata(f, user_type_map) + + colnames = [c[2] for c in column_metadata] + coltypes = [c[3] for c in column_metadata] + colcodes = np.array( + [cls.code_to_type.get(coltype, -1) for coltype in coltypes], + dtype=np.dtype('i')) + parsed_rows = parse_rows(BytesIOReader(f.read()), colnames, + coltypes, colcodes, protocol_version) + return (paging_state, (colnames, parsed_rows)) + + +cdef parse_rows(BytesIOReader reader, list colnames, list coltypes, + int[::1] colcodes, protocol_version): + cdef Py_ssize_t i, rowcount + cdef char *raw_val + cdef int32_t raw_val_size + rowcount = read_int(reader) + # return RowIterator(reader, coltypes, colcodes, protocol_version, rowcount) + return [parse_row(reader, coltypes, colcodes, protocol_version) + for i in range(rowcount)] + + +cdef class RowIterator: + """ + Result iterator for a set of rows + + There seems to be an issue with generator expressions + memoryviews, so we + have a special iterator class instead. 
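+
+    Illustrative usage, mirroring the commented-out call in parse_rows above:
+
+        rows = RowIterator(reader, coltypes, colcodes, protocol_version,
+                           rowcount)
+        for row in rows:
+            ...  # each row is a list of deserialized column values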
+ """ + cdef list coltypes + cdef int[::1] colcodes + cdef Py_ssize_t rowcount, pos + cdef BytesIOReader reader + cdef object protocol_version + + def __init__(self, reader, coltypes, colcodes, protocol_version, rowcount): + self.reader = reader + self.coltypes = coltypes + self.colcodes = colcodes + self.protocol_version = protocol_version + self.rowcount = rowcount + self.pos = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.pos >= self.rowcount: + raise StopIteration + self.pos += 1 + return parse_row(self.reader, self.coltypes, self.colcodes, self.protocol_version) + + next = __next__ + + +cdef inline parse_row(BytesIOReader reader, list coltypes, int[::1] colcodes, + protocol_version): + cdef Py_ssize_t j + + row = [] + for j, ctype in enumerate(coltypes): + raw_val_size = read_int(reader) + if raw_val_size < 0: + val = None + else: + raw_val = reader.read(raw_val_size) + val = from_binary(ctype, colcodes[j], raw_val, + raw_val_size, protocol_version) + row.append(val) + + return row + + +class CythonProtocolHandler(ProtocolHandler): + """ + Use FastResultMessage to decode query result message messages. + """ + my_opcodes = ProtocolHandler.message_types_by_opcode.copy() + my_opcodes[FastResultMessage.opcode] = FastResultMessage + message_types_by_opcode = my_opcodes + + +cdef inline int32_t read_int(BytesIOReader reader): + return int32_unpack(reader.read(4)) + + +cdef inline from_binary(cqltype, int typecode, char *byts, int32_t size, protocol_version): + """ + Deserialize a bytestring into a value. See the deserialize() method + for more information. This method differs in that if None or the empty + string is passed in, None may be returned. + + This method provides a fast-path deserialization routine. + """ + if size == 0 and cqltype.empty_binary_ok: + return empty(cqltype) + return deserialize(cqltype, typecode, byts, size, protocol_version) + + +cdef empty(cqltype): + return EMPTY if cqltype.support_empty_values else None + + +def to_binary(cqltype, val, protocol_version): + """ + Serialize a value into a bytestring. See the serialize() method for + more information. This method differs in that if None is passed in, + the result is the empty string. 
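+
+    For example, to_binary(Int32Type, None, 3) returns b'', while a non-None
+    value is passed straight through to cqltype.serialize(val, protocol_version).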
+ """ + return b'' if val is None else cqltype.serialize(val, protocol_version) + + +cdef deserialize(cqltype, int typecode, char *byts, int32_t size, protocol_version): + if typecode == typecodes.LongType: + return int64_unpack(byts) + else: + return deserialize_generic(cqltype, typecode, byts, size, protocol_version) + +cdef deserialize_generic(cqltype, int typecode, char *byts, int32_t size, + protocol_version): + print("deserialize", cqltype) + return cqltype.deserialize(byts[:size], protocol_version) + From 39af4e15698081348dc97acdcaa4ffcd284f6ae2 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 23 Jul 2015 16:03:23 +0100 Subject: [PATCH 04/70] Add Cython modules to setup.py --- cassandra/bytesio.pyx | 2 +- cassandra/cython_protocol_handler.pyx | 2 +- cassandra/marshal.pyx | 2 +- setup.py | 5 +++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cassandra/bytesio.pyx b/cassandra/bytesio.pyx index 505fe391..82887f43 100644 --- a/cassandra/bytesio.pyx +++ b/cassandra/bytesio.pyx @@ -1,4 +1,4 @@ -# ython profile=True +# -- cython profile=True cdef class BytesIOReader: """ diff --git a/cassandra/cython_protocol_handler.pyx b/cassandra/cython_protocol_handler.pyx index 85a5945a..add1e9f5 100644 --- a/cassandra/cython_protocol_handler.pyx +++ b/cassandra/cython_protocol_handler.pyx @@ -1,4 +1,4 @@ -# ython: profile=True +# -- cython: profile=True from libc.stdint cimport int64_t, int32_t diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx index 48036861..0efbf705 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/marshal.pyx @@ -1,4 +1,4 @@ -# cython: profile=True +# -- cython: profile=True # Copyright 2013-2015 DataStax, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/setup.py b/setup.py index 37899c2e..7083d7aa 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ from distutils.errors import (CCompilerError, DistutilsPlatformError, DistutilsExecError) from distutils.cmd import Command - try: import subprocess has_subprocess = True @@ -262,11 +261,13 @@ if "--no-libev" not in sys.argv and not is_windows: if "--no-cython" not in sys.argv: try: from Cython.Build import cythonize - cython_candidates = ['cluster', 'concurrent', 'connection', 'cqltypes', 'marshal', 'metadata', 'pool', 'protocol', 'query', 'util'] + cython_candidates = ['cluster', 'concurrent', 'connection', 'cqltypes', 'metadata', 'pool', 'protocol', 'query', 'util'] compile_args = [] if is_windows else ['-Wno-unused-function'] extensions.extend(cythonize( [Extension('cassandra.%s' % m, ['cassandra/%s.py' % m], extra_compile_args=compile_args) for m in cython_candidates], exclude_failures=True)) + + extensions.extend(cythonize("cassandra/*.pyx")) except ImportError: sys.stderr.write("Cython is not installed. Not compiling core driver files as extensions (optional).") From 92457198cca1a3832455fbefcc81e3fed351b33d Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 23 Jul 2015 16:18:19 +0100 Subject: [PATCH 05/70] Use return type void for swap_order --- cassandra/marshal.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx index 0efbf705..8ffe3e46 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/marshal.pyx @@ -43,7 +43,7 @@ cdef inline bytes pack(char *buf, Py_ssize_t size): return buf[:size] -cdef inline swap_order(char *buf, Py_ssize_t size): +cdef inline void swap_order(char *buf, Py_ssize_t size): """ Swap the byteorder of `buf` in-place (reverse all the bytes). 
There are functions ntohl etc, but these may be POSIX-dependent. From fe67aec185f63576e093d82a0491789a64301467 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 23 Jul 2015 16:27:17 +0100 Subject: [PATCH 06/70] Make sure swap_order uses no PyObjects --- cassandra/marshal.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx index 8ffe3e46..529f45e7 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/marshal.pyx @@ -48,9 +48,9 @@ cdef inline void swap_order(char *buf, Py_ssize_t size): Swap the byteorder of `buf` in-place (reverse all the bytes). There are functions ntohl etc, but these may be POSIX-dependent. """ - cdef Py_ssize_t start, end + cdef Py_ssize_t start, end, i cdef char c - for i in range(size/2): + for i in range(size//2): end = size - i - 1 c = buf[i] buf[i] = buf[end] From 2b7997830a3073a2e73942d36f2e3b22f7443a6c Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 23 Jul 2015 16:31:44 +0100 Subject: [PATCH 07/70] Check endianness before byte-swapping --- cassandra/marshal.pyx | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx index 529f45e7..2ecb0fa5 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/marshal.pyx @@ -22,7 +22,7 @@ from libc.stdint cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t) assert sys.byteorder in ('little', 'big') -is_little_endian = sys.byteorder == 'little' +cdef bint is_little_endian = sys.byteorder == 'little' # cdef extern from "marshal.h": # cdef str c_string_to_python(char *p, Py_ssize_t len) @@ -38,23 +38,25 @@ cdef inline bytes pack(char *buf, Py_ssize_t size): """ Pack a buffer, given as a char *, into Python bytes in byte order. """ - if is_little_endian: - swap_order(buf, size) + swap_order(buf, size) return buf[:size] cdef inline void swap_order(char *buf, Py_ssize_t size): """ - Swap the byteorder of `buf` in-place (reverse all the bytes). + Swap the byteorder of `buf` in-place on little-endian platforms + (reverse all the bytes). There are functions ntohl etc, but these may be POSIX-dependent. """ cdef Py_ssize_t start, end, i cdef char c - for i in range(size//2): - end = size - i - 1 - c = buf[i] - buf[i] = buf[end] - buf[end] = c + + if is_little_endian: + for i in range(size//2): + end = size - i - 1 + c = buf[i] + buf[i] = buf[end] + buf[end] = c ### Packing and unpacking of signed integers From 81ff98efc2cdc6cb256789806ff5bbbdd3c46e90 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Wed, 29 Jul 2015 10:38:15 +0100 Subject: [PATCH 08/70] Start on pluggable row parsers (e.g. 
tuple vs numpy record) --- cassandra/bytesio.pyx | 2 +- cassandra/cython_protocol_handler.pyx | 240 +++++++++++++++----------- cassandra/datatypes.pxd | 16 ++ cassandra/datatypes.pyx | 52 ++++++ cassandra/ioutils.pyx | 21 +++ cassandra/marshal.pyx | 8 + cassandra/rowparser.pxd | 5 + cassandra/rowparser.pyx | 70 ++++++++ setup.py | 14 +- 9 files changed, 319 insertions(+), 109 deletions(-) create mode 100644 cassandra/datatypes.pxd create mode 100644 cassandra/datatypes.pyx create mode 100644 cassandra/ioutils.pyx create mode 100644 cassandra/rowparser.pxd create mode 100644 cassandra/rowparser.pyx diff --git a/cassandra/bytesio.pyx b/cassandra/bytesio.pyx index 82887f43..d392b23f 100644 --- a/cassandra/bytesio.pyx +++ b/cassandra/bytesio.pyx @@ -1,4 +1,4 @@ -# -- cython profile=True +# -- cython: profile=True cdef class BytesIOReader: """ diff --git a/cassandra/cython_protocol_handler.pyx b/cassandra/cython_protocol_handler.pyx index add1e9f5..644985dc 100644 --- a/cassandra/cython_protocol_handler.pyx +++ b/cassandra/cython_protocol_handler.pyx @@ -8,16 +8,22 @@ from libc.stdint cimport int64_t, int32_t from cassandra.marshal import varint_pack, varint_unpack from cassandra import util -from cassandra.cqltypes import EMPTY +from cassandra.cqltypes import EMPTY, LongType from cassandra.protocol import ResultMessage, ProtocolHandler from cassandra.bytesio cimport BytesIOReader from cassandra cimport typecodes +from cassandra.datatypes cimport DataType +from cassandra.rowparser cimport RowParser -import numpy as np +from cassandra.rowparser import TupleRowParser +from cassandra.datatypes import Int64, GenericDataType + +from cython.view cimport array as cython_array include "marshal.pyx" + class FastResultMessage(ResultMessage): """ Cython version of Result Message that has a faster implementation of @@ -32,74 +38,24 @@ class FastResultMessage(ResultMessage): colnames = [c[2] for c in column_metadata] coltypes = [c[3] for c in column_metadata] - colcodes = np.array( - [cls.code_to_type.get(coltype, -1) for coltype in coltypes], - dtype=np.dtype('i')) - parsed_rows = parse_rows(BytesIOReader(f.read()), colnames, - coltypes, colcodes, protocol_version) + + cdef DataType[::1] datatypes + datatypes = obj_array( + [Int64() if coltype == LongType else GenericDataType(coltype) for coltype in coltypes]) + # [GenericDataType(coltype) for coltype in coltypes]) + + # parsed_rows = parse_rows2(BytesIOReader(f.read()), colnames, coltypes, protocol_version) + parsed_rows = parse_rows(BytesIOReader(f.read()), datatypes, protocol_version) return (paging_state, (colnames, parsed_rows)) -cdef parse_rows(BytesIOReader reader, list colnames, list coltypes, - int[::1] colcodes, protocol_version): - cdef Py_ssize_t i, rowcount - cdef char *raw_val - cdef int32_t raw_val_size - rowcount = read_int(reader) - # return RowIterator(reader, coltypes, colcodes, protocol_version, rowcount) - return [parse_row(reader, coltypes, colcodes, protocol_version) - for i in range(rowcount)] - - -cdef class RowIterator: - """ - Result iterator for a set of rows - - There seems to be an issue with generator expressions + memoryviews, so we - have a special iterator class instead. 
- """ - cdef list coltypes - cdef int[::1] colcodes - cdef Py_ssize_t rowcount, pos - cdef BytesIOReader reader - cdef object protocol_version - - def __init__(self, reader, coltypes, colcodes, protocol_version, rowcount): - self.reader = reader - self.coltypes = coltypes - self.colcodes = colcodes - self.protocol_version = protocol_version - self.rowcount = rowcount - self.pos = 0 - - def __iter__(self): - return self - - def __next__(self): - if self.pos >= self.rowcount: - raise StopIteration - self.pos += 1 - return parse_row(self.reader, self.coltypes, self.colcodes, self.protocol_version) - - next = __next__ - - -cdef inline parse_row(BytesIOReader reader, list coltypes, int[::1] colcodes, - protocol_version): - cdef Py_ssize_t j - - row = [] - for j, ctype in enumerate(coltypes): - raw_val_size = read_int(reader) - if raw_val_size < 0: - val = None - else: - raw_val = reader.read(raw_val_size) - val = from_binary(ctype, colcodes[j], raw_val, - raw_val_size, protocol_version) - row.append(val) - - return row +def obj_array(list objs): + cdef object[:] arr + arr = cython_array(shape=(len(objs),), itemsize=sizeof(void *), format="O") + # arr[:] = objs # This does not work (segmentation faults) + for i, obj in enumerate(objs): + arr[i] = obj + return arr class CythonProtocolHandler(ProtocolHandler): @@ -111,44 +67,120 @@ class CythonProtocolHandler(ProtocolHandler): message_types_by_opcode = my_opcodes +cdef parse_rows(BytesIOReader reader, DataType[::1] datatypes, protocol_version): + cdef Py_ssize_t i, rowcount + cdef RowParser parser = TupleRowParser(len(datatypes), datatypes) + rowcount = read_int(reader) + return [parser.unpack_row(reader, protocol_version) for i in range(rowcount)] + + cdef inline int32_t read_int(BytesIOReader reader): return int32_unpack(reader.read(4)) -cdef inline from_binary(cqltype, int typecode, char *byts, int32_t size, protocol_version): - """ - Deserialize a bytestring into a value. See the deserialize() method - for more information. This method differs in that if None or the empty - string is passed in, None may be returned. - - This method provides a fast-path deserialization routine. - """ - if size == 0 and cqltype.empty_binary_ok: - return empty(cqltype) - return deserialize(cqltype, typecode, byts, size, protocol_version) - - -cdef empty(cqltype): - return EMPTY if cqltype.support_empty_values else None - - -def to_binary(cqltype, val, protocol_version): - """ - Serialize a value into a bytestring. See the serialize() method for - more information. This method differs in that if None is passed in, - the result is the empty string. 
- """ - return b'' if val is None else cqltype.serialize(val, protocol_version) - - -cdef deserialize(cqltype, int typecode, char *byts, int32_t size, protocol_version): - if typecode == typecodes.LongType: - return int64_unpack(byts) - else: - return deserialize_generic(cqltype, typecode, byts, size, protocol_version) - -cdef deserialize_generic(cqltype, int typecode, char *byts, int32_t size, - protocol_version): - print("deserialize", cqltype) - return cqltype.deserialize(byts[:size], protocol_version) - +# cdef parse_rows2(BytesIOReader reader, list colnames, list coltypes, protocol_version): +# cdef Py_ssize_t i, rowcount +# cdef char *raw_val +# cdef int[::1] colcodes +# +# colcodes = np.array( +# [FastResultMessage.code_to_type.get(coltype, -1) for coltype in coltypes], +# dtype=np.dtype('i')) +# +# rowcount = read_int(reader) +# # return RowIterator(reader, coltypes, colcodes, protocol_version, rowcount) +# return [parse_row(reader, coltypes, colcodes, protocol_version) +# for i in range(rowcount)] +# +# +# cdef class RowIterator: +# """ +# Result iterator for a set of rows +# +# There seems to be an issue with generator expressions + memoryviews, so we +# have a special iterator class instead. +# """ +# +# cdef list coltypes +# cdef int[::1] colcodes +# cdef Py_ssize_t rowcount, pos +# cdef BytesIOReader reader +# cdef object protocol_version +# +# def __init__(self, reader, coltypes, colcodes, protocol_version, rowcount): +# self.reader = reader +# self.coltypes = coltypes +# self.colcodes = colcodes +# self.protocol_version = protocol_version +# self.rowcount = rowcount +# self.pos = 0 +# +# def __iter__(self): +# return self +# +# def __next__(self): +# if self.pos >= self.rowcount: +# raise StopIteration +# self.pos += 1 +# return parse_row(self.reader, self.coltypes, self.colcodes, self.protocol_version) +# +# next = __next__ +# +# +# cdef inline parse_row(BytesIOReader reader, list coltypes, int[::1] colcodes, +# protocol_version): +# cdef Py_ssize_t j +# +# row = [] +# for j, ctype in enumerate(coltypes): +# raw_val_size = read_int(reader) +# if raw_val_size < 0: +# val = None +# else: +# raw_val = reader.read(raw_val_size) +# val = from_binary(ctype, colcodes[j], raw_val, +# raw_val_size, protocol_version) +# row.append(val) +# +# return row +# +# +# cdef inline from_binary(cqltype, int typecode, char *byts, int32_t size, protocol_version): +# """ +# Deserialize a bytestring into a value. See the deserialize() method +# for more information. This method differs in that if None or the empty +# string is passed in, None may be returned. +# +# This method provides a fast-path deserialization routine. +# """ +# if size == 0 and cqltype.empty_binary_ok: +# return empty(cqltype) +# return deserialize(cqltype, typecode, byts, size, protocol_version) +# +# +# cdef empty(cqltype): +# return EMPTY if cqltype.support_empty_values else None +# +# +# def to_binary(cqltype, val, protocol_version): +# """ +# Serialize a value into a bytestring. See the serialize() method for +# more information. This method differs in that if None is passed in, +# the result is the empty string. 
+# """ +# return b'' if val is None else cqltype.serialize(val, protocol_version) +# +# cdef DataType obj = Int64() +# +# cdef deserialize(cqltype, int typecode, char *byts, int32_t size, protocol_version): +# # if typecode == typecodes.LongType: +# # # return int64_unpack(byts) +# # return obj.deserialize(byts, size, protocol_version) +# # else: +# # return deserialize_generic(cqltype, typecode, byts, size, protocol_version) +# return cqltype.deserialize(byts[:size], protocol_version) +# +# cdef deserialize_generic(cqltype, int typecode, char *byts, int32_t size, +# protocol_version): +# return cqltype.deserialize(byts[:size], protocol_version) +# \ No newline at end of file diff --git a/cassandra/datatypes.pxd b/cassandra/datatypes.pxd new file mode 100644 index 00000000..40f8d742 --- /dev/null +++ b/cassandra/datatypes.pxd @@ -0,0 +1,16 @@ +cdef class LLDataType: + """ + Low-level Cassandra datatype + """ + + cdef Py_ssize_t size + + cdef void deserialize_ptr(self, char *buf, Py_ssize_t size, void *out, protocol_version) + +cdef class DataType: + cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version) + + +cdef class Int64(DataType): + pass + diff --git a/cassandra/datatypes.pyx b/cassandra/datatypes.pyx new file mode 100644 index 00000000..5fc61cfa --- /dev/null +++ b/cassandra/datatypes.pyx @@ -0,0 +1,52 @@ +include 'marshal.pyx' + +from cassandra import cqltypes + + +cdef class LLDataType: + cdef void deserialize_ptr(self, char *buf, Py_ssize_t size, + void *out, protocol_version): + pass + + +cdef class DataType: + cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version): + pass + + +cdef class LLInt64(LLDataType): + """ + Low-level Cassandra datatype + """ + + cdef void deserialize_ptr(self, char *buf, Py_ssize_t size, void *out, protocol_version): + cdef int64_t x = int64_unpack(buf) + ( out)[0] = x + + +cdef class Int64(DataType): + + cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version): + cdef int64_t x = int64_unpack(buf) + return x + + def __str__(self): + return "int64" + + +cdef class GenericDataType(DataType): + """ + Wrap a generic datatype for deserialization + """ + + cdef object cqltype + + def __init__(self, cqltype): + self.cqltype = cqltype + + cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version): + return self.cqltype.deserialize(buf[:size], protocol_version) + + def __str__(self): + return "GenericDataType(%s)" % (self.cqltype,) + diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx new file mode 100644 index 00000000..8749457b --- /dev/null +++ b/cassandra/ioutils.pyx @@ -0,0 +1,21 @@ +include 'marshal.pyx' +from libc.stdint cimport int32_t +from cassandra.bytesio cimport BytesIOReader + + +cdef inline char *get_buf(BytesIOReader reader, Py_ssize_t *size_out): + """ + Get a pointer into the buffer provided by BytesIOReader for the + next data item in the stream of values. + """ + raw_val_size = read_int(reader) + size_out[0] = raw_val_size + if raw_val_size < 0: + return NULL + else: + return reader.read(raw_val_size) + + +cdef inline int32_t read_int(BytesIOReader reader): + return int32_unpack(reader.read(4)) + diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx index 2ecb0fa5..85c3504c 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/marshal.pyx @@ -1,4 +1,5 @@ # -- cython: profile=True +# # Copyright 2013-2015 DataStax, Inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -66,6 +67,10 @@ cpdef inline bytes int64_pack(int64_t x): cpdef inline int64_t int64_unpack(const char *buf): # The 'const' makes sure the buffer is not mutated in-place! cdef int64_t x = ( buf)[0] + cdef char *p = &x + # if is_little_endian: + # p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7] = ( + # p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]) swap_order( &x, 8) return x @@ -74,7 +79,10 @@ cpdef inline bytes int32_pack(int32_t x): cpdef inline int32_t int32_unpack(const char *buf): cdef int32_t x = ( buf)[0] + cdef char *p = &x swap_order( &x, 4) + # if is_little_endian: + # p[0], p[1], p[2], p[3] = p[3], p[2], p[1], p[0] return x cpdef inline bytes int16_pack(int16_t x): diff --git a/cassandra/rowparser.pxd b/cassandra/rowparser.pxd new file mode 100644 index 00000000..7597cca9 --- /dev/null +++ b/cassandra/rowparser.pxd @@ -0,0 +1,5 @@ +from cassandra.bytesio cimport BytesIOReader + +cdef class RowParser: + cpdef unpack_row(self, BytesIOReader reader, protocol_version) + diff --git a/cassandra/rowparser.pyx b/cassandra/rowparser.pyx new file mode 100644 index 00000000..d09bdf94 --- /dev/null +++ b/cassandra/rowparser.pyx @@ -0,0 +1,70 @@ +include "ioutils.pyx" + +from cpython.tuple cimport ( + PyTuple_New, + # Return value: New reference. + # Return a new tuple object of size len, or NULL on failure. + PyTuple_SET_ITEM, + # Like PyTuple_SetItem(), but does no error checking, and should + # only be used to fill in brand new tuples. Note: This function + # ``steals'' a reference to o. + ) + +from cpython.ref cimport ( + Py_INCREF + # void Py_INCREF(object o) + # Increment the reference count for object o. The object must not + # be NULL; if you aren't sure that it isn't NULL, use + # Py_XINCREF(). + ) + +from cassandra.bytesio cimport BytesIOReader +from cassandra.datatypes cimport DataType + + +cdef class RowParser: + cpdef unpack_row(self, BytesIOReader reader, protocol_version): + """ + Unpack a single row of data in a ResultMessage. 
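+
+        Concrete subclasses (such as TupleRowParser below) read one
+        length-prefixed value per column from `reader` and return the
+        decoded row.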
+ """ + raise NotImplementedError + + +cdef class TupleRowParser(RowParser): + """ + Parse a single returned row into a tuple of objects: + + (obj1, ..., objN) + + Attributes + =========== + datatypes: + this is a memoryview of N DataType objects that can deserialize bytes + into objects + """ + + cdef DataType[::1] datatypes + cdef Py_ssize_t size + + def __init__(self, Py_ssize_t n, DataType[::1] datatypes): + self.datatypes = datatypes + self.size = n + + cpdef unpack_row(self, BytesIOReader reader, protocol_version): + cdef char *buf + cdef Py_ssize_t i, bufsize, rowsize = self.size + cdef DataType dt + cdef tuple res = PyTuple_New(self.size) + + for i in range(rowsize): + buf = get_buf(reader, &bufsize) + if buf == NULL: + val = None + else: + dt = self.datatypes[i] + val = dt.deserialize(buf, bufsize, protocol_version) + + Py_INCREF(val) + PyTuple_SET_ITEM(res, i, val) + + return res diff --git a/setup.py b/setup.py index 7083d7aa..7fe2631a 100644 --- a/setup.py +++ b/setup.py @@ -70,6 +70,7 @@ if __name__ == '__main__' and sys.argv[1] == "install": except ImportError: pass +PROFILING = False class DocCommand(Command): @@ -261,13 +262,18 @@ if "--no-libev" not in sys.argv and not is_windows: if "--no-cython" not in sys.argv: try: from Cython.Build import cythonize - cython_candidates = ['cluster', 'concurrent', 'connection', 'cqltypes', 'metadata', 'pool', 'protocol', 'query', 'util'] + # cython_candidates = ['cluster', 'concurrent', 'connection', 'cqltypes', 'metadata', 'pool', 'protocol', 'query', 'util'] + cython_candidates = [] compile_args = [] if is_windows else ['-Wno-unused-function'] + directives = {'profile': PROFILING} # this seems to have no effect... extensions.extend(cythonize( - [Extension('cassandra.%s' % m, ['cassandra/%s.py' % m], extra_compile_args=compile_args) for m in cython_candidates], + [Extension('cassandra.%s' % m, ['cassandra/%s.py' % m], + extra_compile_args=compile_args, + compiler_directives=directives) + for m in cython_candidates], exclude_failures=True)) - - extensions.extend(cythonize("cassandra/*.pyx")) + extensions.extend(cythonize("cassandra/*.pyx", + compiler_directives=directives)) except ImportError: sys.stderr.write("Cython is not installed. 
Not compiling core driver files as extensions (optional).") From 1a6534b575b3b8f19a81f3f88a9583807b5cacc3 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 30 Jul 2015 17:02:27 +0100 Subject: [PATCH 09/70] (Optional) NumPy row parser --- cassandra/datatypes.pxd | 17 ++--- cassandra/datatypes.pyx | 19 ----- cassandra/marshal.pyx | 4 +- cassandra/numpyparser.pyx | 144 ++++++++++++++++++++++++++++++++++++++ cassandra/util.py | 3 + 5 files changed, 158 insertions(+), 29 deletions(-) create mode 100644 cassandra/numpyparser.pyx diff --git a/cassandra/datatypes.pxd b/cassandra/datatypes.pxd index 40f8d742..d4db2b02 100644 --- a/cassandra/datatypes.pxd +++ b/cassandra/datatypes.pxd @@ -1,11 +1,12 @@ -cdef class LLDataType: - """ - Low-level Cassandra datatype - """ - - cdef Py_ssize_t size - - cdef void deserialize_ptr(self, char *buf, Py_ssize_t size, void *out, protocol_version) +# cdef class LLDataType: +# """ +# Low-level Cassandra datatype +# """ +# +# cdef Py_ssize_t size +# +# cdef void deserialize_ptr(self, char *buf, Py_ssize_t size, +# Py_ssize_t index, void *out, protocol_version) cdef class DataType: cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version) diff --git a/cassandra/datatypes.pyx b/cassandra/datatypes.pyx index 5fc61cfa..a1c50fcb 100644 --- a/cassandra/datatypes.pyx +++ b/cassandra/datatypes.pyx @@ -1,29 +1,10 @@ include 'marshal.pyx' -from cassandra import cqltypes - - -cdef class LLDataType: - cdef void deserialize_ptr(self, char *buf, Py_ssize_t size, - void *out, protocol_version): - pass - - cdef class DataType: cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version): pass -cdef class LLInt64(LLDataType): - """ - Low-level Cassandra datatype - """ - - cdef void deserialize_ptr(self, char *buf, Py_ssize_t size, void *out, protocol_version): - cdef int64_t x = int64_unpack(buf) - ( out)[0] = x - - cdef class Int64(DataType): cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version): diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx index 85c3504c..92fb1293 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/marshal.pyx @@ -22,8 +22,8 @@ import math from libc.stdint cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t) -assert sys.byteorder in ('little', 'big') -cdef bint is_little_endian = sys.byteorder == 'little' +cdef bint is_little_endian +from cassandra.util import is_little_endian # cdef extern from "marshal.h": # cdef str c_string_to_python(char *p, Py_ssize_t len) diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx new file mode 100644 index 00000000..73eeea95 --- /dev/null +++ b/cassandra/numpyparser.pyx @@ -0,0 +1,144 @@ +""" +This module provider an optional protocol parser that returns +NumPy arrays. + +============================================================================= +This module should not be imported by any of the main python-driver modules, +as numpy is an optional dependency. 
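+Code that wants the NumPy-based row parser must import this module
+explicitly and opt in.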
+============================================================================= +""" + +include "ioutils.pyx" + +from libc.stdint cimport uint64_t + +from cassandra.rowparser cimport RowParser +from cassandra.bytesio cimport BytesIOReader +from cassandra.datatypes cimport DataType +from cassandra import cqltypes + +import numpy as np +cimport numpy as np + +from cassandra.util import is_little_endian + +from cpython.ref cimport Py_INCREF, PyObject + +cdef extern from "Python.h": + # An integer type large enough to hold a pointer + ctypedef uint64_t Py_uintptr_t + +# ctypedef struct TypeRepr: +# Py_ssize_t size +# int is_object + +ctypedef struct ArrRepr: + # TypeRepr typ + Py_uintptr_t buf_ptr + Py_ssize_t stride + int is_object + +_cqltype_to_numpy = { + cqltypes.LongType: np.dtype('>i8'), + cqltypes.CounterColumnType: np.dtype('>i8'), + cqltypes.Int32Type: np.dtype('>i4'), + cqltypes.ShortType: np.dtype('>i2'), + cqltypes.FloatType: np.dtype('>f4'), + cqltypes.DoubleType: np.dtype('>f8'), +} + + +# cdef type_repr(coltype): +# """ +# Get a low-level type representation for the cqltype +# """ +# cdef TypeRepr res +# if coltype in _cqltype_to_numpy: +# dtype = _cqltype_to_numpy[coltype] +# res.size = dtype.itemsize +# res.is_object = False +# else: +# res.size = sizeof(PyObject *) +# res.is_object = True +# return res + + +cdef ArrRepr array_repr(np.ndarray arr, coltype): + """ + Construct a low-level array representation + """ + assert arr.ndim == 1, "Expected a one-dimensional array" + + cdef ArrRepr res + # Get the data pointer to the underlying memory of the numpy array + res.buf_ptr = arr.ctypes.data + res.stride = arr.strides[0] + res.is_object = coltype in _cqltype_to_numpy + return res + + +cdef class NativeRowParser(RowParser): + """ + This is a row parser that copies bytes into arrays (e.g. NumPy arrays) + for types it recognizes, such as int64. Values of other types are + converted to objects. + + NOTE: This class is stateful, in that every time unpack_row is called it + advanced the pointer into the array by updates the buf_ptr field + of self.arrays + """ + + # ArrRepr contains a 'buf_ptr' field, which is not supported as a memoryview dtype + cdef ArrRepr[::1] arrays + cdef DataType[::1] datatypes + cdef Py_ssize_t size + + def __init__(self, ArrRepr[::1] arrays, DataType[::1] datatypes): + self.arrays = arrays + self.datatypes = datatypes + self.size = len(datatypes) + + cpdef unpack_row(self, BytesIOReader reader, protocol_version): + cdef char *buf + cdef Py_ssize_t i, bufsize, rowsize = self.size + cdef ArrRepr arr + + for i in range(rowsize): + buf = get_buf(reader, &bufsize) + if buf == NULL: + raise ValueError("Unexpected end of stream") + + arr = self.arrays[i] + + if arr.is_object: + dt = self.datatypes[i] + val = dt.deserialize(buf, bufsize, protocol_version) + Py_INCREF(val) + ( arr.buf_ptr)[0] = val + else: + memcopy(buf, arr.buf_ptr, bufsize) + + # Update the pointer into the array for the next time + self.arrays[i].buf_ptr += arr.stride + + +cdef inline memcopy(char *src, char *dst, Py_ssize_t size): + """ + Our own simple memcopy which can be inlined. This is useful because our data types + are only a few bytes. + """ + cdef Py_ssize_t i + for i in range(size): + dst[i] = src[i] + + +def make_native_byteorder(arr): + """ + Make sure all values have a native endian in the NumPy arrays. + """ + if is_little_endian: + # We have arrays in big-endian order. First swap the bytes + # into little endian order, and then update the numpy dtype + # accordingly (e.g. 
from '>i8' to ' Date: Thu, 30 Jul 2015 17:51:49 +0100 Subject: [PATCH 10/70] Some more work on NumPy row parsing and array allocation --- cassandra/numpyparser.pyx | 107 ++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 39 deletions(-) diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 73eeea95..763c3ab2 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -11,33 +11,46 @@ as numpy is an optional dependency. include "ioutils.pyx" from libc.stdint cimport uint64_t +from cpython.ref cimport Py_INCREF, PyObject from cassandra.rowparser cimport RowParser from cassandra.bytesio cimport BytesIOReader from cassandra.datatypes cimport DataType from cassandra import cqltypes - -import numpy as np -cimport numpy as np - from cassandra.util import is_little_endian -from cpython.ref cimport Py_INCREF, PyObject +import numpy as np + + +cdef extern from "numpyFlags.h": + + pass cdef extern from "Python.h": # An integer type large enough to hold a pointer ctypedef uint64_t Py_uintptr_t -# ctypedef struct TypeRepr: -# Py_ssize_t size -# int is_object +cdef extern from "numpy/arrayobject.h": + # Avoid using 'numpy' from Cython, as it access the 'data' attribute + # of PyArrayObject, which is deprecated: + # + # warning: #warning "Using deprecated NumPy API, disable it by + # #defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp] + # + ctypedef class np.ndarray [object PyArrayObject]: + pass -ctypedef struct ArrRepr: - # TypeRepr typ + +# Simple array descriptor, useful to parse rows into a NumPy array +ctypedef struct ArrDesc: Py_uintptr_t buf_ptr Py_ssize_t stride int is_object +cdef ArrDesc[:] _dummyArray = NULL +arrDescDtype = np.array(_dummyArray).dtype + + _cqltype_to_numpy = { cqltypes.LongType: np.dtype('>i8'), cqltypes.CounterColumnType: np.dtype('>i8'), @@ -47,36 +60,53 @@ _cqltype_to_numpy = { cqltypes.DoubleType: np.dtype('>f8'), } +obj_dtype = np.dtype('O') -# cdef type_repr(coltype): +def make_array(coltype, array_size): + """ + Allocate a new NumPy array of the given column type and size. + """ + dtype = _cqltype_to_numpy.get(coltype, obj_dtype) + return np.empty((array_size,), dtype=dtype) + + +def make_arrays(colnames, coltypes, array_size): + """ + Allocate arrays for each result column. + + returns a tuple of (array_descs, arrays), where + 'array_descs' describe the arrays for NativeRowParser and + 'arrays' is a dict mapping column names to arrays + (e.g. 
this can be fed into pandas.DataFrame) + """ + row_size = len(colnames) + array_descs = np.empty((row_size,), arrDescDtype) + arrays = {} + + for i, colname, coltype in zip(range(row_size), colnames, coltypes): + arr = make_array(coltype, array_size) + array_descs[i].buf_ptr = arr.ctypes.data + array_descs[i].stride = arr.strides[0] + array_descs[i].is_object = coltype in _cqltype_to_numpy + arrays[colname] = arr + + return array_descs, arrays + + +# cdef ArrDesc array_repr(np.ndarray arr, coltype): # """ -# Get a low-level type representation for the cqltype +# Construct a low-level array representation # """ -# cdef TypeRepr res -# if coltype in _cqltype_to_numpy: -# dtype = _cqltype_to_numpy[coltype] -# res.size = dtype.itemsize -# res.is_object = False -# else: -# res.size = sizeof(PyObject *) -# res.is_object = True +# assert arr.ndim == 1, "Expected a one-dimensional array" +# +# cdef ArrDesc res +# # Get the data pointer to the underlying memory of the numpy array +# res.buf_ptr = arr.ctypes.data +# res.stride = arr.strides[0] +# res.is_object = coltype in _cqltype_to_numpy # return res -cdef ArrRepr array_repr(np.ndarray arr, coltype): - """ - Construct a low-level array representation - """ - assert arr.ndim == 1, "Expected a one-dimensional array" - - cdef ArrRepr res - # Get the data pointer to the underlying memory of the numpy array - res.buf_ptr = arr.ctypes.data - res.stride = arr.strides[0] - res.is_object = coltype in _cqltype_to_numpy - return res - - cdef class NativeRowParser(RowParser): """ This is a row parser that copies bytes into arrays (e.g. NumPy arrays) @@ -88,12 +118,11 @@ cdef class NativeRowParser(RowParser): of self.arrays """ - # ArrRepr contains a 'buf_ptr' field, which is not supported as a memoryview dtype - cdef ArrRepr[::1] arrays + cdef ArrDesc[::1] arrays cdef DataType[::1] datatypes cdef Py_ssize_t size - def __init__(self, ArrRepr[::1] arrays, DataType[::1] datatypes): + def __init__(self, ArrDesc[::1] arrays, DataType[::1] datatypes): self.arrays = arrays self.datatypes = datatypes self.size = len(datatypes) @@ -101,7 +130,7 @@ cdef class NativeRowParser(RowParser): cpdef unpack_row(self, BytesIOReader reader, protocol_version): cdef char *buf cdef Py_ssize_t i, bufsize, rowsize = self.size - cdef ArrRepr arr + cdef ArrDesc arr for i in range(rowsize): buf = get_buf(reader, &bufsize) @@ -110,7 +139,7 @@ cdef class NativeRowParser(RowParser): arr = self.arrays[i] - if arr.is_object: + if self.is_object[i]: dt = self.datatypes[i] val = dt.deserialize(buf, bufsize, protocol_version) Py_INCREF(val) From c7c50c973de4f6924fd99060fe40d8118c3b995a Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 30 Jul 2015 17:58:21 +0100 Subject: [PATCH 11/70] Fix wrong attribute access --- cassandra/numpyparser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 763c3ab2..ff0b9f33 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -139,7 +139,7 @@ cdef class NativeRowParser(RowParser): arr = self.arrays[i] - if self.is_object[i]: + if arr.is_object: dt = self.datatypes[i] val = dt.deserialize(buf, bufsize, protocol_version) Py_INCREF(val) From ed4efd2a66b5631dfa814d502f255db3a3364848 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 30 Jul 2015 17:59:01 +0100 Subject: [PATCH 12/70] Disable use of deprecated NumPy API --- cassandra/numpyFlags.h | 1 + 1 file changed, 1 insertion(+) create mode 100644 cassandra/numpyFlags.h diff --git a/cassandra/numpyFlags.h 
b/cassandra/numpyFlags.h new file mode 100644 index 00000000..6793b7a8 --- /dev/null +++ b/cassandra/numpyFlags.h @@ -0,0 +1 @@ +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION \ No newline at end of file From 43779022eceb7921973c01976a642a14a223f8cb Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 31 Jul 2015 10:32:53 +0100 Subject: [PATCH 13/70] Clean up some Cython deserialization code --- cassandra/cython_protocol_handler.pyx | 11 ++++------- cassandra/numpyparser.pyx | 23 ++++++---------------- cassandra/{rowparser.pxd => objparser.pxd} | 0 cassandra/{rowparser.pyx => objparser.pyx} | 0 4 files changed, 10 insertions(+), 24 deletions(-) rename cassandra/{rowparser.pxd => objparser.pxd} (100%) rename cassandra/{rowparser.pyx => objparser.pyx} (100%) diff --git a/cassandra/cython_protocol_handler.pyx b/cassandra/cython_protocol_handler.pyx index 644985dc..c9eb24e6 100644 --- a/cassandra/cython_protocol_handler.pyx +++ b/cassandra/cython_protocol_handler.pyx @@ -14,14 +14,14 @@ from cassandra.protocol import ResultMessage, ProtocolHandler from cassandra.bytesio cimport BytesIOReader from cassandra cimport typecodes from cassandra.datatypes cimport DataType -from cassandra.rowparser cimport RowParser +from cassandra.objparser cimport RowParser -from cassandra.rowparser import TupleRowParser +from cassandra.objparser import TupleRowParser from cassandra.datatypes import Int64, GenericDataType from cython.view cimport array as cython_array -include "marshal.pyx" +include "ioutils.pyx" class FastResultMessage(ResultMessage): @@ -50,6 +50,7 @@ class FastResultMessage(ResultMessage): def obj_array(list objs): + """Create a (Cython) array of objects given a list of objects""" cdef object[:] arr arr = cython_array(shape=(len(objs),), itemsize=sizeof(void *), format="O") # arr[:] = objs # This does not work (segmentation faults) @@ -74,10 +75,6 @@ cdef parse_rows(BytesIOReader reader, DataType[::1] datatypes, protocol_version) return [parser.unpack_row(reader, protocol_version) for i in range(rowcount)] -cdef inline int32_t read_int(BytesIOReader reader): - return int32_unpack(reader.read(4)) - - # cdef parse_rows2(BytesIOReader reader, list colnames, list coltypes, protocol_version): # cdef Py_ssize_t i, rowcount # cdef char *raw_val diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index ff0b9f33..4c279c3a 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -13,7 +13,7 @@ include "ioutils.pyx" from libc.stdint cimport uint64_t from cpython.ref cimport Py_INCREF, PyObject -from cassandra.rowparser cimport RowParser +from cassandra.objparser cimport RowParser from cassandra.bytesio cimport BytesIOReader from cassandra.datatypes cimport DataType from cassandra import cqltypes @@ -23,7 +23,8 @@ import numpy as np cdef extern from "numpyFlags.h": - + # Include 'numpyFlags.h' into the generated C code to disable the + # deprecated NumPy API pass cdef extern from "Python.h": @@ -93,20 +94,6 @@ def make_arrays(colnames, coltypes, array_size): return array_descs, arrays -# cdef ArrDesc array_repr(np.ndarray arr, coltype): -# """ -# Construct a low-level array representation -# """ -# assert arr.ndim == 1, "Expected a one-dimensional array" -# -# cdef ArrDesc res -# # Get the data pointer to the underlying memory of the numpy array -# res.buf_ptr = arr.ctypes.data -# res.stride = arr.strides[0] -# res.is_object = coltype in _cqltype_to_numpy -# return res - - cdef class NativeRowParser(RowParser): """ This is a row parser that copies bytes into arrays 
(e.g. NumPy arrays) @@ -165,9 +152,11 @@ def make_native_byteorder(arr): """ Make sure all values have a native endian in the NumPy arrays. """ - if is_little_endian: + if is_little_endian and not arr.dtype.kind == 'O': # We have arrays in big-endian order. First swap the bytes # into little endian order, and then update the numpy dtype # accordingly (e.g. from '>i8' to ' Date: Fri, 31 Jul 2015 13:37:16 +0100 Subject: [PATCH 14/70] Add lazy and list-based column deserializers --- cassandra/cython_protocol_handler.pyx | 69 ++++++++++++++++++--------- cassandra/objparser.pxd | 6 +++ cassandra/objparser.pyx | 35 ++++++++++++++ 3 files changed, 88 insertions(+), 22 deletions(-) diff --git a/cassandra/cython_protocol_handler.pyx b/cassandra/cython_protocol_handler.pyx index c9eb24e6..e8fabad0 100644 --- a/cassandra/cython_protocol_handler.pyx +++ b/cassandra/cython_protocol_handler.pyx @@ -14,9 +14,9 @@ from cassandra.protocol import ResultMessage, ProtocolHandler from cassandra.bytesio cimport BytesIOReader from cassandra cimport typecodes from cassandra.datatypes cimport DataType -from cassandra.objparser cimport RowParser +from cassandra.objparser cimport ColumnParser, RowParser -from cassandra.objparser import TupleRowParser +from cassandra.objparser import ListParser from cassandra.datatypes import Int64, GenericDataType from cython.view cimport array as cython_array @@ -24,16 +24,12 @@ from cython.view cimport array as cython_array include "ioutils.pyx" -class FastResultMessage(ResultMessage): - """ - Cython version of Result Message that has a faster implementation of - recv_results_row. - """ - # type_codes = ResultMessage.type_codes.copy() - code_to_type = dict((v, k) for k, v in ResultMessage.type_codes.items()) - - @classmethod +def make_recv_results_rows(ColumnParser colparser): def recv_results_rows(cls, f, protocol_version, user_type_map): + """ + Parse protocol data given as a BytesIO f into a set of columns (e.g. list of tuples) + This is used as the recv_results_rows method of (Fast)ResultMessage + """ paging_state, column_metadata = cls.recv_results_metadata(f, user_type_map) colnames = [c[2] for c in column_metadata] @@ -44,9 +40,12 @@ class FastResultMessage(ResultMessage): [Int64() if coltype == LongType else GenericDataType(coltype) for coltype in coltypes]) # [GenericDataType(coltype) for coltype in coltypes]) + parsed_rows = colparser.parse_rows( + BytesIOReader(f.read()), datatypes, protocol_version) # parsed_rows = parse_rows2(BytesIOReader(f.read()), colnames, coltypes, protocol_version) - parsed_rows = parse_rows(BytesIOReader(f.read()), datatypes, protocol_version) + # parsed_rows = parse_rows(BytesIOReader(f.read()), datatypes, protocol_version) return (paging_state, (colnames, parsed_rows)) + return recv_results_rows def obj_array(list objs): @@ -59,20 +58,46 @@ def obj_array(list objs): return arr -class CythonProtocolHandler(ProtocolHandler): +def make_protocol_handler(colparser=ListParser()): """ - Use FastResultMessage to decode query result message messages. + Given a column parser to deserialize ResultMessages, return a suitable + Cython-based protocol handler. + + There are three Cython-based protocol handlers (least to most performant): + + 1. objparser.ListParser + this parser decodes result messages into a list of tuples + + 2. objparser.LazyParser + this parser decodes result messages lazily by returning an iterator + + 3. 
numpyparser.NumPyParser + this parser decodes result messages into NumPy arrays + + The default is to use objparser.ListParser """ - my_opcodes = ProtocolHandler.message_types_by_opcode.copy() - my_opcodes[FastResultMessage.opcode] = FastResultMessage - message_types_by_opcode = my_opcodes + # TODO: It may be cleaner to turn ProtocolHandler and ResultMessage into + # TODO: instances and use methods instead of class methods + class FastResultMessage(ResultMessage): + """ + Cython version of Result Message that has a faster implementation of + recv_results_row. + """ + # type_codes = ResultMessage.type_codes.copy() + code_to_type = dict((v, k) for k, v in ResultMessage.type_codes.items()) + recv_results_rows = classmethod(make_recv_results_rows(colparser)) -cdef parse_rows(BytesIOReader reader, DataType[::1] datatypes, protocol_version): - cdef Py_ssize_t i, rowcount - cdef RowParser parser = TupleRowParser(len(datatypes), datatypes) - rowcount = read_int(reader) - return [parser.unpack_row(reader, protocol_version) for i in range(rowcount)] + class CythonProtocolHandler(ProtocolHandler): + """ + Use FastResultMessage to decode query result message messages. + """ + + my_opcodes = ProtocolHandler.message_types_by_opcode.copy() + my_opcodes[FastResultMessage.opcode] = FastResultMessage + message_types_by_opcode = my_opcodes + + return CythonProtocolHandler # cdef parse_rows2(BytesIOReader reader, list colnames, list coltypes, protocol_version): diff --git a/cassandra/objparser.pxd b/cassandra/objparser.pxd index 7597cca9..edfa2a60 100644 --- a/cassandra/objparser.pxd +++ b/cassandra/objparser.pxd @@ -1,4 +1,10 @@ from cassandra.bytesio cimport BytesIOReader +from cassandra.datatypes cimport DataType + +cdef class ColumnParser: + cpdef parse_rows(self, BytesIOReader reader, DataType[::1] datatypes, + protocol_version) + cdef class RowParser: cpdef unpack_row(self, BytesIOReader reader, protocol_version) diff --git a/cassandra/objparser.pyx b/cassandra/objparser.pyx index d09bdf94..1fa9d283 100644 --- a/cassandra/objparser.pyx +++ b/cassandra/objparser.pyx @@ -22,7 +22,42 @@ from cassandra.bytesio cimport BytesIOReader from cassandra.datatypes cimport DataType +cdef class ColumnParser: + """Decode a ResultMessage into a set of columns""" + cpdef parse_rows(self, BytesIOReader reader, DataType[::1] datatypes, + protocol_version): + raise NotImplementedError + + +cdef class ListParser(ColumnParser): + """Decode a ResultMessage into a list of tuples (or other objects)""" + + cpdef parse_rows(self, BytesIOReader r, DataType[::1] datatypes, ver): + cdef Py_ssize_t i, rowcount + rowcount = read_int(r) + cdef RowParser rowparser = TupleRowParser(len(datatypes), datatypes) + return [rowparser.unpack_row(r, ver) for i in range(rowcount)] + + +cdef class LazyParser(ColumnParser): + """Decode a ResultMessage lazily using a generator""" + + cpdef parse_rows(self, BytesIOReader r, DataType[::1] datatypes, ver): + # Use a little helper function as closures (generators) are not + # supported in cpdef methods + return parse_rows_lazy(r, self.rowparser, datatypes, ver) + + +def parse_rows_lazy(BytesIOReader r, DataType[::1] datatypes, ver): + cdef Py_ssize_t i, rowcount + rowcount = read_int(r) + cdef RowParser rowparser = TupleRowParser(len(datatypes), datatypes) + return (rowparser.unpack_row(r, ver) for i in range(rowcount)) + + cdef class RowParser: + """Parser for a single row""" + cpdef unpack_row(self, BytesIOReader reader, protocol_version): """ Unpack a single row of data in a ResultMessage. 
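For reference, the handler returned by make_protocol_handler above is meant to be installed per session via the client_protocol_handler attribute, which is exactly how the integration tests added later in this series use it. A minimal usage sketch in plain Python (keyspace and table names follow those test fixtures; protocol version selection and error handling are omitted):

    from cassandra.cluster import Cluster
    from cassandra.cython_protocol_handler import make_protocol_handler
    from cassandra.objparser import LazyParser

    cluster = Cluster()
    session = cluster.connect("testspace")

    # Install a Cython-based handler; with LazyParser the rows are produced lazily.
    session.client_protocol_handler = make_protocol_handler(LazyParser())

    for row in session.execute("SELECT * FROM test_table"):
        print(row)
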
From 7fbc6aa731eca509e19d739e6277de6a09f2bcc9 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 31 Jul 2015 15:51:20 +0100 Subject: [PATCH 15/70] Some fixes to numpy and object deserializers --- cassandra/cython_protocol_handler.pyx | 38 +++-------- cassandra/datatypes.pxd | 16 +---- cassandra/datatypes.pyx | 39 ++++++++++-- cassandra/numpyparser.pyx | 91 +++++++++++++++------------ cassandra/objparser.pxd | 11 ---- cassandra/objparser.pyx | 55 +++++----------- cassandra/parsing.pxd | 16 +++++ cassandra/parsing.pyx | 30 +++++++++ 8 files changed, 157 insertions(+), 139 deletions(-) delete mode 100644 cassandra/objparser.pxd create mode 100644 cassandra/parsing.pxd create mode 100644 cassandra/parsing.pyx diff --git a/cassandra/cython_protocol_handler.pyx b/cassandra/cython_protocol_handler.pyx index e8fabad0..98c7f1d6 100644 --- a/cassandra/cython_protocol_handler.pyx +++ b/cassandra/cython_protocol_handler.pyx @@ -6,20 +6,16 @@ from libc.stdint cimport int64_t, int32_t # uint16_pack, uint16_unpack, uint32_pack, uint32_unpack, # int32_pack, int32_unpack, int64_pack, int64_unpack, float_pack, float_unpack, double_pack, double_unpack) -from cassandra.marshal import varint_pack, varint_unpack -from cassandra import util -from cassandra.cqltypes import EMPTY, LongType +# from cassandra.marshal import varint_pack, varint_unpack +# from cassandra import util +# from cassandra.cqltypes import EMPTY, LongType from cassandra.protocol import ResultMessage, ProtocolHandler -from cassandra.bytesio cimport BytesIOReader -from cassandra cimport typecodes -from cassandra.datatypes cimport DataType -from cassandra.objparser cimport ColumnParser, RowParser - +# from cassandra.bytesio cimport BytesIOReader +from cassandra.parsing cimport ParseDesc, ColumnParser +from cassandra.datatypes import make_datatypes from cassandra.objparser import ListParser -from cassandra.datatypes import Int64, GenericDataType -from cython.view cimport array as cython_array include "ioutils.pyx" @@ -35,29 +31,15 @@ def make_recv_results_rows(ColumnParser colparser): colnames = [c[2] for c in column_metadata] coltypes = [c[3] for c in column_metadata] - cdef DataType[::1] datatypes - datatypes = obj_array( - [Int64() if coltype == LongType else GenericDataType(coltype) for coltype in coltypes]) - # [GenericDataType(coltype) for coltype in coltypes]) + desc = ParseDesc(colnames, coltypes, make_datatypes(coltypes), protocol_version) + reader = BytesIOReader(f.read()) + parsed_rows = colparser.parse_rows(reader, desc) - parsed_rows = colparser.parse_rows( - BytesIOReader(f.read()), datatypes, protocol_version) - # parsed_rows = parse_rows2(BytesIOReader(f.read()), colnames, coltypes, protocol_version) - # parsed_rows = parse_rows(BytesIOReader(f.read()), datatypes, protocol_version) return (paging_state, (colnames, parsed_rows)) + return recv_results_rows -def obj_array(list objs): - """Create a (Cython) array of objects given a list of objects""" - cdef object[:] arr - arr = cython_array(shape=(len(objs),), itemsize=sizeof(void *), format="O") - # arr[:] = objs # This does not work (segmentation faults) - for i, obj in enumerate(objs): - arr[i] = obj - return arr - - def make_protocol_handler(colparser=ListParser()): """ Given a column parser to deserialize ResultMessages, return a suitable diff --git a/cassandra/datatypes.pxd b/cassandra/datatypes.pxd index d4db2b02..cd58b6b3 100644 --- a/cassandra/datatypes.pxd +++ b/cassandra/datatypes.pxd @@ -1,17 +1,3 @@ -# cdef class LLDataType: -# """ -# Low-level Cassandra datatype 
-# """ -# -# cdef Py_ssize_t size -# -# cdef void deserialize_ptr(self, char *buf, Py_ssize_t size, -# Py_ssize_t index, void *out, protocol_version) - cdef class DataType: + cdef object cqltype cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version) - - -cdef class Int64(DataType): - pass - diff --git a/cassandra/datatypes.pyx b/cassandra/datatypes.pyx index a1c50fcb..24dd18e6 100644 --- a/cassandra/datatypes.pyx +++ b/cassandra/datatypes.pyx @@ -1,8 +1,21 @@ include 'marshal.pyx' +from cython.view cimport array as cython_array +from cassandra.datatypes import Int64, GenericDataType +from cassandra.cqltypes import LongType + +# TODO: Port cqltypes to this module + cdef class DataType: + """ + Cython-based datatype + """ + + def __init__(self, cqltype): + self.cqltype = cqltype + cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version): - pass + raise NotImplementedError cdef class Int64(DataType): @@ -20,14 +33,28 @@ cdef class GenericDataType(DataType): Wrap a generic datatype for deserialization """ - cdef object cqltype - - def __init__(self, cqltype): - self.cqltype = cqltype - cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version): return self.cqltype.deserialize(buf[:size], protocol_version) def __str__(self): return "GenericDataType(%s)" % (self.cqltype,) + +def make_datatypes(coltypes): + cdef DataType[::1] datatypes + return obj_array([make_datatype(ct) for ct in coltypes]) + + +def make_datatype(coltype): + return Int64(coltype) if coltype == LongType else GenericDataType(coltype) + + +def obj_array(list objs): + """Create a (Cython) array of objects given a list of objects""" + cdef object[:] arr + arr = cython_array(shape=(len(objs),), itemsize=sizeof(void *), format="O") + # arr[:] = objs # This does not work (segmentation faults) + for i, obj in enumerate(objs): + arr[i] = obj + return arr + diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 4c279c3a..3dd28286 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -13,13 +13,14 @@ include "ioutils.pyx" from libc.stdint cimport uint64_t from cpython.ref cimport Py_INCREF, PyObject -from cassandra.objparser cimport RowParser from cassandra.bytesio cimport BytesIOReader from cassandra.datatypes cimport DataType +from cassandra.parsing cimport ParseDesc, ColumnParser, RowParser from cassandra import cqltypes from cassandra.util import is_little_endian import numpy as np +import pandas as pd cdef extern from "numpyFlags.h": @@ -31,26 +32,18 @@ cdef extern from "Python.h": # An integer type large enough to hold a pointer ctypedef uint64_t Py_uintptr_t -cdef extern from "numpy/arrayobject.h": - # Avoid using 'numpy' from Cython, as it access the 'data' attribute - # of PyArrayObject, which is deprecated: - # - # warning: #warning "Using deprecated NumPy API, disable it by - # #defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp] - # - ctypedef class np.ndarray [object PyArrayObject]: - pass - # Simple array descriptor, useful to parse rows into a NumPy array ctypedef struct ArrDesc: Py_uintptr_t buf_ptr - Py_ssize_t stride + int stride # should be large enough as we allocate contiguous arrays int is_object -cdef ArrDesc[:] _dummyArray = NULL -arrDescDtype = np.array(_dummyArray).dtype - +arrDescDtype = np.dtype( + [ ('buf_ptr', np.uintp) + , ('stride', np.dtype('i')) + , ('is_object', np.dtype('i')) + ]) _cqltype_to_numpy = { cqltypes.LongType: np.dtype('>i8'), @@ -63,15 +56,27 @@ _cqltype_to_numpy = { obj_dtype = 
np.dtype('O') -def make_array(coltype, array_size): - """ - Allocate a new NumPy array of the given column type and size. - """ - dtype = _cqltype_to_numpy.get(coltype, obj_dtype) - return np.empty((array_size,), dtype=dtype) + +cdef class NumpyParser(ColumnParser): + """Decode a ResultMessage into a bunch of NumPy arrays""" + + cpdef parse_rows(self, BytesIOReader reader, ParseDesc desc): + cdef Py_ssize_t i, rowcount + + rowcount = read_int(reader) + array_descs, arrays = make_arrays(desc, rowcount) + cdef RowParser rowparser = NumPyRowParser(array_descs) + for i in range(rowcount): + rowparser.unpack_row(reader, desc) + + # arrays = map(make_native_byteorder, arrays) + return arrays + # return pd.DataFrame(dict(zip(desc.colnames, arrays))) -def make_arrays(colnames, coltypes, array_size): +### Helper functions to create NumPy arrays and array descriptors + +def make_arrays(ParseDesc desc, array_size): """ Allocate arrays for each result column. @@ -80,21 +85,30 @@ def make_arrays(colnames, coltypes, array_size): 'arrays' is a dict mapping column names to arrays (e.g. this can be fed into pandas.DataFrame) """ - row_size = len(colnames) - array_descs = np.empty((row_size,), arrDescDtype) - arrays = {} + array_descs = np.empty((desc.rowsize,), arrDescDtype) + arrays = [] - for i, colname, coltype in zip(range(row_size), colnames, coltypes): + for i, coltype in enumerate(desc.coltypes): arr = make_array(coltype, array_size) - array_descs[i].buf_ptr = arr.ctypes.data - array_descs[i].stride = arr.strides[0] - array_descs[i].is_object = coltype in _cqltype_to_numpy - arrays[colname] = arr + array_descs[i]['buf_ptr'] = arr.ctypes.data + array_descs[i]['stride'] = arr.strides[0] + array_descs[i]['is_object'] = coltype in _cqltype_to_numpy + arrays.append(arr) return array_descs, arrays -cdef class NativeRowParser(RowParser): +def make_array(coltype, array_size): + """ + Allocate a new NumPy array of the given column type and size. + """ + dtype = _cqltype_to_numpy.get(coltype, obj_dtype) + return np.empty((array_size,), dtype=dtype) + + +#### Parse rows into NumPy arrays + +cdef class NumPyRowParser(RowParser): """ This is a row parser that copies bytes into arrays (e.g. NumPy arrays) for types it recognizes, such as int64. 
Values of other types are @@ -106,18 +120,15 @@ cdef class NativeRowParser(RowParser): """ cdef ArrDesc[::1] arrays - cdef DataType[::1] datatypes - cdef Py_ssize_t size - def __init__(self, ArrDesc[::1] arrays, DataType[::1] datatypes): + def __init__(self, ArrDesc[::1] arrays): self.arrays = arrays - self.datatypes = datatypes - self.size = len(datatypes) - cpdef unpack_row(self, BytesIOReader reader, protocol_version): + cpdef unpack_row(self, BytesIOReader reader, ParseDesc desc): cdef char *buf - cdef Py_ssize_t i, bufsize, rowsize = self.size + cdef Py_ssize_t i, bufsize, rowsize = desc.rowsize cdef ArrDesc arr + cdef DataType dt for i in range(rowsize): buf = get_buf(reader, &bufsize) @@ -127,8 +138,8 @@ cdef class NativeRowParser(RowParser): arr = self.arrays[i] if arr.is_object: - dt = self.datatypes[i] - val = dt.deserialize(buf, bufsize, protocol_version) + dt = desc.datatypes[i] + val = dt.deserialize(buf, bufsize, desc.protocol_version) Py_INCREF(val) ( arr.buf_ptr)[0] = val else: diff --git a/cassandra/objparser.pxd b/cassandra/objparser.pxd deleted file mode 100644 index edfa2a60..00000000 --- a/cassandra/objparser.pxd +++ /dev/null @@ -1,11 +0,0 @@ -from cassandra.bytesio cimport BytesIOReader -from cassandra.datatypes cimport DataType - -cdef class ColumnParser: - cpdef parse_rows(self, BytesIOReader reader, DataType[::1] datatypes, - protocol_version) - - -cdef class RowParser: - cpdef unpack_row(self, BytesIOReader reader, protocol_version) - diff --git a/cassandra/objparser.pyx b/cassandra/objparser.pyx index 1fa9d283..da6e6c01 100644 --- a/cassandra/objparser.pyx +++ b/cassandra/objparser.pyx @@ -20,49 +20,33 @@ from cpython.ref cimport ( from cassandra.bytesio cimport BytesIOReader from cassandra.datatypes cimport DataType - - -cdef class ColumnParser: - """Decode a ResultMessage into a set of columns""" - cpdef parse_rows(self, BytesIOReader reader, DataType[::1] datatypes, - protocol_version): - raise NotImplementedError +from cassandra.parsing cimport ParseDesc, ColumnParser, RowParser cdef class ListParser(ColumnParser): """Decode a ResultMessage into a list of tuples (or other objects)""" - cpdef parse_rows(self, BytesIOReader r, DataType[::1] datatypes, ver): + cpdef parse_rows(self, BytesIOReader reader, ParseDesc desc): cdef Py_ssize_t i, rowcount - rowcount = read_int(r) - cdef RowParser rowparser = TupleRowParser(len(datatypes), datatypes) - return [rowparser.unpack_row(r, ver) for i in range(rowcount)] + rowcount = read_int(reader) + cdef RowParser rowparser = TupleRowParser() + return [rowparser.unpack_row(reader, desc) for i in range(rowcount)] cdef class LazyParser(ColumnParser): """Decode a ResultMessage lazily using a generator""" - cpdef parse_rows(self, BytesIOReader r, DataType[::1] datatypes, ver): + cpdef parse_rows(self, BytesIOReader reader, ParseDesc desc): # Use a little helper function as closures (generators) are not # supported in cpdef methods - return parse_rows_lazy(r, self.rowparser, datatypes, ver) + return parse_rows_lazy(reader, desc) -def parse_rows_lazy(BytesIOReader r, DataType[::1] datatypes, ver): +def parse_rows_lazy(BytesIOReader reader, ParseDesc desc): cdef Py_ssize_t i, rowcount - rowcount = read_int(r) - cdef RowParser rowparser = TupleRowParser(len(datatypes), datatypes) - return (rowparser.unpack_row(r, ver) for i in range(rowcount)) - - -cdef class RowParser: - """Parser for a single row""" - - cpdef unpack_row(self, BytesIOReader reader, protocol_version): - """ - Unpack a single row of data in a ResultMessage. 
- """ - raise NotImplementedError + rowcount = read_int(reader) + cdef RowParser rowparser = TupleRowParser() + return (rowparser.unpack_row(reader, desc) for i in range(rowcount)) cdef class TupleRowParser(RowParser): @@ -78,26 +62,19 @@ cdef class TupleRowParser(RowParser): into objects """ - cdef DataType[::1] datatypes - cdef Py_ssize_t size - - def __init__(self, Py_ssize_t n, DataType[::1] datatypes): - self.datatypes = datatypes - self.size = n - - cpdef unpack_row(self, BytesIOReader reader, protocol_version): + cpdef unpack_row(self, BytesIOReader reader, ParseDesc desc): cdef char *buf - cdef Py_ssize_t i, bufsize, rowsize = self.size + cdef Py_ssize_t i, bufsize, rowsize = desc.rowsize cdef DataType dt - cdef tuple res = PyTuple_New(self.size) + cdef tuple res = PyTuple_New(desc.rowsize) for i in range(rowsize): buf = get_buf(reader, &bufsize) if buf == NULL: val = None else: - dt = self.datatypes[i] - val = dt.deserialize(buf, bufsize, protocol_version) + dt = desc.datatypes[i] + val = dt.deserialize(buf, bufsize, desc.protocol_version) Py_INCREF(val) PyTuple_SET_ITEM(res, i, val) diff --git a/cassandra/parsing.pxd b/cassandra/parsing.pxd new file mode 100644 index 00000000..c4774385 --- /dev/null +++ b/cassandra/parsing.pxd @@ -0,0 +1,16 @@ +from cassandra.bytesio cimport BytesIOReader +from cassandra.datatypes cimport DataType + +cdef class ParseDesc: + cdef public object colnames + cdef public object coltypes + cdef DataType[::1] datatypes + cdef public object protocol_version + cdef Py_ssize_t rowsize + +cdef class ColumnParser: + cpdef parse_rows(self, BytesIOReader reader, ParseDesc desc) + +cdef class RowParser: + cpdef unpack_row(self, BytesIOReader reader, ParseDesc desc) + diff --git a/cassandra/parsing.pyx b/cassandra/parsing.pyx new file mode 100644 index 00000000..71196d14 --- /dev/null +++ b/cassandra/parsing.pyx @@ -0,0 +1,30 @@ +""" +Module containing the definitions and declarations (parsing.pxd) for parsers. +""" + +cdef class ParseDesc: + """Description of what structure to parse""" + + def __init__(self, colnames, coltypes, datatypes, protocol_version): + self.colnames = colnames + self.coltypes = coltypes + self.datatypes = datatypes + self.protocol_version = protocol_version + self.rowsize = len(colnames) + + +cdef class ColumnParser: + """Decode a ResultMessage into a set of columns""" + + cpdef parse_rows(self, BytesIOReader reader, ParseDesc desc): + raise NotImplementedError + + +cdef class RowParser: + """Parser for a single row""" + + cpdef unpack_row(self, BytesIOReader reader, ParseDesc desc): + """ + Unpack a single row of data in a ResultMessage. 
+ """ + raise NotImplementedError From 51e090cc61a4a2ace3772556a109ac1968602d30 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 31 Jul 2015 17:49:24 +0100 Subject: [PATCH 16/70] Experiment with various optimizations --- cassandra/datatypes.pyx | 2 ++ cassandra/marshal.pyx | 7 ++-- cassandra/numpyparser.pyx | 76 ++++++++++++++++++--------------------- 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/cassandra/datatypes.pyx b/cassandra/datatypes.pyx index 24dd18e6..272435b2 100644 --- a/cassandra/datatypes.pyx +++ b/cassandra/datatypes.pyx @@ -1,3 +1,5 @@ +# -- cython: profile=True + include 'marshal.pyx' from cython.view cimport array as cython_array diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx index 92fb1293..cc80461b 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/marshal.pyx @@ -53,12 +53,15 @@ cdef inline void swap_order(char *buf, Py_ssize_t size): cdef char c if is_little_endian: - for i in range(size//2): + for i in range(div2(size)): end = size - i - 1 c = buf[i] buf[i] = buf[end] buf[end] = c +cdef inline Py_ssize_t div2(Py_ssize_t x): + return x >> 1 + ### Packing and unpacking of signed integers cpdef inline bytes int64_pack(int64_t x): @@ -80,9 +83,9 @@ cpdef inline bytes int32_pack(int32_t x): cpdef inline int32_t int32_unpack(const char *buf): cdef int32_t x = ( buf)[0] cdef char *p = &x - swap_order( &x, 4) # if is_little_endian: # p[0], p[1], p[2], p[3] = p[3], p[2], p[1], p[0] + swap_order( &x, 4) return x cpdef inline bytes int16_pack(int16_t x): diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 3dd28286..9360c247 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -1,3 +1,5 @@ +# -- cython: profile=True + """ This module provider an optional protocol parser that returns NumPy arrays. @@ -10,6 +12,7 @@ as numpy is an optional dependency. include "ioutils.pyx" +cimport cython from libc.stdint cimport uint64_t from cpython.ref cimport Py_INCREF, PyObject @@ -62,15 +65,17 @@ cdef class NumpyParser(ColumnParser): cpdef parse_rows(self, BytesIOReader reader, ParseDesc desc): cdef Py_ssize_t i, rowcount + cdef ArrDesc[::1] array_descs + cdef ArrDesc *arrs rowcount = read_int(reader) array_descs, arrays = make_arrays(desc, rowcount) - cdef RowParser rowparser = NumPyRowParser(array_descs) - for i in range(rowcount): - rowparser.unpack_row(reader, desc) + arrs = &array_descs[0] - # arrays = map(make_native_byteorder, arrays) - return arrays + for i in range(rowcount): + unpack_row(reader, desc, arrs) + + return [make_native_byteorder(arr) for arr in arrays] # return pd.DataFrame(dict(zip(desc.colnames, arrays))) @@ -92,7 +97,7 @@ def make_arrays(ParseDesc desc, array_size): arr = make_array(coltype, array_size) array_descs[i]['buf_ptr'] = arr.ctypes.data array_descs[i]['stride'] = arr.strides[0] - array_descs[i]['is_object'] = coltype in _cqltype_to_numpy + array_descs[i]['is_object'] = coltype not in _cqltype_to_numpy arrays.append(arr) return array_descs, arrays @@ -108,48 +113,37 @@ def make_array(coltype, array_size): #### Parse rows into NumPy arrays -cdef class NumPyRowParser(RowParser): - """ - This is a row parser that copies bytes into arrays (e.g. NumPy arrays) - for types it recognizes, such as int64. Values of other types are - converted to objects. 
+@cython.boundscheck(False) +@cython.wraparound(False) +cdef inline int unpack_row( + BytesIOReader reader, ParseDesc desc, ArrDesc *arrays) except -1: + cdef char *buf + cdef Py_ssize_t i, bufsize, rowsize = desc.rowsize + cdef ArrDesc arr + cdef DataType dt - NOTE: This class is stateful, in that every time unpack_row is called it - advanced the pointer into the array by updates the buf_ptr field - of self.arrays - """ + for i in range(rowsize): + buf = get_buf(reader, &bufsize) + if buf == NULL: + raise ValueError("Unexpected end of stream") - cdef ArrDesc[::1] arrays + arr = arrays[i] - def __init__(self, ArrDesc[::1] arrays): - self.arrays = arrays + if arr.is_object: + dt = desc.datatypes[i] + val = dt.deserialize(buf, bufsize, desc.protocol_version) + Py_INCREF(val) + ( arr.buf_ptr)[0] = val + else: + memcopy(buf, arr.buf_ptr, bufsize) - cpdef unpack_row(self, BytesIOReader reader, ParseDesc desc): - cdef char *buf - cdef Py_ssize_t i, bufsize, rowsize = desc.rowsize - cdef ArrDesc arr - cdef DataType dt + # Update the pointer into the array for the next time + arrays[i].buf_ptr += arr.stride - for i in range(rowsize): - buf = get_buf(reader, &bufsize) - if buf == NULL: - raise ValueError("Unexpected end of stream") - - arr = self.arrays[i] - - if arr.is_object: - dt = desc.datatypes[i] - val = dt.deserialize(buf, bufsize, desc.protocol_version) - Py_INCREF(val) - ( arr.buf_ptr)[0] = val - else: - memcopy(buf, arr.buf_ptr, bufsize) - - # Update the pointer into the array for the next time - self.arrays[i].buf_ptr += arr.stride + return 0 -cdef inline memcopy(char *src, char *dst, Py_ssize_t size): +cdef inline void memcopy(char *src, char *dst, Py_ssize_t size): """ Our own simple memcopy which can be inlined. This is useful because our data types are only a few bytes. 
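The unpack_row loop above treats each ArrDesc record as a per-column write cursor: fixed-width values are memcopied straight into the column's preallocated NumPy buffer and buf_ptr is advanced by stride, while object columns fall back to the Python-level deserializer. A rough pure-NumPy sketch of how make_arrays fills in those records (column names, row count and dtypes here are invented for illustration):

    import numpy as np

    # Structured dtype mirroring the ArrDesc records built by make_arrays.
    arr_desc_dtype = np.dtype([('buf_ptr', np.uintp),
                               ('stride', np.dtype('i')),
                               ('is_object', np.dtype('i'))])

    # One preallocated array per result column.
    columns = {'id': np.empty(3, dtype='>i8'), 'name': np.empty(3, dtype='O')}

    descs = np.empty(len(columns), dtype=arr_desc_dtype)
    for i, arr in enumerate(columns.values()):
        descs[i]['buf_ptr'] = arr.ctypes.data    # raw pointer to the column's buffer
        descs[i]['stride'] = arr.strides[0]      # bytes between consecutive rows
        descs[i]['is_object'] = arr.dtype.kind == 'O'   # needs Python-level deserialization
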
From 24e03f7f1409b7edb7b5f96025159c83608d6710 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 4 Aug 2015 10:56:28 +0100 Subject: [PATCH 17/70] Clean up some old code --- cassandra/bytesio.pyx | 1 - cassandra/cython_protocol_handler.pyx | 118 -------------------------- cassandra/ioutils.pyx | 1 - 3 files changed, 120 deletions(-) diff --git a/cassandra/bytesio.pyx b/cassandra/bytesio.pyx index d392b23f..b18b1aa5 100644 --- a/cassandra/bytesio.pyx +++ b/cassandra/bytesio.pyx @@ -53,4 +53,3 @@ class PyBytesIOReader(BytesIOReader): r = self.buf[self.pos:newpos] self.pos = newpos return r - diff --git a/cassandra/cython_protocol_handler.pyx b/cassandra/cython_protocol_handler.pyx index 98c7f1d6..6ef3ae9f 100644 --- a/cassandra/cython_protocol_handler.pyx +++ b/cassandra/cython_protocol_handler.pyx @@ -1,17 +1,7 @@ # -- cython: profile=True -from libc.stdint cimport int64_t, int32_t - -# from cassandra.marshal cimport (int8_pack, int8_unpack, int16_pack, int16_unpack, -# uint16_pack, uint16_unpack, uint32_pack, uint32_unpack, -# int32_pack, int32_unpack, int64_pack, int64_unpack, float_pack, float_unpack, double_pack, double_unpack) - -# from cassandra.marshal import varint_pack, varint_unpack -# from cassandra import util -# from cassandra.cqltypes import EMPTY, LongType from cassandra.protocol import ResultMessage, ProtocolHandler -# from cassandra.bytesio cimport BytesIOReader from cassandra.parsing cimport ParseDesc, ColumnParser from cassandra.datatypes import make_datatypes from cassandra.objparser import ListParser @@ -80,111 +70,3 @@ def make_protocol_handler(colparser=ListParser()): message_types_by_opcode = my_opcodes return CythonProtocolHandler - - -# cdef parse_rows2(BytesIOReader reader, list colnames, list coltypes, protocol_version): -# cdef Py_ssize_t i, rowcount -# cdef char *raw_val -# cdef int[::1] colcodes -# -# colcodes = np.array( -# [FastResultMessage.code_to_type.get(coltype, -1) for coltype in coltypes], -# dtype=np.dtype('i')) -# -# rowcount = read_int(reader) -# # return RowIterator(reader, coltypes, colcodes, protocol_version, rowcount) -# return [parse_row(reader, coltypes, colcodes, protocol_version) -# for i in range(rowcount)] -# -# -# cdef class RowIterator: -# """ -# Result iterator for a set of rows -# -# There seems to be an issue with generator expressions + memoryviews, so we -# have a special iterator class instead. 
-# """ -# -# cdef list coltypes -# cdef int[::1] colcodes -# cdef Py_ssize_t rowcount, pos -# cdef BytesIOReader reader -# cdef object protocol_version -# -# def __init__(self, reader, coltypes, colcodes, protocol_version, rowcount): -# self.reader = reader -# self.coltypes = coltypes -# self.colcodes = colcodes -# self.protocol_version = protocol_version -# self.rowcount = rowcount -# self.pos = 0 -# -# def __iter__(self): -# return self -# -# def __next__(self): -# if self.pos >= self.rowcount: -# raise StopIteration -# self.pos += 1 -# return parse_row(self.reader, self.coltypes, self.colcodes, self.protocol_version) -# -# next = __next__ -# -# -# cdef inline parse_row(BytesIOReader reader, list coltypes, int[::1] colcodes, -# protocol_version): -# cdef Py_ssize_t j -# -# row = [] -# for j, ctype in enumerate(coltypes): -# raw_val_size = read_int(reader) -# if raw_val_size < 0: -# val = None -# else: -# raw_val = reader.read(raw_val_size) -# val = from_binary(ctype, colcodes[j], raw_val, -# raw_val_size, protocol_version) -# row.append(val) -# -# return row -# -# -# cdef inline from_binary(cqltype, int typecode, char *byts, int32_t size, protocol_version): -# """ -# Deserialize a bytestring into a value. See the deserialize() method -# for more information. This method differs in that if None or the empty -# string is passed in, None may be returned. -# -# This method provides a fast-path deserialization routine. -# """ -# if size == 0 and cqltype.empty_binary_ok: -# return empty(cqltype) -# return deserialize(cqltype, typecode, byts, size, protocol_version) -# -# -# cdef empty(cqltype): -# return EMPTY if cqltype.support_empty_values else None -# -# -# def to_binary(cqltype, val, protocol_version): -# """ -# Serialize a value into a bytestring. See the serialize() method for -# more information. This method differs in that if None is passed in, -# the result is the empty string. 
-# """ -# return b'' if val is None else cqltype.serialize(val, protocol_version) -# -# cdef DataType obj = Int64() -# -# cdef deserialize(cqltype, int typecode, char *byts, int32_t size, protocol_version): -# # if typecode == typecodes.LongType: -# # # return int64_unpack(byts) -# # return obj.deserialize(byts, size, protocol_version) -# # else: -# # return deserialize_generic(cqltype, typecode, byts, size, protocol_version) -# return cqltype.deserialize(byts[:size], protocol_version) -# -# cdef deserialize_generic(cqltype, int typecode, char *byts, int32_t size, -# protocol_version): -# return cqltype.deserialize(byts[:size], protocol_version) -# \ No newline at end of file diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx index 8749457b..41d50851 100644 --- a/cassandra/ioutils.pyx +++ b/cassandra/ioutils.pyx @@ -18,4 +18,3 @@ cdef inline char *get_buf(BytesIOReader reader, Py_ssize_t *size_out): cdef inline int32_t read_int(BytesIOReader reader): return int32_unpack(reader.read(4)) - From e671354ebfa887d9dafcea24819ae529aa0a52c8 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 4 Aug 2015 11:11:56 +0100 Subject: [PATCH 18/70] Improve error handling for deserialization --- cassandra/bytesio.pxd | 2 +- cassandra/bytesio.pyx | 15 +++++++++------ cassandra/ioutils.pyx | 9 ++++----- cassandra/numpyparser.pyx | 4 ---- cassandra/objparser.pyx | 7 ++----- 5 files changed, 16 insertions(+), 21 deletions(-) diff --git a/cassandra/bytesio.pxd b/cassandra/bytesio.pxd index 349fd600..9754dd23 100644 --- a/cassandra/bytesio.pxd +++ b/cassandra/bytesio.pxd @@ -3,5 +3,5 @@ cdef class BytesIOReader: cdef char *buf_ptr cdef Py_ssize_t pos cdef Py_ssize_t size - cdef char *read(self, Py_ssize_t n = ?) + cdef char *read(self, Py_ssize_t n = ?) except NULL diff --git a/cassandra/bytesio.pyx b/cassandra/bytesio.pyx index b18b1aa5..68796120 100644 --- a/cassandra/bytesio.pyx +++ b/cassandra/bytesio.pyx @@ -11,7 +11,7 @@ cdef class BytesIOReader: self.size = len(buf) self.buf_ptr = self.buf - cdef char *read(self, Py_ssize_t n = -1): + cdef char *read(self, Py_ssize_t n = -1) except NULL: """Read at most size bytes from the file (less if the read hits EOF before obtaining size bytes). @@ -24,13 +24,16 @@ cdef class BytesIOReader: if n < 0: newpos = self.size - elif newpos > self.size: - self.pos = self.size - return b'' + + if newpos > self.size: + # Raise an error here, as we do not want the caller to consume past the + # end of the buffer + raise EOFError("Cannot read past the end of the file") else: res = self.buf_ptr + self.pos - self.pos = newpos - return res + + self.pos = newpos + return res class PyBytesIOReader(BytesIOReader): diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx index 41d50851..db3ce633 100644 --- a/cassandra/ioutils.pyx +++ b/cassandra/ioutils.pyx @@ -3,7 +3,7 @@ from libc.stdint cimport int32_t from cassandra.bytesio cimport BytesIOReader -cdef inline char *get_buf(BytesIOReader reader, Py_ssize_t *size_out): +cdef inline char *get_buf(BytesIOReader reader, Py_ssize_t *size_out) except NULL: """ Get a pointer into the buffer provided by BytesIOReader for the next data item in the stream of values. 
@@ -11,10 +11,9 @@ cdef inline char *get_buf(BytesIOReader reader, Py_ssize_t *size_out): raw_val_size = read_int(reader) size_out[0] = raw_val_size if raw_val_size < 0: - return NULL - else: - return reader.read(raw_val_size) + raise ValueError("Expected positive item size") + return reader.read(raw_val_size) -cdef inline int32_t read_int(BytesIOReader reader): +cdef inline int32_t read_int(BytesIOReader reader) except ?0xDEAD: return int32_unpack(reader.read(4)) diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 9360c247..936a3f99 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -23,7 +23,6 @@ from cassandra import cqltypes from cassandra.util import is_little_endian import numpy as np -import pandas as pd cdef extern from "numpyFlags.h": @@ -124,9 +123,6 @@ cdef inline int unpack_row( for i in range(rowsize): buf = get_buf(reader, &bufsize) - if buf == NULL: - raise ValueError("Unexpected end of stream") - arr = arrays[i] if arr.is_object: diff --git a/cassandra/objparser.pyx b/cassandra/objparser.pyx index da6e6c01..6ae614b9 100644 --- a/cassandra/objparser.pyx +++ b/cassandra/objparser.pyx @@ -70,11 +70,8 @@ cdef class TupleRowParser(RowParser): for i in range(rowsize): buf = get_buf(reader, &bufsize) - if buf == NULL: - val = None - else: - dt = desc.datatypes[i] - val = dt.deserialize(buf, bufsize, desc.protocol_version) + dt = desc.datatypes[i] + val = dt.deserialize(buf, bufsize, desc.protocol_version) Py_INCREF(val) PyTuple_SET_ITEM(res, i, val) From 9a72f8d5cddffd8a5e8e3dc5878683623079535e Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 4 Aug 2015 14:21:39 +0100 Subject: [PATCH 19/70] Start on unit tests for Cython code --- cassandra/bytesio.pyx | 30 ++---------------------- cassandra/datatypes.pyx | 1 + setup.py | 2 ++ tests/unit/cython/__init__.py | 0 tests/unit/cython/bytesio_testhelper.pyx | 30 ++++++++++++++++++++++++ tests/unit/cython/dummy_module.pyx | 2 ++ tests/unit/cython/test_bytesio.py | 21 +++++++++++++++++ tests/unit/cython/utils.py | 27 +++++++++++++++++++++ 8 files changed, 85 insertions(+), 28 deletions(-) create mode 100644 tests/unit/cython/__init__.py create mode 100644 tests/unit/cython/bytesio_testhelper.pyx create mode 100644 tests/unit/cython/dummy_module.pyx create mode 100644 tests/unit/cython/test_bytesio.py create mode 100644 tests/unit/cython/utils.py diff --git a/cassandra/bytesio.pyx b/cassandra/bytesio.pyx index 68796120..eb81c2fe 100644 --- a/cassandra/bytesio.pyx +++ b/cassandra/bytesio.pyx @@ -20,39 +20,13 @@ cdef class BytesIOReader: string is returned when EOF is encountered immediately. """ cdef Py_ssize_t newpos = self.pos + n - cdef char *res - if n < 0: newpos = self.size - - if newpos > self.size: + elif newpos > self.size: # Raise an error here, as we do not want the caller to consume past the # end of the buffer raise EOFError("Cannot read past the end of the file") - else: - res = self.buf_ptr + self.pos + cdef char *res = self.buf_ptr + self.pos self.pos = newpos return res - - -class PyBytesIOReader(BytesIOReader): - """ - Python-compatible BytesIOReader class - """ - - def read(self, n = -1): - """Read at most size bytes from the file - (less if the read hits EOF before obtaining size bytes). - - If the size argument is negative or omitted, read all data until EOF - is reached. The bytes are returned as a string object. An empty - string is returned when EOF is encountered immediately. 
- """ - if n is None or n < 0: - newpos = self.len - else: - newpos = min(self.pos+n, self.len) - r = self.buf[self.pos:newpos] - self.pos = newpos - return r diff --git a/cassandra/datatypes.pyx b/cassandra/datatypes.pyx index 272435b2..b0c1adb2 100644 --- a/cassandra/datatypes.pyx +++ b/cassandra/datatypes.pyx @@ -60,3 +60,4 @@ def obj_array(list objs): arr[i] = obj return arr + diff --git a/setup.py b/setup.py index 7fe2631a..ce5e5166 100644 --- a/setup.py +++ b/setup.py @@ -274,6 +274,8 @@ if "--no-cython" not in sys.argv: exclude_failures=True)) extensions.extend(cythonize("cassandra/*.pyx", compiler_directives=directives)) + extensions.extend(cythonize("tests/unit/cython/*.pyx", + compiler_directives=directives)) except ImportError: sys.stderr.write("Cython is not installed. Not compiling core driver files as extensions (optional).") diff --git a/tests/unit/cython/__init__.py b/tests/unit/cython/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/cython/bytesio_testhelper.pyx b/tests/unit/cython/bytesio_testhelper.pyx new file mode 100644 index 00000000..7f898c4c --- /dev/null +++ b/tests/unit/cython/bytesio_testhelper.pyx @@ -0,0 +1,30 @@ +from cassandra.bytesio cimport BytesIOReader + +def test_read1(assert_equal, assert_raises): + cdef BytesIOReader reader = BytesIOReader(b'abcdef') + assert_equal(reader.read(2)[:2], b'ab') + assert_equal(reader.read(2)[:2], b'cd') + assert_equal(reader.read(0)[:0], b'') + assert_equal(reader.read(2)[:2], b'ef') + +def test_read2(assert_equal, assert_raises): + cdef BytesIOReader reader = BytesIOReader(b'abcdef') + reader.read(5) + reader.read(1) + +def test_read3(assert_equal, assert_raises): + cdef BytesIOReader reader = BytesIOReader(b'abcdef') + reader.read(6) + +def test_read_eof(assert_equal, assert_raises): + cdef BytesIOReader reader = BytesIOReader(b'abcdef') + reader.read(5) + # cannot convert reader.read to an object, do it manually + # assert_raises(EOFError, reader.read, 2) + try: + reader.read(2) + except EOFError: + pass + else: + raise Exception("Expected an EOFError") + reader.read(1) # see that we can still read this diff --git a/tests/unit/cython/dummy_module.pyx b/tests/unit/cython/dummy_module.pyx new file mode 100644 index 00000000..8bd1206b --- /dev/null +++ b/tests/unit/cython/dummy_module.pyx @@ -0,0 +1,2 @@ +# This is a dummy module used by utils.py to determine whether +# cassandra was build with Cython \ No newline at end of file diff --git a/tests/unit/cython/test_bytesio.py b/tests/unit/cython/test_bytesio.py new file mode 100644 index 00000000..65cc463a --- /dev/null +++ b/tests/unit/cython/test_bytesio.py @@ -0,0 +1,21 @@ +from tests.unit.cython.utils import cyimport, cythontest +bytesio_testhelper = cyimport('tests.unit.cython.bytesio_testhelper') + +try: + import unittest2 as unittest +except ImportError: + import unittest # noqa + + +class BytesIOTest(unittest.TestCase): + """Test Cython BytesIO proxy""" + + @cythontest + def test_reading(self): + bytesio_testhelper.test_read1(self.assertEqual, self.assertRaises) + bytesio_testhelper.test_read2(self.assertEqual, self.assertRaises) + bytesio_testhelper.test_read3(self.assertEqual, self.assertRaises) + + @cythontest + def test_reading_error(self): + bytesio_testhelper.test_read_eof(self.assertEqual, self.assertRaises) diff --git a/tests/unit/cython/utils.py b/tests/unit/cython/utils.py new file mode 100644 index 00000000..eea4698f --- /dev/null +++ b/tests/unit/cython/utils.py @@ -0,0 +1,27 @@ +try: + import 
tests.unit.cython.dummy_module +except ImportError: + have_cython = False +else: + have_cython = True + +try: + import unittest2 as unittest +except ImportError: + import unittest # noqa + +def cyimport(import_path): + """ + Import a Cython module if available, otherwise return None + (and skip any relevant tests). + """ + try: + return __import__(import_path, fromlist=True) + except ImportError: + if have_cython: + raise + return None + +# @cythontest +# def test_something(self): ... +cythontest = unittest.skipUnless(have_cython, 'Cython is not available') From 27e3505ececf009dd915b0c299036ed54e45aa6d Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 4 Aug 2015 14:41:02 +0100 Subject: [PATCH 20/70] Fix some issues with integration tests --- tests/integration/__init__.py | 2 +- tests/integration/standard/__init__.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py index 057da5c8..f609492f 100644 --- a/tests/integration/__init__.py +++ b/tests/integration/__init__.py @@ -160,7 +160,7 @@ def remove_cluster(): CCM_CLUSTER.remove() CCM_CLUSTER = None return - except WindowsError: + except OSError: ex_type, ex, tb = sys.exc_info() log.warn("{0}: {1} Backtrace: {2}".format(ex_type.__name__, ex, traceback.extract_tb(tb))) del tb diff --git a/tests/integration/standard/__init__.py b/tests/integration/standard/__init__.py index 794d75bf..484ed237 100644 --- a/tests/integration/standard/__init__.py +++ b/tests/integration/standard/__init__.py @@ -11,6 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +try: + import unittest2 as unittest +except ImportError: + import unittest # noqa + try: from ccmlib import common except ImportError as e: From 99dea50c735d2ca3296f20ee0bbc208de5a4e881 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 4 Aug 2015 16:50:29 +0100 Subject: [PATCH 21/70] Add cython protocol integration test --- .gitignore | 3 + .../standard/test_custom_protocol_handler.py | 44 +---------- .../standard/test_cython_protocol_handlers.py | 75 +++++++++++++++++++ 3 files changed, 82 insertions(+), 40 deletions(-) create mode 100644 tests/integration/standard/test_cython_protocol_handlers.py diff --git a/.gitignore b/.gitignore index ee93232c..42aa53e4 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ *.so *.egg *.egg-info +*.attr .tox .python-version build @@ -19,6 +20,7 @@ setuptools*.egg cassandra/*.c !cassandra/murmur3.c +cassandra/*.html # OSX .DS_Store @@ -38,3 +40,4 @@ cassandra/*.c #iPython *.ipynb + diff --git a/tests/integration/standard/test_custom_protocol_handler.py b/tests/integration/standard/test_custom_protocol_handler.py index 61a23831..856e6979 100644 --- a/tests/integration/standard/test_custom_protocol_handler.py +++ b/tests/integration/standard/test_custom_protocol_handler.py @@ -21,7 +21,8 @@ from cassandra.protocol import ProtocolHandler, ResultMessage, UUIDType, read_in from cassandra.query import tuple_factory from cassandra.cluster import Cluster from tests.integration import use_singledc, PROTOCOL_VERSION, execute_until_pass -from tests.integration.datatype_utils import update_datatypes, PRIMITIVE_DATATYPES, get_sample +from tests.integration.datatype_utils import update_datatypes, PRIMITIVE_DATATYPES +from tests.integration.standard.utils import create_table_with_all_types, get_all_primitive_params from six import binary_type 
import uuid @@ -106,11 +107,11 @@ class CustomProtocolHandlerTest(unittest.TestCase): session.client_protocol_handler = CustomProtocolHandlerResultMessageTracked session.row_factory = tuple_factory - columns_string = create_table_with_all_types("test_table", session) + columns_string = create_table_with_all_types("alltypes", session) # verify data params = get_all_primitive_params() - results = session.execute("SELECT {0} FROM alltypes WHERE zz=0".format(columns_string))[0] + results = session.execute("SELECT {0} FROM alltypes WHERE pimkey=0".format(columns_string))[0] for expected, actual in zip(params, results): self.assertEqual(actual, expected) # Ensure we have covered the various primitive types @@ -118,43 +119,6 @@ class CustomProtocolHandlerTest(unittest.TestCase): session.shutdown() -def create_table_with_all_types(table_name, session): - """ - Method that given a table_name and session construct a table that contains all possible primitive types - :param table_name: Name of table to create - :param session: session to use for table creation - :return: a string containing and columns. This can be used to query the table. - """ - # create table - alpha_type_list = ["zz int PRIMARY KEY"] - col_names = ["zz"] - start_index = ord('a') - for i, datatype in enumerate(PRIMITIVE_DATATYPES): - alpha_type_list.append("{0} {1}".format(chr(start_index + i), datatype)) - col_names.append(chr(start_index + i)) - - session.execute("CREATE TABLE alltypes ({0})".format(', '.join(alpha_type_list)), timeout=120) - - # create the input - params = get_all_primitive_params() - - # insert into table as a simple statement - columns_string = ', '.join(col_names) - placeholders = ', '.join(["%s"] * len(col_names)) - session.execute("INSERT INTO alltypes ({0}) VALUES ({1})".format(columns_string, placeholders), params, timeout=120) - return columns_string - - -def get_all_primitive_params(): - """ - Simple utility method used to give back a list of all possible primitive data sample types. 
- """ - params = [0] - for datatype in PRIMITIVE_DATATYPES: - params.append((get_sample(datatype))) - return params - - class CustomResultMessageRaw(ResultMessage): """ This is a custom Result Message that is used to return raw results, rather then diff --git a/tests/integration/standard/test_cython_protocol_handlers.py b/tests/integration/standard/test_cython_protocol_handlers.py new file mode 100644 index 00000000..35b131a9 --- /dev/null +++ b/tests/integration/standard/test_cython_protocol_handlers.py @@ -0,0 +1,75 @@ +"""Test the various Cython-based message deserializers""" + +# Based on test_custom_protocol_handler.py + +try: + import unittest2 as unittest +except ImportError: + import unittest + +from cassandra.cluster import Cluster +from tests.integration import use_singledc, PROTOCOL_VERSION +from tests.integration.datatype_utils import update_datatypes +from tests.integration.standard.utils import create_table_with_all_types, get_all_primitive_params +from six import next + +try: + from cassandra.cython_protocol_handler import make_protocol_handler +except ImportError as e: + raise unittest.skip("Skipping test, not compiled with Cython enabled") + +from cassandra.numpyparser import NumpyParser +from cassandra.objparser import ListParser, LazyParser + + +def setup_module(): + use_singledc() + update_datatypes() + + +class CustomProtocolHandlerTest(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.cluster = Cluster(protocol_version=PROTOCOL_VERSION) + cls.session = cls.cluster.connect() + cls.session.execute("CREATE KEYSPACE testspace WITH replication = " + "{ 'class' : 'SimpleStrategy', 'replication_factor': '1'}") + + @classmethod + def tearDownClass(cls): + cls.session.execute("DROP KEYSPACE testspace") + cls.cluster.shutdown() + + def test_cython_parser(self): + """ + Test Cython-based parser that returns a list of tuples + """ + self.cython_parser(ListParser()) + + def test_cython_lazy_parser(self): + """ + Test Cython-based parser that returns a list of tuples + """ + self.cython_parser(LazyParser()) + + def cython_parser(self, colparser): + session = Cluster().connect() + session.set_keyspace("smallspace") + + # use our custom protocol handler + session.client_protocol_handler = make_protocol_handler(colparser) + # session.row_factory = tuple_factory + create_table_with_all_types("test_table", session) + + # verify data + params = get_all_primitive_params() + [first_result] = session.execute("SELECT * FROM test_table WHERE primkey=0") + self.assertEqual(len(params), len(first_result), + msg="Not the right number of columns?") + print(first_result) + assert False + for expected, actual in zip(params, first_result): + self.assertEqual(actual, expected) + + session.shutdown() From 5996aa622da9dc3404bd1bdefbc0d941595e8a6e Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 4 Aug 2015 17:29:23 +0100 Subject: [PATCH 22/70] Some fixes to cython integration test --- .../standard/test_custom_protocol_handler.py | 6 +++--- .../standard/test_cython_protocol_handlers.py | 11 +++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/integration/standard/test_custom_protocol_handler.py b/tests/integration/standard/test_custom_protocol_handler.py index 856e6979..edd066be 100644 --- a/tests/integration/standard/test_custom_protocol_handler.py +++ b/tests/integration/standard/test_custom_protocol_handler.py @@ -63,7 +63,7 @@ class CustomProtocolHandlerTest(unittest.TestCase): """ # Ensure that we get normal uuid back first - session = 
Cluster().connect() + session = Cluster(protocol_version=PROTOCOL_VERSION).connect(keyspace="custserdes") session.row_factory = tuple_factory result_set = session.execute("SELECT schema_version FROM system.local") result = result_set.pop() @@ -103,7 +103,7 @@ class CustomProtocolHandlerTest(unittest.TestCase): @test_category data_types:serialization """ # Connect using a custom protocol handler that tracks the various types the result message is used with. - session = Cluster().connect(keyspace="custserdes") + session = Cluster(protocol_version=PROTOCOL_VERSION).connect(keyspace="custserdes") session.client_protocol_handler = CustomProtocolHandlerResultMessageTracked session.row_factory = tuple_factory @@ -111,7 +111,7 @@ class CustomProtocolHandlerTest(unittest.TestCase): # verify data params = get_all_primitive_params() - results = session.execute("SELECT {0} FROM alltypes WHERE pimkey=0".format(columns_string))[0] + results = session.execute("SELECT {0} FROM alltypes WHERE primkey=0".format(columns_string))[0] for expected, actual in zip(params, results): self.assertEqual(actual, expected) # Ensure we have covered the various primitive types diff --git a/tests/integration/standard/test_cython_protocol_handlers.py b/tests/integration/standard/test_cython_protocol_handlers.py index 35b131a9..059c9317 100644 --- a/tests/integration/standard/test_cython_protocol_handlers.py +++ b/tests/integration/standard/test_cython_protocol_handlers.py @@ -16,7 +16,7 @@ from six import next try: from cassandra.cython_protocol_handler import make_protocol_handler except ImportError as e: - raise unittest.skip("Skipping test, not compiled with Cython enabled") + raise unittest.SkipTest("Skipping test, not compiled with Cython enabled") from cassandra.numpyparser import NumpyParser from cassandra.objparser import ListParser, LazyParser @@ -35,6 +35,8 @@ class CustomProtocolHandlerTest(unittest.TestCase): cls.session = cls.cluster.connect() cls.session.execute("CREATE KEYSPACE testspace WITH replication = " "{ 'class' : 'SimpleStrategy', 'replication_factor': '1'}") + cls.session.set_keyspace("testspace") + create_table_with_all_types("test_table", cls.session) @classmethod def tearDownClass(cls): @@ -54,21 +56,18 @@ class CustomProtocolHandlerTest(unittest.TestCase): self.cython_parser(LazyParser()) def cython_parser(self, colparser): - session = Cluster().connect() - session.set_keyspace("smallspace") + cluster = Cluster(protocol_version=PROTOCOL_VERSION) + session = cluster.connect(keyspace="testspace") # use our custom protocol handler session.client_protocol_handler = make_protocol_handler(colparser) # session.row_factory = tuple_factory - create_table_with_all_types("test_table", session) # verify data params = get_all_primitive_params() [first_result] = session.execute("SELECT * FROM test_table WHERE primkey=0") self.assertEqual(len(params), len(first_result), msg="Not the right number of columns?") - print(first_result) - assert False for expected, actual in zip(params, first_result): self.assertEqual(actual, expected) From 1879d9be31a0faa32e90780f26c016b328b7ea6c Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Wed, 5 Aug 2015 18:42:40 +0100 Subject: [PATCH 23/70] Start on deserializers for cqltypes --- cassandra/buffer.pxd | 8 ++ cassandra/buffer.pyx | 38 ++++++++++ cassandra/bytesio.pxd | 1 - cassandra/cython_protocol_handler.pyx | 5 +- cassandra/datatypes.pxd | 3 - cassandra/datatypes.pyx | 63 ---------------- cassandra/deserializers.pxd | 7 ++ cassandra/deserializers.pyx | 101 
++++++++++++++++++++++++++ cassandra/ioutils.pyx | 11 ++- cassandra/marshal.pyx | 5 -- cassandra/numpyparser.pyx | 16 ++-- cassandra/objparser.pyx | 18 +++-- cassandra/parsing.pxd | 4 +- 13 files changed, 185 insertions(+), 95 deletions(-) create mode 100644 cassandra/buffer.pxd create mode 100644 cassandra/buffer.pyx delete mode 100644 cassandra/datatypes.pxd delete mode 100644 cassandra/datatypes.pyx create mode 100644 cassandra/deserializers.pxd create mode 100644 cassandra/deserializers.pyx diff --git a/cassandra/buffer.pxd b/cassandra/buffer.pxd new file mode 100644 index 00000000..f431d311 --- /dev/null +++ b/cassandra/buffer.pxd @@ -0,0 +1,8 @@ +cdef struct Buffer: + char *ptr + Py_ssize_t size + +cdef inline Buffer from_bytes(bytes byts) +cdef inline bytes to_bytes(Buffer *buf) +cdef inline char *buf_ptr(Buffer *buf) +cdef inline Buffer from_ptr_and_size(char *ptr, Py_ssize_t size) \ No newline at end of file diff --git a/cassandra/buffer.pyx b/cassandra/buffer.pyx new file mode 100644 index 00000000..570a7496 --- /dev/null +++ b/cassandra/buffer.pyx @@ -0,0 +1,38 @@ +""" +Simple buffer data structure. This buffer can be included: + + include "buffer.pyx" + +or imported: + + from cassanda cimport buffer + +but this prevents inlining of the functions below. +""" + +from cpython.bytes cimport PyBytes_AS_STRING + # char* PyBytes_AS_STRING(object string) + # Macro form of PyBytes_AsString() but without error + # checking. Only string objects are supported; no Unicode objects + # should be passed. + +from cassandra.buffer cimport Buffer + +cdef struct Buffer: + char *ptr + Py_ssize_t size + +cdef inline Buffer from_bytes(bytes byts): + return from_ptr_and_size(PyBytes_AS_STRING(byts), len(byts)) + +cdef inline bytes to_bytes(Buffer *buf): + return buf.ptr[:buf.size] + +cdef inline char *buf_ptr(Buffer *buf): + return buf.ptr + +cdef inline Buffer from_ptr_and_size(char *ptr, Py_ssize_t size): + cdef Buffer res + res.ptr = ptr + res.size = size + return res \ No newline at end of file diff --git a/cassandra/bytesio.pxd b/cassandra/bytesio.pxd index 9754dd23..64bbdcca 100644 --- a/cassandra/bytesio.pxd +++ b/cassandra/bytesio.pxd @@ -4,4 +4,3 @@ cdef class BytesIOReader: cdef Py_ssize_t pos cdef Py_ssize_t size cdef char *read(self, Py_ssize_t n = ?) 
except NULL - diff --git a/cassandra/cython_protocol_handler.pyx b/cassandra/cython_protocol_handler.pyx index 6ef3ae9f..af91c4d7 100644 --- a/cassandra/cython_protocol_handler.pyx +++ b/cassandra/cython_protocol_handler.pyx @@ -3,7 +3,7 @@ from cassandra.protocol import ResultMessage, ProtocolHandler from cassandra.parsing cimport ParseDesc, ColumnParser -from cassandra.datatypes import make_datatypes +from cassandra.deserializers import make_deserializers from cassandra.objparser import ListParser @@ -21,7 +21,8 @@ def make_recv_results_rows(ColumnParser colparser): colnames = [c[2] for c in column_metadata] coltypes = [c[3] for c in column_metadata] - desc = ParseDesc(colnames, coltypes, make_datatypes(coltypes), protocol_version) + desc = ParseDesc(colnames, coltypes, make_deserializers(coltypes), + protocol_version) reader = BytesIOReader(f.read()) parsed_rows = colparser.parse_rows(reader, desc) diff --git a/cassandra/datatypes.pxd b/cassandra/datatypes.pxd deleted file mode 100644 index cd58b6b3..00000000 --- a/cassandra/datatypes.pxd +++ /dev/null @@ -1,3 +0,0 @@ -cdef class DataType: - cdef object cqltype - cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version) diff --git a/cassandra/datatypes.pyx b/cassandra/datatypes.pyx deleted file mode 100644 index b0c1adb2..00000000 --- a/cassandra/datatypes.pyx +++ /dev/null @@ -1,63 +0,0 @@ -# -- cython: profile=True - -include 'marshal.pyx' - -from cython.view cimport array as cython_array -from cassandra.datatypes import Int64, GenericDataType -from cassandra.cqltypes import LongType - -# TODO: Port cqltypes to this module - -cdef class DataType: - """ - Cython-based datatype - """ - - def __init__(self, cqltype): - self.cqltype = cqltype - - cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version): - raise NotImplementedError - - -cdef class Int64(DataType): - - cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version): - cdef int64_t x = int64_unpack(buf) - return x - - def __str__(self): - return "int64" - - -cdef class GenericDataType(DataType): - """ - Wrap a generic datatype for deserialization - """ - - cdef object deserialize(self, char *buf, Py_ssize_t size, protocol_version): - return self.cqltype.deserialize(buf[:size], protocol_version) - - def __str__(self): - return "GenericDataType(%s)" % (self.cqltype,) - - -def make_datatypes(coltypes): - cdef DataType[::1] datatypes - return obj_array([make_datatype(ct) for ct in coltypes]) - - -def make_datatype(coltype): - return Int64(coltype) if coltype == LongType else GenericDataType(coltype) - - -def obj_array(list objs): - """Create a (Cython) array of objects given a list of objects""" - cdef object[:] arr - arr = cython_array(shape=(len(objs),), itemsize=sizeof(void *), format="O") - # arr[:] = objs # This does not work (segmentation faults) - for i, obj in enumerate(objs): - arr[i] = obj - return arr - - diff --git a/cassandra/deserializers.pxd b/cassandra/deserializers.pxd new file mode 100644 index 00000000..333479f3 --- /dev/null +++ b/cassandra/deserializers.pxd @@ -0,0 +1,7 @@ +# -- cython: profile=True + +from cassandra.buffer cimport Buffer + +cdef class Deserializer: + cdef deserialize(self, Buffer *buf, protocol_version) + # cdef deserialize(self, CString byts, protocol_version) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx new file mode 100644 index 00000000..21245364 --- /dev/null +++ b/cassandra/deserializers.pyx @@ -0,0 +1,101 @@ +# -- cython: profile=True + +include 'marshal.pyx' 
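# A rough sketch of how this module is meant to be consumed (pieced together
# from the protocol-handler and test diffs above, so treat the exact wiring
# as an assumption): make_deserializers() builds one Deserializer per column
# type, the result is handed to a row parser through ParseDesc, and an
# application opts in by swapping the protocol handler on a session:
#
#     from cassandra.cython_protocol_handler import make_protocol_handler
#     from cassandra.objparser import ListParser
#
#     session.client_protocol_handler = make_protocol_handler(ListParser())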
+include 'buffer.pyx' + +from cython.view cimport array as cython_array +from decimal import Decimal +from uuid import UUID + +import inspect + +cdef class Deserializer: + cdef deserialize(self, Buffer *buf, protocol_version): + raise NotImplementedError + + +cdef class DesLongType(Deserializer): + cdef deserialize(self, Buffer *buf, protocol_version): + return int64_unpack(buf.ptr) + + +# TODO: Use libmpdec: http://www.bytereef.org/mpdecimal/index.html +cdef class DesDecimalType(Deserializer): + cdef deserialize(self, Buffer *buf, protocol_version): + scale = int32_unpack(buf.ptr) + unscaled = varint_unpack(buf.ptr + 4) + return Decimal('%de%d' % (unscaled, -scale)) + + +cdef class DesUUIDType(Deserializer): + cdef deserialize(self, Buffer *buf, protocol_version): + return UUID(bytes=to_bytes(buf)) + + +cdef class DesBooleanType(Deserializer): + cdef deserialize(self, Buffer *buf, protocol_version): + return bool(int8_unpack(buf.ptr)) + + +cdef class DesByteType(Deserializer): + cdef deserialize(self, Buffer *buf, protocol_version): + return int8_unpack(buf.ptr) + + +cdef class DesAsciiType(Deserializer): + cdef deserialize(self, Buffer *buf, protocol_version): + if six.PY2: + return to_bytes(buf) + return to_bytes(buf).decode('ascii') + + +cdef class DesFloatType(Deserializer): + cdef deserialize(self, Buffer *buf, protocol_version): + return float_unpack(buf.ptr) + + +cdef class DesDoubleType(Deserializer): + cdef deserialize(self, Buffer *buf, protocol_version): + return double_unpack(buf.ptr) + + +cdef class DesInt32Type(Deserializer): + cdef deserialize(self, Buffer *buf, protocol_version): + return int32_unpack(buf.ptr) + + +cdef class GenericDeserializer(Deserializer): + """ + Wrap a generic datatype for deserialization + """ + + def __init__(self, cqltype): + self.cqltype = cqltype + + cdef deserialize(self, Buffer *buf, protocol_version): + return self.cqltype.deserialize(to_bytes(buf), protocol_version) + +#-------------------------------------------------------------------------- + +def make_deserializers(cqltypes): + """Create an array of Deserializers for each given cqltype in cqltypes""" + cdef Deserializer[::1] deserializers + return obj_array([find_deserializer(ct) for ct in cqltypes]) + + +cpdef Deserializer find_deserializer(cqltype): + """Find a deserializer for a cqltype""" + deserializer = None + if inspect.isclass(cqltype): + deserializer = globals().get('Des' + cqltype.__name__)() + return deserializer or GenericDeserializer(cqltype) + + +def obj_array(list objs): + """Create a (Cython) array of objects given a list of objects""" + cdef object[:] arr + arr = cython_array(shape=(len(objs),), itemsize=sizeof(void *), format="O") + # arr[:] = objs # This does not work (segmentation faults) + for i, obj in enumerate(objs): + arr[i] = obj + return arr diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx index db3ce633..0f8c3e3e 100644 --- a/cassandra/ioutils.pyx +++ b/cassandra/ioutils.pyx @@ -1,19 +1,22 @@ include 'marshal.pyx' +include 'buffer.pyx' + from libc.stdint cimport int32_t from cassandra.bytesio cimport BytesIOReader -cdef inline char *get_buf(BytesIOReader reader, Py_ssize_t *size_out) except NULL: +cdef inline int get_buf(BytesIOReader reader, Buffer *buf_out) except -1: """ Get a pointer into the buffer provided by BytesIOReader for the next data item in the stream of values. 
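    Roughly, each value is laid out on the wire as

        [int32 length][length bytes]

    with the length a signed, big-endian 32-bit integer. A negative length
    denotes a NULL value, which this reader currently rejects with
    ValueError rather than returning an empty buffer.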
""" - raw_val_size = read_int(reader) - size_out[0] = raw_val_size + cdef Py_ssize_t raw_val_size = read_int(reader) if raw_val_size < 0: raise ValueError("Expected positive item size") - return reader.read(raw_val_size) + buf_out.ptr = reader.read(raw_val_size) + buf_out.size = raw_val_size + return 0 cdef inline int32_t read_int(BytesIOReader reader) except ?0xDEAD: return int32_unpack(reader.read(4)) diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx index cc80461b..9e1c8ca5 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/marshal.pyx @@ -71,9 +71,6 @@ cpdef inline int64_t int64_unpack(const char *buf): # The 'const' makes sure the buffer is not mutated in-place! cdef int64_t x = ( buf)[0] cdef char *p = &x - # if is_little_endian: - # p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7] = ( - # p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]) swap_order( &x, 8) return x @@ -83,8 +80,6 @@ cpdef inline bytes int32_pack(int32_t x): cpdef inline int32_t int32_unpack(const char *buf): cdef int32_t x = ( buf)[0] cdef char *p = &x - # if is_little_endian: - # p[0], p[1], p[2], p[3] = p[3], p[2], p[1], p[0] swap_order( &x, 4) return x diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 936a3f99..149843a6 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -17,7 +17,7 @@ from libc.stdint cimport uint64_t from cpython.ref cimport Py_INCREF, PyObject from cassandra.bytesio cimport BytesIOReader -from cassandra.datatypes cimport DataType +from cassandra.deserializers cimport Deserializer from cassandra.parsing cimport ParseDesc, ColumnParser, RowParser from cassandra import cqltypes from cassandra.util import is_little_endian @@ -116,22 +116,22 @@ def make_array(coltype, array_size): @cython.wraparound(False) cdef inline int unpack_row( BytesIOReader reader, ParseDesc desc, ArrDesc *arrays) except -1: - cdef char *buf - cdef Py_ssize_t i, bufsize, rowsize = desc.rowsize + cdef Buffer buf + cdef Py_ssize_t i, rowsize = desc.rowsize cdef ArrDesc arr - cdef DataType dt + cdef Deserializer deserializer for i in range(rowsize): - buf = get_buf(reader, &bufsize) + get_buf(reader, &buf) arr = arrays[i] if arr.is_object: - dt = desc.datatypes[i] - val = dt.deserialize(buf, bufsize, desc.protocol_version) + deserializer = desc.datatypes[i] + val = deserializer.deserialize(&buf, desc.protocol_version) Py_INCREF(val) ( arr.buf_ptr)[0] = val else: - memcopy(buf, arr.buf_ptr, bufsize) + memcopy(buf.ptr, arr.buf_ptr, buf.size) # Update the pointer into the array for the next time arrays[i].buf_ptr += arr.stride diff --git a/cassandra/objparser.pyx b/cassandra/objparser.pyx index 6ae614b9..62723ceb 100644 --- a/cassandra/objparser.pyx +++ b/cassandra/objparser.pyx @@ -19,7 +19,7 @@ from cpython.ref cimport ( ) from cassandra.bytesio cimport BytesIOReader -from cassandra.datatypes cimport DataType +from cassandra.deserializers cimport Deserializer from cassandra.parsing cimport ParseDesc, ColumnParser, RowParser @@ -63,16 +63,20 @@ cdef class TupleRowParser(RowParser): """ cpdef unpack_row(self, BytesIOReader reader, ParseDesc desc): - cdef char *buf - cdef Py_ssize_t i, bufsize, rowsize = desc.rowsize - cdef DataType dt + cdef Buffer buf + cdef Py_ssize_t i, rowsize = desc.rowsize + cdef Deserializer deserializer cdef tuple res = PyTuple_New(desc.rowsize) for i in range(rowsize): - buf = get_buf(reader, &bufsize) - dt = desc.datatypes[i] - val = dt.deserialize(buf, bufsize, desc.protocol_version) + # Read the next few bytes + get_buf(reader, &buf) + # Deserialize 
bytes to python object + deserializer = desc.datatypes[i] + val = deserializer.deserialize(&buf, desc.protocol_version) + + # Insert new object into tuple Py_INCREF(val) PyTuple_SET_ITEM(res, i, val) diff --git a/cassandra/parsing.pxd b/cassandra/parsing.pxd index c4774385..40043f29 100644 --- a/cassandra/parsing.pxd +++ b/cassandra/parsing.pxd @@ -1,10 +1,10 @@ from cassandra.bytesio cimport BytesIOReader -from cassandra.datatypes cimport DataType +from cassandra.deserializers cimport Deserializer cdef class ParseDesc: cdef public object colnames cdef public object coltypes - cdef DataType[::1] datatypes + cdef Deserializer[::1] datatypes cdef public object protocol_version cdef Py_ssize_t rowsize From ae7c4b2e81ba35f455f80b67d5fa109b88b541b7 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Wed, 5 Aug 2015 18:49:19 +0100 Subject: [PATCH 24/70] Forgot to add test utility module --- tests/integration/standard/utils.py | 46 +++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/integration/standard/utils.py diff --git a/tests/integration/standard/utils.py b/tests/integration/standard/utils.py new file mode 100644 index 00000000..bd0c80b5 --- /dev/null +++ b/tests/integration/standard/utils.py @@ -0,0 +1,46 @@ +""" +Helper module to populate a dummy Cassandra tables with data. +""" + +from tests.integration.datatype_utils import PRIMITIVE_DATATYPES, get_sample + +def create_table_with_all_types(table_name, session): + """ + Method that given a table_name and session construct a table that contains + all possible primitive types. + + :param table_name: Name of table to create + :param session: session to use for table creation + :return: a string containing the names of all the columns. + This can be used to query the table. + """ + # create table + alpha_type_list = ["primkey int PRIMARY KEY"] + col_names = ["primkey"] + start_index = ord('a') + for i, datatype in enumerate(PRIMITIVE_DATATYPES): + alpha_type_list.append("{0} {1}".format(chr(start_index + i), datatype)) + col_names.append(chr(start_index + i)) + + session.execute("CREATE TABLE {0} ({1})".format( + table_name, ', '.join(alpha_type_list)), timeout=120) + + # create the input + params = get_all_primitive_params() + + # insert into table as a simple statement + columns_string = ', '.join(col_names) + placeholders = ', '.join(["%s"] * len(col_names)) + session.execute("INSERT INTO {0} ({1}) VALUES ({2})".format( + table_name, columns_string, placeholders), params, timeout=120) + return columns_string + + +def get_all_primitive_params(): + """ + Simple utility method used to give back a list of all possible primitive data sample types. 
+ """ + params = [0] + for datatype in PRIMITIVE_DATATYPES: + params.append(get_sample(datatype)) + return params From 26ef8682244fcafb573f841ba7b43551725bb078 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Wed, 5 Aug 2015 20:09:06 +0100 Subject: [PATCH 25/70] Some small fixes to deserializers --- cassandra/deserializers.pyx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 21245364..2bf91553 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -69,6 +69,8 @@ cdef class GenericDeserializer(Deserializer): Wrap a generic datatype for deserialization """ + cdef object cqltype + def __init__(self, cqltype): self.cqltype = cqltype @@ -85,10 +87,11 @@ def make_deserializers(cqltypes): cpdef Deserializer find_deserializer(cqltype): """Find a deserializer for a cqltype""" - deserializer = None - if inspect.isclass(cqltype): - deserializer = globals().get('Des' + cqltype.__name__)() - return deserializer or GenericDeserializer(cqltype) + name = inspect.isclass(cqltype) and 'Des' + cqltype.__name__ + if name in globals(): + deserializer_cls = globals()[name] + deserializer_cls() + return GenericDeserializer(cqltype) def obj_array(list objs): From ddeb7536623ebbd916c0d5af4130e99742578d99 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Wed, 5 Aug 2015 20:21:31 +0100 Subject: [PATCH 26/70] Some more small fixes --- cassandra/numpyparser.pyx | 2 +- cassandra/objparser.pyx | 8 +------- cassandra/parsing.pxd | 2 +- cassandra/parsing.pyx | 4 ++-- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 149843a6..7be86400 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -126,7 +126,7 @@ cdef inline int unpack_row( arr = arrays[i] if arr.is_object: - deserializer = desc.datatypes[i] + deserializer = desc.deserializers[i] val = deserializer.deserialize(&buf, desc.protocol_version) Py_INCREF(val) ( arr.buf_ptr)[0] = val diff --git a/cassandra/objparser.pyx b/cassandra/objparser.pyx index 62723ceb..e98a991e 100644 --- a/cassandra/objparser.pyx +++ b/cassandra/objparser.pyx @@ -54,12 +54,6 @@ cdef class TupleRowParser(RowParser): Parse a single returned row into a tuple of objects: (obj1, ..., objN) - - Attributes - =========== - datatypes: - this is a memoryview of N DataType objects that can deserialize bytes - into objects """ cpdef unpack_row(self, BytesIOReader reader, ParseDesc desc): @@ -73,7 +67,7 @@ cdef class TupleRowParser(RowParser): get_buf(reader, &buf) # Deserialize bytes to python object - deserializer = desc.datatypes[i] + deserializer = desc.deserializers[i] val = deserializer.deserialize(&buf, desc.protocol_version) # Insert new object into tuple diff --git a/cassandra/parsing.pxd b/cassandra/parsing.pxd index 40043f29..13bc8411 100644 --- a/cassandra/parsing.pxd +++ b/cassandra/parsing.pxd @@ -4,7 +4,7 @@ from cassandra.deserializers cimport Deserializer cdef class ParseDesc: cdef public object colnames cdef public object coltypes - cdef Deserializer[::1] datatypes + cdef Deserializer[::1] deserializers cdef public object protocol_version cdef Py_ssize_t rowsize diff --git a/cassandra/parsing.pyx b/cassandra/parsing.pyx index 71196d14..c9afd4b5 100644 --- a/cassandra/parsing.pyx +++ b/cassandra/parsing.pyx @@ -5,10 +5,10 @@ Module containing the definitions and declarations (parsing.pxd) for parsers. 
cdef class ParseDesc: """Description of what structure to parse""" - def __init__(self, colnames, coltypes, datatypes, protocol_version): + def __init__(self, colnames, coltypes, deserializers, protocol_version): self.colnames = colnames self.coltypes = coltypes - self.datatypes = datatypes + self.deserializers = deserializers self.protocol_version = protocol_version self.rowsize = len(colnames) From 2d8ad6ad3a53fd22e16302e03334a4dd8f5fe6b7 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 13:45:23 +0100 Subject: [PATCH 27/70] More Cython-based object deserializers --- cassandra/buffer.pxd | 32 +++- cassandra/buffer.pyx | 38 ----- cassandra/cython_protocol_handler.pyx | 2 +- cassandra/cython_utils.pyx | 27 ++++ cassandra/deserializers.pxd | 2 +- cassandra/deserializers.pyx | 203 ++++++++++++++++++++++++-- cassandra/ioutils.pyx | 2 +- cassandra/marshal.pyx | 41 ++++-- cassandra/parsing.pxd | 2 +- 9 files changed, 275 insertions(+), 74 deletions(-) delete mode 100644 cassandra/buffer.pyx create mode 100644 cassandra/cython_utils.pyx diff --git a/cassandra/buffer.pxd b/cassandra/buffer.pxd index f431d311..cfe93e01 100644 --- a/cassandra/buffer.pxd +++ b/cassandra/buffer.pxd @@ -1,8 +1,32 @@ +""" +Simple buffer data structure that provides a view on existing memory +(e.g. from a bytes object). This memory must stay alive while the +buffer is in use. +""" + +from cpython.bytes cimport PyBytes_AS_STRING + # char* PyBytes_AS_STRING(object string) + # Macro form of PyBytes_AsString() but without error + # checking. Only string objects are supported; no Unicode objects + # should be passed. + +from cassandra.buffer cimport Buffer + cdef struct Buffer: char *ptr Py_ssize_t size -cdef inline Buffer from_bytes(bytes byts) -cdef inline bytes to_bytes(Buffer *buf) -cdef inline char *buf_ptr(Buffer *buf) -cdef inline Buffer from_ptr_and_size(char *ptr, Py_ssize_t size) \ No newline at end of file +cdef inline Buffer from_bytes(bytes byts): + return from_ptr_and_size(PyBytes_AS_STRING(byts), len(byts)) + +cdef inline bytes to_bytes(Buffer *buf): + return buf.ptr[:buf.size] + +cdef inline char *buf_ptr(Buffer *buf): + return buf.ptr + +cdef inline Buffer from_ptr_and_size(char *ptr, Py_ssize_t size): + cdef Buffer res + res.ptr = ptr + res.size = size + return res diff --git a/cassandra/buffer.pyx b/cassandra/buffer.pyx deleted file mode 100644 index 570a7496..00000000 --- a/cassandra/buffer.pyx +++ /dev/null @@ -1,38 +0,0 @@ -""" -Simple buffer data structure. This buffer can be included: - - include "buffer.pyx" - -or imported: - - from cassanda cimport buffer - -but this prevents inlining of the functions below. -""" - -from cpython.bytes cimport PyBytes_AS_STRING - # char* PyBytes_AS_STRING(object string) - # Macro form of PyBytes_AsString() but without error - # checking. Only string objects are supported; no Unicode objects - # should be passed. 
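# A minimal usage sketch of the inline helpers defined below (an
# illustration, not code from this patch): from_bytes() merely wraps the
# pointer inside an existing bytes object, so that object must stay
# referenced for as long as the Buffer is in use, while to_bytes() copies
# the viewed range back out into a fresh bytes object:
#
#     cdef Buffer buf
#     byts = b"abc"            # keep a reference while buf is alive
#     buf = from_bytes(byts)
#     assert to_bytes(&buf) == b"abc"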
- -from cassandra.buffer cimport Buffer - -cdef struct Buffer: - char *ptr - Py_ssize_t size - -cdef inline Buffer from_bytes(bytes byts): - return from_ptr_and_size(PyBytes_AS_STRING(byts), len(byts)) - -cdef inline bytes to_bytes(Buffer *buf): - return buf.ptr[:buf.size] - -cdef inline char *buf_ptr(Buffer *buf): - return buf.ptr - -cdef inline Buffer from_ptr_and_size(char *ptr, Py_ssize_t size): - cdef Buffer res - res.ptr = ptr - res.size = size - return res \ No newline at end of file diff --git a/cassandra/cython_protocol_handler.pyx b/cassandra/cython_protocol_handler.pyx index af91c4d7..629ce887 100644 --- a/cassandra/cython_protocol_handler.pyx +++ b/cassandra/cython_protocol_handler.pyx @@ -11,7 +11,7 @@ include "ioutils.pyx" def make_recv_results_rows(ColumnParser colparser): - def recv_results_rows(cls, f, protocol_version, user_type_map): + def recv_results_rows(cls, f, int protocol_version, user_type_map): """ Parse protocol data given as a BytesIO f into a set of columns (e.g. list of tuples) This is used as the recv_results_rows method of (Fast)ResultMessage diff --git a/cassandra/cython_utils.pyx b/cassandra/cython_utils.pyx new file mode 100644 index 00000000..fe4fbab9 --- /dev/null +++ b/cassandra/cython_utils.pyx @@ -0,0 +1,27 @@ +""" +Duplicate module of util.py, with some accelerated functions +used for deserialization. +""" + +# from __future__ import with_statement + +from cpython.datetime cimport timedelta_new + # cdef inline object timedelta_new(int days, int seconds, int useconds) + # Create timedelta object using DateTime CAPI factory function. + # Note, there are no range checks for any of the arguments. + +import calendar +import datetime +import random +import six +import uuid +import sys + +DATETIME_EPOC = datetime.datetime(1970, 1, 1) + +assert sys.byteorder in ('little', 'big') +is_little_endian = sys.byteorder == 'little' + +cdef datetime_from_timestamp(timestamp): + return DATETIME_EPOC + timedelta_new(0, timestamp, 0) + diff --git a/cassandra/deserializers.pxd b/cassandra/deserializers.pxd index 333479f3..882d19d1 100644 --- a/cassandra/deserializers.pxd +++ b/cassandra/deserializers.pxd @@ -3,5 +3,5 @@ from cassandra.buffer cimport Buffer cdef class Deserializer: - cdef deserialize(self, Buffer *buf, protocol_version) + cdef deserialize(self, Buffer *buf, int protocol_version) # cdef deserialize(self, CString byts, protocol_version) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 2bf91553..680e14a2 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -1,69 +1,246 @@ # -- cython: profile=True +from libc.stdint cimport int32_t, uint16_t + include 'marshal.pyx' -include 'buffer.pyx' +include 'cython_utils.pyx' +from cassandra.buffer cimport Buffer, to_bytes from cython.view cimport array as cython_array + +import socket +import inspect from decimal import Decimal from uuid import UUID -import inspect +from cassandra import util + cdef class Deserializer: - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): raise NotImplementedError cdef class DesLongType(Deserializer): - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): return int64_unpack(buf.ptr) # TODO: Use libmpdec: http://www.bytereef.org/mpdecimal/index.html cdef class DesDecimalType(Deserializer): - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): 
scale = int32_unpack(buf.ptr) unscaled = varint_unpack(buf.ptr + 4) return Decimal('%de%d' % (unscaled, -scale)) cdef class DesUUIDType(Deserializer): - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): return UUID(bytes=to_bytes(buf)) cdef class DesBooleanType(Deserializer): - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): return bool(int8_unpack(buf.ptr)) cdef class DesByteType(Deserializer): - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): return int8_unpack(buf.ptr) cdef class DesAsciiType(Deserializer): - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): if six.PY2: return to_bytes(buf) return to_bytes(buf).decode('ascii') cdef class DesFloatType(Deserializer): - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): return float_unpack(buf.ptr) cdef class DesDoubleType(Deserializer): - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): return double_unpack(buf.ptr) cdef class DesInt32Type(Deserializer): - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): return int32_unpack(buf.ptr) +cdef class DesIntegerType(Deserializer): + cdef deserialize(self, Buffer *buf, int protocol_version): + return varint_unpack(to_bytes(buf)) + + +cdef class DesInetAddressType(Deserializer): + cdef deserialize(self, Buffer *buf, int protocol_version): + cdef bytes byts = to_bytes(buf) + + # TODO: optimize inet_ntop, inet_ntoa + if len(buf.size) == 16: + return util.inet_ntop(socket.AF_INET6, byts) + else: + # util.inet_pton could also handle, but this is faster + # since we've already determined the AF + return socket.inet_ntoa(byts) + + +cdef class DesCounterColumnType(DesLongType): + pass + + +cdef class DesDateType(Deserializer): + cdef deserialize(self, Buffer *buf, int protocol_version): + timestamp = int64_unpack(buf.ptr) / 1000.0 + return datetime_from_timestamp(timestamp) + + +cdef class TimestampType(DesDateType): + pass + + +cdef class TimeUUIDType(DesDateType): + cdef deserialize(self, Buffer *buf, int protocol_version): + return UUID(bytes=to_bytes(buf)) + + +# Values of the 'date'` type are encoded as 32-bit unsigned integers +# representing a number of days with epoch (January 1st, 1970) at the center of the +# range (2^31). 
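# A worked example of the comment above: the raw unsigned value 2**31
# decodes to day 0 (1970-01-01), 2**31 + 1 to day 1 (1970-01-02), and
# 2**31 - 1 to day -1 (1969-12-31); subtracting EPOCH_OFFSET_DAYS below
# recovers that signed day offset, which is what gets passed to util.Date.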
+EPOCH_OFFSET_DAYS = 2 ** 31 + +cdef class DesSimpleDateType(Deserializer): + cdef deserialize(self, Buffer *buf, int protocol_version): + days = uint32_unpack(buf.ptr) - EPOCH_OFFSET_DAYS + return util.Date(days) + + +cdef class DesShortType(Deserializer): + cdef deserialize(self, Buffer *buf, int protocol_version): + return int16_unpack(buf.ptr) + + +cdef class DesTimeType(Deserializer): + cdef deserialize(self, Buffer *buf, int protocol_version): + return util.Time(int64_unpack(to_bytes(buf))) + + +cdef class DesUTF8Type(Deserializer): + cdef deserialize(self, Buffer *buf, int protocol_version): + return to_bytes(buf).decode('utf8') + + +cdef class DesVarcharType(DesUTF8Type): + pass + + +cdef class _DesParameterizedType(Deserializer): + + cdef object cqltype + cdef object adapter + cdef object subtypes + cdef Deserializer[::1] deserializers + + def __init__(self, cqltype): + assert cqltype.subtypes and len(cqltype.subtypes) == 1 + self.cqltype = cqltype + self.adapter = cqltype.adapter + self.subtypes = cqltype.subtypes + self.deserializers = make_deserializers(cqltype.subtypes) + + +cdef class _DesSimpleParameterizedType(_DesParameterizedType): + cdef deserialize(self, Buffer *buf, int protocol_version): + cdef uint16_t v2_and_below = 0 + cdef int32_t v3_and_above = 0 + + if protocol_version >= 3: + result = _deserialize_parameterized[int32_t]( + v3_and_above, self.deserializers[0], buf, protocol_version) + else: + result = _deserialize_parameterized[uint16_t]( + v2_and_below, self.deserializers[0], buf, protocol_version) + return self.adapter(result) + + +ctypedef fused itemlen_t: + uint16_t # protocol <= v2 + int32_t # protocol >= v3 + + +cdef itemlen_t _unpack(itemlen_t dummy, const char *buf): + cdef itemlen_t result + if itemlen_t is uint16_t: + result = uint16_unpack(buf) + else: + result = int32_unpack(buf) + return result + +cdef list _deserialize_parameterized( + itemlen_t dummy, Deserializer deserializer, + Buffer *buf, int protocol_version): + cdef itemlen_t itemlen + cdef Buffer sub_buf + + cdef itemlen_t numelements = _unpack[itemlen_t](dummy, buf.ptr) + cdef itemlen_t p = sizeof(itemlen_t) + cdef list result = [] + + for _ in range(numelements): + itemlen = _unpack[itemlen_t](dummy, buf.ptr + p) + p += sizeof(itemlen_t) + sub_buf.ptr = buf.ptr + p + sub_buf.size = itemlen + p += itemlen + result.append(deserializer.deserialize(&sub_buf, protocol_version)) + + return result + +# cdef deserialize_v3_and_above( +# Deserializer deserializer, Buffer *buf, int protocol_version): +# cdef Py_ssize_t itemlen +# cdef Buffer sub_buf +# +# cdef Py_ssize_t numelements = int32_unpack(buf.ptr) +# cdef Py_ssize_t p = 4 +# cdef list result = [] +# +# for _ in range(numelements): +# itemlen = int32_unpack(buf.ptr + p) +# p += 4 +# sub_buf.ptr = buf.ptr + p +# sub_buf.size = itemlen +# p += itemlen +# result.append(deserializer.deserialize(&sub_buf, protocol_version)) +# +# return result +# +# +# cdef deserialize_v2_and_below( +# Deserializer deserializer, Buffer *buf, int protocol_version): +# cdef Py_ssize_t itemlen +# cdef Buffer sub_buf +# +# cdef Py_ssize_t numelements = uint16_unpack(buf.ptr) +# cdef Py_ssize_t p = 2 +# cdef list result = [] +# +# for _ in range(numelements): +# itemlen = uint16_unpack(buf.ptr + p) +# p += 2 +# sub_buf.ptr = buf.ptr + p +# sub_buf.size = itemlen +# p += itemlen +# result.append(deserializer.deserialize(&sub_buf, protocol_version)) +# +# return result + + + cdef class GenericDeserializer(Deserializer): """ Wrap a generic datatype for deserialization 
@@ -74,7 +251,7 @@ cdef class GenericDeserializer(Deserializer): def __init__(self, cqltype): self.cqltype = cqltype - cdef deserialize(self, Buffer *buf, protocol_version): + cdef deserialize(self, Buffer *buf, int protocol_version): return self.cqltype.deserialize(to_bytes(buf), protocol_version) #-------------------------------------------------------------------------- diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx index 0f8c3e3e..0d6da6e4 100644 --- a/cassandra/ioutils.pyx +++ b/cassandra/ioutils.pyx @@ -1,5 +1,5 @@ include 'marshal.pyx' -include 'buffer.pyx' +from cassandra.buffer cimport Buffer from libc.stdint cimport int32_t from cassandra.bytesio cimport BytesIOReader diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx index 9e1c8ca5..336ee1c7 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/marshal.pyx @@ -25,6 +25,8 @@ from libc.stdint cimport (int8_t, int16_t, int32_t, int64_t, cdef bint is_little_endian from cassandra.util import is_little_endian +cdef bint PY3 = six.PY3 + # cdef extern from "marshal.h": # cdef str c_string_to_python(char *p, Py_ssize_t len) @@ -165,21 +167,30 @@ v3_header_pack = v3_header_struct.pack v3_header_unpack = v3_header_struct.unpack -if six.PY3: - def varint_unpack(term): - val = int(''.join("%02x" % i for i in term), 16) - if (term[0] & 128) != 0: - # There is a bug in Cython (0.20 - 0.22), where if we do - # '1 << (len(term) * 8)' Cython generates '1' directly into the - # C code, causing integer overflows. Treat it as an object for now - val -= ( 1L) << (len(term) * 8) - return val -else: - def varint_unpack(term): # noqa - val = int(term.encode('hex'), 16) - if (ord(term[0]) & 128) != 0: - val = val - (1 << (len(term) * 8)) - return val +cpdef varint_unpack(term): + """Unpack a variable-sized integer""" + if PY3: + return varint_unpack_py3(term) + else: + return varint_unpack_py2(term) + +# TODO: Optimize these two functions +def varint_unpack_py3(term): + cdef int64_t one = 1L + val = int(''.join("%02x" % i for i in term), 16) + if (term[0] & 128) != 0: + # There is a bug in Cython (0.20 - 0.22), where if we do + # '1 << (len(term) * 8)' Cython generates '1' directly into the + # C code, causing integer overflows + val -= one << (len(term) * 8) + return val + +def varint_unpack_py2(term): # noqa + cdef int64_t one = 1L + val = int(term.encode('hex'), 16) + if (ord(term[0]) & 128) != 0: + val = val - (one << (len(term) * 8)) + return val def bitlength(n): diff --git a/cassandra/parsing.pxd b/cassandra/parsing.pxd index 13bc8411..9daecad9 100644 --- a/cassandra/parsing.pxd +++ b/cassandra/parsing.pxd @@ -5,7 +5,7 @@ cdef class ParseDesc: cdef public object colnames cdef public object coltypes cdef Deserializer[::1] deserializers - cdef public object protocol_version + cdef public int protocol_version cdef Py_ssize_t rowsize cdef class ColumnParser: From 758ab324db1a2fc6848a2aef232caec03a9f5c89 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 15:51:51 +0100 Subject: [PATCH 28/70] Add Cython-based MapType deserializer --- cassandra/cqltypes.py | 2 +- cassandra/deserializers.pyx | 170 +++++++++++++++++++++++------------- 2 files changed, 112 insertions(+), 60 deletions(-) diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py index 77fc2b91..ee6b1101 100644 --- a/cassandra/cqltypes.py +++ b/cassandra/cqltypes.py @@ -300,7 +300,7 @@ class _CassandraType(object): Given a set of other CassandraTypes, create a new subtype of this type using them as parameters. This is how composite types are constructed. 
- >>> MapType.apply_parameters(DateType, BooleanType) + >>> MapType.apply_parameters([DateType, BooleanType]) `subtypes` will be a sequence of CassandraTypes. If provided, `names` diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 680e14a2..e9b06154 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -13,6 +13,7 @@ import inspect from decimal import Decimal from uuid import UUID +from cassandra import cqltypes from cassandra import util @@ -152,46 +153,56 @@ cdef class _DesParameterizedType(Deserializer): self.subtypes = cqltype.subtypes self.deserializers = make_deserializers(cqltype.subtypes) +#-------------------------------------------------------------------------- +# List and set deserialization + +cdef class DesListType(_DesParameterizedType): + + cdef Deserializer deserializer + + def __init__(self, cqltype): + super().__init__(cqltype) + self.deserializer = self.deserializers[0] -cdef class _DesSimpleParameterizedType(_DesParameterizedType): cdef deserialize(self, Buffer *buf, int protocol_version): - cdef uint16_t v2_and_below = 0 - cdef int32_t v3_and_above = 0 + cdef uint16_t v2_and_below = 2 + cdef int32_t v3_and_above = 3 if protocol_version >= 3: - result = _deserialize_parameterized[int32_t]( - v3_and_above, self.deserializers[0], buf, protocol_version) + result = _deserialize_list_or_set[int32_t]( + v3_and_above, buf, protocol_version, self.deserializer) else: - result = _deserialize_parameterized[uint16_t]( - v2_and_below, self.deserializers[0], buf, protocol_version) + result = _deserialize_list_or_set[uint16_t]( + v2_and_below, buf, protocol_version, self.deserializer) + return self.adapter(result) +DesSetType = DesListType + + ctypedef fused itemlen_t: uint16_t # protocol <= v2 int32_t # protocol >= v3 +cdef list _deserialize_list_or_set(itemlen_t dummy_version, + Buffer *buf, int protocol_version, + Deserializer deserializer): + """ + Deserialize a list or set. -cdef itemlen_t _unpack(itemlen_t dummy, const char *buf): - cdef itemlen_t result - if itemlen_t is uint16_t: - result = uint16_unpack(buf) - else: - result = int32_unpack(buf) - return result - -cdef list _deserialize_parameterized( - itemlen_t dummy, Deserializer deserializer, - Buffer *buf, int protocol_version): + The 'dummy' parameter is needed to make fused types work, so that + we can specialize on the protocol version. 
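    Roughly, the encoding handled here is

        [n][len_1][element_1 bytes] ... [len_n][element_n bytes]

    where n and each len_i are uint16 for protocol v2 and below and int32
    for v3 and above, so the two specializations differ only in the width
    of the length fields they read.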
+ """ cdef itemlen_t itemlen cdef Buffer sub_buf - cdef itemlen_t numelements = _unpack[itemlen_t](dummy, buf.ptr) + cdef itemlen_t numelements = _unpack[itemlen_t](dummy_version, buf.ptr) cdef itemlen_t p = sizeof(itemlen_t) cdef list result = [] for _ in range(numelements): - itemlen = _unpack[itemlen_t](dummy, buf.ptr + p) + itemlen = _unpack[itemlen_t](dummy_version, buf.ptr + p) p += sizeof(itemlen_t) sub_buf.ptr = buf.ptr + p sub_buf.size = itemlen @@ -200,46 +211,80 @@ cdef list _deserialize_parameterized( return result -# cdef deserialize_v3_and_above( -# Deserializer deserializer, Buffer *buf, int protocol_version): -# cdef Py_ssize_t itemlen -# cdef Buffer sub_buf -# -# cdef Py_ssize_t numelements = int32_unpack(buf.ptr) -# cdef Py_ssize_t p = 4 -# cdef list result = [] -# -# for _ in range(numelements): -# itemlen = int32_unpack(buf.ptr + p) -# p += 4 -# sub_buf.ptr = buf.ptr + p -# sub_buf.size = itemlen -# p += itemlen -# result.append(deserializer.deserialize(&sub_buf, protocol_version)) -# -# return result -# -# -# cdef deserialize_v2_and_below( -# Deserializer deserializer, Buffer *buf, int protocol_version): -# cdef Py_ssize_t itemlen -# cdef Buffer sub_buf -# -# cdef Py_ssize_t numelements = uint16_unpack(buf.ptr) -# cdef Py_ssize_t p = 2 -# cdef list result = [] -# -# for _ in range(numelements): -# itemlen = uint16_unpack(buf.ptr + p) -# p += 2 -# sub_buf.ptr = buf.ptr + p -# sub_buf.size = itemlen -# p += itemlen -# result.append(deserializer.deserialize(&sub_buf, protocol_version)) -# -# return result +cdef itemlen_t _unpack(itemlen_t dummy_version, const char *buf): + cdef itemlen_t result + if itemlen_t is uint16_t: + result = uint16_unpack(buf) + else: + result = int32_unpack(buf) + return result + +#-------------------------------------------------------------------------- +# Map deserialization + +cdef class DesMapType(_DesParameterizedType): + + cdef Deserializer key_deserializer, val_deserializer + + def __init__(self, cqltype): + super().__init__(cqltype) + self.key_deserializer = self.deserializers[0] + self.val_deserializer = self.deserializers[1] + + cdef deserialize(self, Buffer *buf, int protocol_version): + cdef uint16_t v2_and_below = 0 + cdef int32_t v3_and_above = 0 + key_type, val_type = self.cqltype.subtypes + + if protocol_version >= 3: + result = _deserialize_map[int32_t]( + v3_and_above, buf, protocol_version, + self.key_deserializer, self.val_deserializer, + key_type, val_type) + else: + result = _deserialize_map[uint16_t]( + v2_and_below, buf, protocol_version, + self.key_deserializer, self.val_deserializer, + key_type, val_type) + + return self.adapter(result) +cdef _deserialize_map(itemlen_t dummy_version, + Buffer *buf, int protocol_version, + Deserializer key_deserializer, Deserializer val_deserializer, + key_type, val_type): + cdef itemlen_t itemlen, val_len, key_len + cdef Buffer key_buf, val_buf + + cdef itemlen_t numelements = _unpack[itemlen_t](dummy_version, buf.ptr) + cdef itemlen_t p = sizeof(itemlen_t) + cdef list result = [] + + numelements = _unpack[itemlen_t](dummy_version, buf.ptr) + p = sizeof(itemlen_t) + themap = util.OrderedMapSerializedKey(key_type, protocol_version) + for _ in range(numelements): + key_len = _unpack[itemlen_t](dummy_version, buf.ptr + p) + p += sizeof(itemlen_t) + # keybytes = byts[p:p + key_len] + key_buf.ptr = buf.ptr + p + key_buf.size = key_len + p += key_len + val_len = _unpack(dummy_version, buf.ptr + p) + p += sizeof(itemlen_t) + # valbytes = byts[p:p + val_len] + val_buf.ptr = buf.ptr + p + 
val_buf.size = val_len + p += val_len + key = key_deserializer.deserialize(&key_buf, protocol_version) + val = val_deserializer.deserialize(&val_buf, protocol_version) + themap._insert_unchecked(key, to_bytes(&key_buf), val) + + return themap + +#-------------------------------------------------------------------------- +# Generic deserialization cdef class GenericDeserializer(Deserializer): """ @@ -255,6 +300,7 @@ cdef class GenericDeserializer(Deserializer): return self.cqltype.deserialize(to_bytes(buf), protocol_version) #-------------------------------------------------------------------------- +# Helper utilities def make_deserializers(cqltypes): """Create an array of Deserializers for each given cqltype in cqltypes""" @@ -264,10 +310,16 @@ def make_deserializers(cqltypes): cpdef Deserializer find_deserializer(cqltype): """Find a deserializer for a cqltype""" - name = inspect.isclass(cqltype) and 'Des' + cqltype.__name__ + name = 'Des' + cqltype.__name__ if name in globals(): deserializer_cls = globals()[name] deserializer_cls() + elif issubclass(cqltype, cqltypes.ListType): + return DesListType + elif issubclass(cqltype, cqltypes.SetType): + return DesSetType + elif issubclass(cqltype, cqltypes.MapType): + return DesMapType return GenericDeserializer(cqltype) From e2820de2ba8e6760eb86ab22344b40bc9b4fc20f Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 16:19:57 +0100 Subject: [PATCH 29/70] Add Cython-based tuple deserializer --- cassandra/deserializers.pyx | 49 +++++++++++++++++++++++++++++++++++++ cassandra/objparser.pyx | 9 ++++--- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index e9b06154..3501428d 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -5,14 +5,19 @@ from libc.stdint cimport int32_t, uint16_t include 'marshal.pyx' include 'cython_utils.pyx' from cassandra.buffer cimport Buffer, to_bytes +from cassandra.parsing cimport ParseDesc, RowParser from cython.view cimport array as cython_array +from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM +from cpython.ref cimport Py_INCREF + import socket import inspect from decimal import Decimal from uuid import UUID +from cassandra.objparser import TupleRowParser from cassandra import cqltypes from cassandra import util @@ -283,6 +288,48 @@ cdef _deserialize_map(itemlen_t dummy_version, return themap +#-------------------------------------------------------------------------- +# Tuple deserialization + +cdef class DesTupleType(_DesParameterizedType): + + # TODO: Use TupleRowParser to parse these tuples + + cdef Py_ssize_t tuple_len + + def __init__(self, cqltype): + super().__init__(cqltype) + self.tuple_len = len(cqltype.subtypes) + + cdef deserialize(self, Buffer *buf, int protocol_version): + cdef Py_ssize_t i, p + cdef int32_t itemlen + cdef tuple res = PyTuple_New(self.tuple_len) + cdef Buffer item_buf + cdef Deserializer deserializer + + protocol_version = max(3, protocol_version) + + p = 0 + values = [] + for i in range(self.tuple_len): + item = None + if p != buf.size: + itemlen = int32_unpack(buf.ptr + p) + p += 4 + if itemlen >= 0: + item_buf.ptr = buf.ptr + p + item_buf.size = itemlen + deserializer = self.deserializers[i] + item = deserializer.deserialize(&item_buf, protocol_version) + p += itemlen + + # Insert new object into tuple (PyTuple_SET_ITEM steals a reference) + Py_INCREF(item) + PyTuple_SET_ITEM(res, i, item) + + return res + 
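# A compact restatement of the layout DesTupleType handles above (derived
# from the loop itself, not from the protocol spec): every field is encoded
# as
#
#     [int32 length][length bytes]
#
# where a negative length decodes to None, and fields missing from the end
# of the buffer also come back as None. Collections nested inside tuples and
# UDTs are always encoded with at least the v3 format, hence the
# max(3, protocol_version) above.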
#-------------------------------------------------------------------------- # Generic deserialization @@ -320,6 +367,8 @@ cpdef Deserializer find_deserializer(cqltype): return DesSetType elif issubclass(cqltype, cqltypes.MapType): return DesMapType + elif issubclass(cqltype, cqltypes.TupleType): + return DesTupleType return GenericDeserializer(cqltype) diff --git a/cassandra/objparser.pyx b/cassandra/objparser.pyx index e98a991e..bf251942 100644 --- a/cassandra/objparser.pyx +++ b/cassandra/objparser.pyx @@ -66,9 +66,12 @@ cdef class TupleRowParser(RowParser): # Read the next few bytes get_buf(reader, &buf) - # Deserialize bytes to python object - deserializer = desc.deserializers[i] - val = deserializer.deserialize(&buf, desc.protocol_version) + if buf.size == 0: + val = None + else: + # Deserialize bytes to python object + deserializer = desc.deserializers[i] + val = deserializer.deserialize(&buf, desc.protocol_version) # Insert new object into tuple Py_INCREF(val) From 80c5a1931d664e8fa01784d93d48fefbce9a6114 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 18:02:50 +0100 Subject: [PATCH 30/70] Simply UserType deserialization in cqltypes.py --- cassandra/cqltypes.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py index ee6b1101..9383c53e 100644 --- a/cassandra/cqltypes.py +++ b/cassandra/cqltypes.py @@ -943,27 +943,7 @@ class UserType(TupleType): @classmethod def deserialize_safe(cls, byts, protocol_version): - proto_version = max(3, protocol_version) - p = 0 - values = [] - for col_type in cls.subtypes: - if p == len(byts): - break - itemlen = int32_unpack(byts[p:p + 4]) - p += 4 - if itemlen >= 0: - item = byts[p:p + itemlen] - p += itemlen - else: - item = None - # collections inside UDTs are always encoded with at least the - # version 3 format - values.append(col_type.from_binary(item, proto_version)) - - if len(values) < len(cls.subtypes): - nones = [None] * (len(cls.subtypes) - len(values)) - values = values + nones - + values = super(UserType, cls).deserialize_safe(byts, protocol_version) if cls.mapped_class: return cls.mapped_class(**dict(zip(cls.fieldnames, values))) else: From e160ec11d7a20b1757404e66c3e20c74676c7f56 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 18:18:19 +0100 Subject: [PATCH 31/70] Abstract over CPython tuple API --- cassandra/tuple.pxd | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 cassandra/tuple.pxd diff --git a/cassandra/tuple.pxd b/cassandra/tuple.pxd new file mode 100644 index 00000000..185e8364 --- /dev/null +++ b/cassandra/tuple.pxd @@ -0,0 +1,27 @@ +from cpython.tuple cimport ( + PyTuple_New, + # Return value: New reference. + # Return a new tuple object of size len, or NULL on failure. + PyTuple_SET_ITEM, + # Like PyTuple_SetItem(), but does no error checking, and should + # only be used to fill in brand new tuples. Note: This function + # ``steals'' a reference to o. + ) + +from cpython.ref cimport ( + Py_INCREF + # void Py_INCREF(object o) + # Increment the reference count for object o. The object must not + # be NULL; if you aren't sure that it isn't NULL, use + # Py_XINCREF(). + ) + +cdef inline tuple tuple_new(Py_ssize_t n): + """Allocate a new tuple object""" + return PyTuple_New(n) + +cdef inline void tuple_set(tuple tup, Py_ssize_t idx, object item): + """Insert new object into tuple. 
No item must have been set yet.""" + # PyTuple_SET_ITEM steals a reference, so we need to INCREF + Py_INCREF(item) + PyTuple_SET_ITEM(tup, idx, item) From 9e3dbcb034282c22d9c836eab19eec5bc54eeb35 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 18:21:50 +0100 Subject: [PATCH 32/70] Use cleaner tuple API --- cassandra/deserializers.pyx | 10 +++------- cassandra/objparser.pyx | 24 +++--------------------- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 3501428d..17f28c20 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -8,9 +8,7 @@ from cassandra.buffer cimport Buffer, to_bytes from cassandra.parsing cimport ParseDesc, RowParser from cython.view cimport array as cython_array -from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM -from cpython.ref cimport Py_INCREF - +from cassandra.tuple cimport tuple_new, tuple_set import socket import inspect @@ -304,7 +302,7 @@ cdef class DesTupleType(_DesParameterizedType): cdef deserialize(self, Buffer *buf, int protocol_version): cdef Py_ssize_t i, p cdef int32_t itemlen - cdef tuple res = PyTuple_New(self.tuple_len) + cdef tuple res = tuple_new(self.tuple_len) cdef Buffer item_buf cdef Deserializer deserializer @@ -324,9 +322,7 @@ cdef class DesTupleType(_DesParameterizedType): item = deserializer.deserialize(&item_buf, protocol_version) p += itemlen - # Insert new object into tuple (PyTuple_SET_ITEM steals a reference) - Py_INCREF(item) - PyTuple_SET_ITEM(res, i, item) + tuple_set(res, i, item) return res diff --git a/cassandra/objparser.pyx b/cassandra/objparser.pyx index bf251942..d4642cbd 100644 --- a/cassandra/objparser.pyx +++ b/cassandra/objparser.pyx @@ -1,26 +1,9 @@ include "ioutils.pyx" -from cpython.tuple cimport ( - PyTuple_New, - # Return value: New reference. - # Return a new tuple object of size len, or NULL on failure. - PyTuple_SET_ITEM, - # Like PyTuple_SetItem(), but does no error checking, and should - # only be used to fill in brand new tuples. Note: This function - # ``steals'' a reference to o. - ) - -from cpython.ref cimport ( - Py_INCREF - # void Py_INCREF(object o) - # Increment the reference count for object o. The object must not - # be NULL; if you aren't sure that it isn't NULL, use - # Py_XINCREF(). 
- ) - from cassandra.bytesio cimport BytesIOReader from cassandra.deserializers cimport Deserializer from cassandra.parsing cimport ParseDesc, ColumnParser, RowParser +from cassandra.tuple cimport tuple_new, tuple_set cdef class ListParser(ColumnParser): @@ -60,7 +43,7 @@ cdef class TupleRowParser(RowParser): cdef Buffer buf cdef Py_ssize_t i, rowsize = desc.rowsize cdef Deserializer deserializer - cdef tuple res = PyTuple_New(desc.rowsize) + cdef tuple res = tuple_new(desc.rowsize) for i in range(rowsize): # Read the next few bytes @@ -74,7 +57,6 @@ cdef class TupleRowParser(RowParser): val = deserializer.deserialize(&buf, desc.protocol_version) # Insert new object into tuple - Py_INCREF(val) - PyTuple_SET_ITEM(res, i, val) + tuple_set(res, i, val) return res From a7887a17ebebcc69b713768f1d828f0143f0d8bf Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 18:31:44 +0100 Subject: [PATCH 33/70] Composite type deserialization --- cassandra/deserializers.pyx | 64 +++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 17f28c20..9dc37fe2 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -148,6 +148,7 @@ cdef class _DesParameterizedType(Deserializer): cdef object adapter cdef object subtypes cdef Deserializer[::1] deserializers + cdef Py_ssize_t subtypes_len def __init__(self, cqltype): assert cqltype.subtypes and len(cqltype.subtypes) == 1 @@ -287,22 +288,16 @@ cdef _deserialize_map(itemlen_t dummy_version, return themap #-------------------------------------------------------------------------- -# Tuple deserialization +# Tuple and UserType deserialization cdef class DesTupleType(_DesParameterizedType): # TODO: Use TupleRowParser to parse these tuples - cdef Py_ssize_t tuple_len - - def __init__(self, cqltype): - super().__init__(cqltype) - self.tuple_len = len(cqltype.subtypes) - cdef deserialize(self, Buffer *buf, int protocol_version): cdef Py_ssize_t i, p cdef int32_t itemlen - cdef tuple res = tuple_new(self.tuple_len) + cdef tuple res = tuple_new(self.subtypes_len) cdef Buffer item_buf cdef Deserializer deserializer @@ -310,7 +305,7 @@ cdef class DesTupleType(_DesParameterizedType): p = 0 values = [] - for i in range(self.tuple_len): + for i in range(self.subtypes_len): item = None if p != buf.size: itemlen = int32_unpack(buf.ptr + p) @@ -326,6 +321,48 @@ cdef class DesTupleType(_DesParameterizedType): return res + +cdef class DesUserType(DesTupleType): + cdef deserialize(self, Buffer *buf, int protocol_version): + typ = self.cqltype + values = DesTupleType.deserialize(self, buf, protocol_version) + if typ.mapped_class: + return typ.mapped_class(**dict(zip(typ.fieldnames, values))) + else: + return typ.tuple_type(*values) + +#-------------------------------------------------------------------------- +# CompositeType + +cdef class DesCompositeType(_DesParameterizedType): + cdef deserialize(self, Buffer *buf, int protocol_version): + cdef Py_ssize_t i + cdef Buffer elem_buf + cdef int16_t element_length + cdef Deserializer deserializer + cdef tuple res = tuple_new(self.subtypes_len) + + for i in range(self.subtypes_len): + if not buf.size: + # CompositeType can have missing elements at the end + break + + element_length = uint16_unpack(buf.ptr) + elem_buf.ptr = buf.ptr + 2 + elem_buf.size = element_length + + # skip element length, element, and the EOC (one byte) + buf.ptr = buf.ptr + 2 + element_length + 1 + buf.size = buf.size - (2 + 
element_length + 1) + deserializer = self.deserializers[i] + item = deserializer.deserialize(&elem_buf, protocol_version) + tuple_set(res, i, item) + + return res + + +DesDynamicCompositeType = DesCompositeType + #-------------------------------------------------------------------------- # Generic deserialization @@ -363,8 +400,17 @@ cpdef Deserializer find_deserializer(cqltype): return DesSetType elif issubclass(cqltype, cqltypes.MapType): return DesMapType + elif issubclass(cqltype, cqltypes.UserType): + # UserType is a subclass of TupleType, so should precede it + return DesUserType elif issubclass(cqltype, cqltypes.TupleType): return DesTupleType + elif issubclass(cqltype, cqltypes.DynamicCompositeType): + # DynamicCompositeType is a subclass of CompositeType, so should precede it + return DesDynamicCompositeType + elif issubclass(cqltype, cqltypes.CompositeType): + return DesCompositeType + return GenericDeserializer(cqltype) From 74fa1ad4736c44d2ff81c7651929ac26d87dad68 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 18:37:12 +0100 Subject: [PATCH 34/70] Deserialization for ReveredType and FrozenType --- cassandra/deserializers.pyx | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 9dc37fe2..42890580 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -157,10 +157,8 @@ cdef class _DesParameterizedType(Deserializer): self.subtypes = cqltype.subtypes self.deserializers = make_deserializers(cqltype.subtypes) -#-------------------------------------------------------------------------- -# List and set deserialization -cdef class DesListType(_DesParameterizedType): +cdef class _DesSingleParamType(_DesParameterizedType): cdef Deserializer deserializer @@ -168,6 +166,11 @@ cdef class DesListType(_DesParameterizedType): super().__init__(cqltype) self.deserializer = self.deserializers[0] + +#-------------------------------------------------------------------------- +# List and set deserialization + +cdef class DesListType(_DesSingleParamType): cdef deserialize(self, Buffer *buf, int protocol_version): cdef uint16_t v2_and_below = 2 cdef int32_t v3_and_above = 3 @@ -288,7 +291,6 @@ cdef _deserialize_map(itemlen_t dummy_version, return themap #-------------------------------------------------------------------------- -# Tuple and UserType deserialization cdef class DesTupleType(_DesParameterizedType): @@ -331,8 +333,6 @@ cdef class DesUserType(DesTupleType): else: return typ.tuple_type(*values) -#-------------------------------------------------------------------------- -# CompositeType cdef class DesCompositeType(_DesParameterizedType): cdef deserialize(self, Buffer *buf, int protocol_version): @@ -363,6 +363,16 @@ cdef class DesCompositeType(_DesParameterizedType): DesDynamicCompositeType = DesCompositeType + +cdef class DesReversedType(_DesSingleParamType): + cdef deserialize(self, Buffer *buf, int protocol_version): + return self.deserializer.deserialize(buf, protocol_version) + + +cdef class DesFrozenType(_DesSingleParamType): + cdef deserialize(self, Buffer *buf, int protocol_version): + return self.deserializer.deserialize(buf, protocol_version) + #-------------------------------------------------------------------------- # Generic deserialization @@ -410,6 +420,10 @@ cpdef Deserializer find_deserializer(cqltype): return DesDynamicCompositeType elif issubclass(cqltype, cqltypes.CompositeType): return DesCompositeType + elif 
issubclass(cqltype, cqltypes.ReversedType): + return DesReversedType + elif issubclass(cqltype, cqltypes.FrozenType): + return DesFrozenType return GenericDeserializer(cqltype) From ddebc448529372a0b65a0d99b0cc50844a1d250c Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 18:57:32 +0100 Subject: [PATCH 35/70] Minor code cleanup --- cassandra/cython_utils.pyx | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cassandra/cython_utils.pyx b/cassandra/cython_utils.pyx index fe4fbab9..677b8009 100644 --- a/cassandra/cython_utils.pyx +++ b/cassandra/cython_utils.pyx @@ -3,18 +3,12 @@ Duplicate module of util.py, with some accelerated functions used for deserialization. """ -# from __future__ import with_statement - from cpython.datetime cimport timedelta_new # cdef inline object timedelta_new(int days, int seconds, int useconds) # Create timedelta object using DateTime CAPI factory function. # Note, there are no range checks for any of the arguments. -import calendar import datetime -import random -import six -import uuid import sys DATETIME_EPOC = datetime.datetime(1970, 1, 1) @@ -24,4 +18,3 @@ is_little_endian = sys.byteorder == 'little' cdef datetime_from_timestamp(timestamp): return DATETIME_EPOC + timedelta_new(0, timestamp, 0) - From 2f4a2d480fd9e97982a5b0dfc8ebfcc7b3472d19 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 21:45:07 +0100 Subject: [PATCH 36/70] Take more care with empty and negative sizes of binary data --- cassandra/cython_protocol_handler.pyx | 73 -------------------------- cassandra/deserializers.pxd | 21 ++++++++ cassandra/deserializers.pyx | 75 ++++++++++++++++----------- cassandra/ioutils.pyx | 2 +- cassandra/numpyparser.pyx | 6 ++- cassandra/objparser.pyx | 11 ++-- 6 files changed, 74 insertions(+), 114 deletions(-) delete mode 100644 cassandra/cython_protocol_handler.pyx diff --git a/cassandra/cython_protocol_handler.pyx b/cassandra/cython_protocol_handler.pyx deleted file mode 100644 index 629ce887..00000000 --- a/cassandra/cython_protocol_handler.pyx +++ /dev/null @@ -1,73 +0,0 @@ -# -- cython: profile=True - -from cassandra.protocol import ResultMessage, ProtocolHandler - -from cassandra.parsing cimport ParseDesc, ColumnParser -from cassandra.deserializers import make_deserializers -from cassandra.objparser import ListParser - - -include "ioutils.pyx" - - -def make_recv_results_rows(ColumnParser colparser): - def recv_results_rows(cls, f, int protocol_version, user_type_map): - """ - Parse protocol data given as a BytesIO f into a set of columns (e.g. list of tuples) - This is used as the recv_results_rows method of (Fast)ResultMessage - """ - paging_state, column_metadata = cls.recv_results_metadata(f, user_type_map) - - colnames = [c[2] for c in column_metadata] - coltypes = [c[3] for c in column_metadata] - - desc = ParseDesc(colnames, coltypes, make_deserializers(coltypes), - protocol_version) - reader = BytesIOReader(f.read()) - parsed_rows = colparser.parse_rows(reader, desc) - - return (paging_state, (colnames, parsed_rows)) - - return recv_results_rows - - -def make_protocol_handler(colparser=ListParser()): - """ - Given a column parser to deserialize ResultMessages, return a suitable - Cython-based protocol handler. - - There are three Cython-based protocol handlers (least to most performant): - - 1. objparser.ListParser - this parser decodes result messages into a list of tuples - - 2. objparser.LazyParser - this parser decodes result messages lazily by returning an iterator - - 3. 
numpyparser.NumPyParser - this parser decodes result messages into NumPy arrays - - The default is to use objparser.ListParser - """ - # TODO: It may be cleaner to turn ProtocolHandler and ResultMessage into - # TODO: instances and use methods instead of class methods - - class FastResultMessage(ResultMessage): - """ - Cython version of Result Message that has a faster implementation of - recv_results_row. - """ - # type_codes = ResultMessage.type_codes.copy() - code_to_type = dict((v, k) for k, v in ResultMessage.type_codes.items()) - recv_results_rows = classmethod(make_recv_results_rows(colparser)) - - class CythonProtocolHandler(ProtocolHandler): - """ - Use FastResultMessage to decode query result message messages. - """ - - my_opcodes = ProtocolHandler.message_types_by_opcode.copy() - my_opcodes[FastResultMessage.opcode] = FastResultMessage - message_types_by_opcode = my_opcodes - - return CythonProtocolHandler diff --git a/cassandra/deserializers.pxd b/cassandra/deserializers.pxd index 882d19d1..5b820061 100644 --- a/cassandra/deserializers.pxd +++ b/cassandra/deserializers.pxd @@ -3,5 +3,26 @@ from cassandra.buffer cimport Buffer cdef class Deserializer: + # The cqltypes._CassandraType corresponding to this deserializer + cdef object cqltype + + # String may be empty, whereas other values may not be. + # Other values may be NULL, in which case the integer length + # of the binary data is negative. However, non-string types + # may also return a zero length for legacy reasons + # (see http://code.metager.de/source/xref/apache/cassandra/doc/native_protocol_v3.spec + # paragraph 6) + cdef bint empty_binary_ok + cdef deserialize(self, Buffer *buf, int protocol_version) # cdef deserialize(self, CString byts, protocol_version) + + +cdef inline object from_binary(Deserializer deserializer, + Buffer *buf, + int protocol_version): + if buf.size <= 0 and not deserializer.empty_binary_ok: + return _ret_empty(deserializer, buf.size) + return deserializer.deserialize(buf, protocol_version) + +cdef _ret_empty(Deserializer deserializer, Py_ssize_t buf_size) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 42890580..8924e13c 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -21,6 +21,12 @@ from cassandra import util cdef class Deserializer: + """Cython-based deserializer class for a cqltype""" + + def __init__(self, cqltype): + self.cqltype = cqltype + self.empty_binary_ok = False + cdef deserialize(self, Buffer *buf, int protocol_version): raise NotImplementedError @@ -144,25 +150,21 @@ cdef class DesVarcharType(DesUTF8Type): cdef class _DesParameterizedType(Deserializer): - cdef object cqltype - cdef object adapter cdef object subtypes cdef Deserializer[::1] deserializers cdef Py_ssize_t subtypes_len def __init__(self, cqltype): - assert cqltype.subtypes and len(cqltype.subtypes) == 1 - self.cqltype = cqltype - self.adapter = cqltype.adapter + super().__init__(cqltype) self.subtypes = cqltype.subtypes self.deserializers = make_deserializers(cqltype.subtypes) cdef class _DesSingleParamType(_DesParameterizedType): - cdef Deserializer deserializer def __init__(self, cqltype): + assert cqltype.subtypes and len(cqltype.subtypes) == 1, cqltype.subtypes super().__init__(cqltype) self.deserializer = self.deserializers[0] @@ -182,7 +184,7 @@ cdef class DesListType(_DesSingleParamType): result = _deserialize_list_or_set[uint16_t]( v2_and_below, buf, protocol_version, self.deserializer) - return self.adapter(result) + return 
self.cqltype.adapter(result) DesSetType = DesListType @@ -214,7 +216,7 @@ cdef list _deserialize_list_or_set(itemlen_t dummy_version, sub_buf.ptr = buf.ptr + p sub_buf.size = itemlen p += itemlen - result.append(deserializer.deserialize(&sub_buf, protocol_version)) + result.append(from_binary(deserializer, &sub_buf, protocol_version)) return result @@ -284,8 +286,8 @@ cdef _deserialize_map(itemlen_t dummy_version, val_buf.ptr = buf.ptr + p val_buf.size = val_len p += val_len - key = key_deserializer.deserialize(&key_buf, protocol_version) - val = val_deserializer.deserialize(&val_buf, protocol_version) + key = from_binary(key_deserializer, &key_buf, protocol_version) + val = from_binary(val_deserializer, &val_buf, protocol_version) themap._insert_unchecked(key, to_bytes(&key_buf), val) return themap @@ -316,7 +318,7 @@ cdef class DesTupleType(_DesParameterizedType): item_buf.ptr = buf.ptr + p item_buf.size = itemlen deserializer = self.deserializers[i] - item = deserializer.deserialize(&item_buf, protocol_version) + item = from_binary(deserializer, &item_buf, protocol_version) p += itemlen tuple_set(res, i, item) @@ -355,7 +357,7 @@ cdef class DesCompositeType(_DesParameterizedType): buf.ptr = buf.ptr + 2 + element_length + 1 buf.size = buf.size - (2 + element_length + 1) deserializer = self.deserializers[i] - item = deserializer.deserialize(&elem_buf, protocol_version) + item = from_binary(deserializer, &elem_buf, protocol_version) tuple_set(res, i, item) return res @@ -366,12 +368,26 @@ DesDynamicCompositeType = DesCompositeType cdef class DesReversedType(_DesSingleParamType): cdef deserialize(self, Buffer *buf, int protocol_version): - return self.deserializer.deserialize(buf, protocol_version) + return from_binary(self.deserializer, buf, protocol_version) cdef class DesFrozenType(_DesSingleParamType): cdef deserialize(self, Buffer *buf, int protocol_version): - return self.deserializer.deserialize(buf, protocol_version) + return from_binary(self.deserializer, buf, protocol_version) + +#-------------------------------------------------------------------------- + +cdef _ret_empty(Deserializer deserializer, Py_ssize_t buf_size): + """ + Decide whether to return None or EMPTY when a buffer size is + zero or negative. This is used by from_binary in deserializers.pxd. 
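+
+    A negative size means the value was NULL in the database and always maps
+    to None; a zero size maps to cqltypes.EMPTY only when the column's
+    cqltype reports support_empty_values, and to None otherwise.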
+ """ + if buf_size < 0: + return None + elif deserializer.cqltype.support_empty_values: + return cqltypes.EMPTY + else: + return None #-------------------------------------------------------------------------- # Generic deserialization @@ -381,11 +397,6 @@ cdef class GenericDeserializer(Deserializer): Wrap a generic datatype for deserialization """ - cdef object cqltype - - def __init__(self, cqltype): - self.cqltype = cqltype - cdef deserialize(self, Buffer *buf, int protocol_version): return self.cqltype.deserialize(to_bytes(buf), protocol_version) @@ -401,31 +412,33 @@ def make_deserializers(cqltypes): cpdef Deserializer find_deserializer(cqltype): """Find a deserializer for a cqltype""" name = 'Des' + cqltype.__name__ + if name in globals(): - deserializer_cls = globals()[name] - deserializer_cls() + cls = globals()[name] elif issubclass(cqltype, cqltypes.ListType): - return DesListType + cls = DesListType elif issubclass(cqltype, cqltypes.SetType): - return DesSetType + cls = DesSetType elif issubclass(cqltype, cqltypes.MapType): - return DesMapType + cls = DesMapType elif issubclass(cqltype, cqltypes.UserType): # UserType is a subclass of TupleType, so should precede it - return DesUserType + cls = DesUserType elif issubclass(cqltype, cqltypes.TupleType): - return DesTupleType + cls = DesTupleType elif issubclass(cqltype, cqltypes.DynamicCompositeType): # DynamicCompositeType is a subclass of CompositeType, so should precede it - return DesDynamicCompositeType + cls = DesDynamicCompositeType elif issubclass(cqltype, cqltypes.CompositeType): - return DesCompositeType + cls = DesCompositeType elif issubclass(cqltype, cqltypes.ReversedType): - return DesReversedType + cls = DesReversedType elif issubclass(cqltype, cqltypes.FrozenType): - return DesFrozenType + cls = DesFrozenType + else: + cls = GenericDeserializer - return GenericDeserializer(cqltype) + return cls(cqltype) def obj_array(list objs): diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx index 0d6da6e4..d5aeff6c 100644 --- a/cassandra/ioutils.pyx +++ b/cassandra/ioutils.pyx @@ -12,7 +12,7 @@ cdef inline int get_buf(BytesIOReader reader, Buffer *buf_out) except -1: """ cdef Py_ssize_t raw_val_size = read_int(reader) if raw_val_size < 0: - raise ValueError("Expected positive item size") + raw_val_size = 0 buf_out.ptr = reader.read(raw_val_size) buf_out.size = raw_val_size diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 7be86400..8499d938 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -17,7 +17,7 @@ from libc.stdint cimport uint64_t from cpython.ref cimport Py_INCREF, PyObject from cassandra.bytesio cimport BytesIOReader -from cassandra.deserializers cimport Deserializer +from cassandra.deserializers cimport Deserializer, from_binary from cassandra.parsing cimport ParseDesc, ColumnParser, RowParser from cassandra import cqltypes from cassandra.util import is_little_endian @@ -125,9 +125,11 @@ cdef inline int unpack_row( get_buf(reader, &buf) arr = arrays[i] + if buf.size == 0: + raise ValueError("Cannot handle NULL value") if arr.is_object: deserializer = desc.deserializers[i] - val = deserializer.deserialize(&buf, desc.protocol_version) + val = from_binary(deserializer, &buf, desc.protocol_version) Py_INCREF(val) ( arr.buf_ptr)[0] = val else: diff --git a/cassandra/objparser.pyx b/cassandra/objparser.pyx index d4642cbd..8aca1427 100644 --- a/cassandra/objparser.pyx +++ b/cassandra/objparser.pyx @@ -1,7 +1,7 @@ include "ioutils.pyx" from cassandra.bytesio cimport 
BytesIOReader -from cassandra.deserializers cimport Deserializer +from cassandra.deserializers cimport Deserializer, from_binary from cassandra.parsing cimport ParseDesc, ColumnParser, RowParser from cassandra.tuple cimport tuple_new, tuple_set @@ -49,12 +49,9 @@ cdef class TupleRowParser(RowParser): # Read the next few bytes get_buf(reader, &buf) - if buf.size == 0: - val = None - else: - # Deserialize bytes to python object - deserializer = desc.deserializers[i] - val = deserializer.deserialize(&buf, desc.protocol_version) + # Deserialize bytes to python object + deserializer = desc.deserializers[i] + val = from_binary(deserializer, &buf, desc.protocol_version) # Insert new object into tuple tuple_set(res, i, val) From 7ce8e3a3c17450589dc253b3387053cd8c6efa5e Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 21:45:49 +0100 Subject: [PATCH 37/70] Use Cython-based deserializers whenever available --- cassandra/cython_deps.py | 5 ++ cassandra/protocol.py | 65 ++++++++++++++++++- cassandra/rowparser.pyx | 26 ++++++++ .../standard/test_cython_protocol_handlers.py | 18 ++--- 4 files changed, 102 insertions(+), 12 deletions(-) create mode 100644 cassandra/cython_deps.py create mode 100644 cassandra/rowparser.pyx diff --git a/cassandra/cython_deps.py b/cassandra/cython_deps.py new file mode 100644 index 00000000..41516426 --- /dev/null +++ b/cassandra/cython_deps.py @@ -0,0 +1,5 @@ +try: + from cassandra.rowparser import make_recv_results_rows + HAVE_CYTHON = True +except ImportError: + HAVE_CYTHON = False \ No newline at end of file diff --git a/cassandra/protocol.py b/cassandra/protocol.py index a6ce22ec..de8a464d 100644 --- a/cassandra/protocol.py +++ b/cassandra/protocol.py @@ -40,6 +40,7 @@ from cassandra.cqltypes import (AsciiType, BytesType, BooleanType, TupleType, lookup_casstype, SimpleDateType, TimeType, ByteType, ShortType) from cassandra.policies import WriteType +from cassandra.cython_deps import HAVE_CYTHON from cassandra import util log = logging.getLogger(__name__) @@ -69,10 +70,16 @@ _message_types_by_opcode = {} _UNSET_VALUE = object() +def register_class(cls): + _message_types_by_opcode[cls.opcode] = cls + +def get_registered_classes(): + return _message_types_by_opcode.copy() + class _RegisterMessageType(type): def __init__(cls, name, bases, dct): if not name.startswith('_'): - _message_types_by_opcode[cls.opcode] = cls + register_class(cls) @six.add_metaclass(_RegisterMessageType) @@ -987,6 +994,62 @@ class ProtocolHandler(object): return msg +def cython_protocol_handler(colparser): + """ + Given a column parser to deserialize ResultMessages, return a suitable + Cython-based protocol handler. + + There are three Cython-based protocol handlers (least to most performant): + + 1. objparser.ListParser + this parser decodes result messages into a list of tuples + + 2. objparser.LazyParser + this parser decodes result messages lazily by returning an iterator + + 3. numpyparser.NumPyParser + this parser decodes result messages into NumPy arrays + + The default is to use objparser.ListParser + """ + # TODO: It may be cleaner to turn ProtocolHandler and ResultMessage into + # TODO: instances and use methods instead of class methods + from cassandra.rowparser import make_recv_results_rows + + class FastResultMessage(ResultMessage): + """ + Cython version of Result Message that has a faster implementation of + recv_results_row. 
+ """ + # type_codes = ResultMessage.type_codes.copy() + code_to_type = dict((v, k) for k, v in ResultMessage.type_codes.items()) + recv_results_rows = classmethod(make_recv_results_rows(colparser)) + + class CythonProtocolHandler(ProtocolHandler): + """ + Use FastResultMessage to decode query result message messages. + """ + + my_opcodes = ProtocolHandler.message_types_by_opcode.copy() + my_opcodes[FastResultMessage.opcode] = FastResultMessage + message_types_by_opcode = my_opcodes + + return CythonProtocolHandler + + +if HAVE_CYTHON: + from cassandra.objparser import ListParser, LazyParser + from cassandra.numpyparser import NumpyParser + + ProtocolHandler = cython_protocol_handler(ListParser()) + LazyProtocolHandler = cython_protocol_handler(LazyParser()) + NumpyProtocolHandler = cython_protocol_handler(NumpyParser()) +else: + # Use Python-based ProtocolHandler + LazyProtocolHandler = None + NumpyProtocolHandler = None + + def read_byte(f): return int8_unpack(f.read(1)) diff --git a/cassandra/rowparser.pyx b/cassandra/rowparser.pyx new file mode 100644 index 00000000..1c855769 --- /dev/null +++ b/cassandra/rowparser.pyx @@ -0,0 +1,26 @@ +# -- cython: profile=True + +from cassandra.parsing cimport ParseDesc, ColumnParser +from cassandra.deserializers import make_deserializers + +include "ioutils.pyx" + +def make_recv_results_rows(ColumnParser colparser): + def recv_results_rows(cls, f, int protocol_version, user_type_map): + """ + Parse protocol data given as a BytesIO f into a set of columns (e.g. list of tuples) + This is used as the recv_results_rows method of (Fast)ResultMessage + """ + paging_state, column_metadata = cls.recv_results_metadata(f, user_type_map) + + colnames = [c[2] for c in column_metadata] + coltypes = [c[3] for c in column_metadata] + + desc = ParseDesc(colnames, coltypes, make_deserializers(coltypes), + protocol_version) + reader = BytesIOReader(f.read()) + parsed_rows = colparser.parse_rows(reader, desc) + + return (paging_state, (colnames, parsed_rows)) + + return recv_results_rows diff --git a/tests/integration/standard/test_cython_protocol_handlers.py b/tests/integration/standard/test_cython_protocol_handlers.py index 059c9317..ba75cf72 100644 --- a/tests/integration/standard/test_cython_protocol_handlers.py +++ b/tests/integration/standard/test_cython_protocol_handlers.py @@ -8,19 +8,15 @@ except ImportError: import unittest from cassandra.cluster import Cluster +from cassandra.protocol import ProtocolHandler, LazyProtocolHandler, NumpyProtocolHandler from tests.integration import use_singledc, PROTOCOL_VERSION from tests.integration.datatype_utils import update_datatypes from tests.integration.standard.utils import create_table_with_all_types, get_all_primitive_params -from six import next -try: - from cassandra.cython_protocol_handler import make_protocol_handler -except ImportError as e: +from cassandra.cython_deps import HAVE_CYTHON +if not HAVE_CYTHON: raise unittest.SkipTest("Skipping test, not compiled with Cython enabled") -from cassandra.numpyparser import NumpyParser -from cassandra.objparser import ListParser, LazyParser - def setup_module(): use_singledc() @@ -47,20 +43,20 @@ class CustomProtocolHandlerTest(unittest.TestCase): """ Test Cython-based parser that returns a list of tuples """ - self.cython_parser(ListParser()) + self.cython_parser(ProtocolHandler) def test_cython_lazy_parser(self): """ Test Cython-based parser that returns a list of tuples """ - self.cython_parser(LazyParser()) + self.cython_parser(LazyProtocolHandler) - def 
cython_parser(self, colparser): + def cython_parser(self, protocol_handler): cluster = Cluster(protocol_version=PROTOCOL_VERSION) session = cluster.connect(keyspace="testspace") # use our custom protocol handler - session.client_protocol_handler = make_protocol_handler(colparser) + session.client_protocol_handler = protocol_handler # session.row_factory = tuple_factory # verify data From e4e98e7e9fb30dac5815c05c0d008ab37dec2986 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Thu, 6 Aug 2015 21:59:07 +0100 Subject: [PATCH 38/70] Some small optimizations to deserializers --- cassandra/deserializers.pyx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 8924e13c..008e49cd 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -51,7 +51,9 @@ cdef class DesUUIDType(Deserializer): cdef class DesBooleanType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - return bool(int8_unpack(buf.ptr)) + if int8_unpack(buf.ptr): + return True + return False cdef class DesByteType(Deserializer): @@ -184,10 +186,11 @@ cdef class DesListType(_DesSingleParamType): result = _deserialize_list_or_set[uint16_t]( v2_and_below, buf, protocol_version, self.deserializer) - return self.cqltype.adapter(result) + return result - -DesSetType = DesListType +cdef class DesSetType(DesListType): + cdef deserialize(self, Buffer *buf, int protocol_version): + return util.sortedset(DesListType.deserialize(self, buf, protocol_version)) ctypedef fused itemlen_t: From a3c73f6670e34ec96423b213b2a14e78fb96ec24 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 7 Aug 2015 09:22:33 +0100 Subject: [PATCH 39/70] Forgot to initialize datetime C API --- cassandra/cython_utils.pyx | 15 +++++++++++---- cassandra/deserializers.pyx | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/cassandra/cython_utils.pyx b/cassandra/cython_utils.pyx index 677b8009..7ee385ec 100644 --- a/cassandra/cython_utils.pyx +++ b/cassandra/cython_utils.pyx @@ -3,10 +3,15 @@ Duplicate module of util.py, with some accelerated functions used for deserialization. """ -from cpython.datetime cimport timedelta_new - # cdef inline object timedelta_new(int days, int seconds, int useconds) - # Create timedelta object using DateTime CAPI factory function. - # Note, there are no range checks for any of the arguments. +from cpython.datetime cimport ( + timedelta_new, + # cdef inline object timedelta_new(int days, int seconds, int useconds) + # Create timedelta object using DateTime CAPI factory function. + # Note, there are no range checks for any of the arguments. + import_datetime, + # Datetime C API initialization function. + # You have to call it before any usage of DateTime CAPI functions. 
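+    # (import_datetime() is called once at module scope below, so the
+    # C API is initialized before datetime_from_timestamp is ever used.)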
+ ) import datetime import sys @@ -16,5 +21,7 @@ DATETIME_EPOC = datetime.datetime(1970, 1, 1) assert sys.byteorder in ('little', 'big') is_little_endian = sys.byteorder == 'little' +import_datetime() + cdef datetime_from_timestamp(timestamp): return DATETIME_EPOC + timedelta_new(0, timestamp, 0) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 008e49cd..aefd6bac 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -259,7 +259,7 @@ cdef class DesMapType(_DesParameterizedType): self.key_deserializer, self.val_deserializer, key_type, val_type) - return self.adapter(result) + return result cdef _deserialize_map(itemlen_t dummy_version, From 56a25df0fce5d438a06042e7072c1ee3ddf85d43 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 7 Aug 2015 10:16:47 +0100 Subject: [PATCH 40/70] Don't lose out on microseconds when creating datetime --- cassandra/cython_utils.pyx | 6 ++++-- cassandra/deserializers.pyx | 7 ++----- .../cqlengine/columns/test_container_columns.py | 3 ++- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cassandra/cython_utils.pyx b/cassandra/cython_utils.pyx index 7ee385ec..de87c1e0 100644 --- a/cassandra/cython_utils.pyx +++ b/cassandra/cython_utils.pyx @@ -23,5 +23,7 @@ is_little_endian = sys.byteorder == 'little' import_datetime() -cdef datetime_from_timestamp(timestamp): - return DATETIME_EPOC + timedelta_new(0, timestamp, 0) +cdef datetime_from_timestamp(double timestamp): + cdef int seconds = timestamp + cdef int microseconds = ( (timestamp * 1000000)) % 1000000 + return DATETIME_EPOC + timedelta_new(0, seconds, microseconds) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index aefd6bac..35667694 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -3,19 +3,16 @@ from libc.stdint cimport int32_t, uint16_t include 'marshal.pyx' -include 'cython_utils.pyx' from cassandra.buffer cimport Buffer, to_bytes -from cassandra.parsing cimport ParseDesc, RowParser +from cassandra.cython_utils cimport datetime_from_timestamp from cython.view cimport array as cython_array from cassandra.tuple cimport tuple_new, tuple_set import socket -import inspect from decimal import Decimal from uuid import UUID -from cassandra.objparser import TupleRowParser from cassandra import cqltypes from cassandra import util @@ -107,7 +104,7 @@ cdef class DesCounterColumnType(DesLongType): cdef class DesDateType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - timestamp = int64_unpack(buf.ptr) / 1000.0 + cdef double timestamp = int64_unpack(buf.ptr) / 1000.0 return datetime_from_timestamp(timestamp) diff --git a/tests/integration/cqlengine/columns/test_container_columns.py b/tests/integration/cqlengine/columns/test_container_columns.py index 213c625c..ad67419c 100644 --- a/tests/integration/cqlengine/columns/test_container_columns.py +++ b/tests/integration/cqlengine/columns/test_container_columns.py @@ -386,7 +386,8 @@ class TestMapColumn(BaseCassEngTestCase): k2 = uuid4() now = datetime.now() then = now + timedelta(days=1) - m1 = TestMapModel.create(int_map={1: k1, 2: k2}, text_map={'now': now, 'then': then}) + m1 = TestMapModel.create(int_map={1: k1, 2: k2}, + text_map={'now': now, 'then': then}) m2 = TestMapModel.get(partition=m1.partition) self.assertTrue(isinstance(m2.int_map, dict)) From 8462e865f7c2d089884aa061b39275323605f43a Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 7 Aug 2015 10:38:16 +0100 Subject: [PATCH 41/70] Squash little bug in 
cython decimal deserializer --- cassandra/deserializers.pyx | 7 ++++++- cassandra/marshal.pyx | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 35667694..ad1e5d1a 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -36,8 +36,13 @@ cdef class DesLongType(Deserializer): # TODO: Use libmpdec: http://www.bytereef.org/mpdecimal/index.html cdef class DesDecimalType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): + cdef Buffer varint_buf + varint_buf.ptr = buf.ptr + 4 + varint_buf.size = buf.size - 4 + scale = int32_unpack(buf.ptr) - unscaled = varint_unpack(buf.ptr + 4) + unscaled = varint_unpack(to_bytes(&varint_buf)) + return Decimal('%de%d' % (unscaled, -scale)) diff --git a/cassandra/marshal.pyx b/cassandra/marshal.pyx index 336ee1c7..0ab65c46 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/marshal.pyx @@ -177,6 +177,7 @@ cpdef varint_unpack(term): # TODO: Optimize these two functions def varint_unpack_py3(term): cdef int64_t one = 1L + val = int(''.join("%02x" % i for i in term), 16) if (term[0] & 128) != 0: # There is a bug in Cython (0.20 - 0.22), where if we do From 0cec7fef76287f710bab39b3bb2888b4621a5ad3 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 7 Aug 2015 12:03:54 +0100 Subject: [PATCH 42/70] Some more small bug fixes to Cython-based deserializers --- cassandra/deserializers.pyx | 11 ++++++++++- tests/integration/cqlengine/query/test_queryset.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index ad1e5d1a..151d7747 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -95,7 +95,7 @@ cdef class DesInetAddressType(Deserializer): cdef bytes byts = to_bytes(buf) # TODO: optimize inet_ntop, inet_ntoa - if len(buf.size) == 16: + if buf.size == 16: return util.inet_ntop(socket.AF_INET6, byts) else: # util.inet_pton could also handle, but this is faster @@ -162,6 +162,7 @@ cdef class _DesParameterizedType(Deserializer): super().__init__(cqltype) self.subtypes = cqltype.subtypes self.deserializers = make_deserializers(cqltype.subtypes) + self.subtypes_len = len(self.subtypes) cdef class _DesSingleParamType(_DesParameterizedType): @@ -352,6 +353,14 @@ cdef class DesCompositeType(_DesParameterizedType): for i in range(self.subtypes_len): if not buf.size: # CompositeType can have missing elements at the end + + # Fill the tuple with None values and slice it + # + # (I'm not sure a tuple needs to be fully initialized before + # it can be destroyed, so play it safe) + for j in range(i, self.subtypes_len): + tuple_set(res, j, None) + res = res[:i] break element_length = uint16_unpack(buf.ptr) diff --git a/tests/integration/cqlengine/query/test_queryset.py b/tests/integration/cqlengine/query/test_queryset.py index 7bb101b9..45277520 100644 --- a/tests/integration/cqlengine/query/test_queryset.py +++ b/tests/integration/cqlengine/query/test_queryset.py @@ -629,7 +629,7 @@ class TestMinMaxTimeUUIDFunctions(BaseCassEngTestCase): # test kwarg filtering q = TimeUUIDQueryModel.filter(partition=pk, time__lte=functions.MaxTimeUUID(midpoint)) q = [d for d in q] - assert len(q) == 2 + self.assertEqual(len(q), 2, msg="Got: %s" % q) datas = [d.data for d in q] assert '1' in datas assert '2' in datas From e4e0e219ee8123e42dc032483bb3bd036535a064 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 7 Aug 2015 14:22:58 +0100 Subject: [PATCH 43/70] Forgot pxd 
file for cython_utils --- cassandra/cython_utils.pxd | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 cassandra/cython_utils.pxd diff --git a/cassandra/cython_utils.pxd b/cassandra/cython_utils.pxd new file mode 100644 index 00000000..d2bf7d20 --- /dev/null +++ b/cassandra/cython_utils.pxd @@ -0,0 +1,2 @@ +from libc.stdint cimport int64_t +cdef datetime_from_timestamp(double timestamp) \ No newline at end of file From f31772e8778bc23cc3102f01c58f9fa95037284a Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 7 Aug 2015 14:55:49 +0100 Subject: [PATCH 44/70] Fix bug in integration test test_types --- cassandra/util.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cassandra/util.py b/cassandra/util.py index 4cf3879e..0e8a818b 100644 --- a/cassandra/util.py +++ b/cassandra/util.py @@ -493,8 +493,7 @@ except ImportError: def __init__(self, iterable=()): self._items = [] - for i in iterable: - self.add(i) + self.update(iterable) def __len__(self): return len(self._items) @@ -567,6 +566,10 @@ except ImportError: else: self._items.append(item) + def update(self, iterable): + for i in iterable: + self.add(i) + def clear(self): del self._items[:] From 302d7ab1d1a517d5a5669bf25e90c371c087634e Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 7 Aug 2015 15:34:06 +0100 Subject: [PATCH 45/70] Forgot to initialize flags in string types to support empty strings --- cassandra/deserializers.pyx | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 151d7747..a0d55340 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -64,6 +64,11 @@ cdef class DesByteType(Deserializer): cdef class DesAsciiType(Deserializer): + + def __init__(self, cqltype): + super().__init__(cqltype) + self.empty_binary_ok = True + cdef deserialize(self, Buffer *buf, int protocol_version): if six.PY2: return to_bytes(buf) @@ -144,6 +149,10 @@ cdef class DesTimeType(Deserializer): cdef class DesUTF8Type(Deserializer): + def __init__(self, cqltype): + super().__init__(cqltype) + self.empty_binary_ok = True + cdef deserialize(self, Buffer *buf, int protocol_version): return to_bytes(buf).decode('utf8') From 0baf6965204c15a27d34d4ef9973e1662f9a624e Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 7 Aug 2015 15:46:04 +0100 Subject: [PATCH 46/70] Fix use of next() in test_concurrent --- tests/integration/standard/test_concurrent.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/standard/test_concurrent.py b/tests/integration/standard/test_concurrent.py index 45b73613..bf928b8e 100644 --- a/tests/integration/standard/test_concurrent.py +++ b/tests/integration/standard/test_concurrent.py @@ -24,6 +24,8 @@ from cassandra.query import tuple_factory, SimpleStatement from tests.integration import use_singledc, PROTOCOL_VERSION +from six import next + try: import unittest2 as unittest except ImportError: @@ -151,7 +153,7 @@ class ClusterTests(unittest.TestCase): results = self.execute_concurrent_args_helper(self.session, statement, parameters, results_generator=True) for i in range(num_statements): - result = results.next() + result = next(results) self.assertEqual((True, [(i,)]), result) def test_execute_concurrent_paged_result(self): From f3e2295457fd19099d32f897114dc37d2dfb2269 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 7 Aug 2015 16:12:06 +0100 Subject: [PATCH 47/70] Python 3 compatibility for stress tests --- 
tests/stress_tests/test_multi_inserts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/stress_tests/test_multi_inserts.py b/tests/stress_tests/test_multi_inserts.py index b23a29dd..12b5b70e 100644 --- a/tests/stress_tests/test_multi_inserts.py +++ b/tests/stress_tests/test_multi_inserts.py @@ -75,7 +75,7 @@ class StressInsertsTests(unittest.TestCase): break for conn in pool.get_connections(): if conn.in_flight > 1: - print self.session.get_pool_state() + print(self.session.get_pool_state()) leaking_connections = True break i = i + 1 From 3b6b720c4be2a84d815a768c86ac4209603ed8cb Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Fri, 7 Aug 2015 17:06:30 +0100 Subject: [PATCH 48/70] Be more careful how optional values are decoded --- cassandra/deserializers.pxd | 7 +++- cassandra/ioutils.pyx | 13 ++++-- .../cqlengine/query/test_updates.py | 42 +++++++++---------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/cassandra/deserializers.pxd b/cassandra/deserializers.pxd index 5b820061..015fda37 100644 --- a/cassandra/deserializers.pxd +++ b/cassandra/deserializers.pxd @@ -21,8 +21,11 @@ cdef class Deserializer: cdef inline object from_binary(Deserializer deserializer, Buffer *buf, int protocol_version): - if buf.size <= 0 and not deserializer.empty_binary_ok: + if buf.size < 0: + return None + elif buf.size == 0 and not deserializer.empty_binary_ok: return _ret_empty(deserializer, buf.size) - return deserializer.deserialize(buf, protocol_version) + else: + return deserializer.deserialize(buf, protocol_version) cdef _ret_empty(Deserializer deserializer, Py_ssize_t buf_size) diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx index d5aeff6c..203997e9 100644 --- a/cassandra/ioutils.pyx +++ b/cassandra/ioutils.pyx @@ -9,12 +9,19 @@ cdef inline int get_buf(BytesIOReader reader, Buffer *buf_out) except -1: """ Get a pointer into the buffer provided by BytesIOReader for the next data item in the stream of values. + + BEWARE: + If the next item has a zero negative size, the pointer will be set to NULL. + A negative size happens when the value is NULL in the database, whereas a + zero size may happen either for legacy reasons, or for data types such as + strings (which may be empty). 
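+    from_binary() in deserializers.pxd checks buf.size for exactly these
+    cases before a value is deserialized.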
""" cdef Py_ssize_t raw_val_size = read_int(reader) - if raw_val_size < 0: - raw_val_size = 0 + if raw_val_size <= 0: + buf_out.ptr = NULL + else: + buf_out.ptr = reader.read(raw_val_size) - buf_out.ptr = reader.read(raw_val_size) buf_out.size = raw_val_size return 0 diff --git a/tests/integration/cqlengine/query/test_updates.py b/tests/integration/cqlengine/query/test_updates.py index a3b80f15..6c539012 100644 --- a/tests/integration/cqlengine/query/test_updates.py +++ b/tests/integration/cqlengine/query/test_updates.py @@ -52,17 +52,17 @@ class QueryUpdateTests(BaseCassEngTestCase): # sanity check for i, row in enumerate(TestQueryUpdateModel.objects(partition=partition)): - assert row.cluster == i - assert row.count == i - assert row.text == str(i) + self.assertEqual(row.cluster, i) + self.assertEqual(row.count, i) + self.assertEqual(row.text, str(i)) # perform update TestQueryUpdateModel.objects(partition=partition, cluster=3).update(count=6) for i, row in enumerate(TestQueryUpdateModel.objects(partition=partition)): - assert row.cluster == i - assert row.count == (6 if i == 3 else i) - assert row.text == str(i) + self.assertEqual(row.cluster, i) + self.assertEqual(row.count, 6 if i == 3 else i) + self.assertEqual(row.text, str(i)) def test_update_values_validation(self): """ tests calling udpate on models with values passed in """ @@ -72,9 +72,9 @@ class QueryUpdateTests(BaseCassEngTestCase): # sanity check for i, row in enumerate(TestQueryUpdateModel.objects(partition=partition)): - assert row.cluster == i - assert row.count == i - assert row.text == str(i) + self.assertEqual(row.cluster, i) + self.assertEqual(row.count, i) + self.assertEqual(row.text, str(i)) # perform update with self.assertRaises(ValidationError): @@ -98,17 +98,17 @@ class QueryUpdateTests(BaseCassEngTestCase): # sanity check for i, row in enumerate(TestQueryUpdateModel.objects(partition=partition)): - assert row.cluster == i - assert row.count == i - assert row.text == str(i) + self.assertEqual(row.cluster, i) + self.assertEqual(row.count, i) + self.assertEqual(row.text, str(i)) # perform update TestQueryUpdateModel.objects(partition=partition, cluster=3).update(text=None) for i, row in enumerate(TestQueryUpdateModel.objects(partition=partition)): - assert row.cluster == i - assert row.count == i - assert row.text == (None if i == 3 else str(i)) + self.assertEqual(row.cluster, i) + self.assertEqual(row.count, i) + self.assertEqual(row.text, None if i == 3 else str(i)) def test_mixed_value_and_null_update(self): """ tests that updating a columns value, and removing another works properly """ @@ -118,17 +118,17 @@ class QueryUpdateTests(BaseCassEngTestCase): # sanity check for i, row in enumerate(TestQueryUpdateModel.objects(partition=partition)): - assert row.cluster == i - assert row.count == i - assert row.text == str(i) + self.assertEqual(row.cluster, i) + self.assertEqual(row.count, i) + self.assertEqual(row.text, str(i)) # perform update TestQueryUpdateModel.objects(partition=partition, cluster=3).update(count=6, text=None) for i, row in enumerate(TestQueryUpdateModel.objects(partition=partition)): - assert row.cluster == i - assert row.count == (6 if i == 3 else i) - assert row.text == (None if i == 3 else str(i)) + self.assertEqual(row.cluster, i) + self.assertEqual(row.count, 6 if i == 3 else i) + self.assertEqual(row.text, None if i == 3 else str(i)) def test_counter_updates(self): pass From 53b2b48f582b2079b48b5336a06aeb97089fe042 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Sat, 8 Aug 2015 
12:28:53 +0100 Subject: [PATCH 49/70] Be more careful when Cython is available but NumPy is not --- cassandra/cython_deps.py | 8 +++++++- cassandra/protocol.py | 11 +++++++---- tests/unit/cython/utils.py | 12 ++++-------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/cassandra/cython_deps.py b/cassandra/cython_deps.py index 41516426..fdd15464 100644 --- a/cassandra/cython_deps.py +++ b/cassandra/cython_deps.py @@ -2,4 +2,10 @@ try: from cassandra.rowparser import make_recv_results_rows HAVE_CYTHON = True except ImportError: - HAVE_CYTHON = False \ No newline at end of file + HAVE_CYTHON = False + +try: + import numpy + HAVE_NUMPY = True +except ImportError: + HAVE_NUMPY = False diff --git a/cassandra/protocol.py b/cassandra/protocol.py index de8a464d..5ebbfa5c 100644 --- a/cassandra/protocol.py +++ b/cassandra/protocol.py @@ -40,7 +40,7 @@ from cassandra.cqltypes import (AsciiType, BytesType, BooleanType, TupleType, lookup_casstype, SimpleDateType, TimeType, ByteType, ShortType) from cassandra.policies import WriteType -from cassandra.cython_deps import HAVE_CYTHON +from cassandra.cython_deps import HAVE_CYTHON, HAVE_NUMPY from cassandra import util log = logging.getLogger(__name__) @@ -1039,14 +1039,17 @@ def cython_protocol_handler(colparser): if HAVE_CYTHON: from cassandra.objparser import ListParser, LazyParser - from cassandra.numpyparser import NumpyParser - ProtocolHandler = cython_protocol_handler(ListParser()) LazyProtocolHandler = cython_protocol_handler(LazyParser()) - NumpyProtocolHandler = cython_protocol_handler(NumpyParser()) else: # Use Python-based ProtocolHandler LazyProtocolHandler = None + + +if HAVE_CYTHON and HAVE_NUMPY: + from cassandra.numpyparser import NumpyParser + NumpyProtocolHandler = cython_protocol_handler(NumpyParser()) +else: NumpyProtocolHandler = None diff --git a/tests/unit/cython/utils.py b/tests/unit/cython/utils.py index eea4698f..f2598c0e 100644 --- a/tests/unit/cython/utils.py +++ b/tests/unit/cython/utils.py @@ -1,9 +1,4 @@ -try: - import tests.unit.cython.dummy_module -except ImportError: - have_cython = False -else: - have_cython = True +from cassandra.cython_deps import HAVE_CYTHON, HAVE_NUMPY try: import unittest2 as unittest @@ -18,10 +13,11 @@ def cyimport(import_path): try: return __import__(import_path, fromlist=True) except ImportError: - if have_cython: + if HAVE_CYTHON: raise return None # @cythontest # def test_something(self): ... 
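+# Tests that also need NumPy can be skipped the same way, e.g.
+# (usage sketch only):
+#
+# @numpytest
+# def test_something_numpy(self): ...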
-cythontest = unittest.skipUnless(have_cython, 'Cython is not available') +cythontest = unittest.skipUnless(HAVE_CYTHON, 'Cython is not available') +numpytest = unittest.skipUnless(HAVE_CYTHON and HAVE_NUMPY, 'NumPy is not available') \ No newline at end of file From 0b81f4068bdeb9d09259b0db255660e13e12b1fd Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Sat, 8 Aug 2015 12:31:53 +0100 Subject: [PATCH 50/70] Forgot BytesDeserializer, fix small empty string issue --- cassandra/deserializers.pyx | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index a0d55340..83582cfc 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -16,21 +16,20 @@ from uuid import UUID from cassandra import cqltypes from cassandra import util - cdef class Deserializer: """Cython-based deserializer class for a cqltype""" def __init__(self, cqltype): self.cqltype = cqltype - self.empty_binary_ok = False + self.empty_binary_ok = cqltype.empty_binary_ok cdef deserialize(self, Buffer *buf, int protocol_version): raise NotImplementedError -cdef class DesLongType(Deserializer): +cdef class DesBytesType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - return int64_unpack(buf.ptr) + return to_bytes(buf) # TODO: Use libmpdec: http://www.bytereef.org/mpdecimal/index.html @@ -64,11 +63,6 @@ cdef class DesByteType(Deserializer): cdef class DesAsciiType(Deserializer): - - def __init__(self, cqltype): - super().__init__(cqltype) - self.empty_binary_ok = True - cdef deserialize(self, Buffer *buf, int protocol_version): if six.PY2: return to_bytes(buf) @@ -85,6 +79,11 @@ cdef class DesDoubleType(Deserializer): return double_unpack(buf.ptr) +cdef class DesLongType(Deserializer): + cdef deserialize(self, Buffer *buf, int protocol_version): + return int64_unpack(buf.ptr) + + cdef class DesInt32Type(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): return int32_unpack(buf.ptr) @@ -149,10 +148,6 @@ cdef class DesTimeType(Deserializer): cdef class DesUTF8Type(Deserializer): - def __init__(self, cqltype): - super().__init__(cqltype) - self.empty_binary_ok = True - cdef deserialize(self, Buffer *buf, int protocol_version): return to_bytes(buf).decode('utf8') @@ -320,21 +315,24 @@ cdef class DesTupleType(_DesParameterizedType): cdef Buffer item_buf cdef Deserializer deserializer + # collections inside UDTs are always encoded with at least the + # version 3 format protocol_version = max(3, protocol_version) p = 0 values = [] for i in range(self.subtypes_len): item = None - if p != buf.size: + if p < buf.size: itemlen = int32_unpack(buf.ptr + p) p += 4 if itemlen >= 0: item_buf.ptr = buf.ptr + p item_buf.size = itemlen + p += itemlen + deserializer = self.deserializers[i] item = from_binary(deserializer, &item_buf, protocol_version) - p += itemlen tuple_set(res, i, item) @@ -423,6 +421,9 @@ cdef class GenericDeserializer(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): return self.cqltype.deserialize(to_bytes(buf), protocol_version) + def __repr__(self): + return "GenericDeserializer(%s)" % (self.cqltype,) + #-------------------------------------------------------------------------- # Helper utilities From 8d45880acb5834275708b358bfa4304455e04b00 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Sat, 8 Aug 2015 12:39:05 +0100 Subject: [PATCH 51/70] Python 3 compatibility for test_schema --- tests/integration/long/test_schema.py | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/long/test_schema.py b/tests/integration/long/test_schema.py index 7da5203f..6f165cfd 100644 --- a/tests/integration/long/test_schema.py +++ b/tests/integration/long/test_schema.py @@ -85,11 +85,11 @@ class SchemaTests(unittest.TestCase): session = self.session - for i in xrange(30): + for i in range(30): execute_until_pass(session, "CREATE KEYSPACE test_{0} WITH replication = {{'class': 'SimpleStrategy', 'replication_factor': 1}}".format(i)) execute_until_pass(session, "CREATE TABLE test_{0}.cf (key int PRIMARY KEY, value int)".format(i)) - for j in xrange(100): + for j in range(100): execute_until_pass(session, "INSERT INTO test_{0}.cf (key, value) VALUES ({1}, {1})".format(i, j)) execute_until_pass(session, "DROP KEYSPACE test_{0}".format(i)) @@ -102,7 +102,7 @@ class SchemaTests(unittest.TestCase): cluster = Cluster(protocol_version=PROTOCOL_VERSION) session = cluster.connect() - for i in xrange(30): + for i in range(30): try: execute_until_pass(session, "CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}") except AlreadyExists: @@ -111,7 +111,7 @@ class SchemaTests(unittest.TestCase): execute_until_pass(session, "CREATE TABLE test.cf (key int PRIMARY KEY, value int)") - for j in xrange(100): + for j in range(100): execute_until_pass(session, "INSERT INTO test.cf (key, value) VALUES ({0}, {0})".format(j)) execute_until_pass(session, "DROP KEYSPACE test") From edd5463e7b56a805a1b4eaa86cc6ccdbbfc8fad2 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Sat, 8 Aug 2015 12:55:20 +0100 Subject: [PATCH 52/70] Add test to illustrate non-deterministic query test failure --- tests/integration/standard/test_query.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/integration/standard/test_query.py b/tests/integration/standard/test_query.py index 80a0d8e2..8474e62d 100644 --- a/tests/integration/standard/test_query.py +++ b/tests/integration/standard/test_query.py @@ -298,8 +298,8 @@ class BatchStatementTests(unittest.TestCase): keys.add(result.k) values.add(result.v) - self.assertEqual(set(range(10)), keys) - self.assertEqual(set(range(10)), values) + self.assertEqual(set(range(10)), keys, msg=results) + self.assertEqual(set(range(10)), values, msg=results) def test_string_statements(self): batch = BatchStatement(BatchType.LOGGED) @@ -367,6 +367,11 @@ class BatchStatementTests(unittest.TestCase): self.session.execute(batch) self.confirm_results() + def test_no_parameters_many_times(self): + for i in range(1000): + self.test_no_parameters() + self.session.execute("TRUNCATE test3rf.test") + class SerialConsistencyTests(unittest.TestCase): def setUp(self): From c8dfc48ff231e0b9793965790c623daa9402f482 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Sat, 8 Aug 2015 15:58:53 +0100 Subject: [PATCH 53/70] More comprehensive cython and numpy deserializer tests --- cassandra/numpyparser.pyx | 8 +- .../standard/test_custom_protocol_handler.py | 5 +- .../standard/test_cython_protocol_handlers.py | 107 ++++++++++++++---- tests/integration/standard/utils.py | 31 +++-- tests/unit/cython/utils.py | 1 + 5 files changed, 111 insertions(+), 41 deletions(-) diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 8499d938..0a4e7e3e 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -23,7 +23,7 @@ from cassandra import cqltypes from cassandra.util import is_little_endian import numpy as np - +# import pandas as pd 
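+#
+# Usage sketch (mirrors the integration tests; assumes the driver was built
+# with Cython and that NumPy -- and pandas, for the DataFrame step -- are
+# installed):
+#
+#     from cassandra.cluster import Cluster
+#     from cassandra.query import tuple_factory
+#     from cassandra.protocol import NumpyProtocolHandler
+#
+#     session = Cluster().connect('testspace')
+#     session.row_factory = tuple_factory
+#     session.client_protocol_handler = NumpyProtocolHandler
+#     arrays = session.execute("SELECT * FROM test_table")
+#     # 'arrays' maps column names to NumPy arrays (one per column);
+#     # pd.DataFrame(arrays) would turn it into a pandas DataFrame.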
cdef extern from "numpyFlags.h": # Include 'numpyFlags.h' into the generated C code to disable the @@ -74,8 +74,10 @@ cdef class NumpyParser(ColumnParser): for i in range(rowcount): unpack_row(reader, desc, arrs) - return [make_native_byteorder(arr) for arr in arrays] - # return pd.DataFrame(dict(zip(desc.colnames, arrays))) + arrays = [make_native_byteorder(arr) for arr in arrays] + result = dict(zip(desc.colnames, arrays)) + # return pd.DataFrame(result) + return result ### Helper functions to create NumPy arrays and array descriptors diff --git a/tests/integration/standard/test_custom_protocol_handler.py b/tests/integration/standard/test_custom_protocol_handler.py index edd066be..36965a36 100644 --- a/tests/integration/standard/test_custom_protocol_handler.py +++ b/tests/integration/standard/test_custom_protocol_handler.py @@ -107,10 +107,11 @@ class CustomProtocolHandlerTest(unittest.TestCase): session.client_protocol_handler = CustomProtocolHandlerResultMessageTracked session.row_factory = tuple_factory - columns_string = create_table_with_all_types("alltypes", session) + colnames = create_table_with_all_types("alltypes", session, 1) + columns_string = ", ".join(colnames) # verify data - params = get_all_primitive_params() + params = get_all_primitive_params(0) results = session.execute("SELECT {0} FROM alltypes WHERE primkey=0".format(columns_string))[0] for expected, actual in zip(params, results): self.assertEqual(actual, expected) diff --git a/tests/integration/standard/test_cython_protocol_handlers.py b/tests/integration/standard/test_cython_protocol_handlers.py index ba75cf72..985b7953 100644 --- a/tests/integration/standard/test_cython_protocol_handlers.py +++ b/tests/integration/standard/test_cython_protocol_handlers.py @@ -7,23 +7,25 @@ try: except ImportError: import unittest +from cassandra.query import tuple_factory from cassandra.cluster import Cluster from cassandra.protocol import ProtocolHandler, LazyProtocolHandler, NumpyProtocolHandler + from tests.integration import use_singledc, PROTOCOL_VERSION from tests.integration.datatype_utils import update_datatypes -from tests.integration.standard.utils import create_table_with_all_types, get_all_primitive_params - -from cassandra.cython_deps import HAVE_CYTHON -if not HAVE_CYTHON: - raise unittest.SkipTest("Skipping test, not compiled with Cython enabled") +from tests.integration.standard.utils import ( + create_table_with_all_types, get_all_primitive_params, get_primitive_datatypes) +from tests.unit.cython.utils import cythontest, numpytest def setup_module(): use_singledc() update_datatypes() -class CustomProtocolHandlerTest(unittest.TestCase): +class CythonProtocolHandlerTest(unittest.TestCase): + + N_ITEMS = 10 @classmethod def setUpClass(cls): @@ -32,39 +34,96 @@ class CustomProtocolHandlerTest(unittest.TestCase): cls.session.execute("CREATE KEYSPACE testspace WITH replication = " "{ 'class' : 'SimpleStrategy', 'replication_factor': '1'}") cls.session.set_keyspace("testspace") - create_table_with_all_types("test_table", cls.session) + cls.colnames = create_table_with_all_types("test_table", cls.session, cls.N_ITEMS) @classmethod def tearDownClass(cls): cls.session.execute("DROP KEYSPACE testspace") cls.cluster.shutdown() + @cythontest def test_cython_parser(self): """ Test Cython-based parser that returns a list of tuples """ - self.cython_parser(ProtocolHandler) + verify_iterator_data(self.assertEqual, get_data(ProtocolHandler)) + @cythontest def test_cython_lazy_parser(self): """ - Test Cython-based parser that 
returns a list of tuples + Test Cython-based parser that returns an iterator of tuples """ - self.cython_parser(LazyProtocolHandler) + verify_iterator_data(self.assertEqual, get_data(LazyProtocolHandler)) - def cython_parser(self, protocol_handler): - cluster = Cluster(protocol_version=PROTOCOL_VERSION) - session = cluster.connect(keyspace="testspace") + @numpytest + def test_numpy_parser(self): + """ + Test Numpy-based parser that returns a NumPy array + """ + # arrays = { 'a': arr1, 'b': arr2, ... } + arrays = get_data(NumpyProtocolHandler) - # use our custom protocol handler - session.client_protocol_handler = protocol_handler - # session.row_factory = tuple_factory + colnames = self.colnames + datatypes = get_primitive_datatypes() + for colname, datatype in zip(colnames, datatypes): + arr = arrays[colname] + self.match_dtype(datatype, arr.dtype) - # verify data - params = get_all_primitive_params() - [first_result] = session.execute("SELECT * FROM test_table WHERE primkey=0") - self.assertEqual(len(params), len(first_result), - msg="Not the right number of columns?") - for expected, actual in zip(params, first_result): - self.assertEqual(actual, expected) + verify_iterator_data(self.assertEqual, arrays_to_list_of_tuples(arrays, colnames)) - session.shutdown() + def match_dtype(self, datatype, dtype): + """Match a string cqltype (e.g. 'int' or 'blob') with a numpy dtype""" + if datatype == 'smallint': + self.match_dtype_props(dtype, 'i', 2) + elif datatype == 'int': + self.match_dtype_props(dtype, 'i', 4) + elif datatype in ('bigint', 'counter'): + self.match_dtype_props(dtype, 'i', 8) + elif datatype == 'float': + self.match_dtype_props(dtype, 'f', 4) + elif datatype == 'double': + self.match_dtype_props(dtype, 'f', 8) + else: + self.assertEqual(dtype.kind, 'O', msg=(dtype, datatype)) + + def match_dtype_props(self, dtype, kind, size, signed=None): + self.assertEqual(dtype.kind, kind, msg=dtype) + self.assertEqual(dtype.itemsize, size, msg=dtype) + + +def arrays_to_list_of_tuples(arrays, colnames): + """Convert a dict of arrays (as given by the numpy protocol handler) to a list of tuples""" + first_array = arrays[colnames[0]] + return [tuple(arrays[colname][i] for colname in colnames) + for i in range(len(first_array))] + + +def get_data(protocol_handler): + """ + Get some data from the test table. + + :param key: if None, get all results (100.000 results), otherwise get only one result + """ + cluster = Cluster(protocol_version=PROTOCOL_VERSION) + session = cluster.connect(keyspace="testspace") + + # use our custom protocol handler + session.client_protocol_handler = protocol_handler + session.row_factory = tuple_factory + + results = session.execute("SELECT * FROM test_table") + session.shutdown() + return results + + +def verify_iterator_data(assertEqual, results): + """ + Check the result of get_data() when this is a list or + iterator of tuples + """ + for result in results: + params = get_all_primitive_params(result[0]) + assertEqual(len(params), len(result), + msg="Not the right number of columns?") + for expected, actual in zip(params, result): + assertEqual(actual, expected) diff --git a/tests/integration/standard/utils.py b/tests/integration/standard/utils.py index bd0c80b5..fe54f04d 100644 --- a/tests/integration/standard/utils.py +++ b/tests/integration/standard/utils.py @@ -4,15 +4,16 @@ Helper module to populate a dummy Cassandra tables with data. 
from tests.integration.datatype_utils import PRIMITIVE_DATATYPES, get_sample -def create_table_with_all_types(table_name, session): +def create_table_with_all_types(table_name, session, N): """ Method that given a table_name and session construct a table that contains all possible primitive types. :param table_name: Name of table to create :param session: session to use for table creation - :return: a string containing the names of all the columns. - This can be used to query the table. + :param N: the number of items to insert into the table + + :return: a list of column names """ # create table alpha_type_list = ["primkey int PRIMARY KEY"] @@ -26,21 +27,27 @@ def create_table_with_all_types(table_name, session): table_name, ', '.join(alpha_type_list)), timeout=120) # create the input - params = get_all_primitive_params() - # insert into table as a simple statement - columns_string = ', '.join(col_names) - placeholders = ', '.join(["%s"] * len(col_names)) - session.execute("INSERT INTO {0} ({1}) VALUES ({2})".format( - table_name, columns_string, placeholders), params, timeout=120) - return columns_string + for key in range(N): + params = get_all_primitive_params(key) + + # insert into table as a simple statement + columns_string = ', '.join(col_names) + placeholders = ', '.join(["%s"] * len(col_names)) + session.execute("INSERT INTO {0} ({1}) VALUES ({2})".format( + table_name, columns_string, placeholders), params, timeout=120) + return col_names -def get_all_primitive_params(): +def get_all_primitive_params(key): """ Simple utility method used to give back a list of all possible primitive data sample types. """ - params = [0] + params = [key] for datatype in PRIMITIVE_DATATYPES: params.append(get_sample(datatype)) return params + + +def get_primitive_datatypes(): + return ['int'] + list(PRIMITIVE_DATATYPES) \ No newline at end of file diff --git a/tests/unit/cython/utils.py b/tests/unit/cython/utils.py index f2598c0e..9f0a5a87 100644 --- a/tests/unit/cython/utils.py +++ b/tests/unit/cython/utils.py @@ -17,6 +17,7 @@ def cyimport(import_path): raise return None + # @cythontest # def test_something(self): ... 
cythontest = unittest.skipUnless(HAVE_CYTHON, 'Cython is not available') From 919ece20f3710289a161bd92841bbbe642e7ce28 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 10 Aug 2015 10:50:30 +0100 Subject: [PATCH 54/70] Small performance optimization --- cassandra/deserializers.pyx | 5 ++++- cassandra/numpyparser.pyx | 13 ++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 83582cfc..a08415e4 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -16,6 +16,9 @@ from uuid import UUID from cassandra import cqltypes from cassandra import util +cdef bint PY2 = six.PY2 + + cdef class Deserializer: """Cython-based deserializer class for a cqltype""" @@ -64,7 +67,7 @@ cdef class DesByteType(Deserializer): cdef class DesAsciiType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - if six.PY2: + if PY2: return to_bytes(buf) return to_bytes(buf).decode('ascii') diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 0a4e7e3e..89bf18da 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -63,7 +63,7 @@ cdef class NumpyParser(ColumnParser): """Decode a ResultMessage into a bunch of NumPy arrays""" cpdef parse_rows(self, BytesIOReader reader, ParseDesc desc): - cdef Py_ssize_t i, rowcount + cdef Py_ssize_t rowcount cdef ArrDesc[::1] array_descs cdef ArrDesc *arrs @@ -71,8 +71,7 @@ cdef class NumpyParser(ColumnParser): array_descs, arrays = make_arrays(desc, rowcount) arrs = &array_descs[0] - for i in range(rowcount): - unpack_row(reader, desc, arrs) + _parse_rows(reader, desc, arrs, rowcount) arrays = [make_native_byteorder(arr) for arr in arrays] result = dict(zip(desc.colnames, arrays)) @@ -80,6 +79,14 @@ cdef class NumpyParser(ColumnParser): return result +cdef _parse_rows(BytesIOReader reader, ParseDesc desc, + ArrDesc *arrs, Py_ssize_t rowcount): + cdef Py_ssize_t i + + for i in range(rowcount): + unpack_row(reader, desc, arrs) + + ### Helper functions to create NumPy arrays and array descriptors def make_arrays(ParseDesc desc, array_size): From 07327ea91586a04ac4a93cfb9b4387a59d59b194 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 10 Aug 2015 13:34:56 +0100 Subject: [PATCH 55/70] Clean up setup.py --- setup.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index ce5e5166..bfb9176f 100644 --- a/setup.py +++ b/setup.py @@ -262,20 +262,16 @@ if "--no-libev" not in sys.argv and not is_windows: if "--no-cython" not in sys.argv: try: from Cython.Build import cythonize - # cython_candidates = ['cluster', 'concurrent', 'connection', 'cqltypes', 'metadata', 'pool', 'protocol', 'query', 'util'] - cython_candidates = [] + cython_candidates = ['cluster', 'concurrent', 'connection', 'cqltypes', 'metadata', + 'pool', 'protocol', 'query', 'util'] compile_args = [] if is_windows else ['-Wno-unused-function'] - directives = {'profile': PROFILING} # this seems to have no effect... 
extensions.extend(cythonize( [Extension('cassandra.%s' % m, ['cassandra/%s.py' % m], - extra_compile_args=compile_args, - compiler_directives=directives) + extra_compile_args=compile_args) for m in cython_candidates], exclude_failures=True)) - extensions.extend(cythonize("cassandra/*.pyx", - compiler_directives=directives)) - extensions.extend(cythonize("tests/unit/cython/*.pyx", - compiler_directives=directives)) + extensions.extend(cythonize("cassandra/*.pyx")) + extensions.extend(cythonize("tests/unit/cython/*.pyx")) except ImportError: sys.stderr.write("Cython is not installed. Not compiling core driver files as extensions (optional).") From 8d28473695e2be14e77e9db190fe73fd18a3baa2 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 10 Aug 2015 13:37:10 +0100 Subject: [PATCH 56/70] Remove leftover dummy module --- tests/unit/cython/dummy_module.pyx | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 tests/unit/cython/dummy_module.pyx diff --git a/tests/unit/cython/dummy_module.pyx b/tests/unit/cython/dummy_module.pyx deleted file mode 100644 index 8bd1206b..00000000 --- a/tests/unit/cython/dummy_module.pyx +++ /dev/null @@ -1,2 +0,0 @@ -# This is a dummy module used by utils.py to determine whether -# cassandra was build with Cython \ No newline at end of file From d71b6e769c87025582adc2842cadaee6a434eb91 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 10 Aug 2015 20:19:12 +0100 Subject: [PATCH 57/70] Disable non-cython extension modules --- setup.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index bfb9176f..ce09a23e 100644 --- a/setup.py +++ b/setup.py @@ -262,14 +262,14 @@ if "--no-libev" not in sys.argv and not is_windows: if "--no-cython" not in sys.argv: try: from Cython.Build import cythonize - cython_candidates = ['cluster', 'concurrent', 'connection', 'cqltypes', 'metadata', - 'pool', 'protocol', 'query', 'util'] - compile_args = [] if is_windows else ['-Wno-unused-function'] - extensions.extend(cythonize( - [Extension('cassandra.%s' % m, ['cassandra/%s.py' % m], - extra_compile_args=compile_args) - for m in cython_candidates], - exclude_failures=True)) + # cython_candidates = ['cluster', 'concurrent', 'connection', 'cqltypes', 'metadata', + # 'pool', 'protocol', 'query', 'util'] + # compile_args = [] if is_windows else ['-Wno-unused-function'] + # extensions.extend(cythonize( + # [Extension('cassandra.%s' % m, ['cassandra/%s.py' % m], + # extra_compile_args=compile_args) + # for m in cython_candidates], + # exclude_failures=True)) extensions.extend(cythonize("cassandra/*.pyx")) extensions.extend(cythonize("tests/unit/cython/*.pyx")) except ImportError: From 40422563492b8c94ef13add47d77f96f4dc92ff1 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 10 Aug 2015 20:20:00 +0100 Subject: [PATCH 58/70] Do boundschecking when accessing buffer memory --- cassandra/buffer.pxd | 27 +++-- cassandra/{marshal.pyx => cython_marshal.pyx} | 64 +++++----- cassandra/deserializers.pyx | 110 +++++++++--------- cassandra/ioutils.pyx | 16 ++- cassandra/marshal.pxd | 29 ----- cassandra/objparser.pyx | 2 + 6 files changed, 121 insertions(+), 127 deletions(-) rename cassandra/{marshal.pyx => cython_marshal.pyx} (76%) delete mode 100644 cassandra/marshal.pxd diff --git a/cassandra/buffer.pxd b/cassandra/buffer.pxd index cfe93e01..f94da139 100644 --- a/cassandra/buffer.pxd +++ b/cassandra/buffer.pxd @@ -16,8 +16,6 @@ cdef struct Buffer: char *ptr Py_ssize_t size -cdef inline Buffer from_bytes(bytes byts): - return 
from_ptr_and_size(PyBytes_AS_STRING(byts), len(byts)) cdef inline bytes to_bytes(Buffer *buf): return buf.ptr[:buf.size] @@ -25,8 +23,23 @@ cdef inline bytes to_bytes(Buffer *buf): cdef inline char *buf_ptr(Buffer *buf): return buf.ptr -cdef inline Buffer from_ptr_and_size(char *ptr, Py_ssize_t size): - cdef Buffer res - res.ptr = ptr - res.size = size - return res +cdef inline char *buf_read(Buffer *buf, Py_ssize_t size) except NULL: + if size > buf.size: + raise IndexError("Requested more than length of buffer") + return buf.ptr + +cdef inline int slice_buffer(Buffer *buf, Buffer *out, + Py_ssize_t start, Py_ssize_t size) except -1: + if size < 0: + raise ValueError("Length must be positive") + + if start + size > buf.size: + raise IndexError("Buffer slice out of bounds") + + out.ptr = buf.ptr + start + out.size = size + return 0 + +cdef inline void from_ptr_and_size(char *ptr, Py_ssize_t size, Buffer *out): + out.ptr = ptr + out.size = size diff --git a/cassandra/marshal.pyx b/cassandra/cython_marshal.pyx similarity index 76% rename from cassandra/marshal.pyx rename to cassandra/cython_marshal.pyx index 0ab65c46..00011018 100644 --- a/cassandra/marshal.pyx +++ b/cassandra/cython_marshal.pyx @@ -21,6 +21,7 @@ import math from libc.stdint cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t) +from cassandra.buffer cimport Buffer, buf_read cdef bint is_little_endian from cassandra.util import is_little_endian @@ -66,82 +67,81 @@ cdef inline Py_ssize_t div2(Py_ssize_t x): ### Packing and unpacking of signed integers -cpdef inline bytes int64_pack(int64_t x): +cdef inline bytes int64_pack(int64_t x): return pack( &x, 8) -cpdef inline int64_t int64_unpack(const char *buf): - # The 'const' makes sure the buffer is not mutated in-place! 
- cdef int64_t x = ( buf)[0] +cdef inline int64_t int64_unpack(Buffer *buf): + cdef int64_t x = ( buf_read(buf, 8))[0] cdef char *p = &x swap_order( &x, 8) return x -cpdef inline bytes int32_pack(int32_t x): +cdef inline bytes int32_pack(int32_t x): return pack( &x, 4) -cpdef inline int32_t int32_unpack(const char *buf): - cdef int32_t x = ( buf)[0] +cdef inline int32_t int32_unpack(Buffer *buf): + cdef int32_t x = ( buf_read(buf, 4))[0] cdef char *p = &x swap_order( &x, 4) return x -cpdef inline bytes int16_pack(int16_t x): +cdef inline bytes int16_pack(int16_t x): return pack( &x, 2) -cpdef inline int16_t int16_unpack(const char *buf): - cdef int16_t x = ( buf)[0] +cdef inline int16_t int16_unpack(Buffer *buf): + cdef int16_t x = ( buf_read(buf, 2))[0] swap_order( &x, 2) return x -cpdef inline bytes int8_pack(int8_t x): +cdef inline bytes int8_pack(int8_t x): return ( &x)[:1] -cpdef inline int8_t int8_unpack(const char *buf): - return ( buf)[0] +cdef inline int8_t int8_unpack(Buffer *buf): + return ( buf_read(buf, 1))[0] -cpdef inline bytes uint64_pack(uint64_t x): +cdef inline bytes uint64_pack(uint64_t x): return pack( &x, 8) -cpdef inline uint64_t uint64_unpack(const char *buf): - cdef uint64_t x = ( buf)[0] +cdef inline uint64_t uint64_unpack(Buffer *buf): + cdef uint64_t x = ( buf_read(buf, 8))[0] swap_order( &x, 8) return x -cpdef inline bytes uint32_pack(uint32_t x): +cdef inline bytes uint32_pack(uint32_t x): return pack( &x, 4) -cpdef inline uint32_t uint32_unpack(const char *buf): - cdef uint32_t x = ( buf)[0] +cdef inline uint32_t uint32_unpack(Buffer *buf): + cdef uint32_t x = ( buf_read(buf, 4))[0] swap_order( &x, 4) return x -cpdef inline bytes uint16_pack(uint16_t x): +cdef inline bytes uint16_pack(uint16_t x): return pack( &x, 2) -cpdef inline uint16_t uint16_unpack(const char *buf): - cdef uint16_t x = ( buf)[0] +cdef inline uint16_t uint16_unpack(Buffer *buf): + cdef uint16_t x = ( buf_read(buf, 2))[0] swap_order( &x, 2) return x -cpdef inline bytes uint8_pack(uint8_t x): +cdef inline bytes uint8_pack(uint8_t x): return pack( &x, 1) -cpdef inline uint8_t uint8_unpack(const char *buf): - return ( buf)[0] +cdef inline uint8_t uint8_unpack(Buffer *buf): + return ( buf_read(buf, 1))[0] -cpdef inline bytes double_pack(double x): +cdef inline bytes double_pack(double x): return pack( &x, 8) -cpdef inline double double_unpack(const char *buf): - cdef double x = ( buf)[0] +cdef inline double double_unpack(Buffer *buf): + cdef double x = ( buf_read(buf, 8))[0] swap_order( &x, 8) return x -cpdef inline bytes float_pack(float x): +cdef inline bytes float_pack(float x): return pack( &x, 4) -cpdef inline float float_unpack(const char *buf): - cdef float x = ( buf)[0] +cdef inline float float_unpack(Buffer *buf): + cdef float x = ( buf_read(buf, 4))[0] swap_order( &x, 4) return x @@ -167,7 +167,7 @@ v3_header_pack = v3_header_struct.pack v3_header_unpack = v3_header_struct.unpack -cpdef varint_unpack(term): +cdef varint_unpack(term): """Unpack a variable-sized integer""" if PY3: return varint_unpack_py3(term) diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index a08415e4..cf502691 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -2,8 +2,8 @@ from libc.stdint cimport int32_t, uint16_t -include 'marshal.pyx' -from cassandra.buffer cimport Buffer, to_bytes +include 'cython_marshal.pyx' +from cassandra.buffer cimport Buffer, to_bytes, slice_buffer from cassandra.cython_utils cimport datetime_from_timestamp from cython.view cimport array 
as cython_array @@ -39,10 +39,9 @@ cdef class DesBytesType(Deserializer): cdef class DesDecimalType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): cdef Buffer varint_buf - varint_buf.ptr = buf.ptr + 4 - varint_buf.size = buf.size - 4 + slice_buffer(buf, &varint_buf, 4, buf.size - 4) - scale = int32_unpack(buf.ptr) + scale = int32_unpack(buf) unscaled = varint_unpack(to_bytes(&varint_buf)) return Decimal('%de%d' % (unscaled, -scale)) @@ -55,14 +54,14 @@ cdef class DesUUIDType(Deserializer): cdef class DesBooleanType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - if int8_unpack(buf.ptr): + if int8_unpack(buf): return True return False cdef class DesByteType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - return int8_unpack(buf.ptr) + return int8_unpack(buf) cdef class DesAsciiType(Deserializer): @@ -74,22 +73,22 @@ cdef class DesAsciiType(Deserializer): cdef class DesFloatType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - return float_unpack(buf.ptr) + return float_unpack(buf) cdef class DesDoubleType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - return double_unpack(buf.ptr) + return double_unpack(buf) cdef class DesLongType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - return int64_unpack(buf.ptr) + return int64_unpack(buf) cdef class DesInt32Type(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - return int32_unpack(buf.ptr) + return int32_unpack(buf) cdef class DesIntegerType(Deserializer): @@ -116,7 +115,7 @@ cdef class DesCounterColumnType(DesLongType): cdef class DesDateType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - cdef double timestamp = int64_unpack(buf.ptr) / 1000.0 + cdef double timestamp = int64_unpack(buf) / 1000.0 return datetime_from_timestamp(timestamp) @@ -136,18 +135,18 @@ EPOCH_OFFSET_DAYS = 2 ** 31 cdef class DesSimpleDateType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - days = uint32_unpack(buf.ptr) - EPOCH_OFFSET_DAYS + days = uint32_unpack(buf) - EPOCH_OFFSET_DAYS return util.Date(days) cdef class DesShortType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - return int16_unpack(buf.ptr) + return int16_unpack(buf) cdef class DesTimeType(Deserializer): cdef deserialize(self, Buffer *buf, int protocol_version): - return util.Time(int64_unpack(to_bytes(buf))) + return util.Time(int64_unpack(buf)) cdef class DesUTF8Type(Deserializer): @@ -217,28 +216,40 @@ cdef list _deserialize_list_or_set(itemlen_t dummy_version, we can specialize on the protocol version. 
""" cdef itemlen_t itemlen - cdef Buffer sub_buf + cdef Buffer itemlen_buf + cdef Buffer elem_buf - cdef itemlen_t numelements = _unpack[itemlen_t](dummy_version, buf.ptr) - cdef itemlen_t p = sizeof(itemlen_t) + cdef itemlen_t numelements = _unpack_len[itemlen_t](0, buf) + cdef itemlen_t idx = sizeof(itemlen_t) cdef list result = [] for _ in range(numelements): - itemlen = _unpack[itemlen_t](dummy_version, buf.ptr + p) - p += sizeof(itemlen_t) - sub_buf.ptr = buf.ptr + p - sub_buf.size = itemlen - p += itemlen - result.append(from_binary(deserializer, &sub_buf, protocol_version)) + idx = subelem(buf, &elem_buf, idx) + result.append(from_binary(deserializer, &elem_buf, protocol_version)) return result -cdef itemlen_t _unpack(itemlen_t dummy_version, const char *buf): + +cdef inline itemlen_t subelem( + Buffer *buf, Buffer *elem_buf, itemlen_t idx): + cdef itemlen_t elemlen + + elemlen = _unpack_len[itemlen_t](idx, buf) + idx += sizeof(itemlen_t) + slice_buffer(buf, elem_buf, idx, elemlen) + return idx + elemlen + + +cdef itemlen_t _unpack_len(itemlen_t idx, Buffer *buf): cdef itemlen_t result + cdef Buffer itemlen_buf + slice_buffer(buf, &itemlen_buf, idx, sizeof(itemlen_t)) + if itemlen_t is uint16_t: - result = uint16_unpack(buf) + result = uint16_unpack(&itemlen_buf) else: - result = int32_unpack(buf) + result = int32_unpack(&itemlen_buf) + return result #-------------------------------------------------------------------------- @@ -278,27 +289,18 @@ cdef _deserialize_map(itemlen_t dummy_version, key_type, val_type): cdef itemlen_t itemlen, val_len, key_len cdef Buffer key_buf, val_buf + cdef Buffer itemlen_buf - cdef itemlen_t numelements = _unpack[itemlen_t](dummy_version, buf.ptr) - cdef itemlen_t p = sizeof(itemlen_t) + cdef itemlen_t numelements + cdef itemlen_t idx = sizeof(itemlen_t) cdef list result = [] - numelements = _unpack[itemlen_t](dummy_version, buf.ptr) - p = sizeof(itemlen_t) + numelements = _unpack_len[itemlen_t](0, buf) + idx = sizeof(itemlen_t) themap = util.OrderedMapSerializedKey(key_type, protocol_version) for _ in range(numelements): - key_len = _unpack[itemlen_t](dummy_version, buf.ptr + p) - p += sizeof(itemlen_t) - # keybytes = byts[p:p + key_len] - key_buf.ptr = buf.ptr + p - key_buf.size = key_len - p += key_len - val_len = _unpack(dummy_version, buf.ptr + p) - p += sizeof(itemlen_t) - # valbytes = byts[p:p + val_len] - val_buf.ptr = buf.ptr + p - val_buf.size = val_len - p += val_len + idx = subelem(buf, &key_buf, idx) + idx = subelem(buf, &val_buf, idx) key = from_binary(key_deserializer, &key_buf, protocol_version) val = from_binary(val_deserializer, &val_buf, protocol_version) themap._insert_unchecked(key, to_bytes(&key_buf), val) @@ -316,6 +318,7 @@ cdef class DesTupleType(_DesParameterizedType): cdef int32_t itemlen cdef tuple res = tuple_new(self.subtypes_len) cdef Buffer item_buf + cdef Buffer itemlen_buf cdef Deserializer deserializer # collections inside UDTs are always encoded with at least the @@ -327,11 +330,11 @@ cdef class DesTupleType(_DesParameterizedType): for i in range(self.subtypes_len): item = None if p < buf.size: - itemlen = int32_unpack(buf.ptr + p) + slice_buffer(buf, &itemlen_buf, p, 4) + itemlen = int32_unpack(&itemlen_buf) p += 4 if itemlen >= 0: - item_buf.ptr = buf.ptr + p - item_buf.size = itemlen + slice_buffer(buf, &item_buf, p, itemlen) p += itemlen deserializer = self.deserializers[i] @@ -354,12 +357,13 @@ cdef class DesUserType(DesTupleType): cdef class DesCompositeType(_DesParameterizedType): cdef deserialize(self, 
Buffer *buf, int protocol_version): - cdef Py_ssize_t i + cdef Py_ssize_t i, idx, start cdef Buffer elem_buf cdef int16_t element_length cdef Deserializer deserializer cdef tuple res = tuple_new(self.subtypes_len) + idx = 0 for i in range(self.subtypes_len): if not buf.size: # CompositeType can have missing elements at the end @@ -373,17 +377,17 @@ cdef class DesCompositeType(_DesParameterizedType): res = res[:i] break - element_length = uint16_unpack(buf.ptr) - elem_buf.ptr = buf.ptr + 2 - elem_buf.size = element_length + element_length = uint16_unpack(buf) + slice_buffer(buf, &elem_buf, 2, element_length) - # skip element length, element, and the EOC (one byte) - buf.ptr = buf.ptr + 2 + element_length + 1 - buf.size = buf.size - (2 + element_length + 1) deserializer = self.deserializers[i] item = from_binary(deserializer, &elem_buf, protocol_version) tuple_set(res, i, item) + # skip element length, element, and the EOC (one byte) + start = 2 + element_length + 1 + slice_buffer(buf, buf, start, buf.size - start) + return res diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx index 203997e9..1a11068c 100644 --- a/cassandra/ioutils.pyx +++ b/cassandra/ioutils.pyx @@ -1,5 +1,5 @@ -include 'marshal.pyx' -from cassandra.buffer cimport Buffer +include 'cython_marshal.pyx' +from cassandra.buffer cimport Buffer, from_ptr_and_size from libc.stdint cimport int32_t from cassandra.bytesio cimport BytesIOReader @@ -17,13 +17,17 @@ cdef inline int get_buf(BytesIOReader reader, Buffer *buf_out) except -1: strings (which may be empty). """ cdef Py_ssize_t raw_val_size = read_int(reader) + cdef char *ptr if raw_val_size <= 0: - buf_out.ptr = NULL + ptr = NULL else: - buf_out.ptr = reader.read(raw_val_size) + ptr = reader.read(raw_val_size) - buf_out.size = raw_val_size + from_ptr_and_size(ptr, raw_val_size, buf_out) return 0 cdef inline int32_t read_int(BytesIOReader reader) except ?0xDEAD: - return int32_unpack(reader.read(4)) + cdef Buffer buf + buf.ptr = reader.read(4) + buf.size = 4 + return int32_unpack(&buf) diff --git a/cassandra/marshal.pxd b/cassandra/marshal.pxd deleted file mode 100644 index ef7d9858..00000000 --- a/cassandra/marshal.pxd +++ /dev/null @@ -1,29 +0,0 @@ -from libc.stdint cimport (int8_t, int16_t, int32_t, int64_t, - uint8_t, uint16_t, uint32_t, uint64_t) - -cpdef bytes int64_pack(int64_t x) -cpdef bytes int32_pack(int32_t x) -cpdef bytes int16_pack(int16_t x) -cpdef bytes int8_pack(int8_t x) - -cpdef int64_t int64_unpack(const char *buf) -cpdef int32_t int32_unpack(const char *buf) -cpdef int16_t int16_unpack(const char *buf) -cpdef int8_t int8_unpack(const char *buf) - -cpdef bytes uint64_pack(uint64_t x) -cpdef bytes uint32_pack(uint32_t x) -cpdef bytes uint16_pack(uint16_t x) -cpdef bytes uint8_pack(uint8_t x) - -cpdef uint64_t uint64_unpack(const char *buf) -cpdef uint32_t uint32_unpack(const char *buf) -cpdef uint16_t uint16_unpack(const char *buf) -cpdef uint8_t uint8_unpack(const char *buf) - -cpdef bytes double_pack(double x) -cpdef bytes float_pack(float x) - -cpdef double double_unpack(const char *buf) -cpdef float float_unpack(const char *buf) - diff --git a/cassandra/objparser.pyx b/cassandra/objparser.pyx index 8aca1427..670f1b4a 100644 --- a/cassandra/objparser.pyx +++ b/cassandra/objparser.pyx @@ -40,6 +40,8 @@ cdef class TupleRowParser(RowParser): """ cpdef unpack_row(self, BytesIOReader reader, ParseDesc desc): + assert desc.rowsize >= 0 + cdef Buffer buf cdef Py_ssize_t i, rowsize = desc.rowsize cdef Deserializer deserializer From 
564d2fdd0fa54194f189abb3eef2c5993084d743 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Mon, 10 Aug 2015 21:33:54 +0100 Subject: [PATCH 59/70] Make sure to propagate exceptions from unpacking functions --- cassandra/cython_marshal.pyx | 20 ++++++++-------- cassandra/deserializers.pyx | 46 +++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/cassandra/cython_marshal.pyx b/cassandra/cython_marshal.pyx index 00011018..ae964474 100644 --- a/cassandra/cython_marshal.pyx +++ b/cassandra/cython_marshal.pyx @@ -70,7 +70,7 @@ cdef inline Py_ssize_t div2(Py_ssize_t x): cdef inline bytes int64_pack(int64_t x): return pack( &x, 8) -cdef inline int64_t int64_unpack(Buffer *buf): +cdef inline int64_t int64_unpack(Buffer *buf) except ?0xDEAD: cdef int64_t x = ( buf_read(buf, 8))[0] cdef char *p = &x swap_order( &x, 8) @@ -79,7 +79,7 @@ cdef inline int64_t int64_unpack(Buffer *buf): cdef inline bytes int32_pack(int32_t x): return pack( &x, 4) -cdef inline int32_t int32_unpack(Buffer *buf): +cdef inline int32_t int32_unpack(Buffer *buf) except ?0xDEAD: cdef int32_t x = ( buf_read(buf, 4))[0] cdef char *p = &x swap_order( &x, 4) @@ -88,7 +88,7 @@ cdef inline int32_t int32_unpack(Buffer *buf): cdef inline bytes int16_pack(int16_t x): return pack( &x, 2) -cdef inline int16_t int16_unpack(Buffer *buf): +cdef inline int16_t int16_unpack(Buffer *buf) except ?0xDED: cdef int16_t x = ( buf_read(buf, 2))[0] swap_order( &x, 2) return x @@ -96,13 +96,13 @@ cdef inline int16_t int16_unpack(Buffer *buf): cdef inline bytes int8_pack(int8_t x): return ( &x)[:1] -cdef inline int8_t int8_unpack(Buffer *buf): +cdef inline int8_t int8_unpack(Buffer *buf) except ?80: return ( buf_read(buf, 1))[0] cdef inline bytes uint64_pack(uint64_t x): return pack( &x, 8) -cdef inline uint64_t uint64_unpack(Buffer *buf): +cdef inline uint64_t uint64_unpack(Buffer *buf) except ?0xDEAD: cdef uint64_t x = ( buf_read(buf, 8))[0] swap_order( &x, 8) return x @@ -110,7 +110,7 @@ cdef inline uint64_t uint64_unpack(Buffer *buf): cdef inline bytes uint32_pack(uint32_t x): return pack( &x, 4) -cdef inline uint32_t uint32_unpack(Buffer *buf): +cdef inline uint32_t uint32_unpack(Buffer *buf) except ?0xDEAD: cdef uint32_t x = ( buf_read(buf, 4))[0] swap_order( &x, 4) return x @@ -118,7 +118,7 @@ cdef inline uint32_t uint32_unpack(Buffer *buf): cdef inline bytes uint16_pack(uint16_t x): return pack( &x, 2) -cdef inline uint16_t uint16_unpack(Buffer *buf): +cdef inline uint16_t uint16_unpack(Buffer *buf) except ?0xDEAD: cdef uint16_t x = ( buf_read(buf, 2))[0] swap_order( &x, 2) return x @@ -126,13 +126,13 @@ cdef inline uint16_t uint16_unpack(Buffer *buf): cdef inline bytes uint8_pack(uint8_t x): return pack( &x, 1) -cdef inline uint8_t uint8_unpack(Buffer *buf): +cdef inline uint8_t uint8_unpack(Buffer *buf) except ?0xff: return ( buf_read(buf, 1))[0] cdef inline bytes double_pack(double x): return pack( &x, 8) -cdef inline double double_unpack(Buffer *buf): +cdef inline double double_unpack(Buffer *buf) except ?1.74: cdef double x = ( buf_read(buf, 8))[0] swap_order( &x, 8) return x @@ -140,7 +140,7 @@ cdef inline double double_unpack(Buffer *buf): cdef inline bytes float_pack(float x): return pack( &x, 4) -cdef inline float float_unpack(Buffer *buf): +cdef inline float float_unpack(Buffer *buf) except ?1.74: cdef float x = ( buf_read(buf, 4))[0] swap_order( &x, 4) return x diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index cf502691..47028bf1 100644 --- 
a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -219,38 +219,52 @@ cdef list _deserialize_list_or_set(itemlen_t dummy_version, cdef Buffer itemlen_buf cdef Buffer elem_buf - cdef itemlen_t numelements = _unpack_len[itemlen_t](0, buf) - cdef itemlen_t idx = sizeof(itemlen_t) + cdef itemlen_t numelements + cdef itemlen_t idx cdef list result = [] + _unpack_len[itemlen_t](0, &numelements, buf) + idx = sizeof(itemlen_t) + for _ in range(numelements): - idx = subelem(buf, &elem_buf, idx) + subelem(buf, &elem_buf, &idx) result.append(from_binary(deserializer, &elem_buf, protocol_version)) return result -cdef inline itemlen_t subelem( - Buffer *buf, Buffer *elem_buf, itemlen_t idx): +cdef inline int subelem( + Buffer *buf, Buffer *elem_buf, itemlen_t *idx_p) except -1: + """ + Read the next element from the buffer: first read the size (in bytes) of the + element, then fill elem_buf with a newly sliced buffer of this size (and the + right offset). + + NOTE: The handling of 'idx' is somewhat atrocious, as there is a Cython + bug with the combination fused types + 'except' clause. + So instead, we pass in a pointer to 'idx', namely 'idx_p', and write + to this instead. + """ cdef itemlen_t elemlen - elemlen = _unpack_len[itemlen_t](idx, buf) - idx += sizeof(itemlen_t) - slice_buffer(buf, elem_buf, idx, elemlen) - return idx + elemlen + _unpack_len[itemlen_t](idx_p[0], &elemlen, buf) + idx_p[0] += sizeof(itemlen_t) + slice_buffer(buf, elem_buf, idx_p[0], elemlen) + idx_p[0] += elemlen + return 0 -cdef itemlen_t _unpack_len(itemlen_t idx, Buffer *buf): +cdef int _unpack_len(itemlen_t idx, itemlen_t *elemlen, Buffer *buf) except -1: cdef itemlen_t result cdef Buffer itemlen_buf slice_buffer(buf, &itemlen_buf, idx, sizeof(itemlen_t)) if itemlen_t is uint16_t: - result = uint16_unpack(&itemlen_buf) + elemlen[0] = uint16_unpack(&itemlen_buf) else: - result = int32_unpack(&itemlen_buf) + elemlen[0] = int32_unpack(&itemlen_buf) - return result + return 0 #-------------------------------------------------------------------------- # Map deserialization @@ -295,12 +309,12 @@ cdef _deserialize_map(itemlen_t dummy_version, cdef itemlen_t idx = sizeof(itemlen_t) cdef list result = [] - numelements = _unpack_len[itemlen_t](0, buf) + _unpack_len[itemlen_t](0, &numelements, buf) idx = sizeof(itemlen_t) themap = util.OrderedMapSerializedKey(key_type, protocol_version) for _ in range(numelements): - idx = subelem(buf, &key_buf, idx) - idx = subelem(buf, &val_buf, idx) + subelem(buf, &key_buf, &idx) + subelem(buf, &val_buf, &idx) key = from_binary(key_deserializer, &key_buf, protocol_version) val = from_binary(val_deserializer, &val_buf, protocol_version) themap._insert_unchecked(key, to_bytes(&key_buf), val) From 4956b12acace35e0db7718e9e5632fb1a978d896 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 11 Aug 2015 11:19:48 +0100 Subject: [PATCH 60/70] Include Cython sources in sdist --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 1825f7bb..7a686a6b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1,2 @@ include setup.py README.rst MANIFEST.in LICENSE ez_setup.py +include cassandra/*.pyx From d91731734c26392b0fd1655912ae3c7adf4fd931 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 11 Aug 2015 21:04:18 +0100 Subject: [PATCH 61/70] Clean up some leftover code --- cassandra/buffer.pxd | 1 - cassandra/cython_marshal.pyx | 112 +++-------------------------------- cassandra/cython_utils.pyx | 9 +-- cassandra/deserializers.pyx | 4 +- 
cassandra/numpyparser.pyx | 1 - cassandra/typecodes.py | 4 +- tests/unit/cython/utils.py | 2 +- 7 files changed, 17 insertions(+), 116 deletions(-) diff --git a/cassandra/buffer.pxd b/cassandra/buffer.pxd index f94da139..542cb181 100644 --- a/cassandra/buffer.pxd +++ b/cassandra/buffer.pxd @@ -10,7 +10,6 @@ from cpython.bytes cimport PyBytes_AS_STRING # checking. Only string objects are supported; no Unicode objects # should be passed. -from cassandra.buffer cimport Buffer cdef struct Buffer: char *ptr diff --git a/cassandra/cython_marshal.pyx b/cassandra/cython_marshal.pyx index ae964474..1ba11435 100644 --- a/cassandra/cython_marshal.pyx +++ b/cassandra/cython_marshal.pyx @@ -15,36 +15,16 @@ # limitations under the License. import six -import sys -import struct -import math from libc.stdint cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t) -from cassandra.buffer cimport Buffer, buf_read +from cassandra.buffer cimport Buffer, buf_read, to_bytes cdef bint is_little_endian from cassandra.util import is_little_endian cdef bint PY3 = six.PY3 -# cdef extern from "marshal.h": -# cdef str c_string_to_python(char *p, Py_ssize_t len) - -def _make_packer(format_string): - packer = struct.Struct(format_string) - pack = packer.pack - unpack = lambda s: packer.unpack(s)[0] - return pack, unpack - - -cdef inline bytes pack(char *buf, Py_ssize_t size): - """ - Pack a buffer, given as a char *, into Python bytes in byte order. - """ - swap_order(buf, size) - return buf[:size] - cdef inline void swap_order(char *buf, Py_ssize_t size): """ @@ -65,10 +45,7 @@ cdef inline void swap_order(char *buf, Py_ssize_t size): cdef inline Py_ssize_t div2(Py_ssize_t x): return x >> 1 -### Packing and unpacking of signed integers - -cdef inline bytes int64_pack(int64_t x): - return pack( &x, 8) +### Unpacking of signed integers cdef inline int64_t int64_unpack(Buffer *buf) except ?0xDEAD: cdef int64_t x = ( buf_read(buf, 8))[0] @@ -76,106 +53,58 @@ cdef inline int64_t int64_unpack(Buffer *buf) except ?0xDEAD: swap_order( &x, 8) return x -cdef inline bytes int32_pack(int32_t x): - return pack( &x, 4) - cdef inline int32_t int32_unpack(Buffer *buf) except ?0xDEAD: cdef int32_t x = ( buf_read(buf, 4))[0] cdef char *p = &x swap_order( &x, 4) return x -cdef inline bytes int16_pack(int16_t x): - return pack( &x, 2) - cdef inline int16_t int16_unpack(Buffer *buf) except ?0xDED: cdef int16_t x = ( buf_read(buf, 2))[0] swap_order( &x, 2) return x -cdef inline bytes int8_pack(int8_t x): - return ( &x)[:1] - cdef inline int8_t int8_unpack(Buffer *buf) except ?80: return ( buf_read(buf, 1))[0] -cdef inline bytes uint64_pack(uint64_t x): - return pack( &x, 8) - cdef inline uint64_t uint64_unpack(Buffer *buf) except ?0xDEAD: cdef uint64_t x = ( buf_read(buf, 8))[0] swap_order( &x, 8) return x -cdef inline bytes uint32_pack(uint32_t x): - return pack( &x, 4) - cdef inline uint32_t uint32_unpack(Buffer *buf) except ?0xDEAD: cdef uint32_t x = ( buf_read(buf, 4))[0] swap_order( &x, 4) return x -cdef inline bytes uint16_pack(uint16_t x): - return pack( &x, 2) - cdef inline uint16_t uint16_unpack(Buffer *buf) except ?0xDEAD: cdef uint16_t x = ( buf_read(buf, 2))[0] swap_order( &x, 2) return x -cdef inline bytes uint8_pack(uint8_t x): - return pack( &x, 1) - cdef inline uint8_t uint8_unpack(Buffer *buf) except ?0xff: return ( buf_read(buf, 1))[0] -cdef inline bytes double_pack(double x): - return pack( &x, 8) - cdef inline double double_unpack(Buffer *buf) except ?1.74: cdef double x = ( buf_read(buf, 8))[0] 
swap_order( &x, 8) return x -cdef inline bytes float_pack(float x): - return pack( &x, 4) - cdef inline float float_unpack(Buffer *buf) except ?1.74: cdef float x = ( buf_read(buf, 4))[0] swap_order( &x, 4) return x -# int64_pack, int64_unpack = _make_packer('>q') -# int32_pack, int32_unpack = _make_packer('>i') -# int16_pack, int16_unpack = _make_packer('>h') -# int8_pack, int8_unpack = _make_packer('>b') -# uint64_pack, uint64_unpack = _make_packer('>Q') -# uint32_pack, uint32_unpack = _make_packer('>I') -# uint16_pack, uint16_unpack = _make_packer('>H') -# uint8_pack, uint8_unpack = _make_packer('>B') -# float_pack, float_unpack = _make_packer('>f') -# double_pack, double_unpack = _make_packer('>d') -# Special case for cassandra header -header_struct = struct.Struct('>BBbB') -header_pack = header_struct.pack -header_unpack = header_struct.unpack - -# in protocol version 3 and higher, the stream ID is two bytes -v3_header_struct = struct.Struct('>BBhB') -v3_header_pack = v3_header_struct.pack -v3_header_unpack = v3_header_struct.unpack - - -cdef varint_unpack(term): +cdef varint_unpack(Buffer *term): """Unpack a variable-sized integer""" if PY3: - return varint_unpack_py3(term) + return varint_unpack_py3(to_bytes(term)) else: - return varint_unpack_py2(term) + return varint_unpack_py2(to_bytes(term)) # TODO: Optimize these two functions -def varint_unpack_py3(term): +cdef varint_unpack_py3(bytes term): cdef int64_t one = 1L val = int(''.join("%02x" % i for i in term), 16) @@ -186,36 +115,9 @@ def varint_unpack_py3(term): val -= one << (len(term) * 8) return val -def varint_unpack_py2(term): # noqa +cdef varint_unpack_py2(bytes term): # noqa cdef int64_t one = 1L val = int(term.encode('hex'), 16) if (ord(term[0]) & 128) != 0: val = val - (one << (len(term) * 8)) return val - - -def bitlength(n): - # return int(math.log2(n)) + 1 - bitlen = 0 - while n > 0: - n >>= 1 - bitlen += 1 - return bitlen - - -def varint_pack(big): - pos = True - if big == 0: - return b'\x00' - if big < 0: - bytelength = bitlength(abs(big) - 1) // 8 + 1 - big = (1 << bytelength * 8) + big - pos = False - revbytes = bytearray() - while big > 0: - revbytes.append(big & 0xff) - big >>= 8 - if pos and revbytes[-1] & 0x80: - revbytes.append(0) - revbytes.reverse() - return six.binary_type(revbytes) diff --git a/cassandra/cython_utils.pyx b/cassandra/cython_utils.pyx index de87c1e0..a660f3ee 100644 --- a/cassandra/cython_utils.pyx +++ b/cassandra/cython_utils.pyx @@ -16,13 +16,14 @@ from cpython.datetime cimport ( import datetime import sys -DATETIME_EPOC = datetime.datetime(1970, 1, 1) - -assert sys.byteorder in ('little', 'big') -is_little_endian = sys.byteorder == 'little' +cdef bint is_little_endian +from cassandra.util import is_little_endian import_datetime() +DATETIME_EPOC = datetime.datetime(1970, 1, 1) + + cdef datetime_from_timestamp(double timestamp): cdef int seconds = timestamp cdef int microseconds = ( (timestamp * 1000000)) % 1000000 diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 47028bf1..6c2afa22 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -42,7 +42,7 @@ cdef class DesDecimalType(Deserializer): slice_buffer(buf, &varint_buf, 4, buf.size - 4) scale = int32_unpack(buf) - unscaled = varint_unpack(to_bytes(&varint_buf)) + unscaled = varint_unpack(&varint_buf) return Decimal('%de%d' % (unscaled, -scale)) @@ -93,7 +93,7 @@ cdef class DesInt32Type(Deserializer): cdef class DesIntegerType(Deserializer): cdef deserialize(self, Buffer *buf, int 
protocol_version): - return varint_unpack(to_bytes(buf)) + return varint_unpack(buf) cdef class DesInetAddressType(Deserializer): diff --git a/cassandra/numpyparser.pyx b/cassandra/numpyparser.pyx index 89bf18da..bfde839e 100644 --- a/cassandra/numpyparser.pyx +++ b/cassandra/numpyparser.pyx @@ -75,7 +75,6 @@ cdef class NumpyParser(ColumnParser): arrays = [make_native_byteorder(arr) for arr in arrays] result = dict(zip(desc.colnames, arrays)) - # return pd.DataFrame(result) return result diff --git a/cassandra/typecodes.py b/cassandra/typecodes.py index 651c58d7..2f0ce8f5 100644 --- a/cassandra/typecodes.py +++ b/cassandra/typecodes.py @@ -3,8 +3,8 @@ Module with constants for Cassandra type codes. These constants are useful for - a) mapping messages to cqltypes (cassandra/cqltypes.py) - b) optimizezd dispatching for (de)serialization (cassandra/encoding.py) + a) mapping messages to cqltypes (cassandra/cqltypes.py) + b) optimized dispatching for (de)serialization (cassandra/encoding.py) Type codes are repeated here from the Cassandra binary protocol specification: diff --git a/tests/unit/cython/utils.py b/tests/unit/cython/utils.py index 9f0a5a87..c493e17b 100644 --- a/tests/unit/cython/utils.py +++ b/tests/unit/cython/utils.py @@ -11,7 +11,7 @@ def cyimport(import_path): (and skip any relevant tests). """ try: - return __import__(import_path, fromlist=True) + return __import__(import_path, fromlist=[True]) except ImportError: if HAVE_CYTHON: raise From 2a568d1ebb967b406ebd63a8414a3e527470c6da Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 11 Aug 2015 21:09:53 +0100 Subject: [PATCH 62/70] Use underscore to break up long module names --- .../{numpyparser.pyx => numpy_parser.pyx} | 0 cassandra/{objparser.pyx => obj_parser.pyx} | 0 cassandra/protocol.py | 18 +++++++++--------- cassandra/{rowparser.pyx => row_parser.pyx} | 0 cassandra/{typecodes.pxd => type_codes.pxd} | 0 cassandra/{typecodes.py => type_codes.py} | 0 6 files changed, 9 insertions(+), 9 deletions(-) rename cassandra/{numpyparser.pyx => numpy_parser.pyx} (100%) rename cassandra/{objparser.pyx => obj_parser.pyx} (100%) rename cassandra/{rowparser.pyx => row_parser.pyx} (100%) rename cassandra/{typecodes.pxd => type_codes.pxd} (100%) rename cassandra/{typecodes.py => type_codes.py} (100%) diff --git a/cassandra/numpyparser.pyx b/cassandra/numpy_parser.pyx similarity index 100% rename from cassandra/numpyparser.pyx rename to cassandra/numpy_parser.pyx diff --git a/cassandra/objparser.pyx b/cassandra/obj_parser.pyx similarity index 100% rename from cassandra/objparser.pyx rename to cassandra/obj_parser.pyx diff --git a/cassandra/protocol.py b/cassandra/protocol.py index 5ebbfa5c..25311911 100644 --- a/cassandra/protocol.py +++ b/cassandra/protocol.py @@ -22,7 +22,7 @@ import six from six.moves import range import io -from cassandra import typecodes +from cassandra import type_codes from cassandra import (Unavailable, WriteTimeout, ReadTimeout, WriteFailure, ReadFailure, FunctionFailure, AlreadyExists, InvalidRequest, Unauthorized, @@ -548,7 +548,7 @@ class ResultMessage(_MessageType): paging_state = None # Names match type name in module scope. 
Most are imported from cassandra.cqltypes (except CUSTOM_TYPE) - type_codes = _cqltypes_by_code = dict((v, globals()[k]) for k, v in typecodes.__dict__.items() if not k.startswith('_')) + type_codes = _cqltypes_by_code = dict((v, globals()[k]) for k, v in type_codes.__dict__.items() if not k.startswith('_')) _FLAGS_GLOBAL_TABLES_SPEC = 0x0001 _HAS_MORE_PAGES_FLAG = 0x0002 @@ -1001,20 +1001,20 @@ def cython_protocol_handler(colparser): There are three Cython-based protocol handlers (least to most performant): - 1. objparser.ListParser + 1. obj_parser.ListParser this parser decodes result messages into a list of tuples - 2. objparser.LazyParser + 2. obj_parser.LazyParser this parser decodes result messages lazily by returning an iterator - 3. numpyparser.NumPyParser + 3. numpy_parser.NumPyParser this parser decodes result messages into NumPy arrays - The default is to use objparser.ListParser + The default is to use obj_parser.ListParser """ # TODO: It may be cleaner to turn ProtocolHandler and ResultMessage into # TODO: instances and use methods instead of class methods - from cassandra.rowparser import make_recv_results_rows + from cassandra.row_parser import make_recv_results_rows class FastResultMessage(ResultMessage): """ @@ -1038,7 +1038,7 @@ def cython_protocol_handler(colparser): if HAVE_CYTHON: - from cassandra.objparser import ListParser, LazyParser + from cassandra.obj_parser import ListParser, LazyParser ProtocolHandler = cython_protocol_handler(ListParser()) LazyProtocolHandler = cython_protocol_handler(LazyParser()) else: @@ -1047,7 +1047,7 @@ else: if HAVE_CYTHON and HAVE_NUMPY: - from cassandra.numpyparser import NumpyParser + from cassandra.numpy_parser import NumpyParser NumpyProtocolHandler = cython_protocol_handler(NumpyParser()) else: NumpyProtocolHandler = None diff --git a/cassandra/rowparser.pyx b/cassandra/row_parser.pyx similarity index 100% rename from cassandra/rowparser.pyx rename to cassandra/row_parser.pyx diff --git a/cassandra/typecodes.pxd b/cassandra/type_codes.pxd similarity index 100% rename from cassandra/typecodes.pxd rename to cassandra/type_codes.pxd diff --git a/cassandra/typecodes.py b/cassandra/type_codes.py similarity index 100% rename from cassandra/typecodes.py rename to cassandra/type_codes.py From 9240c71c02c27a6f20c73870a92ccdafab238df7 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 11 Aug 2015 21:20:45 +0100 Subject: [PATCH 63/70] Add license to top of new files --- cassandra/buffer.pxd | 14 ++++++++++++++ cassandra/bytesio.pxd | 14 ++++++++++++++ cassandra/bytesio.pyx | 14 +++++++++++++- cassandra/cython_utils.pyx | 14 ++++++++++++++ cassandra/deserializers.pxd | 14 +++++++++++++- cassandra/deserializers.pyx | 15 ++++++++++++++- cassandra/ioutils.pyx | 14 ++++++++++++++ cassandra/numpy_parser.pyx | 14 +++++++++++++- cassandra/obj_parser.pyx | 14 ++++++++++++++ cassandra/parsing.pxd | 14 ++++++++++++++ cassandra/parsing.pyx | 14 ++++++++++++++ cassandra/protocol.py | 2 +- cassandra/row_parser.pyx | 14 +++++++++++++- cassandra/tuple.pxd | 14 ++++++++++++++ cassandra/type_codes.pxd | 14 ++++++++++++++ cassandra/util.py | 14 ++++++++++++++ tests/unit/cython/__init__.py | 14 ++++++++++++++ tests/unit/cython/bytesio_testhelper.pyx | 14 ++++++++++++++ tests/unit/cython/test_bytesio.py | 14 ++++++++++++++ tests/unit/cython/utils.py | 16 +++++++++++++++- 20 files changed, 264 insertions(+), 7 deletions(-) diff --git a/cassandra/buffer.pxd b/cassandra/buffer.pxd index 542cb181..2f40ced0 100644 --- a/cassandra/buffer.pxd +++ 
b/cassandra/buffer.pxd @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Simple buffer data structure that provides a view on existing memory (e.g. from a bytes object). This memory must stay alive while the diff --git a/cassandra/bytesio.pxd b/cassandra/bytesio.pxd index 64bbdcca..2bcda361 100644 --- a/cassandra/bytesio.pxd +++ b/cassandra/bytesio.pxd @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + cdef class BytesIOReader: cdef bytes buf cdef char *buf_ptr diff --git a/cassandra/bytesio.pyx b/cassandra/bytesio.pyx index eb81c2fe..68a15baf 100644 --- a/cassandra/bytesio.pyx +++ b/cassandra/bytesio.pyx @@ -1,4 +1,16 @@ -# -- cython: profile=True +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. cdef class BytesIOReader: """ diff --git a/cassandra/cython_utils.pyx b/cassandra/cython_utils.pyx index a660f3ee..1d16d47d 100644 --- a/cassandra/cython_utils.pyx +++ b/cassandra/cython_utils.pyx @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Duplicate module of util.py, with some accelerated functions used for deserialization. diff --git a/cassandra/deserializers.pxd b/cassandra/deserializers.pxd index 015fda37..26b4429a 100644 --- a/cassandra/deserializers.pxd +++ b/cassandra/deserializers.pxd @@ -1,4 +1,16 @@ -# -- cython: profile=True +# Copyright 2013-2015 DataStax, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from cassandra.buffer cimport Buffer diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx index 6c2afa22..54ce1daf 100644 --- a/cassandra/deserializers.pyx +++ b/cassandra/deserializers.pyx @@ -1,4 +1,17 @@ -# -- cython: profile=True +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from libc.stdint cimport int32_t, uint16_t diff --git a/cassandra/ioutils.pyx b/cassandra/ioutils.pyx index 1a11068c..c38b311a 100644 --- a/cassandra/ioutils.pyx +++ b/cassandra/ioutils.pyx @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + include 'cython_marshal.pyx' from cassandra.buffer cimport Buffer, from_ptr_and_size diff --git a/cassandra/numpy_parser.pyx b/cassandra/numpy_parser.pyx index bfde839e..6702cfcc 100644 --- a/cassandra/numpy_parser.pyx +++ b/cassandra/numpy_parser.pyx @@ -1,4 +1,16 @@ -# -- cython: profile=True +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ This module provider an optional protocol parser that returns diff --git a/cassandra/obj_parser.pyx b/cassandra/obj_parser.pyx index 670f1b4a..8aa5b394 100644 --- a/cassandra/obj_parser.pyx +++ b/cassandra/obj_parser.pyx @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + include "ioutils.pyx" from cassandra.bytesio cimport BytesIOReader diff --git a/cassandra/parsing.pxd b/cassandra/parsing.pxd index 9daecad9..278c6e71 100644 --- a/cassandra/parsing.pxd +++ b/cassandra/parsing.pxd @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from cassandra.bytesio cimport BytesIOReader from cassandra.deserializers cimport Deserializer diff --git a/cassandra/parsing.pyx b/cassandra/parsing.pyx index c9afd4b5..c44d7f5a 100644 --- a/cassandra/parsing.pyx +++ b/cassandra/parsing.pyx @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Module containing the definitions and declarations (parsing.pxd) for parsers. """ diff --git a/cassandra/protocol.py b/cassandra/protocol.py index 25311911..3cb13351 100644 --- a/cassandra/protocol.py +++ b/cassandra/protocol.py @@ -894,7 +894,7 @@ class ProtocolHandler(object): result decoding implementations. """ - @classmethod + @classmethod def encode_message(cls, msg, stream_id, protocol_version, compressor): """ Encodes a message using the specified frame parameters, and compressor diff --git a/cassandra/row_parser.pyx b/cassandra/row_parser.pyx index 1c855769..fc7bce15 100644 --- a/cassandra/row_parser.pyx +++ b/cassandra/row_parser.pyx @@ -1,4 +1,16 @@ -# -- cython: profile=True +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from cassandra.parsing cimport ParseDesc, ColumnParser from cassandra.deserializers import make_deserializers diff --git a/cassandra/tuple.pxd b/cassandra/tuple.pxd index 185e8364..746205e2 100644 --- a/cassandra/tuple.pxd +++ b/cassandra/tuple.pxd @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from cpython.tuple cimport ( PyTuple_New, # Return value: New reference. diff --git a/cassandra/type_codes.pxd b/cassandra/type_codes.pxd index b0405284..90f29bc9 100644 --- a/cassandra/type_codes.pxd +++ b/cassandra/type_codes.pxd @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + cdef enum: CUSTOM_TYPE AsciiType diff --git a/cassandra/util.py b/cassandra/util.py index 0e8a818b..c71822c2 100644 --- a/cassandra/util.py +++ b/cassandra/util.py @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import with_statement import calendar import datetime diff --git a/tests/unit/cython/__init__.py b/tests/unit/cython/__init__.py index e69de29b..e4b89e5f 100644 --- a/tests/unit/cython/__init__.py +++ b/tests/unit/cython/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ diff --git a/tests/unit/cython/bytesio_testhelper.pyx b/tests/unit/cython/bytesio_testhelper.pyx index 7f898c4c..d557c037 100644 --- a/tests/unit/cython/bytesio_testhelper.pyx +++ b/tests/unit/cython/bytesio_testhelper.pyx @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from cassandra.bytesio cimport BytesIOReader def test_read1(assert_equal, assert_raises): diff --git a/tests/unit/cython/test_bytesio.py b/tests/unit/cython/test_bytesio.py index 65cc463a..2dbf1311 100644 --- a/tests/unit/cython/test_bytesio.py +++ b/tests/unit/cython/test_bytesio.py @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from tests.unit.cython.utils import cyimport, cythontest bytesio_testhelper = cyimport('tests.unit.cython.bytesio_testhelper') diff --git a/tests/unit/cython/utils.py b/tests/unit/cython/utils.py index c493e17b..788212ac 100644 --- a/tests/unit/cython/utils.py +++ b/tests/unit/cython/utils.py @@ -1,3 +1,17 @@ +# Copyright 2013-2015 DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from cassandra.cython_deps import HAVE_CYTHON, HAVE_NUMPY try: @@ -21,4 +35,4 @@ def cyimport(import_path): # @cythontest # def test_something(self): ... 
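# A companion sketch for the numpytest decorator defined just below; the
# keyspace/table names and the use of a live session here are assumptions for
# illustration, not part of this patch:
import unittest

from tests.unit.cython.utils import numpytest

class NumpyProtocolHandlerTest(unittest.TestCase):
    @numpytest  # skipped unless the driver was built with Cython and NumPy is installed
    def test_numpy_results(self):
        from cassandra.cluster import Cluster
        from cassandra.protocol import NumpyProtocolHandler
        session = Cluster().connect('test_keyspace')
        session.client_protocol_handler = NumpyProtocolHandler
        rows = session.execute("SELECT * FROM test_table")
        # with this handler, each parsed result page is a dict mapping column
        # names to NumPy arrays (see NumpyParser.parse_rows)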
cythontest = unittest.skipUnless(HAVE_CYTHON, 'Cython is not available') -numpytest = unittest.skipUnless(HAVE_CYTHON and HAVE_NUMPY, 'NumPy is not available') \ No newline at end of file +numpytest = unittest.skipUnless(HAVE_CYTHON and HAVE_NUMPY, 'NumPy is not available') From 02be9f441ae4731d3247120084b9b2d3003732ee Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 11 Aug 2015 21:30:32 +0100 Subject: [PATCH 64/70] Add some API documentation for Cython-based deserializers --- cassandra/protocol.py | 2 +- docs/api/cassandra/protocol.rst | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cassandra/protocol.py b/cassandra/protocol.py index 3cb13351..25311911 100644 --- a/cassandra/protocol.py +++ b/cassandra/protocol.py @@ -894,7 +894,7 @@ class ProtocolHandler(object): result decoding implementations. """ - @classmethod + @classmethod def encode_message(cls, msg, stream_id, protocol_version, compressor): """ Encodes a message using the specified frame parameters, and compressor diff --git a/docs/api/cassandra/protocol.rst b/docs/api/cassandra/protocol.rst index cabf2b59..0d4df101 100644 --- a/docs/api/cassandra/protocol.rst +++ b/docs/api/cassandra/protocol.rst @@ -24,3 +24,17 @@ See :meth:`.Session.execute`, ::meth:`.Session.execute_async`, :attr:`.ResponseF .. automethod:: encode_message .. automethod:: decode_message + +Faster Deserialization +---------------------- +When python-driver is compiled with Cython, it uses a Cython-based deserialization path +to deserialize messages. There are two additional ProtocolHandler classes that can be +used to deserialize response messages: the first is ``LazyProtocolHandler`` and the +second is ``NumpyProtocolHandler``.They can be used as follows: + +.. code:: python + + from cassandra.protocol import NumpyProtocolHandler, LazyProtocolHandler + s.client_protocol_handler = LazyProtocolHandler # for a result iterator + s.client_protocol_handler = NumpyProtocolHandler # for a dict of NumPy arrays as result + From 0924df80d8a8bbff9c69cef3feb6840a34d9b8e3 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 11 Aug 2015 21:34:50 +0100 Subject: [PATCH 65/70] Also include .pxd files in sdist --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 7a686a6b..4e072d1c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include setup.py README.rst MANIFEST.in LICENSE ez_setup.py include cassandra/*.pyx +include cassandra/*.pxd From c821333fb721ffb96e1456604c985c584aa02cc0 Mon Sep 17 00:00:00 2001 From: Mark Florisson Date: Tue, 11 Aug 2015 21:38:00 +0100 Subject: [PATCH 66/70] Re-enable cythonized pure-python modules --- setup.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index ce09a23e..bfb9176f 100644 --- a/setup.py +++ b/setup.py @@ -262,14 +262,14 @@ if "--no-libev" not in sys.argv and not is_windows: if "--no-cython" not in sys.argv: try: from Cython.Build import cythonize - # cython_candidates = ['cluster', 'concurrent', 'connection', 'cqltypes', 'metadata', - # 'pool', 'protocol', 'query', 'util'] - # compile_args = [] if is_windows else ['-Wno-unused-function'] - # extensions.extend(cythonize( - # [Extension('cassandra.%s' % m, ['cassandra/%s.py' % m], - # extra_compile_args=compile_args) - # for m in cython_candidates], - # exclude_failures=True)) + cython_candidates = ['cluster', 'concurrent', 'connection', 'cqltypes', 'metadata', + 'pool', 'protocol', 'query', 'util'] + compile_args = [] if is_windows 
+        extensions.extend(cythonize(
+            [Extension('cassandra.%s' % m, ['cassandra/%s.py' % m],
+                       extra_compile_args=compile_args)
+             for m in cython_candidates],
+            exclude_failures=True))
         extensions.extend(cythonize("cassandra/*.pyx"))
         extensions.extend(cythonize("tests/unit/cython/*.pyx"))
     except ImportError:

From dd76d15b5db5d24214e5e7457bff6fd4b6841599 Mon Sep 17 00:00:00 2001
From: Mark Florisson
Date: Tue, 11 Aug 2015 21:41:48 +0100
Subject: [PATCH 67/70] Remove leftover TODO comments

---
 cassandra/protocol.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cassandra/protocol.py b/cassandra/protocol.py
index 25311911..4ebeab5d 100644
--- a/cassandra/protocol.py
+++ b/cassandra/protocol.py
@@ -1012,8 +1012,6 @@ def cython_protocol_handler(colparser):
     The default is to use obj_parser.ListParser
     """
 
-    # TODO: It may be cleaner to turn ProtocolHandler and ResultMessage into
-    # TODO: instances and use methods instead of class methods
     from cassandra.row_parser import make_recv_results_rows
 
     class FastResultMessage(ResultMessage):

From fade6487e9d859e087be067d7a375b82bfa3eeca Mon Sep 17 00:00:00 2001
From: Mark Florisson
Date: Wed, 12 Aug 2015 10:40:08 +0100
Subject: [PATCH 68/70] Add cython unit tests to sdist

---
 MANIFEST.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MANIFEST.in b/MANIFEST.in
index 4e072d1c..e3cb20eb 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
 include setup.py README.rst MANIFEST.in LICENSE ez_setup.py
 include cassandra/*.pyx
 include cassandra/*.pxd
+include tests/unit/cython/*.pyx

From 1f985cf1a1e2f821e57a44f2b2a2e83b93a03c46 Mon Sep 17 00:00:00 2001
From: Mark Florisson
Date: Wed, 12 Aug 2015 12:13:09 +0100
Subject: [PATCH 69/70] Fix typo in import

---
 cassandra/cython_deps.py    | 2 +-
 cassandra/deserializers.pyx | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/cassandra/cython_deps.py b/cassandra/cython_deps.py
index fdd15464..5cc86fe7 100644
--- a/cassandra/cython_deps.py
+++ b/cassandra/cython_deps.py
@@ -1,5 +1,5 @@
 try:
-    from cassandra.rowparser import make_recv_results_rows
+    from cassandra.row_parser import make_recv_results_rows
     HAVE_CYTHON = True
 except ImportError:
     HAVE_CYTHON = False
diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx
index 54ce1daf..ca5dc6d8 100644
--- a/cassandra/deserializers.pyx
+++ b/cassandra/deserializers.pyx
@@ -164,7 +164,8 @@ cdef class DesTimeType(Deserializer):
 
 cdef class DesUTF8Type(Deserializer):
     cdef deserialize(self, Buffer *buf, int protocol_version):
-        return to_bytes(buf).decode('utf8')
+        cdef val = to_bytes(buf)
+        return val.decode('utf8')
 
 
 cdef class DesVarcharType(DesUTF8Type):
@@ -502,6 +503,7 @@ cpdef Deserializer find_deserializer(cqltype):
 def obj_array(list objs):
     """Create a (Cython) array of objects given a list of objects"""
     cdef object[:] arr
+    cdef Py_ssize_t i
     arr = cython_array(shape=(len(objs),), itemsize=sizeof(void *), format="O")
     # arr[:] = objs # This does not work (segmentation faults)
     for i, obj in enumerate(objs):

From ccc7c8b19ee8895ebf85050a7d37e644c71e6b83 Mon Sep 17 00:00:00 2001
From: Mark Florisson
Date: Wed, 12 Aug 2015 14:28:25 +0100
Subject: [PATCH 70/70] Reduce some noise in valgrind

---
 cassandra/deserializers.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cassandra/deserializers.pyx b/cassandra/deserializers.pyx
index ca5dc6d8..e2f23284 100644
--- a/cassandra/deserializers.pyx
+++ b/cassandra/deserializers.pyx
@@ -31,7 +31,6 @@ from cassandra import util
 
 cdef bint PY2 = six.PY2
 
-
 cdef class Deserializer:
     """Cython-based deserializer class for a cqltype"""
 
@@ -468,12 +467,14 @@ def make_deserializers(cqltypes):
     return obj_array([find_deserializer(ct) for ct in cqltypes])
 
 
+cdef dict classes = globals()
+
 cpdef Deserializer find_deserializer(cqltype):
     """Find a deserializer for a cqltype"""
     name = 'Des' + cqltype.__name__
 
     if name in globals():
-        cls = globals()[name]
+        cls = classes[name]
    elif issubclass(cqltype, cqltypes.ListType):
        cls = DesListType
    elif issubclass(cqltype, cqltypes.SetType):
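
As a usage illustration for the handlers documented in patch 64 above, the
``client_protocol_handler`` assignment fits into a session setup roughly as
follows. This is a minimal sketch and not part of the patch series: the
contact point, keyspace, and table names are placeholders, and it assumes the
driver was built with Cython so that ``LazyProtocolHandler`` is available.

.. code:: python

    from cassandra.cluster import Cluster
    from cassandra.protocol import LazyProtocolHandler

    cluster = Cluster(['127.0.0.1'])           # placeholder contact point
    session = cluster.connect('my_keyspace')   # placeholder keyspace

    # Rows are decoded lazily while the result is iterated, rather than all
    # at once when the response frame is parsed.
    session.client_protocol_handler = LazyProtocolHandler
    for row in session.execute('SELECT * FROM my_table'):  # placeholder table
        print(row)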