Changes to pymysql to make it work for large result sets. As it is now it

treats the entire result as one packet: the length of the first packet
(result set header packet) is read (almost always "1") and so it only reads
in that many bytes (in actuality it reads all that come in the first TCP
packet, which causes breakages when the result is large).

The result set header packet contains only number of fields in the result
set (normally can be expressed by one byte).  There are then _N_ field
descriptor packets, an EOF packet, then _M_ row data packets, then a final
EOF packet.

Encapsulate the mysql packet in a class (MysqlPacket) that handles reading,
parsing the packet header, and provides an interface to read/advance through
the data buffer for client parsing.  As a result, move almost all of the mysql
packet parsing out of Connection and MySQLResult.

rename Connection._query() to Connection.query() as it clearly needs to be
public (used by Cursor).

some style cleanups (whitespace, docstrings, variable names not conflicting
with built-ins, etc).  added lots of TODOs for things that may need follow-up.
This commit is contained in:
Iceman
2009-12-16 17:26:03 +00:00
parent 07ae36e644
commit 189f1be7fe
2 changed files with 305 additions and 210 deletions

View File

@@ -1,19 +1,22 @@
import sys
import struct
import socket
import sha
import re
# Python implementation of the MySQL client-server protocol
# http://forge.mysql.com/wiki/MySQL_Internals_ClientServer_Protocol
import re
import sha
import socket
import struct
import sys
from pymysql.cursor import Cursor
from pymysql.charset import MBLENGTH
from pymysql.converters import escape_item, encoders, decoders
from pymysql.cursor import Cursor
from pymysql.constants import FIELD_TYPE
from pymysql.constants import SERVER_STATUS
from pymysql.constants.CLIENT_FLAG import *
from pymysql.constants.COMMAND import *
from pymysql.exceptions import raise_mysql_exception, Warning, Error, InterfaceError, DataError, \
DatabaseError, OperationalError, IntegrityError, InternalError, \
NotSupportedError, ProgrammingError
from pymysql.converters import escape_item, encoders, decoders
from pymysql.exceptions import raise_mysql_exception, Warning, Error, \
InterfaceError, DataError, DatabaseError, OperationalError, \
IntegrityError, InternalError, NotSupportedError, ProgrammingError
DEBUG = False
@@ -21,12 +24,11 @@ NULL_COLUMN = 251
UNSIGNED_CHAR_COLUMN = 251
UNSIGNED_SHORT_COLUMN = 252
UNSIGNED_INT24_COLUMN = 253
UNSIGNED_INT32_COLUMN = 254
UNSIGNED_INT64_COLUMN = 254
UNSIGNED_CHAR_LENGTH = 1
UNSIGNED_SHORT_LENGTH = 2
UNSIGNED_INT24_LENGTH = 3
UNSIGNED_INT32_LENGTH = 4
UNSIGNED_INT32_PAD_LENGTH = 4
UNSIGNED_INT64_LENGTH = 8
DEFAULT_CHARSET = 'latin1'
BUFFER_SIZE = 256*256*256-1
@@ -38,7 +40,11 @@ def dump_packet(data):
return data
return '.'
print "packet length %d" % len(data)
print "method call: %s \npacket dump" % sys._getframe(2).f_code.co_name
print "method call[1]: %s" % sys._getframe(1).f_code.co_name
print "method call[2]: %s" % sys._getframe(2).f_code.co_name
print "method call[3]: %s" % sys._getframe(3).f_code.co_name
print "method call[4]: %s" % sys._getframe(4).f_code.co_name
print "method call[5]: %s" % sys._getframe(5).f_code.co_name
print "-" * 88
dump_data = [data[i:i+16] for i in xrange(len(data)) if i%16 == 0]
for d in dump_data:
@@ -70,6 +76,12 @@ def _my_crypt(message1, message2):
def pack_int24(n):
    """Pack the low 24 bits of `n` as a 3-byte little-endian value."""
    low, mid, high = n & 0xFF, (n >> 8) & 0xFF, (n >> 16) & 0xFF
    return struct.pack('BBB', low, mid, high)
def unpack_uint16(n):
    """Unpack the first two bytes of `n` as a little-endian unsigned short."""
    (value,) = struct.unpack('<H', n[0:2])
    return value
# TODO: rename to "uint" to make it clear they're unsigned...
def unpack_int24(n):
    """Unpack the first three bytes of `n` as a little-endian unsigned
    24-bit integer.

    Pads the 3 wire bytes to 4 and lets struct do the decoding instead of
    per-byte bit-shifting (resolves the old TODO); this also works whether
    indexing `n` yields 1-char strings or ints.
    """
    return struct.unpack('<I', n[:3] + b'\x00')[0]
@@ -95,8 +107,213 @@ def defaulterrorhandler(connection, cursor, errorclass, errorvalue):
del connection
raise errorclass, errorvalue
class MysqlPacket(object):
    """Representation of a MySQL response packet.

    Reads the packet in from the network socket, removes the packet header
    and provides an interface for reading/parsing the packet payload.
    """

    def __init__(self, sock):
        # NOTE: parameter renamed from 'socket' to 'sock' so it no longer
        # shadows the imported 'socket' module (positional callers unaffected).
        self.__position = 0
        self.__recv_packet(sock)

    def __recv_packet(self, sock):
        """Parse the packet header and read entire packet payload into buffer."""
        packet_header = sock.recv(4)
        if DEBUG: dump_packet(packet_header)
        packet_length_bin = packet_header[:3]
        self.__packet_number = ord(packet_header[3])
        # TODO: check packet_number is correct (+1 from last packet)
        bin_length = packet_length_bin + '\000'  # pad little-endian number
        bytes_to_read = struct.unpack('<I', bin_length)[0]
        # Loop until the full payload has arrived: a single recv() may return
        # less than requested, which is exactly the large-result-set breakage
        # this class exists to fix.
        payload_buff = []  # TODO: look if cStringIO is markedly better
        while bytes_to_read > 0:
            recv_data = sock.recv(bytes_to_read)
            if DEBUG: dump_packet(recv_data)
            payload_buff.append(recv_data)
            bytes_to_read -= len(recv_data)
        self.__data = ''.join(payload_buff)

    def packet_number(self):
        return self.__packet_number

    def read(self, size):
        """Read the first 'size' bytes in packet and advance cursor past them."""
        result = self.peek(size)
        self.advance(size)
        return result

    def read_all(self):
        """Read all remaining data in the packet.

        (Subsequent read() or peek() will return errors.)
        """
        result = self.__data[self.__position:]
        self.__position = None  # ensure no subsequent read() or peek()
        return result

    def advance(self, length):
        """Advance the cursor in data buffer 'length' bytes."""
        new_position = self.__position + length
        if new_position < 0 or new_position > len(self.__data):
            raise Exception('Invalid advance amount (%s) for cursor. '
                            'Position=%s' % (length, new_position))
        self.__position = new_position

    def rewind(self, position=0):
        """Set the position of the data buffer cursor to 'position'."""
        if position < 0 or position > len(self.__data):
            raise Exception("Invalid position to rewind cursor to: %s." % position)
        self.__position = position

    def peek(self, size):
        """Look at the first 'size' bytes in packet without moving cursor."""
        result = self.__data[self.__position:(self.__position + size)]
        if len(result) != size:
            error = ('Result length not requested length:\n'
                     'Expected=%s. Actual=%s. Position: %s. Data Length: %s'
                     % (size, len(result), self.__position, len(self.__data)))
            if DEBUG:
                print(error)
                self.dump()
            raise AssertionError(error)
        return result

    def get_bytes(self, position, length=1):
        """Get 'length' bytes starting at 'position'.

        Position is start of payload (first four packet header bytes are not
        included) starting at index '0'.

        No error checking is done.  If requesting outside end of buffer
        an empty string (or string shorter than 'length') may be returned!
        """
        return self.__data[position:(position + length)]

    def read_coded_length(self):
        """Read a 'Length Coded' number from the data buffer.

        Length coded numbers can be anywhere from 1 to 9 bytes depending
        on the value of the first byte.
        """
        c = ord(self.read(1))
        if c == NULL_COLUMN:
            return None
        if c < UNSIGNED_CHAR_COLUMN:
            return c
        elif c == UNSIGNED_SHORT_COLUMN:
            return unpack_uint16(self.read(UNSIGNED_SHORT_LENGTH))
        elif c == UNSIGNED_INT24_COLUMN:
            return unpack_int24(self.read(UNSIGNED_INT24_LENGTH))
        elif c == UNSIGNED_INT64_COLUMN:
            # TODO: what was 'longlong'? confirm it wasn't used?
            return unpack_int64(self.read(UNSIGNED_INT64_LENGTH))
        # NOTE(review): c == 255 (error-packet marker) falls through and
        # returns None; callers rely on check_error() having run first.

    def read_length_coded_binary(self):
        """Read a 'Length Coded Binary' from the data buffer.

        A 'Length Coded Binary' consists first of a length coded
        (unsigned, positive) integer represented in 1-9 bytes followed by
        that many bytes of binary data.  (For example "cat" would be "3cat".)
        """
        length = self.read_coded_length()
        if length:  # also returns None for a zero-length value
            return self.read(length)

    def is_ok_packet(self):
        return ord(self.get_bytes(0)) == 0

    def is_eof_packet(self):
        return ord(self.get_bytes(0)) == 254  # 'fe'

    def is_resultset_packet(self):
        field_count = ord(self.get_bytes(0))
        return field_count >= 1 and field_count <= 250

    def is_error_packet(self):
        return ord(self.get_bytes(0)) == 255

    def check_error(self):
        """Raise the appropriate mysql exception if this is an error packet."""
        if self.is_error_packet():
            self.rewind()
            self.advance(1)  # field_count == error (we already know that)
            errno = unpack_uint16(self.read(2))
            if DEBUG: print("errno = %d" % errno)
            raise_mysql_exception(self.__data)

    def dump(self):
        dump_packet(self.__data)
class FieldDescriptorPacket(MysqlPacket):
    """A MysqlPacket that represents a specific column's metadata in the result.

    Parsing is automatically done and the results are exported via public
    attributes on the class such as: db, table_name, name, length, type_code.
    """

    def __init__(self, *args):
        MysqlPacket.__init__(self, *args)
        self.__parse_field_descriptor()

    def __parse_field_descriptor(self):
        """Parse the 'Field Descriptor' (Metadata) packet.

        This is compatible with MySQL 4.1+ (not compatible with MySQL 4.0).
        """
        self.catalog = self.read_length_coded_binary()
        self.db = self.read_length_coded_binary()
        self.table_name = self.read_length_coded_binary()
        self.org_table = self.read_length_coded_binary()
        self.name = self.read_length_coded_binary()
        self.org_name = self.read_length_coded_binary()
        self.advance(1)  # non-null filler
        # NOTE(review): '<h'/'<i' decode these fields as *signed*; the wire
        # values are presumably unsigned -- confirm before relying on values
        # outside the signed range.
        self.charsetnr = struct.unpack('<h', self.read(2))[0]
        self.length = struct.unpack('<i', self.read(4))[0]
        self.type_code = ord(self.read(1))
        flags = struct.unpack('<h', self.read(2))
        # TODO: what is going on here with this flag parsing???
        self.flags = int(("%02X" % flags)[1:], 16)
        self.scale = ord(self.read(1))  # "decimals"
        self.advance(2)  # filler (always 0x00)
        # 'default' is a length coded binary and is still in the buffer?
        # not used for normal result sets...

    def description(self):
        """Provides a 7-item tuple compatible with the Python PEP249 DB Spec."""
        # 'null_ok' -- can this be True/False rather than 1/0?
        null_ok = 1 if self.flags % 2 == 0 else 0
        return (self.name,
                self.type_code,
                None,                       # 'display size'
                self.get_column_length(),   # 'internal_size'
                self.get_column_length(),   # 'precision'  # TODO: why!?!?
                self.scale,
                null_ok)

    def get_column_length(self):
        if self.type_code == FIELD_TYPE.VAR_STRING:
            mblen = MBLENGTH.get(self.charsetnr, 1)
            return self.length / mblen
        return self.length

    def __str__(self):
        return ('%s %s.%s.%s, type=%s'
                % (self.__class__, self.db, self.table_name, self.name,
                   self.type_code))
class Connection(object):
"""Representation of a socket with a mysql server."""
errorhandler = defaulterrorhandler
def __init__(self, *args, **kwargs):
@@ -135,15 +352,15 @@ class Connection(object):
def autocommit(self, value):
self._execute_command(COM_QUERY, "SET AUTOCOMMIT = %s" % \
self.escape(value))
self._read_and_check_packet()
self.read_packet()
def commit(self):
self._execute_command(COM_QUERY, "COMMIT")
self._read_and_check_packet()
self.read_packet()
def rollback(self):
self._execute_command(COM_QUERY, "ROLLBACK")
self._read_and_check_packet()
self.read_packet()
def escape(self, obj):
return escape_item(obj)
@@ -159,8 +376,8 @@ class Connection(object):
self.rollback()
else:
self.commit()
def _query(self, sql):
def query(self, sql):
self._execute_command(COM_QUERY, sql)
return self._read_query_result()
@@ -171,7 +388,7 @@ class Connection(object):
sock = self.socket
if charset and self.charset != charset:
self._execute_command(COM_QUERY, "SET NAMES %s" % charset)
self._read_and_check_packet()
self.read_packet()
self.charset = charset
def _connect(self):
@@ -187,24 +404,23 @@ class Connection(object):
self._get_server_information()
self._request_authentication()
def _read_and_check_packet(self):
recv_data = self.socket.recv(BUFFER_SIZE)
if DEBUG: dump_packet(recv_data)
self._check_error(recv_data)
return recv_data
def read_packet(self, packet_type=MysqlPacket):
"""Read an entire "mysql packet" in its entirety from the network
and return a MysqlPacket type that represents the results."""
# TODO: is socket.recv(small_number) significantly slower than
# socket.recv(large_number)? if so, maybe we should buffer
# the socket.recv() (though that obviously makes memory management
# more complicated.
packet = packet_type(self.socket)
packet.check_error()
return packet
def _read_query_result(self):
recv_data = self._read_and_check_packet()
result = MySQLResult(self, recv_data)
result = MySQLResult(self)
result.read()
self._result = result
affected_rows = result.affected_rows
if not result.ok_packet:
affected_rows = len(result.rows)
self._result.affected_rows = affected_rows
return affected_rows
return result.affected_rows
def _send_command(self, command, sql):
send_data = struct.pack('<i', len(sql) + 1) + command + sql
@@ -215,25 +431,6 @@ class Connection(object):
def _execute_command(self, command, sql):
self._send_command(command, sql)
def _check_error(self, recv_data):
field_count = ord(recv_data[4:5])
if field_count == 255:
errno = ord(recv_data[5:6]) + ord(recv_data[6:7]) * 256
if DEBUG: print "errno = %d" % errno
raise_mysql_exception(recv_data)
def _is_ok_packet(self, recv_data):
field_count = ord(recv_data[4:5])
if field_count == 0:
return True
return False
def _is_resultset_packet(self, recv_data):
field_count = ord(recv_data[4:5])
if field_count > 1:
return True
return False
def _request_authentication(self):
sock = self.socket
self._send_authentication()
@@ -256,14 +453,11 @@ class Connection(object):
if DEBUG: dump_packet(data)
sock.send(data)
auth_msg = sock.recv(BUFFER_SIZE)
self._check_auth_packet(auth_msg)
def _check_auth_packet(self, recv_data):
if DEBUG: dump_packet(recv_data)
self._check_error(recv_data)
auth_packet = MysqlPacket(sock)
auth_packet.check_error()
if DEBUG: auth_packet.dump()
def _get_server_information(self):
sock = self.socket
i = 0
@@ -311,182 +505,84 @@ class Connection(object):
ProgrammingError = ProgrammingError
NotSupportedError = NotSupportedError
# TODO: move OK and EOF packet parsing/logic into a proper subclass
# of MysqlPacket like has been done with FieldDescriptorPacket.
class MySQLResult(object):
def __init__(self, connection, data):
def __init__(self, connection):
from weakref import proxy
self.connection = proxy(connection)
self.data = data
self.position = 0
self.affected_rows = None
self.insert_id = None
self.server_status = 0
self.warning_count = 0
self.message = None
self.field_count = 0
self.ok_packet = connection._is_ok_packet(data)
self.description = None
self.rows = None
self.has_next = None
if not self.ok_packet:
self._check_has_more_packet()
def read(self):
if self.ok_packet:
self.first_packet = self.connection.read_packet()
# TODO: use classes for different packet types?
if self.first_packet.is_ok_packet():
self._read_ok_packet()
else:
self._read_result_packet()
self.data = None
def _read_ok_packet(self):
self.position += 5
self.affected_rows = self._get_field_length()
self.insert_id = self._get_field_length()
self.server_status = struct.unpack('H',self.data[self.position:self.position+2])[0]
self.position += 2
self.warning_count = struct.unpack('H',self.data[self.position:self.position+2])[0]
self.position += 2
self.message = self.data[self.position:]
def _check_has_more_packet(self):
packet_len = unpack_int24(self.data[:3])
length = len(self.data) - 4
while length < packet_len:
d = self.connection.socket.recv(BUFFER_SIZE)
length += len(d)
self.data += d
self.first_packet.advance(1) # field_count (always '0')
self.affected_rows = self.first_packet.read_coded_length()
self.insert_id = self.first_packet.read_coded_length()
self.server_status = struct.unpack('H', self.first_packet.read(2))[0]
self.warning_count = struct.unpack('H', self.first_packet.read(2))[0]
self.message = self.first_packet.read_all()
def _read_result_packet(self):
self._get_field_count()
self._get_description()
self.field_count = ord(self.first_packet.read(1))
self._get_descriptions()
self._read_rowdata_packet()
# TODO: implement this as an iteratable so that it is more
# memory efficient and lower-latency to client...
def _read_rowdata_packet(self):
rows = []
not_eof = True
while(not_eof):
row = []
next = ord(self.data[self.position:self.position+1])
if next == 254:
self.position += 3
server_status = struct.unpack('h', self.data[-2:])[0]
self.has_next = server_status & SERVER_STATUS.SERVER_MORE_RESULTS_EXISTS
not_eof = False
else:
for field in self.description:
type_code = field[1]
converter = self.connection.decoders[type_code]
if DEBUG: print "DEBUG: field=" + str(field[0]) + ", type_code=" + str(type_code) + ", converter=" + str(converter)
data = self._seek_and_get_string()
converted = None
if data != None:
converted = converter(data)
row.append(converted)
rows.append(tuple(row))
self.position += 4
"""Read a rowdata packet for each data row in the result set."""
rows = []
while True:
packet = self.connection.read_packet()
if packet.is_eof_packet():
self.warning_count = packet.read(2)
server_status = struct.unpack('h', packet.read(2))[0]
self.has_next = (server_status
& SERVER_STATUS.SERVER_MORE_RESULTS_EXISTS)
break
self.rows = tuple(rows)
if DEBUG: self.rows
row = []
for field in self.fields:
converter = self.connection.decoders[field.type_code]
if DEBUG: print "DEBUG: field=%s, converter=%s" % (field, converter)
data = packet.read_length_coded_binary()
converted = None
if data != None:
converted = converter(data)
row.append(converted)
def _get_field_count(self):
self.position += 4
pos = self.position
count = ord(self.data[pos:pos+1])
self.field_count = count
self.position += 5
rows.append(tuple(row))
def _get_description(self):
data = self.data
pos = self.position
self.affected_rows = len(rows)
self.rows = tuple(rows)
if DEBUG: self.rows
def _get_descriptions(self):
"""Read a column descriptor packet for each column in the result."""
self.fields = []
description = []
for i in xrange(self.field_count):
desc = []
catalog = self._seek_and_get_string()
db = self._seek_and_get_string()
table_name = self._seek_and_get_string()
org_table = self._seek_and_get_string()
name = self._seek_and_get_string()
desc.append(name)
org_name = self._seek_and_get_string()
#filler
self.position += 1
#charsetnr
charsetnr = struct.unpack('<h',
data[self.position:self.position+2])[0]
self.position += 2
#length
length = struct.unpack('<i', data[self.position:self.position+4])
self.position += 4
type = ord(data[self.position:self.position+1])
desc.append(type)
desc.append(None)
self.position += 1
desc.append(self._get_column_length(type, charsetnr, length[0]))
desc.append(self._get_column_length(type, charsetnr, length[0]))
field = self.connection.read_packet(FieldDescriptorPacket)
self.fields.append(field)
description.append(field.description())
#flags
flags = struct.unpack('<h', data[self.position:self.position+2])
flags = int(("%02X" % flags)[1:])
self.position += 2
scale = ord(data[self.position:self.position+1])
desc.append(scale)
self.position += 1
if flags % 2 == 0:
desc.append(1)
else:
desc.append(0)
#filler
self.position += 2
self.position += 4
description.append(tuple(desc))
self.position += 9
eof_packet = self.connection.read_packet()
assert eof_packet.is_eof_packet(), 'Protocol error, expecting EOF'
self.description = tuple(description)
def _get_column_length(self, type, charsetnr, length):
if type == FIELD_TYPE.VAR_STRING:
mblen = MBLENGTH.get(charsetnr, 1)
return length / mblen
return length
def _seek_and_get_string(self):
length = self._get_field_length()
if length :
str = self.data[self.position:self.position+length]
self.position += length
return str
return None
def _get_field_length(self, longlong=False):
data = self.data
pos = self.position
c = ord(data[pos:pos + 1])
self.position += UNSIGNED_CHAR_LENGTH
if c == NULL_COLUMN:
return None
if c < UNSIGNED_CHAR_COLUMN:
return c
elif c == UNSIGNED_SHORT_COLUMN:
length = struct.unpack('<H', data[pos:pos+UNSIGNED_SHORT_LENGTH])
self.position += UNSIGNED_SHORT_LENGTH
return length
elif c == UNSIGNED_INT24_COLUMN:
length = unpack_int24(data[pos:pos+UNSIGNED_INT24_COLUMN])
self.position += UNSIGNED_INT24_LENGTH
return length
else:
length = 0
if longlong:
length = unpack_int64(data[pos:pos+UNSIGNED_INT32_LENGTH*2])
else:
length = unpack_int32(data[pos:pos+UNSIGNED_INT32_LENGTH])
self.position += UNSIGNED_INT32_LENGTH
self.position += UNSIGNED_INT32_PAD_LENGTH
return length

View File

@@ -141,7 +141,7 @@ class Cursor(object):
def _query(self, q):
conn = self._get_db()
self._last_executed = q
conn._query(q)
conn.query(q)
self._do_get_result()
return self.rowcount
@@ -171,4 +171,3 @@ class Cursor(object):
InternalError = InternalError
ProgrammingError = ProgrammingError
NotSupportedError = NotSupportedError