Merge pull request #127 from GregBowyer/master
Make it possible to read and write xerial snappy
This commit is contained in:
@@ -1,5 +1,9 @@
|
|||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
import gzip
|
import gzip
|
||||||
|
import struct
|
||||||
|
|
||||||
|
# Field values of the xerial v1 stream header, in order: the marker byte
# (-126), the six magic characters 'SNAPPY', a zero pad byte, and two
# integers that are both 1 (the version and compat fields).
_XERIAL_V1_HEADER = (-126, 'S', 'N', 'A', 'P', 'P', 'Y', 0, 1, 1)
|
||||||
|
# struct format characters matching _XERIAL_V1_HEADER field by field: one
# signed byte, six chars, one unsigned byte, two ints. It is used with a '!'
# prefix (network byte order, standard sizes), which makes the header 16 bytes.
_XERIAL_V1_FORMAT = 'bccccccBii'
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import snappy
|
import snappy
|
||||||
@@ -36,13 +40,101 @@ def gzip_decode(payload):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def snappy_encode(payload):
|
def snappy_encode(payload, xerial_compatible=False, xerial_blocksize=32 * 1024):
    """Compress ``payload`` with snappy.

    When ``xerial_compatible`` is false (the default) the payload is
    compressed as one raw snappy buffer.  When it is true the output is
    framed in the blocked stream format written by the xerial snappy
    library:

    +-------------+------------+--------------+------------+--------------+
    |   Header    | Block1 len | Block1 data  | Blockn len | Blockn data  |
    |-------------+------------+--------------+------------+--------------|
    |  16 bytes   |  BE int32  | snappy bytes |  BE int32  | snappy bytes |
    +-------------+------------+--------------+------------+--------------+

    ``xerial_blocksize`` is the amount of *uncompressed* data handed to
    snappy for each block (32k is the default in the xerial library).  The
    block length written to the stream is the number of *compressed* bytes
    that follow it; it is normally smaller than the block size, although
    incompressible input can come out slightly larger.

    An empty payload in xerial mode yields just the 16 byte header.

    Raises:
        NotImplementedError: if the snappy module is not available.
        ValueError: if ``xerial_compatible`` is set and ``xerial_blocksize``
            is not a positive number.
    """
    if not _has_snappy:
        raise NotImplementedError("Snappy codec is not available")

    if not xerial_compatible:
        return snappy.compress(payload)

    # A zero step makes xrange raise and a negative step makes it yield
    # nothing, which would silently drop the whole payload.
    if xerial_blocksize <= 0:
        raise ValueError("xerial_blocksize must be a positive integer")

    # '!' selects network byte order with standard sizes and no padding, so
    # packing the whole header at once gives the same 16 bytes as packing it
    # field by field.
    parts = [struct.pack('!' + _XERIAL_V1_FORMAT, *_XERIAL_V1_HEADER)]

    for start in xrange(0, len(payload), xerial_blocksize):
        block = snappy.compress(payload[start:start + xerial_blocksize])
        parts.append(struct.pack('!i', len(block)))
        parts.append(block)

    return b''.join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_xerial_stream(payload):
    """Return True if ``payload`` starts with the xerial blocked-stream header.

    The blocking mode of the xerial snappy library writes a magic header of
    the format:

    +--------+--------------+------------+---------+--------+
    | Marker | Magic String | Null / Pad | Version | Compat |
    |--------+--------------+------------+---------+--------|
    |  byte  |   c-string   |    byte    |  int32  | int32  |
    |--------+--------------+------------+---------+--------|
    |  -126  |   'SNAPPY'   |     \0     |    1    |   1    |
    +--------+--------------+------------+---------+--------+

    The pad appears to be there to ensure that SNAPPY is a valid cstring.

    The version is the version of this format as written by xerial; in the
    wild this is currently 1, so only v1 is supported.

    Compat claims the minimum supported version that can read a xerial
    block stream; presently in the wild this is 1.

    A payload consisting of exactly the 16 byte header (an empty blocked
    stream) is detected as xerial as well.
    """
    # Anything shorter than the header cannot be a xerial stream.
    if len(payload) < 16:
        return False

    # Slice first so only the 16 header bytes are copied, not the payload.
    header = struct.unpack('!' + _XERIAL_V1_FORMAT, bytes(payload[:16]))
    return header == _XERIAL_V1_HEADER
|
||||||
|
|
||||||
|
|
||||||
def snappy_decode(payload):
|
def snappy_decode(payload):
    """Decompress snappy data, either raw or in xerial blocked format.

    If ``payload`` begins with the xerial stream header, the 16 byte header
    is skipped and each following block (a big-endian int32 length followed
    by that many snappy bytes) is decompressed in turn; the decompressed
    blocks are concatenated and returned.  Otherwise the payload is passed
    to ``snappy.decompress`` as a single raw buffer.

    Raises:
        NotImplementedError: if the snappy module is not available.
    """
    if not _has_snappy:
        raise NotImplementedError("Snappy codec is not available")

    if not _detect_xerial_stream(payload):
        return snappy.decompress(payload)

    # TODO ? Should become a fileobj ?
    body = buffer(payload[16:])
    total = len(body)
    chunks = []
    cursor = 0

    while cursor < total:
        # Read the length in place via the offset argument rather than
        # slicing, which would copy the rest of the stream for every block.
        block_size = struct.unpack_from('!i', body, cursor)[0]
        # Skip the block size
        cursor += 4
        end = cursor + block_size
        chunks.append(snappy.decompress(body[cursor:end]))
        cursor = end

    return b''.join(chunks)
|
||||||
|
@@ -70,6 +70,49 @@ class TestCodec(unittest.TestCase):
|
|||||||
s2 = snappy_decode(snappy_encode(s1))
|
s2 = snappy_decode(snappy_encode(s1))
|
||||||
self.assertEquals(s1, s2)
|
self.assertEquals(s1, s2)
|
||||||
|
|
||||||
|
@unittest.skipUnless(has_snappy(), "Snappy not available")
def test_snappy_detect_xerial(self):
    """Only a payload carrying the xerial header is detected as xerial."""
    import kafka as kafka1
    detect = kafka1.codec._detect_xerial_stream

    xerial_payload = b'\x82SNAPPY\x00\x00\x00\x00\x01\x00\x00\x00\x01Some extra bytes'

    # Payloads that must not be mistaken for a xerial stream: empty input,
    # a single byte, a header with the wrong marker, plain snappy output
    # and data shorter than the header.
    not_xerial = (
        b'',
        b'\x00',
        b'\x01SNAPPY\x00\x00\x00\x01\x00\x00\x00\x01',
        snappy_encode('SNAPPY' * 50),
        b'\x01\x02\x03\x04',
    )

    self.assertTrue(detect(xerial_payload))
    for candidate in not_xerial:
        self.assertFalse(detect(candidate))
|
||||||
|
|
||||||
|
@unittest.skipUnless(has_snappy(), "Snappy not available")
def test_snappy_decode_xerial(self):
    """A hand-built two-block xerial stream decodes to both payloads joined."""
    header = b'\x82SNAPPY\x00\x00\x00\x00\x01\x00\x00\x00\x01'
    random_snappy = snappy_encode('SNAPPY' * 50)
    block_len = len(random_snappy)
    random_snappy2 = snappy_encode('XERIAL' * 50)
    block_len2 = len(random_snappy2)

    # Header, then each block prefixed by its big-endian int32 length.
    to_test = (header
               + struct.pack('!i', block_len) + random_snappy
               + struct.pack('!i', block_len2) + random_snappy2)

    self.assertEqual(snappy_decode(to_test), ('SNAPPY' * 50) + ('XERIAL' * 50))
|
||||||
|
|
||||||
|
@unittest.skipUnless(has_snappy(), "Snappy not available")
def test_snappy_encode_xerial(self):
    """Xerial-mode encoding with a 300 byte block size yields two blocks."""
    # Expected stream: the header, then two length-prefixed snappy blocks
    # (0x18 = 24 compressed bytes each).
    to_ensure = (
        b'\x82SNAPPY\x00\x00\x00\x00\x01\x00\x00\x00\x01'
        b'\x00\x00\x00\x18'
        b'\xac\x02\x14SNAPPY\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\x96\x06\x00'
        b'\x00\x00\x00\x18'
        b'\xac\x02\x14XERIAL\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\xfe\x06\x00\x96\x06\x00'
    )

    # 600 bytes of input, so a block size of 300 splits it into the
    # 'SNAPPY' half and the 'XERIAL' half.
    to_test = ('SNAPPY' * 50) + ('XERIAL' * 50)

    compressed = snappy_encode(to_test, xerial_compatible=True, xerial_blocksize=300)
    self.assertEqual(compressed, to_ensure)
|
||||||
|
|
||||||
class TestProtocol(unittest.TestCase):
|
class TestProtocol(unittest.TestCase):
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user