Add 'codec' parameter to Producer

Adds a codec parameter to Producer.__init__ that lets the user choose the
compression codec applied to every message the producer sends.
Patrick Lucas
2014-05-03 11:27:57 -07:00
parent 2415609ce0
commit 671b74ab2e
3 changed files with 49 additions and 24 deletions
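Before the diffs, a minimal usage sketch of the new parameter. This is illustrative only: it assumes a broker reachable at localhost:9092 and the KafkaClient(host, port) constructor of this era, and "my-topic" is a made-up topic name.

    from kafka.client import KafkaClient
    from kafka.producer import SimpleProducer
    from kafka.protocol import CODEC_GZIP

    kafka = KafkaClient("localhost", 9092)

    # Every batch sent through this producer is gzip-compressed.
    producer = SimpleProducer(kafka, codec=CODEC_GZIP)
    producer.send_messages("my-topic", "message 1", "message 2")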

View File

@@ -10,7 +10,10 @@ from multiprocessing import Queue, Process
 
 from kafka.common import ProduceRequest, TopicAndPartition
 from kafka.partitioner import HashedPartitioner
-from kafka.protocol import create_message
+from kafka.protocol import (
+    CODEC_NONE, CODEC_GZIP, CODEC_SNAPPY, ALL_CODECS,
+    create_message, create_gzip_message, create_snappy_message,
+)
 
 log = logging.getLogger("kafka")
@@ -20,7 +23,7 @@ BATCH_SEND_MSG_COUNT = 20
 
 STOP_ASYNC_PRODUCER = -1
 
-def _send_upstream(queue, client, batch_time, batch_size,
+def _send_upstream(queue, client, codec, batch_time, batch_size,
                    req_acks, ack_timeout):
     """
     Listen on the queue for a specified number of messages or till
@@ -61,7 +64,14 @@ def _send_upstream(queue, client, codec, batch_time, batch_size,
 
         # Send collected requests upstream
         reqs = []
-        for topic_partition, messages in msgset.items():
+        for topic_partition, msg in msgset.items():
+            if codec == CODEC_GZIP:
+                messages = [create_gzip_message(msg)]
+            elif codec == CODEC_SNAPPY:
+                messages = [create_snappy_message(msg)]
+            else:
+                messages = [create_message(m) for m in msg]
+
             req = ProduceRequest(topic_partition.topic,
                                  topic_partition.partition,
                                  messages)
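Worth noting about the branch above: with gzip or snappy the whole collected batch collapses into a single wrapper Message whose value is the compressed message set, while the plain codec keeps one Message per payload. A hedged standalone sketch of the same dispatch (the helper name _pack_messages is invented for this illustration):

    from kafka.protocol import (
        CODEC_GZIP, CODEC_SNAPPY,
        create_message, create_gzip_message, create_snappy_message,
    )

    def _pack_messages(codec, payloads):
        # Compressed codecs wrap the batch in one Message; plain codec maps 1:1.
        if codec == CODEC_GZIP:
            return [create_gzip_message(payloads)]
        elif codec == CODEC_SNAPPY:
            return [create_snappy_message(payloads)]
        return [create_message(p) for p in payloads]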
@@ -101,6 +111,7 @@ class Producer(object):
     def __init__(self, client, async=False,
                  req_acks=ACK_AFTER_LOCAL_WRITE,
                  ack_timeout=DEFAULT_ACK_TIMEOUT,
+                 codec=None,
                  batch_send=False,
                  batch_send_every_n=BATCH_SEND_MSG_COUNT,
                  batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL):
@@ -118,11 +129,17 @@ class Producer(object):
         self.req_acks = req_acks
         self.ack_timeout = ack_timeout
 
+        if codec is None:
+            codec = CODEC_NONE
+        assert codec in ALL_CODECS
+        self.codec = codec
+
         if self.async:
             self.queue = Queue()  # Messages are sent through this queue
             self.proc = Process(target=_send_upstream,
                                 args=(self.queue,
                                       self.client.copy(),
+                                      self.codec,
                                       batch_send_every_t,
                                       batch_send_every_n,
                                       self.req_acks,
@@ -138,11 +155,16 @@ class Producer(object):
         """
         if self.async:
             for m in msg:
-                self.queue.put((TopicAndPartition(topic, partition),
-                                create_message(m)))
+                self.queue.put((TopicAndPartition(topic, partition), m))
             resp = []
         else:
-            messages = [create_message(m) for m in msg]
+            if self.codec == CODEC_GZIP:
+                messages = [create_gzip_message(msg)]
+            elif self.codec == CODEC_SNAPPY:
+                messages = [create_snappy_message(msg)]
+            else:
+                messages = [create_message(m) for m in msg]
+
             req = ProduceRequest(topic, partition, messages)
             try:
                 resp = self.client.send_produce_request([req], acks=self.req_acks,
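Also visible above: the async queue now carries raw payloads instead of pre-built Messages, so compression can be applied per batch inside _send_upstream rather than per message at enqueue time. For illustration only:

    # Before: (TopicAndPartition(topic, partition), create_message(m))  # one Message per payload
    # After:  (TopicAndPartition(topic, partition), m)                  # raw payload; codec applied later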
@@ -167,7 +189,7 @@ class Producer(object):
 
 class SimpleProducer(Producer):
     """
-    A simple, round-robbin producer. Each message goes to exactly one partition
+    A simple, round-robin producer. Each message goes to exactly one partition
 
     Params:
     client - The Kafka client instance to use
@@ -184,12 +206,13 @@ class SimpleProducer(Producer):
     def __init__(self, client, async=False,
                  req_acks=Producer.ACK_AFTER_LOCAL_WRITE,
                  ack_timeout=Producer.DEFAULT_ACK_TIMEOUT,
+                 codec=None,
                  batch_send=False,
                  batch_send_every_n=BATCH_SEND_MSG_COUNT,
                  batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL):
         self.partition_cycles = {}
         super(SimpleProducer, self).__init__(client, async, req_acks,
-                                             ack_timeout, batch_send,
+                                             ack_timeout, codec, batch_send,
                                              batch_send_every_n,
                                              batch_send_every_t)
@@ -227,6 +250,7 @@ class KeyedProducer(Producer):
     def __init__(self, client, partitioner=None, async=False,
                  req_acks=Producer.ACK_AFTER_LOCAL_WRITE,
                  ack_timeout=Producer.DEFAULT_ACK_TIMEOUT,
+                 codec=None,
                  batch_send=False,
                  batch_send_every_n=BATCH_SEND_MSG_COUNT,
                  batch_send_every_t=BATCH_SEND_DEFAULT_INTERVAL):
@@ -236,7 +260,7 @@ class KeyedProducer(Producer):
         self.partitioners = {}
         super(KeyedProducer, self).__init__(client, async, req_acks,
-                                            ack_timeout, batch_send,
+                                            ack_timeout, codec, batch_send,
                                             batch_send_every_n,
                                             batch_send_every_t)
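Since Producer.__init__ now asserts codec in ALL_CODECS, an unsupported value fails fast at construction time. A hedged example reusing the client from the first sketch (0x07 is an arbitrary invalid value):

    from kafka.producer import SimpleProducer

    try:
        producer = SimpleProducer(kafka, codec=0x07)  # not in ALL_CODECS
    except AssertionError:
        print("unsupported codec rejected before any messages are sent")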

View File

@@ -18,6 +18,12 @@ from kafka.util import (
 
 log = logging.getLogger("kafka")
 
+ATTRIBUTE_CODEC_MASK = 0x03
+CODEC_NONE = 0x00
+CODEC_GZIP = 0x01
+CODEC_SNAPPY = 0x02
+ALL_CODECS = (CODEC_NONE, CODEC_GZIP, CODEC_SNAPPY)
+
 
 class KafkaProtocol(object):
     """
@@ -32,11 +38,6 @@ class KafkaProtocol(object):
     OFFSET_COMMIT_KEY = 8
     OFFSET_FETCH_KEY = 9
 
-    ATTRIBUTE_CODEC_MASK = 0x03
-    CODEC_NONE = 0x00
-    CODEC_GZIP = 0x01
-    CODEC_SNAPPY = 0x02
-
     ###################
     #   Private API   #
     ###################
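Hoisting these constants from KafkaProtocol to module scope is what enables the new import in the producer module; anything needing a codec id can now pull it in directly, e.g. (one-liner for illustration):

    from kafka.protocol import ALL_CODECS, CODEC_SNAPPY  # no KafkaProtocol attribute lookup needed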
@@ -151,17 +152,17 @@ class KafkaProtocol(object):
             (key, cur) = read_int_string(data, cur)
             (value, cur) = read_int_string(data, cur)
 
-            codec = att & KafkaProtocol.ATTRIBUTE_CODEC_MASK
+            codec = att & ATTRIBUTE_CODEC_MASK
 
-            if codec == KafkaProtocol.CODEC_NONE:
+            if codec == CODEC_NONE:
                 yield (offset, Message(magic, att, key, value))
 
-            elif codec == KafkaProtocol.CODEC_GZIP:
+            elif codec == CODEC_GZIP:
                 gz = gzip_decode(value)
                 for (offset, msg) in KafkaProtocol._decode_message_set_iter(gz):
                     yield (offset, msg)
 
-            elif codec == KafkaProtocol.CODEC_SNAPPY:
+            elif codec == CODEC_SNAPPY:
                 snp = snappy_decode(value)
                 for (offset, msg) in KafkaProtocol._decode_message_set_iter(snp):
                     yield (offset, msg)
@@ -544,7 +545,7 @@ def create_gzip_message(payloads, key=None):
                       [create_message(payload) for payload in payloads])
 
     gzipped = gzip_encode(message_set)
-    codec = KafkaProtocol.ATTRIBUTE_CODEC_MASK & KafkaProtocol.CODEC_GZIP
+    codec = ATTRIBUTE_CODEC_MASK & CODEC_GZIP
 
     return Message(0, 0x00 | codec, key, gzipped)
@@ -565,6 +566,6 @@ def create_snappy_message(payloads, key=None):
                       [create_message(payload) for payload in payloads])
 
     snapped = snappy_encode(message_set)
-    codec = KafkaProtocol.ATTRIBUTE_CODEC_MASK & KafkaProtocol.CODEC_SNAPPY
+    codec = ATTRIBUTE_CODEC_MASK & CODEC_SNAPPY
 
    return Message(0, 0x00 | codec, key, snapped)
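The test updates below check exactly this property: the codec id lands in the low two bits of the wrapper message's attributes byte. A quick hedged round-trip using only names touched by this commit:

    from kafka.protocol import ATTRIBUTE_CODEC_MASK, CODEC_GZIP, create_gzip_message

    msg = create_gzip_message(["v1", "v2"])
    # The codec occupies the low two bits of the attributes byte.
    assert msg.attributes & ATTRIBUTE_CODEC_MASK == CODEC_GZIP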

View File

@@ -135,8 +135,8 @@ class TestProtocol(unittest.TestCase):
         payloads = ["v1", "v2"]
         msg = create_gzip_message(payloads)
         self.assertEqual(msg.magic, 0)
-        self.assertEqual(msg.attributes, KafkaProtocol.ATTRIBUTE_CODEC_MASK &
-                                         KafkaProtocol.CODEC_GZIP)
+        self.assertEqual(msg.attributes, ATTRIBUTE_CODEC_MASK &
+                                         CODEC_GZIP)
         self.assertEqual(msg.key, None)
         # Need to decode to check since gzipped payload is non-deterministic
         decoded = gzip_decode(msg.value)
@@ -151,8 +151,8 @@ class TestProtocol(unittest.TestCase):
         payloads = ["v1", "v2"]
         msg = create_snappy_message(payloads)
         self.assertEqual(msg.magic, 0)
-        self.assertEqual(msg.attributes, KafkaProtocol.ATTRIBUTE_CODEC_MASK &
-                                         KafkaProtocol.CODEC_SNAPPY)
+        self.assertEqual(msg.attributes, ATTRIBUTE_CODEC_MASK &
+                                         CODEC_SNAPPY)
         self.assertEqual(msg.key, None)
         expect = ("8\x00\x00\x19\x01@\x10L\x9f[\xc2\x00\x00\xff\xff\xff\xff"
                   "\x00\x00\x00\x02v1\x19\x1bD\x00\x10\xd5\x96\nx\x00\x00\xff"