Fixed #42, make fetch size configurable

Was hard-coded to 1024 bytes, which meant that larger messages were
unconsumable: they would always be truncated by the fetch, causing the
consumer to stop.

It would probably be best to automatically retry truncated messages with
a larger request size, so that you don't have to know your maximum
message size ahead of time.
This commit is contained in:
David Arthur
2013-09-08 20:20:12 -04:00
parent c3bce13b84
commit 40d8e9e550
2 changed files with 41 additions and 4 deletions

View File

@@ -206,6 +206,8 @@ class SimpleConsumer(Consumer):
auto_commit_every_t: default 5000. How much time (in milliseconds) to
wait before commit
fetch_size_bytes: number of bytes to request in a FetchRequest
Auto commit details:
If both auto_commit_every_n and auto_commit_every_t are set, they will
reset one another when one is triggered. These triggers simply call the
@@ -214,11 +216,12 @@ class SimpleConsumer(Consumer):
"""
def __init__(self, client, group, topic, auto_commit=True, partitions=None,
auto_commit_every_n=AUTO_COMMIT_MSG_COUNT,
auto_commit_every_t=AUTO_COMMIT_INTERVAL):
auto_commit_every_t=AUTO_COMMIT_INTERVAL,
fetch_size_bytes=FETCH_MIN_BYTES):
self.partition_info = False # Do not return partition info in msgs
self.fetch_max_wait_time = FETCH_MAX_WAIT_TIME
self.fetch_min_bytes = FETCH_MIN_BYTES
self.fetch_min_bytes = fetch_size_bytes
self.fetch_started = defaultdict(bool) # defaults to false
super(SimpleConsumer, self).__init__(client, group, topic,
@@ -243,6 +246,7 @@ class SimpleConsumer(Consumer):
1 is relative to the current offset
2 is relative to the latest known offset (tail)
"""
if whence == 1: # relative to current position
for partition, _offset in self.offsets.items():
self.offsets[partition] = _offset + offset
@@ -354,8 +358,7 @@ class SimpleConsumer(Consumer):
offset += 1
while True:
# TODO: configure fetch size
req = FetchRequest(self.topic, partition, offset, 1024)
req = FetchRequest(self.topic, partition, offset, self.fetch_min_bytes)
(resp,) = self.client.send_fetch_request([req],
max_wait_time=self.fetch_max_wait_time,

View File

@@ -2,6 +2,8 @@ import logging
import unittest
import time
from datetime import datetime
import string
import random
from kafka import * # noqa
from kafka.common import * # noqa
@@ -738,6 +740,38 @@ class TestConsumer(unittest.TestCase):
consumer.stop()
def test_large_messages(self):
# Produce 10 "normal" size messages
messages1 = [create_message(random_string(1024)) for i in range(10)]
produce1 = ProduceRequest("test_large_messages", 0, messages1)
for resp in self.client.send_produce_request([produce1]):
self.assertEquals(resp.error, 0)
self.assertEquals(resp.offset, 0)
# Produce 10 messages that are too large (bigger than default fetch size)
messages2=[create_message(random_string(5000)) for i in range(10)]
produce2 = ProduceRequest("test_large_messages", 0, messages2)
for resp in self.client.send_produce_request([produce2]):
self.assertEquals(resp.error, 0)
self.assertEquals(resp.offset, 10)
consumer = SimpleConsumer(self.client, "group1", "test_large_messages")
it = consumer.__iter__()
for i in range(10):
self.assertEquals(messages1[i], it.next().message)
consumer = SimpleConsumer(self.client, "group2", "test_large_messages", fetch_size_bytes=5120)
it = consumer.__iter__()
for i in range(10):
self.assertEquals(messages1[i], it.next().message)
for i in range(10):
self.assertEquals(messages2[i], it.next().message)
def random_string(l):
    """Return a random string of length ``l`` drawn from ``string.printable``.

    Uses ``range`` rather than the Python-2-only ``xrange`` so the helper
    works under both Python 2 and Python 3 (for small ``l`` the list that
    Python 2's ``range`` builds is negligible).
    """
    # "_" makes it explicit that the loop variable is unused.
    return "".join(random.choice(string.printable) for _ in range(l))
if __name__ == "__main__":
logging.basicConfig(level=logging.DEBUG)