Auto-adjusting consumer fetch size

Related to #42

Adds a new ConsumerFetchSizeTooSmall exception, raised when
`_decode_message_set_iter` hits a BufferUnderflowError before it has
yielded any message.

In that event, SimpleConsumer multiplies the fetch size by 1.5 and
continues the fetch loop while _not_ advancing the offset, i.e. it simply
retries the same request with a larger fetch size.

Once increased, the fetch size stays at the larger value for as long as
SimpleConsumer keeps fetching from that partition.
David Arthur
2013-09-09 00:44:36 -04:00
parent 40d8e9e550
commit f67ad27f72
4 changed files with 42 additions and 32 deletions
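
In isolation, the retry loop looks like this. A minimal Python 3 sketch, assuming a hypothetical fetch(offset, size) callable that stands in for the FetchRequest round trip and raises ConsumerFetchSizeTooSmall when not even one whole message fits (the real logic is in SimpleConsumer.__iter_partition__ below):

# Sketch only -- fetch(offset, size) is a hypothetical stand-in for the
# FetchRequest round trip; it returns a list of (offset, message) pairs,
# or raises ConsumerFetchSizeTooSmall if no complete message fits.
class ConsumerFetchSizeTooSmall(Exception):
    pass

def iter_partition(fetch, offset, min_bytes=4096):
    fetch_size = min_bytes
    while True:
        try:
            batch = fetch(offset, fetch_size)
        except ConsumerFetchSizeTooSmall:
            # Retry the same offset with a 1.5x larger buffer; the size
            # stays grown for the rest of this partition's iteration.
            fetch_size *= 1.5
            continue
        if not batch:
            return
        for msg_offset, message in batch:
            offset = msg_offset
            yield message
        offset += 1  # resume after the last message we yielded

Because the size grows geometrically, a message of S bytes is reachable after roughly log1.5(S / min_bytes) retries, rather than the linear crawl a fixed increment would give.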

kafka/consumer.py

@@ -12,7 +12,7 @@ from kafka.common import (
 )
 from kafka.util import (
-    ReentrantTimer
+    ReentrantTimer, ConsumerFetchSizeTooSmall
 )

 log = logging.getLogger("kafka")
@@ -357,17 +357,20 @@ class SimpleConsumer(Consumer):
         if self.fetch_started[partition]:
             offset += 1

+        fetch_size = self.fetch_min_bytes
         while True:
-            req = FetchRequest(self.topic, partition, offset, self.fetch_min_bytes)
+            req = FetchRequest(self.topic, partition, offset, fetch_size)

             (resp,) = self.client.send_fetch_request([req],
                 max_wait_time=self.fetch_max_wait_time,
-                min_bytes=self.fetch_min_bytes)
+                min_bytes=fetch_size)

             assert resp.topic == self.topic
             assert resp.partition == partition

             next_offset = None
-            for message in resp.messages:
-                next_offset = message.offset
+            try:
+                for message in resp.messages:
+                    next_offset = message.offset
@@ -380,6 +383,13 @@ class SimpleConsumer(Consumer):
-                self.fetch_started[partition] = True
-                self.offsets[partition] = message.offset
-                yield message
+                    self.fetch_started[partition] = True
+                    self.offsets[partition] = message.offset
+                    yield message
+            except ConsumerFetchSizeTooSmall, e:
+                log.warn("Fetch size is too small, increasing by 1.5x and retrying")
+                fetch_size *= 1.5
+                continue
+            except ConsumerNoMoreData, e:
+                log.debug("Iteration was ended by %r", e)

             if next_offset is None:
                 break
             else:

kafka/protocol.py

@@ -13,7 +13,7 @@ from kafka.common import (
 from kafka.util import (
     read_short_string, read_int_string, relative_unpack,
     write_short_string, write_int_string, group_by_topic_and_partition,
-    BufferUnderflowError, ChecksumError
+    BufferUnderflowError, ChecksumError, ConsumerFetchSizeTooSmall
 )

 log = logging.getLogger("kafka")
@@ -110,16 +110,20 @@ class KafkaProtocol(object):
         recurse easily.
         """
         cur = 0
+        read_message = False
         while cur < len(data):
             try:
                 ((offset, ), cur) = relative_unpack('>q', data, cur)
                 (msg, cur) = read_int_string(data, cur)
-                for (offset, message) in KafkaProtocol._decode_message(msg,
-                                                                       offset):
+                for (offset, message) in KafkaProtocol._decode_message(msg, offset):
+                    read_message = True
                     yield OffsetAndMessage(offset, message)
             except BufferUnderflowError:
-                # If we get a partial read of a message, stop
-                raise StopIteration()
+                if read_message is False:
+                    # If we get a partial read of a message, but haven't
+                    # yielded anything, there's a problem
+                    raise ConsumerFetchSizeTooSmall()
+                else:
+                    raise StopIteration()

     @classmethod

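The read_message flag above is the crux: a truncated message at the end of a non-empty batch is normal (the next fetch re-reads it from its offset), but a partial read before anything was yielded means the buffer cannot hold even one message. A standalone sketch of that decision, with a hypothetical decode_one helper; note that the commit's raise StopIteration() is the Python 2 idiom, and a modern generator would use a bare return instead (PEP 479):

# Sketch only -- decode_one(data, pos) is a hypothetical helper returning
# (message, next_pos), raising BufferUnderflowError on a truncated message.
class BufferUnderflowError(Exception):
    pass

class ConsumerFetchSizeTooSmall(Exception):
    pass

def decode_message_set(data, decode_one):
    cur = 0
    read_message = False
    while cur < len(data):
        try:
            message, cur = decode_one(data, cur)
        except BufferUnderflowError:
            if not read_message:
                # Not even one whole message fit in the buffer: the
                # caller must retry this offset with a larger fetch size.
                raise ConsumerFetchSizeTooSmall()
            # Truncated trailing message: normal end of this batch.
            return
        read_message = True
        yield message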
kafka/util.py

@@ -73,6 +73,8 @@ class BufferUnderflowError(Exception):
 class ChecksumError(Exception):
     pass

+class ConsumerFetchSizeTooSmall(Exception):
+    pass

 class ReentrantTimer(object):
     """

test/test_integration.py

@@ -8,7 +8,6 @@ import random
 from kafka import *  # noqa
 from kafka.common import *  # noqa
 from kafka.codec import has_gzip, has_snappy
 from .fixtures import ZookeeperFixture, KafkaFixture
@@ -757,20 +756,15 @@ class TestConsumer(unittest.TestCase):
         self.assertEquals(resp.error, 0)
         self.assertEquals(resp.offset, 10)

+        # Consumer should still get all of them
         consumer = SimpleConsumer(self.client, "group1", "test_large_messages")
-        it = consumer.__iter__()
-        for i in range(10):
-            self.assertEquals(messages1[i], it.next().message)
-
-        consumer = SimpleConsumer(self.client, "group2", "test_large_messages", fetch_size_bytes=5120)
-        it = consumer.__iter__()
-        for i in range(10):
-            self.assertEquals(messages1[i], it.next().message)
-        for i in range(10):
-            self.assertEquals(messages2[i], it.next().message)
+        all_messages = messages1 + messages2
+        for i, message in enumerate(consumer):
+            self.assertEquals(all_messages[i], message.message)
+        self.assertEquals(i, 19)

 def random_string(l):
-    s = "".join(random.choice(string.printable) for i in xrange(l))
+    s = "".join(random.choice(string.letters) for i in xrange(l))
     return s

 if __name__ == "__main__":