Auto-adjusting consumer fetch size

Related to #42

Adds a new ConsumerFetchSizeTooSmall exception that is raised when
`_decode_message_set_iter` gets a BufferUnderflowError but has not yet
yielded a message.

In this event, SimpleConsumer will increase the fetch size by a factor of
1.5 and continue the fetching loop while _not_ increasing the offset
(basically, it just retries the request with a larger fetch size).

Once the consumer fetch size has been increased, it will remain
increased while SimpleConsumer fetches from that partition.
This commit is contained in:
David Arthur
2013-09-09 00:44:36 -04:00
parent 40d8e9e550
commit f67ad27f72
4 changed files with 42 additions and 32 deletions

View File

@@ -12,7 +12,7 @@ from kafka.common import (
)
from kafka.util import (
ReentrantTimer
ReentrantTimer, ConsumerFetchSizeTooSmall
)
log = logging.getLogger("kafka")
@@ -357,17 +357,20 @@ class SimpleConsumer(Consumer):
if self.fetch_started[partition]:
offset += 1
fetch_size = self.fetch_min_bytes
while True:
req = FetchRequest(self.topic, partition, offset, self.fetch_min_bytes)
req = FetchRequest(self.topic, partition, offset, fetch_size)
(resp,) = self.client.send_fetch_request([req],
max_wait_time=self.fetch_max_wait_time,
min_bytes=self.fetch_min_bytes)
min_bytes=fetch_size)
assert resp.topic == self.topic
assert resp.partition == partition
next_offset = None
try:
for message in resp.messages:
next_offset = message.offset
@@ -380,6 +383,13 @@ class SimpleConsumer(Consumer):
self.fetch_started[partition] = True
self.offsets[partition] = message.offset
yield message
except ConsumerFetchSizeTooSmall, e:
log.warn("Fetch size is too small, increasing by 1.5x and retrying")
fetch_size *= 1.5
continue
except ConsumerNoMoreData, e:
log.debug("Iteration was ended by %r", e)
if next_offset is None:
break
else:

View File

@@ -13,7 +13,7 @@ from kafka.common import (
from kafka.util import (
read_short_string, read_int_string, relative_unpack,
write_short_string, write_int_string, group_by_topic_and_partition,
BufferUnderflowError, ChecksumError
BufferUnderflowError, ChecksumError, ConsumerFetchSizeTooSmall
)
log = logging.getLogger("kafka")
@@ -110,16 +110,20 @@ class KafkaProtocol(object):
recurse easily.
"""
cur = 0
read_message = False
while cur < len(data):
try:
((offset, ), cur) = relative_unpack('>q', data, cur)
(msg, cur) = read_int_string(data, cur)
for (offset, message) in KafkaProtocol._decode_message(msg,
offset):
for (offset, message) in KafkaProtocol._decode_message(msg, offset):
read_message = True
yield OffsetAndMessage(offset, message)
except BufferUnderflowError:
# If we get a partial read of a message, stop
if read_message is False:
# If we get a partial read of a message, but haven't yielded anything
# there's a problem
raise ConsumerFetchSizeTooSmall()
else:
raise StopIteration()
@classmethod

View File

@@ -73,6 +73,8 @@ class BufferUnderflowError(Exception):
class ChecksumError(Exception):
pass
class ConsumerFetchSizeTooSmall(Exception):
pass
class ReentrantTimer(object):
"""

View File

@@ -8,7 +8,6 @@ import random
from kafka import * # noqa
from kafka.common import * # noqa
from kafka.codec import has_gzip, has_snappy
from .fixtures import ZookeeperFixture, KafkaFixture
@@ -757,20 +756,15 @@ class TestConsumer(unittest.TestCase):
self.assertEquals(resp.error, 0)
self.assertEquals(resp.offset, 10)
# Consumer should still get all of them
consumer = SimpleConsumer(self.client, "group1", "test_large_messages")
it = consumer.__iter__()
for i in range(10):
self.assertEquals(messages1[i], it.next().message)
consumer = SimpleConsumer(self.client, "group2", "test_large_messages", fetch_size_bytes=5120)
it = consumer.__iter__()
for i in range(10):
self.assertEquals(messages1[i], it.next().message)
for i in range(10):
self.assertEquals(messages2[i], it.next().message)
all_messages = messages1 + messages2
for i, message in enumerate(consumer):
self.assertEquals(all_messages[i], message.message)
self.assertEquals(i, 19)
def random_string(l):
s = "".join(random.choice(string.printable) for i in xrange(l))
s = "".join(random.choice(string.letters) for i in xrange(l))
return s
if __name__ == "__main__":