Auto-adjusting consumer fetch size

Related to #42

Adds a new ConsumerFetchSizeTooSmall exception, raised when
`_decode_message_set_iter` hits a BufferUnderflowError before it has
yielded any message.

In that event, SimpleConsumer multiplies the fetch size by 1.5 and
continues the fetch loop while _not_ advancing the offset, i.e. it simply
retries the same request with a larger fetch size.

Once increased, the fetch size stays at the larger value for as long as
SimpleConsumer keeps fetching from that partition.
David Arthur
2013-09-09 00:44:36 -04:00
parent 40d8e9e550
commit f67ad27f72
4 changed files with 42 additions and 32 deletions
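
In isolation, the retry loop looks like this. A minimal Python 3 sketch, assuming a hypothetical fetch(offset, size) callable that stands in for the FetchRequest round trip and raises ConsumerFetchSizeTooSmall when not even one whole message fits (the real logic is in SimpleConsumer.__iter_partition__ below):

# Sketch only -- fetch(offset, size) is a hypothetical stand-in for the
# FetchRequest round trip; it returns a list of (offset, message) pairs,
# or raises ConsumerFetchSizeTooSmall if no complete message fits.
class ConsumerFetchSizeTooSmall(Exception):
    pass

def iter_partition(fetch, offset, min_bytes=4096):
    fetch_size = min_bytes
    while True:
        try:
            batch = fetch(offset, fetch_size)
        except ConsumerFetchSizeTooSmall:
            # Retry the same offset with a 1.5x larger buffer; the size
            # stays grown for the rest of this partition's iteration.
            fetch_size *= 1.5
            continue
        if not batch:
            return
        for msg_offset, message in batch:
            offset = msg_offset
            yield message
        offset += 1  # resume after the last message we yielded

Because the size grows geometrically, a message of S bytes is reachable after roughly log1.5(S / min_bytes) retries, rather than the linear crawl a fixed increment would give.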

kafka/consumer.py

@@ -12,7 +12,7 @@ from kafka.common import (
 )
 from kafka.util import (
-    ReentrantTimer
+    ReentrantTimer, ConsumerFetchSizeTooSmall
 )

 log = logging.getLogger("kafka")
@@ -357,17 +357,20 @@ class SimpleConsumer(Consumer):
         if self.fetch_started[partition]:
             offset += 1

+        fetch_size = self.fetch_min_bytes
         while True:
-            req = FetchRequest(self.topic, partition, offset, self.fetch_min_bytes)
+            req = FetchRequest(self.topic, partition, offset, fetch_size)

             (resp,) = self.client.send_fetch_request([req],
                 max_wait_time=self.fetch_max_wait_time,
-                min_bytes=self.fetch_min_bytes)
+                min_bytes=fetch_size)

             assert resp.topic == self.topic
             assert resp.partition == partition

             next_offset = None
-            for message in resp.messages:
-                next_offset = message.offset
+            try:
+                for message in resp.messages:
+                    next_offset = message.offset
@@ -380,6 +383,13 @@ class SimpleConsumer(Consumer):
-                self.fetch_started[partition] = True
-                self.offsets[partition] = message.offset
-                yield message
+                    self.fetch_started[partition] = True
+                    self.offsets[partition] = message.offset
+                    yield message
+            except ConsumerFetchSizeTooSmall, e:
+                log.warn("Fetch size is too small, increasing by 1.5x and retrying")
+                fetch_size *= 1.5
+                continue
+            except ConsumerNoMoreData, e:
+                log.debug("Iteration was ended by %r", e)

             if next_offset is None:
                 break
             else:

kafka/protocol.py

@@ -13,7 +13,7 @@ from kafka.common import (
 from kafka.util import (
     read_short_string, read_int_string, relative_unpack,
     write_short_string, write_int_string, group_by_topic_and_partition,
-    BufferUnderflowError, ChecksumError
+    BufferUnderflowError, ChecksumError, ConsumerFetchSizeTooSmall
 )

 log = logging.getLogger("kafka")
@@ -110,16 +110,20 @@ class KafkaProtocol(object):
         recurse easily.
         """
         cur = 0
+        read_message = False
         while cur < len(data):
             try:
                 ((offset, ), cur) = relative_unpack('>q', data, cur)
                 (msg, cur) = read_int_string(data, cur)
-                for (offset, message) in KafkaProtocol._decode_message(msg,
-                                                                       offset):
+                for (offset, message) in KafkaProtocol._decode_message(msg, offset):
+                    read_message = True
                     yield OffsetAndMessage(offset, message)
             except BufferUnderflowError:
-                # If we get a partial read of a message, stop
-                raise StopIteration()
+                if read_message is False:
+                    # If we get a partial read of a message, but haven't
+                    # yielded anything, there's a problem
+                    raise ConsumerFetchSizeTooSmall()
+                else:
+                    raise StopIteration()

     @classmethod

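The read_message flag above is the crux: a truncated message at the end of a non-empty batch is normal (the next fetch re-reads it from its offset), but a partial read before anything was yielded means the buffer cannot hold even one message. A standalone sketch of that decision, with a hypothetical decode_one helper; note that the commit's raise StopIteration() is the Python 2 idiom, and a modern generator would use a bare return instead (PEP 479):

# Sketch only -- decode_one(data, pos) is a hypothetical helper returning
# (message, next_pos), raising BufferUnderflowError on a truncated message.
class BufferUnderflowError(Exception):
    pass

class ConsumerFetchSizeTooSmall(Exception):
    pass

def decode_message_set(data, decode_one):
    cur = 0
    read_message = False
    while cur < len(data):
        try:
            message, cur = decode_one(data, cur)
        except BufferUnderflowError:
            if not read_message:
                # Not even one whole message fit in the buffer: the
                # caller must retry this offset with a larger fetch size.
                raise ConsumerFetchSizeTooSmall()
            # Truncated trailing message: normal end of this batch.
            return
        read_message = True
        yield message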
kafka/util.py

@@ -73,6 +73,8 @@ class BufferUnderflowError(Exception):
 class ChecksumError(Exception):
     pass

+class ConsumerFetchSizeTooSmall(Exception):
+    pass

 class ReentrantTimer(object):
     """

test/test_integration.py

@@ -8,7 +8,6 @@ import random
 from kafka import *  # noqa
 from kafka.common import *  # noqa
 from kafka.codec import has_gzip, has_snappy
 from .fixtures import ZookeeperFixture, KafkaFixture
@@ -757,20 +756,15 @@ class TestConsumer(unittest.TestCase):
         self.assertEquals(resp.error, 0)
         self.assertEquals(resp.offset, 10)

+        # Consumer should still get all of them
         consumer = SimpleConsumer(self.client, "group1", "test_large_messages")
-        it = consumer.__iter__()
-        for i in range(10):
-            self.assertEquals(messages1[i], it.next().message)
-
-        consumer = SimpleConsumer(self.client, "group2", "test_large_messages", fetch_size_bytes=5120)
-        it = consumer.__iter__()
-        for i in range(10):
-            self.assertEquals(messages1[i], it.next().message)
-        for i in range(10):
-            self.assertEquals(messages2[i], it.next().message)
+        all_messages = messages1 + messages2
+        for i, message in enumerate(consumer):
+            self.assertEquals(all_messages[i], message.message)
+        self.assertEquals(i, 19)

 def random_string(l):
-    s = "".join(random.choice(string.printable) for i in xrange(l))
+    s = "".join(random.choice(string.letters) for i in xrange(l))
     return s

 if __name__ == "__main__":