diff --git a/doc/examples.rst b/doc/examples.rst
index b1b8f94..6b83441 100644
--- a/doc/examples.rst
+++ b/doc/examples.rst
@@ -60,4 +60,12 @@ Port Forwarder
 -----------------------
 ``examples/forwarder.py``
 
-.. literalinclude:: ../examples/forwarder.py
\ No newline at end of file
+.. _producer_consumer_example:
+
+Producer Consumer/Recursive Web Crawler
+-----------------------------------------
+``examples/producer_consumer.py``
+
+This is an example implementation of the producer/consumer pattern as well as a functional recursive web crawler.
+
+.. literalinclude:: ../examples/producer_consumer.py
diff --git a/eventlet/greenpool.py b/eventlet/greenpool.py
index 7af857f..1e9e22b 100644
--- a/eventlet/greenpool.py
+++ b/eventlet/greenpool.py
@@ -34,8 +34,8 @@ class GreenPool(object):
         self.size = new_size
 
     def running(self):
-        """Returns the number of greenthreads that are currently executing
-        functions in the Parallel's pool."""
+        """ Returns the number of greenthreads that are currently executing
+        functions in the GreenPool."""
        return len(self.coroutines_running)
 
     def free(self):
diff --git a/examples/producer_consumer.py b/examples/producer_consumer.py
new file mode 100644
index 0000000..84e2819
--- /dev/null
+++ b/examples/producer_consumer.py
@@ -0,0 +1,51 @@
+"""This is a recursive web crawler. Don't go pointing this at random sites;
+it doesn't respect robots.txt and it is pretty brutal about how quickly it
+fetches pages.
+
+This is a kind of "producer/consumer" example; the producer function produces
+jobs, and the GreenPool itself is the consumer, farming out work concurrently.
+It's easier to write it this way rather than writing a standard consumer loop;
+GreenPool handles any exceptions raised and arranges so that there's a set
+number of "workers", so you don't have to write that tedious management code
+yourself.
+"""
+
+from eventlet.green import urllib2
+import eventlet
+import re
+
+# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
+url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+
+
+def fetch(url, outq):
+    """Fetch a url and push any urls found into a queue."""
+    print "fetching", url
+    data = ''
+    with eventlet.Timeout(5, False):
+        data = urllib2.urlopen(url).read()
+    for url_match in url_regex.finditer(data):
+        new_url = url_match.group(0)
+        outq.put(new_url)
+
+
+def producer(start_url):
+    """Recursively crawl starting from *start_url*. Returns a set of
+    urls that were found."""
+    pool = eventlet.GreenPool()
+    seen = set()
+    q = eventlet.Queue()
+    q.put(start_url)
+    # keep looping if there are new urls, or workers that may produce more urls
+    while not q.empty() or pool.running() != 0:
+        url = eventlet.with_timeout(0.1, q.get, timeout_value='')
+        # limit requests to eventlet.net so we don't crash all over the internet
+        if url not in seen and 'eventlet.net' in url:
+            seen.add(url)
+            pool.spawn(fetch, url, q)
+    return seen
+
+
+seen = producer("http://eventlet.net")
+print "I saw these urls:"
+print "\n".join(seen)
\ No newline at end of file
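
Note: the module docstring in the new example describes the GreenPool-as-consumer pattern in prose; the sketch below is a minimal, crawler-free illustration of the same idea. It is not part of the patch; the names `work` and `run_jobs`, the integer job items, and the 0.1-second queue poll are illustrative assumptions only. It uses the same eventlet calls as the example itself (GreenPool, Queue, with_timeout, spawn, running).

    import eventlet

    def work(item, outq):
        # stand-in for real I/O; a worker may enqueue follow-up jobs
        eventlet.sleep(0.1)
        if item < 3:
            outq.put(item + 1)

    def run_jobs(first_item):
        pool = eventlet.GreenPool()
        q = eventlet.Queue()
        q.put(first_item)
        done = []
        # same termination condition as producer() above: keep looping while
        # the queue has items or any worker is still running (it may add more)
        while not q.empty() or pool.running() != 0:
            item = eventlet.with_timeout(0.1, q.get, timeout_value=None)
            if item is not None:
                done.append(item)
                pool.spawn(work, item, q)
        return done

    print(run_jobs(0))  # prints [0, 1, 2, 3]

The 0.1-second timeout on q.get is what keeps the loop from blocking forever when the queue is momentarily empty but a still-running worker might enqueue more jobs, which is why the loop condition also checks pool.running().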