Recursive crawler example added.
@@ -60,4 +60,12 @@ Port Forwarder
 -----------------------
 ``examples/forwarder.py``
 
 .. literalinclude:: ../examples/forwarder.py
+
+.. _producer_consumer_example:
+
+Producer Consumer/Recursive Web Crawler
+-----------------------------------------
+``examples/producer_consumer.py``
+
+This is an example implementation of the producer/consumer pattern as well as a functional recursive web crawler.
+
+.. literalinclude:: ../examples/producer_consumer.py
@@ -35,7 +35,7 @@ class GreenPool(object):
 
     def running(self):
         """ Returns the number of greenthreads that are currently executing
-        functions in the Parallel's pool."""
+        functions in the GreenPool."""
        return len(self.coroutines_running)
 
     def free(self):
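Not part of the commit: a minimal sketch of what the corrected docstring describes, assuming a working eventlet install (Python 3 print syntax). spawn() and running() appear in this hunk; GreenPool(size=...) and waitall() are taken from eventlet's public GreenPool API.

    import eventlet

    pool = eventlet.GreenPool(size=4)      # at most 4 greenthreads execute at once
    for _ in range(10):
        pool.spawn(eventlet.sleep, 0.1)    # spawn() blocks whenever the pool is full
    print(pool.running())                  # <= 4: greenthreads currently executing in the pool
    pool.waitall()                         # wait for everything spawned so far to finish
    print(pool.running())                  # 0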
examples/producer_consumer.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+"""This is a recursive web crawler. Don't go pointing this at random sites;
+it doesn't respect robots.txt and it is pretty brutal about how quickly it
+fetches pages.
+
+This is a kind of "producer/consumer" example; the producer function produces
+jobs, and the GreenPool itself is the consumer, farming out work concurrently.
+It's easier to write it this way rather than writing a standard consumer loop;
+GreenPool handles any exceptions raised and arranges so that there's a set
+number of "workers", so you don't have to write that tedious management code
+yourself.
+"""
+
+from eventlet.green import urllib2
+import eventlet
+import re
+
+# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
+url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+
+
+def fetch(url, outq):
+    """Fetch a url and push any urls found into a queue."""
+    print "fetching", url
+    data = ''
+    with eventlet.Timeout(5, False):
+        data = urllib2.urlopen(url).read()
+    for url_match in url_regex.finditer(data):
+        new_url = url_match.group(0)
+        outq.put(new_url)
+
+
+def producer(start_url):
+    """Recursively crawl starting from *start_url*. Returns a set of
+    urls that were found."""
+    pool = eventlet.GreenPool()
+    seen = set()
+    q = eventlet.Queue()
+    q.put(start_url)
+    # keep looping if there are new urls, or workers that may produce more urls
+    while not q.empty() or pool.running() != 0:
+        url = eventlet.with_timeout(0.1, q.get, timeout_value='')
+        # limit requests to eventlet.net so we don't crash all over the internet
+        if url not in seen and 'eventlet.net' in url:
+            seen.add(url)
+            pool.spawn(fetch, url, q)
+    return seen
+
+
+seen = producer("http://eventlet.net")
+print "I saw these urls:"
+print "\n".join(seen)
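Not part of the commit: stripped of the web-specific pieces, the example's skeleton is a producer loop feeding a GreenPool that acts as the consumer, with the same termination rule (queue empty and nothing still running). A minimal sketch under those assumptions, using only the eventlet calls seen above, written with Python 3 print syntax:

    import eventlet

    def work(item, outq):
        """Consumer body: handle one job and possibly enqueue follow-up jobs."""
        eventlet.sleep(0.01)           # stand-in for real work (e.g. fetching a page)
        if item < 3:
            outq.put(item + 1)         # produce a new job, like fetch() pushing new urls

    def produce(first_item):
        pool = eventlet.GreenPool()
        q = eventlet.Queue()
        q.put(first_item)
        done = []
        # same exit condition as the crawler: stop once the queue is empty
        # and no greenthread is still running (a running one could add more jobs)
        while not q.empty() or pool.running() != 0:
            item = eventlet.with_timeout(0.1, q.get, timeout_value=None)
            if item is not None:
                done.append(item)
                pool.spawn(work, item, q)
        return done

    print(produce(0))                  # [0, 1, 2, 3]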