Added cleaner recursive web crawler example.

This commit is contained in:
Ryan Williams
2010-06-21 21:44:44 -07:00
parent 0512896657
commit 7befa471d5
2 changed files with 61 additions and 2 deletions


@@ -64,13 +64,23 @@ Port Forwarder
.. literalinclude:: ../examples/forwarder.py
.. _recursive_crawler_example:

Recursive Web Crawler
-----------------------------------------

``examples/recursive_crawler.py``

This is an example recursive web crawler that fetches linked pages from a seed url.

.. literalinclude:: ../examples/recursive_crawler.py
.. _producer_consumer_example:

Producer Consumer Web Crawler
-----------------------------------------

``examples/producer_consumer.py``

This is an example implementation of the producer/consumer pattern; it is identical in functionality to the recursive web crawler.

.. literalinclude:: ../examples/producer_consumer.py
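The ``examples/producer_consumer.py`` file referenced above is not included in this diff. As a rough sketch only (the names and structure below are assumptions, not the committed file), the same crawl can be organized around a shared queue: fetch acts as the producer, pushing every url it finds onto the queue, while a consumer loop drains the queue and dispatches new fetches through the pool.

from __future__ import with_statement
from eventlet.green import urllib2
import eventlet
import re

url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+)')  # simplified matcher, for illustration only

def fetch(url, outq):
    """Producer: fetch *url* and put every url found in its body onto *outq*."""
    data = ''
    with eventlet.Timeout(5, False):
        data = urllib2.urlopen(url).read()
    for match in url_regex.finditer(data):
        outq.put(match.group(0))

def crawl(start_url):
    """Consumer loop: pull urls off the queue and hand new ones to the pool."""
    pool = eventlet.GreenPool()
    seen = set()
    q = eventlet.Queue()
    q.put(start_url)
    while True:
        while not q.empty():
            url = q.get()
            # stay on eventlet.net, like the recursive example
            if url not in seen and 'eventlet.net' in url:
                seen.add(url)
                pool.spawn_n(fetch, url, q)
        pool.waitall()   # let in-flight fetches finish producing urls
        if q.empty():    # nothing new arrived, so the crawl is done
            break
    return seen

print "\n".join(crawl("http://eventlet.net"))

The point of the queue-based shape is that fetch no longer needs to know about the pool or the seen set; all of the coordination lives in the consumer loop.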

examples/recursive_crawler.py (new file)

@@ -0,0 +1,49 @@
"""This is a recursive web crawler. Don't go pointing this at random sites;
it doesn't respect robots.txt and it is pretty brutal about how quickly it
fetches pages.
The code for this is very short; this is perhaps a good indication
that this is making the most effective use of the primitves at hand.
The fetch function does all the work of making http requests,
searching for new urls, and dispatching new fetches. The GreenPool
acts as sort of a job coordinator (and concurrency controller of
course).
"""
from __future__ import with_statement

from eventlet.green import urllib2
import eventlet
import re

# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')

def fetch(url, seen, pool):
    """Fetch a url, stick any found urls into the seen set, and
    dispatch any new ones to the pool."""
    print "fetching", url
    data = ''
    # Timeout(5, False) abandons the block after 5 seconds without raising
    # an exception, so a slow url simply yields no data
    with eventlet.Timeout(5, False):
        data = urllib2.urlopen(url).read()
    for url_match in url_regex.finditer(data):
        new_url = url_match.group(0)
        # only send requests to eventlet.net so as not to destroy the internet
        if new_url not in seen and 'eventlet.net' in new_url:
            seen.add(new_url)
            # while this seems stack-recursive, it's actually not:
            # spawned greenthreads start their own stacks
            pool.spawn_n(fetch, new_url, seen, pool)

def crawl(start_url):
    """Recursively crawl starting from *start_url*. Returns a set of
    urls that were found."""
    pool = eventlet.GreenPool()
    seen = set()
    fetch(start_url, seen, pool)
    pool.waitall()
    return seen

seen = crawl("http://eventlet.net")
print "I saw these urls:"
print "\n".join(seen)