diff --git a/doc/examples.rst b/doc/examples.rst
index 2f7b817..c16bf26 100644
--- a/doc/examples.rst
+++ b/doc/examples.rst
@@ -64,13 +64,23 @@ Port Forwarder
 
 .. literalinclude:: ../examples/forwarder.py
 
+.. _recursive_crawler_example:
+
+Recursive Web Crawler
+-----------------------------------------
+``examples/recursive_crawler.py``
+
+This is an example of a recursive web crawler that fetches linked pages from a seed URL.
+
+.. literalinclude:: ../examples/recursive_crawler.py
+
 .. _producer_consumer_example:
 
-Producer Consumer/Recursive Web Crawler
+Producer Consumer Web Crawler
 -----------------------------------------
 ``examples/producer_consumer.py``
 
-This is an example implementation of the producer/consumer pattern as well as a functional recursive web crawler.
+This is an example implementation of the producer/consumer pattern; it is functionally identical to the recursive web crawler.
 
 .. literalinclude:: ../examples/producer_consumer.py
 
diff --git a/examples/recursive_crawler.py b/examples/recursive_crawler.py
new file mode 100644
index 0000000..2e8701e
--- /dev/null
+++ b/examples/recursive_crawler.py
@@ -0,0 +1,49 @@
+"""This is a recursive web crawler. Don't go pointing this at random sites;
+it doesn't respect robots.txt and it is pretty brutal about how quickly it
+fetches pages.
+
+The code for this is very short; that is perhaps a good indication
+that it makes the most effective use of the primitives at hand.
+The fetch function does all the work of making HTTP requests,
+searching for new URLs, and dispatching new fetches. The GreenPool
+acts as sort of a job coordinator (and concurrency controller of
+course).
+"""
+from __future__ import with_statement
+
+from eventlet.green import urllib2
+import eventlet
+import re
+
+# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
+url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
+
+
+def fetch(url, seen, pool):
+    """Fetch a url, stick any found urls into the seen set, and
+    dispatch any new ones to the pool."""
+    print "fetching", url
+    data = ''
+    with eventlet.Timeout(5, False):
+        data = urllib2.urlopen(url).read()
+    for url_match in url_regex.finditer(data):
+        new_url = url_match.group(0)
+        # only send requests to eventlet.net so as not to destroy the internet
+        if new_url not in seen and 'eventlet.net' in new_url:
+            seen.add(new_url)
+            # while this seems stack-recursive, it's actually not:
+            # spawned greenthreads start their own stacks
+            pool.spawn_n(fetch, new_url, seen, pool)
+
+def crawl(start_url):
+    """Recursively crawl starting from *start_url*. Returns a set of
+    urls that were found."""
+    pool = eventlet.GreenPool()
+    seen = set()
+    fetch(start_url, seen, pool)
+    pool.waitall()
+    return seen
+
+seen = crawl("http://eventlet.net")
+print "I saw these urls:"
+print "\n".join(seen)
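
A note for readers of the new example, not part of the patch itself: eventlet.Timeout(5, False) is a silent timeout, so a request that hangs simply leaves ``data`` as the empty string rather than raising into the greenthread and killing it. Below is a minimal sketch of that pattern, assuming only that eventlet is installed; the sleep call and the one/two second durations are placeholders standing in for a slow fetch::

    from __future__ import with_statement
    import eventlet

    # Passing False as the exception argument makes the timeout silent: when
    # the timer fires, the with block is exited instead of an exception
    # propagating out of it.
    result = 'default'
    with eventlet.Timeout(1, False):
        eventlet.sleep(2)     # stands in for a slow urllib2.urlopen(url).read()
        result = 'finished'   # never reached; the timeout fires after 1 second
    print result              # prints 'default'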
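
The comment "while this seems stack-recursive, it's actually not" can also be demonstrated in isolation. The sketch below is not part of the patch (the countdown function and the 5000 figure are invented for illustration), but it uses the same spawn_n/waitall pattern as the crawler: each spawn_n call merely schedules a new greenthread and returns immediately, so the chain runs far past Python's default recursion limit without deepening any call stack::

    import eventlet

    pool = eventlet.GreenPool()

    def countdown(n, pool):
        # "recurse" by handing the next step to the pool; this call returns
        # right away, so countdown's own frame is popped before the child runs
        if n > 0:
            pool.spawn_n(countdown, n - 1, pool)

    pool.spawn_n(countdown, 5000, pool)
    pool.waitall()  # returns once the whole chain of greenthreads has finished
    print "counted down 5000 levels without hitting the recursion limit"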