Added cleaner recursive web crawler example.

This commit is contained in:
Ryan Williams
2010-06-21 21:44:44 -07:00
parent 0512896657
commit 7befa471d5
2 changed files with 61 additions and 2 deletions


@@ -64,13 +64,23 @@ Port Forwarder
.. literalinclude:: ../examples/forwarder.py
.. _recursive_crawler_example:

Recursive Web Crawler
-----------------------------------------

``examples/recursive_crawler.py``

This is an example recursive web crawler that fetches linked pages from a seed url.

.. literalinclude:: ../examples/recursive_crawler.py
.. _producer_consumer_example:

Producer Consumer Web Crawler
-----------------------------------------

``examples/producer_consumer.py``

This is an example implementation of the producer/consumer pattern; it is identical in functionality to the recursive web crawler.

.. literalinclude:: ../examples/producer_consumer.py
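The ``examples/producer_consumer.py`` file referenced above is not included in this diff. As a rough sketch only (the names and structure below are assumptions, not the committed file), the same crawl can be organized around a shared queue: fetch acts as the producer, pushing every url it finds onto the queue, while a consumer loop drains the queue and dispatches new fetches through the pool.

from __future__ import with_statement
from eventlet.green import urllib2
import eventlet
import re

url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+)')  # simplified matcher, for illustration only

def fetch(url, outq):
    """Producer: fetch *url* and put every url found in its body onto *outq*."""
    data = ''
    with eventlet.Timeout(5, False):
        data = urllib2.urlopen(url).read()
    for match in url_regex.finditer(data):
        outq.put(match.group(0))

def crawl(start_url):
    """Consumer loop: pull urls off the queue and hand new ones to the pool."""
    pool = eventlet.GreenPool()
    seen = set()
    q = eventlet.Queue()
    q.put(start_url)
    while True:
        while not q.empty():
            url = q.get()
            # stay on eventlet.net, like the recursive example
            if url not in seen and 'eventlet.net' in url:
                seen.add(url)
                pool.spawn_n(fetch, url, q)
        pool.waitall()   # let in-flight fetches finish producing urls
        if q.empty():    # nothing new arrived, so the crawl is done
            break
    return seen

print "\n".join(crawl("http://eventlet.net"))

The point of the queue-based shape is that fetch no longer needs to know about the pool or the seen set; all of the coordination lives in the consumer loop.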

examples/recursive_crawler.py (new file)

@@ -0,0 +1,49 @@
"""This is a recursive web crawler. Don't go pointing this at random sites;
it doesn't respect robots.txt and it is pretty brutal about how quickly it
fetches pages.
The code for this is very short; this is perhaps a good indication
that this is making the most effective use of the primitves at hand.
The fetch function does all the work of making http requests,
searching for new urls, and dispatching new fetches. The GreenPool
acts as sort of a job coordinator (and concurrency controller of
course).
"""
from __future__ import with_statement

from eventlet.green import urllib2
import eventlet
import re

# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')

def fetch(url, seen, pool):
    """Fetch a url, stick any found urls into the seen set, and
    dispatch any new ones to the pool."""
    print "fetching", url
    data = ''
    # Timeout(5, False) abandons the block after 5 seconds without raising
    # an exception, so a slow url simply yields no data
    with eventlet.Timeout(5, False):
        data = urllib2.urlopen(url).read()
    for url_match in url_regex.finditer(data):
        new_url = url_match.group(0)
        # only send requests to eventlet.net so as not to destroy the internet
        if new_url not in seen and 'eventlet.net' in new_url:
            seen.add(new_url)
            # while this seems stack-recursive, it's actually not:
            # spawned greenthreads start their own stacks
            pool.spawn_n(fetch, new_url, seen, pool)

def crawl(start_url):
    """Recursively crawl starting from *start_url*. Returns a set of
    urls that were found."""
    pool = eventlet.GreenPool()
    seen = set()
    fetch(start_url, seen, pool)
    pool.waitall()
    return seen

seen = crawl("http://eventlet.net")
print "I saw these urls:"
print "\n".join(seen)