Added cleaner recursive web crawler example.
@@ -64,13 +64,23 @@ Port Forwarder

 .. literalinclude:: ../examples/forwarder.py

+.. _recursive_crawler_example:
+
+Recursive Web Crawler
+-----------------------------------------
+``examples/recursive_crawler.py``
+
+This is an example recursive web crawler that fetches linked pages from a seed url.
+
+.. literalinclude:: ../examples/recursive_crawler.py
+
 .. _producer_consumer_example:

-Producer Consumer/Recursive Web Crawler
+Producer Consumer Web Crawler
 -----------------------------------------
 ``examples/producer_consumer.py``

-This is an example implementation of the producer/consumer pattern as well as a functional recursive web crawler.
+This is an example implementation of the producer/consumer pattern as well as being identical in functionality to the recursive web crawler.

 .. literalinclude:: ../examples/producer_consumer.py

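The producer/consumer file referenced in the docs hunk above (``examples/producer_consumer.py``) is not part of this diff, so what follows is only a rough sketch of the pattern that description names, not the file's actual contents. In the sketch, fetchers act as producers that push every url they find onto an eventlet.Queue, and the main greenthread acts as the consumer that dedupes urls and dispatches new fetches; the fetch/crawl names, the None sentinel, and the simplified regex are illustrative assumptions.

# Hypothetical producer/consumer crawl (a sketch, not examples/producer_consumer.py).
from __future__ import with_statement

from eventlet.green import urllib2
import eventlet
import re

# deliberately simplified url pattern, good enough for a sketch
url_regex = re.compile(r'\b((?:[\w-]+://|www\.)[^\s()<>]+)')


def fetch(url, outq):
    """Producer: fetch a url and push every url found in its body onto outq."""
    print "fetching", url
    data = ''
    with eventlet.Timeout(5, False):
        data = urllib2.urlopen(url).read()
    for match in url_regex.finditer(data):
        outq.put(match.group(0))
    outq.put(None)  # signal that this fetcher is finished


def crawl(start_url):
    """Consumer: pull urls off the queue, dedupe them, and dispatch new
    fetches until no fetchers are outstanding.  Returns the set of urls seen."""
    pool = eventlet.GreenPool()
    q = eventlet.Queue()
    seen = set([start_url])
    outstanding = 1
    pool.spawn_n(fetch, start_url, q)
    while outstanding:
        url = q.get()
        if url is None:
            outstanding -= 1          # one fetcher finished
        elif url not in seen and 'eventlet.net' in url:
            seen.add(url)
            outstanding += 1
            pool.spawn_n(fetch, url, q)
    return seen


print "\n".join(crawl("http://eventlet.net"))

The count of outstanding fetchers takes the place of pool.waitall() here: each fetcher's urls arrive on the queue before its terminating None, so when the count reaches zero there is no work left anywhere.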
examples/recursive_crawler.py (new file, 49 lines)
@@ -0,0 +1,49 @@
"""This is a recursive web crawler.  Don't go pointing this at random sites;
it doesn't respect robots.txt and it is pretty brutal about how quickly it
fetches pages.

The code for this is very short; this is perhaps a good indication
that this is making the most effective use of the primitives at hand.
The fetch function does all the work of making http requests,
searching for new urls, and dispatching new fetches.  The GreenPool
acts as sort of a job coordinator (and concurrency controller of
course).
"""
from __future__ import with_statement

from eventlet.green import urllib2
import eventlet
import re

# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')


def fetch(url, seen, pool):
    """Fetch a url, stick any found urls into the seen set, and
    dispatch any new ones to the pool."""
    print "fetching", url
    data = ''
    with eventlet.Timeout(5, False):
        data = urllib2.urlopen(url).read()
    for url_match in url_regex.finditer(data):
        new_url = url_match.group(0)
        # only send requests to eventlet.net so as not to destroy the internet
        if new_url not in seen and 'eventlet.net' in new_url:
            seen.add(new_url)
            # while this seems stack-recursive, it's actually not:
            # spawned greenthreads start their own stacks
            pool.spawn_n(fetch, new_url, seen, pool)


def crawl(start_url):
    """Recursively crawl starting from *start_url*.  Returns a set of
    urls that were found."""
    pool = eventlet.GreenPool()
    seen = set()
    fetch(start_url, seen, pool)
    pool.waitall()
    return seen


seen = crawl("http://eventlet.net")
print "I saw these urls:"
print "\n".join(seen)
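For readers unfamiliar with eventlet.Timeout, the two-argument form used in fetch() above is what keeps a slow server from hanging the crawler: passing False as the exception argument silences the timeout, so the with block is simply abandoned and data keeps its default value.  A minimal standalone sketch of that behavior (the url and the 5-second limit are just the example's own choices):

from __future__ import with_statement

from eventlet.green import urllib2
import eventlet

data = ''
with eventlet.Timeout(5, False):   # False: swallow the timeout instead of raising
    data = urllib2.urlopen("http://eventlet.net").read()
# if the request took longer than 5 seconds we land here with data == ''
print "got", len(data), "bytes (0 means the fetch timed out)"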