Added cleaner recursive web crawler example.
@@ -64,13 +64,23 @@ Port Forwarder
 
 .. literalinclude:: ../examples/forwarder.py
 
+.. _recursive_crawler_example:
+
+Recursive Web Crawler
+-----------------------------------------
+``examples/recursive_crawler.py``
+
+This is an example recursive web crawler that fetches linked pages from a seed url.
+
+.. literalinclude:: ../examples/recursive_crawler.py
+
 .. _producer_consumer_example:
 
-Producer Consumer/Recursive Web Crawler
+Producer Consumer Web Crawler
 -----------------------------------------
 ``examples/producer_consumer.py``
 
-This is an example implementation of the producer/consumer pattern as well as a functional recursive web crawler.
+This is an example implementation of the producer/consumer pattern as well as being functionally identical to the recursive web crawler.
 
 .. literalinclude:: ../examples/producer_consumer.py
 
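The producer/consumer variant mentioned above lives in ``examples/producer_consumer.py``, which this commit does not show. As a rough sketch only (not the contents of that file), the pattern could look something like the following with eventlet's ``GreenPool`` and ``Queue``; the regex, the ``fetch``/``queue`` names, and the polling loop are placeholders of this sketch::

    # Rough sketch, NOT the real examples/producer_consumer.py: fetch() is the
    # producer (it puts newly discovered urls on a queue) and the loop at the
    # bottom is the consumer (it pulls urls off the queue and hands them to
    # the GreenPool).
    from __future__ import with_statement

    from eventlet.green import urllib2
    import eventlet
    import re

    # placeholder link extraction; any url regex would do here
    url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+)')

    pool = eventlet.GreenPool()
    queue = eventlet.Queue()
    seen = set(["http://eventlet.net"])

    def fetch(url):
        print "fetching", url
        data = ''
        with eventlet.Timeout(5, False):
            data = urllib2.urlopen(url).read()
        for match in url_regex.finditer(data):
            new_url = match.group(0)
            # stay on eventlet.net, as in the recursive example
            if new_url not in seen and 'eventlet.net' in new_url:
                seen.add(new_url)
                queue.put(new_url)

    queue.put("http://eventlet.net")
    # keep dispatching until the queue is drained and nothing is still
    # fetching, at which point no more urls can appear
    while not queue.empty() or pool.running() != 0:
        if queue.empty():
            eventlet.sleep(0.1)
        else:
            pool.spawn_n(fetch, queue.get())
    pool.waitall()
    print "I saw these urls:"
    print "\n".join(seen)

The point of the split is that ``fetch`` only produces work (new urls on the queue) while the single loop at the bottom consumes it, so the dispatch policy lives in one place.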

examples/recursive_crawler.py (new file, 49 lines)
@@ -0,0 +1,49 @@
"""This is a recursive web crawler.  Don't go pointing this at random sites;
 | 
			
		||||
it doesn't respect robots.txt and it is pretty brutal about how quickly it 
 | 
			
		||||
fetches pages.
 | 
			
		||||
 | 
			
		||||
The code for this is very short; this is perhaps a good indication
 | 
			
		||||
that this is making the most effective use of the primitves at hand.
 | 
			
		||||
The fetch function does all the work of making http requests,
 | 
			
		||||
searching for new urls, and dispatching new fetches.  The GreenPool
 | 
			
		||||
acts as sort of a job coordinator (and concurrency controller of
 | 
			
		||||
course).
 | 
			
		||||
"""
 | 
			
		||||
from __future__ import with_statement
 | 
			
		||||
 | 
			
		||||
from eventlet.green import urllib2
 | 
			
		||||
import eventlet
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
# http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
 | 
			
		||||
url_regex = re.compile(r'\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def fetch(url, seen, pool):
 | 
			
		||||
    """Fetch a url, stick any found urls into the seen set, and
 | 
			
		||||
    dispatch any new ones to the pool."""
 | 
			
		||||
    print "fetching", url
 | 
			
		||||
    data = ''
 | 
			
		||||
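    # Timeout(5, False) is a silent timeout: if the read takes longer than
    # five seconds the with-block simply exits and data is left as ''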
    with eventlet.Timeout(5, False):
        data = urllib2.urlopen(url).read()
    for url_match in url_regex.finditer(data):
        new_url = url_match.group(0)
        # only send requests to eventlet.net so as not to destroy the internet
        if new_url not in seen and 'eventlet.net' in new_url:
            seen.add(new_url)
            # while this seems stack-recursive, it's actually not:
            # spawned greenthreads start their own stacks
            pool.spawn_n(fetch, new_url, seen, pool)

def crawl(start_url):
    """Recursively crawl starting from *start_url*.  Returns a set of
    urls that were found."""
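    # the pool is what bounds concurrency: only a limited number of fetch
    # greenthreads (1000 by default) run at any one time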
    pool = eventlet.GreenPool()
    seen = set()
    fetch(start_url, seen, pool)
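    # wait for every fetch spawned into the pool, including ones spawned
    # by other fetches, to finish before returning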
    pool.waitall()
    return seen

seen = crawl("http://eventlet.net")
print "I saw these urls:"
print "\n".join(seen)