In this example we will use SGMLParser to build a simple web crawler.
import urllib
from random import choice
from sgmllib import SGMLParser

class LinkExplorer(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.links = []  # list of the urls found in the page

    def start_a(self, attrs):
        """ collect the links found in the page """
        for k in attrs:
            if k[0] == 'href' and k[1].startswith('http'):
                self.links.append(k[1])
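To see what start_a collects, we can feed the parser a small snippet of HTML by hand (just a quick sanity check, not part of the crawler):

p = LinkExplorer()
p.feed('<a href="http://www.python.org/">Python</a> <a href="/about">about</a>')
print p.links  # ['http://www.python.org/'], the relative link is skipped

Only absolute urls are kept, since the crawler has no base url to resolve relative links against.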
def explore(parser, s_url, maxvisit=10, iter=0):
    """ pick a random link in the page s_url
    and follow its links recursively """
    if iter < maxvisit:  # stop after maxvisit iterations
        print '(', iter, ') I am in', s_url
        usock = urllib.urlopen(s_url)  # download the page
        parser.reset()  # reset the links list
        parser.feed(usock.read())  # parse the current page
        if len(parser.links) > 0:
            explore(parser, choice(parser.links), maxvisit, iter + 1)
        else:  # the page has no links to follow
            print 'the page has no links'
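Following random links means we will sooner or later hit a dead or unreachable url, and urllib.urlopen raises an IOError in that case. A minimal way to make the crawler more robust (explore_safe is a hypothetical variant, not part of the original code) is to guard the download:

def explore_safe(parser, s_url, maxvisit=10, iter=0):
    """ like explore, but gives up on pages that fail to download """
    if iter < maxvisit:
        print '(', iter, ') I am in', s_url
        try:
            usock = urllib.urlopen(s_url)  # download the page
        except IOError:  # dead link, timeout, refused connection, ...
            print 'unable to open', s_url
            return
        parser.reset()
        parser.feed(usock.read())
        if len(parser.links) > 0:
            explore_safe(parser, choice(parser.links), maxvisit, iter + 1)
        else:
            print 'the page has no links'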
# test the crawler starting from Python's website
parser = LinkExplorer()
explore(parser, "http://www.python.org/")
Let's go!
( 0 ) I am in http://www.python.org/
( 1 ) I am in http://wiki.python.org/moin/NumericAndScientific
( 2 ) I am in http://numpy.scipy.org/
( 3 ) I am in http://sphinx.pocoo.org/
( 4 ) I am in http://www.bitbucket.org/birkenfeld/sphinx/issues/
( 5 ) I am in http://blog.bitbucket.org
( 6 ) I am in http://haproxy.1wt.eu/
( 7 ) I am in http://www.olivepeak.com/blog/posts/read/free-your-port-80-with-haproxy
( 8 ) I am in http://www.olivepeak.com/peaknotes/
( 9 ) I am in http://notes.olivepeak.com/account/create
Note: sgmllib is deprecated and has been removed from Python 3.
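On Python 3 the same idea can be expressed with html.parser.HTMLParser and urllib.request. Here is a sketch of the port; it mirrors the structure of the original crawler but is an assumption of mine, not code from the post:

import urllib.request
from random import choice
from html.parser import HTMLParser

class LinkExplorer(HTMLParser):
    def reset(self):
        HTMLParser.reset(self)
        self.links = []  # list of the urls found in the page

    def handle_starttag(self, tag, attrs):
        # collect absolute links from <a href="..."> tags
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value and value.startswith('http'):
                    self.links.append(value)

def explore(parser, s_url, maxvisit=10, it=0):
    """ pick a random link in the page s_url
    and follow its links recursively """
    if it < maxvisit:
        print('(', it, ') I am in', s_url)
        with urllib.request.urlopen(s_url) as usock:
            html = usock.read().decode('utf-8', errors='replace')
        parser.reset()
        parser.feed(html)
        if parser.links:
            explore(parser, choice(parser.links), maxvisit, it + 1)
        else:
            print('the page has no links')

parser = LinkExplorer()
explore(parser, "http://www.python.org/")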