Showing posts with label spider. Show all posts
Showing posts with label spider. Show all posts

Thursday, June 9, 2011

Crawling the web with SGMLParser

In this example we will use SGMLParser in order to build a simple web crawler.
import urllib
from random import choice
from sgmllib import SGMLParser

class LinkExplorer(SGMLParser): 
 def reset(self):                              
  SGMLParser.reset(self) 
  self.links = [] # list with the urls

 def start_a(self, attrs):
  """ fill the links with the links in the page """
  for k in attrs:
   if k[0] == 'href' and k[1].startswith('http'): 
    self.links.append(k[1])

def explore(parser,s_url,maxvisit=10,iter=0):
 """ pick a random link in the page s_url
     and follow its links recursively """
 if iter < maxvisit: # it will stop after maxvisit iteration
  print '(',iter,') I am in',s_url
  usock = urllib.urlopen(s_url) # download the page
  parser.reset() # reset the list
  parser.feed(usock.read()) # parse the current page
  if len(parser.links) > 0:
   explore(parser,choice(parser.links),maxvisit,iter+1)
  else: # if the page has no links to follow
   print 'the page has no links'

# test the crawler starting from the python's website
parser = LinkExplorer()
explore(parser,"http://www.python.org/")
Let's go!
( 0 ) I am in http://www.python.org/
( 1 ) I am in http://wiki.python.org/moin/NumericAndScientific
( 2 ) I am in http://numpy.scipy.org/
( 3 ) I am in http://sphinx.pocoo.org/
( 4 ) I am in http://www.bitbucket.org/birkenfeld/sphinx/issues/
( 5 ) I am in http://blog.bitbucket.org
( 6 ) I am in http://haproxy.1wt.eu/
( 7 ) I am in http://www.olivepeak.com/blog/posts/read/free-your-port-80-with-haproxy
( 8 ) I am in http://www.olivepeak.com/peaknotes/
( 9 ) I am in http://notes.olivepeak.com/account/create