A Simple Python Web Crawler

Posted by Tom on 2011-03-31 21:51

More code doodlin' in Python. A web crawler this time.

import sys
import httplib
import urlparse
from BeautifulSoup import BeautifulSoup

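# Crawler walks a single host depth-first, handing each fetched page to a handler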
class Crawler:
    def __init__(self, host, root, depth, handler):
        self._host = host
        self._root = root
        self._depth = depth
        self._handler = handler
        self._visited = set()
        self._connection = httplib.HTTPConnection(host)

    def run(self):
        self._run(self._root, '', 0)

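    # recursively fetch url and everything it links to, up to the depth limit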
    def _run(self, url, parentUrl, currentDepth):
        # strip the host, in case some clown is using absolute URLs for internal links
        url = url.replace('http://' + self._host, '')
        # bail if we're running too deep
        if self._depth > 0 and currentDepth > self._depth:
            return
        # bail if it's a manky URL (external, mailto:, javascript: and friends, or just a fragment)
        if not url or ':' in url or url.startswith('#'):
            return

        # normalise relative urls
        if url[0] != '/':
            index = parentUrl.rfind('/')
            if index > -1:
                url = parentUrl[:index] + '/' + url
            else:
                url = '/' + url

        # bail if we've already visited this page
        if url in self._visited:
            return

        page = Page(self._connection, url)
        self._handler(page)
        self._visited.add(url)

        for childUrl in page.urls:
            self._run(childUrl, url, currentDepth + 1)

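# Page fetches a single URL and scrapes out its links and form inputs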
class Page:
    def __init__(self, connection, url):
        self.url = url
        self.urls = []
        self.inputs = []

        # get a list of querystring keys
        querystring = urlparse.urlparse(url).query
        self.querystring_params = [part.split('=')[0] for part in querystring.split('&') if part]

        connection.connect()
        connection.request('GET', url, headers={'User-Agent': 'Colourblind Crawler 0.1'})
        response = connection.getresponse()

        self.statusCode = response.status
        # handle redirects (Location probably isn't present on all of them, so check)
        if 300 <= self.statusCode < 400:
            location = response.getheader('Location')
            if location:
                self.urls.append(location)

        # if it's HTML, parse the sucker
        if 'text/html' in (response.getheader('Content-Type') or ''):
            soup = BeautifulSoup(response.read(), fromEncoding='utf-8')
            links = soup('a')
            # grab all the hrefs and drop any blanks
            self.urls.extend([link.get('href') for link in links if link.get('href')])
            self.inputs.extend(soup('input'))
            self.inputs.extend(soup('select'))
            self.inputs.extend(soup('textarea'))

        connection.close()

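# example handler: print each page's status and the names of its form inputs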
def print_page(page):
    print('{0} {1}'.format(page.url.ljust(75, '.'), page.statusCode))
    for field in page.inputs:
        name = field.get('name')
        print('\t{0}'.format(name))

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: python crawler.py host [startPage] [depth]')
        sys.exit(1)

    startPage = '/'
    depth = 3
    if len(sys.argv) > 2:
        startPage = sys.argv[2]
    if len(sys.argv) > 3:
        depth = int(sys.argv[3])

    crawler = Crawler(sys.argv[1], startPage, depth, print_page)
    crawler.run()
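
For what it's worth, you run it with a hostname plus an optional start page and depth, something like this (example.com being a stand-in, of course):

python crawler.py example.com / 3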

It's far from perfect (I still don't know how best to handle case sensitivity in the URLs), but I wrote this as part of a larger project which will, realistically, never get more than 10% complete. It'd be a shame for the code to never see the light of day, so here it is.
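
As for the case sensitivity thing, if I were to take a stab at it: RFC 3986 says the scheme and host of a URL are case-insensitive but the path isn't, so lowercasing everything before the visited check is only really safe against servers that ignore case anyway (IIS, mostly). Maybe something like this hypothetical helper:

import urlparse

def normalise_for_visited(url, caseInsensitivePaths=False):
    # canonicalise a URL for the visited list: the scheme and host are
    # always safe to lowercase, the path only if the server treats
    # paths case-insensitively (e.g. IIS on Windows)
    parts = urlparse.urlparse(url)
    path = parts.path.lower() if caseInsensitivePaths else parts.path
    return urlparse.urlunparse((parts.scheme.lower(), parts.netloc.lower(),
                                path, parts.params, parts.query, parts.fragment))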