A Simple Python Web Crawler
Posted by Tom on 2011-03-31 21:51
More code doodlin' in Python. A web crawler this time. It's Python 2 through and through (httplib, urlparse), and it needs the old BeautifulSoup 3 installed.
import sys
import httplib
import urlparse
from BeautifulSoup import BeautifulSoup

class Crawler:
    def __init__(self, host, root, depth, handler):
        self._host = host
        self._root = root
        self._depth = depth
        self._handler = handler
        self._visited = []
        self._connection = httplib.HTTPConnection(host)

    def run(self):
        self._run(self._root, '', 0)

    def _run(self, url, parentUrl, currentDepth):
        # in case some clown is using absolute URLs for internal links
        url = url.replace('http://' + self._host, '')
        # bail if we're running too deep
        if self._depth > 0 and currentDepth > self._depth:
            return
        # bail if it's a manky URL (empty, external, mailto: or a fragment)
        if not url or ':' in url or url.startswith('#'):
            return
        # normalise relative URLs against the parent URL
        if not url.startswith('/'):
            index = parentUrl.rfind('/')
            if index > -1:
                url = parentUrl[:index] + '/' + url
            else:
                url = '/' + url
        # bail if we've already visited this page
        if url in self._visited:
            return
        page = Page(self._connection, url)
        self._handler(page)
        self._visited.append(url)
        # recurse into every link found on the page
        for childUrl in page.urls:
            self._run(childUrl, url, currentDepth + 1)

class Page:
    def __init__(self, connection, url):
        self.url = url
        self.urls = []
        self.inputs = []
        # get a list of querystring keys
        querystring = urlparse.urlparse(url).query
        self.querystring_params = [part.split('=')[0] for part in querystring.split('&') if part]
        connection.connect()
        connection.request('GET', url, headers={'User-Agent': 'Colourblind Crawler 0.1'})
        response = connection.getresponse()
        self.statusCode = response.status
        # handle redirects (Location probably isn't relevant to all of them)
        if 300 <= self.statusCode < 400:
            self.urls.append(response.getheader('Location'))
        # if it's HTML, parse the sucker
        if 'text/html' in response.getheader('Content-Type', ''):
            soup = BeautifulSoup(response.read(), fromEncoding='utf-8')
            links = soup('a')
            # grab all the hrefs and remove any blanks
            self.urls.extend(filter(None, [link.get('href') for link in links]))
            self.inputs.extend(soup('input'))
            self.inputs.extend(soup('select'))
            self.inputs.extend(soup('textarea'))
        connection.close()

def print_page(page):
    print('{0} {1}'.format(page.url.ljust(75, '.'), page.statusCode))
    for field in page.inputs:
        print('\t{0}'.format(field.get('name')))

if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: crawler.py host [startPage] [depth]')
    startPage = '/'
    depth = 3
    if len(sys.argv) > 2:
        startPage = sys.argv[2]
    if len(sys.argv) > 3:
        depth = int(sys.argv[3])
    crawler = Crawler(sys.argv[1], startPage, depth, print_page)
    crawler.run()
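
Point it at a host and it'll do the rest; the start page and depth are optional and default to '/' and 3, so something like python crawler.py www.example.com / 2 does the trick (assuming you've saved it as crawler.py, which is my name for the file, not anything official). You can also drive the Crawler class directly; a minimal sketch, with www.example.com standing in for a real host:

# crawl www.example.com from the root, two levels deep, printing each
# page's status code and form field names via the print_page handler
crawler = Crawler('www.example.com', '/', 2, print_page)
crawler.run()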
It's far from perfect (I still don't know how best to handle case sensitivity in the URLs, though there's a sketch of one option below), but I wrote this as part of a larger project which will, realistically, never get more than 10% completed. It'd be a shame for the code to never see the light of day, so here it is.
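
For what it's worth, one option for the case-sensitivity question: RFC 3986 says the scheme and host of a URL are case-insensitive while the path may not be, so you can safely lowercase just those two parts before checking the visited list. A rough sketch, not wired into the crawler above (normalise_url is my name for it, not anything the code defines):

def normalise_url(url):
    # lowercase only the case-insensitive bits (scheme and host);
    # leave the path, querystring and fragment exactly as found
    parts = urlparse.urlparse(url)
    return urlparse.urlunparse((parts.scheme.lower(), parts.netloc.lower(),
                                parts.path, parts.params, parts.query,
                                parts.fragment))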