#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Dec 2012 ys
import re
import urllib2
import os
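
# A simple recursive crawler: it follows absolute links that stay on the
# given host, up to max_depth levels, and appends every fetched page to
# robot_log.txt next to this script.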
class Robot():
    def __init__(self, host, max_depth=3):
        self.host = host
        self.max_depth = max_depth
        self.log_file = os.path.dirname(os.path.abspath(__file__)) \
            + os.sep + 'robot_log.txt'

    def start(self):
        # The pattern matches only absolute URLs (href starting with
        # "http"), so relative URLs are ignored; an optional "#fragment"
        # is kept out of the captured group.
        self.reg = re.compile(r'''<a href="(?P<url>http.+?)(#.*?)?".+?/a>''')
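        # For example, '<a href="http://example.com/page#top">x</a>' yields
        # 'http://example.com/page' in the named group, while a relative
        # link such as '<a href="/questions">x</a>' is not matched.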
        # Remember visited URLs to avoid an infinite loop,
        # e.g. a sub-page that links back to the home page.
        self.url_dict = {}
        self.walk('http://www.' + self.host)
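
    # Fetch one page, log its contents, then recurse into the absolute
    # links it contains, up to self.max_depth levels.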
    def walk(self, url, depth=0):
        if self.is_walked(url):
            return
        # The number of dashes represents the depth of the URL.
        print '- ' * depth + url
        content = ''
        try:
            page = urllib2.urlopen(url)
            content = page.read()
            self.save_content(url, content)
        except urllib2.URLError, e:
            # E.g. a 404 response or a failed network connection.
            print e, ' >> ', url
        depth += 1
        # Search for URLs in the page; finditer returns an iterator
        # over the matches.
        for i in self.reg.finditer(content):
            u = i.group('url').rstrip('/')
            # Skip URLs that do not belong to the current host; there is
            # no need to crawl other domains.
            if not self.is_current_domain(u):
                continue
            # Recurse until the maximum depth is reached.
            if depth < self.max_depth:
                self.walk(u, depth)
            elif not self.is_walked(u):
                # At the maximum depth, just print URLs that have not
                # been handled yet.
                print '- ' * depth + u
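
    # Return True if the URL has already been seen; otherwise record it
    # and return False.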
    def is_walked(self, url):
        if url in self.url_dict:
            return True
        else:
            self.url_dict[url] = True
            return False
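
    # A simple substring test: any URL that contains the host name is
    # treated as part of the current domain.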
    def is_current_domain(self, url):
        if self.host not in url:
            return False
        else:
            return True

    def save_content(self, url, content):
        sep = '\n\n' + '*' * 60 + '\n\n'
        content = sep + url + sep + content
        f = open(self.log_file, 'a')
        f.write(content)
        f.close()
# Ctrl + C to exit.
try:
    robot = Robot('stackoverflow.com')
    robot.start()
except KeyboardInterrupt:
print "\nTerminated"
exit()
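
# To crawl a different site, pass another host and depth, e.g.
# (illustrative values): Robot('example.com', max_depth=2).start()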