Here I just want to mention one thing: sometimes it can look nicer to write the comments in another language, because such comments are much easier to pick out from a bunch of code.
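For instance (a made-up two-liner, just to illustrate the point), a comment written in Chinese is hard to miss when scanning code built from English keywords:

def mean(xs):
    # 计算平均值: compute the arithmetic mean of the list.
    return sum(xs) / float(len(xs))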
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Dec 2012 ys

import re
import urllib2
import os


class Robot(object):

    def __init__(self, host, max_depth=3):
        self.host = host
        self.max_depth = max_depth
        self.log_file = os.path.dirname(os.path.abspath(__file__)) \
            + os.sep + 'robot_log.txt'

    def start(self):
        # This pattern ignores all relative urls.
        self.reg = re.compile(r'''<a href="(?P<url>http.+?)(#.*?)?".+?/a>''')
        # Avoid infinite loops, e.g. a sub page
        # containing a url that points back to the home page.
        self.url_dict = {}
        self.walk('http://www.' + self.host)

    def walk(self, url, depth=0):
        if self.is_walked(url):
            return
        # The number of dashes represents the depth of the url.
        print '- ' * depth + url
        content = ''
        try:
            page = urllib2.urlopen(url)
            content = page.read()
            self.save_content(url, content)
        except urllib2.URLError, e:
            # Such as a 404 or a bad network connection.
            # (URLError is the base class of HTTPError, so it covers both.)
            print e, ' >> ', url
        depth += 1
        # Search for urls in the page.
        # finditer returns an iterator over the matches.
        for i in self.reg.finditer(content):
            u = i.group('url').rstrip('/')
            # We don't need to search other domains' urls.
            if not self.is_current_domain(u):
                continue
            # Recurse until the max recursion depth is reached;
            # urls that have already been handled are skipped.
            if depth < self.max_depth:
                self.walk(u, depth)
            elif not self.is_walked(u):
                print '- ' * depth + u

    def is_walked(self, url):
        if url in self.url_dict:
            return True
        else:
            self.url_dict[url] = True
            return False

    def is_current_domain(self, url):
        return self.host in url

    def save_content(self, url, content):
        sep = '\n\n' + '*' * 60 + '\n\n'
        f = open(self.log_file, 'a')
        f.write(sep + url + sep + content)
        f.close()


# Ctrl + C to exit.
try:
    robot = Robot('stackoverflow.com')
    robot.start()
except KeyboardInterrupt:
    print "\nTerminated"
    exit()
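The listing above targets Python 2 (urllib2, print statements). As a rough sketch rather than a drop-in replacement, the same skeleton in Python 3 would look something like this; urllib2 was split into urllib.request and urllib.error in Python 3, and the names LINK_RE and walk are my own for this sketch:

#!/usr/bin/env python3
# A minimal Python 3 sketch of the same crawler skeleton,
# reusing the same crude link regex.
import re
import urllib.request
import urllib.error

LINK_RE = re.compile(r'<a href="(?P<url>http.+?)(#.*?)?".+?/a>')

def walk(url, host, seen, depth=0, max_depth=3):
    # Skip urls that were already visited to avoid infinite loops.
    if url in seen:
        return
    seen.add(url)
    print('- ' * depth + url)
    try:
        # read() returns bytes in Python 3, so decode before matching.
        content = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
    except urllib.error.URLError as e:
        print(e, ' >> ', url)
        return
    for match in LINK_RE.finditer(content):
        u = match.group('url').rstrip('/')
        # Only follow urls on the current host, up to max_depth.
        if host in u and depth + 1 < max_depth:
            walk(u, host, seen, depth + 1, max_depth)

try:
    walk('http://www.stackoverflow.com', 'stackoverflow.com', set())
except KeyboardInterrupt:
    print('\nTerminated')

The dash-prefixed printing works the same way in both versions: the number of dashes shows how deep the recursion has gone.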