# -*- coding: utf-8 -*-
from DataOutput import DataOutput
from HtmlDownloader import HtmlDownloader
from HtmlParser import HtmlParse
from URLmanager import UrlManager
class SpiderMan(object):
    def __init__(self):
        super(SpiderMan, self).__init__()
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParse()
        self.output = DataOutput()

    def crawl(self, root_url):
        # Seed the URL manager with the entry-point URL
        self.manager.add_new_url(root_url)
        # Keep crawling while the URL manager has unvisited URLs, up to 100 pages
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                # Fetch the next unvisited URL from the URL manager
                new_url = self.manager.get_new_url()
                # Download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # Parse the page to extract outgoing links and data
                new_urls, data = self.parser.parse(new_url, html)
                # Feed the extracted URLs back into the URL manager
                self.manager.add_new_urls(new_urls)
                # Persist the extracted data
                self.output.store_data(data)
                print("Crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed: %s" % e)
        self.output.output_html()
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl("http://baike.baidu.com/view/284853.html")
Because the hyperlinks found while parsing each HTML page are not extracted very thoroughly, the amount of data the spider can collect is limited.
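One way to improve coverage is to make the link-extraction step more deliberate: resolve relative hrefs against the current page URL and match both the legacy /view/<id> and the newer /item/<name> Baidu Baike URL patterns. The sketch below (assuming Python 3 and BeautifulSoup) shows a hypothetical helper that could back HtmlParse.parse; the function name extract_baike_links and the regex are assumptions, not part of the original project.

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Hypothetical helper, not part of the original HtmlParse: collects
# candidate Baidu Baike entry links from a downloaded page.
def extract_baike_links(page_url, html):
    soup = BeautifulSoup(html, "html.parser")
    # Match both the legacy /view/<id> style and the newer /item/<name> style.
    pattern = re.compile(r'/(view/\d+|item/[^\s"#?]+)')
    links = set()
    for tag in soup.find_all("a", href=pattern):
        # Resolve relative hrefs against the current page URL.
        links.add(urljoin(page_url, tag["href"]))
    return links

The returned set could then be passed to self.manager.add_new_urls(...) inside crawl, which should surface noticeably more candidate pages than a narrow single-pattern match.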