Python version: 2.7.10
While learning Python web scraping, I started by writing a program that crawls the images in a Baidu Tieba thread, referring to the 靜覓 series of blog posts.
OK, the code first:
# -*- coding: utf-8 -*-
import os
import re
import urllib
import urllib2

class imgTest:
    def __init__(self, baseUrl, seeLZ):
        self.baseUrl = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)

    # save a single image to disk
    def saveImg(self, imageURL, filename):
        u = urllib.urlopen(imageURL)
        data = u.read()
        f = open(filename, 'wb')
        f.write(data)
        f.close()

    # download a list of images, numbering the files starting from num
    def saveImgs(self, images, name, num):
        if not os.path.isdir(name):
            os.makedirs(name)
        number = num
        for imageURL in images:
            splitPath = imageURL.split('.')
            fTail = splitPath.pop()
            if len(fTail) > 3:
                # fall back to jpg when the URL has no sensible extension
                fTail = "jpg"
            fileName = name + "/" + str(number) + "." + fTail
            self.saveImg(imageURL, fileName)
            number += 1

    # extract the image URLs from one page of the thread
    def getAllImageURLs(self, pageNum):
        page = self.getPage(pageNum)
        patternImg = re.compile(r'<img class="BDE_Image" pic_type="0".*?src="(.+?\.jpg)" pic_ext="jpeg"')
        images = re.findall(patternImg, page)
        for item in images:
            print item
            self.printToLog(item)
        return images

    # append a line to txt/log.txt
    def printToLog(self, mystr):
        if not os.path.isdir('txt'):
            os.makedirs('txt')
        f = open('txt/log.txt', 'a')
        f.write(mystr + "\n")
        f.close()

    # get the title of the thread
    def getTitle(self):
        page = self.getPage(1)
        pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            self.printToLog("bbs title:" + result.group(1))
            return result.group(1).strip()
        else:
            return None

    # get the total number of pages in the thread
    def getPageNum(self):
        page = self.getPage(1)
        pattern = re.compile('<li class="l_reply_num".*?<span .*?</span>.*?<span.*?>(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if result:
            self.printToLog("page total num:" + result.group(1))
            return result.group(1).strip()
        else:
            return None

    # fetch the HTML source of one page
    def getPage(self, pageNum):
        try:
            url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            content = response.read()
            return content
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print "failed to connect to baidu tieba.", e.reason
            return None

baseURL = 'http://tieba.baidu.com/p/3925387672'
imgtest = imgTest(baseURL, 1)
totalnum = int(imgtest.getPageNum())
imageCount = 0
for i in range(1, totalnum + 1):
    imageURLs = imgtest.getAllImageURLs(i)
    imgtest.saveImgs(imageURLs, "pic", imageCount)
    imageCount += len(imageURLs)
    print imageCount
Because my Sublime Text has an encoding problem I haven't gotten around to fixing, the function comments are written in English (which probably only I can understand). The most critical step is the getAllImageURLs function, which extracts the image URLs from the page source, so learning regular expressions well really matters. One more note: I found that the image URL formats differ between Baidu Tieba threads, so each thread needs to be inspected individually, but this regular expression only needs minor tweaks to cover other cases; see the sketch below.
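For example, if a thread's image tags carry a different pic_type value or no pic_ext attribute at all, a looser pattern that only anchors on the BDE_Image class might be enough. This is just a sketch: the sample tag and its attributes below are made up for illustration and should be checked against the actual page source of the thread you are crawling.

# -*- coding: utf-8 -*-
import re

# Looser pattern: anchor only on the BDE_Image class and capture whatever
# src value follows, regardless of pic_type/pic_ext or the file extension.
loosePattern = re.compile(r'<img class="BDE_Image"[^>]*?src="(.+?)"')

# hypothetical sample tag, for illustration only
sample = '<img class="BDE_Image" pic_type="1" src="http://imgsrc.baidu.com/forum/pic/item/example.png" width="560">'
print re.findall(loosePattern, sample)
# ['http://imgsrc.baidu.com/forum/pic/item/example.png']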
OK, now I'm off to crawl a comic book to read :)