我所用到的庫:BeautifulSoup、requests
# encoding:utf-8
import os, urllib, re, urllib2, requests, gzip
from StringIO import StringIO
from bs4 import BeautifulSoup
# Request headers sent with every fetch so that the crawler presents itself
# as a regular desktop browser.
req_header = {
    # Browser identification string (Chrome 23 on Windows).
    'User-Agent': (
        'Mozilla/4.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) '
        'Chrome/23.0.1271.64 Safari/537.11'
    ),
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    # Ask for gzip-compressed responses.
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    # Placeholder: if the site still refuses requests, set this to the
    # host of the site being crawled.
    'Referer': None,
}
# Fetch a URL and return the text of the page.
def get_html(url):
    """Download *url* and return the response body as a byte string.

    The request is sent with the module-level ``req_header`` headers and a
    5 second timeout.  If the server answers with
    ``Content-Encoding: gzip`` the body is decompressed before it is
    returned.  The detected Content-Encoding is printed for diagnostics.

    Errors raised by ``urllib2.urlopen`` (for example ``urllib2.URLError``)
    are not caught here and propagate to the caller.
    """
    # Placeholder entries such as 'Referer': None must be filtered out:
    # urllib2 hands header values on as-is instead of dropping None ones
    # the way requests does.
    headers = dict((k, v) for k, v in req_header.items() if v is not None)
    req = urllib2.Request(url=url, headers=headers)
    res = urllib2.urlopen(req, timeout=5)
    try:
        html = res.read()
        encoding = res.info().get('Content-Encoding')
    finally:
        # Always release the connection, even if reading fails.
        res.close()
    print('Content-Encoding : %s' % encoding)
    # A gzip-compressed page has to be decompressed before parsing.
    if encoding == 'gzip':
        html = gzip.GzipFile(fileobj=StringIO(html)).read()
    return html
# Matches the value of a double-quoted src attribute ending in ".jpg".
# [^"] keeps each match inside a single attribute value, so an earlier
# non-jpg src="..." on the same line cannot be merged with a later jpg one.
_JPG_SRC_RE = re.compile(r'src="([^"]+\.jpg)"')


# Find every src="....jpg" occurrence in the page and return the links.
def get_imglink(html_text):
    """Return the list of ``.jpg`` links found in ``src="..."`` attributes.

    :param html_text: HTML source to scan.
    :return: list of attribute values (in document order) that end in
        ``.jpg``; empty list when nothing matches.
    """
    return _JPG_SRC_RE.findall(html_text)
# Collect the image links with BeautifulSoup.
def get_imglink2(html_text):
    """Return the ``src`` value of every ``<img>`` tag in *html_text*.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and
    declared as UTF-8.  ``<img>`` tags whose ``src`` is missing or empty
    are skipped, so the returned list only contains usable strings
    (a missing attribute would otherwise show up as ``None``).

    :param html_text: HTML source of the page.
    :return: list of ``src`` attribute values in document order.
    """
    soup = BeautifulSoup(html_text, 'html.parser', from_encoding='utf-8')
    sources = (tag.get('src') for tag in soup.find_all('img'))
    return [src for src in sources if src]
# Download every image in the link list and write it to disk.  Uses
# requests, which the author found more stable than urllib.
def get_img2(imgs, path, timeout=5):
    """Download each link in *imgs* into the directory ``./<path>``.

    The directory is created when it does not exist.  Each file is named
    after the last ``/``-separated component of its link, and the local
    path is printed before the download starts.

    Links that already start with ``http://`` or ``https://`` are used
    unchanged; anything else (e.g. protocol-relative ``//host/a.jpg``) gets
    ``http:`` prepended.  Empty/``None`` links and links without a file
    name are skipped.  A failed download (connection error, timeout or
    HTTP error status) is printed and skipped; no file is written for it.

    :param imgs: iterable of image links.
    :param path: directory name, relative to the current directory.
    :param timeout: timeout in seconds passed to ``requests.get``.
    """
    dirname = './%s' % path
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    for imgurl in imgs:
        # <img> tags without a src yield None/'' links; nothing to fetch.
        if not imgurl:
            continue
        filename = imgurl.split('/')[-1]
        # A link ending in '/' has no file name to save under.
        if not filename:
            continue
        local = os.path.join(dirname, filename)
        print(local)
        # Only add a scheme when the link does not already carry one;
        # blindly prepending produced 'http:http://...' for absolute links.
        if imgurl.startswith(('http://', 'https://')):
            full_url = imgurl
        else:
            full_url = 'http:' + imgurl
        # Download first and open the file afterwards, so a failed request
        # does not leave an empty file behind.
        try:
            resp = requests.get(full_url, headers=req_header, timeout=timeout)
            # Do not save an error page under an image file name.
            resp.raise_for_status()
        except requests.RequestException as e:
            print(e)
            continue
        with open(local, 'wb') as jpg:
            jpg.write(resp.content)
if __name__ == '__main__':
    # Walk three consecutive listing pages, counting down from page 2000,
    # save every image found on each page and report how many links the
    # page contained.
    for offset in range(3):
        page_url = 'http://jandan.net/ooxx/page-%s#comments' % (2000 - offset)
        page_html = get_html(page_url)
        found = get_imglink2(page_html)
        get_img2(found, u'你要的圖')
        print(u'%s %s' % (u'圖片共:', len(found)))
煎蛋網的頁面是經過 gzip 壓縮的,需要先解壓,再從中解析 element。
使用 try/except 是因為有的 imgurl 前面有 http 前綴,有的沒有,所以在循環中 requests 可能會拋出異常。
with 語句很適合用來打開文件,可以自動關閉流。