The program below worked when I tested it myself; the key lines are all commented, so I won't walk through them one by one. Judge the scraped results for yourself. PS: before scraping, create a hupu_gif folder in the current directory.
# coding: utf-8
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
import re
import os

url = 'https://my.hupu.com/search?q=%E7%A6%8F%E5%88%A9'  # Hupu search page for "福利"; the characters after q= are the URL-encoded form of that word (see the first note after the listing)
htmls = set()  # used to de-duplicate gif URLs that have already been seen
startPage = 11  # first page to scrape, adjust as needed
endPage = 20  # last page to scrape, adjust as needed; the search results run to 1000 pages, so do not go above 1000
pages = range(startPage, endPage + 1)
count = 0

for page in pages:
    url_page = url + '&page=' + str(page)  # URL of one concrete search-results page
    html = urlopen(url_page)
    bsObj = BeautifulSoup(html, 'lxml')
    tds = bsObj.findAll('td', {'class': 'p_title'})  # the <td> elements that hold the post links we want
    for td in tds:
        if td.a.attrs['href']:
            td_href = td.a.attrs['href']
            print('td_href=>', td_href)
            try:
                html_each = urlopen(td_href)
            except Exception as e:
                print(e)
                print('Error, moving on to the next link')
                continue
            bsObj_each = BeautifulSoup(html_each, 'lxml')
            gifs = bsObj_each.findAll('img', src=re.compile(r'.*\.gif'))  # find images whose src has a .gif suffix
            if gifs:
                for gif in gifs:
                    gif_href = gif.attrs['src']
                    gif_href = re.match(r'.*\.gif', gif_href).group()  # strip anything after '.gif' (query strings and the like)
                    if gif_href not in htmls:
                        htmls.add(gif_href)
                        print(gif_href)
                        try:
                            local_filename, headers = urlretrieve(gif_href, filename='./hupu_gif/{}.gif'.format(count))
                        except Exception as e:
                            print('Error =>', e)
                            continue
                        if os.path.getsize(local_filename) >= 10000:  # keep gif files larger than roughly 10 KB
                            count += 1
                        else:  # otherwise delete the file
                            os.remove(local_filename)
print('Done!')
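
A quick note on the q= parameter in the search URL: %E7%A6%8F%E5%88%A9 is just the percent-encoded UTF-8 bytes of the two characters 福利. You can verify that, or generate the value for any other keyword, with the standard library; a minimal sketch:

from urllib.parse import quote, unquote
keyword = '福利'
print(quote(keyword))  # -> %E7%A6%8F%E5%88%A9, ready to paste after q=
print(unquote('%E7%A6%8F%E5%88%A9'))  # -> 福利, decoding in the other direction
search_url = 'https://my.hupu.com/search?q=' + quote(keyword)  # rebuilds the url used above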
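
If you want to see what the findAll('td', {'class': 'p_title'}) step picks out without actually hitting Hupu, here is a self-contained sketch on a made-up fragment (the markup below is illustrative, not Hupu's real page source):

from bs4 import BeautifulSoup
fragment = '''
<table>
<tr><td class="p_title"><a href="https://bbs.hupu.com/11111.html">post one</a></td></tr>
<tr><td class="p_title"><a href="https://bbs.hupu.com/22222.html">post two</a></td></tr>
<tr><td class="p_other"><a href="https://example.com/skip">not a search hit</a></td></tr>
</table>
'''
bsObj = BeautifulSoup(fragment, 'html.parser')  # html.parser avoids the lxml dependency for this demo
for td in bsObj.findAll('td', {'class': 'p_title'}):
    print(td.a.attrs['href'])  # prints only the two p_title links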
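
Finally, the getsize check only keeps files of roughly 10 KB or more, which drops placeholder images but says nothing about whether a file really is a GIF. A cheap stronger filter is to check the magic bytes; a sketch you could add next to the size check (is_gif is my own helper, not part of the script above):

def is_gif(path):
    # real GIF files start with the ASCII signature GIF87a or GIF89a
    with open(path, 'rb') as f:
        return f.read(6) in (b'GIF87a', b'GIF89a')

# e.g. replace the size test with:
# if os.path.getsize(local_filename) >= 10000 and is_gif(local_filename):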