# coding:utf-8
import requests
from bs4 import BeautifulSoup
import os , random
import urllib2
import sys , MySQLdb
import re
from PIL import Image
import imagehash
import uuid
from io import BytesIO
# --- Database settings (the proxy pool lives in a local MySQL table) ---
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'ip'
MYSQL_USER = 'mark'
MYSQL_PASSWD = '1234'
MYSQL_PORT = 3306
# Perceptual hashes (imagehash.average_hash) of images seen so far,
# consulted by getImage() to skip duplicate downloads.
image_hash_set = set()
# Edit the spoofed request headers here.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
def getProxy(conn, cur, columns_data):
    """Pick a random proxy from the MySQL `proxy` table and verify it works.

    Args:
        conn: open MySQLdb connection (kept in the signature for callers).
        cur: cursor on the proxy database.
        columns_data: list of valid `proxy` table ids to choose from.

    Returns:
        dict like {'http': 'http://ip:port'} for a proxy that answered
        a probe request; retries with another random proxy on failure.
    """
    flag = random.choice(columns_data)
    # Parameterized query instead of %-interpolating the id into the SQL.
    cur.execute("select proxy_ip,proxy_port from proxy where id =%s", (flag,))
    proxy = {}
    for each in cur.fetchall():
        proxy['http'] = "http://%s:%s" % (each[0], each[1])
    try:
        # Probe the proxy against a known-good site. A timeout is required:
        # without one a dead proxy makes this call hang indefinitely.
        requests.get('https://movie.douban.com/top250', proxies=proxy, timeout=10)
    except requests.RequestException:
        print("proxy error")
        # BUG FIX: the original discarded the recursive retry's result, so the
        # caller received None whenever the first chosen proxy failed.
        return getProxy(conn, cur, columns_data)
    else:
        print("proxy success")
        return proxy
def get_request(url, headers, conn, cur, columns_data):
    """Fetch *url* through a verified proxy and return the raw response body.

    The proxy comes from getProxy(); it is installed process-wide via
    urllib2.install_opener before the request is issued.
    """
    working_proxy = getProxy(conn, cur, columns_data)
    handler = urllib2.ProxyHandler(working_proxy)
    urllib2.install_opener(urllib2.build_opener(handler))
    request = urllib2.Request(url, headers=headers)
    return urllib2.urlopen(request).read()
def getImage(url, headers, conn, cur, columns_data):
    """Crawl pages 1-6 of a Tieba thread and save every not-yet-seen image.

    Images are the <img class="BDE_Image"> tags inside post_content_* divs.
    Duplicates are skipped via a perceptual average-hash kept in the
    module-level image_hash_set; files are named with uuid1().

    Args:
        url: thread URL ending in '?pn=' — the page number is appended.
        headers: spoofed request headers passed through to get_request().
        conn, cur, columns_data: proxy-database handles for get_request().
    """
    # Raw string: in Python 3 the '\U' in the original literal is an invalid
    # Unicode escape; r'...' keeps the same bytes and stays valid.
    save_dir = r'C:\Users\NorthCity\Desktop\spider\image'
    for page in range(1, 7):
        real_url = "%s%d" % (url, page)
        # BUG FIX: the original fetched the bare `url`, not `real_url`,
        # so every iteration re-downloaded page 1.
        html = get_request(real_url, headers, conn, cur, columns_data)
        soup = BeautifulSoup(html, 'html.parser')
        for post in soup.findAll('div', attrs={'id': re.compile(r'^post_content_(\d+)')}):
            for img in post.findAll('img', class_='BDE_Image'):
                src = img.get('src')
                name = uuid.uuid1()
                response = requests.get(src).content
                image = Image.open(BytesIO(response))
                imagehash_tmp = imagehash.average_hash(image)
                if imagehash_tmp not in image_hash_set:
                    # BUG FIX: the hash was never recorded, so the dedup
                    # check above could never reject a duplicate.
                    image_hash_set.add(imagehash_tmp)
                    print("%s\t%s.jpg" % (src, name))
                    # Absolute save path instead of os.chdir inside the loop.
                    image.save(os.path.join(save_dir, "%s.jpg" % name))
if __name__ == "__main__":
    # Open the proxy-pool database and collect every available proxy id.
    conn = MySQLdb.connect(host=MYSQL_HOST, user=MYSQL_USER,
                           passwd=MYSQL_PASSWD, db=MYSQL_DBNAME,
                           port=MYSQL_PORT, charset='utf8')
    cur = conn.cursor()
    cur.execute("select id from proxy")
    columns_data = [row[0] for row in cur.fetchall()]
    # Thread to crawl; getImage appends the page number after '?pn='.
    url = 'https://tieba.baidu.com/p/5033202671?pn='
    getImage(url, headers, conn, cur, columns_data)
# 【美圖】雜圖_美圖吧_百度貼吧 爬蟲
# (Article title: Baidu Tieba "pretty pictures" thread image spider.)
#
# NOTE(review): the lines below are Jianshu page-footer boilerplate and
# unrelated recommended-article snippets that were scraped along with the
# source article. They are not part of the program; commented out so the
# file remains valid Python.
#
# 最后編輯于 :
# ?著作權歸作者所有,轉載或內容合作請聯系作者
# 平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,
# 簡書系信息發布平臺,僅提供信息存儲服務。
# - 文/潘曉璐 我一進店門,熙熙樓的掌柜王于貴愁眉苦臉地迎上來……
# - 文/花漫 我一把揭開白布……
# - 文/蒼蘭香墨 我猛地睜開眼,長吁一口氣……