- Crawling Toutiao (今日頭條) data with the Scrapy framework, implementing the following main features:
- storing the data in a MongoDB database
- downloading images
- randomly rotating the User-Agent
- hooking into an IP proxy pool
- sending a notification email
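For orientation, this is the project layout assumed throughout (module names are taken from the imports in the code below; the spider file name is an assumption):

toutiao_two/
├── scrapy.cfg
└── toutiao_two/
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    ├── emailsend.py        # added by hand, see the last section
    └── spiders/
        └── toutiao_spider.py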
1. First, press F12 to open the browser's developer tools, as shown in the figure:
[screenshot: developer tools with the feed request selected]
- Since Toutiao's data is loaded dynamically via JavaScript, we need to find the API endpoint that loads the data and analyze that instead; the data API URL is shown in the figure above.

First, look at the Headers information:
[screenshot: request headers]
We can see that the query string parameters include as and cp. These two parameters are one of Toutiao's anti-scraping measures, so we need an algorithm to generate them (adapted from code found online).

Next, look at the Response, i.e. the returned data:
[screenshot: response body]
Copy the data into an online JSON formatter to make it easier to analyze:
[screenshot: formatted JSON data]
- As the figure shows, the response contains the full news information, and at the very end there is a max_behot_time parameter; by updating this parameter on each request, we can crawl the news feed in a loop.
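For reference, the JSON comes back shaped roughly like this (the field names are the ones the spider below reads; the values are illustrative only):

{
    "data": [
        {
            "abstract": "...",
            "chinese_tag": "...",
            "title": "...",
            "source": "...",
            "image_list": [{"url": "//p3.pstatp.com/list/..."}]
        }
    ],
    "next": {"max_behot_time": 1575444466}
}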
- The spider code is as follows:
# -*- coding: utf-8 -*-
import scrapy
import json
import time
import hashlib
import random
from datetime import datetime

from ..emailsend import EmailSend
from toutiao_two.items import ToutiaoTwoItem


class ToutiaoSpiderSpider(scrapy.Spider):
    name = 'toutiao_spider'
    allowed_domains = ['www.toutiao.com']
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Host': 'www.toutiao.com',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Connection': 'keep-alive',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }
    cookies = {'tt_webid': '6722356446824613389'}
    start_url = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='
    max_behot_time = '0'
    D = {'hot_time': '0'}

    def get_as_cp(self):
        # Generates the as and cp parameters. The algorithm mirrors the
        # obfuscated JavaScript in Toutiao's home_4abea46.js.
        zz = {}
        now = round(time.time())  # current timestamp in seconds
        e = hex(int(now)).upper()[2:]  # hex() renders the integer as a hex string
        a = hashlib.md5()  # md5 of the timestamp, used as an upper-case hex digest
        a.update(str(int(now)).encode('utf-8'))
        i = a.hexdigest().upper()
        if len(e) != 8:
            # Fall back to fixed values if the hex timestamp is not 8 characters.
            zz = {'as': '479BB4B7254C150',
                  'cp': '7E0AC8874BB0985'}
            return zz
        n = i[:5]
        a = i[-5:]
        r = ''
        s = ''
        for i in range(5):
            s = s + n[i] + e[i]  # interleave md5 prefix with hex timestamp
        for j in range(5):
            r = r + e[j + 3] + a[j]  # interleave hex timestamp with md5 suffix
        zz = {
            'as': 'A1' + s + e[-3:],
            'cp': e[0:3] + r + 'E1'
        }
        return zz

    def start_requests(self):
        global start_time  # module-level start time, read again in closed()
        start_time = datetime.now()
        ascp = self.get_as_cp()
        yield scrapy.FormRequest(
            url=(self.start_url + self.max_behot_time
                 + '&max_behot_time_tmp=' + self.max_behot_time
                 + '&tadrequire=true&as=' + ascp['as'] + '&cp=' + ascp['cp']),
            method='GET',
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse,
        )

    def parse(self, response):
        json_result = json.loads(response.text)
        # Optional fallback: if the body comes back empty, wait and retry with
        # the last saved hot_time (fixed as/cp values captured from a browser):
        # if json_result is None:
        #     time.sleep(20)
        #     yield scrapy.FormRequest(
        #         url=(self.start_url + self.D['hot_time'] + '&max_behot_time_tmp='
        #              + self.D['hot_time'] + '&tadrequire=true&as=A115DD5DE72AC29'
        #              + '&cp=5DD7FA9C02D90E1'),
        #         method='GET', headers=self.headers, cookies=self.cookies,
        #         callback=self.parse,
        #     )
        item = ToutiaoTwoItem()
        infos = json_result['data']
        for info in infos:
            image_url_list = []
            item['abstract'] = info.get('abstract', '')
            item['chinese_tag'] = info.get('chinese_tag', '')
            item['title'] = info.get('title', '')
            item['source'] = info.get('source', '')
            image_urls = info.get('image_list', [])
            for image_url in image_urls:
                url = 'https:' + image_url['url']  # feed URLs are protocol-relative
                image_url_list.append(url)
            item['image_url'] = image_url_list
            # Note: the same item instance is reused across iterations, which is
            # why the MongoDB pipeline below deep-copies it before inserting.
            yield item
        time.sleep(random.randint(1, 4))  # crude throttling; note this blocks the reactor
        if json_result.get('next'):
            next_page = json_result['next']
            if next_page.get('max_behot_time'):
                max_behot_time = str(next_page['max_behot_time'])
                self.D.update({'hot_time': max_behot_time})
                ascp = self.get_as_cp()
                yield scrapy.FormRequest(
                    url=(self.start_url + max_behot_time
                         + '&max_behot_time_tmp=' + max_behot_time
                         + '&tadrequire=true&as=' + str(ascp['as'])
                         + '&cp=' + str(ascp['cp'])),
                    method='GET',
                    headers=self.headers,
                    cookies=self.cookies,
                    callback=self.parse,
                )

    def closed(self, reason):
        # Called automatically when the spider closes.
        email = EmailSend()
        use_time = datetime.now() - start_time  # total crawl time
        close_time = 'toutiao spider start time: {}; end time: {}; elapsed: {}'.format(
            start_time, datetime.now(), use_time)
        content = 'Spider close reason: {}'.format(reason)
        email.send_text_email('sender-email@qq.com', 'recipient-email@qq.com', close_time, content)
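The spider relies on a ToutiaoTwoItem with the five fields assigned in parse(); a minimal items.py sketch covering exactly those fields:

import scrapy


class ToutiaoTwoItem(scrapy.Item):
    abstract = scrapy.Field()
    chinese_tag = scrapy.Field()
    title = scrapy.Field()
    source = scrapy.Field()
    image_url = scrapy.Field()  # list of full image URLs, consumed by the image pipeline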
To hook into the IP proxy pool, add the following code to middlewares.py and enable it in settings:
# Hook into the IP proxy pool
import logging
import random
import requests


class ProxyMiddleware(object):
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        # Fetch one proxy (host:port) from the proxy-pool service.
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                return response.text
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # Only switch to a proxy once the request has already been retried.
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('Using proxy ' + proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )
- settings:
# Endpoint that returns a usable proxy IP
PROXY_URL = 'http://localhost:5555/random'
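A quick sanity check that the pool is up (assuming the service behind PROXY_URL returns a bare host:port string per request, which is what get_random_proxy() expects):

import requests

print(requests.get('http://localhost:5555/random').text)  # e.g. 123.45.67.89:8080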
To rotate the User-Agent randomly, add the following, also in middlewares.py, and configure it in settings:
# Rotate the User-Agent at random
class RandomUserAgent(object):
    """Randomly rotate user agents based on a list of predefined ones."""

    def __init__(self, agents):
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        # setdefault only applies the random UA if none has been set yet
        request.headers.setdefault('User-Agent', random.choice(self.agents))
- settings:
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
DOWNLOADER_MIDDLEWARES = {
    'toutiao_two.middlewares.RandomUserAgent': 543,
    'toutiao_two.middlewares.ProxyMiddleware': 550,
}
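One caveat worth verifying against your Scrapy version: the built-in scrapy.downloadermiddlewares.useragent.UserAgentMiddleware is registered at priority 500, so it sets a User-Agent before RandomUserAgent (543) runs, and the setdefault call then has no effect; the spider above also passes an explicit User-Agent in its headers dict, which takes precedence over both. For the rotation to actually apply, disable the built-in middleware and drop the hard-coded header:

DOWNLOADER_MIDDLEWARES = {
    'toutiao_two.middlewares.RandomUserAgent': 543,
    'toutiao_two.middlewares.ProxyMiddleware': 550,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}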
Image downloading is implemented in pipelines.py:
import copy

import pymongo
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


# Download images
class ToutiaoImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Name each file after the last segment of its URL.
        url = request.url
        file_name = url.split('/')[-1] + '.jpg'
        return file_name

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image download failed')
        return item

    def get_media_requests(self, item, info):
        for image_url in item['image_url']:
            yield Request(image_url)
- Configure the image storage path in settings:
IMAGES_STORE = './images'
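Note that Scrapy's ImagesPipeline depends on the Pillow library for image processing, so make sure it is installed (pip install Pillow), or the pipeline will refuse to start.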
- Data storage is likewise implemented in pipelines.py:
# Store items in MongoDB
class ToutiaoTwoMongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        """
        Problem: the database ended up full of duplicated rows.

        Cause and fix: the spider reuses a single item instance, so by the time
        this pipeline runs, the item's fields may already have been overwritten
        by a later iteration. Deep-copying the item before the insert freezes
        its current values (hence the `import copy` above).
        """
        asynItem = copy.deepcopy(item)
        infos = {'chinese_tag': asynItem['chinese_tag'], 'title': asynItem['title'],
                 'source': asynItem['source'], 'image_url': asynItem['image_url']}
        self.db.toutiao.insert_one(infos)  # insert() is deprecated in pymongo 3.x
        return item
- Enable both pipelines and configure the database connection in settings. The two pipelines need distinct priorities; giving the image pipeline the lower number lets it run first, so items whose images fail to download are dropped before reaching MongoDB:
ITEM_PIPELINES = {
    'toutiao_two.pipelines.ToutiaoImagePipeline': 300,
    'toutiao_two.pipelines.ToutiaoTwoMongoPipeline': 301,
}
MONGO_URI = 'localhost'
MONGO_DB = 'scrapy_toutiao'
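To spot-check what was stored, a few lines in a Python shell suffice (database and collection names taken from the settings above and the pipeline's insert call):

import pymongo

client = pymongo.MongoClient('localhost')
db = client['scrapy_toutiao']
print(db.toutiao.count_documents({}))  # number of stored news items
print(db.toutiao.find_one())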
Email sending is implemented by adding an emailsend.py file next to settings.py, with the following content:
# -*- coding: utf-8 -*-
'''
---------------
Description of this file
:author: Luopeng
:date created: 2019-12-04
:python version: 3.6
---------------
'''
import logging
import smtplib
from email.mime.text import MIMEText


class EmailSend(object):
    def __init__(self):
        self.logging = logging.getLogger('Warning')
        self.email_host = 'smtp.qq.com'
        self.email_port = 465
        self.email_pass = '*********'  # your own SMTP authorization code

    def send_text_email(self, from_addr, to_addrs, subject, content):
        message_text = MIMEText(content, 'plain', 'utf8')
        message_text['From'] = from_addr
        message_text['To'] = to_addrs
        message_text['Subject'] = subject
        try:
            # Connect to the mail server while creating the client object.
            client = smtplib.SMTP_SSL(host=self.email_host, port=self.email_port)
            login_result = client.login(from_addr, self.email_pass)
            if login_result and login_result[0] == 235:  # 235 = authentication succeeded
                print('Login successful')
                client.sendmail(from_addr, to_addrs, message_text.as_string())
                print('Email sent successfully')
            else:
                print('Email send error:', login_result[0], login_result[1])
        except Exception as e:
            self.logging.error('Error connecting to the mail server: {}'.format(e))

    def send_image_email(self):
        pass

    def send_word_email(self):
        pass

    def send_video_email(self):
        pass
- See the spider code above (the closed() method) for how this is invoked.
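With everything wired up, the crawl is started from the project root with Scrapy's standard command:

scrapy crawl toutiao_spider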