# -*- coding: utf-8 -*-
# @Time : 2018/6/20 8:57
# @Author :
# @File : jd_phone_spider.py
# @Description : 京東的手機暢銷榜爬取
import requests
import pymongo
import time
import json
import codecs
from lxml import etree
from datetime import datetime
from selenium import webdriver
from pyquery import PyQuery as pq
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from jd_phone.config import MONGO_URI, MONGO_DB, MONGO_TABLE
class JdPhone:
    """Crawler for the JD.com mobile-phone ranking pages.

    Three rankings are visited with a headless Chrome browser (hot-sale,
    hot-search and "preferred goods").  For every product found, the detail
    page is fetched with ``requests``, its specification table is parsed and
    the resulting record is appended to ``jd_phones.json`` and inserted into
    MongoDB.

    Call :meth:`close` when finished; ``__del__`` does so as a last resort.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'}

    # JavaScript used by both page loaders to jump to the bottom of the page.
    _SCROLL_TO_BOTTOM_JS = 'window.scrollTo(0, document.body.scrollHeight)'

    # Seconds to wait for a product detail page before giving up.
    _REQUEST_TIMEOUT = 10

    # (stored field name, key in the parsed detail dict) in output order.
    # NOTE(review): the Chinese keys below (and the link text used in
    # get_sale_search_html) must match the text on the live pages exactly;
    # confirm they were not altered by a character-set conversion.
    _FIELD_MAP = (
        ('source_type', 'source_type'),  # 0: hot-sale, 1: hot-search, 2: preferred
        ('model', '型號'),
        ('brand', '品牌'),
        ('public_year', '上市年份'),
        ('public_month', '上市月份'),
        ('price', 'price'),
        ('phone_name', 'phone_name'),
    )

    def __init__(self):
        """Start headless Chrome, connect to MongoDB and open the JSON output file."""
        self.base_url = 'https://item.jd.com/'
        # Prefix joined with a thumbnail's data-url to get the large image URL.
        self.image_base_url = 'http://img10.360buyimg.com/n1/s450x450_'
        self.sale_url = 'https://top.jd.com/sale?cateId=655'  # hot-sale ranking entry URL
        self.search_url = 'https://top.jd.com/search?cateId=655'  # hot-search ranking entry URL
        self.preferred_url = 'https://top.jd.com/preferred?cateId=655'  # preferred-goods ranking entry URL
        self.options = webdriver.ChromeOptions()
        self.options.add_argument("--headless")
        self.browser = webdriver.Chrome(chrome_options=self.options)
        self.browser.set_window_size(1500, 1000)
        self.wait = WebDriverWait(self.browser, 10)
        self.client = pymongo.MongoClient(MONGO_URI)
        self.db = self.client[MONGO_DB]
        self.file = codecs.open('jd_phones.json', 'wb+', encoding='utf-8')

    def close(self):
        """Release the browser, the output file and the MongoDB client.

        Safe to call more than once, and safe on an instance whose
        ``__init__`` failed part-way (missing attributes are skipped).
        """
        # quit() ends the whole WebDriver session (including the driver
        # process); close() would only close the current window.
        for attr, method in (('browser', 'quit'), ('file', 'close'), ('client', 'close')):
            resource = getattr(self, attr, None)
            if resource is None:
                continue
            setattr(self, attr, None)
            try:
                getattr(resource, method)()
            except Exception as exc:
                # Cleanup is best-effort: report and keep releasing the rest.
                print('failed to release %s: %r' % (attr, exc))

    def __del__(self):
        """Last-resort cleanup when the instance is garbage collected."""
        self.close()
        print('執行完成!')

    def get_sale_search_html(self, url):
        """Load a hot-sale or hot-search ranking page and expand it fully.

        The page is scrolled to the bottom and the "load more" link is
        clicked repeatedly until the link can no longer be found.

        :param url: ranking page URL
        :return: the rendered page source
        """
        self.browser.get(url)
        try:
            while True:
                # NOTE(review): the original nesting of this loop was lost;
                # this assumes "scroll five times, then click" - confirm.
                for _ in range(5):
                    self.browser.execute_script(self._SCROLL_TO_BOTTOM_JS)
                self.browser.find_element_by_link_text('加載更多').click()
                time.sleep(2)
        except NoSuchElementException:
            # No "load more" link left: everything has been loaded.
            print('加載完成!')
        return self.browser.page_source

    def get_preferred_html(self, url):
        """Load the preferred-goods ranking page.

        The page is scrolled to the bottom 20 times, pausing one second
        between scrolls, before the source is read.

        :param url: ranking page URL
        :return: the rendered page source
        """
        self.browser.get(url)
        for _ in range(20):
            self.browser.execute_script(self._SCROLL_TO_BOTTOM_JS)
            time.sleep(1)
        print('好物榜加載完成!')
        return self.browser.page_source

    @classmethod
    def _build_record(cls, data, insert_time):
        """Build the stored record from a parsed detail dict.

        The fields listed in ``_FIELD_MAP`` are lifted to the top level under
        their stored names; everything else ends up under ``key_param``.
        ``data`` itself is left unmodified.

        :param data: dict produced by :meth:`parse_detail`
        :param insert_time: formatted timestamp stored as ``insert_time``
        :return: the record dict
        :raises KeyError: if a field listed in ``_FIELD_MAP`` is missing
        """
        remaining = dict(data)
        remaining.pop('insert_time', None)
        record = {'insert_time': insert_time}
        for target, source in cls._FIELD_MAP:
            record[target] = remaining.pop(source)
        record['key_param'] = remaining  # remaining specification entries
        return record

    def save_data(self, data):
        """Append one record to the JSON file and insert it into MongoDB.

        Records lacking one of the required fields are skipped (as before),
        but the reason is now reported instead of being silently swallowed.

        :param data: dict produced by :meth:`parse_detail`
        :return: None
        """
        insert_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:
            record = self._build_record(data, insert_time)
        except KeyError as exc:
            print('skipping sku %s: missing field %s' % (data.get('data_sku'), exc))
            return
        # Write the JSON line before the insert: the insert adds an "_id"
        # ObjectId to the dict, which json.dumps cannot serialise.
        self.file.write(json.dumps(record, ensure_ascii=False) + "\n")
        try:
            result = self.db[MONGO_TABLE].insert_one(record)
        except pymongo.errors.PyMongoError as exc:
            print('failed to store sku %s in MongoDB: %r' % (record['key_param'].get('data_sku'), exc))
            return
        if result.acknowledged:
            print(record['key_param'].get('data_sku'), '入庫成功!')

    def parse_detail(self, data_sku, price, source_type):
        """Fetch and parse one product detail page.

        :param data_sku: product number
        :param price: product price as scraped from the ranking page
        :param source_type: data source; 0: hot-sale, 1: hot-search, 2: preferred
        :return: dict holding the specification table entries plus
            ``model_pic_address``, ``phone_name``, ``data_sku``, ``price``
            and ``source_type``
        """
        phone_detail = {}
        detail_url = self.base_url + data_sku + '.html'
        html = requests.get(detail_url, headers=self.headers,
                            timeout=self._REQUEST_TIMEOUT).text
        doc = pq(html)
        for item in doc('.Ptable-item').items():
            # Drop tooltip nodes so their text does not end up in the values.
            item.find('.Ptable-tips').remove()
            for dt, dd in zip(item('dt').items(), item('dd').items()):
                phone_detail[dt.text().strip()] = dd.text().strip()
        image_list = []
        for thumb in doc('#spec-list li').items():
            data_url = thumb('img').attr('data-url')
            if data_url:  # skip thumbnails that carry no data-url attribute
                image_list.append(self.image_base_url + data_url)
        phone_detail['model_pic_address'] = image_list
        # Product name is read from the title of the first parameter entry.
        phone_detail['phone_name'] = doc('.parameter2.p-parameter-list li:first-child').attr('title')
        phone_detail['data_sku'] = data_sku
        phone_detail['price'] = price
        phone_detail['source_type'] = source_type
        return phone_detail

    def _crawl_one(self, data_sku, price, source_type):
        """Parse and store one product, skipping it if its page cannot be fetched."""
        try:
            phone_detail = self.parse_detail(data_sku, price, source_type=source_type)
        except requests.RequestException as exc:
            print('failed to fetch detail page for sku %s: %r' % (data_sku, exc))
            return
        self.save_data(phone_detail)

    def parse_sale_html(self):
        """Crawl the hot-sale ranking and store every product on it."""
        html = self.get_sale_search_html(self.sale_url)
        tree = etree.HTML(html)
        items = tree.xpath('//li[contains(@class, "saleitem") and @data-price-item="1"]')
        for item in items:
            sku_nodes = item.xpath('.//p[@class="saleitem_info_price"]/@data-price-id')
            # The price is taken here so the detail page needs no dynamic rendering.
            price_nodes = item.xpath('.//p[@class="saleitem_info_price"]/text()')
            if not sku_nodes or not price_nodes:
                continue  # entry without a price node: nothing to crawl
            self._crawl_one(sku_nodes[0], price_nodes[0], 0)

    def parse_search_html(self):
        """Crawl the hot-search ranking and store every product on it."""
        html = self.get_sale_search_html(self.search_url)
        tree = etree.HTML(html)
        items = tree.xpath('//li[contains(@class, "toplanding_search_floor")]')
        for item in items:
            data_skus = item.xpath('.//div[@class="toplanding_search_goods"]//@data-sku')
            prices = item.xpath('.//div[@class="toplanding_search_goods"]//'
                                'p[@class="toplanding_search_goods_price"]/text()')
            for data_sku, price in zip(data_skus, prices):
                self._crawl_one(data_sku, price, 1)

    def parse_preferred_html(self):
        """Crawl the preferred-goods ranking by following each "view all" link."""
        html = self.get_preferred_html(self.preferred_url)
        tree = etree.HTML(html)
        hrefs = tree.xpath('//div[@class="preferred_list_item pli"]//a[@class="pli_more"]/@href')
        print(hrefs)
        for href in hrefs:
            self.parse_preferred_more(href)

    @staticmethod
    def _sku_from_href(href):
        """Return the product number from a link such as ``.../12345.html``.

        Only a trailing ``.html`` suffix is removed.  ``str.rstrip('.html')``
        would instead strip any trailing run of the characters ``.``, ``h``,
        ``t``, ``m`` and ``l``.
        """
        name = href.split('/')[-1]
        if name.endswith('.html'):
            name = name[:-len('.html')]
        return name

    def parse_preferred_more(self, href):
        """Crawl one preferred-goods "view all" page and store its products.

        :param href: URL of the "view all" page
        """
        self.browser.get(href)
        tree = etree.HTML(self.browser.page_source)
        links = tree.xpath('//li[contains(@class, "preferred_detail_item")]/a/@href')
        prices = tree.xpath('//div[@class="preferred_detail_item_price"]/text()')
        for link, price in zip(links, prices):
            self._crawl_one(self._sku_from_href(link), price, 2)

    def main(self):
        """Run the three ranking crawls in order: hot-sale, hot-search, preferred."""
        self.parse_sale_html()
        self.parse_search_html()
        self.parse_preferred_html()
if __name__ == '__main__':
    # Script entry point: build the crawler and run all three ranking crawls.
    spider = JdPhone()
    spider.main()
jd phone
最后編輯于 :
©著作權歸作者所有,轉載或內容合作請聯繫作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。
- 文/潘曉璐 我一進店門,熙熙樓的掌柜王于貴愁眉苦臉地迎上來,“玉大人,你說我怎么就攤上這事。” “怎么了?”我有些...
- 文/花漫 我一把揭開白布。 她就那樣靜靜地躺著,像睡著了一般。 火紅的嫁衣襯著肌膚如雪。 梳的紋絲不亂的頭發上,一...
- 文/蒼蘭香墨 我猛地睜開眼,長吁一口氣:“原來是場噩夢啊……” “哼!你這毒婦竟也來了?” 一聲冷哼從身側響起,我...
推薦閱讀更多精彩內容
- 登錄Apple ID 回答密保問題 進入賬戶資料 郵箱那里有一個添加資料 加上你的手機號碼就行了
- 原題 給一個不包含01的數字字符串,每個數字代表一個字母,請返回其所有可能的字母組合。 下圖的手機按鍵圖,就表示了...
- Given a digit string, return all possible letter combinat...
- 綜合實力iPhone X會更強,但是或許在針對游戲方面,Razer Phone有自己獨到的地方~ 每當有新的智能手...