Crawling WeChat Articles via Sogou Search

The publish time of a WeChat article cannot be extracted directly with XPath because it is rendered into the page by JavaScript, so a regular expression over the raw HTML is used to pull it out instead.
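To make that concrete, here is a minimal sketch of the idea: the publish time only appears inside an inline <script> assignment in the page source, never as a text node XPath could reach, so a regex recovers it. The sample fragment below is invented for illustration; the pattern matches the one used in parse_detail further down.

import re

# invented fragment of a WeChat article page: the date lives in an inline
# script variable, not in the rendered DOM
sample_html = '<script>var publish_time = "2018-08-20" || "";</script>'

match = re.search(r'var publish_time = "(.*?)"', sample_html)
date = match.group(1) if match else ''
print(date)  # -> 2018-08-20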

from urllib.parse import urlencode
import requests, re
from requests.exceptions import ConnectionError, ReadTimeout
from lxml import etree
import pymongo
from config import *

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]

baseurl = 'http://weixin.sogou.com/weixin?'

# update this with the Cookie from a logged-in session when it expires
headers = {
    'Cookie': '',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

proxy = None

def get_proxy():
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('proxy pool is empty')
        return None

def get_html(url, count=1):
    print('crawling', url)
    print('trying count', count)
    global proxy
    if count >= MAX_COUNT:
        print("tried too many times")
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies, timeout=10)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # 302 means Sogou's anti-spider check kicked in; switch to a proxy
            print('got 302, switching to a proxy')
            proxy = get_proxy()
            if proxy:
                print('using proxy', proxy)
                # count the retry so repeated 302s eventually hit MAX_COUNT
                return get_html(url, count + 1)
            else:
                print('get proxy failed')
                return None
    except (ConnectionError, ReadTimeout):
        # network problem: rotate the proxy and retry
        proxy = get_proxy()
        count += 1
        return get_html(url, count)


def get_index(keyword, pagenumber):
    data = {
        'query': keyword,
        'type': 2,
        'page': pagenumber
    }
    url = baseurl + urlencode(data)
    html = get_html(url)
    return html

def parse_index(html):
    doc = etree.HTML(html)
    urls = doc.xpath("//div[@class='news-box']/ul/li/div/h3/a/@href")
    for url in urls:
        yield url

def get_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None

def parse_detail(html):
    doc = etree.HTML(html)
    title = doc.xpath("//h2[@class='rich_media_title']/text()")
    if title:
        title = title[0].strip()
    else:
        # video pages keep the title in a different element
        title = ''.join(doc.xpath("//span[@id='video_title']/text()"))
    content = doc.xpath("//div[@id='js_content']")[0].xpath('string(.)').strip()
    # the publish time is rendered by JS, so grab it from the inline script with a regex
    date = re.findall(r'var publish_time = "(.*?)"', html)[0]
    nickname = doc.xpath("//span[@class='rich_media_meta rich_media_meta_nickname']/a/text()")
    if nickname:
        nickname = nickname[0].strip()
    else:
        nickname = doc.xpath("//strong[@class='account_nickname_inner']/text()")[0].strip()
    wechat = ''.join(doc.xpath("//div[@id='js_profile_qrcode']/div/p[1]/span/text()"))
    return {
        'title': title,
        'content': content,
        'date': date,
        'nickname': nickname,
        'wechat': wechat
    }

def save_to_mongo(item):
    # upsert keyed on title, so re-crawling the same article updates it instead of duplicating it
    result = db['articles'].update_one({'title': item['title']}, {'$set': item}, upsert=True)
    if result.acknowledged:
        print('saved to MongoDB')
    else:
        print('failed to save')

def main():
    for page in range(1,101):
        html = get_index(KEYWORD, page)
        if html:
            for url in parse_index(html):
                article_html = get_detail(url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    save_to_mongo(article_data)


if __name__ == '__main__':
    main()

config.py

KEYWORD = '風景'

MONGO_URI = 'localhost'
MONGO_DB = 'weixin'

PROXY_POOL_URL = ''

MAX_COUNT = 5
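After a crawl, a quick way to verify what was stored is to query the collection directly. This is a minimal sketch, assuming the same config.py values and the 'articles' collection name used in save_to_mongo:

import pymongo
from config import MONGO_URI, MONGO_DB

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]

# number of articles upserted so far
print(db['articles'].count_documents({}))

# peek at one saved document (drop _id and the long content field for readability)
print(db['articles'].find_one({}, {'_id': 0, 'content': 0}))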