python抓取圖片例子(ajax)

本例子仿照崔慶才先生的案例編寫。
他的個人博客地址是:http://cuiqingcai.com/
#!/bin/python3.4
# -- coding:utf-8 --

import re
import json
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from requests.exceptions import RequestException
import requests
from config import *
from hashlib import md5
from multiprocessing import Pool
from json.decoder import JSONDecoder
from pymongo import MongoClient
import os

# Module-level MongoDB handles shared by save_to_mongo().
# NOTE(review): connect=False is presumably used so that each multiprocessing
# worker opens its own connection lazily instead of sharing one across
# processes -- confirm against the PyMongo documentation.
client = MongoClient(MONGO_URL,connect=False)
# Database selected by the MONGO_DB name from config.py.
db = client[MONGO_DB]

def get_page_index(offset,keyword,timeout=10):
    """Fetch one page of search results from the Toutiao search endpoint.

    Args:
        offset: Value sent as the ``offset`` query parameter (paging position).
        keyword: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection,
            which would hang the pool worker running this call.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any request failure).
    """
    params = {
        'offset':offset,
        'format':'json',
        'keyword':keyword,
        'autoload':'true',
        'count':'20',
        'cur_tab':1
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Only the network call can raise RequestException, so keep the
        # try body limited to it.
        response = requests.get(url,timeout=timeout)
    except RequestException:
        print ("請求索引頁面出錯")
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_index(html):
    """Yield the article URLs found in a search-index JSON response.

    Args:
        html: JSON text returned by the index request, or ``None`` when that
            request failed.

    Yields:
        The ``article_url`` of every entry in the payload's ``data`` list.
        Entries without a URL are skipped. Nothing is yielded when the input
        is missing, is not valid JSON, or does not have the expected shape.
    """
    try:
        data = json.loads(html)
    except (TypeError, ValueError):
        # TypeError: html is None (the fetch failed).
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        # The previous ``except JSONDecoder`` named a non-exception class and
        # therefore could never handle a parse error.
        return
    if not isinstance(data, dict):
        return
    items = data.get('data')
    if not isinstance(items, list):
        return
    for item in items:
        article_url = item.get('article_url') if isinstance(item, dict) else None
        if article_url:
            yield article_url

def get_page_detail(url,timeout=10):
    """Download the HTML of a single article page.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the worker indefinitely.

    Returns:
        The page body as text on HTTP 200, otherwise ``None`` (non-200
        status or any request failure).
    """
    try:
        response = requests.get(url,timeout=timeout)
    except RequestException:
        print ("請求詳情頁面出錯",url)
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_detail(html,url):
    """Extract the title and gallery image URLs from an article page.

    The gallery is read from the ``var gallery = ...;`` JavaScript assignment
    embedded in the page. Every image URL found is also downloaded via
    ``download_image`` as a side effect.

    Args:
        html: HTML text of the article page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the page
        contains a gallery with a ``sub_images`` entry, otherwise ``None``.
    """
    soup = BeautifulSoup(html,'lxml')
    title_tags = soup.select('title')
    # A page without a <title> used to raise IndexError; fall back to ''.
    title = title_tags[0].get_text() if title_tags else ''
    images_pattern = re.compile('var gallery = (.*?);',re.S)
    result = re.search(images_pattern,html)
    if not result:
        return None
    try:
        data = json.loads(result.group(1))
    except ValueError:
        # The non-greedy capture ends at the first ';', so a ';' inside the
        # gallery literal truncates it and the remainder is not valid JSON.
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    # ``or []`` covers an explicit null value for sub_images.
    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images]
    for image in images:
        download_image(image)
    return {
        'title':title,
        'url':url,
        'images':images,
    }

def save_to_mongo(result):
    """Store one parsed article record in the configured MongoDB collection.

    Args:
        result: Dict describing the article (as built by parse_page_detail).

    Returns:
        ``True`` when the document was inserted, ``False`` when the driver
        reported an error.
    """
    # Local import keeps this block self-contained; pymongo is already a
    # dependency of this file.
    from pymongo.errors import PyMongoError
    try:
        # Collection.insert() is deprecated in PyMongo 3 and removed in 4;
        # insert_one() is the supported single-document call.
        db[MONGO_TABLE].insert_one(result)
    except PyMongoError as err:
        print ("存儲到Mongodb失敗",err)
        return False
    print ("存儲到Mongodb成功",result)
    return True

def download_image(url,timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the worker indefinitely.

    Returns:
        Always ``None``; the image is written to disk only on HTTP 200.
    """
    try:
        response = requests.get(url,timeout=timeout)
    except RequestException:
        print ("請求圖片出錯")
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None

def save_image(content):
    """Write image bytes to the current working directory.

    The file is named ``<md5 hex digest of content>.jpg``, so identical
    content always maps to the same file and is stored only once.

    Args:
        content: Raw image bytes.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(),'jpg')
    file_path = os.path.join(os.getcwd(),file_name)
    try:
        # 'x' creates the file exclusively: checking existence and creating
        # happen in one step, so two pool workers saving the same image
        # cannot both pass an exists() check and write simultaneously.
        with open(file_path,'xb') as f:
            f.write(content)
    except FileExistsError:
        # Already saved earlier (or by another worker); nothing to do.
        pass

def main(offset):
    """Crawl one page of search results and store every gallery found.

    Fetches the search index at ``offset`` for the configured KEYWORD, then
    for each listed article fetches the page, parses it and saves the
    parsed record to MongoDB.

    Args:
        offset: Paging offset passed to ``get_page_index``.
    """
    index_html = get_page_index(offset,KEYWORD)
    if not index_html:
        # The index request failed; there is nothing to parse.
        return
    for url in parse_page_index(index_html):
        if not url:
            # Index entries without an article URL cannot be fetched.
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html,url)
        if result:
            save_to_mongo(result)

if __name__ == '__main__':
    # One offset per result page: get_page_index requests 20 items per page,
    # so the offsets are multiples of 20 over the configured group range.
    groups = [x * 20 for x in range(GROUP_START,GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main,groups)
    finally:
        # Release the worker processes even when a task raises.
        pool.close()
        pool.join()

config.py配置文件

#!/bin/python3.4
# -*- coding:utf-8 -*-
# Host passed to MongoClient by the crawler script.
MONGO_URL = 'localhost'
# Name of the database the crawler selects on that client.
MONGO_DB = 'toutiao'
# Name of the collection that parsed article records are inserted into.
MONGO_TABLE = 'toutiao'

# Inclusive range of page groups to crawl; the crawler multiplies each group
# index by 20 to obtain the search offset.
GROUP_START = 1
GROUP_END = 20

# Search keyword sent to the index request.
KEYWORD = '街拍'

用 images_pattern 正則匹配得到的數據(result)轉化成 JSON 格式后,其中的 sub_images 部分如下:

"sub_images":
[
    {
    "url":"http:\/\/p2.pstatp.com\/origin\/168300027e4c8323ee22",
    "width":700,
    "url_list":
        [
         {"url":"http:\/\/p2.pstatp.com\/origin\/168300027e4c8323ee22"},
          {"url":"http:\/\/pb3.pstatp.com\/origin\/168300027e4c8323ee22"},
          {"url":"http:\/\/pb3.pstatp.com\/origin\/168300027e4c8323ee22"}
        ],
    "uri":"origin\/168300027e4c8323ee22","height":981
    },

    {
    "url":"http:\/\/p2.pstatp.com\/origin\/168600026fb5ecf86ba9",
    "width":700,
    "url_list":
        [
            {"url":"http:\/\/p2.pstatp.com\/origin\/168600026fb5ecf86ba9"},
            {"url":"http:\/\/pb3.pstatp.com\/origin\/168600026fb5ecf86ba9"},
            {"url":"http:\/\/pb3.pstatp.com\/origin\/168600026fb5ecf86ba9"}
        ],
    "uri":"origin\/168600026fb5ecf86ba9","height":891
    },
    
    {
    "url":"http:\/\/p3.pstatp.com\/origin\/16870003ef0948da7863",
    "width":700,
    "url_list":
        [
            {"url":"http:\/\/p3.pstatp.com\/origin\/16870003ef0948da7863"},
            {"url":"http:\/\/pb2.pstatp.com\/origin\/16870003ef0948da7863"},
            {"url":"http:\/\/pb3.pstatp.com\/origin\/16870003ef0948da7863"}
        ],
    "uri":"origin\/16870003ef0948da7863","height":1078
    },
    
    
    {
    "url":"http:\/\/p1.pstatp.com\/origin\/16820003ee9c72717ad5",
    "width":700,
    "url_list":
        [
            {"url":"http:\/\/p1.pstatp.com\/origin\/16820003ee9c72717ad5"},
            {"url":"http:\/\/pb3.pstatp.com\/origin\/16820003ee9c72717ad5"},
            {"url":"http:\/\/pb3.pstatp.com\/origin\/16820003ee9c72717ad5"}
        ],
    "uri":"origin\/16820003ee9c72717ad5","height":999
    },
    
    {
    "url":"http:\/\/p1.pstatp.com\/origin\/16870003ef0b2bbec810",
    "width":960,
    "url_list":
        [
            {"url":"http:\/\/p1.pstatp.com\/origin\/16870003ef0b2bbec810"},
            {"url":"http:\/\/pb3.pstatp.com\/origin\/16870003ef0b2bbec810"},
            {"url":"http:\/\/pb3.pstatp.com\/origin\/16870003ef0b2bbec810"}
        ],
    "uri":"origin\/16870003ef0b2bbec810","height":609
    }
],
最后編輯于
©著作權歸作者所有,轉載或內容合作請聯系作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

推薦閱讀更多精彩內容

  • Spring Cloud為開發人員提供了快速構建分布式系統中一些常見模式的工具(例如配置管理,服務發現,斷路器,智...
    卡卡羅2017閱讀 134,923評論 18 139
  • Android 自定義View的各種姿勢1 Activity的顯示之ViewRootImpl詳解 Activity...
    passiontim閱讀 173,268評論 25 708
  • 想想看,我的閱讀觀和生活觀還挺一致的。 選書來讀,標準就是有趣,而并非有用。交朋友也是這樣,標準就是有趣談得來,并...
    赤蕪小茴閱讀 87評論 0 0
  • 為方便閱讀,該內容需具備一定的HTML+CSS基礎。 為什么學習JavaScript 一、為什么JavaScrip...
    百草紀閱讀 318評論 0 1
  • 最近看的大多都是日本作品,渡邊把一場婚外之戀的情欲描寫的如同櫻花燃燒的火焰般絢麗,很快又化為烏有。開時美艷,掉落時...
    獨立行走的魚閱讀 977評論 1 1