Python Instagram Image Crawler (Part 3)

Last time, in Python Instagram Image Crawler (Part 2), we covered collecting all the image URLs; this time we will download the images from the real links.


Downloading the Images

First, we save the full set of URLs to a txt file; it serves as the base data source for later crawls.


(Figure: the URL set saved to the txt file)
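
The txt file simply stores one URL per line, newest first, so the first line can later act as the stop marker (top_url in the code below). Here is a minimal sketch of that round trip, with save_urls and load_urls as hypothetical helper names:

def save_urls(path, urls):
    # newest first, one URL per line
    with open(path, mode='w', encoding='utf-8') as f:
        for u in urls:
            f.write(u.strip() + '\n')

def load_urls(path):
    with open(path, mode='r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]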

The Code

Let's go straight to the complete code. The key this time is setting up the request headers, so we use a session for the requests. The steps are as follows (a minimal sketch of the session pattern follows the list):

  • Request the user's profile page to obtain a session, which holds the cookies
  • Use that session to request the real image URLs
  • Sleep 1 to 2 seconds between requests, and replace the session once the request count passes 20-50
  • Check the status code; if it is not 200, replace the session
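
Before the full listing, here is a minimal sketch of just this session pattern; PROFILE_URL, new_session and fetch_all are hypothetical names used for illustration, not part of the final code:

import random
import time

import requests

PROFILE_URL = 'https://www.instagram.com/some_user/'  # hypothetical profile page


def new_session():
    # requesting the profile page first lets the session pick up the cookies
    s = requests.session()
    s.get(PROFILE_URL, headers={'Referer': PROFILE_URL})
    return s


def fetch_all(urls):
    s = new_session()
    count = 0
    for url in urls:
        time.sleep(random.uniform(1, 2))  # step 3: sleep 1 to 2 seconds per request
        res = s.get(url)
        count += 1
        if res.status_code != 200 or count >= random.randrange(20, 50):
            s.close()  # steps 3 and 4: replace the session
            s = new_session()
            count = 0
            if res.status_code != 200:
                continue  # skip this URL for now; a retry could be added here
        yield res.content
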
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
 @File       : spider.py
 @Time       : 2017/8/12 0012 21:22
 @Author     : Empty Chan
 @Contact    : chen19941018@gmail.com
 @Description:
"""
import re
import json
import os
from lxml import etree
import requests
import click
from urllib import parse
import time
import random
from hashlib import md5


# matches the numeric queryId values embedded in the page's JS bundle
PAT = re.compile(r'queryId:"(\d*)?"', re.MULTILINE)
headers = {
    "Origin": "https://www.instagram.com/",
    "Referer": "https://www.instagram.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Host": "www.instagram.com",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "accept-encoding": "gzip, deflate, sdch, br",
    "accept-language": "zh-CN,zh;q=0.8",
    "X-Instragram-AJAX": "1",
    "X-Requested-With": "XMLHttpRequest",
    "Upgrade-Insecure-Requests": "1",
}

jso = {"id": "", "first": 12, "after": ""}

BASE_URL = "https://www.instagram.com"

# QUERY = "/morisakitomomi/"  # 森咲智美
# QUERY = "/_8_jjini/"
NEXT_URL = 'https://www.instagram.com/graphql/query/?query_hash={0}&variables={1}'
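# NEXT_URL takes a query_hash plus the URL-encoded JSON variables
# {"id": <user id>, "first": 12, "after": <end_cursor>} built from the jso dict above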

with open('./config.json', 'r') as f:
    proxy = json.load(f)  # a requests-style proxies mapping; see the example config at the end of the post
    click.echo(proxy)


def crawl(query):
    if not query:
        raise Exception('Please enter a valid Instagram user')
    folder = query.replace('.', '-')
    click.echo('start...')
    top_url = None
    in_top_url_flag = False
    qq = requests.session()
    try:
        if not os.path.exists('./images/%s' % folder):
            os.makedirs('./images/%s' % folder)  # makedirs also creates ./images itself if it is missing

        all_imgs_url = []
        new_imgs_url = []
        if os.path.exists('./images/%s/%s.txt' % (folder, folder)):
            with open('./images/%s/%s.txt' % (folder, folder), mode='r', encoding='utf-8') as f:
                for line in f.readlines():
                    if line.strip():
                        all_imgs_url.append(line)
            top_url = all_imgs_url[0][:-1]  # the newest previously saved URL, used as the stop marker
        temp_url = BASE_URL + '/' + query + '/'
        headers.update({'Referer': temp_url})
        res = qq.get(temp_url, headers=headers, proxies=proxy)
        html = etree.HTML(res.content.decode())
        all_a_tags = html.xpath('//script[@type="text/javascript"]/text()')  # scripts holding the image data
        query_id_url = html.xpath('//script[@type="text/javascript"]/@src')  # JS bundles, one of which contains the query_id for loading more content
        click.echo(query_id_url)
        for a_tag in all_a_tags:
            if a_tag.strip().startswith('window'):
                data = a_tag.split('= {')[1][:-1]  # extract the JSON data block (drop the trailing ';')
                js_data = json.loads('{' + data)
                id = js_data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["id"]
                edges = js_data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["edge_owner_to_timeline_media"]["edges"]
                print(edges)
                end_cursor = js_data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["edge_owner_to_timeline_media"]["page_info"]["end_cursor"]
                has_next = js_data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["edge_owner_to_timeline_media"]["page_info"]["has_next_page"]
                for edge in edges:
                    if top_url and top_url == edge["node"]["display_url"]:
                        in_top_url_flag = True
                        break
                    click.echo(edge["node"]["display_url"])
                    new_imgs_url.append(edge["node"]["display_url"])
                    # click.echo(qq.get(node["display_src"], proxies=proxy).status_code)

                if in_top_url_flag:
                    break
                # request the query_id
                print(BASE_URL + query_id_url[1])
                query_content = qq.get(BASE_URL + query_id_url[1], proxies=proxy)
                query_id_list = PAT.findall(query_content.text)
                print(query_id_list)
                for u in query_id_list:
                    click.echo(u)
                # query_hash = query_id_list[1]
                # not sure yet which of the 3 query_hashes to use; judging by the page, it seems to be fixed
                query_hash = "472f257a40c653c64c666ce877d59d2b"
                retry = 0
                # load more images
                while has_next and retry < 3 and not in_top_url_flag:
                    jso["id"] = id
                    jso["first"] = 12
                    jso["after"] = end_cursor
                    text = json.dumps(jso)
                    # for query_hash in query_id_list:
                    url = NEXT_URL.format(query_hash, parse.quote(text))
                    print(url)
                    res = qq.get(url, proxies=proxy)
                    time.sleep(2)
                    body = res.content.decode()
                    if '<' in body:  # an HTML page came back instead of JSON
                        continue
                    html = json.loads(body)
                    if 'data' not in html:  # 'data' missing from the JSON, likely a transient network error, so retry
                        retry += 1
                        continue
                    has_next = html["data"]["user"]["edge_owner_to_timeline_media"]["page_info"]["has_next_page"]
                    end_cursor = html["data"]["user"]["edge_owner_to_timeline_media"]["page_info"]["end_cursor"]
                    edges = html["data"]["user"]["edge_owner_to_timeline_media"]["edges"]
                    for edge in edges:
                        if top_url and top_url == edge["node"]["display_url"]:
                            in_top_url_flag = True
                            break
                        click.echo(edge["node"]["display_url"])
                        new_imgs_url.append(edge["node"]["display_url"])
                click.echo('ok')
                # qq.close()
        if new_imgs_url:
            all_urls = new_imgs_url + all_imgs_url
            with open('./images/%s/%s.txt' % (folder, folder), mode='w', encoding='utf-8') as f:
                for u in all_urls:
                    f.write(u.strip() + '\n')  # strip first so URLs read back from the file do not get double newlines
        # t = threading.Thread(target=translate, args=(top_url, new_imgs_url, all_imgs_url, query))
        # t.setDaemon(True)
        # t.start()
        # t.join()
        translate(top_url, new_imgs_url, all_imgs_url, query)
    except Exception as e:
        raise e
    finally:
        qq.close()


def translate(top_url, news_imgs_url, all_imgs_url, path):
    if news_imgs_url:
        click.echo('enter news')
        download(path, news_imgs_url)
    if top_url:
        # file_md5 = md5()
        # file_md5.update(top_url.encode('utf-8'))
        # file_name = file_md5.hexdigest()
        # if os.path.exists('./images/%s/%s.jpg' % (path, file_name)):
        #     return
        # else:
        click.echo('enter all')
        download(path, all_imgs_url)


def download(path, urls):
    ss = requests.session()
    temp_url = BASE_URL + '/' + path + '/'
    folder = path.replace('.', '-')
    header = {
        "Referer": temp_url,
        "Origin": "https://www.instagram.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/60.0.3112.113 Safari/537.36",
        'Connection': 'keep-alive'
    }
    pp = ss.get(temp_url, headers=header, proxies=proxy)
    click.echo(pp.cookies.items())
    click.echo(pp.headers)
    try:
        count = 0
        all_count = len(urls)
        while count < all_count:
            url = urls[count]
            if '\n' in url:
                url = urls[count][:-1]  # strip the trailing \n
            file_md5 = md5()
            file_md5.update(url.encode('utf-8'))
            file_name = file_md5.hexdigest()  # each image is named by the MD5 of its URL
            if os.path.exists('./images/%s/%s.jpg' % (folder, file_name)):
                count += 1
                continue
            time.sleep(2)
            res = ss.get(url, proxies=proxy)  # by default this reuses the cookies from the profile-page request
            click.echo(url + '=>' + str(res.status_code))
            click.echo(res.headers)
            if res.status_code == 200:
                with open('./images/%s/%s.jpg' % (folder, file_name), mode='wb') as f:
                    f.write(res.content)
                    click.echo('%s.jpg save!' % file_name)
                    count += 1
            else:
                ss.close()
                ss = requests.session()
                pp = ss.get(temp_url, headers=header, proxies=proxy)
                click.echo(pp.cookies.items())
                click.echo(pp.headers)
            if count % 100 == random.randrange(20, 50):  # after roughly 20-50 requests, reset the session so the remote server does not close the connection
                ss.close()
                ss = requests.session()
                pp = ss.get(temp_url, headers=header, proxies=proxy)
                click.echo(pp.cookies.items())
                click.echo(pp.headers)
        click.echo('complete!')
    except Exception as e:
        raise e
    finally:
        ss.close()

if __name__ == '__main__':
    input_instagram = click.prompt("Please enter an Instagram user", None)  # e.g. a bare username such as "_8_jjini"
    crawl(input_instagram)
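
To run the script, place a config.json next to spider.py holding a requests-style proxies mapping. The address below is a hypothetical example; substitute your own proxy (or use an empty object {} if you do not need one):

{
    "http": "http://127.0.0.1:8080",
    "https": "http://127.0.0.1:8080"
}

Then start it with python spider.py and type the username at the prompt.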

Not much prose this week, mostly code. Try writing the code yourselves; I'm sure you can all follow it! Likes and comments are welcome. See you next week!
Source code on GitHub
