Python第三天(spider_豆瓣)

豆瓣top250電影爬蟲

import os
import re

import pandas as pd
import requests
from lxml import html
# Shared result store: every call to spider_douban() appends its movies here.
movie_list = []

# Browser-like User-Agent sent with every request (listing page and posters).
_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
# Seconds to wait for a response; without it a stalled connection hangs forever.
_TIMEOUT = 10
# Directory the poster images are written to.
_IMG_DIR = './img'


def _first(nodes, default=None):
    """Return the first XPath result in *nodes*, or *default* if it is empty."""
    return nodes[0] if nodes else default


def _parse_people(text):
    """Return the integer formed by the digits in *text*, or 0 if it has none."""
    # Keep only digits so the result does not depend on the exact wording
    # (or character set) of the suffix that follows the count on the page.
    digits = re.sub(r'\D', '', text or '')
    return int(digits) if digits else 0


def _safe_filename(name):
    """Replace characters that are not allowed in file names with underscores."""
    return re.sub(r'[\\/:*?"<>|]', '_', name)


def _save_picture(url, name):
    """Download *url* and store it as ``<name>.png`` inside the image directory."""
    # The directory is created on demand; open() would otherwise fail with
    # FileNotFoundError the first time the script runs.
    os.makedirs(_IMG_DIR, exist_ok=True)
    image = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    if not image.ok:
        # Do not write an error page to disk under an image file name.
        print(image.status_code)
        return
    # NOTE: the extension is always .png, whatever format the server returns.
    with open(os.path.join(_IMG_DIR, _safe_filename(name) + '.png'), 'wb') as f:
        f.write(image.content)


def spider_douban(page):
    """Scrape one page of the Douban Top 250 list and record its movies.

    Fetches the listing for the given offset, saves each movie poster under
    ``./img/``, appends one dict per movie to the module-level ``movie_list``,
    re-sorts that list by rating count (highest first), prints it, and
    rewrites ``douban250.csv`` with everything collected so far.

    Args:
        page: Value substituted into the ``start`` query parameter of the
            listing URL.

    Returns:
        None. Results are delivered through ``movie_list``, the image files
        and the CSV file.
    """
    # Fetch the source of the target page, presenting ourselves as a browser.
    url = 'https://movie.douban.com/top250?start={}&filter='.format(page)
    response = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    print(response.status_code)

    # Parse the page and select every movie entry.
    selector = html.fromstring(response.text)
    items = selector.xpath('//ol[@class="grid_view"]/li')
    print(len(items))

    for li in items:
        # Rank number (kept as the raw XPath result list).
        number = li.xpath('.//div[1]/div[1]/em/text()')
        print(number)

        # Poster URL.
        picture = _first(li.xpath('.//div[1]/div[1]/a/img/@src'))
        print(picture)

        # Movie title.
        name = _first(li.xpath('.//div[1]/div[2]/div[1]/a/span[1]/text()'))
        print(name)
        if name is None:
            # Without a title there is nothing meaningful to store.
            continue

        # Save the poster in the image folder, named after the movie.
        if picture is not None:
            _save_picture(picture, name)

        # Descriptive text nodes (kept as the raw XPath result list).
        information = li.xpath('.//div[1]/div[2]/div[2]/p/text()')
        print(information)

        # Number of people who rated the movie.
        people = _parse_people(
            _first(li.xpath('.//div[1]/div[2]/div[2]/div[1]/span[4]/text()'), '')
        )
        print(people)

        movie_list.append({
            'name': name,
            'number': number,
            'picture': picture,
            'information': information,
            'people': people,
        })

    # Order everything collected so far by rating count, highest first.
    movie_list.sort(key=lambda x: x['people'], reverse=True)
    for movie in movie_list:
        print(movie)

    # Persist to CSV; the file is rewritten on every call so it always
    # reflects the full contents of movie_list.
    df = pd.DataFrame(movie_list)
    df.to_csv('douban250.csv')
# Pagination: call the scraper once per `start` offset, from 0 up to (but not
# including) 250 in steps of 25.
page_offsets = range(0, 250, 25)
for page in page_offsets:
    spider_douban(page)
# Single-page run for debugging: spider_douban(0)

未完待續/...

最后編輯于
©著作權歸作者所有,轉載或內容合作請聯繫作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書係信息發布平臺,僅提供信息存儲服務。