Douban Top 250 Movie Scraper
import os
import requests
import pandas as pd
from lxml import html

movie_list = []

def spider_douban(page):
    # Fetch the page source of the target site
    url = 'https://movie.douban.com/top250?start={}&filter='.format(page)
    # Pretend to be a regular browser
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    print(response.status_code)
    html_data = response.text
    # Parse with XPath
    selector = html.fromstring(html_data)
    # Grab all movie entries on this page
    # ul_list = selector.xpath('//div[@class="article"]/ol/li')
    ul_list = selector.xpath('//ol[@class="grid_view"]/li')
    print(len(ul_list))
    # Iterate over the entries
    for li in ul_list:
        # Rank number
        number = li.xpath('.//div[1]/div[1]/em/text()')[0]
        print(number)
        # Poster image URL
        picture = li.xpath('.//div[1]/div[1]/a/img/@src')[0]
        print(picture)
        # Movie title
        name = li.xpath('.//div[1]/div[2]/div[1]/a/span[1]/text()')[0]
        print(name)
        # Create the img folder (if needed) and save the poster, named after the movie
        os.makedirs('./img', exist_ok=True)
        image = requests.get(picture)
        with open('./img/' + name + '.png', 'wb') as f:
            f.write(image.content)
        # Movie info (director, year, genre)
        information = li.xpath('.//div[1]/div[2]/div[2]/p/text()')
        information = ' '.join(line.strip() for line in information)
        print(information)
        # Number of ratings
        people = li.xpath('.//div[1]/div[2]/div[2]/div[1]/span[4]/text()')[0]
        people = int(people.replace('人评价', ''))
        print(people)
        # Collect the record
        movie_list.append({
            'name': name,
            'number': number,
            'picture': picture,
            'information': information,
            'people': people
        })
    # Sort by number of ratings, descending
    movie_list.sort(key=lambda x: x['people'], reverse=True)
    for movie in movie_list:
        print(movie)
    # Save as CSV
    df = pd.DataFrame(movie_list)
    df.to_csv('douban250.csv')

# Page through the list, 25 movies per page
for page in range(0, 250, 25):
    spider_douban(page)
# spider_douban(0)
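One possible refinement, not part of the original script: since the sort and CSV write sit inside spider_douban, the file is rewritten after every page, and the pages are requested back to back. Moving those steps out of the function and pausing briefly between requests is gentler on Douban. A minimal sketch, assuming spider_douban is changed to only append to movie_list; crawl_all and delay_seconds are names introduced here for illustration:

import time

def crawl_all(delay_seconds=2):
    # Assumes spider_douban(page) only fills movie_list and does not sort or write the CSV itself
    for page in range(0, 250, 25):
        spider_douban(page)
        time.sleep(delay_seconds)  # pause between pages to reduce the risk of being blocked
    # Sort once and write the CSV once at the end
    movie_list.sort(key=lambda x: x['people'], reverse=True)
    pd.DataFrame(movie_list).to_csv('douban250.csv', index=False)

# crawl_all()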
To be continued...