Talk is cheap. Show me the code.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#-*- Max Young - maxc.cc 2019-09-18 -*-
import requests
from bs4 import BeautifulSoup
# Scrape the Douban "Top 250" movie list page by page and append one line per
# movie (rank, title, rating, one-line quote) to douban.txt.

# Browser-like User-Agent sent with every request. requests expects a mapping
# of header name -> value, so this is a dict rather than a tuple.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.168 Safari/537.36",
}

PAGE_SIZE = 25       # the ``start`` query parameter advances by 25 per page
TOTAL_MOVIES = 250   # offsets 0, 25, ..., 225 cover the whole list

number = 0  # running rank written in front of each movie

for page in range(0, TOTAL_MOVIES, PAGE_SIZE):
    url = "https://movie.douban.com/top250?start=" + str(page) + "&filter="

    # Fetch the page; fail loudly on HTTP errors or a hung connection instead
    # of silently parsing an error page.
    web_data = requests.get(url, headers=headers, timeout=10)
    web_data.raise_for_status()
    # Decode the body as UTF-8 so the text is not garbled.
    web_data.encoding = 'utf-8'

    # Parse the HTML and collect every <div class="info"> (one per movie).
    soup = BeautifulSoup(web_data.text, 'lxml')
    info = soup.find_all('div', class_='info')

    lines = []
    for tag in info:
        number += 1
        movie_name = tag.find('span', class_='title').get_text()
        movie_rate = tag.find('span', class_='rating_num').get_text()
        # Not every entry has a quote; find() returns None in that case, so
        # fall back to an empty string rather than crashing.
        quote_tag = tag.find('span', class_='inq')
        movie_quote = quote_tag.get_text() if quote_tag is not None else ''
        lines.append("No{}: {} {} {}\n".format(
            number, movie_name, movie_rate, movie_quote))

    # 'a' appends to the file (use 'w' to overwrite instead). The encoding is
    # explicit so non-ASCII titles do not depend on the platform default.
    with open('douban.txt', 'a', encoding='utf-8') as savetxt:
        savetxt.writelines(lines)

    # Progress: number of list positions requested so far (25, 50, ..., 250).
    print(page + PAGE_SIZE)