import os
import re
import time
import urllib.request

import requests
from bs4 import BeautifulSoup

# Desktop browser User-Agent; weibo.cn also requires a logged-in cookie.
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
cookies = {"cookie": "_T_WM"}
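# NOTE: "_T_WM" above is only a placeholder. weibo.cn serves profile pages
# only to a logged-in session, so paste the cookie string from your own
# browser here, e.g. (values hypothetical):
# cookies = {"cookie": "_T_WM=12345; SUB=_2A25...; SUHB=0xyz"}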

class Weibo(object):
    def __init__(self, url):
        self.url = url
        # Local directory that all downloads go under.
        self.dir = "C:\\Users\\Desktop\\Python\\Weibo"

    # Input: any url. Returns 1 when the url has an empty user segment
    # (e.g. a bare "https://weibo.cn/"), otherwise 0.
    def getType(self):
        url_split = self.url.split('/')
        user = url_split[3]
        if user == '':
            flag = 1
        else:
            flag = 0
        return flag

    # Input: the first page url. Scrapes the page-jump form to find the
    # total number of pages for this profile.
    def getPage(self):
        Html = requests.get(self.url, cookies=cookies, headers=headers).text
        pagereg = r'value="(\d+)"'
        pages = re.compile(pagereg).findall(Html)
        if len(pages) == 0:
            page = 1  # single-page profiles render no pager
        else:
            page = pages[-1]
        return page
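    # Assumption about the markup: the mobile pager renders a jump form whose
    # hidden input carries the total page count, roughly
    #   <input name="mp" type="hidden" value="207">
    # (illustrative, not captured from a live page), so findall() collects all
    # value="..." attributes and pages[-1] is taken as the count.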

    # Input: any url. Returns the raw html plus a parsed soup for it.
    def getUrl(self):
        Html = requests.get(self.url, cookies=cookies, headers=headers).text
        Soup = BeautifulSoup(Html, 'lxml')
        return Html, Soup

    # Input: the first page url. Scrapes the user id, fan count and nickname,
    # and creates a per-user download directory.
    def getBasicInfo(self):
        OneHtml = requests.get(self.url, cookies=cookies, headers=headers).text
        ID_reg = r'<a href="/(\d+)/info"'
        fans_reg = r'<a href=".+?/fans">粉丝\[(\d+)\]</a>'
        name_reg = r'<title>(.+?)的微博</title>'
        ID = re.compile(ID_reg).findall(OneHtml)[0]
        fans = re.compile(fans_reg).findall(OneHtml)[0]
        name = re.compile(name_reg).findall(OneHtml)[0]
        people_dir = self.dir + '\\' + str(name)
        if not os.path.isdir(people_dir):
            os.mkdir(people_dir)
        info_url = "https://weibo.cn/" + str(ID) + "/" + "info"
        return name, fans, info_url, people_dir

    # Input: the info page url. Scrapes school, gender, region, birthday and
    # bio; fields absent from the profile are recorded as "Missing".
    def getDetailInfo(self):
        InfoHtml = requests.get(self.url, cookies=cookies, headers=headers).text
        fields = [
            ("School", r'<div class="tip">学习经历</div><div class="c">(.+?)<br/>'),
            ("Gender", r'<br/>性别:(.+?)<br/>'),
            ("Region", r'<br/>地区:(.+?)<br/>'),
            ("Birthday", r'<br/>生日:(.+?)<br/>'),
            ("Bio", r'<br/>简介:(.+?)<br/>'),
        ]
        infos = []
        for label, pattern in fields:
            found = re.compile(pattern).findall(InfoHtml)
            value = found[0] if found else "Missing"
            infos.append(label + ':' + value + '\n')
        info_school, info_xb, info_dq, info_sr, info_jjie = infos
        return info_school, info_xb, info_dq, info_sr, info_jjie
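    # The info page lays these fields out as plain "label:value<br/>" runs,
    # e.g. (illustrative, not captured from a live page):
    #   性别:男<br/>地区:北京<br/>生日:2000-01-01<br/>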

# Download the single image of one weibo. The mobile html embeds a wap180
# thumbnail; swapping that path segment for "large" yields the full-size file.
def one(html, save_dir):
    s = r'src="(.+?)wap180/.+?"/></a>'
    e = r'src=".+?/wap180/(.+?)"/></a>'
    ss = re.compile(s).findall(html)[0]  # url prefix up to the size segment
    ee = re.compile(e).findall(html)[0]  # filename after the size segment
    url = ss + "large/" + ee
    print(url)
    curdir = save_dir + '\\'
    urllib.request.urlretrieve(url, '{}{}.jpg'.format(curdir, ee))
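
# For example (hypothetical image url), a thumbnail such as
#   https://wx1.sinaimg.cn/wap180/abcdef.jpg
# is rewritten to
#   https://wx1.sinaimg.cn/large/abcdef.jpg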

# Download every image of a multi-image weibo: follow its "组图" (photo set)
# link, then rewrite each thumb180 thumbnail url to its large original.
def group(html, save_dir):
    reg = r'<(a href=".+?">.+?)</a>'
    regre = re.compile(reg)
    lists = regre.findall(html)
    for i in lists:
        if u'组图' in i:
            ureg = r'a href="(https.+?)">'
            uregre = re.compile(ureg)
            gro_url = uregre.findall(i)[0]
            print(gro_url)
            Group = Weibo(gro_url)
            g_html, g_soup = Group.getUrl()
            img = r'img src="(http.+?)".+?原图'
            imgre = re.compile(img)
            imgurl = imgre.findall(g_html)
            for u in imgurl:
                u = str(u)
                s = r'^(.+?)thumb180/.+?'
                e = r'.+?/thumb180/(.+?)$'
                ss = re.compile(s).findall(u)[0]
                ee = re.compile(e).findall(u)[0]
                uu = ss + "large" + '/' + ee
                print(uu)
                curdir = save_dir + '\\'
                urllib.request.urlretrieve(uu, '{}{}'.format(curdir, ee))
                time.sleep(1)  # throttle between images
            time.sleep(1)

# Fetch the basic and detail profile info, write it to <nickname>.txt in the
# per-user directory, and return the page count plus that directory.
def getInfo(url):
    basic = Weibo(url)
    page = basic.getPage()
    name, fans, info_url, people_dir = basic.getBasicInfo()
    detail = Weibo(info_url)
    xx, xb, dq, sr, jjie = detail.getDetailInfo()
    file_path = people_dir + '\\' + name + ".txt"
    with open(file_path, 'w', encoding='utf-8') as fo:
        fo.write("Nickname:" + name + '\n')
        fo.write(xb)
        fo.write(sr)
        fo.write("Fans:" + fans + '\n')
        fo.write(xx)
        fo.write(dq)
        fo.write(jjie)
        fo.write("Directory:" + people_dir + '\n')
    print(name + ":Info write done!")
    return page, people_dir
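
# The resulting txt file looks roughly like this (values hypothetical):
#   Nickname:example_user
#   Gender:男
#   Birthday:2000-01-01
#   Fans:123
#   School:Missing
#   Region:北京
#   Bio:...
#   Directory:C:\Users\Desktop\Python\Weibo\example_user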

# Print the timestamp of the newest weibo, skipping pinned posts.
def getLastWeiboTime(url):
    time_html, time_soup = Weibo(url).getUrl()
    wb_list = time_soup.find_all('div', class_="c")
    time_list = []
    for wb in wb_list:
        weibo = str(wb)
        # A real entry contains "赞" (like); pinned posts ("置顶") are skipped.
        if u'置顶' not in weibo and u'赞' in weibo:
            reg = r'<span class="ct">(.+?)<'
            real_time = re.compile(reg).findall(weibo)[0]
            time_list.append(real_time)
    if time_list:
        print(time_list[0])

# Walk every page of a profile and download the images of each original
# (non-repost) weibo, dispatching to group() or one() per entry.
def getWeibo(ori):
    url = ori + "?page="
    pages, save_dir = getInfo(url)
    for p in range(1, int(pages) + 1):
        cur_url = url + str(p)
        print("Page " + str(p))
        try:
            Page = Weibo(cur_url)
            page_html, page_soup = Page.getUrl()
            wbs = page_soup.find_all('div', class_="c")
            for w in wbs:
                con = str(w)
                # "原图" marks an original-size image; skip reposts ("转发").
                if u'原图' in con and u'转发了' not in con and u'转发理由' not in con:
                    if u'组图' in con:
                        print("photo set")
                        group(con, save_dir)
                        time.sleep(1)
                    else:
                        print("single image")
                        one(con, save_dir)
                        time.sleep(1)
        except Exception:
            time.sleep(1)  # back off, then move on to the next page
            continue
    print("Img downloads Done!")

# Fill in the weibo.cn profile urls to crawl, e.g. "https://weibo.cn/u/<uid>".
oris = [""]
for ori in oris:
    getWeibo(ori)
    getLastWeiboTime(ori)