一、介紹
一直想爬微信公眾號的歷史信息,無奈限于騰訊爸爸的微信反爬機制——PC端歷史消息只能看到10條,除了通過中間代理采集APP,就沒什么招數能拿到數據了。這周在崔大的博客上看到了哎喲臥槽老師發表的《利用新接口抓取微信公眾號的所有文章》,原來6月6日微信團隊稱對所有公眾號開放在圖文消息中插入全平臺已群發文章鏈接,就是說我們可以在新建圖文信息那里找到一個微信公眾號的所有文章,即我們可以獲得所有文章的鏈接。
立信微生活
二、流程
你需要有一個訂閱號or 公眾號or企業號
自己在微信公眾平臺申請一個就好,我申請的是訂閱號(另外兩個應該是一樣的)。接著你需要登錄:
使用selenium 驅動瀏覽器獲取cookie的方法,來達到登錄的效果
from selenium import webdriver
import time
import json
# Log in to the WeChat Official Accounts Platform through a real Chrome
# window and save the session cookies to cookie.txt as one JSON object
# mapping cookie name -> cookie value.
post = {}
driver = webdriver.Chrome()
try:
    driver.get('https://mp.weixin.qq.com/')
    time.sleep(2)  # crude wait for the login form to render

    form_xpath = '//*[@id="header"]/div[2]/div/div/form'

    # Fill in the two text inputs of the login form (account, then password).
    # find_element('xpath', ...) is used instead of find_element_by_xpath
    # because the latter was removed in Selenium 4.3; this form works on
    # both Selenium 3 and 4.
    for field_xpath, text in (
        (form_xpath + '/div[1]/div[1]/div/span/input', '你的賬號'),
        (form_xpath + '/div[1]/div[2]/div/span/input', '你的密碼'),
    ):
        field = driver.find_element('xpath', field_xpath)
        field.clear()
        field.send_keys(text)

    # NOTE(review): presumably the "remember me" checkbox and the login
    # button -- confirm against the current page markup.
    driver.find_element('xpath', form_xpath + '/div[3]/label/i').click()
    driver.find_element('xpath', form_xpath + '/div[4]/a').click()

    # Scan the QR code with your phone during this pause -- be quick!
    time.sleep(20)

    driver.get('https://mp.weixin.qq.com/')
    cookie_items = driver.get_cookies()
finally:
    # Always close the browser, even if a lookup above failed.
    driver.quit()

post = {cookie_item['name']: cookie_item['value'] for cookie_item in cookie_items}
cookie_str = json.dumps(post)
with open('cookie.txt', 'w', encoding='utf-8') as f:
    f.write(cookie_str)
print(cookie_str)
-
獲取所有歷史文章的url
1.使用requests攜帶Cookie、登錄獲取URL的token
token這玩意兒是啥東西呢,每個微信公眾號對應一個token,在之后請求詳細頁面的時候需要它,如果你的抓取量并不多的話也可以直接手工輸入的,這里我們向哎喲臥槽老師學習,來獲取我們的token
1.0.png
2.使用獲取到的token和公眾號的微信號獲取到公眾號的fakeid
fakeid這玩意兒是啥東西呢,它也是公眾號的標識之一,你需要獲取你要爬的公眾號的fakeid,在之后請求詳細頁面的時候需要它,當然如果你的抓取的公眾號并不多,咱們也是可以直接手工輸入的
2.0.png
3.通過獲得的token、fakeid等構造索引頁的url,訪問索引頁獲取歷史消息的url鏈接
3.0.png
4.遍歷
import requests
import json
import re
import random
# WeChat IDs of the official accounts whose articles should be listed.
gzlist = ['gh_b59aa6364380']

# HTTP headers sent with the search and article-list requests.
header = {}
header["HOST"] = "mp.weixin.qq.com"
header["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/60.0.3112.90 Safari/537.36"
)

# Load the saved login cookies: cookie.txt holds one JSON object mapping
# cookie name -> value.
with open('cookie.txt', encoding='utf-8') as f:
    cookie = f.read()
cookies = json.loads(cookie)
def get_token():
    """Return the session token found in the URL of the platform home page.

    The home page is requested with the saved login cookies and the final
    response URL is searched for ``token=<digits>``.

    Returns:
        str: the digits following ``token=`` in the final URL.

    Raises:
        IndexError: if the final URL carries no token. The exception type
            is unchanged from the original ``findall(...)[0]``; presumably
            this means the saved cookies are missing or expired.
    """
    url = 'https://mp.weixin.qq.com'
    response = requests.get(url=url, cookies=cookies)
    # Raw string: '\d' in a normal string literal is an invalid escape.
    match = re.search(r'token=(\d+)', str(response.url))
    if match is None:
        raise IndexError(
            'no token found in %s; the saved cookies may have expired'
            % response.url
        )
    return match.group(1)
def get_fakeid(token, query=None):
    """Look up the ``fakeid`` of an official account through the search API.

    Args:
        token: session token, as returned by ``get_token``.
        query: WeChat ID of the account to search for. Defaults to the
            first entry of the module-level ``gzlist``.
            NOTE(review): the original looped over ``gzlist`` but returned
            a single fakeid; with the one-entry list in this file both
            readings agree -- confirm if more accounts are added.

    Returns:
        The ``fakeid`` of the first search hit, or ``None`` when no query
        is given and ``gzlist`` is empty.

    Raises:
        IndexError: if the response JSON has no (or an empty) ``list``.
    """
    if query is None:
        if not gzlist:
            return None
        query = gzlist[0]

    search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
    query_id = {
        'action': 'search_biz',
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'query': query,
        'begin': '0',
        'count': '5',
    }
    search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
    hits = search_response.json().get('list')
    if not hits:
        raise IndexError('search for %r returned no accounts' % (query,))
    return hits[0].get('fakeid')
def query_id_data(token, fakeid, i):
    """Build the query parameters for one page of an account's article list.

    Args:
        token: session token.
        fakeid: identifier of the target account.
        i: value sent as ``begin`` (offset of the first article).

    Returns:
        dict: the parameters, including a fresh ``random`` float.
    """
    params = dict(token=token, lang='zh_CN', f='json', ajax='1')
    params['random'] = random.random()
    params.update(action='list_ex', begin=i, count='5', query='', fakeid=fakeid)
    params['type'] = '9'
    return params
def write_to_file(content, path='the_urls.text'):
    """Append *content* to *path* as one JSON-encoded line.

    Args:
        content: any JSON-serialisable value (here: an article URL).
        path: output file; defaults to the URL list used by this script.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def get_the_link(data):
    """Fetch one page of the article list and record every article link.

    Args:
        data: query parameters for the request, as built by
            ``query_id_data``.

    Each item's ``link`` value is printed and appended to the output file
    through ``write_to_file``.
    """
    appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
    # The query dict must be passed as ``params``; requests.get() accepts no
    # ``fakeid`` keyword, so the original call raised TypeError.
    appmsg_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=data)
    articles = appmsg_response.json().get('app_msg_list')
    if articles is None:
        # Without this guard the loop below would fail on None.
        print('no app_msg_list in response; skipping this page')
        return
    for item in articles:
        content = item.get('link')
        print(content)
        write_to_file(content)
def main(pages=47):
    """Crawl the article list page by page and save every article URL.

    Args:
        pages: number of list pages to request (default 47, the value the
            original hard-coded).
    """
    token = get_token()
    fakeid = get_fakeid(token)
    for i in range(pages):
        print('正在爬取第%s頁' %(i + 1))
        # Offsets advance by 5 to match the 'count': '5' page size that
        # query_id_data sends.
        data = query_id_data(token, fakeid, 5 * i)
        get_the_link(data)


if __name__ == '__main__':
    main()
5.得到的url文件
4.0.png
- 訪問上面獲得的url,抓取網頁中的表白墻內容
import requests
import re
from requests.exceptions import RequestException
import json
def get_url(path='the_urls.text'):
    """Read the saved URL file and return the URLs as a list of strings.

    The file is expected to hold one double-quoted string per line. All
    newlines are dropped, the text is split on ``"`` and empty pieces are
    discarded.

    Args:
        path: file to read; defaults to the URL list written by the
            previous script.

    Returns:
        list[str]: the non-empty pieces between double quotes.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        text = fh.read()
    text = text.replace("\n", "")  # drop line breaks
    # A single filtering pass replaces the quadratic ``while '' in list``
    # removal and avoids shadowing the builtin ``list``.
    return [piece for piece in text.split('"') if piece]
def get_html(url):
    """Download *url* and return the response body as text.

    Returns:
        The page text when the status code is 200; ``None`` for any other
        status or when the request raises ``RequestException`` (a message
        is printed in that case).
    """
    try:
        resp = requests.get(url)
        return resp.text if resp.status_code == 200 else None
    except RequestException:
        print('請求索引頁出錯')
    return None
def get_txt(html):
    """Extract the Chinese text inside ``<section>`` elements and save it.

    Args:
        html: page source, or ``None`` when the download failed.

    The runs of Chinese characters found are printed and appended to the
    output file through ``write_to_file``. Nothing happens for ``None``.
    """
    if html is None:
        # get_html() returns None on failure; re.findall would raise
        # TypeError on it.
        return
    # Capture from the first Chinese character inside a <section> up to its
    # closing tag.
    pattern = re.compile('<section.*?([\u4e00-\u9fa5].*?)</section>', re.S)
    txt = re.findall(pattern, html)
    # Keep only runs of Chinese characters; punctuation, letters and markup
    # are dropped.
    pattern = re.compile('[\u4e00-\u9fa5]+', re.S)
    a = re.findall(pattern, str(txt))
    print(a)
    write_to_file(a)
def write_to_file(content, path='lixin.text'):
    """Append *content* to *path* as one JSON-encoded line.

    Args:
        content: any JSON-serialisable value (here: a list of text runs).
        path: output file; defaults to the raw-text file of this script.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main():
    """Download every saved article URL and extract its text."""
    for url in get_url():
        html = get_html(url)
        if html is None:
            # The download failed or returned a non-200 status; skip it
            # rather than pass None on to the text extraction.
            continue
        get_txt(html)


if __name__ == '__main__':
    main()
5.0.png
- 清理文本數據
import re
import json
# Raw extracted text, read once at import time. The ``with`` block closes
# the file handle, which the bare open(...).read() never did.
with open("lixin.text", 'r', encoding='utf-8') as _source:
    f = _source.read()
def write_to_file(content, path='clean.text'):
    """Append *content* to *path* as one JSON-encoded line.

    Args:
        content: any JSON-serialisable value (here: the cleaned text).
        path: output file; defaults to the cleaned-text file.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def clean(f):
    """Strip non-Chinese characters and boilerplate phrases from *f*.

    Args:
        f: raw text.

    Returns:
        str: every run of Chinese characters in *f*, in order and
        concatenated without separators, excluding runs that exactly equal
        one of the boilerplate phrases below. The result is also printed.
    """
    # Runs of Chinese characters only; punctuation, letters and other
    # symbols act as separators and are dropped.
    runs = re.findall('[\u4e00-\u9fa5]+', f)
    boilerplate = frozenset([
        '本推送由中博誠通贊助', '長按關注立信微生活', '立信微生活',
        '公眾號', '藍字', '點擊上方', '微軟雅黑', '可訂閱哦', '宋體', '黑體',
    ])
    # One filtering pass plus join replaces the nested remove() loops and
    # the str(list)/replace() round trip; the runs contain no brackets,
    # quotes, commas or spaces, so the output is the same.
    a = ''.join(run for run in runs if run not in boilerplate)
    print(a)
    return a
def main():
    """Clean the module-level raw text and append the result to the output file."""
    write_to_file(clean(f))


if __name__ == '__main__':
    main()
6.0.png
- 數據可視化
import jieba
from jieba.analyse import extract_tags
from wordcloud import WordCloud,ImageColorGenerator
from os import path
import matplotlib.pyplot as plt
import numpy as np
from snownlp import SnowNLP
# Cleaned text, read once at import time. The ``with`` block closes the
# file handle, which the bare open(...).read() never did.
with open("clean.text", 'r', encoding='utf-8') as _source:
    f = _source.read()
def GeneratePicture(max_words):
    """Plot a bar chart of the top keywords and save it as top50.png.

    Args:
        max_words: number of keywords requested from ``extract_tags``.

    The keyword -> frequency mapping is printed, the chart is written to
    ``top50.png`` and then shown.
    """
    from collections import Counter

    tags = extract_tags(f, topK=max_words)  # keywords ranked by TF-IDF
    # Count all tokens once instead of calling list.count() for every tag.
    counts = Counter(jieba.lcut(f))
    word_freq_dict = {tag: counts[tag] for tag in tags}
    print(word_freq_dict)

    # Plain lists, sized from the tags actually returned: the original
    # hard-coded 50 x positions, which only matches when exactly 50 tags
    # come back.
    labels = list(word_freq_dict)
    heights = list(word_freq_dict.values())
    positions = [k * 2.5 for k in range(1, len(labels) + 1)]
    y_ticks = [k * 250 for k in range(1, 21)]

    fig = plt.figure(figsize=(16, 8), dpi=100)
    ax = fig.add_subplot(1, 1, 1)
    ax.bar(positions, heights, 0.4, color="green")
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, fontsize='small')
    ax.set_yticks(y_ticks)
    # Save before show(): once the window is closed the figure is gone and
    # savefig would write an empty image.
    fig.savefig('top50.png')
    plt.show()
def getb():
    """Render a word cloud of the module-level text and save it as word.png."""
    g = " ".join(jieba.cut(f))
    back_coloring = plt.imread("愛心.jpg")  # image passed as the mask
    word_cloud = WordCloud(
        font_path='simsun.ttc',       # font file used for rendering
        mask=back_coloring,
        background_color="white",
        max_words=900,                # upper bound on words drawn
        max_font_size=70,
        random_state=42,
    )
    my_wordcloud = word_cloud.generate(g)
    # The original also built an ImageColorGenerator from the mask but
    # never used it; that unused local has been dropped.
    word_cloud.to_file("word.png")
    plt.imshow(my_wordcloud)
    plt.axis("off")
    plt.show()
def sentiment():
    """Score each sentence of lixin.text and save a scatter plot of the scores.

    Every sentence's ``sentiments`` value is printed and plotted on the x
    axis; the y value is random jitter in [0, 20) so the points spread out.
    The plot is written to ``sentiment.png`` and then shown.
    """
    with open("lixin.text", 'r', encoding='utf-8') as source:
        text = source.read()
    # Turn commas into full stops before handing the text to SnowNLP.
    text = text.replace(",", '。')

    scores = []
    jitter = []
    for sentence in SnowNLP(text).sentences:
        score = SnowNLP(sentence).sentiments
        print(score)
        scores.append(score)
        jitter.append(20 * np.random.rand(1)[0])

    fig = plt.figure(figsize=(16, 8), dpi=100)
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(scores, jitter, 0.4, color="green")
    ax.set_xlabel('積極情感概率')
    # Save before show(): once the window is closed the figure is gone and
    # savefig would write an empty image.
    fig.savefig('sentiment.png')
    plt.show()
def main():
    """Run the three visualisations in order: bar chart, word cloud, sentiment plot."""
    steps = (
        (GeneratePicture, (50,)),
        (getb, ()),
        (sentiment, ()),
    )
    for step, args in steps:
        step(*args)


if __name__ == '__main__':
    main()
top50,這里有你的故事嗎
心里按捺不住的表白
喜歡一個人,蠻好
補充
- 文本處理可以多去一些停用詞
- 代碼比較亂,沒怎么整理,各位看官老爺湊合著看吧