from lxml import etree
import requests
import json

root_huxiu_url = 'https://www.huxiu.com/'
post_url = 'https://www.huxiu.com/channel/ajaxGetMore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Cookie': 'aliyungf_tc=AQAAAIokYwn+WwMAN9UmZQmaqDaXEhQv; huxiu_analyzer_wcy_id=4f1wsxk7txc42t0xk7w; Hm_lvt_324368ef52596457d064ca5db8c6618e=1516200027; Hm_lpvt_324368ef52596457d064ca5db8c6618e=1516200027; _ga=GA1.2.1212828852.1516200027; _gid=GA1.2.1755766605.1516200027; screen=%7B%22w%22%3A1366%2C%22h%22%3A768%2C%22d%22%3A1%7D; SERVERID=03a07aad3597ca2bb83bc3f5ca3decf7|1516199779|1516199738'
}
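# The hard-coded Cookie above is a snapshot of one browser session and will
# eventually expire. A minimal sketch of an alternative, assuming huxiu.com
# sets its cookies on a plain GET of the home page (requests.Session is the
# standard way to persist cookies; this helper is illustrative and not used below):
def make_session():
    session = requests.Session()
    session.headers['User-Agent'] = headers['User-Agent']
    session.get(root_huxiu_url)  # cookies set here ride along on later requests
    return session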
def get_channel_info(root_url):
    """Scrape the channel names and catIds from the huxiu.com home-page menu."""
    req = requests.get(root_url, headers=headers)
    html = req.text
    selector = etree.HTML(html)
    infos = selector.xpath('//ul[@class="header-column header-column1 header-column-zx menu-box"]/li/a')
    items = []
    for info in infos:
        item_dict = {}
        channel_name = info.xpath('text()')[0]
        # hrefs look like '/channel/<id>.html'; strip the path and extension to get the bare id
        catId = info.xpath('@href')[0].replace('/channel/', '').replace('.html', '')
        item_dict['channel_name'] = channel_name
        item_dict['catId'] = catId
        items.append(item_dict)
    return items
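# For reference, get_channel_info returns a list shaped like
# [{'channel_name': <menu link text>, 'catId': <id from its href>}, ...]
# (the concrete names and ids depend on the live menu).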
def get_totalPage(catId):
    """POST once to the channel AJAX endpoint to learn how many pages the channel has."""
    post_data = {  # form data for the AJAX endpoint
        'huxiu_hash_code': '25ac5e645e763c56a512d97ab1901874',
        'page': 1,
        'catId': catId
    }
    html = requests.post(post_url, data=post_data, headers=headers).text
    dict_data = json.loads(html)  # the endpoint answers with JSON
    parse_data = dict_data['data']
    total_page = parse_data['total_page']
    return int(total_page)
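# The parsing above and in get_article_info assumes a JSON response roughly
# shaped as follows (field names come from the code; the values are illustrative):
# {
#     "data": {
#         "total_page": 25,
#         "data": "<div ...>...HTML fragment listing the articles...</div>"
#     }
# }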
def get_article_info(channel_name, catId, page):
    """Fetch one page of a channel's article list, then scrape each article's title and body."""
    post_data = {  # form data for the AJAX endpoint
        'huxiu_hash_code': '25ac5e645e763c56a512d97ab1901874',
        'page': page,
        'catId': catId
    }
    html = requests.post(post_url, data=post_data, headers=headers).text
    dict_data = json.loads(html)  # JSON response
    parse_data = dict_data['data']
    total_page = parse_data['total_page']
    data_html = parse_data['data'].strip()  # an HTML fragment embedded in the JSON
    print(channel_name, catId, total_page, data_html[0:10])
    selector2 = etree.HTML(data_html)
    articles_url = selector2.xpath('//a/@href')
    for a_url in articles_url[0::2]:  # adjacent hrefs come in identical pairs, so take every other one
        if a_url.startswith('/article'):
            article_url = root_huxiu_url + a_url[1:]
            print(article_url)
            req3 = requests.get(article_url, headers=headers)
            selector3 = etree.HTML(req3.text)
            title = selector3.xpath('//div[@class="article-wrap"]/h1/text()')[0].strip()
            content = selector3.xpath('//p/text()')
            whole_content = '\n'.join(content)  # join paragraphs with line breaks
            print(title)
            print(whole_content)
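# get_article_info issues one GET per article with no delay; when walking whole
# channels it is prudent to throttle, e.g. a time.sleep(1) inside the loop above
# (the one-second interval is an arbitrary illustrative choice).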
def main():
    channels_info = get_channel_info(root_huxiu_url)
    for one_channel in channels_info:
        print(one_channel)
        total_page = get_totalPage(one_channel['catId'])
        print(one_channel['channel_name'] + ' pages: ' + str(total_page))
        # iterate over page numbers 1..total_page (not over the characters of a string)
        for ipage in range(1, total_page + 1):
            get_article_info(one_channel['channel_name'], one_channel['catId'], ipage)

if __name__ == '__main__':
    main()