# coding:utf-8
"""Scrape Spring Festival greeting articles from liuxue86.com and print their text.

Walks the category listing pages, follows each article link, and prints every
paragraph after the first one of each article body.
"""
import requests
from lxml import etree

# Target listing pages: pages 2-4 via the template, plus the category root (page 1).
# NOTE(review): the template ends in "{}/html" — this looks like it may intend
# "{}.html"; verify against the site's actual pagination URLs.
start_url = "https://www.liuxue86.com/zhufuyu/chunjiezhufuyu/{}/html"
target_url = [start_url.format(x) for x in range(2, 5)]
target_url.append("https://www.liuxue86.com/zhufuyu/chunjiezhufuyu/")

# Request headers: present as a desktop Chrome browser to avoid trivial bot blocks.
headers = {
    'User-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36"
}

# Fetch each listing page and extract the article links.
for url in target_url:
    response = requests.get(url, headers=headers)
    # Parse the HTML response into an element tree.
    html = etree.HTML(response.text)
    # Article links live in <ul class="grid_list"><li><a href=...>.
    # (Original xpath was corrupted to "http://ul[...]" by auto-linkification.)
    count_link = html.xpath("//ul[@class='grid_list']/li/a/@href")
    for link_url in count_link:
        response1 = requests.get(link_url, headers=headers)
        # Force UTF-8 so Chinese text decodes correctly.
        response1.encoding = 'utf-8'
        html2 = etree.HTML(response1.text)
        # Skip the first <p> (title/lead); take the remaining body paragraphs.
        content = html2.xpath("//div[@id='article-content']/p[position()>1]")
        for node in content:
            # string(.) flattens the paragraph's full text content.
            text = node.xpath("string(.)")
            print(text)
# coding:utf-8
'''
2. http://kr.tingroom.com/yuedu/ — fetch all Korean reading materials.'''
import requests
from lxml import etree

# Fetch the listing page of Korean reading articles.
response = requests.get('http://kr.tingroom.com/yuedu/hysjyd/')
content = etree.HTML(response.text)
# Article links live in <ul class="e2"><li><a href=...>.
# (Original xpath was corrupted to "http://ul[...]" by auto-linkification.)
urls = content.xpath("//ul[@class='e2']/li/a/@href")
print(urls)

# Visit each article and print its full text.
for url in urls:
    response = requests.get(url)
    # Force UTF-8 so the article text decodes correctly.
    response.encoding = 'utf-8'
    content = etree.HTML(response.text)
    # First matching <div id="article"> holds the article body; string(.) flattens it.
    txt = content.xpath("//div[@id='article']")[0].xpath("string(.)")
    print(txt)