Small Crawler Scripts

# coding: utf-8

import requests
from lxml import etree

# Set the target URLs: listing pages 2-4 plus the first listing page
# (the ".html" suffix is an assumption; the original read "{}/html")
start_url = "https://www.liuxue86.com/zhufuyu/chunjiezhufuyu/{}.html"
target_url = [start_url.format(x) for x in range(2, 5)]
target_url.append("https://www.liuxue86.com/zhufuyu/chunjiezhufuyu/")

# Set the request headers
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36"
}

# Send a request to each listing page and parse the response
for url in target_url:
    response = requests.get(url, headers=headers)
    # Parse the returned HTML
    html = etree.HTML(response.text)
    # Find the links to the individual articles
    count_link = html.xpath("//ul[@class='grid_list']/li/a/@href")
    for link_url in count_link:
        response1 = requests.get(link_url, headers=headers)
        response1.encoding = 'utf-8'
        html2 = etree.HTML(response1.text)
        # Take the article body, skipping its first paragraph
        content = html2.xpath("//div[@id='article-content']/p[position()>1]")
        for i in content:
            cont = i.xpath("string(.)")
            print(cont)
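The script above assumes every request succeeds and parses whatever comes back. As a minimal hardening sketch (the fetch_text helper name, the 10-second timeout, and the one-second pause are my assumptions, not part of the original), each requests.get call in the loops could be routed through a small wrapper:

import time
import requests

def fetch_text(url, headers, timeout=10):
    # Hypothetical helper: return the decoded page body, or None on any HTTP error.
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()  # treat 4xx/5xx as failures instead of parsing an error page
    except requests.RequestException as err:
        print("skipping {}: {}".format(url, err))
        return None
    resp.encoding = 'utf-8'
    time.sleep(1)  # small pause between requests so the server is not hammered
    return resp.text

Each response.text in the loops would then become a fetch_text(url, headers) call, with a None check before parsing.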



# coding: utf-8
'''
2. Fetch all of the Korean reading materials from http://kr.tingroom.com/yuedu/
'''
import requests
from lxml import etree

# Request the listing page and collect the links to each article
response = requests.get('http://kr.tingroom.com/yuedu/hysjyd/')
content = etree.HTML(response.text)
urls = content.xpath("//ul[@class='e2']/li/a/@href")
print(urls)

# Visit each article page and print its text content
for url in urls:
    response = requests.get(url)
    response.encoding = 'utf-8'
    content = etree.HTML(response.text)
    txt = content.xpath("//div[@id='article']")[0].xpath("string(.)")
    print(txt)
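Printing the extracted text to stdout makes it awkward to reuse. A minimal sketch for writing each article to its own file instead; the korean_reading directory name and the numbered-file scheme are illustrative assumptions, not from the original:

import os
import requests
from lxml import etree

out_dir = 'korean_reading'  # assumed output directory
os.makedirs(out_dir, exist_ok=True)

response = requests.get('http://kr.tingroom.com/yuedu/hysjyd/')
listing = etree.HTML(response.text)
urls = listing.xpath("//ul[@class='e2']/li/a/@href")

for n, url in enumerate(urls, start=1):
    page = requests.get(url)
    page.encoding = 'utf-8'
    nodes = etree.HTML(page.text).xpath("//div[@id='article']")
    if not nodes:
        continue  # skip pages without the expected article container
    # one numbered .txt file per article
    path = os.path.join(out_dir, '{:03d}.txt'.format(n))
    with open(path, 'w', encoding='utf-8') as f:
        f.write(nodes[0].xpath("string(.)"))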
