這幾天在寫 js 腳本,突然想寫一個抓取小說的腳本,于是磕磕碰碰,慢慢寫了一個比較完善的腳本,同時對自身所學進一步鞏固。
1. 環境
- python版本: Python 3.7.3
- 編輯器:VScode
- Python插件: ms-python.python
- 操作系統(tǒng): MAC
settings.json 配置:
{
"python.pythonPath": "/usr/local/bin/python3",
"python.formatting.provider": "black"
}
launch.json配置:
{
// 使用 IntelliSense 了解相關屬性。
// 懸停以查看現有屬性的描述。
// 欲了解更多信息,請訪問: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: file",
"type": "python",
"request": "launch",
"program": "${file}",
"args": ["-g", "5", "-b"],
"console": "integratedTerminal"
}
]
}
2. Python 依賴安裝
# 初始化 beautifulSoup4
pip3 install bs4
pip3 install lxml
pip3 install requests # 網絡請求
# json、fnmatch、hashlib、threading 均為 Python 標準庫,無需(也不應)使用 pip 安裝
3. 自定義庫 - common
目錄結構:
- common
- __init__.py
- util.py # 工具類
- thread_manager.py # 線程管理器
3.1. util.py 工具類
方法介紹:
class Util # 工具類
def write_file(self, filename, data, mode="w+") # 存儲文件
def append_file(self, filename, data) # 追加數據到文件
def read_file(self, filename, mode="r") # 讀取文件
def soup(self, url) # 抓取網頁
def list_files(self, path, fnexp) # 搜索文件
def now(self) # 當前時間 毫秒級
def now_s(self) # 當前時間 秒級
def recode_begin(self, group="default") # 記錄開始時間
def recode_end(self, group="default") # 打印距開始時間的時間差
def time_format(self, timestamp, parttern="%Y-%m-%d %H:%M:%S") # 格式化時間
def md5(self, content) # 獲取字符串md5碼
3.2. thread_manager.py 多線程管理類
方法介紹:
class ThreadManager:
def __init__(self, max_size=10) # 初始化 max_size 數據量
def exit(self) # 線程退出
def lock(self) # 加鎖
def unlock(self) # 解鎖
def put_data(self, data) # 添加數據
def put_thread(self, thread) # 添加并啟動線程
def put_cbk_thread(self, thread_name, cbk, repeat=1) # 添加方法,方法會在一個線程中執行
def join(self) # 等待所有線程執行完成
def wait(self) # 等待數據全部被消費
3.3. __init__.py 文件
# coding=utf-8
__version__ = "1.0.0"
__all__ = ["cm_util", "ThreadManager"]
__author__ = "yszm"
from .util import *
from .thread_manager import *
4. 抓取小說
抓取小說總共分為3部分內容: 標題、目錄和具體內容
但這3部分抓取方法大同小異,都是通過選擇器選擇對應的元素,過濾掉不必要的元素,然后獲取相對應的屬性和文本,再對文本進行縮進、替換等處理。
這里以67書吧的小說《第一序列》為例,地址: https://www.67shu.com/111/111473
import time
import json
import sys
import os
if __name__ == "__main__":
from common import *
else:
from .common import *
# Index (table-of-contents) pages of the two sample novels on 67shu.com.
URL1 = "https://www.67shu.com/111/111473/"
URL2 = "https://www.67shu.com/40/40190/"
# Output path template; "{}" is filled with the novel's name (see get_content).
URL = "story/result/{}.txt"
def get_cpt(url):
    """Scrape a novel's index page for its title and chapter links.

    Args:
        url: Address of the novel's table-of-contents page.

    Returns:
        dict: ``{"name": title, "cp": [{"url": ..., "name": ...}, ...]}``.
        ``name`` stays ``"unknow"`` when no title element is found; chapters
        are listed in page order.
    """
    # Function-scope import so this block brings its own dependency.
    from urllib.parse import urljoin

    doc = cm_util.soup(url)
    data = {"name": "unknow"}
    # Title: first child node of the first matching <h1>, when it has one.
    h1 = doc.select(".book_info .xiaoshuo h1")
    if h1 and h1[0].contents:
        data["name"] = h1[0].contents[0]
    # Chapter links. urljoin resolves both relative and absolute hrefs and,
    # unlike (url + "{}").format(href), is safe when the URL contains braces.
    # Anchors without an href are skipped instead of producing "...None".
    data["cp"] = [
        {"url": urljoin(url, link.get("href")), "name": link.string}
        for link in doc.select(".novel_list .book_article_listtext dd a")
        if link.get("href")
    ]
    return data
def get_content(data):
    """Download every chapter in *data* sequentially into one text file.

    Args:
        data: Mapping produced by get_cpt: ``data["name"]`` names the output
            file and ``data["cp"]`` lists chapters as dicts with ``url`` and
            ``name`` keys.
    """
    out_path = URL.format(data["name"])
    # Create/truncate the output file before appending chapters.
    cm_util.write_file(out_path, "")

    # (search, replacement) pairs applied in this exact order to each chapter.
    # The literals are kept verbatim from the original script.
    cleanups = (
        ("\\n", "\n"),
        ("\\r", ""),
        ("\xa0", ""),
        ("一秒記住【67書(shū)吧 www.67shu.com】,", ""),
        ("精彩小說(shuō)無(wú)彈窗免費(fèi)閱讀!", ""),
        (" ", " "),
        (" ", ""),
    )

    for chapter in data["cp"]:
        page = cm_util.soup(chapter["url"])
        body = page.select(".yuedu_page .yuedu_zhengwen")
        if len(body) == 0:
            # No content element on this page: nothing to write.
            continue
        text = body[0].text
        for old, new in cleanups:
            text = text.replace(old, new)
        print("get data: %s" % chapter["name"])
        # Chapter heading first, then the cleaned body.
        cm_util.write_file(out_path, "\n\n%s\n\n" % chapter["name"], "a+")
        cm_util.write_file(out_path, text, "a+")
if __name__ == "__main__":
    # Scrape the chapter index of the second sample novel, then download it.
    get_content(
        get_cpt(URL2)
    )
看起來,是不是特別簡單呢?
不過,這樣一章章地抓取太慢了,尤其是一些大牛,寫了幾千章,抓取就特別費時了,這時候,就需要采用多線程抓取了。
5. 多線程抓取小說
采用自定義線程管理器類:ThreadManager
需要實現方法: def handle_data(data, thread_id, thread_name)
這里以全本小說網的小說《英雄聯盟我的時代》為例:
import time
import json
import sys
import os
if __name__ == "__main__":
from common import *
else:
from .common import *
# Index page of the sample novel on 126shu.com.
URL1 = "http://www.126shu.com/99596/"
# Template that turns a chapter link's href into a full URL (see get_cpt).
URL_CONTENT = "http://www.126shu.com/{}"
# Path template of the merged novel; "{}" is the novel name (see get_content).
URL_RESULT = "story/result/{}.txt"
# Path template of one cached chapter; "{}" is the chapter name (see get_text).
URL_DATA = "story/data/{}.txt"
def get_cpt(url):
    """Scrape the index page at *url* for the novel title and chapter links.

    Returns:
        dict: ``{"name": title, "cp": [{"url": ..., "name": ...}, ...]}``;
        ``name`` is ``"unknow"`` when the title element is missing.
    """
    page = cm_util.soup(url)

    # Title comes from the first "#info .hh" element, if the page has one.
    title_nodes = page.select("#info .hh")
    book_name = title_nodes[0].string if len(title_nodes) > 0 else "unknow"

    # One entry per chapter anchor, in page order.
    chapters = [
        {"url": URL_CONTENT.format(anchor.get("href")), "name": anchor.string}
        for anchor in page.select("#headlink #list dl dd a")
    ]
    return {"name": book_name, "cp": chapters}
def get_text(item):
    """Fetch one chapter, clean it and cache it on disk.

    Args:
        item: Chapter dict with ``url`` and ``name`` keys.

    Returns:
        Path of the cached chapter file, or None when the page has no
        ``#content`` element.
    """
    cache_path = URL_DATA.format(item["name"])
    if os.path.exists(cache_path):
        # Already downloaded on an earlier run: reuse the file.
        print("exist file, so we will use cache: %s " % cache_path)
        return cache_path

    page = cm_util.soup(item["url"])
    matches = page.select("#content")
    if len(matches) == 0:
        return None

    content = matches[0]
    # Remove the first ".zjtj" and ".zjxs" child blocks before reading the text.
    for selector in (".zjtj", ".zjxs"):
        unwanted = content.select(selector)
        if len(unwanted) > 0:
            unwanted[0].extract()

    cleaned = content.text
    # Replacements run in this exact order; the 4->2 newline collapse is
    # intentionally applied twice.
    for old, new in (
        ("www.126shu.com", ""),
        ("\r", ""),
        ("請(qǐng)百度搜索()", ""),
        ("\xa0", "\n"),
        ("\n\n\n\n", "\n\n"),
        ("\n\n\n\n", "\n\n"),
    ):
        cleaned = cleaned.replace(old, new)

    print("get data: %s" % item["name"])
    cm_util.write_file(cache_path, ("\n\n%s" % item["name"]) + cleaned, "a+")
    return cache_path
# Chapter name -> path of its cached text file; filled in by the worker threads.
text_path = {}


def get_text_thread(item, id, name):
    """Worker callback: download one chapter and record where it was saved.

    Args:
        item: Chapter dict with ``url`` and ``name`` keys.
        id: Worker thread id supplied by the thread manager (unused here).
        name: Worker thread name supplied by the thread manager (unused here).
    """
    saved = get_text(item)
    if not saved:
        print("[warn]: cannot find content: %s,%s" % (item["url"], item["name"]))
        return
    text_path[item["name"]] = saved
def get_content(data):
    """Download all chapters with four worker threads, then merge them in order.

    Args:
        data: Mapping produced by get_cpt (``name`` plus the ``cp`` chapter list).
    """
    chapters = data["cp"]
    # Merged output file, named after the novel; start it empty.
    merged_path = URL_RESULT.format(data["name"])
    cm_util.write_file(merged_path, "")

    # Queue is sized to hold every chapter at once.
    pool = ThreadManager(len(chapters))
    worker_names = ["thread_" + suffix for suffix in ("a", "b", "c", "d")]
    pool.put_data(chapters)
    pool.put_cbk_thread(worker_names, get_text_thread)

    pool.wait()  # until the queue has been drained
    pool.exit()  # tell the workers to stop
    pool.join()  # until every worker has finished

    # Concatenate the cached chapters in table-of-contents order.
    for chapter in chapters:
        cached = text_path.get(chapter["name"], None)
        if not cached:
            continue
        chapter_text = cm_util.read_file(cached)
        cm_util.append_file(merged_path, chapter_text)
if __name__ == "__main__":
    # Time the whole run under the default timer group.
    cm_util.recode_begin()
    catalog = get_cpt(URL1)
    get_content(catalog)
    cm_util.recode_end()
6. 進一步升級,實現工廠方法
為了更具有通用性,應該抽取共同部分,通過動態注入方法,從而增加腳本的可拓展性。
不多說,放源碼:
# coding=utf-8
import os
if __name__ == "__main__":
from ..common import *
from .parser import *
else:
from common import *
from story.parser import *
# Path template of the merged novel; "{}" is the novel title (see StoryFactory.run).
URL_RESULT = "python/story/result/{}.txt"
# Per-book working directory; "{}" is the md5 of the book URL (see StoryFactory.run).
URL_DATA = "python/story/data/{}"
# Path of one downloaded chapter: book key, then md5 of the chapter name.
URL_TXT_CHAPTER = "python/story/data/{}/{}.txt"
# Path of a book's cached chapter-list JSON; "{}" is the book key.
CONFIG_DATA = "python/story/data/{}/config.json"
class Parser:
    """Base class for site-specific scrapers.

    Subclasses override get_chapter and get_text; the defaults return None.
    """

    def __init__(self, base_url=""):
        # URL prefix identifying the site this parser handles.
        self.base_url = base_url

    def get_chapter(self, url):
        """Return chapter metadata for the index page *url* (default: None)."""
        result = None
        return result

    def get_text(self, item):
        """Return the text of the chapter described by *item* (default: None)."""
        result = None
        return result
class StoryFactory:
    """Registry of site parsers plus the download-and-merge pipeline using them."""

    def __init__(self):
        # base_url -> {"base_url", "get_cpt", "get_text"} handler record.
        self.url_matcher = {}
        # Number of worker threads started by run().
        self.max_thread_size = 10

    def registe(self, base_url, get_chapter, get_text):
        """Register the two scraping callables for URLs starting with *base_url*.

        Args:
            base_url: URL prefix the handlers apply to.
            get_chapter: Callable taking the index URL and returning a dict
                with ``title`` and ``chapter`` keys.
            get_text: Callable taking one chapter dict and returning its text.
        """
        self.url_matcher[base_url] = {
            "base_url": base_url,
            "get_cpt": get_chapter,
            "get_text": get_text,
        }

    def registe_paser(self, p):
        """Register a parser object exposing base_url, get_chapter and get_text."""
        self.registe(p.base_url, p.get_chapter, p.get_text)

    def match(self, url):
        """Return the first registered base URL that *url* starts with, else None."""
        for base_url in self.url_matcher:
            if url.startswith(base_url):
                return base_url
        return None

    @staticmethod
    def _chapter_path(book_key, item):
        """Return the chapter's text-file path, or None when it has no name."""
        name = item.get("name")
        if not name:
            return None
        return URL_TXT_CHAPTER.format(book_key, cm_util.md5(name))

    def get_text_thread(self, item, id, name):
        """Worker callback: download one chapter and store it under the book dir.

        Args:
            item: Chapter dict carrying ``url``, ``name`` and ``book_key``.
            id: Worker thread id supplied by the thread manager (unused here).
            name: Worker thread name supplied by the thread manager (unused here).
        """
        conf_path = CONFIG_DATA.format(item["book_key"])
        # The book's cached config tells us which site handler to use.
        chap_data = cm_util.read_file(conf_path) or {}
        handlers = self.url_matcher.get(chap_data.get("base_url"), {})
        get_text = handlers.get("get_text", None)
        if not get_text:
            print("[warn] not match url: %s" % item["url"])
            return
        ch_path = self._chapter_path(item["book_key"], item)
        txt = get_text(item) if ch_path else None
        if txt:
            cm_util.write_file(ch_path, txt)
        else:
            # The original format string had three placeholders for two values
            # and raised TypeError on this path.
            print("[warn]: cannot find content: %s,%s" % (item["url"], item["name"]))

    def _load_chapters(self, url, key, base_url, get_chapter):
        """Return chapter metadata from the cached config, scraping it if absent."""
        conf_path = CONFIG_DATA.format(key)
        chap_data = cm_util.read_file(conf_path)
        if chap_data and chap_data.get("chapter"):
            # Cache hit: no need to fetch the index page again.
            return chap_data
        chap_data = get_chapter(url)
        if not chap_data or not chap_data.get("chapter"):
            return None
        chap_data["base_url"] = base_url
        for item in chap_data["chapter"]:
            chapter_name = item.get("name", None)
            if chapter_name:
                item["key"] = cm_util.md5(chapter_name)
            # Every chapter needs book_key: get_text_thread reads it.
            item["book_key"] = key
        cm_util.write_file(conf_path, chap_data)
        return chap_data

    def _download(self, url, key, base_url, get_chapter):
        """Fetch all chapters of one book in worker threads and merge them."""
        os.makedirs(URL_DATA.format(key), exist_ok=True)
        chap_data = self._load_chapters(url, key, base_url, get_chapter)
        if not chap_data:
            print("[warn] no chapter found: %s" % url)
            return
        chapters = chap_data["chapter"]

        manager = ThreadManager(len(chapters))
        thread_names = ["thread_%d" % i for i in range(self.max_thread_size)]
        manager.put_data(chapters)
        manager.put_cbk_thread(thread_names, self.get_text_thread)
        manager.wait()  # until the queue has been drained
        manager.exit()  # tell the workers to stop
        manager.join()  # until every worker has finished

        # Merged output file, named after the novel.
        dest_file = URL_RESULT.format(chap_data.get("title") or "unknow")
        os.makedirs(os.path.dirname(dest_file) or ".", exist_ok=True)
        cm_util.write_file(dest_file, "")
        # Concatenate chapters in table-of-contents order.
        for item in chapters:
            ch_path = self._chapter_path(key, item)
            txt = cm_util.read_file(ch_path) if ch_path else None
            if txt:
                cm_util.append_file(dest_file, txt)

    def run(self, url):
        """Download the novel whose index page is *url* into one text file.

        Prints a warning and returns without doing anything when no registered
        parser matches *url*.
        """
        base_url = self.match(url)
        if not base_url:
            print("[warn] not match url: %s" % url)
            return
        key = cm_util.md5(url)
        print("[info] url:[%s] %s - %s" % (key, url, base_url))
        get_chapter = self.url_matcher[base_url].get("get_cpt", None)
        if not get_chapter:
            print("[warn] not match url: %s" % url)
            return
        # Start the timer only once work is certain, and always stop it.
        cm_util.recode_begin(key)
        try:
            self._download(url, key, base_url, get_chapter)
        finally:
            cm_util.recode_end(key)


story_factory = StoryFactory()
__init__.py 文件:
# coding=utf-8
__version__ = "1.0.0"
__all__ = ["story_factory", "Parser"]
__author__ = "yszm"
from .story import *
使用:
if __name__ == "__main__":
from story import *
else:
from .story import *
class P67shu(Parser):
    """Parser for novels hosted on https://www.67shu.com.

    NOTE(review): get_text is not overridden here, so the inherited
    Parser.get_text is used -- confirm a chapter-text implementation exists
    before relying on this parser for full downloads.
    """

    def __init__(self):
        super().__init__("https://www.67shu.com")

    def get_chapter(self, url):
        """Scrape the index page at *url* for the novel title and chapter links.

        Returns:
            dict: ``{"title": title, "chapter": [{"url": ..., "name": ...}, ...]}``;
            ``title`` stays ``"unknow"`` when no title element is found.
        """
        # Function-scope import so this block brings its own dependency.
        from urllib.parse import urljoin

        doc = cm_util.soup(url)
        data = {"title": "unknow"}
        # Title: first child node of the first matching <h1>, when it has one.
        h1 = doc.select(".book_info .xiaoshuo h1")
        if h1 and h1[0].contents:
            data["title"] = h1[0].contents[0]
        # Chapter links. urljoin resolves both relative and absolute hrefs and,
        # unlike (url + "{}").format(href), is safe when the URL contains braces.
        # Anchors without an href are skipped instead of producing "...None".
        data["chapter"] = [
            {"url": urljoin(url, link.get("href")), "name": link.string}
            for link in doc.select(".novel_list .book_article_listtext dd a")
            if link.get("href")
        ]
        return data
# Make the 67shu parser available to the shared factory at import time.
story_factory.registe_paser(P67shu())

if __name__ == "__main__":
    # Demo: download one novel starting from its index page.
    url = "https://www.67shu.com/112/112336/"
    story_factory.run(url)
7. common 源碼
源碼如下:
common/util.py
# coding=utf-8
import json
import os
import requests
from bs4 import BeautifulSoup
import fnmatch
import time
import hashlib
# Request headers sent with every page fetch (see Util.soup); the User-Agent
# string identifies the client as desktop Chrome on macOS.
HEADER_CONFIG = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
}
class Util:
    """File, HTTP, timing and hashing helpers shared by the scraper scripts."""

    def __init__(self):
        # group name -> start timestamp in ms; created lazily by recode_begin().
        self.time_map = None

    def write_file(self, filename, data, mode="w+"):
        """Write *data* to *filename*.

        Args:
            filename: Destination path; its directory must already exist.
            data: Text to write. A dict or list is serialised to JSON first.
            mode: Mode passed to open(); text modes are written as UTF-8.
        """
        # Serialise before opening so a failure cannot truncate the file.
        if isinstance(data, (dict, list)):
            data = json.dumps(data, ensure_ascii=False)
        # Explicit UTF-8 keeps Chinese text portable across platforms whose
        # default encoding is not UTF-8; binary modes take no encoding.
        encoding = None if "b" in mode else "utf-8"
        with open(filename, mode, encoding=encoding) as f:
            f.write(data)
        print("write data to file: %s" % filename)

    def append_file(self, filename, data):
        """Append *data* to *filename*, creating the file when it is missing."""
        if not os.path.exists(filename):
            self.write_file(filename, data)
        else:
            self.write_file(filename, data, "a+")

    def read_file(self, filename, mode="r"):
        """Read *filename* and return its content.

        Returns:
            None when the file does not exist; the parsed JSON value when the
            name ends with ``.json``; otherwise the raw content.
        """
        if not os.path.exists(filename):
            return None
        encoding = None if "b" in mode else "utf-8"
        with open(filename, mode, encoding=encoding) as f:
            data = f.read()
        if filename.endswith(".json"):
            data = json.loads(data)
        print("read data from file: %s" % filename)
        return data

    def soup(self, url):
        """Fetch *url* and return the page parsed by BeautifulSoup with lxml."""
        # The context manager closes the session (and its connections) once
        # the response body has been read.
        with requests.Session() as session:
            html = session.get(url, headers=HEADER_CONFIG, timeout=120).content
        return BeautifulSoup(html, "lxml")

    def list_files(self, path, fnexp):
        """Yield paths of files under *path* whose names match the glob *fnexp*."""
        for root, _dirs, files in os.walk(path):
            for filename in fnmatch.filter(files, fnexp):
                yield os.path.join(root, filename)

    def now(self):
        """Return the current time as integer milliseconds since the epoch."""
        return int(round(time.time() * 1000))

    def now_s(self):
        """Return the current time as integer seconds since the epoch."""
        return int(time.time())

    def recode_begin(self, group="default"):
        """Start (or restart) the timer *group* and return its start time in ms."""
        if not self.time_map:
            self.time_map = {}
        self.time_map[group] = self.now()
        return self.time_map[group]

    def recode_end(self, group="default"):
        """Stop the timer *group*, print and return the elapsed seconds.

        Returns None (after printing a warning) when *group* was never started,
        instead of raising as the unguarded lookup would.
        """
        start = (self.time_map or {}).pop(group, None)
        if start is None:
            print("[warn] recode_end called without recode_begin: %s" % group)
            return None
        t = (self.now() - start) / 1000
        print("[%s]: 消耗時(shí)間:%s 秒" % (group, t))
        return t

    def time_format(self, timestamp, parttern="%Y-%m-%d %H:%M:%S"):
        """Format the epoch-seconds *timestamp* in local time using *parttern*."""
        time_local = time.localtime(timestamp)
        return time.strftime(parttern, time_local)

    def md5(self, content):
        """Return the hex MD5 digest of the UTF-8 encoding of *content*."""
        return hashlib.md5(content.encode("utf8")).hexdigest()


cm_util = Util()
common/thread_manager.py
# -*- coding: UTF-8 -*-
#
import queue
import threading
import time
#
class ThreadManager:
    """Small worker pool: a bounded queue of items consumed by callback threads.

    Typical use: put_data() the items, put_cbk_thread() the workers, then
    wait(), exit() and join().
    """

    def __init__(self, max_size=10, poll_interval=1):
        """Create an empty pool.

        Args:
            max_size: Capacity passed to queue.Queue.
            poll_interval: Seconds an idle worker sleeps before polling the
                queue (and the exit flag) again.
        """
        self.exit_flag = 0
        self.work_queue = queue.Queue(max_size)
        self.queue_lock = threading.Lock()
        self.threads = []
        # thread id -> callback invoked for each item that thread dequeues.
        self.cbk_group = {}
        self.thread_id = 100000
        self.poll_interval = poll_interval

    def exit(self):
        """Ask the workers to stop once their current item is finished."""
        self.exit_flag = 1

    def lock(self):
        """Acquire the queue lock (workers take it while dequeuing)."""
        self.queue_lock.acquire()

    def unlock(self):
        """Release the queue lock."""
        self.queue_lock.release()

    def put_data(self, data):
        """Enqueue every item of the iterable *data*.

        queue.Queue is already thread-safe, so the queue lock is deliberately
        not held here: put() may block on a full queue, and holding the lock
        meanwhile would stop the workers from ever making room.
        """
        for item in data:
            self.work_queue.put(item)

    def put_thread(self, thread):
        """Start *thread*, track it, and advance the next thread id."""
        thread.start()
        self.threads.append(thread)
        self.thread_id = self.thread_id + 1

    def put_cbk_thread(self, thread_name, cbk, repeat=1):
        """Start worker threads that feed queued items to *cbk*.

        Args:
            thread_name: One name, or a list/tuple of names (one worker each).
            cbk: Callable ``cbk(data, thread_id, thread_name)``.
            repeat: Worker count when *thread_name* is a single name; with
                repeat > 1 the workers are named ``name(1)``, ``name(2)``, ...
        """
        if isinstance(thread_name, (list, tuple)):
            # One worker per name; a one-element list now yields that name
            # rather than the list object itself.
            names = list(thread_name)
        elif repeat == 1:
            names = [thread_name]
        else:
            names = ["%s(%d)" % (thread_name, i + 1) for i in range(repeat)]
        for name in names:
            thread = CBThread(self.thread_id, name, self.process_data)
            # Register the callback before the thread starts looking it up.
            self.cbk_group[self.thread_id] = cbk
            self.put_thread(thread)

    def join(self):
        """Block until every tracked thread has finished."""
        for t in self.threads:
            t.join()

    def wait(self):
        """Block until the work queue is empty."""
        # Short sleeps instead of a busy loop that pins a CPU core.
        while not self.work_queue.empty():
            time.sleep(min(self.poll_interval, 0.05))

    def data_size(self):
        """Return the number of items currently queued."""
        return self.work_queue.qsize()

    def thread_size(self):
        """Return the number of threads started through this manager."""
        return len(self.threads)

    def process_data(self, thread_id, thread_name):
        """Worker loop: dequeue items and pass them to this thread's callback.

        Runs until exit() is called. A callback exception is reported and the
        loop continues, so one bad item cannot kill the worker and leave
        wait() spinning on a queue nobody drains.
        """
        while not self.exit_flag:
            try:
                with self.queue_lock:
                    data = self.work_queue.get_nowait()
            except queue.Empty:
                time.sleep(self.poll_interval)
                continue
            cbk = self.cbk_group.get(thread_id, None)
            # Only None is skipped; falsy items such as 0 or "" are real work.
            if data is not None and cbk:
                try:
                    cbk(data, thread_id, thread_name)
                except Exception as err:
                    print(
                        "[warn] %s[%d] callback failed: %r"
                        % (thread_name, thread_id, err)
                    )
            print("%s[%d] processing" % (thread_name, thread_id))
#
# Generic callback-driven thread; can be used together with the thread manager.
class CBThread(threading.Thread):
    """Thread whose body is one call to ``cbk(thread_id, thread_name)``."""

    def __init__(self, thread_id, name, cbk):
        super().__init__()
        self.thread_id, self.thread_name, self.cbk = thread_id, name, cbk

    def run(self):
        """Print a start line, invoke the callback, then print an exit line."""
        print("Starting %s" % self.thread_name)
        callback = self.cbk
        callback(self.thread_id, self.thread_name)
        print("Exiting %s" % self.thread_name)
print("Exiting %s" % self.thread_name)