- 利用純Go語言編寫的開源的支持分布式的高并發、重量級爬蟲軟件Pholcus(幽靈蛛)爬取慕課網課程、章節等并入庫MongoDB。
寫在前面
個人非常熱愛慕課網,在慕課網學習兩年多了,收獲很大,編寫此爬蟲目的純粹為了學習Pholcus,爬蟲也并未爬取慕課網視頻地址(其實慕課的視頻地址不是那么容易就能爬到的,好像是分割成了1M左右的小視頻),沒有其他任何商業用途。同時也希望網友能愛護慕課網,如果要爬,請輕爬!
爬取說明
- 在圖1頁面爬取課程名稱、課程簡介等,然后點擊進入在圖2頁面爬取學習人數、課程章節等內容。最后如圖3寫入數據庫。
- 爬取過程中,圖1中爬取的內容并不直接入庫,而是通過 `struct request` 中的temp字段以及 `SetTemp()`、`GetTemp()` 將圖1中內容與圖2中的內容整合后寫入數據庫。
- 因為慕課網課程內容層級結構類似于JSON,所以數據庫考慮使用MongoDB,實現圖3效果。
Paste_Image.png
Paste_Image.png
Paste_Image.png
Paste_Image.png
Paste_Image.png
Spider源碼
package imooc
import (
"strconv"
"strings"
"encoding/json"
//"io/ioutil"
"log"
"net/http"
"regexp"
"fmt"
"github.com/henrylee2cn/pholcus/app/downloader/request"
. "github.com/henrylee2cn/pholcus/app/spider"
"github.com/henrylee2cn/pholcus/common/goquery"
)
// init runs at package load and calls Register on the Imooc spider.
// NOTE(review): presumably Register adds the spider to Pholcus's global
// spider list so it shows up in the UI — confirm against the spider package.
func init() {
	Imooc.Register()
}
// cleanHTML returns s with every space, newline and tab character removed.
// It is used to squeeze layout whitespace out of text scraped from the DOM.
func cleanHTML(s string) string {
	// A single replacer pass drops all three characters at once.
	stripper := strings.NewReplacer(" ", "", "\n", "", "\t", "")
	return stripper.Replace(s)
}
// Data is one element of the "data" array in the AjaxCourseMembers JSON
// response.
//
// The id field used to be unexported; encoding/json only fills exported
// fields, so its json tag had no effect and the value was silently dropped.
type Data struct {
	// ID is the "id" value of the entry.
	ID string `json:"id"`
	// Numbers is the "numbers" value of the entry; the spider stores it as the
	// learner count of the course.
	Numbers string `json:"numbers"`
}
// AjaxCourseMembers is the JSON document the spider decodes from the
// http://www.imooc.com/course/AjaxCourseMembers endpoint.
//
// The result and msg fields used to be unexported; encoding/json only fills
// exported fields, so their json tags had no effect and the values were
// silently dropped.
type AjaxCourseMembers struct {
	// Result is the "result" value of the response.
	// NOTE(review): presumably a status code — confirm against the API.
	Result int `json:"result"`
	// Data holds the entries of the "data" array.
	Data []Data `json:"data"`
	// Msg is the "msg" value of the response.
	Msg string `json:"msg"`
}
// Package-level scratch maps. Each starts out empty and non-nil.
var (
	// chapterin maps string keys to string values.
	chapterin = map[string]string{}
	// Score maps string keys to string values.
	Score = map[string]string{}
	// chapterout maps string keys to arbitrary values.
	chapterout = map[string]interface{}{}
	// chapterall maps string keys to arbitrary values.
	chapterall = map[string]interface{}{}
)
// courseIDDigits matches one decimal digit. Joining every match found in a
// course href yields the numeric course id. It is compiled once here instead
// of once per course card.
var courseIDDigits = regexp.MustCompile("[0-9]")

// Imooc is the spider definition for imooc.com courses.
//
// Rule flow:
//
//	Root           queues the course list page under rule "首頁請求".
//	"首頁請求"      reads the page count from the pager and asks "所有課程"
//	               to queue every list page.
//	"所有課程"      for each course card, fetches the learner count over the
//	               AjaxCourseMembers endpoint and queues the course page under
//	               "課程詳細信息", handing the list-page fields over in Temp.
//	"課程詳細信息"  scrapes the course page and outputs one item that merges
//	               the Temp fields with the page fields.
//
// The detail rule builds its score and chapter maps locally on every call, so
// nothing is shared between requests. String literals that had been corrupted
// with stray pinyin fragments (Description, the detail rule name, three item
// fields and the "評價數" key) are restored.
var Imooc = &Spider{
	Name:         "Imooc",
	Description:  "慕課網課程,[Auto Page][imooc.com]",
	EnableCookie: false,
	RuleTree: &RuleTree{
		Root: func(ctx *Context) {
			ctx.AddQueue(&request.Request{Url: "http://www.imooc.com/course/list", Rule: "首頁請求"})
		},
		Trunk: map[string]*Rule{
			"首頁請求": {
				ParseFunc: func(ctx *Context) {
					// The ninth pager link is read as the link to the last page;
					// its page number is the page count.
					href, _ := ctx.GetDom().Find("div.page").Find("a").Eq(8).Attr("href")
					// TrimPrefix strips the literal prefix; TrimLeft would treat
					// the argument as a set of characters.
					maxPage, err := strconv.Atoi(strings.TrimPrefix(href, "/course/list?page="))
					if err != nil || maxPage < 1 {
						// Without a usable page count, still crawl the first page
						// instead of silently crawling nothing.
						log.Printf("ERROR: cannot read page count from %q, crawling page 1 only: %v", href, err)
						maxPage = 1
					}
					ctx.Aid(map[string]interface{}{"loop": [2]int{1, maxPage}, "Rule": "所有課程"}, "所有課程")
				},
			},
			"所有課程": {
				AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} {
					pages := aid["loop"].([2]int)
					rule := aid["Rule"].(string)
					// Both bounds are page numbers, so the upper bound is
					// inclusive; a strict "<" would skip the last page.
					for page := pages[0]; page <= pages[1]; page++ {
						ctx.AddQueue(&request.Request{
							Url:  "http://www.imooc.com/course/list?page=" + strconv.Itoa(page),
							Rule: rule,
						})
					}
					return nil
				},
				ParseFunc: func(ctx *Context) {
					ctx.GetDom().Find(".index-card-container").Each(func(_ int, card *goquery.Selection) {
						href, ok := card.Find(".course-card").Attr("href")
						if !ok {
							// No course link: nothing to queue, so skip the
							// learner-count request as well.
							return
						}
						courseID := strings.Join(courseIDDigits.FindAllString(href, -1), "")
						learnerNumber, err := fetchLearnerNumber(courseID)
						if err != nil {
							// Best effort: the course is still crawled, with an
							// empty learner count.
							log.Println("ERROR:", err)
						}
						ctx.AddQueue(&request.Request{
							Url:  "http://www.imooc.com" + href,
							Rule: "課程詳細信息",
							// Temp carries the list-page fields to the detail rule.
							Temp: map[string]interface{}{
								"CourseTitle":   card.Find(".course-card-name").Text(),
								"TechStack":     card.Find(".course-card-top span").Text(),
								"Introduction":  card.Find("p").Text(),
								"LearnerNumber": learnerNumber,
							},
						})
					})
				},
			},
			"課程詳細信息": {
				ItemFields: []string{
					"課程名稱",
					"課程分類",
					"課程簡介",
					"學習人數",
					"課程介紹",
					"課程路徑",
					"難度級別",
					"課程時長",
					"評分",
					"章節",
				},
				ParseFunc: func(ctx *Context) {
					dom := ctx.GetDom()
					chapterNodes := dom.Find(".mod-chapters > div")
					infos := dom.Find(".course-infos")

					summary := cleanHTML(dom.Find("div.course-brief").Text())
					coursePath := cleanHTML(infos.Find(".path").Text())
					difficulty := infos.Find("div.static-item").Eq(1).Find(".meta-value").Text()
					duration := infos.Find("div.static-item").Eq(2).Find(".meta-value").Text()

					// The score box is a flat run of <span>s. Span 0 labels span 1,
					// span 2 is stored under the fixed key, and spans 4, 6 and 8
					// label the spans just before them.
					spans := infos.Find(".score-btn").Find("span")
					spanText := func(i int) string { return spans.Eq(i).Text() }
					score := make(map[string]string, 5)
					score[spanText(0)] = spanText(1)
					score["評價數"] = spanText(2)
					score[spanText(4)] = spanText(3)
					score[spanText(6)] = spanText(5)
					score[spanText(8)] = spanText(7)

					// chapters maps a chapter title to its sections; each sections
					// map goes from section name to section URL.
					chapters := make(map[string]interface{})
					chapterNodes.Each(func(_ int, chapter *goquery.Selection) {
						// SetTemp/GetTemp store and read values on the current
						// request; the file uses them to show same-rule temp passing.
						ctx.SetTemp("ChapterH1", cleanHTML(chapter.Find("strong").After("i").Text()))

						// A fresh map per chapter, stored even when the chapter has
						// no sections, so a chapter never inherits the previous one.
						sections := make(map[string]string)
						chapter.Find("ul.video>li").Each(func(_ int, item *goquery.Selection) {
							link := item.Find("a")
							sectionURL, _ := link.Attr("href")
							sectionName := cleanHTML(link.After("button").Text())
							sections[sectionName] = "www.imooc.com" + cleanHTML(sectionURL)
						})
						ctx.SetTemp("JsonChapterH1", sections)

						chapters[ctx.GetTemp("ChapterH1", "").(string)] = ctx.GetTemp("JsonChapterH1", "")
					})

					// The chapter field stays nil when the page has no chapters.
					var chapterField interface{}
					if len(chapters) > 0 {
						chapterField = chapters
					}

					ctx.Output(map[int]interface{}{
						0: ctx.GetTemp("CourseTitle", "").(string),
						1: ctx.GetTemp("TechStack", "").(string),
						2: ctx.GetTemp("Introduction", "").(string),
						3: ctx.GetTemp("LearnerNumber", "").(string),
						4: summary,
						5: coursePath,
						6: difficulty,
						7: duration,
						8: score,
						9: chapterField,
					})
				},
			},
		},
	},
}

// fetchLearnerNumber requests the AjaxCourseMembers endpoint for courseID and
// returns the Numbers value of the first entry in the decoded response.
//
// It returns an empty string and a non-nil error when the request fails, the
// status is not 200, the body is not valid JSON for AjaxCourseMembers, or the
// response carries no data entries.
func fetchLearnerNumber(courseID string) (string, error) {
	// NOTE(review): http.Get uses http.DefaultClient, which has no timeout; a
	// stalled server blocks this parse call. Consider a client with a Timeout.
	resp, err := http.Get("http://www.imooc.com/course/AjaxCourseMembers?ids=" + courseID)
	if err != nil {
		return "", fmt.Errorf("requesting learner count of course %q: %v", courseID, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("requesting learner count of course %q: unexpected status %s", courseID, resp.Status)
	}

	members := &AjaxCourseMembers{}
	if err := json.NewDecoder(resp.Body).Decode(members); err != nil {
		return "", fmt.Errorf("decoding learner count of course %q: %v", courseID, err)
	}
	if len(members.Data) == 0 {
		return "", fmt.Errorf("learner count of course %q: response has no data entries", courseID)
	}
	return members.Data[0].Numbers, nil
}
// getnumbers returns the Numbers value of the first Data entry.
//
// It returns an empty string when the receiver is nil or Data is empty, so a
// failed or empty response no longer causes an index-out-of-range panic.
func (s *AjaxCourseMembers) getnumbers() string {
	if s == nil || len(s.Data) == 0 {
		return ""
	}
	return s.Data[0].Numbers
}
源碼說明
- 如上面爬取說明->2中說明,代碼第115行的 `temp` 字段臨時存放頁面1爬取到的字段,并傳遞到下一個Rule(這是跨Rule字段傳遞);代碼第172、174等行中的 `SetTemp()`、`GetTemp()` 在同一請求下存取以及獲取臨時字段(這是同Rule字段傳遞)。
- 如圖4,頁面2中的人數字段是通過Ajax動態加載的,所以需要通過相關接口單獨獲取JSON格式的人數字段。如圖5,在代碼第82、83行通過 `http.Get()` 返回Response。
- 在代碼100~107行用 `json.NewDecoder(resp.Body).Decode` 解析JSON。