只針對 <a href="www.xxxxx.com"> 形式的鏈接進行抓取。
package main
import (
	"encoding/xml"
	"fmt"
	"io/ioutil"
	"log"
	"math/rand"
	"net/http"
	"regexp"
	"runtime"
	"strings"
	"sync"
	"time"
)
// urlchannel carries the href values discovered by spy back to main.
// It is buffered with room for 200 pending hrefs.
var urlchannel = make(chan string, 200)

// atagRegExp matches one complete anchor element that has a quoted href
// attribute and a plain-text body (no nested tags), case-insensitively.
//
// The href must be an attribute of its own, i.e. directly preceded by
// whitespace, so names such as data-href or title are not mistaken for it,
// and "<a" must be followed by whitespace so tags like <abbr> are rejected.
//
// Functions prefixed with Must panic when they cannot succeed; that is
// acceptable here because the pattern is a compile-time constant.
var atagRegExp = regexp.MustCompile(`(?i)<a\s(?:[^>]*?\s)?href\s*=\s*(?:"[^"]*"|'[^']*')[^>]*>[^<]*</a>`)
// main seeds the crawl with one start page and then launches a spy goroutine
// for every new href received on urlchannel.
//
// Nothing in this file closes urlchannel, so the range loop is expected to run
// until the process is stopped; the final Println is only reached if the
// channel is closed elsewhere.
func main() {
	// Alternative seed kept from the original source:
	//go spy("http://www.3qzone.com/")
	const seed = "http://www.qidian.com/"

	// visited records every URL already handed to spy. Without it, pages that
	// link to each other would be fetched again and again, and the number of
	// goroutines would grow without bound.
	visited := map[string]struct{}{seed: {}}
	go spy(seed)

	for url := range urlchannel {
		// The runtime package exposes live figures such as the goroutine count.
		fmt.Println("routines num = ", runtime.NumGoroutine(), "chan len = ", len(urlchannel))

		if url == "" {
			// An anchor without a usable href; nothing to fetch.
			continue
		}
		if _, seen := visited[url]; seen {
			continue
		}
		visited[url] = struct{}{}
		go spy(url)
	}
	fmt.Println("a")
}
// spy downloads url with a randomly chosen User-Agent, extracts every anchor
// matched by atagRegExp from a 200 response and sends each non-empty href to
// urlchannel. Hrefs containing "article/details/" are printed with a "☆"
// marker, all others with "□".
//
// Failures are logged and end the call; a panic is recovered and logged so
// that one bad page cannot take the whole crawler down.
//
//noinspection GoTypesCompatibility
func spy(url string) {
	defer func() {
		if rec := recover(); rec != nil {
			log.Println("[E]", rec)
		}
	}()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Printf("building request for %q: %v", url, err)
		return
	}
	req.Header.Set("User-Agent", GetRandomUserAgent())

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// The original built this message with fmt.Errorf and dropped the
		// result, so the failure was never reported.
		log.Printf("Get請求%s返回錯誤:%s", url, err)
		return
	}
	// Close the body for every status code, not only 200, so the underlying
	// connection is always released.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return
	}

	bodyByte, err := ioutil.ReadAll(res.Body)
	if err != nil {
		log.Printf("reading body of %q: %v", url, err)
		return
	}

	for _, a := range atagRegExp.FindAllString(string(bodyByte), -1) {
		href, _ := GetHref(a)
		if href == "" {
			// GetHref could not extract a link; do not enqueue an empty URL.
			continue
		}
		if strings.Contains(href, "article/details/") {
			fmt.Println("☆", href)
		} else {
			fmt.Println("□", href)
		}
		urlchannel <- href
	}
}
// GetHref parses a single anchor element such as `<a href="/x">text</a>` and
// returns the value of its href attribute (matched case-insensitively) and
// the element's text content. Either result is empty when it is absent.
//
// If the input cannot be parsed to the end, whatever was collected before the
// error is returned.
func GetHref(atag string) (href, content string) {
	decoder := xml.NewDecoder(strings.NewReader(atag))
	// Anchors taken from real pages are HTML, not well-formed XML: URLs contain
	// raw '&', attributes may have no value, and HTML entities such as &nbsp;
	// appear in the text. These are the settings encoding/xml documents for
	// reading HTML; in strict mode such input fails and yields no href.
	decoder.Strict = false
	decoder.AutoClose = xml.HTMLAutoClose
	decoder.Entity = xml.HTMLEntity

	for {
		t, err := decoder.Token()
		if err != nil {
			// io.EOF at the end of input, or a syntax error part-way through.
			break
		}
		switch token := t.(type) {
		case xml.StartElement:
			// Opening tag: look for the href attribute.
			for _, attr := range token.Attr {
				if strings.EqualFold(attr.Name.Local, "href") {
					href = attr.Value
				}
			}
		case xml.CharData:
			// Character data: the text of the element.
			content = string(token)
		}
		// Other tokens (end tags, comments, directives, processing
		// instructions) are ignored; they no longer erase a href that was
		// already found.
	}
	return href, content
}
// userAgent lists the User-Agent header values that GetRandomUserAgent picks
// from.
//
// The tokens inside each parenthesised section are separated by semicolons,
// as in the User-Agent strings browsers send; the earlier comma-separated
// form (and the unbalanced parenthesis in the MSIE 9.0 entry) was malformed.
var userAgent = [...]string{
	"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT; DigExt)",
	"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
	"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
	"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
	"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
	"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
	"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
	"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
	"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
	"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
}
// r is the pseudo-random generator used to pick a User-Agent; it is seeded
// from the wall clock at start-up.
var r = rand.New(rand.NewSource(time.Now().UnixNano()))

// rMu serializes access to r. A *rand.Rand built with rand.New is not safe for
// concurrent use, and spy, which calls GetRandomUserAgent, runs in many
// goroutines at once.
var rMu sync.Mutex

// GetRandomUserAgent returns one entry of userAgent chosen at random.
// It is safe to call from multiple goroutines.
func GetRandomUserAgent() string {
	rMu.Lock()
	i := r.Intn(len(userAgent))
	rMu.Unlock()
	return userAgent[i]
}
這是一個還算完整的程序。用來取得一個指定的網站的 href 地址集。
下面,我們來一步一步分解它,分步實現它。
因為需要同時下載多個網頁,所以用到了 goroutine 和 channel。建立一個通道變量。
var urlchannel = make(chan string, 200) // several pages are downloaded at the same time, so goroutines and a channel are needed
在主函數 main 中,起步從域名開始訪問一個網站,并且最終要遍歷網站內所有的鏈接地址。所以需要建立一個函數,以便在函數內完成遍歷。這個函數我們取名 spy。于是在主函數中添加 spy 的 goroutine。
go spy("http://www.sohu.com")
同時建立 spy 函數。
// spy is declared here as an empty stub that takes the URL to visit.
func spy(url string) {
}
在 spy 內要實現:取得網址對應的網頁,并且分析網頁中的鏈接,再把取得的頁面信息或鏈接信息送入 urlchannel 通道。
在主函數中接收通道信息,利用 runtime 獲取當前運行時的一些相關參數信息。
func main(){
...
for url := range urlchannel {
fmt.Println("routines num =", runtime.NumGoroutine(), "chan len =", len(urlchannel))
...
}
...
在 spy 函數中,使用 defer 延遲一個匿名函數做錯誤處理。
defer func() {
if r := recover(); r != nil{
log.Println("[E]", r)
}
}()
以 HTTP GET 方式發送網絡訪問請求。
req, _ := http.NewRequest("GET", url, nil)
為了模擬不同的客戶端訪問,建立一個 userAgent 數組,預存各種客戶端環境,使用隨機取用的方式發送網絡請求。
// userAgent lists the User-Agent header values that GetRandomUserAgent picks
// from.
//
// The tokens inside each parenthesised section are separated by semicolons,
// as in the User-Agent strings browsers send; the earlier comma-separated
// form (and the unbalanced parenthesis in the MSIE 9.0 entry) was malformed.
var userAgent = [...]string{
	"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT; DigExt)",
	"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
	"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
	"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
	"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
	"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
	"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
	"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
	"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
	"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
	"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
	"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
}
// r is the pseudo-random generator used to pick a User-Agent; it is seeded
// from the wall clock at start-up.
var r = rand.New(rand.NewSource(time.Now().UnixNano()))

// rMu serializes access to r. A *rand.Rand built with rand.New is not safe for
// concurrent use, and spy, which calls GetRandomUserAgent, runs in many
// goroutines at once.
var rMu sync.Mutex

// GetRandomUserAgent returns one entry of userAgent chosen at random.
// It is safe to call from multiple goroutines.
func GetRandomUserAgent() string {
	rMu.Lock()
	i := r.Intn(len(userAgent))
	rMu.Unlock()
	return userAgent[i]
}
設置請求信息的 Header,利用 GetRandomUserAgent 函數隨機設置客戶端標識,然后使用 Client.Do 方法提交客戶端請求。
req.Header.Set("User-Agent", GetRandomUserAgent())
client := http.DefaultClient
res, e := client.Do(req)
根據 res 返回值,分析獲取的內容。并將有用的內容返回給通道。
if res.StatusCode == 200{
body := res.Body
defer body.Close()
bodyByte, _ := ioutil.ReadAll(body)
resStr := string(bodyByte)
atag := atagRegExp.FindAllString(resStr, -1)
for _, a := range atag{
href, _ := GetHref(a)
if strings.Contains(href, "article/details/"){
fmt.Println("☆", href)
}else{
fmt.Println("□", href)
}
urlchannel <- href
}
}
其中的 resStr 是網頁內容,而 GetHref 函數實現了對內容的分析,并返回鏈接和鏈接內容。
// GetHref parses a single anchor element such as `<a href="/x">text</a>` and
// returns the value of its href attribute (matched case-insensitively) and
// the element's text content. Either result is empty when it is absent.
//
// If the input cannot be parsed to the end, whatever was collected before the
// error is returned.
func GetHref(atag string) (href, content string) {
	decoder := xml.NewDecoder(strings.NewReader(atag))
	// Anchors taken from real pages are HTML, not well-formed XML: URLs contain
	// raw '&', attributes may have no value, and HTML entities such as &nbsp;
	// appear in the text. These are the settings encoding/xml documents for
	// reading HTML; in strict mode such input fails and yields no href.
	decoder.Strict = false
	decoder.AutoClose = xml.HTMLAutoClose
	decoder.Entity = xml.HTMLEntity

	for {
		t, err := decoder.Token()
		if err != nil {
			// io.EOF at the end of input, or a syntax error part-way through.
			break
		}
		switch token := t.(type) {
		case xml.StartElement:
			// Opening tag: look for the href attribute.
			for _, attr := range token.Attr {
				if strings.EqualFold(attr.Name.Local, "href") {
					href = attr.Value
				}
			}
		case xml.CharData:
			// Character data: the text of the element.
			content = string(token)
		}
		// Other tokens (end tags, comments, directives, processing
		// instructions) are ignored; they no longer erase a href that was
		// already found.
	}
	return href, content
}
如果你不想分析鏈接,也可以直接使用 resStr 作為通道的賦值。