爬取整個網頁
# 新建爬蟲文件并編輯
[root@bogon http]# vi crawler.js
# 文件內容
// Fetch a single page over HTTP and print its complete HTML to stdout.
var http = require('http')
var url = 'http://zhangdanyang.com/'
http.get(url,function(res){
    var html = ''
    // Decode at the stream level: appending raw Buffer chunks to a string
    // converts each chunk on its own, which corrupts any multi-byte UTF-8
    // character that happens to be split across two chunks.
    res.setEncoding('utf8')
    res.on('data',function(chunk){
        html+=chunk
    })
    // The body is only complete once 'end' fires, so print it here.
    res.on('end',function(){
        console.log(html)
    })
}).on('error',function(err){
    console.log('獲取數(shù)據(jù)失敗!')
    // Report the underlying cause instead of silently discarding it.
    console.error(err.message)
})
# 運行結果是打印頁面html代碼
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>...</title>
</head>
<body>
...
</body>
</html>
爬取慕課網課程實例
http://www.imooc.com/learn/348
【代碼】
var http = require('http')
var cheerio = require('cheerio')
var url = 'http://www.imooc.com/learn/348'
/**
 * Extract the chapter/video outline from a course page's HTML.
 *
 * The selectors used here ('.chapter', 'strong', 'ul', 'a') are tied to the
 * markup of the page being scraped.
 *
 * Returned data structure:
 *   [{
 *     chapterTitle: '',
 *     videos: [
 *       { title: '', id: '' }
 *     ]
 *   }]
 *
 * @param {string} html - HTML source of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *   One entry per '.chapter' element. A video's `id` is the part of the link's
 *   href after 'video/', or undefined when the href does not contain 'video/'.
 */
function filterChapters(html){
    var $ = cheerio.load(html)
    var courseData = []

    $('.chapter').each(function(){
        var chapter = $(this)
        // Keep only the text nodes (nodeType 3) directly inside <strong>, so
        // text belonging to nested elements is left out of the title.
        var chapterTitle = chapter.find('strong').contents().filter(function() { return this.nodeType === 3; }).text().trim();
        var chapterData = {
            chapterTitle:chapterTitle,
            videos:[]
        }

        chapter.find('ul').children().each(function(){
            var video = $(this).find('a')
            var href = video.attr('href')
            // A list item without a link has nothing to report; skip it
            // rather than throwing on href.split().
            if (typeof href !== 'string') {
                return
            }

            // text() concatenates the text of every nested tag, so split it
            // into lines and keep only the leading ones that are needed.
            var lines = video.text().trim().split('\n')
            var videoTitle = lines[0].trim()
            // The second line is optional; only append it when it exists.
            if (lines.length > 1) {
                videoTitle += ' ' + lines[1].trim()
            }

            chapterData.videos.push({
                title:videoTitle,
                id:href.split('video/')[1]
            })
        })

        courseData.push(chapterData)
    })

    return courseData
}
/**
 * Print a course outline to stdout: each chapter title, followed by one
 * line per video in the form 【id】title. Every printed string ends with an
 * extra newline character.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{id: *, title: *}>}>} courseData
 *   Chapters to print, in order.
 */
function printCourseInfo(courseData){
    // Prints a single video line: the id wrapped in 【】, then the title.
    function printVideo(entry){
        var line = '【' + entry.id + '】' + entry.title + '\n'
        console.log(line)
    }

    // Prints the chapter heading, then each of the chapter's videos.
    function printChapter(chapter){
        var heading = chapter.chapterTitle + '\n'
        console.log(heading)
        chapter.videos.forEach(function(entry){
            printVideo(entry)
        })
    }

    courseData.forEach(function(chapter){
        printChapter(chapter)
    })
}
// Download the course page, extract its chapter outline and print it.
http.get(url,function(res){
    var html = ''
    // Decode at the stream level: appending raw Buffer chunks to a string
    // converts each chunk on its own, which corrupts any multi-byte UTF-8
    // character that happens to be split across two chunks.
    res.setEncoding('utf8')
    res.on('data',function(chunk){
        html+=chunk
    })
    // Parsing must wait for 'end'; before that the markup is incomplete.
    res.on('end',function(){
        var courseData = filterChapters(html)
        printCourseInfo(courseData)
    })
}).on('error',function(err){
    console.log('獲取數(shù)據(jù)失敗!')
    // Report the underlying cause instead of silently discarding it.
    console.error(err.message)
})
【結果】
[root@bogon http]# node crawler
第1章 前言
【6687】1-1 Node.js基礎-前言 (01:20)
……(省略掉中間內(nèi)容)
【8837】5-12 Node.js:request方法 (17:56)