目的是要爬取書單的標題、每個書單中書的數目,以及書單的簡介
(1)配置items文件
import scrapy


class DuyuanItem(scrapy.Item):
    """Container for the data scraped by the crawler.

    Declare one ``scrapy.Field()`` per value you want to collect; adjust the
    list below to match the data you actually scrape.  In this article's
    spider only ``book_list_title``, ``book_number`` and
    ``book_list_summary`` are filled in.
    """

    # Fields whose names refer to a book list.
    book_list_title = scrapy.Field()
    book_number = scrapy.Field()
    book_list_author = scrapy.Field()
    book_list_date = scrapy.Field()
    book_list_summary = scrapy.Field()

    # Fields whose names refer to a single book.
    book_url = scrapy.Field()
    book_name = scrapy.Field()
    book_author = scrapy.Field()
    book_summary = scrapy.Field()
(2)配置settings文件
# Required basic setting: do not obey the target site's robots.txt.
ROBOTSTXT_OBEY = False

# Entry point of the pipelines file: enable the pipeline class with
# order value 300.
ITEM_PIPELINES = {
    'duyuan.pipelines.DuyuanPipeline': 300,
}

# MongoDB parameters: server address, then database and collection names.
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'duyuan'
MONGODB_DOCNAME = 'bookitem'
(3)配置pipelines文件
import pymongo
from scrapy.utils.project import get_project_settings


class DuyuanPipeline(object):
    """Item pipeline that stores every scraped item in a MongoDB collection.

    The connection is configured through the ``MONGODB_HOST``,
    ``MONGODB_PORT``, ``MONGODB_DBNAME`` and ``MONGODB_DOCNAME`` settings.
    Pipelines generally follow this same pattern, so it can be copied and
    adapted.
    """

    def __init__(self, settings=None):
        """Connect to MongoDB and select the target collection.

        Args:
            settings: Optional mapping-like Scrapy settings.  When omitted,
                the project settings are loaded so that constructing the
                pipeline without arguments keeps working.
        """
        if settings is None:
            # Replaces the ``scrapy.conf.settings`` global, which current
            # Scrapy releases no longer provide.
            settings = get_project_settings()
        host = settings['MONGODB_HOST']
        # int() tolerates a port supplied as a string (e.g. an override
        # given on the command line).
        port = int(settings['MONGODB_PORT'])
        db_name = settings['MONGODB_DBNAME']
        # Keep the client on the instance so it can be closed later.
        self.client = pymongo.MongoClient(host=host, port=port)
        db = self.client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the running crawler's settings."""
        return cls(crawler.settings)

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert *item* as one document and pass it on unchanged.

        Args:
            item: The scraped item; it is converted to a plain ``dict``
                before being stored.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, so that later pipelines can process it.
        """
        book_info = dict(item)
        # insert_one() replaces Collection.insert(), which current PyMongo
        # releases no longer provide.
        self.post.insert_one(book_info)
        return item
(4)配置爬蟲文件
import scrapy
from duyuan.items import DuyuanItem


class ReadcolorSpider(scrapy.Spider):
    """Crawl the readcolor.com list index and each list's detail page.

    ``parse`` reads the title, book count and link of every list on the
    index page; ``parse_book_list_detail`` then adds the description found
    on the linked page and emits the finished item.
    """

    name = "readcolor"
    allowed_domains = ["readcolor.com"]
    start_urls = ['http://readcolor.com/lists']
    # Site root.  No longer used for building URLs (see parse()), but kept
    # so that code referring to ``ReadcolorSpider.url`` keeps working.
    url = 'http://readcolor.com'

    def parse(self, response):
        """Yield one detail-page request per list found on the index page.

        Args:
            response: The downloaded index page.

        Yields:
            scrapy.Request: A request for the list's detail page, carrying
            the partially filled item in ``meta['item']``.
        """
        book_list_group = response.xpath('//article[@style="margin:10px 0 20px;"]')
        for each in book_list_group:
            # extract_first() returns None instead of raising IndexError
            # when a node is missing, so one malformed entry cannot abort
            # the whole page.
            book_list_url = each.xpath('header/h3/a/@href').extract_first()
            if not book_list_url:
                # Without a link there is no detail page to follow.
                continue

            item = DuyuanItem()
            item['book_list_title'] = each.xpath('header/h3/a/text()').extract_first()
            item['book_number'] = each.xpath('p/a/text()').extract_first()

            # The href is relative; urljoin() resolves it against the
            # current page and also copes with absolute links, unlike plain
            # string concatenation.
            yield scrapy.Request(
                response.urljoin(book_list_url),
                callback=self.parse_book_list_detail,
                # Kept from the original: skip duplicate filtering so every
                # item gets its own detail request.
                dont_filter=True,
                meta={'item': item},
            )

    def parse_book_list_detail(self, response):
        """Add the description from a detail page and emit the item.

        Args:
            response: The downloaded detail page; ``response.meta['item']``
                holds the item started in ``parse``.

        Yields:
            DuyuanItem: The item with ``book_list_summary`` filled in.  The
            summary is an empty string when no description is found.
        """
        item = response.meta['item']
        summary = response.xpath('//div[@id="list-description"]/p/text()').extract()
        item['book_list_summary'] = '\n'.join(summary)
        yield item
最后編輯于 :
©著作權歸作者所有,轉載或內容合作請聯繫作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書係信息發布平臺,僅提供信息存儲服務。