Crawl every category of Ganji.com's second-hand marketplace and store the scraped information in a database.
The result looks like this:
[Screenshot: entry links for all categories]
[Screenshot: product detail information]
My code:
# Build channel_list.py to fetch the entry links for every category
import requests
from bs4 import BeautifulSoup

start_url = 'http://bj.ganji.com/wu/'

def get_channel_list(url):
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    channels = soup.select('dl.fenlei dt a')  # returns a list of <a> tags
    for channel in channels:
        base_url = 'http://bj.ganji.com'
        residue_url = channel.get('href')  # relative path such as '/jiaju/'
        full_url = base_url + residue_url
        print(full_url)

get_channel_list(start_url)
channel_list = '''
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''
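The links printed by the script are then pasted by hand into the channel_list string above. As a small variation (my own sketch, not part of the original code), get_channel_list could simply return the links so the copy-paste step disappears; main.py would then iterate the returned list directly instead of calling channel_list.split():

import requests
from bs4 import BeautifulSoup

def get_channel_list(url):
    # same parsing as above, but collect the full URLs instead of printing them
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    return ['http://bj.ganji.com' + a.get('href')
            for a in soup.select('dl.fenlei dt a')]

channel_list = get_channel_list('http://bj.ganji.com/wu/')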
# Build link_list_detail_info.py to collect every listing link in each category into the
# 'linklists' collection, and each listing's product details into the 'detailinfo' collection
import requests
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient
import random

client = MongoClient('localhost', 27017)
ganjiDB = client['ganjiDB']
linklists = ganjiDB['linklists']
detailinfo = ganjiDB['detailinfo']

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}

proxy_list = [
    'http://125.88.74.122:83',
    'http://113.18.193.5:8080',
    'http://113.18.193.7:8080',
    'http://120.92.3.127:90'
]
proxy_ip = random.choice(proxy_list)
proxies = {'http': proxy_ip}  # route requests through a proxy to dodge Ganji's per-IP rate limit
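Since random.choice runs once at import time, the whole crawl goes through a single proxy. A hedged sketch of per-request rotation (get_with_proxy is my own helper name, not in the original):

def get_with_proxy(url):
    # pick a fresh proxy for every request; free proxies die often,
    # so fall back to a direct connection when the proxy fails
    proxy = {'http': random.choice(proxy_list)}
    try:
        return requests.get(url, headers=headers, proxies=proxy, timeout=10)
    except requests.exceptions.RequestException:
        return requests.get(url, headers=headers, timeout=10)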
def page_link(channel):
    # each channel has two listing types (a1 = personal sellers, a2 = merchants)
    # and up to 100 pages each, e.g. http://bj.ganji.com/jiaju/a1o1
    for cate in range(1, 3):
        for page in range(1, 101):
            link_url = '{}a{}o{}'.format(channel, cate, page)
            link_list(link_url)

def link_list(url):
    time.sleep(2)
    web_data = requests.get(url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # soup.find('a', 'next') matches the "next page" anchor, e.g.
    # <a class="next" href="/jiaju/a1o31/"><span>下一頁</span></a>
    if soup.find('a', 'next') and url.split('/')[-1][1] == '1':
        # two conditions hold: this is not the last page, and it is a personal (a1) listing page
        lists = soup.select('td.t a.t')  # personal pages need a different selector than merchant pages
        for item in lists:
            list_href = item.get('href').split('?')[0]
            linklists.insert_one({'list_href': list_href})
            print(list_href)
    elif soup.find('a', 'next') and url.split('/')[-1][1] == '2':
        # two conditions hold: this is not the last page, and it is a merchant (a2) listing page
        lists = soup.select('a.ft-tit')  # merchant pages need a different selector than personal pages
        for item in lists:
            list_href = item.get('href')
            linklists.insert_one({'list_href': list_href})
            print(list_href)
    else:
        print('invalid list page URL')
#獲取每個(gè)頁(yè)面的具體信息
def get_detail_info(url):
web_data=requests.get(url,headers=headers)
soup=BeautifulSoup(web_data.text,'lxml')
if url[-5]=='x':
info={
'title':soup.select('h1.title-name')[0].text,
'date':soup.select('i.pr-5')[0].text.strip(),
'types':soup.select('ul > li > span > a')[5].text,
'price':soup.select('i.f22.fc-orange.f-type')[0].text,
'area':list(map(lambda x:x.text,soup.select('div > div > div > div > ul > li > a')[-3:-1])),
'url':url
}
detailinfo.insert_one(info)
print(info)
elif url[-7]=='z':
info={
'title':soup.select('h1.info_titile')[0].text,
'price':soup.select('span.price_now i')[0].text,
'area':soup.select('div.palce_li span i')[0].text,
'url':url
}
detailinfo.insert_one(info)
print(info)
else:
print('地址錯(cuò)誤')
# Build main.py to wire together channel_list.py, link_list_detail_info.py and the database collections
from channel_list import channel_list
from link_list_detail_info import linklists, page_link
from link_list_detail_info import detailinfo, get_detail_info
from multiprocessing import Pool
import time

def get_all_links(channel):
    page_link(channel)

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_links, channel_list.split())  # crawl every page of every channel in channel_list
    time.sleep(10)
    # resumable crawling: compute the sets after the crawl, and only fetch
    # details for links that are not in detailinfo yet
    db_urls = set(item['list_href'] for item in linklists.find())
    index_urls = set(item['url'] for item in detailinfo.find())
    rest_of_urls = db_urls - index_urls
    pool.map(get_detail_info, rest_of_urls)
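The resume step is nothing more than a set difference: every link we have collected, minus every link whose details are already stored. A toy illustration (data made up):

db_urls = {'u1', 'u2', 'u3'}   # all listing links in linklists
index_urls = {'u1', 'u3'}      # listings already in detailinfo
print(db_urls - index_urls)    # {'u2'} -> only u2 still needs fetching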
# Build count.py to monitor, in real time, how many links have been stored in linklists
from link_list_detail_info import linklists
import time

while True:
    print(linklists.find().count())
    time.sleep(10)
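Note that cursor.count() was deprecated in PyMongo 3.7; on newer versions the equivalent call is:

print(linklists.count_documents({}))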
監(jiān)控截圖:
監(jiān)控效果圖
總結(jié):
- Mongodb數(shù)據(jù)的基礎(chǔ)功能使用;
- 多進(jìn)程訪問(wèn)方式的引用;
- 數(shù)據(jù)庫(kù)查找的靈活調(diào)用實(shí)現(xiàn)斷點(diǎn)續(xù)傳;
- map、lambda函數(shù)的使用;
- proxy及headers防爬機(jī)制的使用。