Python實戰計劃:爬取租房信息

Date:2016-9-21
update:2016-9-30
By:Black Crow

前言:

終于進入到網絡頁面的抓取了。前面一節課靜態頁面的作業做了之后總是有報錯,所以一直沒有單獨寫總結。聽課的時候就感覺到內容十分的吸引人,爬取的過程也是特別有意思,后面一節課關于select的條件上是有做優化的,比前一節課更高效。PPT里的地址已失效,所以隨便設定條件搜的短租房信息。

作業(yè)效果:

看著信息滾動的感覺其實挺爽的

房租信息.gif

20160921爬取的excel表格:鏈接: http://pan.baidu.com/s/1nvEVDvN 密碼: j4vt
20160922update表格:鏈接: http://pan.baidu.com/s/1c198fN6 密碼: kq4a
20160922update圖片:
各區女房東占多數.png

東城均價最高,通州均價最低.png

女房東的房子均價要高.png

我的代碼:

20160921代碼

from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Map the CSS class of a host's gender icon to a gender label.

    The earlier condition ``x == 'member_girl_ico' or 'member_ico1'`` was
    always true, because a non-empty string literal is truthy on its own,
    so every host came back as ``'girl'``.  Membership tests compare the
    argument against each candidate class name instead.

    Args:
        gender_lorder: CSS class name read from the gender icon element.

    Returns:
        ``'girl'``, ``'boy'``, or ``'unknown gender!'`` when the class name
        matches neither group.
    """
    if gender_lorder in ('member_girl_ico', 'member_ico1'):
        return 'girl'
    elif gender_lorder in ('member_boy_ico', 'member_ico'):
        return 'boy'
    else:
        return 'unknown gender!'
def info(url):
    """Fetch one listing detail page and print the fields scraped from it.

    Args:
        url: Address of the listing detail page to download and parse.
    """
    info_data = requests.get(url)
    info_soup = BeautifulSoup(info_data.text, 'lxml')
    titles = info_soup.select('div.con_l > div.pho_info > h4 > em')
    addresses = info_soup.select('div.pho_info > p')
    images_house = info_soup.select('img[id="curBigImage"]')
    days_fee = info_soup.select('div.day_l > span')
    urls_lorder = info_soup.select('div.member_pic > a > img')
    names_lorder = info_soup.select('div.w_240 > h6 > a')
    genders_lorder = info_soup.select('div.w_240 > h6 > span')
    # zip() stops at the shortest list, so a page on which any selector
    # matches nothing produces no record.
    for (title, address, image_house, day_fee,
         url_lorder, name_lorder, gender_lorder) in zip(
            titles, addresses, images_house, days_fee,
            urls_lorder, names_lorder, genders_lorder):
        data = {
            # get_text() takes a separator string, not a tag name; passing
            # 'em' would splice the literal text "em" between fragments.
            'title': title.get_text(),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            'gender_lorder': gender_change(str(gender_lorder.get('class')[0])),
            'day_fee': day_fee.get_text(),
        }
        print(data)
    # Short pause after each detail page.
    time.sleep(0.01)
# Search-result URLs for page numbers 0 through 14.
house_urls = [
    'http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21&endDate=2016-10-01'.format(i)
    for i in range(15)
]
for house_url in house_urls:
    wb_data = requests.get(house_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    detail_urls = soup.select('a[class="resule_img_a"]')
    # Pass the href of every matched anchor to info() for scraping.
    for detail_url in detail_urls:
        house_data = detail_url.get('href')
        info(house_data)
print('Done')


##### 20160922update代碼:修正了性別判斷
>```
from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Translate a gender-icon CSS class name into a gender label.

    Args:
        gender_lorder: CSS class name read from the gender icon element.

    Returns:
        ``'girl'`` for ``'member_girl_ico'``, ``'boy'`` for
        ``'member_boy_ico'``, otherwise ``'unknown gender!'``.
    """
    known_icons = (
        ('member_girl_ico', 'girl'),
        ('member_boy_ico', 'boy'),
    )
    for css_class, label in known_icons:
        if gender_lorder == css_class:
            return label
    return 'unknown gender!'
def info(url):
    """Fetch one listing detail page and print the fields scraped from it.

    Args:
        url: Address of the listing detail page to download and parse.
    """
    info_data = requests.get(url)
    info_soup = BeautifulSoup(info_data.text, 'lxml')
    titles = info_soup.select('div.con_l > div.pho_info > h4 > em')
    addresses = info_soup.select('div.pho_info > p')
    images_house = info_soup.select('img[id="curBigImage"]')
    days_fee = info_soup.select('div.day_l > span')
    urls_lorder = info_soup.select('div.member_pic > a > img')
    names_lorder = info_soup.select('div.w_240 > h6 > a')
    genders_lorder = info_soup.select('div.w_240 > h6 > span')
    # zip() stops at the shortest list, so a page on which any selector
    # matches nothing produces no record.
    for (title, address, image_house, day_fee,
         url_lorder, name_lorder, gender_lorder) in zip(
            titles, addresses, images_house, days_fee,
            urls_lorder, names_lorder, genders_lorder):
        data = {
            # get_text() takes a separator string, not a tag name; passing
            # 'em' would splice the literal text "em" between fragments.
            'title': title.get_text(),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            'gender_lorder': gender_change(gender_lorder.get('class')[0]),
            'day_fee': day_fee.get_text(),
        }
        print(data)
# Search-result URLs for page numbers 0 through 14.
SEARCH_PAGE = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21&endDate=2016-10-01'
house_urls = [SEARCH_PAGE.format(page_no) for page_no in range(15)]

for house_url in house_urls:
    listing_page = BeautifulSoup(requests.get(house_url).text, 'lxml')
    # Pass the href of every matched anchor to info() for scraping.
    for anchor in listing_page.select('a[class="resule_img_a"]'):
        info(anchor.get('href'))

print('Done')
20160924update:性別表述修改為male和female;去除標題中的換行符,避免影響數據處理;增加寫入本地文件;增加計數項,避免爬取過程中無聊。

from bs4 import BeautifulSoup
import requests
import time
def gender_change(gender_lorder):
    """Map the CSS class of a host's gender icon to a gender label.

    Args:
        gender_lorder: CSS class name read from the gender icon element.

    Returns:
        ``'female'`` for ``'member_girl_ico'``, ``'male'`` for
        ``'member_boy_ico'``, otherwise ``'unknown gender'``.
    """
    if gender_lorder == 'member_girl_ico':
        return 'female'
    elif gender_lorder == 'member_boy_ico':
        return 'male'
    else:
        return 'unknown gender'
def counter(last=[0]):
    """Advance a running count by one and return the new value.

    The mutable default list is deliberate: it is created once, when the
    function is defined, so it carries the count from one call to the next.
    Passing your own one-element list counts in that list instead.

    Args:
        last: One-element list holding the current count; updated in place.

    Returns:
        The count after incrementing.
    """
    # Named ``current`` rather than ``next`` so the builtin is not shadowed.
    current = last[0] + 1
    # Store the new value back so the following call continues from it.
    last[0] = current
    return current
def info(url, output_path='F://python/2/xiaozhu_data.txt'):
    """Scrape one listing detail page and append its record to a text file.

    Each record is written as a single ``;``-separated line in the order:
    title, daily fee, address, house image URL, host name, host gender,
    host avatar URL.

    Args:
        url: Address of the listing detail page to download and parse.
        output_path: File the records are appended to.  Defaults to the
            path the script was written with; change it to suit your
            machine.
    """
    info_data = requests.get(url)
    info_soup = BeautifulSoup(info_data.text, 'lxml')
    titles = info_soup.select('h4 em')
    addresses = info_soup.select('div.pho_info > p')
    images_house = info_soup.select('img[id="curBigImage"]')
    days_fee = info_soup.select('div.day_l > span')
    urls_lorder = info_soup.select('div.member_pic > a > img')
    names_lorder = info_soup.select('div.w_240 > h6 > a')
    genders_lorder = info_soup.select('div.w_240 > h6 > span')
    # zip() stops at the shortest list, so a page on which any selector
    # matches nothing produces no record.
    for (title, address, image_house, day_fee,
         url_lorder, name_lorder, gender_lorder) in zip(
            titles, addresses, images_house, days_fee,
            urls_lorder, names_lorder, genders_lorder):
        data = {
            'title': title.get_text(),
            'address': address.get('title'),
            'image_house': image_house.get('src'),
            'url_lorder': url_lorder.get('src'),
            'name_lorder': name_lorder.get_text(),
            'gender_lorder': gender_change(str(gender_lorder.get('class')[0])),
            'day_fee': day_fee.get_text(),
        }
        fields = [
            # Titles contain line breaks, which would split one record
            # across several lines of the output file.
            data['title'].replace("\n", ''),
            data['day_fee'],
            data['address'],
            data['image_house'],
            data['name_lorder'],
            data['gender_lorder'],
            data['url_lorder'],
        ]
        # Mode 'a' appends; 'w' would wipe what earlier pages wrote.
        with open(output_path, 'a', encoding='utf-8') as file:
            file.write(';'.join(fields) + '\n')
        print(counter())  # running record count, shown as progress
    # Short pause after each detail page.
    time.sleep(0.01)
# Search-result URLs for page numbers 0 through 14.  The two adjacent string
# literals are joined into one template before .format() is applied.
house_urls = [
    'http://bj.xiaozhu.com/search-duanzufang-p{}-0/?startDate=2016-09-21'
    '&endDate=2016-10-01'.format(i)
    for i in range(15)
]
for house_url in house_urls:
    wb_data = requests.get(house_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    detail_urls = soup.select('a[class="resule_img_a"]')
    # Pass the href of every matched anchor to info() for scraping.
    for detail_url in detail_urls:
        house_data = detail_url.get('href')
        info(house_data)
print('Done')

#### 總結:
>1. 該網站也設置了反爬措施,房屋圖片及房東圖片都采用了障眼法(假src,點擊鏈接后圖片下載了但是打不開),但是目前技術有限,繞不過去,只能是暫時擱置了。(update20160930圖片的地址是真實的,但是因為我默認瀏覽器為chrome,打開鏈接就直接下載了圖片,圖片無法打開,在該鏈接復制進IE瀏覽器后,發現原來可以顯示。霧~~~)
![1.png](http://upload-images.jianshu.io/upload_images/1059649-5555e1182aab31d6.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
2. 本來打算將爬取的數據寫入文檔,但實驗了多次后發現dict的轉換寫入方法還沒掌握,這個后續打算問問老師怎么處理比較妥當;此次作業的表格是采用傻瓜式處理的,復制粘貼到excel,然后用excel分列處理的。大致看了下短租房日租金以128-499區間的房屋最多,地址沒細作研究,但是覺得可以在excel里用地圖展現一下。(dict里的內容打印存儲到本地的坑已經填上,20160924update)
3. 代碼寫的時候是先寫的單個頁面的解析,后來寫的是房屋鏈接的采集,兩段代碼合并時稍微做了調整。
4. 速度有些慢,不知道是代碼原因還是本身數據爬取過程就比較慢的原因。sleep的時間還是設定了,比較短,以防萬一。
5. 性別一項抓取的數據都是girl,估計還是有問題,還沒有一個個細看是不是真的如此,但直覺是女性確實比較多。(此項已經修正,20160924update)
最后編輯于
©著作權歸作者所有,轉載或內容合作請聯系作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

推薦閱讀更多精彩內(nèi)容