1.代碼可以直接運行,請下載Anaconda並安裝,用Spyder方便查看變量
或者可以查看生成的excel文件
2.依賴庫,命令行運行(WIN10打開命令行快捷鍵:windows+x組合鍵,然後按a鍵):
pip install BeautifulSoup4
pip install requests
3.爬取的網站是安居客(廈門)網站,可以進入https://xm.fang.anjuke.com/loupan/all/進行觀察
4.關於如何判斷代碼是python2還是python3,print('')為python3,print ''為python2
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 14 19:07:39 2018
@author: Steven Lei
"""
def getHousesDetails(url, timeout=10):
    """Scrape one Anjuke listing page into a list of per-listing dicts.

    Parameters
    ----------
    url : str
        Address of a listing page, for example
        ``https://xm.fang.anjuke.com/loupan/all/p1/``.
    timeout : float or tuple, optional
        Forwarded to every ``requests.get`` call so that a stalled
        connection cannot block the crawl indefinitely. Defaults to 10.

    Returns
    -------
    list of dict
        One dict per listing, with the keys ``houseName``, ``price``,
        ``address``, ``houseSize``, ``saleStatus`` and ``houseType``.
        Every value is the raw text of the matching page element;
        ``houseSize`` is ``''`` when the listing has no size element.

    Raises
    ------
    IndexError
        If a listing lacks a name, price, address or tag element.

    Exceptions raised by ``requests.get`` (including timeouts) propagate
    to the caller.

    Each collected dict is also printed as it is produced.
    """
    import requests
    from bs4 import BeautifulSoup

    def fetchSoup(pageUrl):
        # Download a page and parse it; the response encoding is forced to
        # UTF-8 before the body text is decoded.
        response = requests.get(pageUrl, timeout=timeout)
        response.encoding = 'utf-8'
        return BeautifulSoup(response.text, 'lxml')

    listingSoup = fetchSoup(url)
    # The first three '.item-mod' elements are skipped; presumably they are
    # not real listings -- TODO confirm against the live page.
    houses = listingSoup.select('.item-mod')[3:]

    housesDetails = []
    for house in houses:
        # Development name.
        houseName = house.select('.items-name')[0].text

        # Price: fall back to '.price-txt' when there is no '.price' node.
        priceNodes = house.select('.price') or house.select('.price-txt')
        price = priceNodes[0].text

        # Address. A value ending in '.' appears to be a truncated address,
        # so the listing's detail page is consulted for the complete one.
        # endswith() is used so that an empty address cannot raise.
        address = house.select('.list-map')[0].text
        if address.endswith('.'):
            picNodes = house.select('.pic')
            href = picNodes[0].get('href') if picNodes else None
            if href:
                fullAddressNodes = fetchSoup(href).select('.lpAddr-text')
                # Keep the address from the listing page when the detail
                # page does not carry the expected element.
                if fullAddressNodes:
                    address = fullAddressNodes[0].text

        # Floor area: the last <span> under '.huxing', if any.
        sizeNodes = house.select('.huxing span')
        houseSize = sizeNodes[-1].text if sizeNodes else ''

        # Sale status and house type share the tag panel; select it once.
        tagNodes = house.select('.tag-panel i')
        saleStatus = tagNodes[0].text
        if len(tagNodes) == 2:
            houseType = tagNodes[1].text
        else:
            houseType = house.select('.tag-panel span')[0].text

        # Assemble the record for this listing.
        houseDetail = {
            'houseName': houseName,
            'price': price,
            'address': address,
            'houseSize': houseSize,
            'saleStatus': saleStatus,
            'houseType': houseType,
        }
        print(houseDetail)
        housesDetails.append(houseDetail)
    return housesDetails
def getAllHouseDetails(firstPage=1, lastPage=7,
                       urlTemplate='https://xm.fang.anjuke.com/loupan/all/p{}/'):
    """Scrape a range of listing pages and return them as one DataFrame.

    Parameters
    ----------
    firstPage : int, optional
        Number of the first page to fetch (inclusive). Defaults to 1.
    lastPage : int, optional
        Number of the last page to fetch (inclusive). Defaults to 7.
        When it is smaller than ``firstPage`` no page is fetched.
    urlTemplate : str, optional
        URL with a single ``{}`` placeholder that receives the page number.

    Returns
    -------
    pandas.DataFrame
        One row per listing dict returned by ``getHousesDetails``; empty
        when no page was fetched or no listing was found.
    """
    import pandas

    allHouseDetails = []
    for pageNumber in range(firstPage, lastPage + 1):
        pageUrl = urlTemplate.format(pageNumber)
        allHouseDetails.extend(getHousesDetails(pageUrl))
    return pandas.DataFrame(allHouseDetails)
if __name__ == '__main__':
    # Single-page check, kept for debugging:
    # houseDetails = getHousesDetails('https://xm.fang.anjuke.com/loupan/all/p1/')

    # Crawl the listing pages; the table is bound to a module-level name so
    # it stays available for inspection after the script has run.
    allHouseDetails = getAllHouseDetails()

    # Save the full table as a spreadsheet in the working directory.
    allHouseDetails.to_excel(excel_writer='anjukeHousesDetails.xlsx')

    # Show the first ten rows as a quick preview.
    print(allHouseDetails.head(n=10))