1.源數據文件下載
用于進行回歸預測的源數據文件下載鏈接: https://pan.baidu.com/s/16-JGI-JnksC9I7I_ghvrug 密碼: ey46
2.編寫代碼并運行
1.第1次代碼修改并運行
下面一段代碼與之前預測評分只有0.83的文章相比,數據源多了房屋高度的分類和房屋建筑年代的分類。根據運行結果來看,這是一個有效的分類,提高了預測準確率。
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import math
from sklearn.model_selection import KFold
def cleanOutlier(data, column, mul=3):
    """Drop outlier rows from `data` based on the IQR of one column.

    Rows whose value in `column` lies outside
    [Q1 - mul*(Q3-Q1), Q3 + mul*(Q3-Q1)] are removed.  The thresholds are
    clamped to the observed min/max so they never extend past the data.

    Args:
        data: 2-D numpy ndarray of floats (rows are samples).
        column: index of the column used for outlier detection.
        mul: multiplier applied to the interquartile range (default 3).

    Returns:
        The ndarray sorted ascending by `column`, outlier rows removed.
    """
    data = data[data[:, column].argsort()]  # sort rows by the target column
    l = len(data)
    low = int(l / 4)       # lower-quartile index
    high = int(l / 4 * 3)  # upper-quartile index
    lowValue = data[low, column]
    highValue = data[high, column]
    print("下四分位數為{} 上四分位數{}".format(lowValue, highValue))
    iqr = highValue - lowValue
    # Clamp the deletion thresholds to the actual data range.
    delLowValue = max(lowValue - mul * iqr, data[0, column])
    delHighValue = min(highValue + mul * iqr, data[-1, column])
    print("刪除第{}列中數值小于{}或者大于{}的部分".format(column,
          delLowValue, delHighValue))
    # Default to keeping every row.  For tiny inputs (low == 0 or
    # high == l - 1) the search loops below never run, which previously
    # left recordLow/recordHigh unassigned and raised NameError.
    recordLow = 0
    recordHigh = l - 1
    for i in range(low):  # first kept row from the low end
        if data[i, column] >= delLowValue:
            recordLow = i
            break
    for i in range(l - 1, high, -1):  # last kept row from the high end
        if data[i, column] <= delHighValue:
            recordHigh = i
            break
    # Report what was removed.
    print("原矩陣共有{}行".format(l), end=',')
    print("保留{}到{}行".format(recordLow, recordHigh), end=',')
    data = data[recordLow:recordHigh + 1]
    print("刪除第{}列中的異常值后剩余{}行".format(column,
          recordHigh + 1 - recordLow))
    return data
# Load the cleaned Xiamen housing data, remove price outliers, then fit
# two regressors with 5-fold cross-validation, printing R^2 per fold.
df = pd.read_excel("廈門房價數據處理結果.xlsx")
data = cleanOutlier(df.values.astype('float'), 0)
x, y = data[:, 1:], data[:, 0]
# Log-transform the target to compress its range.
for idx in range(len(y)):
    y[idx] = math.log(y[idx])
kf = KFold(n_splits=5, shuffle=True)
for train_index, test_index in kf.split(x):
    train_x, test_x = x[train_index], x[test_index]
    train_y, test_y = y[train_index], y[test_index]
    # Multi-layer perceptron regressor.
    model_mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=(20, 20, 20), random_state=1)
    model_mlp.fit(train_x, train_y.ravel())
    mlp_score = model_mlp.score(test_x, test_y.ravel())
    print("sklearn多層感知器-回歸模型得分", mlp_score)
    # Gradient-boosted tree ensemble regressor.
    model_gbr = GradientBoostingRegressor(learning_rate=0.1)
    model_gbr.fit(train_x, train_y.ravel())
    gbr_score = model_gbr.score(test_x, test_y.ravel())
    print("sklearn集成-回歸模型得分", gbr_score)
上面一段代碼的運行結果為:
sklearn多層感知器-回歸模型得分 0.8372352880455567
sklearn集成-回歸模型得分 0.9103113202098574
sklearn多層感知器-回歸模型得分 0.8776823262293032
sklearn集成-回歸模型得分 0.915195456505515
sklearn多層感知器-回歸模型得分 0.8767527980991213
sklearn集成-回歸模型得分 0.9155513360324288
sklearn多層感知器-回歸模型得分 0.865894829352436
sklearn集成-回歸模型得分 0.907927739780212
sklearn多層感知器-回歸模型得分 0.8576044250407024
sklearn集成-回歸模型得分 0.9156697685167987
從上面的結果看出,在此次模型訓練中,集成-回歸模型優于多層感知器-回歸模型。
2.第2次代碼修改并運行
第2次代碼與第1次代碼主要的不同之處是對x即輸入變量做了標準化。根據運行結果來看,標準化提高了回歸模型的預測準確率。
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import math
from sklearn.model_selection import KFold
def cleanOutlier(data, column, mul=3):
    """Drop outlier rows from `data` based on the IQR of one column.

    Rows whose value in `column` lies outside
    [Q1 - mul*(Q3-Q1), Q3 + mul*(Q3-Q1)] are removed.  The thresholds are
    clamped to the observed min/max so they never extend past the data.

    Args:
        data: 2-D numpy ndarray of floats (rows are samples).
        column: index of the column used for outlier detection.
        mul: multiplier applied to the interquartile range (default 3).

    Returns:
        The ndarray sorted ascending by `column`, outlier rows removed.
    """
    data = data[data[:, column].argsort()]  # sort rows by the target column
    l = len(data)
    low = int(l / 4)       # lower-quartile index
    high = int(l / 4 * 3)  # upper-quartile index
    lowValue = data[low, column]
    highValue = data[high, column]
    print("下四分位數為{} 上四分位數{}".format(lowValue, highValue))
    iqr = highValue - lowValue
    # Clamp the deletion thresholds to the actual data range.
    delLowValue = max(lowValue - mul * iqr, data[0, column])
    delHighValue = min(highValue + mul * iqr, data[-1, column])
    print("刪除第{}列中數值小于{}或者大于{}的部分".format(column,
          delLowValue, delHighValue))
    # Default to keeping every row.  For tiny inputs (low == 0 or
    # high == l - 1) the search loops below never run, which previously
    # left recordLow/recordHigh unassigned and raised NameError.
    recordLow = 0
    recordHigh = l - 1
    for i in range(low):  # first kept row from the low end
        if data[i, column] >= delLowValue:
            recordLow = i
            break
    for i in range(l - 1, high, -1):  # last kept row from the high end
        if data[i, column] <= delHighValue:
            recordHigh = i
            break
    # Report what was removed.
    print("原矩陣共有{}行".format(l), end=',')
    print("保留{}到{}行".format(recordLow, recordHigh), end=',')
    data = data[recordLow:recordHigh + 1]
    print("刪除第{}列中的異常值后剩余{}行".format(column,
          recordHigh + 1 - recordLow))
    return data
# Same pipeline as the first run, with one change: the input features are
# standardized (zero mean, unit variance) per fold before model fitting.
df = pd.read_excel("廈門房價數據處理結果.xlsx")
data = cleanOutlier(df.values.astype('float'), 0)
x, y = data[:, 1:], data[:, 0]
# Log-transform the target to compress its range.
for idx in range(len(y)):
    y[idx] = math.log(y[idx])
kf = KFold(n_splits=5, shuffle=True)
for train_index, test_index in kf.split(x):
    train_x, test_x = x[train_index], x[test_index]
    train_y, test_y = y[train_index], y[test_index]
    # Fit the scaler on the training fold only, then apply to both splits.
    ss_x = preprocessing.StandardScaler()
    train_x = ss_x.fit_transform(train_x)
    test_x = ss_x.transform(test_x)
    # Multi-layer perceptron regressor.
    model_mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=(20, 20, 20), random_state=1)
    model_mlp.fit(train_x, train_y.ravel())
    mlp_score = model_mlp.score(test_x, test_y.ravel())
    print("sklearn多層感知器-回歸模型得分", mlp_score)
    # Gradient-boosted tree ensemble regressor.
    model_gbr = GradientBoostingRegressor(learning_rate=0.1)
    model_gbr.fit(train_x, train_y.ravel())
    gbr_score = model_gbr.score(test_x, test_y.ravel())
    print("sklearn集成-回歸模型得分", gbr_score)
上面一段代碼的運行結果為:
sklearn多層感知器-回歸模型得分 0.9420052610363624
sklearn集成-回歸模型得分 0.9119298712798816
sklearn多層感知器-回歸模型得分 0.9408811404890329
sklearn集成-回歸模型得分 0.9119071943013952
sklearn多層感知器-回歸模型得分 0.9453408533881785
sklearn集成-回歸模型得分 0.9183101348039411
sklearn多層感知器-回歸模型得分 0.9420925829808715
sklearn集成-回歸模型得分 0.909328953608675
sklearn多層感知器-回歸模型得分 0.9427244328757453
sklearn集成-回歸模型得分 0.9106290975464613
從上面的結果看出,輸入變量x進行標準化之后提高了多層感知器-回歸模型的得分,這次訓練結果多層感知器-回歸模型優于集成-回歸模型
3.第3次代碼修改并運行
第3次代碼與第2次代碼主要的不同之處是調整了回歸模型的參數。從運行結果看出,調參使集成-回歸模型得分從0.90提升到0.95,提高了0.05,是一次成功的調參。
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import math
from sklearn.model_selection import KFold
def cleanOutlier(data, column, mul=3):
    """Drop outlier rows from `data` based on the IQR of one column.

    Rows whose value in `column` lies outside
    [Q1 - mul*(Q3-Q1), Q3 + mul*(Q3-Q1)] are removed.  The thresholds are
    clamped to the observed min/max so they never extend past the data.

    Args:
        data: 2-D numpy ndarray of floats (rows are samples).
        column: index of the column used for outlier detection.
        mul: multiplier applied to the interquartile range (default 3).

    Returns:
        The ndarray sorted ascending by `column`, outlier rows removed.
    """
    data = data[data[:, column].argsort()]  # sort rows by the target column
    l = len(data)
    low = int(l / 4)       # lower-quartile index
    high = int(l / 4 * 3)  # upper-quartile index
    lowValue = data[low, column]
    highValue = data[high, column]
    print("下四分位數為{} 上四分位數{}".format(lowValue, highValue))
    iqr = highValue - lowValue
    # Clamp the deletion thresholds to the actual data range.
    delLowValue = max(lowValue - mul * iqr, data[0, column])
    delHighValue = min(highValue + mul * iqr, data[-1, column])
    print("刪除第{}列中數值小于{}或者大于{}的部分".format(column,
          delLowValue, delHighValue))
    # Default to keeping every row.  For tiny inputs (low == 0 or
    # high == l - 1) the search loops below never run, which previously
    # left recordLow/recordHigh unassigned and raised NameError.
    recordLow = 0
    recordHigh = l - 1
    for i in range(low):  # first kept row from the low end
        if data[i, column] >= delLowValue:
            recordLow = i
            break
    for i in range(l - 1, high, -1):  # last kept row from the high end
        if data[i, column] <= delHighValue:
            recordHigh = i
            break
    # Report what was removed.
    print("原矩陣共有{}行".format(l), end=',')
    print("保留{}到{}行".format(recordLow, recordHigh), end=',')
    data = data[recordLow:recordHigh + 1]
    print("刪除第{}列中的異常值后剩余{}行".format(column,
          recordHigh + 1 - recordLow))
    return data
# Third run: same standardized pipeline, but with tuned hyperparameters —
# a wider MLP (30,30,30) and a larger, faster-learning boosted ensemble.
df = pd.read_excel("廈門房價數據處理結果.xlsx")
data = cleanOutlier(df.values.astype('float'), 0)
x, y = data[:, 1:], data[:, 0]
# Log-transform the target to compress its range.
for idx in range(len(y)):
    y[idx] = math.log(y[idx])
kf = KFold(n_splits=5, shuffle=True)
for train_index, test_index in kf.split(x):
    train_x, test_x = x[train_index], x[test_index]
    train_y, test_y = y[train_index], y[test_index]
    # Fit the scaler on the training fold only, then apply to both splits.
    ss_x = preprocessing.StandardScaler()
    train_x = ss_x.fit_transform(train_x)
    test_x = ss_x.transform(test_x)
    # Multi-layer perceptron regressor (tuned: three hidden layers of 30).
    model_mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=(30, 30, 30), random_state=1)
    model_mlp.fit(train_x, train_y.ravel())
    mlp_score = model_mlp.score(test_x, test_y.ravel())
    print("sklearn多層感知器-回歸模型得分", mlp_score)
    # Gradient-boosted ensemble (tuned: 1000 trees, learning rate 0.3).
    model_gbr = GradientBoostingRegressor(n_estimators = 1000, learning_rate=0.3)
    model_gbr.fit(train_x, train_y.ravel())
    gbr_score = model_gbr.score(test_x, test_y.ravel())
    print("sklearn集成-回歸模型得分", gbr_score)
上面一段代碼的運行結果為:
sklearn多層感知器-回歸模型得分 0.9409698368795202
sklearn集成-回歸模型得分 0.9572998845800237
sklearn多層感知器-回歸模型得分 0.9415776157615223
sklearn集成-回歸模型得分 0.9550918318968675
sklearn多層感知器-回歸模型得分 0.9477305164540656
sklearn集成-回歸模型得分 0.9601481639665502
sklearn多層感知器-回歸模型得分 0.9476815842932507
sklearn集成-回歸模型得分 0.9588259351144909
sklearn多層感知器-回歸模型得分 0.9399513836020602
sklearn集成-回歸模型得分 0.9578897231281281
兩個模型的評分到0.95左右,可以算是比較準確的模型,模型訓練就到此告一段落。