Bagging and Boosting

Load the data

import pandas as pd

# load the UCI Wine dataset (no header row; column 0 is the class label)
df_wine = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol', 
                   'Malic acid', 'Ash', 
                   'Alcalinity of ash', 'Magnesium', 
                   'Total phenols', 'Flavanoids', 
                   'Nonflavanoid phenols', 'Proanthocyanins', 
                   'Color intensity', 'Hue', 
                   'OD280/OD315 of diluted wines', 'Proline']
y = df_wine['Class label'].values
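
A quick inspection of the loaded data (illustrative, not in the original post) to confirm what we are working with:

# the Wine dataset has 178 samples, 13 features plus the class label, and 3 classes
print(df_wine.shape)
print(df_wine['Class label'].value_counts())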

Feature selection

To make the later visualization easier, we keep only 2 features, selected by the correlation coefficient between each feature and the target y.

# pearsonr returns the correlation coefficient and the p-value;
# p < 0.01 means the correlation is statistically significant
from scipy.stats import pearsonr

label = df_wine.values[:, 0]
lr = []
for i, col in enumerate(df_wine.values.T):
    lr.append([pearsonr(label, col)[0], i])
lr.sort()
# lr[-1] is the class label column itself (r = 1), so take the most negatively
# correlated feature (lr[0]) and the most positively correlated one (lr[-2])
X = df_wine[[df_wine.columns[lr[0][1]], df_wine.columns[lr[-2][1]]]].values
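
To see which two columns the correlation criterion actually picked, a small illustrative check:

# print the names of the two selected features
print(df_wine.columns[lr[0][1]], df_wine.columns[lr[-2][1]])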

Dimensionality reduction with PCA is another option, but in this example the classification performance after PCA is not good.

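The PCA code itself is not shown in the original; a minimal sketch of what it might look like, assuming the 13 features are standardized and then reduced to 2 principal components:

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# sketch (not the original code): standardize the 13 features, keep 2 principal components
X_all = df_wine.iloc[:, 1:].values
X = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X_all))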

Since class labels are available here, LDA (which is supervised) can also be used to project the data down to 2 dimensions; it works well, and classification on the LDA projection reaches 100% accuracy.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# use all 13 feature columns, then project onto 2 linear discriminants
X = df_wine.iloc[:, 1:].values
lda = LinearDiscriminantAnalysis(n_components=2)
X = lda.fit(X, y).transform(X)
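
As a rough check of that 100% figure (purely illustrative, not from the original post), one could cross-validate a decision tree on the 2-D LDA projection:

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# illustrative check: 10-fold CV accuracy of a decision tree on the LDA-projected features
scores = cross_val_score(DecisionTreeClassifier(criterion="entropy", random_state=10), X, y, cv=10)
print(scores.mean())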

Hyperparameter tuning: here we tune only the decision tree's max_depth.

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

# hold out 30% of the data as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
param_test1 = {'max_depth': range(1, 20, 1)}
gsearch1 = GridSearchCV(estimator=DecisionTreeClassifier(criterion="entropy",
                                                         random_state=10),
                        param_grid=param_test1, cv=10)
gsearch1.fit(X_train, y_train)
# print(gsearch1.cv_results_)  # full per-parameter cross-validation scores
print(gsearch1.best_params_)
print(gsearch1.best_score_)

Output

{'max_depth': 8}
0.822580645161
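
The commented-out cv_results_ line above holds the full per-candidate scores (in older scikit-learn this was grid_scores_); a small sketch of how to inspect them:

import pandas as pd

# sketch: mean cross-validation accuracy for each max_depth tried
cv_df = pd.DataFrame(gsearch1.cv_results_)
print(cv_df[['param_max_depth', 'mean_test_score']])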

Measure the accuracy of a single decision tree

# measure the accuracy of a single decision tree using the tuned max_depth
from sklearn.metrics import accuracy_score
tree = DecisionTreeClassifier(criterion="entropy", max_depth=gsearch1.best_params_['max_depth'])
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))

Decision tree train/test accuracies 0.984/0.815

# build a bagging ensemble of 50 decision trees; see the official documentation for the other parameters
# (note: in scikit-learn >= 1.2 the base_estimator argument is named estimator)
bag = BaggingClassifier(base_estimator=tree, n_estimators=50,
                        max_samples=1.0, max_features=1.0,
                        bootstrap=True, bootstrap_features=False,
                        n_jobs=1, random_state=1)

# measure the accuracy of the bagging classifier
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('Bagging train/test accuracies %.3f/%.3f' % (bag_train, bag_test))

The bagging classifier does perform a bit better than the single decision tree on the test set.

Bagging train/test accuracies 1.000/0.852
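
As a side note (not in the original post), BaggingClassifier can also report an out-of-bag accuracy estimate, computed from the samples left out of each bootstrap draw, which approximates test accuracy without touching the held-out set; a minimal sketch:

# sketch: refit with oob_score=True and read the out-of-bag accuracy estimate
bag_oob = BaggingClassifier(base_estimator=tree, n_estimators=50,
                            bootstrap=True, oob_score=True, random_state=1)
bag_oob = bag_oob.fit(X_train, y_train)
print('Bagging OOB accuracy estimate %.3f' % bag_oob.oob_score_)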

Boosting classifier: Bagging combines independently trained base learners by averaging their votes, whereas Boosting trains them sequentially, reweighting the training samples so that each new learner concentrates on the examples its predecessors misclassified, and then combines them by weighted voting. AdaBoost is used below.

# AdaBoost with 1000 boosting rounds and a small learning rate (AdaBoostClassifier was imported above)
ada = AdaBoostClassifier(base_estimator=tree, n_estimators=1000, learning_rate=0.1, random_state=0)
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))
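
Finally, since only 2 features were kept precisely to make visualization easy, here is a hedged sketch (not from the original post) of how the three decision boundaries could be plotted side by side:

import numpy as np
import matplotlib.pyplot as plt

# sketch: decision regions of the tree, bagging, and AdaBoost models
# (only meaningful when X_train has exactly 2 columns)
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                     np.linspace(y_min, y_max, 200))

fig, axes = plt.subplots(1, 3, figsize=(12, 4), sharex=True, sharey=True)
for ax, clf, title in zip(axes, [tree, bag, ada],
                          ['Decision tree', 'Bagging', 'AdaBoost']):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3)
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=20)
    ax.set_title(title)
plt.show()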