First Steps in Machine Learning with Python, Part 2

# coding: utf-8

# ++++++++++++++++++++++++++++++++++++++++++++++++++++
# machine_five_ldmwp.py
# @Summary: machine learning and data mining with Python
# @Author: Glen
# @Date: 2016.8.16
# @Source: Learning Data Mining with Python
# ++++++++++++++++++++++++++++++++++++++++++++++++++++

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import as_float_array
from sklearn.pipeline import Pipeline

# ---------------------------------------
# Extracting features with transformers
# ----------------------------------------

# A model is a simplification of the world, and feature extraction is no
# different. Reducing complexity has benefits, but a simplification also
# discards many details.

# This example uses the Adult dataset to predict whether a person's
# annual income exceeds $50,000.

# 1. Load the data
adult_filename = r'E:\data\bigdata\adult\adult.data'
adult = pd.read_csv(adult_filename, header=None, names=["Age", "Work-Class", "fnlwgt", "Education",
                                                        "Education-Num", "Marital-Status", "Occupation",
                                                        "Relationship", "Race", "Sex", "Capital-gain",
                                                        "Capital-loss", "Hours-per-week", "Native-Country",
                                                        "Earnings-Raw"])

# 2. Data cleaning
# Drop rows in which every field is missing (how='all' only removes
# completely empty rows)
adult.dropna(how='all', inplace=True)
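# Note that adult.data encodes missing values as ' ?' (with a leading space,
# because fields are separated by ', '), which dropna() does not catch. A
# quick check of how many such placeholders each column contains, assuming
# that encoding:
missing_counts = (adult == ' ?').sum()
print(missing_counts[missing_counts > 0])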

# 3. Exploratory data analysis
# Descriptive statistics
print(adult["Hours-per-week"].describe())

print(adult["Work-Class"].unique())

# 3'. Demonstrating scikit-learn's approach to feature selection
X = np.arange(30).reshape((10, 3))
X[:, 1] = 1
# Note: the second column of X is now all 1s

# Use VarianceThreshold() to remove features whose variance is below a threshold
vt = VarianceThreshold()
Xt = vt.fit_transform(X)
# The second column has now been removed, because its variance is zero
print(vt.variances_)
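# The default threshold is 0.0, so only constant columns are dropped. A
# sketch with an explicit cut-off (5.0 is an arbitrary value chosen for
# illustration; here it still removes only the constant column):
vt_strict = VarianceThreshold(threshold=5.0)
Xt_strict = vt_strict.fit_transform(X)
print(Xt_strict.shape)  # features with variance below 5.0 are gone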

# Back to the adult example: selecting the best features
X = adult[["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]].values
y = (adult["Earnings-Raw"] == ' >50K').values

# Build the selector
transformer = SelectKBest(score_func=chi2, k=3)
Xt_chi2 = transformer.fit_transform(X, y)

# Conclusion: the highest chi2 scores belong to the first, third, and
# fourth columns (Age, Capital-gain, Capital-loss)
print(transformer.scores_)
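# To confirm which columns were kept, SelectKBest provides get_support()
# (a quick check, not part of the original walkthrough):
print(transformer.get_support(indices=True))  # indices of the 3 selected columns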

# Features can also be selected with the Pearson correlation coefficient,
# using the pearsonr() function from the SciPy library

# Define the scoring function. SelectKBest expects score_func to take (X, y)
# and return one (score, p-value) pair per feature.
def multivariate_pearsonr(X, y):
    scores, pvalues = [], []
    for column in range(X.shape[1]):
        # pearsonr() only handles 1-D arrays, so score each column separately
        cur_score, cur_p = pearsonr(X[:, column], y)
        # Take the absolute value so strong negative correlations also rank highly
        scores.append(abs(cur_score))
        pvalues.append(cur_p)
    return (np.array(scores), np.array(pvalues))

transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)
Xt_pearson = transformer.fit_transform(X, y)
print(transformer.scores_)

# Use a CART classifier (a decision tree) to compare the accuracy obtained
# with each set of selected features
clf = DecisionTreeClassifier(random_state=14)
scores_chi2 = cross_val_score(clf, Xt_chi2, y, scoring='accuracy')
scores_pearson = cross_val_score(clf, Xt_pearson, y, scoring='accuracy')

print("Chi2 performance: {0:.3f}".format(scores_chi2.mean()))
print("Pearson performance: {0:.3f}".format(scores_pearson.mean()))

# Creating your own transformer
# The transformer API is simple: it takes data in one format as input and
# returns data in another format as output.

# A transformer has two key methods:
# - fit(): takes the training data and sets internal parameters
# - transform(): performs the conversion; it accepts either the training
#   dataset or new data in the same format

# Example transformer: binarize each feature against its column mean.
# Inheriting from BaseEstimator as well gives the class get_params() and
# set_params(), which cross_val_score needs in order to clone the pipeline.
class MeanDiscrete(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        X = as_float_array(X)
        # Record the mean of each column of the training data
        self.mean = np.mean(X, axis=0)
        return self

    def transform(self, X):
        X = as_float_array(X)
        # New data must have the same number of features as the training data
        assert X.shape[1] == self.mean.shape[0]
        return X > self.mean
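
# A quick standalone check (TransformerMixin derives fit_transform() from
# fit() and transform() automatically):
mean_discrete = MeanDiscrete()
X_mean = mean_discrete.fit_transform(X)
print(X_mean[:5])  # boolean matrix: True where a value exceeds its column mean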

pipeline = Pipeline([('mean_discrete', MeanDiscrete()),
                     ('classifier', DecisionTreeClassifier(random_state=14))])
scores_mean_discrete = cross_val_score(pipeline, X, y, scoring='accuracy')
print("Mean Discrete performance: {0:.3f}".format(scores_mean_discrete.mean()))