Data import
setwd("D:/R")
Alternatively, call getwd() to check the current working directory (e.g., the path on the school machines).
A = read.csv('', header=TRUE)   # header=TRUE if the file has a header row, FALSE if not
Data cleaning
# 1. Missing values
data = data[complete.cases(data),]   # drop rows containing NA
data[!complete.cases(data),]         # show rows containing NA (do not assign this back to data)
# 2. Remove duplicate rows
data=unique(data)
# 3. Inspect missing values
na_flags = is.na(data)   # logical matrix of NA positions; avoid naming it c, which masks base::c()
# 4. Flag missing values
data()                                            # list all datasets in currently loaded packages
data(package = .packages(all.available = TRUE))   # list all datasets in all installed packages
y = rep(c(1, 2, 3), c(20, 20, 20))   # generates 20 ones, 20 twos, 20 threes
y = c(rep(-1, 10), rep(1, 10))       # rep repeats a value: -1 ten times, then 1 ten times
rnorm() generates normally distributed random numbers; the count, mean, and standard deviation can all be set.
sample() draws random samples with or without replacement: https://blog.csdn.net/Heidlyn/article/details/56013509 (sketch below)
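A minimal illustration (toy values of my own) of rnorm() and sample():
set.seed(1)
rnorm(5, mean = 10, sd = 2)        # 5 draws from a normal with mean 10, sd 2
sample(1:10, 3, replace = FALSE)   # 3 values without replacement
sample(1:10, 12, replace = TRUE)   # with replacement, the sample may exceed the pool size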
cor() computes the matrix of pairwise correlation coefficients between variables.
Centering the data:      scale(data, center=T, scale=F)
Standardizing the data:  scale(data, center=T, scale=T), or with default arguments scale(data)
Variables are usually standardized before running PCA; a quick check of scale() follows.
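A minimal check (toy matrix of my own) that scale() behaves as described above:
x = matrix(rnorm(20), nrow = 5)
colMeans(scale(x, center = T, scale = F))   # approximately 0: centered only
apply(scale(x), 2, sd)                      # all 1: centered and standardized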
Decision trees: classification trees and pruning
決策樹(https://blog.csdn.net/u010089444/article/details/53241218)
ID3 algorithm: choose the feature with the largest information gain as the splitting criterion.
https://blog.csdn.net/xiaohukun/article/details/78055132
Information gain = information entropy − conditional entropy, i.e. IG(Y, X) = H(Y) − H(Y|X)
During decision-tree learning, information gain is a key criterion for feature selection. It is defined as the amount of information a feature contributes to the classification task: the more information it brings, the more important the feature, and the larger its information gain.
https://www.zhihu.com/question/22104055
The larger the information entropy, the more disordered the outcome; the smaller the entropy, the more ordered it is. A computational sketch follows the link below.
https://blog.csdn.net/wxn704414736/article/details/80512705
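A minimal sketch (toy labels of my own) of computing entropy and information gain in R:
entropy = function(y) {
  p = table(y) / length(y)
  -sum(p * log2(p))                        # H(Y) = -sum p_i * log2(p_i)
}
y = c(rep("yes", 9), rep("no", 5))         # toy class labels
x = c(rep("sunny", 5), rep("rain", 9))     # toy feature, aligned with y
h_y  = entropy(y)                          # information entropy H(Y)
h_yx = sum(sapply(split(y, x),             # conditional entropy H(Y|X)
         function(s) length(s) / length(y) * entropy(s)))
h_y - h_yx                                 # information gain = H(Y) - H(Y|X)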
CART
The smaller the Gini index, the purer the node: Gini(p) = 1 − sum(p_i^2).
The split point with the smallest Gini is taken as the optimal split point; use it to split the data into two subsets (toy computation below). https://blog.csdn.net/wsp_1138886114/article/details/80955528
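A matching toy Gini computation (same made-up labels as the entropy sketch above):
gini = function(y) {
  p = table(y) / length(y)
  1 - sum(p^2)                          # Gini(p) = 1 - sum p_i^2
}
gini(c(rep("yes", 9), rep("no", 5)))    # impure node: > 0
gini(rep("yes", 14))                    # pure node: 0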
Grow the tree, then prune it.
https://www.cnblogs.com/karlpearson/p/6224148.html
Supervised learning [classification, regression, support vector machines]
Unsupervised learning [clustering, principal components]
https://blog.csdn.net/chenKFKevin/article/details/70547549
Unsupervised learning: only x values are available (no response y).
Two main types of unsupervised learning: cluster analysis and principal component analysis.
A qualitative response variable: qualitative variables are also called categorical variables.
In linear regression the response (Y) is a continuous variable; the predictors (X) may be continuous or categorical.
Logistic regression is the opposite: the response must be a categorical variable, never continuous. It can be binary or multi-class, and a multi-class response may be ordered or unordered. A simulated sketch follows.
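A self-contained sketch (simulated data of my own, not from the wine case) of a binary logistic regression:
set.seed(1)
x = rnorm(100)
y = rbinom(100, 1, plogis(0.5 + 1.2 * x))   # binary 0/1 response
fit = glm(y ~ x, family = binomial)         # logistic regression
summary(fit)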
Least squares (https://www.zhihu.com/question/37031188)
Project vertically onto the fitted line and minimize the residual sum of squares, sum((y_i - yhat_i)^2); see the check below.
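A quick check (toy points of my own) that lm() minimizes this residual sum of squares:
x = c(1, 2, 3, 4, 5)
y = c(2.1, 3.9, 6.2, 7.8, 10.1)
fit = lm(y ~ x)
sum(residuals(fit)^2)   # the RSS that least squares minimizes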
PCA: a dimension-reduction tool
The covariance matrix is the key to implementing PCA.
cov() computes a covariance matrix in R.
https://www.zhihu.com/question/41120789
pinkyjie.com/2011/02/24/covariance-pca/
prcomp(data, scale=TRUE)   # scale=TRUE standardizes the data first
prcomp is R's principal component analysis (PCA) function; a small demo follows.
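A small demo on a built-in dataset (USArrests ships with base R):
pr = prcomp(USArrests, scale = TRUE)
summary(pr)   # standard deviation and proportion of variance explained per component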
Confusion matrix
https://www.zhihu.com/question/36883196
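A minimal confusion-matrix sketch (toy predictions of my own), matching the table() usage in the code later in these notes:
truth = c(1, 0, 0, 1, 0)
pred  = c(1, 0, 1, 1, 0)
cm = table(truth, pred)   # rows: true class, columns: predicted class
sum(diag(cm)) / sum(cm)   # accuracy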
Mahalanobis distance. ROC curves.
Monte Carlo simulation
Support vector machines (e.g., text classification problems)
https://www.zhihu.com/question/21094489
KNN
k-means
https://zhuanlan.zhihu.com/p/31580379
X = rbind(matrix(rnorm(20*50, mean = 0),   nrow = 20),
          matrix(rnorm(20*50, mean = 0.7), nrow = 20),
          matrix(rnorm(20*50, mean = 1.4), nrow = 20))
X.pca = prcomp(X)
plot(X.pca$x[,1:2], col = c(rep(1,20), rep(2,20), rep(3,20)))   # plot the PC scores (the $x component)
res = kmeans(X, centers = 3)
true_class = c(rep(1,20), rep(2,20), rep(3,20))   # 20 per group, matching the 60 rows of X
table(res$cluster, true_class)
Wine case study
https://www.kaggle.com/xvivancos/tutorial-clustering-wines-with-k-means
https://www.kaggle.com/maitree/wine-quality-selection
cov_sdc = cov(wine)   # covariance matrix of the wine data
eigen(cov_sdc)        # its eigenvalues/eigenvectors, the basis of PCA
library(FactoMineR)   # provides PCA()
library(factoextra)   # provides get_eigenvalue()
res.pca <- PCA(wine[,-12], graph = TRUE)
eig.val <- get_eigenvalue(res.pca)
eig.val
# Data import
# wine = read.csv()   # incomplete without a file path
wine = read.csv('winequality-white.csv', header = TRUE)
# wine = winequality_white   # alternative: the object created by RStudio's import dialog
#data cleaning
wine = wine[complete.cases(wine),]
#PCA
library(stringr)
library(FactoMineR)
# Plot
res.pca <- PCA(wine[,-12], graph = TRUE)#delete Y=quality, plot the PCA graph
sdc=scale(wine)
pca.d=prcomp(sdc)
summary(pca.d)
# PCA dimension reduction: drop columns 9 through 11
wine = wine[,-9:-11]
# Check the distribution of the qualitative variable (quality) to see which classes it takes
hist(wine$quality)
# Split the data by class
wine0 = wine[wine$quality==3,]
wine1 = wine[wine$quality==4,]
wine2 = wine[wine$quality==5,]
wine3 = wine[wine$quality==6,]
wine4 = wine[wine$quality==7,]
wine5 = wine[wine$quality==8,]
# Sampling: roughly 50/50 train/test split within each class
label0 = sample(c(1:10), nrow(wine0), replace = TRUE)
label1 = sample(c(1:10), nrow(wine1), replace = TRUE)
label2 = sample(c(1:10), nrow(wine2), replace = TRUE)
label3 = sample(c(1:10), nrow(wine3), replace = TRUE)
label4 = sample(c(1:10), nrow(wine4), replace = TRUE)
label5 = sample(c(1:10), nrow(wine5), replace = TRUE)
wine0_train = wine0[label0<=5,]
wine0_test = wine0[label0>5,]
wine1_train = wine1[label1<=5,]
wine1_test = wine1[label1>5,]
wine2_train = wine2[label2<=5,]
wine2_test = wine2[label2>5,]
wine3_train = wine3[label3<=5,]
wine3_test = wine3[label3>5,]
wine4_train = wine4[label4<=5,]
wine4_test = wine4[label4>5,]
wine5_train = wine5[label5<=5,]
wine5_test = wine5[label5>5,]
wine_train = rbind(wine0_train,wine1_train,wine2_train,wine3_train,wine4_train,wine5_train)
wine_test = rbind(wine0_test,wine1_test,wine2_test,wine3_test,wine4_test,wine5_test)
Fit the model
library(nnet)
# Convert the response to a factor first (multinomial logistic regression needs a categorical y)
wine_train$quality = as.factor(wine_train$quality)
re_log = multinom(quality ~ ., data = wine_train)   # the nnet function is multinom
######################################
library(rpart)
library(rattle)
library(rpart.plot)
#########################################
ID3-style tree growing (split on information gain)
re_id3 <-rpart(quality~.,data=wine_train,method="class", parms=list(split="information"))
rpart.plot(re_id3)   # plot() alone draws unlabeled branches; rpart.plot gives readable nodes
########################################
CART tree growing (split on the Gini index)
re_CART = rpart(quality~.,data= wine_train,method = "class",parms = list(split="gini"),control=rpart.control(cp=0.000001))
plot(re_CART,main = "CART")
Find the cp row with the smallest cross-validated error (column 4 of cptable, xerror)
min = which.min(re_CART$cptable[,4])
Prune
re_CART_f = prune(re_CART,cp=re_CART$cptable[min,1])
pred_id3 = predict(re_id3, newdata = wine_test, type = "class")   # type="class" returns labels rather than probabilities
pred_CART = predict(re_CART,newdata = wine_test,type="class")
table(wine_test$quality,pred_CART)
Random forest (this example switches to a separate heart dataset)
library("randomForest")
data.index = sample(c(1,2), nrow(heart), replace = T, prob = c(0.7, 0.3))
train_data = heart[which(data.index == 1),]
test_data  = heart[which(data.index == 2),]
n = length(names(train_data))
rate = c()
Grid search over mtry and ntree
for (i in 1:(n-1))
{
  mtry = i
  for (j in 1:100)
  {
    set.seed(1234)
    rf_train = randomForest(as.factor(target) ~ ., data = train_data, mtry = i, ntree = j)
    rate[(i-1)*100 + j] = mean(rf_train$err.rate)   # average OOB error over the trees
  }
}
z = which.min(rate)   # flat index of the best (mtry, ntree) combination
print(z)
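The lines below use heart_rf without defining it; presumably it is the forest refit at the chosen parameters. A hedged sketch of that step (my reconstruction, not the original code):
best_mtry  = (z - 1) %/% 100 + 1   # invert the flat index rate[(i-1)*100 + j]
best_ntree = (z - 1) %%  100 + 1
heart_rf = randomForest(as.factor(target) ~ ., data = train_data,
                        mtry = best_mtry, ntree = best_ntree, importance = TRUE)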
Display variable importance
imp <- importance(heart_rf)   # renamed from "importance" to avoid shadowing the function
barplot(heart_rf$importance[,1],main="Input variable importance measure indicator bar chart")
box()
importance(heart_rf,type=2)
varImpPlot(x=heart_rf,sort=TRUE,n.var=nrow(heart_rf$importance),main="scatterplot") #可視化
hist(treesize(heart_rf))
Check the model
pred <- predict(heart_rf, newdata = test_data)
pred_out_1 <- predict(object = heart_rf, newdata = test_data, type = "prob")
tab <- table(pred, test_data$target)   # renamed from "table" to avoid masking base::table
sum(diag(tab)) / sum(tab)              # accuracy
plot(margin(heart_rf, test_data$target))
---------------------------------------------------------------- (ignore)
wine$quality
Linear regression
library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
library(corrgram)
library(lattice) #required for nearest neighbors
library(FNN) # nearest neighbors techniques
library(pROC) # to make ROC curve
# install.packages('corrgram')   # run once if not already installed; corrgram is loaded above
---------------------------------------------------------------------------------
linear_quality = lm(quality ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density, data = wine)   # read.csv turns spaces in column names into dots
corrgram(wine, lower.panel=panel.shade, upper.panel=panel.ellipse)
wine$poor <- wine$quality <= 4
wine$okay <- wine$quality == 5 | wine$quality == 6
wine$good <- wine$quality >= 7
head(wine)
summary(wine)
KNN
class_knn10 = knn(train = wine[,1:8], test = wine[,1:8], cl = wine$good, k = 10)
class_knn20 = knn(train = wine[,1:8], test = wine[,1:8], cl = wine$good, k = 20)
table(wine$good, class_knn10)   # note: train and test are the same data here,
table(wine$good, class_knn20)   # so these tables show training performance only
wine123 = winequality_white
wine123$poor <- wine123$quality <= 4   # use wine123's own quality column
wine123$okay <- wine123$quality == 5 | wine123$quality == 6
wine123$good <- wine123$quality >= 7
library(rpart) #for trees
tree1 = rpart(good ~ alcohol + sulphates + pH, data = wine123, method = "class")
rpart.plot(tree1)
summary(tree1)
pred1 = predict(tree1,newdata=wine123,type="class")
summary(pred1)
summary(wine123$good)
Compare the models' accuracy
# alternative specification kept for reference:
# tree2 = rpart(good ~ alcohol + volatile.acidity + citric.acid + pH, data = wine123, method = "class")
tree2 = rpart(good ~ alcohol + volatile.acidity + citric.acid + sulphates, data = wine123, method = "class")
rpart.plot(tree2)
pred2 = predict(tree2,newdata=wine123,type="class")
summary(pred2)
summary(wine123$good)
Information entropy calculation
LDA
Decision trees
p.187   chp4    power function
p.212   chp5    bootstrap
p.215   chp5    LOOCV
p.431   chp10   k-means
I. Basic variable definitions and operations
1. Assigning a numeric variable
   a = 5
2. Assigning a vector
   x = c(1:6)   # c() is the function that builds a vector
3. Accessing vector elements
   x = c(1:6)
   x[3]    # the number in brackets is the element's position in x
   x[-3]   # a negative index takes the complement: every element of x except the 3rd
4. Defining a matrix
   B = matrix(c(1:10), nrow = 2, ncol = 5, byrow = TRUE)
   matrix() is the function that defines a matrix: the first argument holds the elements to write in, nrow is the number of rows, ncol the number of columns; byrow=TRUE fills the data row by row, byrow=FALSE fills column by column.
   If byrow is omitted, the data are filled in by column (the default).
5. Accessing matrix elements
   B[1,]     # the first row of the matrix
   B[,2]     # the second column
   B[2,1]    # the element in row 2, column 1
   B[,2:5]   # columns 2 through 5
   B[,-4]    # every column except the 4th
6. Common statistical functions
   sum()    # sum of the object's elements
   mean()   # mean of the object's elements
   max()    # largest element
   min()    # smallest element
7. Extracting other matrix information
   dim(B)      # dimensions: the first value is the number of rows, the second the number of columns
   dim(B)[1]   # number of rows
   dim(B)[2]   # number of columns (1 stands for rows, 2 for columns)
   length(B)   # length of the object: the total number of elements, as the check below confirms
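A quick check (using the matrix B from item 4):
B = matrix(c(1:10), nrow = 2, ncol = 5, byrow = TRUE)
dim(B)      # 2 5
length(B)   # 10 = 2 * 5: all elements, not rows or columns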