Background:
The business department has obtained the most recent month of the company's telecom customer data (communication records and personal information) and wants the data team to build a model that predicts whether a customer will churn.
Data source: teleco.csv
Sample size: 1,000
Observed indicators
Modeling methods: BP neural network / RBF neural network
Evaluation metric: ROC curve -- used to describe the model's discriminative power; the higher the curve rises above the diagonal, the better.
Modeling conclusions
Model comparison
A. The RBF neural network model is model <- rbf(x, y, size=220, maxit=410, linOut=F, initFunc = "RBF_Weights", initFuncParams=c(-4, 4, 6, 0.3, 0), learnFuncParams=c(1e-8, 0, 1e-8, 0.1, 0.8)). Its training-set ROC is 0.873 and its validation-set ROC is 0.77; there is some overfitting, but the gap is small, and both ROC values are better than those of the BP neural network and of logistic regression.
B. The BP neural network model is model_nnet<-nnet(y~., linout = F, size = 19, decay = 0.01, maxit = 1000, data = train). Its training-set ROC is 0.995 and its validation-set ROC is 0.691; the gap indicates fairly severe overfitting -- the model fits the training set well but performs only moderately on the validation set.
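To compare the two models visually, their validation-set ROC curves can be overlaid on one plot. A minimal sketch, assuming the fitted objects model_nnet (BP) and model (RBF) and a common scaled validation frame test (predictors in columns 1:35, outcome column y) as produced in the modeling process below:

# Hedged sketch: overlay validation-set ROC curves of the BP and RBF models.
library(ROCR)
p_bp  <- predict(model_nnet, test)          # BP predicted probabilities
p_rbf <- predict(model, test[, 1:35])       # RBF predicted probabilities
perf_bp  <- performance(prediction(p_bp,  test$y), "tpr", "fpr")
perf_rbf <- performance(prediction(p_rbf, test$y), "tpr", "fpr")
plot(perf_bp,  col = "black", lty = 2, main = "Validation ROC: BP vs RBF")
plot(perf_rbf, col = "green", add = TRUE)
abline(0, 1, lty = 2, col = "red")
legend("bottomright", c("BP (nnet)", "RBF (RSNNS)"),
       col = c("black", "green"), lty = c(2, 1))

Note that in the transcript below the two models are fitted on different random splits, so a strictly fair comparison would refit both on the same split before overlaying the curves.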
Modeling process
--------------------------------- BP neural network modeling -------------------------------
> #1. Data cleaning
> #2. Loop size from 1 to 25 (step 2); the best size found is 19
> #3. Resulting model: model_nnet<-nnet(y~., linout = F, size = 19, decay = 0.01, maxit = 1000, data = train)
> #4. Training-set ROC is 0.995, validation-set ROC is 0.691; there is overfitting between the training and validation sets -- the model fits the training set well but performs only moderately on the validation set
>
> setwd('E:\\R 數據挖掘實戰\\第四周\\data 數據')
> library(sqldf)
> #Import and clean the data
> data<-read.csv("teleco.csv")
> names(data)
[1] "region" "tenure" "age""marital" "address"
"income" "ed" "employ" "retire" "gender"
[11] "reside" "tollfree" "equip" "callcard" "wireless" "longmon" "tollmon" "equipmon" "cardmon" "wiremon" [21] "longten" "tollten" "equipten" "cardten" "wireten"
"multline" "voice" "pager" "internet" "callwait" [31] "forward" "confer" "ebill" "lninc" "custcat" "churn"
> interval_var = c('income','longten','tollten','equipten','cardten','wireten')
> for (i in interval_var){
+ data[,i] = gsub(',','',data[,i])
+ data[,i] = as.numeric(data[,i])
+ }
> #Recode the target Y (churn, a categorical variable) and the other categorical variables to numeric codes
> data <- sqldf("select tenure,age,address,income,employ,reside,longmon,tollmon,equipmon,cardmon,wiremon,longten,tollten,equipten,
+ cardten,wireten,lninc,
+ (case when region = 'Zone 1' then 1 when region = 'Zone 2' then 2 else 3 end) as region,
+ (case when custcat = 'Basic service' then 1 when custcat = 'E-service' then 2 when custcat = 'Plus service' then 3 else 4 end) as custcat,
+ (case when ed = 'Did not complete high school' then 1 when ed = 'High school degree' then 2 when ed = 'Post-undergraduate degree' then 3 else 4 end) as ed,
+ (case when marital = 'Married' then 1 else 2 end) as marital,
+ (case when retire = 'Yes' then 1 else 2 end) as retire,
+ (case when gender = 'Male' then 1 else 2 end) as gender,
+ (case when tollfree = 'Yes' then 1 else 2 end) as tollfree,
+ (case when equip = 'Yes' then 1 else 2 end) as equip,
+ (case when callcard = 'Yes' then 1 else 2 end) as callcard,
+ (case when wireless = 'Yes' then 1 else 2 end) as wireless,
+ (case when multline = 'Yes' then 1 else 2 end) as multline,
+ (case when voice = 'Yes' then 1 else 2 end) as voice,
+ (case when pager = 'Yes' then 1 else 2 end) as pager,
+ (case when internet = 'Yes' then 1 else 2 end) as internet,
+ (case when callwait = 'Yes' then 1 else 2 end) as callwait,
+ (case when forward = 'Yes' then 1 else 2 end) as forward,
+ (case when confer = 'Yes' then 1 else 2 end) as confer,
+ (case when ebill = 'Yes' then 1 else 2 end) as ebill,
+ (case when churn = 'Yes' then 0 else 1 end) as y
+ from data")
> #Check that every column is now numeric
> library(dfexplore)
> dfexplore::dfplot(data)
> write.csv(data,"datanowone.csv")
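As a quick non-graphical complement to dfplot, a minimal sketch (base R only, not part of the original transcript) that confirms every column is numeric before modeling:

# Hedged sketch: verify that every column of data is numeric after recoding.
num_ok <- sapply(data, is.numeric)
print(num_ok)            # TRUE for each column if the recoding succeeded
stopifnot(all(num_ok))   # stop early if any column is still character/factor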
> #Loop size from 1 to 25 (step 2); the best size found is 19
> Network<-function(maxNum,formula,sizeNum,DataSet,samplerate){
+   library(nnet)
+   library(ROCR)
+   set.seed(100)
+   # split DataSet into training and test sets
+   select<-sample(1:nrow(DataSet),ceiling(nrow(DataSet)*samplerate))
+   train=DataSet[select,]
+   test=DataSet[-select,]
+   # min-max (range) standardization
+   st_range <- function(x) {
+     return((x - min(x)) / (max(x) - min(x)))
+   }
+   train[,1:35]<- as.data.frame(lapply(train[,1:35], st_range))
+   test[,1:35]<- as.data.frame(lapply(test[,1:35], st_range))
+   ROC<-data.frame()
+   # try hidden-layer sizes 1, 3, 5, ... and record training/test AUC for each
+   for (i in seq(from =1,to =sizeNum+1,by =2)){
+     model_nnet<-nnet(formula, linout = F,size = i, decay = 0.01, maxit = maxNum,trace = F,data = train)
+     # keep predictions in local vectors so they are not fed back into later fits as predictors
+     p_train<-predict(model_nnet, train)
+     p_test<-predict(model_nnet, test)
+     pred_nnet_Tr <- prediction(p_train, train$y)
+     pred_nnet_Te <- prediction(p_test, test$y)
+     lr_m_auc_Tr<-round(as.numeric(performance(pred_nnet_Tr,'auc')@y.values),3)
+     lr_m_auc_Te<-round(as.numeric(performance(pred_nnet_Te,'auc')@y.values),3)
+     out<-data.frame(i,lr_m_auc_Tr,lr_m_auc_Te)
+     ROC<-rbind(ROC,out)
+   }
+   return(ROC)
+ }
> data <-read.csv("datanowone.csv")
> data <- data[,c(-1)]
> Roc<-Network(maxNum=100,formula=y~.,sizeNum=25,DataSet=data,samplerate=0.7)
> names(Roc)<-c("size","Index_Train","Index_Test")
> plot(Roc$size,Roc$Index_Train,type="l",main="ROC INDEX - training set")
> plot(Roc$size,Roc$Index_Test,type="l",main="ROC INDEX - validation set")
> Proc <- data.frame(Roc$size,Roc$Index_Train,Roc$Index_Test)
> Proc
Roc.size Roc.Index_Train Roc.Index_Test
1 1 0.836 0.764
2 3 0.860 0.703
3 5 0.958 0.673
4 7 0.993 0.602
5 9 1.000 0.619
6 11 1.000 0.626
7 13 1.000 0.682
8 15 1.000 0.702
9 17 1.000 0.710
10 19 1.000 0.713
11 21 1.000 0.712
12 23 1.000 0.714
13 25 1.000 0.717
> #Build the model with the optimal size = 19 found by the loop
> data <-read.csv("datanowone.csv")
> data <- data[,c(-1)]
> set.seed(10)
> select<-sample(1:nrow(data),700)
> train=data[select,]
> test=data[-select,]
> #Min-max (range) standardization function
> st_range <- function(x) {
+ return((x - min(x)) / (max(x) - min(x)))
+ }
> train[,1:35]<- as.data.frame(lapply(train[,1:35], st_range))
> test[,1:35]<- as.data.frame(lapply(test[,1:35], st_range))
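The two lines above scale train and test independently. An alternative sometimes preferred (a sketch only, not what was run here) derives the min/max from the unscaled training set and applies the same limits to both sets, so they sit on exactly the same scale:

# Hedged sketch (replacement for the two lapply calls above, not an extra step):
# compute min/max from the unscaled training set, then scale both sets with them.
tr_min <- sapply(train[, 1:35], min)
tr_max <- sapply(train[, 1:35], max)
scale_with <- function(df) as.data.frame(Map(function(x, lo, hi) (x - lo) / (hi - lo),
                                             df, tr_min, tr_max))
train[, 1:35] <- scale_with(train[, 1:35])
test[, 1:35]  <- scale_with(test[, 1:35])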
>
> library(nnet)
> model_nnet<-nnet(y~., linout = F,size = 19, decay = 0.01, maxit = 1000,data = train)
# weights: 704
initial  value 351.037721
iter  10 value 193.936803
iter  20 value 106.403864
iter  30 value 92.620658
iter 950 value 20.273290
final  value 20.273286
converged
> pre.forest=predict(model_nnet, test)
> out=pre.forest
> out[out<0.5]=0
> out[out>=0.5]=1
> rate2<-sum(out==test$y)/length(test$y)
> rate2
[1] 0.6966667
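Beyond overall accuracy, a confusion matrix breaks the errors down by class. A minimal sketch using base R's table(), reusing out and test$y from above (not part of the original transcript):

# Hedged sketch: confusion matrix for the 0.5-thresholded BP predictions.
conf_mat <- table(Predicted = out, Actual = test$y)
print(conf_mat)
# per-class recall: diagonal counts divided by column (actual-class) totals
print(round(diag(conf_mat) / colSums(conf_mat), 3))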
> #Plot the ROC curves
> train$lg_nnet_p<-predict(model_nnet, train)
> test$lg_nnet_p<-predict(model_nnet, test)
> library(ROCR)
> pred_nnet_Tr <- prediction(train$lg_nnet_p, train$y)
> perf_nnet_Tr <- performance(pred_nnet_Tr,"tpr","fpr")
> pred_nnet_Te <- prediction(test$lg_nnet_p, test$y)
> perf_nnet_Te <- performance(pred_nnet_Te,"tpr","fpr")
> plot(perf_nnet_Tr,col='green',main="ROC of Models")
> plot(perf_nnet_Te, col='black',lty=2,add=TRUE);
> abline(0,1,lty=2,col='red')
> lr_m_auc<-round(as.numeric(performance(pred_nnet_Tr,'auc')@y.values),3)
> lr_m_str<-paste("Train-AUC:",lr_m_auc,sep="")
> legend(0.3,0.45,c(lr_m_str),2:8)
> lr_m_auc<-round(as.numeric(performance(pred_nnet_Te,'auc')@y.values),3)
> lr_m_ste<-paste("Test-AUC:",lr_m_auc,sep="")
> legend(0.3,0.25,c(lr_m_ste),2:8)
--------------------------- RBF (radial basis function) neural network modeling ----------------------------------------------------------
> #1. Loop 1: loop size from 50 to 450 (step 20) and find the size that maximizes the training-set ROC -- best size: 220
> #2. Loop 2: with the best size fixed, loop the P value from 0.1 to 2 (step 0.1) and find the P value that maximizes the training-set ROC -- best P value: 0.3
> #3. Loop 3: with the two best values above the model still overfits; loop the penalty term from 0 to 66 (66 iterations) and find the penalty value where the validation-set ROC clearly improves while the training-set ROC is barely affected -- penalty value: 6
> #4. The three loops yield the best model, with training-set ROC 0.873 and validation-set ROC 0.77; judging from the ROC values, the model's performance is moderate
> #model <- rbf(x, y, size=220, maxit=410,linOut=F,initFunc = "RBF_Weights",initFuncParams=c(-4, 4, 6, 0.3, 0),learnFuncParams=c(1e-8, 0, 1e-8, 0.1, 0.8))
> #-----Loop size from 50 to 450 (step 20); the best size found is 220-----
> Network<-function(maxNum,sizeNum,DataSet,samplerate){
+   library(RSNNS)   # provides rbf()
+   library(ROCR)
+   set.seed(100)
+   select<-sample(1:nrow(DataSet),ceiling(nrow(DataSet)*samplerate))
+   train=DataSet[select,]
+   test=DataSet[-select,]
+   # min-max (range) standardization
+   st_range <- function(x) {
+     return((x - min(x)) / (max(x) - min(x)))
+   }
+   train[,1:35]<- as.data.frame(lapply(train[,1:35], st_range))
+   test[,1:35]<- as.data.frame(lapply(test[,1:35], st_range))
+   x<-train[,1:35]
+   y<-train[,36]
+   ROC<-data.frame()
+   # try sizes 50, 70, 90, ... and record training/test AUC for each
+   for (i in seq(from =50,to =sizeNum+1,by =20)){
+     model <- rbf(x, y, size=i, maxit=maxNum,linOut=F,initFunc = "RBF_Weights",initFuncParams=c(-4, 4, 0, 0.01, 0),learnFuncParams=c(1e-8, 0, 1e-8, 0.1, 0.8))
+     train$lg_nnet_p<-predict(model,train[,1:35])
+     test$lg_nnet_p<-predict(model, test[,1:35])
+     pred_nnet_Tr <- prediction(train$lg_nnet_p, train$y)
+     pred_nnet_Te <- prediction(test$lg_nnet_p, test$y)
+     lr_m_auc_Tr<-round(as.numeric(performance(pred_nnet_Tr,'auc')@y.values),3)
+     lr_m_auc_Te<-round(as.numeric(performance(pred_nnet_Te,'auc')@y.values),3)
+     out<-data.frame(i,lr_m_auc_Tr,lr_m_auc_Te)
+     ROC<-rbind(ROC,out)
+   }
+   return(ROC)
+ }
> data <-read.csv("datanowone.csv")
> data <- data[,c(-1)]
> Roc<-Network(maxNum=410,sizeNum=450,DataSet=data,samplerate=0.7)
> names(Roc)<-c("size","Index_Train","Index_Test")  #name the columns
> plot(Roc$size,Roc$Index_Train,type="l",main="ROC INDEX - training set")
> plot(Roc$size,Roc$Index_Test,type="l",main="ROC INDEX - validation set")
> #Loop the P value from 0.1 to 2 (step 0.1); the P value that maximizes the training-set ROC is 0.3
> Network<-function(maxNum,sizeNum,DataSet,samplerate){
+   library(RSNNS)   # provides rbf()
+   library(ROCR)
+   set.seed(100)
+   select<-sample(1:nrow(DataSet),ceiling(nrow(DataSet)*samplerate))
+   train=DataSet[select,]
+   test=DataSet[-select,]
+   # min-max (range) standardization
+   st_range <- function(x) {
+     return((x - min(x)) / (max(x) - min(x)))
+   }
+   train[,1:35]<- as.data.frame(lapply(train[,1:35], st_range))
+   test[,1:35]<- as.data.frame(lapply(test[,1:35], st_range))
+   x<-train[,1:35]
+   y<-train[,36]
+   ROC<-data.frame()
+   # with size fixed at 220, try P values 0.1, 0.2, ... and record training/test AUC
+   for (i in seq(from =0.1,to =sizeNum+1,by =0.1)){
+     model <- rbf(x, y, size=220, maxit=maxNum,linOut=F,initFunc = "RBF_Weights",initFuncParams=c(-4, 4, 0, i, 0),learnFuncParams=c(1e-8, 0, 1e-8, 0.1, 0.8))
+     train$lg_nnet_p<-predict(model,train[,1:35])
+     test$lg_nnet_p<-predict(model, test[,1:35])
+     pred_nnet_Tr <- prediction(train$lg_nnet_p, train$y)
+     pred_nnet_Te <- prediction(test$lg_nnet_p, test$y)
+     lr_m_auc_Tr<-round(as.numeric(performance(pred_nnet_Tr,'auc')@y.values),3)
+     lr_m_auc_Te<-round(as.numeric(performance(pred_nnet_Te,'auc')@y.values),3)
+     out<-data.frame(i,lr_m_auc_Tr,lr_m_auc_Te)
+     ROC<-rbind(ROC,out)
+   }
+   return(ROC)
+ }
> data <-read.csv("datanowone.csv")
> data <- data[,c(-1)]
> Roc<-Network(maxNum=410,sizeNum=1,DataSet=data,samplerate=0.7)
> names(Roc)<-c("size","Index_Train","Index_Test")
> plot(Roc$size,Roc$Index_Train,type="l",main="ROC INDEX - training set")
> plot(Roc$size,Roc$Index_Test,type="l",main="ROC INDEX - validation set")
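The code for loop 3 (the penalty term) is not shown in the transcript; the table below is its output. A minimal sketch of how that loop might look, assuming the same data preparation as in the wrappers above (scaled train/test, x<-train[,1:35], y<-train[,36]) and that the third element of initFuncParams is the penalty term being varied; this is not the author's original code:

# Hedged sketch of the penalty-term loop (step 3 above).
library(RSNNS)
library(ROCR)
ROC <- data.frame()
for (i in 0:66) {                                  # penalty term from 0 to 66
  model <- rbf(x, y, size = 220, maxit = 410, linOut = F,
               initFunc = "RBF_Weights",
               initFuncParams = c(-4, 4, i, 0.3, 0),   # third element = penalty term (assumption)
               learnFuncParams = c(1e-8, 0, 1e-8, 0.1, 0.8))
  auc_tr <- round(as.numeric(performance(prediction(predict(model, train[, 1:35]), train$y), 'auc')@y.values), 3)
  auc_te <- round(as.numeric(performance(prediction(predict(model, test[, 1:35]),  test$y),  'auc')@y.values), 3)
  ROC <- rbind(ROC, data.frame(i, auc_tr, auc_te))
}
names(ROC) <- c("size", "Index_Train", "Index_Test")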
> Proc <-data.frame(Roc$size,Roc$Index_Train,Roc$Index_Test)
> Proc  #penalty value = 2
Roc.size Roc.Index_Train Roc.Index_Test
1 0 0.929 0.704
2 1 0.891 0.760
3 2 0.873 0.770
4 3 0.861 0.773
5 4 0.853 0.775
6 5 0.846 0.776
7 6 0.841 0.777
8 7 0.837 0.777
9 8 0.833 0.776
10 9 0.830 0.775
11 10 0.827 0.774
12 11 0.825 0.773
29 28 0.800 0.767
30 29 0.799 0.766
31 30 0.798 0.765
32 31 0.797 0.765
33 32 0.797 0.765
34 33 0.796 0.765
35 34 0.795 0.765
> #------Combine the best size, P value, and penalty term from the three loops to build the final RBF neural network model---------
> setwd('E:\\R 數據挖掘實戰\\第四周\\data 數據')
> data <-read.csv("datanowone.csv")
> data <- data[,c(-1)]
> dfexplore::dfplot(data)
> #Random sampling to create training and test sets
> set.seed(100)
> select<-sample(1:nrow(data),700)
> train=data[select,]
> test=data[-select,]
> library("RSNNS")
> st_range <- function(x) {
+ return((x - min(x)) / (max(x) - min(x)))
+ }
> train[,1:35]<- as.data.frame(lapply(train[,1:35], st_range))
> test[,1:35]<- as.data.frame(lapply(test[,1:35], st_range))
> x<-train[,1:35]
> y<-train[,36]
> model <- rbf(x, y, size=220, maxit=1000,linOut=F,
+ initFunc = "RBF_Weights",
+ initFuncParams=c(-4, 4, 2, 0.3, 0),
+ learnFuncParams=c(1e-8, 0, 1e-8, 0.1, 0.8))
> plotIterativeError(model)
> train$lg_nnet_p<-predict(model, train[,1:35])
> test$lg_nnet_p<-predict(model, test[,1:35])
> library(ROCR)
> pred_nnet_Tr <- prediction(train$lg_nnet_p, train$y)
> perf_nnet_Tr <- performance(pred_nnet_Tr,"tpr","fpr")
> pred_nnet_Te <- prediction(test$lg_nnet_p, test$y)
> perf_nnet_Te <- performance(pred_nnet_Te,"tpr","fpr")
> plot(perf_nnet_Tr,col='green',main="ROC of Models")
> plot(perf_nnet_Te, col='black',lty=2,add=TRUE);
> abline(0,1,lty=2,col='red')
> lr_m_auc<-round(as.numeric(performance(pred_nnet_Tr,'auc')@y.values),3)
> lr_m_str<-paste("Train-AUC:",lr_m_auc,sep="")
> legend(0.3,0.45,c(lr_m_str),2:8)
> lr_m_auc<-round(as.numeric(performance(pred_nnet_Te,'auc')@y.values),3)
> lr_m_ste<-paste("Test-AUC:",lr_m_auc,sep="")
> legend(0.3,0.25,c(lr_m_ste),2:8)
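For an accuracy figure comparable with the BP model's 0.697 above, the RBF predictions can be thresholded the same way. A minimal sketch reusing test$lg_nnet_p and test$y just created (not part of the original transcript):

# Hedged sketch: 0.5-threshold classification accuracy for the final RBF model.
out_rbf <- ifelse(test$lg_nnet_p >= 0.5, 1, 0)
rate_rbf <- sum(out_rbf == test$y) / length(test$y)
rate_rbf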
Reference: CDA "Credit Risk Modeling" micro-specialization course