【挖掘模型】:R語言-BP和RBF 神經網絡構建電信客戶流失預測模型

背景:

業務部門獲取了公司最近一個月電信客戶信息(通訊信息、個人信息),想通過數據部門建模預測用戶未來是否流失
數據源:teleco.csv
樣本量:1000

觀察指標

建模方法: BP 神經網絡/RBF 神經網絡
指標評估:ROC 曲線 --用來描述模型分辨能力,對角線以上的圖形越高越好

Paste_Image.png

建模結論

模型對比

A. 通過 RBF 神經網絡構建的模型為 model <- rbf(x, y, size=220, maxit=410,linOut=F,initFunc = "RBF_Weights",initFuncParams=c(-4, 4, 6, 0.3, 0),learnFuncParams=c(1e-8, 0, 1e-8, 0.1, 0.8)),其中訓練集的 ROC:0.873,驗證集合的ROC:0.77,數據有一定的過度擬合,但是相差不大,ROC效果均比BP神經網絡和邏輯回歸的效果好。

B. 通過 BP 神經網絡構建模型為:model_nnet<-nnet(y~., linout = F,size = 19, decay = 0.01, maxit = 1000,data = train),其中訓練集 ROC 為 0.995,驗證集 ROC 為 0.691,訓練集和驗證集存在過度擬合比較嚴重,訓練集模型效果好,驗證集合模型效果一般。

建模過程

---------------------------------BP 神經網絡建模-------------------------------

>   #1.數據清洗
>   #2.size 從 1~23 循環找到最佳 size 為 19
>   #3.得到較為合理的模型 model_nnet<-nnet(y~., linout = F,size = 19, decay = 0.01, maxit = 1000,data = train)
>   #4.訓練集 ROC 為 0.995,驗證集 ROC 為 0.691,訓練集和驗證集存在過
度擬合,訓練集模型效果好,驗證集合模型效果一般
>
>   setwd('E:\\R 數據挖掘實戰\\第四周\\data 數據')
>   library(sqldf)
>   #導入數據和數據清洗
>   data<-read.csv("teleco.csv")
>   names(data)
 [1] "region"   "tenure"   "age"      "marital"  "address"  "income"
 [7] "ed"       "employ"   "retire"   "gender"   "reside"   "tollfree"
[13] "equip"    "callcard" "wireless" "longmon"  "tollmon"  "equipmon"
[19] "cardmon"  "wiremon"  "longten"  "tollten"  "equipten" "cardten"
[25] "wireten"  "multline" "voice"    "pager"    "internet" "callwait"
[31] "forward"  "confer"   "ebill"    "lninc"    "custcat"  "churn"
>   interval_var = c('income','longten','tollten','equipten','cardten','wireten')
>   for (i in interval_var){
+   data[,i] = gsub(',','',data[,i])
+   data[,i] = as.numeric(data[,i])
+   }
>   #對 Y--是否流失(分類變量)替換
>   data <- sqldf("select tenure,age,address,income,employ,reside,longmon,tollmon,equipmon,cardmon,wiremon,longten,tollten,equipten,
+   cardten,wireten,lninc,
+   (case when region = 'Zone 1' then 1 when region = 'Zone 2' then 2  else 3 end) as region,
+   (case when custcat = 'Basic service' then 1 when custcat = 'E-service' then 2 when custcat = 'Plus service' then 3 else 4 end) as custcat,
+   (case when ed = 'Did not complete high school' then 1 when ed = 'High school degree'
then 2  when ed = 'Post-undergraduate degree' then 3 else 4 end) as ed,
+   (case when marital = 'Married' then 1 else 2 end) as marital,
        (case when retire = 'Yes' then 1 else 2 end) as retire,
+   (case when gender = 'Male' then 1 else 2 end) as gender,
        (case when tollfree = 'Yes' then 1 else 2 end) as tollfree,
+   (case when equip = 'Yes' then 1 else 2 end) as equip,
       (case when callcard = 'Yes' then 1 else 2 end) as callcard,
+   (case when wireless = 'Yes' then 1 else 2 end) as wireless,
       (case when multline = 'Yes' then 1 else 2 end) as multline,  
+   (case when voice = 'Yes' then 1 else 2 end) as voice,
       (case when pager = 'Yes' then 1 else 2 end) as pager,
+   (case when internet = 'Yes' then 1 else 2 end) as internet,
       (case when callwait = 'Yes' then 1 else 2 end) as callwait,
+   (case when forward = 'Yes' then 1 else 2 end) as forward,
      (case when confer = 'Yes' then 1 else 2 end) as confer,
+   (case when ebill = 'Yes' then 1 else 2 end) as ebill,
       (case when churn = 'Yes' then 0 else 1 end) as y 
+   from data")

> #驗證數據類型是否都為數值型
> library(dfexplore)
> dfexplore::dfplot(data)

Paste_Image.png
>   write.csv(data,"datanowone.csv")
>   #size 從 1~22 循環,找到最佳 size 為 19
# Search hidden-layer sizes for a BP (nnet) network and report train/test AUC.
#
# Trains one single-hidden-layer network per odd size in 1..sizeNum+1 and
# records the ROC AUC on the training and hold-out splits, so the best
# `size` can be chosen by inspecting the returned table.
#
# Args:
#   maxNum:     maximum number of nnet iterations (passed as maxit).
#   formula:    model formula, e.g. y ~ .
#   sizeNum:    upper bound of the hidden-layer size search.
#   DataSet:    data frame; columns 1:35 are predictors, column 36 is the
#               0/1 target `y` — TODO confirm layout matches datanowone.csv.
#   samplerate: fraction of rows assigned to the training split.
#
# Returns: data.frame with columns (size, train AUC, test AUC), one row per size.
Network <- function(maxNum, formula, sizeNum, DataSet, samplerate) {
  library(nnet)
  library(ROCR)
  set.seed(100)  # fixed seed so the train/test split is reproducible
  # BUG FIX(review): the original sampled from the global `data` instead of
  # the DataSet argument; use DataSet so the function is self-contained.
  select <- sample(seq_len(nrow(DataSet)), ceiling(nrow(DataSet) * samplerate))
  train <- DataSet[select, ]
  test <- DataSet[-select, ]
  # Min-max (range) standardisation of each predictor to [0, 1].
  st_range <- function(x) {
    (x - min(x)) / (max(x) - min(x))
  }
  train[, 1:35] <- as.data.frame(lapply(train[, 1:35], st_range))
  test[, 1:35] <- as.data.frame(lapply(test[, 1:35], st_range))
  ROC <- data.frame()
  for (i in seq(from = 1, to = sizeNum + 1, by = 2)) {
    model_nnet <- nnet(formula, linout = FALSE, size = i, decay = 0.01,
                       maxit = maxNum, trace = FALSE, data = train)
    train$lg_nnet_p <- predict(model_nnet, train)
    test$lg_nnet_p <- predict(model_nnet, test)
    # AUC on both splits, rounded to 3 decimals as in the rest of the article.
    pred_nnet_Tr <- prediction(train$lg_nnet_p, train$y)
    pred_nnet_Te <- prediction(test$lg_nnet_p, test$y)
    lr_m_auc_Tr <- round(as.numeric(performance(pred_nnet_Tr, 'auc')@y.values), 3)
    lr_m_auc_Te <- round(as.numeric(performance(pred_nnet_Te, 'auc')@y.values), 3)
    ROC <- rbind(ROC, data.frame(i, lr_m_auc_Tr, lr_m_auc_Te))
  }
  ROC
}
>   data <-read.csv("datanowone.csv")
>   data <- data[,c(-1)]
>   Roc<-Network(maxNum=100,formula=y~.,sizeNum=25,DataSet= data,samplerate=0.7)
>   names(Roc)<-c("size","Index_Train","Index_Test")
>   plot(Roc$size,Roc$Index_Train,type="l",main="訓練集的 ROC INDEX")

Paste_Image.png
plot(Roc$size,Roc$Index_Test,type="l",main="驗證集的 ROC INDEX")
Paste_Image.png
>   Proc <- data.frame(Roc$size,Roc$Index_Train,Roc$Index_Test)
>   Proc
    Roc.size Roc.Index_Train Roc.Index_Test
1   1   0.836   0.764
2   3   0.860   0.703
3   5   0.958   0.673
4   7   0.993   0.602
5   9   1.000   0.619
6   11  1.000   0.626
7   13  1.000   0.682
8   15  1.000   0.702           
9   17  1.000   0.710
10  19  1.000   0.713
11  21  1.000   0.712
12  23  1.000   0.714
13  25  1.000   0.717
            
>   #用循環得到的最優 size=19,建模
>   data <-read.csv("datanowone.csv")
>   data <- data[,c(-1)]
>   set.seed(10)
>   select<-sample(1:nrow(data),700)
>   train=data[select,]
>   test=data[-select,]
>   #極差標準化函數
>   st_range <- function(x) {
+   return((x - min(x)) / (max(x) - min(x)))
+   }
>   train[,1:35]<- as.data.frame(lapply(train[,1:35], st_range))
>   test[,1:35]<- as.data.frame(lapply(test[,1:35], st_range))
>   
>   library(nnet)
>   model_nnet<-nnet(y~., linout = F,size = 19, decay = 0.01, maxit = 1000,data = train)

# weights:  704

initial value 351.037721
iter  10 value 193.936803
iter  20 value 106.403864
iter  30 value 92.620658
...
iter 950 value 20.273290
final value 20.273286
converged

>   pre.forest=predict(model_nnet, test)
>   out=pre.forest
>   out[out<0.5]=0
>   out[out>=0.5]=1
>   rate2<-sum(out==test$y)/length(test$y)
>   rate2

[1] 0.6966667

>   #ROC 繪圖
>   train$lg_nnet_p<-predict(model_nnet, train)
>   test$lg_nnet_p<-predict(model_nnet, test)
>   library(ROCR)
>   pred_nnet_Tr <- prediction(train$lg_nnet_p, train$y)
>   perf_nnet_Tr <- performance(pred_nnet_Tr,"tpr","fpr")
>   pred_nnet_Te <- prediction(test$lg_nnet_p, test$y)
>   perf_nnet_Te <- performance(pred_nnet_Te,"tpr","fpr")
>   plot(perf_nnet_Tr,col='green',main="ROC of Models")
>   plot(perf_nnet_Te, col='black',lty=2,add=TRUE);
>   abline(0,1,lty=2,col='red')
>   lr_m_auc<-round(as.numeric(performance(pred_nnet_Tr,'auc')@y.values),3)
>   lr_m_str<-paste("Tran-AUC:",lr_m_auc,sep="")
>   legend(0.3,0.45,c(lr_m_str),2:8)
>   lr_m_auc<-round(as.numeric(performance(pred_nnet_Te,'auc')@y.values),3)
>   lr_m_ste<-paste("Test-AUC:",lr_m_auc,sep="")
>   legend(0.3,0.25,c(lr_m_ste),2:8)


Paste_Image.png
---------------------------使用徑向基神經網絡建模----------------------------------------------------------

>   #1.循環 1,size 從 50~450 循環(間隔 20),確定訓練集對應的 ROC 最大值——對應的最佳 size 值:220
>   #2.循環 2,在確定最佳 size 的基礎上,P 值從 0.1~2 循環(間隔 0.1),找到訓練集的 ROC 最大值——對應的 P 值:0.3
>   #3.循環 3,前兩次最優循環值,模型仍有過度擬合現象,懲罰項從 0 到 66 循環 66 次,找到驗證集的 ROC 明顯提升,訓練集 ROC 影響不大的懲罰值:6
>   #4.通過前 3 次的循環找到最佳模型,訓練集的 ROC:0.873,驗證集合的 R OC:0.77,從 ROC 的值表現來看模型效果一般
>   #model <- rbf(x, y, size=220, maxit=410,linOut=F,initFunc = "RBF_Weights",initFuncParams=c(-4, 4, 6, 0.3, 0),learnFuncParams=c(1e-8, 0, 1e-8, 0.1, 0.8))
>   #-----size 從 50~450 循環(間隔 20),尋找最佳 size 為 220-----
# Search RBF network sizes (50, 70, ..., up to sizeNum+1) and report the
# train/test AUC for each, so the best `size` can be chosen by inspection.
#
# Args:
#   maxNum:     maximum number of RBF training iterations (passed as maxit).
#   sizeNum:    upper bound of the size search (step 20, starting at 50).
#   DataSet:    data frame; columns 1:35 are predictors, column 36 is the
#               0/1 target `y` — TODO confirm layout matches datanowone.csv.
#   samplerate: fraction of rows assigned to the training split.
#
# Returns: data.frame with columns (size, train AUC, test AUC), one row per size.
Network <- function(maxNum, sizeNum, DataSet, samplerate) {
  # BUG FIX(review): rbf() comes from RSNNS, not nnet — load it here so the
  # function works even when RSNNS was not attached beforehand.
  library(RSNNS)
  library(ROCR)
  set.seed(100)  # fixed seed so the train/test split is reproducible
  # BUG FIX(review): the original sampled from the global `data` instead of
  # the DataSet argument; use DataSet so the function is self-contained.
  select <- sample(seq_len(nrow(DataSet)), ceiling(nrow(DataSet) * samplerate))
  train <- DataSet[select, ]
  test <- DataSet[-select, ]
  # Min-max (range) standardisation of each predictor to [0, 1].
  st_range <- function(x) {
    (x - min(x)) / (max(x) - min(x))
  }
  train[, 1:35] <- as.data.frame(lapply(train[, 1:35], st_range))
  test[, 1:35] <- as.data.frame(lapply(test[, 1:35], st_range))
  x <- train[, 1:35]
  y <- train[, 36]
  ROC <- data.frame()
  for (i in seq(from = 50, to = sizeNum + 1, by = 20)) {
    model <- rbf(x, y, size = i, maxit = maxNum, linOut = FALSE,
                 initFunc = "RBF_Weights",
                 initFuncParams = c(-4, 4, 0, 0.01, 0),
                 learnFuncParams = c(1e-8, 0, 1e-8, 0.1, 0.8))
    train$lg_nnet_p <- predict(model, train[, 1:35])
    test$lg_nnet_p <- predict(model, test[, 1:35])
    # AUC on both splits, rounded to 3 decimals as in the rest of the article.
    pred_nnet_Tr <- prediction(train$lg_nnet_p, train$y)
    pred_nnet_Te <- prediction(test$lg_nnet_p, test$y)
    lr_m_auc_Tr <- round(as.numeric(performance(pred_nnet_Tr, 'auc')@y.values), 3)
    lr_m_auc_Te <- round(as.numeric(performance(pred_nnet_Te, 'auc')@y.values), 3)
    ROC <- rbind(ROC, data.frame(i, lr_m_auc_Tr, lr_m_auc_Te))
  }
  ROC
}
>   data <-read.csv("datanowone.csv")
>   data <- data[,c(-1)]
>   Roc<-Network(maxNum=410,sizeNum=450,DataSet=data,samplerate=0.7)
>   names(Roc)<-c("size","Index_Train","Index_Test")#命名
>   plot(Roc$size,Roc$Index_Train,type="l",main="訓練集的 ROC INDEX")
>   plot(Roc$size,Roc$Index_Test,type="l",main="驗證集的 ROC INDEX")
 
Paste_Image.png
Paste_Image.png
>   #-P 值從 0.1~2 循環(間隔 0.1),找到訓練集的 ROC 最大對應的 P 值為 0.3
# With the RBF size fixed at 220, sweep the 4th RBF_Weights init parameter
# (the "P value", i.e. the deviation/bias parameter) from 0.1 upward in steps
# of 0.1 and report train/test AUC for each value.
#
# Args:
#   maxNum:     maximum number of RBF training iterations (passed as maxit).
#   sizeNum:    controls the sweep's upper bound: P runs 0.1..sizeNum+1 by 0.1
#               (name kept from the original for interface compatibility).
#   DataSet:    data frame; columns 1:35 are predictors, column 36 is the
#               0/1 target `y` — TODO confirm layout matches datanowone.csv.
#   samplerate: fraction of rows assigned to the training split.
#
# Returns: data.frame with columns (P value, train AUC, test AUC), one row per step.
Network <- function(maxNum, sizeNum, DataSet, samplerate) {
  # BUG FIX(review): rbf() comes from RSNNS, not nnet — load it here so the
  # function works even when RSNNS was not attached beforehand.
  library(RSNNS)
  library(ROCR)
  set.seed(100)  # fixed seed so the train/test split is reproducible
  # BUG FIX(review): the original sampled from the global `data` instead of
  # the DataSet argument; use DataSet so the function is self-contained.
  select <- sample(seq_len(nrow(DataSet)), ceiling(nrow(DataSet) * samplerate))
  train <- DataSet[select, ]
  test <- DataSet[-select, ]
  # Min-max (range) standardisation of each predictor to [0, 1].
  st_range <- function(x) {
    (x - min(x)) / (max(x) - min(x))
  }
  train[, 1:35] <- as.data.frame(lapply(train[, 1:35], st_range))
  test[, 1:35] <- as.data.frame(lapply(test[, 1:35], st_range))
  x <- train[, 1:35]
  y <- train[, 36]
  ROC <- data.frame()
  for (i in seq(from = 0.1, to = sizeNum + 1, by = 0.1)) {
    model <- rbf(x, y, size = 220, maxit = maxNum, linOut = FALSE,
                 initFunc = "RBF_Weights",
                 initFuncParams = c(-4, 4, 0, i, 0),
                 learnFuncParams = c(1e-8, 0, 1e-8, 0.1, 0.8))
    train$lg_nnet_p <- predict(model, train[, 1:35])
    test$lg_nnet_p <- predict(model, test[, 1:35])
    # AUC on both splits, rounded to 3 decimals as in the rest of the article.
    pred_nnet_Tr <- prediction(train$lg_nnet_p, train$y)
    pred_nnet_Te <- prediction(test$lg_nnet_p, test$y)
    lr_m_auc_Tr <- round(as.numeric(performance(pred_nnet_Tr, 'auc')@y.values), 3)
    lr_m_auc_Te <- round(as.numeric(performance(pred_nnet_Te, 'auc')@y.values), 3)
    ROC <- rbind(ROC, data.frame(i, lr_m_auc_Tr, lr_m_auc_Te))
  }
  ROC
}
>   data <-read.csv("datanowone.csv")
>   data <- data[,c(-1)]
>   Roc<-Network(maxNum=410,sizeNum=1,DataSet=data,samplerate=0.7)
> plot(Roc$size,Roc$Index_Train,type="l",main="訓練集的 ROC INDEX")
> plot(Roc$size,Roc$Index_Test,type="l",main="驗證集的 ROC INDEX")
Paste_Image.png
Paste_Image.png
> Proc <-data.frame(Roc$size,Roc$Index_Train,Roc$Index_Test)
> Proc #懲罰值=2
    
    Roc.size Roc.Index_Train Roc.Index_Test
1   0   0.929   0.704
2   1   0.891   0.760
3   2   0.873   0.770
4   3   0.861   0.773
5   4   0.853   0.775
6   5   0.846   0.776
7   6   0.841   0.777           
8   7   0.837   0.777
9   8   0.833   0.776
10  9   0.830   0.775
11  10  0.827   0.774
12  11  0.825   0.773
29  28  0.800   0.767
30  29  0.799   0.766
31  30  0.798   0.765
32  31  0.797   0.765
33  32  0.797   0.765
34  33  0.796   0.765
35  34  0.795   0.765
            

>   #------將三次循環的結果得到的最佳 size,P 值,懲罰項,得出較為合理的徑向基神經網絡模型---------
>   setwd('E:\\R 數據挖掘實戰\\第四周\\data 數據')
>   data <-read.csv("datanowone.csv")
>   data <- data[,c(-1)]
>   dfexplore::dfplot(data)
>   #隨機抽樣,建立訓練集與測試集
>   set.seed(100)
>   select<-sample(1:nrow(data),700)
>   train=data[select,]
>   test=data[-select,]
>   library("RSNNS")
>   st_range <- function(x) {
+   return((x - min(x)) / (max(x) - min(x)))
+   }
>   train[,1:35]<- as.data.frame(lapply(train[,1:35], st_range))
>   test[,1:35]<- as.data.frame(lapply(test[,1:35], st_range))
>   x<-train[,1:35]
>   y<-train[,36]

> model <- rbf(x, y, size=220, maxit=1000,linOut=F,
+   initFunc = "RBF_Weights",
+   initFuncParams=c(-4, 4, 2, 0.3, 0), 
+   learnFuncParams=c(1e-8, 0, 1e-8, 0.1, 0.8))
>   plotIterativeError(model)   
>   train$lg_nnet_p<-predict(model, train[,1:35])
>   test$lg_nnet_p<-predict(model, test[,1:35]) 
>   library(ROCR)
>   pred_nnet_Tr <- prediction(train$lg_nnet_p, train$y)
>   perf_nnet_Tr <- performance(pred_nnet_Tr,"tpr","fpr")
>   pred_nnet_Te <- prediction(test$lg_nnet_p, test$y)
>   perf_nnet_Te <- performance(pred_nnet_Te,"tpr","fpr")
>   plot(perf_nnet_Tr,col='green',main="ROC of Models")
>   plot(perf_nnet_Te, col='black',lty=2,add=TRUE);
>   abline(0,1,lty=2,col='red')
>   lr_m_auc<-round(as.numeric(performance(pred_nnet_Tr,'auc')@y.values),3)
>   lr_m_str<-paste("Tran-AUC:",lr_m_auc,sep="")
>   legend(0.3,0.45,c(lr_m_str),2:8)
>   lr_m_auc<-round(as.numeric(performance(pred_nnet_Te,'auc')@y.values),3)
>   lr_m_ste<-paste("Test-AUC:",lr_m_auc,sep="")
>   legend(0.3,0.25,c(lr_m_ste),2:8)
Paste_Image.png

參考資料:CDA《信用風險建模》微專業

最后編輯于
?著作權歸作者所有,轉載或內容合作請聯系作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

推薦閱讀更多精彩內容