
[Mining Model] Predicting Auto Finance Loan Default in R with Logistic Regression

Author: dataheart | Published 2017-05-22 11:15

    Background:

    We have a batch of auto loan default data (customer attributes + account attributes + spending behaviour + repayment behaviour). The marketing department wants a model built on these data to predict whether the next batch of similar customers will default.

    Data source:

    data.csv (an auto loan default data set)
    Sample size: 7,193

    24 observed variables

    Modeling method: logistic regression
    Evaluation metrics: accuracy and the ROC curve, which describes the model's discriminative power; the higher the curve rises above the diagonal, the better
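    For reference (a standard identity, not from the original post), logistic regression models the default probability as

    P(bad_ind = 1 | x) = 1 / (1 + exp(-(b0 + b1*x1 + ... + bk*xk)))

    which is why the code below converts the linear predictor lg_p into a probability with p = 1/(1 + exp(-lg_p)).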

    (Figure: ROC curve)

    Model conclusions

    1. The training set accuracy is 67.76% and the test set accuracy is 67.37%; the accuracy is only moderate
    2. The training set AUC is 0.791 and the test set AUC is 0.782; the model's discriminative power is only moderate
    3. Logistic regression is not an ideal fit for this data; decision trees, neural networks, naive Bayes, KNN, or other classifiers are worth trying to improve the prediction (a minimal decision-tree sketch follows the figure below)
    (Figure: ROC curves for the training and test sets)
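    As a minimal sketch of that last suggestion (not part of the original post), a CART decision tree could be fitted on the same 60/40 split created in the code below, run right after the split and before the prediction columns lg_p/p/out are added to train and test:

    library(rpart)   # CART decision trees
    library(ROCR)

    tree_fit  <- rpart(bad_ind ~ ., data = train, method = "class")
    tree_p    <- predict(tree_fit, test, type = "prob")[, 2]        # P(bad_ind = 1)
    pred_tree <- prediction(tree_p, test$bad_ind)
    round(as.numeric(performance(pred_tree, "auc")@y.values), 3)    # compare with the 0.782 above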

    Code

    1. Coarse variable screening > 2. Missing-value handling > 3. Fine variable screening > 4. Modeling > 5. Collinearity check and model evaluation

    > # 1. Read the data
    > data<-read.csv("data.csv")
    > data <- data[,c(-1)]   # drop the first column
    
    > # 2. Variable types
    >  # 2.1 Convert categorical variables to factors
    > factor_var = c('bad_ind','bankruptcy_ind','vehicle_make','used_ind')
    > data$bad_ind<-as.factor(data$bad_ind) 
    > data$bankruptcy_ind<-as.factor(data$bankruptcy_ind) 
    > data$vehicle_make<-as.factor(data$vehicle_make) 
    > data$used_ind<-as.factor(data$used_ind) 
    >  # 2.2 Continuous variables stay numeric; none have special formats that need conversion
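    >  # (Sketch, not part of the original session: if any continuous column had
    >  #  been read in as character or factor, it could be coerced to numeric like this.)
    >  # num_var <- setdiff(names(data), factor_var)
    >  # data[num_var] <- lapply(data[num_var], function(x) as.numeric(as.character(x)))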
    > # 3. Check variable types and missingness: there turn out to be many missing values
    > dfexplore::dfplot(data)
    
    (Figure: dfplot of variable types and missing values)
    > # 4. Coarse variable screening with a conditional random forest
    > 
    > library(party)
    > set.seed(123)
    > crf <- cforest(bad_ind~.,controls = cforest_unbiased(mtry = 2,ntree =100),data = data)
    > varIMP = varimp(crf);varIMP = varIMP[order(varIMP,decreasing = T)]
    > barplot(varIMP)
    > abline(h = 0.001,col = 'red')
    
    (Figure: variable importance barplot with the 0.001 cut-off line in red)
    > # Keep the 20 most important variables
    > var = names(varIMP[1:20]) 
    > data1 = data[,var]
    > data1$bad_ind = data$bad_ind 
    > 
    > 
    > # 5. Handle missing values, outliers, and erroneous values
    > 
    > outlier = function(data,var)
    + {
    +   # impute missing values with the variable mean
    +   vmean<-mean(data[,var],na.rm=TRUE)
    +   data[is.na(data[,var]),var]<-vmean
    +   # winsorise: cap values below the 1st percentile and above the 99th percentile
    +   # quantile(x, probs) returns sample quantiles; probs is a vector of probabilities in [0,1]
    +   data[data[,var] < quantile(data[,var],0.01),var] = quantile(data[,var],0.01)
    +   data[data[,var] > quantile(data[,var],0.99),var] = quantile(data[,var],0.99)
    +   return(data[,var])
    + }
    > 
    > var = sapply(data1,class)
    > interval_var = names(var[var != 'factor'])
    > for (i in interval_var){data1[,i] = outlier(data1,i)}
    > 
    > # Check that no missing values remain
    > dfexplore::dfplot(data1)
    
    (Figure: dfplot after treatment; no missing values remain)
    > # Fine variable screening by information value (IV): drop low-information variables whose IV strength is below 'weak'
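    > # For reference, one common convention for these quantities:
    > #   WOE_i = ln( (good_i/good_total) / (bad_i/bad_total) )   for each bin i of a variable
    > #   IV    = sum_i ( good_i/good_total - bad_i/bad_total ) * WOE_i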
    > library(woe)
    > IV <- iv.mult(data1,"bad_ind",TRUE)
    > iv.plot.summary(IV)
    
    (Figure: information value (IV) summary plot)
    > data2 = data1[,setdiff(names(data1),IV[IV$Strength == 'Wery weak',1])]
    > 
    > # Split into training (60%) and test (40%) sets
    > 
    > index = sample(nrow(data2),nrow(data2)*.6)
    > train = data2[index,]
    > test = data2[-index,]
    > 
    > # Logit (logistic regression) modeling
    > 
    > lg<-glm(bad_ind~.,family=binomial(link='logit'),data = train)
    > summary(lg)
    > lg_ms<-step(lg,direction = "both")   # stepwise selection in both directions by AIC
    Step:  AIC=3655.05
    bad_ind ~ fico_score + age_oldest_tr + ltv + tot_rev_line + rev_util + 
        bankruptcy_ind
    
                      Df Deviance    AIC
    <none>                 3639.0 3655.0
    + tot_rev_tr       1   3637.3 3655.3
    + msrp             1   3637.9 3655.9
    + loan_term        1   3638.0 3656.0
    + tot_rev_debt     1   3638.5 3656.5
    + down_pyt         1   3638.6 3656.6
    + tot_derog        1   3638.6 3656.6
    + veh_mileage      1   3638.9 3656.9
    + tot_tr           1   3639.0 3657.0
    - rev_util         1   3644.4 3658.4
    - tot_rev_line     1   3649.5 3663.5
    - age_oldest_tr    1   3670.1 3684.1
    - bankruptcy_ind   2   3693.7 3705.7
    - ltv              1   3742.3 3756.3
    + vehicle_make   133   3498.0 3780.0
    - fico_score       1   4043.0 4057.0
    
    > # Collinearity check: does the lg_ms model suffer from collinearity? It turns out it does not
    > library(car)
    > vif(lg_ms) 
                       GVIF Df GVIF^(1/(2*Df))
    fico_score     1.191609  1        1.091609
    age_oldest_tr  1.201768  1        1.096252
    ltv            1.060458  1        1.029785
    tot_rev_line   1.270180  1        1.127023
    rev_util       1.178712  1        1.085685
    bankruptcy_ind 1.143360  2        1.034060
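    > # (GVIF^(1/(2*Df)))^2 is comparable to an ordinary VIF; all values here are close to 1,
    > # well below the usual 5-10 warning range, so collinearity is not a concern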
    > # Coefficients
    > summary(lg_ms)
    
    Call:
    glm(formula = bad_ind ~ fico_score + age_oldest_tr + ltv + tot_rev_line + 
        rev_util + bankruptcy_ind, family = binomial(link = "logit"), 
        data = train)
    
    Deviance Residuals: 
        Min       1Q   Median       3Q      Max  
    -1.9473  -0.7030  -0.4036  -0.1497   3.0196  
    
    Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
    (Intercept)      9.511e+00  6.754e-01  14.082  < 2e-16 ***
    fico_score      -1.781e-02  9.609e-04 -18.538  < 2e-16 ***
    age_oldest_tr   -3.034e-03  5.595e-04  -5.423 5.85e-08 ***
    ltv              2.787e-02  2.853e-03   9.769  < 2e-16 ***
    tot_rev_line    -1.066e-05  3.395e-06  -3.140  0.00169 ** 
    rev_util         2.398e-03  1.029e-03   2.330  0.01983 *  
    bankruptcy_indN -1.183e+00  1.736e-01  -6.817 9.27e-12 ***
    bankruptcy_indY -1.661e+00  2.265e-01  -7.336 2.20e-13 ***
    ---
    Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    
    (Dispersion parameter for binomial family taken to be 1)
    
        Null deviance: 4466.7  on 4314  degrees of freedom
    Residual deviance: 3639.0  on 4307  degrees of freedom
    AIC: 3655
    
    Number of Fisher Scoring iterations: 5
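    > # Interpreting coefficients: exp(coef) gives odds ratios, e.g. exp(-1.781e-02) ≈ 0.982,
    > # so each extra FICO point multiplies the odds of default by about 0.982, other things equal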
    
    > train$lg_p<-predict(lg_ms, train) 
    > train$p<-(1/(1+exp(-1*train$lg_p)))
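    > # (equivalent to predict(lg_ms, train, type = "response"), which returns probabilities directly)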
    > summary(train$p)
        Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    0.001521 0.062900 0.172000 0.212700 0.323600 0.866600 
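    > # Classify with a 0.2 probability cut-off: out = 1 when p >= 0.2, else 0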
    > train$out<-1
    > train[train$p<0.2,]$out<-0
    > table(train$bad_ind,train$out)
       
           0    1
      0 2202 1195
      1  196  722
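    > # Accuracy from this table: (2202 + 722) / 4315 ≈ 0.6776, matching the rate computed below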
    > 
    > test$lg_p<-predict(lg_ms, test) 
    > test$p<-(1/(1+exp(-1*test$lg_p)))
    > summary(test$p)
        Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    0.002186 0.067500 0.177400 0.214300 0.325300 0.875600 
    > test$out<-1
    > test[test$p<0.2,]$out<-0
    > table(test$bad_ind,test$out)
       
           0    1
      0 1442  809
      1  130  497
    > # Training set accuracy: 67.76%; test set accuracy: 67.37%
    > rate<-sum(train$out==train$bad_ind)/length(train$bad_ind)
    > print(rate)
    [1] 0.6776362
    > 
    > rate2<-sum(test$out==test$bad_ind)/length(test$bad_ind)
    > print(rate2)
    [1] 0.6737318
    > 
    > # Evaluation: ROC curves
    > 
    > library(ROCR)
    > pred_Te <- prediction(test$p, test$bad_ind)
    > perf_Te <- performance(pred_Te,"tpr","fpr")
    > pred_Tr <- prediction(train$p, train$bad_ind)
    > perf_Tr <- performance(pred_Tr,"tpr","fpr")
    > plot(perf_Te, col='blue',lty=1);
    > plot(perf_Tr, col='black',lty=2,add=TRUE);
    > abline(0,1,lty=2,col='red')
    > 
    > lr_m_auc<-round(as.numeric(performance(pred_Tr,'auc')@y.values),3)
    > lr_m_str<-paste("Mode_Train-AUC:",lr_m_auc,sep="")
    > legend(0.3,0.4,c(lr_m_str),2:8)
    > 
    > lr_m_auc<-round(as.numeric(performance(pred_Te,'auc')@y.values),3)
    > lr_m_ste<-paste("Mode_Test-AUC:",lr_m_auc,sep="")
    > legend(0.3,0.2,c(lr_m_ste),2:8)
    
    
    (Figure: ROC curves, test set in blue (solid) and training set in black (dashed), with AUC legends)

    Reference: the CDA《信用风险建模》(Credit Risk Modeling) micro-specialization course
