Chapter 7: Model Evaluation

Author: zd200572 | Published 2021-11-27 21:05

    7.2 Evaluating model performance with k-fold cross-validation

    This method helps guard against overfitting: the data are split into k folds, each fold serves once as the test set while the remaining k-1 folds train the model, and the k accuracy scores are then combined.

    library(modeldata)
    library(e1071)
    data(mlc_churn)
    churnTrain <- mlc_churn
    # assign each row to one of 10 folds (here in row order; see the randomized version below)
    ind <- cut(1:nrow(churnTrain), breaks = 10, labels = FALSE)
    accuracies <- c()
    for (i in 1:10) {
      # train an SVM on the nine folds outside fold i
      fit <- svm(churn~., churnTrain[ind!=i,])
      # predict fold i, excluding the response column
      predictions <- predict(fit, churnTrain[ind==i, !names(churnTrain) %in% c("churn")])
      correct_count <- sum(as.data.frame(predictions) == churnTrain[ind==i, c("churn")])
      accuracies <- append(accuracies, correct_count/nrow(churnTrain[ind==i,]))
    }
    accuracies
    [1] 0.920 0.874 0.906 0.902 0.874 0.890 0.884 0.904 0.922 0.902
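
    The cut(1:nrow(churnTrain), ...) call above assigns folds in row order, which can bias the estimate if the rows are sorted. A minimal sketch of a randomized fold assignment (a drop-in replacement for the ind line above):

    # shuffle the fold labels so each row lands in a random fold
    set.seed(1)
    ind <- sample(cut(1:nrow(churnTrain), breaks = 10, labels = FALSE))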
    

    7.3 Cross-validation with the e1071 package

    # cross-validation with e1071
    library(e1071)
    churnTrain <- mlc_churn[,!names(mlc_churn) %in% c('state',
                                              'area_code', "account_length")]
    set.seed(2)
    # random 70/30 train/test split
    ind <- sample(2, nrow(churnTrain), replace = TRUE,
                  prob = c(0.7,0.3))
    trainset <- churnTrain[ind==1,]
    testset <- churnTrain[ind==2,]
    # tune a radial-kernel SVM, assessed with 10-fold cross-validation
    tuned <- tune.svm(churn~., data = trainset, gamma = 10^-2,
                      cost = 10^2, tunecontrol = tune.control(cross = 10))
    summary(tuned)
    # Error estimation of ‘svm’ using 10-fold cross validation: 0.07473914
    tuned$performances
      gamma cost      error dispersion
    1  0.01  100 0.07473914 0.01605983
    svmfit <- tuned$best.model
    table(trainset$churn, predict(svmfit))
           yes   no
      yes  400   93
      no    14 2972
    

    The tune function performs parameter optimization via grid search; see ?tune for the other tuning functions.
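
    Only a single gamma/cost pair was tried above. A minimal sketch of an actual grid search (the ranges are illustrative, not from the original):

    # search a 3x3 grid of gamma and cost values, each scored by 10-fold CV
    tuned_grid <- tune.svm(churn~., data = trainset,
                           gamma = 10^(-3:-1), cost = 10^(1:3),
                           tunecontrol = tune.control(cross = 10))
    summary(tuned_grid)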

    7.4 Cross-validation with the caret package

    # caret
    library(caret)
    # 10-fold cross-validation repeated 3 times
    control <- trainControl(method = "repeatedcv",number = 10,
                            repeats = 3)
    model <- train(churn~.,data = trainset, method = "rpart",
                   preProcess="scale", trControl=control)
    model
    CART 
    
    3479 samples
      19 predictor
       2 classes: 'yes', 'no' 
    
    Pre-processing: scaled (69) 
    Resampling: Cross-Validated (10 fold, repeated 3 times) 
    Summary of sample sizes: 3131, 3132, 3130, 3130, 3131, 3132, ... 
    Resampling results across tuning parameters:
    
      cp          Accuracy   Kappa    
      0.04056795  0.9301563  0.6720876
      0.05578093  0.9147378  0.5774005
      0.07505071  0.8731365  0.2416613
    
    Accuracy was used to select the optimal model using the largest value.
    The final value used for the model was cp = 0.04056795.
    

    trainControl sets the resampling scheme; method can be boot, boot632, cv, repeatedcv, LOOCV, LGOCV, none, oob, adaptive_cv, adaptive_boot, adaptive_LGOCV, and so on.
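
    For example, a bootstrap control looks like this (a minimal sketch; the resample count is illustrative):

    # 25 bootstrap resamples instead of repeated cross-validation
    boot_control <- trainControl(method = "boot", number = 25)
    boot_model <- train(churn~., data = trainset, method = "rpart",
                        trControl = boot_control)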

    7.5 Ranking variable importance with caret

    Once a supervised learning model has been fitted, the importance of each feature can be assessed by perturbing its input values and measuring how sensitive the model's output is to the change.

    # rank variable importance
    importance <- varImp(model, scale = FALSE)
    importance
    rpart variable importance
    
      only 20 most important variables shown (out of 69)
    
                                  Overall
    international_planyes         196.919
    total_day_minutes             172.870
    total_day_charge              161.582
    number_customer_service_calls 155.274
    total_intl_minutes            133.221
    total_intl_charge             124.817
    total_intl_calls               48.700
    voice_mail_planyes             40.912
    total_eve_charge               31.116
    total_eve_minutes              31.116
    ...
    plot(importance)
    
    Finally, a plot!
    Extension
    Model objects produced by some classification packages, such as rpart, already contain variable importance scores that can be inspected and printed directly.
    library(rpart)
    model.rp <- rpart(churn~.,data = trainset)
    model.rp$variable.importance
                 total_day_charge             total_day_minutes                         state 
                       161.951011                    161.951011                     85.145010 
               total_intl_minutes number_customer_service_calls             total_intl_charge 
                        80.504234                     76.323349                     75.141041 
    ...
    

    7.6 Ranking variable importance with the rminer package

    This one took quite some effort; it appears to work only with numeric variables.

    # rminer
    install.packages("rminer")
    library(rminer)
    model <- fit(Species~., iris,model = "svm")
    variable.importance <- Importance(model,iris,method = "sensv")
    L=list(runs=1,sen=t(variable.importance$imp),sresponses=variable.importance$sresponses)
    mgraph(L,graph = "IMP",leg = names(iris),col = "gray", Grid = 4)
    

    7.7 Finding highly correlated features with caret

    Remove the non-numeric attributes, compute a correlation matrix, and, with the cutoff set to 0.75, pick out the highly correlated attributes.

    # findCorrelation
    new_train <- trainset[,!names(trainset) %in% c('churn',
                                         'international_plan', "voice_mail_plan")]
    cor_mat <- cor(new_train)
    highlyCorrelated <- findCorrelation(cor_mat, cutoff = 0.75)
    names(new_train)[highlyCorrelated]
    [1] "total_night_minutes" "total_intl_charge"   "total_day_charge"    "total_eve_charge"  
    

    The leaps, genetic, and anneal functions in the subselect package can achieve the same effect.
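
    As a follow-up sketch, the indices returned by findCorrelation can be used to drop those columns before modeling:

    # remove the highly correlated columns from the numeric training data
    new_train_filtered <- new_train[, -highlyCorrelated]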

    7.8 Selecting features with caret

    Feature selection finds the subset of attributes that minimizes prediction error, helping to judge which features are actually needed to build an accurate model. The recursive feature elimination function rfe selects a qualifying feature subset automatically.

    # feature selection with caret
    library(modeldata)
    library(caret)
    data(mlc_churn)
    churnTrain <- mlc_churn
    ind <- sample(2,nrow(churnTrain),replace = TRUE,
                  prob = c(0.7,0.3))
    trainset <- churnTrain[ind==1,]
    testset <- churnTrain[ind==2,]
    # feature transformation: turn the yes/no factors into binary indicator columns
    intl_plan <- model.matrix(~ trainset.international_plan - 1,
                              data = data.frame(trainset$international_plan))
    colnames(intl_plan) <- c("trainset.international_planno"="intl_no",
                             "trainset.international_planyes"="intl_yes")
    
    voice_plan <- model.matrix(~ trainset.voice_mail_plan - 1,
                               data = data.frame(trainset$voice_mail_plan))
    colnames(voice_plan) <- c("trainset.voice_mail_planno"="voice_no",
                              "trainset.voice_mail_planyes"="voice_yes")
    trainset$international_plan <- NULL
    trainset$voice_mail_plan <- NULL
    trainset <- cbind(intl_plan, voice_plan,trainset)
    # apply the same transformation to testset
    intl_plan <- model.matrix(~ testset.international_plan - 1,
                              data = data.frame(testset$international_plan))
    colnames(intl_plan) <- c("testset.international_planno"="intl_no",
                             "testset.international_planyes"="intl_yes")
    
    voice_plan <- model.matrix(~ testset.voice_mail_plan - 1,
                               data = data.frame(testset$voice_mail_plan))
    colnames(voice_plan) <- c("testset.voice_mail_planno"="voice_no",
                              "testset.voice_mail_planyes"="voice_yes")
    testset$international_plan <- NULL
    testset$voice_mail_plan <- NULL
    testset <- cbind(intl_plan, voice_plan,testset)
    # build a feature-selection control using linear discriminant analysis, resampled with cv
    ldaControl <- rfeControl(functions = ldaFuncs, method = "cv")
    # recursive (backward) feature elimination
    ldaProfile <- rfe(trainset[,names(trainset)!='churn'][,-c(5,6,7)],
                      trainset[,'churn',drop=TRUE], sizes = c(1:18), rfeControl = ldaControl)
    ldaProfile
    
    Recursive feature selection
    
    Outer resampling method: Cross-Validated (10 fold) 
    
    Resampling performance over subset size:
    
     Variables Accuracy   Kappa AccuracySD KappaSD Selected
             1   0.8554 0.02245   0.009841 0.06008         
             2   0.8554 0.02245   0.009841 0.06008         
             3   0.8494 0.23064   0.013363 0.10679         
    ...
    plot(ldaProfile, type = c("o","g"))
    # the best variable subset
    ldaProfile$optVariables
    # the fitted model
    ldaProfile$fit
    Call:
    lda(x, y)
    
    Prior probabilities of groups:
          yes        no 
    0.1417074 0.8582926 
    
    Group means:
    ...
    postResample(predict(ldaProfile, testset[,names(testset)!="churn"]), testset[,c("churn"),drop=TRUE])
     Accuracy     Kappa 
    0.8520710 0.2523709 
    
    Time for another plot.

    7.9 Evaluating regression model performance

    Root mean square error (RMSE), relative squared error (RSE), and the coefficient of determination (R-squared): RMSE = sqrt(mean((predicted - actual)^2)), RSE = mean((predicted - actual)^2) / mean((mean(actual) - actual)^2), and R-squared = 1 - RSE.

    # evaluating regression model performance
    library(car)
    data("Quartet")
    plot(Quartet$x,Quartet$y3)
    lmfit <- lm(y3~x, data = Quartet)
    abline(lmfit, col="red")
    predicted <- predict(lmfit,newdata = Quartet[c("x")])
    actual <- Quartet$y3
    # RMSE here comes from the caret package; that package is a treasure chest with everything in it
    rmse <- RMSE(predicted, actual)
    mu <- mean(actual)
    rse <- mean((predicted-actual)^2)/mean((mu-actual)^2)
    Rsquare <- 1-rse
    c(rmse,rse,Rsquare)
    [1] 1.118286 0.333676 0.666324
    # recompute with robust regression (MASS::rlm)
    library(MASS)
    plot(Quartet$x,Quartet$y3)
    rlmfit <- rlm(y3~x, data = Quartet)
    abline(rlmfit,col='red')
    predicted <- predict(rlmfit, newdata = Quartet['x'])
    rmse <- (mean((predicted-actual)^2))^0.5
    rse <- mean((predicted-actual)^2)/mean((mu-actual)^2)
    rsquare <- 1-rse
    c(rmse,rse,rsquare)
    [1] 1.2790448 0.4365067 0.5634933
    
    The same old plot again.
    The advantage of rlm: it is robust to outliers.
    Extension
    library(e1071)
    tune(lm,y3~x,data = Quartet)
    Error estimation of ‘lm’ using 10-fold cross validation: 2.351897
    

    caret's train function can do the same cross-validation, as can the cv.lm function in the DAAG package.
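
    A minimal sketch of the caret route, assuming the Quartet data from above (the fold count is illustrative):

    # cross-validated linear model via caret's train
    lm_cv <- train(y3 ~ x, data = Quartet, method = "lm",
                   trControl = trainControl(method = "cv", number = 10))
    lm_cv$results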

    7.10 Evaluating prediction performance with a confusion matrix

    A confusion matrix yields performance metrics such as precision, recall, specificity, and accuracy.

    # confusion matrix
    svm.model <- train(churn~., data=trainset,method = "svmRadial")
    svm.pred <- predict(svm.model, testset[,names(testset)!="churn"])
    table(svm.pred, testset[,"churn",drop=TRUE])
    svm.pred  yes   no
         yes   12    1
         no   198 1290
    confusionMatrix(svm.pred,testset[,"churn",drop=TRUE])
    Confusion Matrix and Statistics
    
              Reference
    Prediction  yes   no
           yes   12    1
           no   198 1290
                                              
                   Accuracy : 0.8674          
                     95% CI : (0.8492, 0.8842)
        No Information Rate : 0.8601          
        P-Value [Acc > NIR] : 0.2183          
                                              
                      Kappa : 0.0928   
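
    The full confusionMatrix output (truncated above) also reports sensitivity, specificity, and related statistics; they can just as well be computed by hand. A minimal sketch using the counts from the table above:

    # from the table: TP=12, FP=1, FN=198, TN=1290, with "yes" as the positive class
    precision   <- 12/(12+1)       # 0.923
    recall      <- 12/(12+198)     # 0.057 (sensitivity)
    specificity <- 1290/(1290+1)   # 0.999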
    

    7.11 Evaluating prediction performance with ROCR

    The receiver operating characteristic (ROC) curve is a common way to visualize the performance of a binary classifier: it plots the true positive rate against the false positive rate at different cutoffs. Classification performance is usually summarized by the area under the curve (AUC).

    install.packages("ROCR")
    library(ROCR)
    svmfit <- svm(churn~.,data = trainset, prob = TRUE)
    pred <- predict(svmfit, testset[,names(testset)!="churn"],probability = TRUE)
    # probability of the positive class ("yes"); column 2 here, check colnames(pred.prob) to confirm
    pred.prob <- attr(pred,"probabilities")
    pred.to.roc <- pred.prob[, 2]
    # build a ROCR prediction object
    pred.rocr <- prediction(pred.to.roc, testset$churn)
    # performance evaluation: AUC and the TPR/FPR curve
    perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
    perf.tpr.rocr <- performance(pred.rocr, "tpr", "fpr")
    plot(perf.tpr.rocr, colorize = T, main = paste("AUC:", (perf.rocr@y.values)))
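
    Since perf.rocr@y.values is a list, the plot title can be made tidier by unlisting it first (a minor variation on the line above):

    # extract the AUC as a plain number and round it for the title
    auc <- unlist(perf.rocr@y.values)
    plot(perf.tpr.rocr, colorize = TRUE, main = paste("AUC:", round(auc, 4)))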
    

    7.12 Comparing ROC curves with the caret package

    install.packages("pROC")
    library(pROC)
    # 10-fold cross-validation repeated 3 times, keeping class probabilities
    control <- trainControl(method = "repeatedcv", number = 10,
                            repeats = 3,classProbs = TRUE,
                            summaryFunction = twoClassSummary)
    # glm classifier
    glm.model <- train(churn ~., data = trainset,method = "glm",
                       metric = "ROC", trControl = control)
    # svm classifier
    svm.model <- train(churn ~., data = trainset, method = "svmRadial",
                       metric = "ROC", trControl = control)
    # rpart classifier
    rpart.model <- train(churn ~., data = trainset, method = "rpart",
                         metric = "ROC", trControl = control)
    # predict class probabilities with each model
    glm.probs <- predict(glm.model, testset[,names(testset) != "churn"], type = "prob")
    svm.probs <- predict(svm.model, testset[,names(testset) != "churn"], type = "prob")
    rpart.probs <- predict(rpart.model, testset[,names(testset) != "churn"], type = "prob")
    # draw the ROC curves on a single plot
    glm.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
                   predictor = glm.probs$yes,
                   levels = levels(testset[,"churn",drop=TRUE]))
    plot(glm.ROC, type = "S", col = 'red')
    
    svm.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
                   predictor = svm.probs$yes,
                   levels = levels(testset[,"churn",drop=TRUE]))
    plot(svm.ROC, add = TRUE, col = 'green')
    rpart.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
                   predictor = rpart.probs$yes,
                   levels = levels(testset[,"churn",drop=TRUE]))
    plot(rpart.ROC, add = TRUE, col = 'blue')
    

    I was stuck here for a long time with the error Error in colnames(data) : argument "data" is missing, with no default. It turned out to be caused by tibbles not dropping dimensions by default when a single column is selected; adding drop=TRUE fixed it.
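
    A minimal sketch of the difference, using the mlc_churn tibble from earlier:

    # a tibble keeps a single-column subset as a tibble...
    class(mlc_churn[, "churn"])              # "tbl_df" "tbl" "data.frame"
    # ...while drop = TRUE returns the bare factor, as base data frames do
    class(mlc_churn[, "churn", drop = TRUE]) # "factor"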

    7.13 Comparing model performance differences with caret

    # resample the fitted models
    cv.values <- resamples(list(glm=glm.model, svm=svm.model, rpart = rpart.model))
    summary(cv.values)
    Call:
    summary.resamples(object = cv.values)
    
    Models: glm, svm, rpart 
    Number of resamples: 30 
    
    ROC 
               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
    glm   0.7483007 0.7977494 0.8187351 0.8200275 0.8357939 0.8953608    0
    svm   0.8356721 0.8594691 0.8722842 0.8778421 0.9018812 0.9184929    0
    rpart 0.6446078 0.7146965 0.7666979 0.7773052 0.8459407 0.9173395    0
    
    Sens 
               Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
    glm   0.1600000 0.2088235 0.2672549 0.2667320 0.3184314 0.40    0
    svm   0.2600000 0.3879412 0.4454902 0.4460654 0.5049020 0.66    0
    rpart 0.2352941 0.3800000 0.4411765 0.4622876 0.5637255 0.76    0
    
    Spec 
               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
    glm   0.9509804 0.9639639 0.9673203 0.9666242 0.9705882 0.9771242    0
    svm   0.9346405 0.9542484 0.9672131 0.9641094 0.9729749 0.9836601    0
    rpart 0.9705882 0.9836601 0.9901639 0.9885492 0.9934641 1.0000000    0
    dotplot(cv.values, metric = "ROC")
    bwplot(cv.values, layout=c(3,1))
    


    Extension
    The densityplot, splom, and xyplot functions can also be used for visualization.
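
    For example, a scatterplot matrix of the resampled ROC values (a minimal sketch; caret provides these lattice methods for resamples objects):

    # pairwise scatterplots of the three models' resampled ROC values
    splom(cv.values, metric = "ROC")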
