Recursive Feature Elimination (RFE)

Author: 北欧森林 | Published 2021-02-17 23:56

These are my notes from studying RFE; the original post is at http://xccds1977.blogspot.com/2011/09/caret.html

RFE_demo

NordicForrest
2/17/2021
library(caret)

data(mdrr) # molecular descriptor data from caret: mdrrDescr (predictors) and mdrrClass (Active/Inactive labels)
mdrrDescr[1:4, 1:4] # peek at the first four rows and columns
##                    MW  AMW    Sv    Se
## METHOPROMAZINE 314.49 7.15 27.56 43.50
## ACEPROMAZINE   326.50 7.26 28.56 44.50
## TRIMEPRAZINE   298.49 6.94 27.05 42.17
## PROMETHAZINE   284.46 7.11 25.46 39.28
head(mdrrClass)
## [1] Active   Active   Inactive Active   Active   Active  
## Levels: Active Inactive
1. Data preprocessing
zerovar = nearZeroVar(mdrrDescr) # indices of near-zero-variance predictors
newdata1 = mdrrDescr[, -zerovar] # drop them
descrCorr = cor(newdata1)
highCorr = findCorrelation(descrCorr, 0.90) # predictors with pairwise correlation above 0.90
newdata2 = newdata1[, -highCorr]
# comboInfo = findLinearCombos(newdata2)
# newdata2 = newdata2[, -comboInfo$remove] # not applicable to this example; running it throws an error
Process = preProcess(newdata2) # defaults to centering and scaling
newdata3 = predict(Process, newdata2) # standardized predictors
inTrain = createDataPartition(mdrrClass, p = 3/4, list = FALSE)

trainx = newdata3[inTrain,]
testx = newdata3[-inTrain,]
trainy = mdrrClass[inTrain]
testy = mdrrClass[-inTrain]
featurePlot(trainx[, 1:2], trainy, plot = 'box') # inspect the first two features
[Figure: box plots of the first two predictors, grouped by Active/Inactive class]
2. Feature selection
subsets = c(20,30,40,50,60,70,80)

ctrl = rfeControl(functions = rfFuncs, method = "cv", verbose = FALSE, returnResamp = "final")

This defines the control parameters. functions specifies the model used to rank the predictors; this example uses random forests (rfFuncs). Other options are lmFuncs (linear regression), nbFuncs (naive Bayes), treebagFuncs (bagged trees), and caretFuncs (a user-defined model fitted via train). method specifies the resampling scheme; this example uses cv (cross-validation), and boot (bootstrap) as well as LOOCV (leave-one-out cross-validation) are also available. A sketch of two alternative configurations follows.
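For illustration only, a minimal sketch of two alternative configurations (settings assumed, not taken from the original post):

ctrl_nb = rfeControl(functions = nbFuncs, method = "boot", number = 25) # naive Bayes ranking with 25 bootstrap resamples
ctrl_bag = rfeControl(functions = treebagFuncs, method = "LOOCV") # bagged trees with leave-one-out CV (slow on data this size)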

Profile = rfe(newdata3, mdrrClass, sizes = subsets, rfeControl = ctrl) # this step is time-consuming!

print(Profile)
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##         20   0.8258 0.6418    0.05387 0.11260         
##         30   0.8333 0.6570    0.05095 0.10598         
##         40   0.8295 0.6488    0.05046 0.10573         
##         50   0.8352 0.6607    0.05781 0.12053         
##         60   0.8295 0.6481    0.06109 0.12861         
##         70   0.8333 0.6568    0.03554 0.07423         
##         80   0.8353 0.6612    0.03183 0.06732         
##         94   0.8372 0.6642    0.04454 0.09277        *
## 
## The top 5 variables (out of 94):
##    nC, CIC0, X5v, IVDM, Wap
plot(Profile)
[Figure: cross-validated accuracy versus number of selected variables]
Profile$optVariables # the predictors retained in the final model
##  [1] "nC"       "CIC0"     "X5v"      "IVDM"     "Wap"      "IDDE"    
##  [7] "D.Dr06"   "CIC1"     "BIC0"     "VEA2"     "nAB"      "QZZp"    
## [13] "QXXm"     "Xt"       "VRA1"     "IC3"      "PCD"      "G1"      
## [19] "MPC09"    "Yindex"   "nBnz"     "VEA1"     "G.N..O."  "H3D"     
## [25] "DISPm"    "BIC1"     "Jhetm"    "Jhetp"    "SPAM"     "Ms"      
## [31] "DISPe"    "DISPv"    "TIE"      "BLI"      "TI2"      "PCR"     
## [37] "PW3"      "X0Av"     "T.O..O."  "nR07"     "Rww"      "piPC10"  
## [43] "Ram"      "nCIR"     "X3A"      "IC2"      "IC0"      "X5Av"    
## [49] "Mp"       "X2A"      "MAXDN"    "Lop"      "PW4"      "G.N..N." 
## [55] "nR10"     "MSD"      "PW5"      "nO"       "SEigp"    "Me"      
## [61] "SPI"      "FDI"      "J3D"      "ASP"      "X1A"      "PJI3"    
## [67] "SPH"      "nCIC"     "L.Bw"     "BIC2"     "BIC3"     "MAXDP"   
## [73] "RBN"      "HNar"     "BAC"      "BIC5"     "AMW"      "G.N..F." 
## [79] "IVDE"     "nR06"     "SEigm"    "nX"       "RBF"      "PW2"     
## [85] "nDB"      "nF"       "nS"       "PJI2"     "nR09"     "G.N..Cl."
## [91] "nCL"      "nR05"     "G.N..S."  "G.O..Cl."
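The same list can also be retrieved through caret's predictors() generic, which for an rfe object returns the selected variables:

predictors(Profile)[1:5] # equivalent to Profile$optVariables; these are the top 5 shown by print(Profile)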
3. Model building and parameter tuning
newdata4=newdata3[,Profile$optVariables]

inTrain = createDataPartition(mdrrClass, p = 3/4, list = FALSE)
trainx = newdata4[inTrain,]
testx = newdata4[-inTrain,]
trainy = mdrrClass[inTrain]
testy = mdrrClass[-inTrain]
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 3,returnResamp = "all")
gbmGrid = expand.grid(interaction.depth = c(1, 3),
                      n.trees = c(50, 100, 150, 200, 250, 300),
                      shrinkage = 0.1,
                      n.minobsinnode = 5) # the original post did not include this parameter, so its figure cannot be reproduced exactly

Tip: a random seed should probably be set here so that the data split and the resampling are reproducible; a minimal sketch follows.
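A minimal sketch (the seed value 2021 is arbitrary, not from the original post):

set.seed(2021) # fix the RNG so createDataPartition and the repeated-CV folds give the same result on every run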

gbmFit1 = train(trainx, trainy, method = "gbm", trControl = fitControl, tuneGrid = gbmGrid, verbose = FALSE)

plot(gbmFit1)
[Figure: resampled accuracy of the gbm model across n.trees, for interaction.depth 1 and 3]
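The combination that train settled on can be read directly from the fitted object (bestTune is a standard component of a caret train object):

gbmFit1$bestTune # the grid row with the best resampled accuracy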
4. Model prediction and evaluation
predict(gbmFit1, newdata = testx)[1:5]
## [1] Active Active Active Active Active
## Levels: Active Inactive
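Besides hard class labels, predict can also return class probabilities by adding type = "prob":

head(predict(gbmFit1, newdata = testx, type = "prob")) # per-compound Active/Inactive probabilities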
gbmFit2 = train(trainx, trainy, method = "treebag", trControl = fitControl)
models = list(gbmFit1, gbmFit2)
predValues = extractPrediction(models,testX = testx, testY = testy)
head(predValues)
##        obs     pred model dataType  object
## 1   Active   Active   gbm Training Object1
## 2 Inactive Inactive   gbm Training Object1
## 3   Active   Active   gbm Training Object1
## 4   Active   Active   gbm Training Object1
## 5   Active   Active   gbm Training Object1
## 6   Active   Active   gbm Training Object1
testValues = subset(predValues, dataType == "Test")
probValues = extractProb(models,testX = testx, testY = testy)
testProbs = subset(probValues, dataType == "Test")
Pred1 = subset(testValues, model == "gbm")
Pred2 = subset(testValues, model == "treebag")
confusionMatrix(Pred1$pred, Pred1$obs)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Active Inactive
##   Active       66       13
##   Inactive      8       44
##                                           
##                Accuracy : 0.8397          
##                  95% CI : (0.7655, 0.8979)
##     No Information Rate : 0.5649          
##     P-Value [Acc > NIR] : 1.841e-11       
##                                           
##                   Kappa : 0.6706          
##                                           
##  Mcnemar's Test P-Value : 0.3827          
##                                           
##             Sensitivity : 0.8919          
##             Specificity : 0.7719          
##          Pos Pred Value : 0.8354          
##          Neg Pred Value : 0.8462          
##              Prevalence : 0.5649          
##          Detection Rate : 0.5038          
##    Detection Prevalence : 0.6031          
##       Balanced Accuracy : 0.8319          
##                                           
##        'Positive' Class : Active          
confusionMatrix(Pred2$pred, Pred2$obs)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Active Inactive
##   Active       67       20
##   Inactive      7       37
##                                           
##                Accuracy : 0.7939          
##                  95% CI : (0.7145, 0.8596)
##     No Information Rate : 0.5649          
##     P-Value [Acc > NIR] : 3.134e-08       
##                                           
##                   Kappa : 0.5694          
##                                           
##  Mcnemar's Test P-Value : 0.02092         
##                                           
##             Sensitivity : 0.9054          
##             Specificity : 0.6491          
##          Pos Pred Value : 0.7701          
##          Neg Pred Value : 0.8409          
##              Prevalence : 0.5649          
##          Detection Rate : 0.5115          
##    Detection Prevalence : 0.6641          
##       Balanced Accuracy : 0.7773          
##                                           
##        'Positive' Class : Active          
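Because both models were fitted with the same fitControl, their resampling distributions can also be compared with caret's resamples(); a minimal sketch (a strictly paired comparison requires the two fits to share resampling indices, e.g. by calling set.seed before each train):

resamps = resamples(list(gbm = gbmFit1, treebag = gbmFit2))
summary(resamps) # side-by-side Accuracy and Kappa over the 30 resamples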
prob1 = subset(testProbs, model == "gbm")
prob2 = subset(testProbs, model == "treebag")
library(ROCR)
prob1$label = ifelse(prob1$obs == 'Active', yes = 1, no = 0) # numeric label: 1 = Active, 0 = Inactive
pred1 = prediction(prob1$Active, prob1$label)
perf1 = performance(pred1, measure = "tpr", x.measure = "fpr")
plot(perf1)
[Figure: ROC curve for the gbm model on the test set]
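The area under this ROC curve can be computed from the same ROCR objects:

auc1 = performance(pred1, measure = "auc")
auc1@y.values[[1]] # AUC of the gbm model on the test set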
