The following are my notes from working through RFE (recursive feature elimination) in caret; the original post is at: http://xccds1977.blogspot.com/2011/09/caret.html
RFE_demo
NordicForrest
2/17/2021
library(caret)
data(mdrr)
head(mdrrDescr[1:4,1:4])
## MW AMW Sv Se
## METHOPROMAZINE 314.49 7.15 27.56 43.50
## ACEPROMAZINE 326.50 7.26 28.56 44.50
## TRIMEPRAZINE 298.49 6.94 27.05 42.17
## PROMETHAZINE 284.46 7.11 25.46 39.28
head(mdrrClass)
## [1] Active Active Inactive Active Active Active
## Levels: Active Inactive
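Before preprocessing, it is worth a quick look at the class balance (a one-line base-R check, not in the original post):

table(mdrrClass)  # counts of Active vs. Inactive compounds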
1. Data preprocessing
zerovar = nearZeroVar(mdrrDescr)             # drop near-zero-variance predictors
newdata1 = mdrrDescr[, -zerovar]
descrCorr = cor(newdata1)
highCorr = findCorrelation(descrCorr, 0.90)  # drop predictors with pairwise correlation > 0.90
newdata2 = newdata1[, -highCorr]
# comboInfo = findLinearCombos(newdata2)
# newdata2 = newdata2[, -comboInfo$remove]   # not applicable to this example; running it throws an error
Process = preProcess(newdata2)               # defaults to centering and scaling
newdata3 = predict(Process, newdata2)        # standardized predictors
inTrain = createDataPartition(mdrrClass, p = 3/4, list = FALSE)
trainx = newdata3[inTrain,]
testx = newdata3[-inTrain,]
trainy = mdrrClass[inTrain]
testy = mdrrClass[-inTrain]
featurePlot(trainx[, 1:2], trainy, plot = 'box') # inspect the first two predictors
![box plots of the first two predictors by class](https://img.haomeiwen.com/i24502658/d96f7b22ed456f99.png)
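As a quick sanity check (a minimal sketch not in the original post; it assumes the objects from the chunk above), the standardized predictors should now have mean ≈ 0 and standard deviation ≈ 1:

round(range(colMeans(newdata3)), 3)       # all column means should be ~0
round(range(apply(newdata3, 2, sd)), 3)   # all column sds should be ~1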
2. Feature selection
subsets = c(20,30,40,50,60,70,80)
ctrl= rfeControl(functions = rfFuncs, method = "cv",verbose = FALSE, returnResamp = "final")
Define the control parameters. `functions` determines which model is used to rank the predictors; here we use random forests, i.e. rfFuncs. Other built-in options are lmFuncs (linear regression), nbFuncs (naive Bayes), treebagFuncs (bagged trees), and caretFuncs (a custom model tuned through caret's own train interface). `method` sets the resampling scheme; here we use cv, i.e. cross-validation. Bootstrapping (boot) and leave-one-out cross-validation (LOOCV) are also available.
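For illustration, a control object that instead ranks predictors with bagged trees under 25 bootstrap resamples might look like this (a hedged sketch, not run here; `number` sets the resample count):

ctrl_boot = rfeControl(functions = treebagFuncs, method = "boot",
                       number = 25, verbose = FALSE)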
Profile = rfe(newdata3, mdrrClass, sizes = subsets, rfeControl = ctrl) # time-consuming!
print(Profile)
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 20 0.8258 0.6418 0.05387 0.11260
## 30 0.8333 0.6570 0.05095 0.10598
## 40 0.8295 0.6488 0.05046 0.10573
## 50 0.8352 0.6607 0.05781 0.12053
## 60 0.8295 0.6481 0.06109 0.12861
## 70 0.8333 0.6568 0.03554 0.07423
## 80 0.8353 0.6612 0.03183 0.06732
## 94 0.8372 0.6642 0.04454 0.09277 *
##
## The top 5 variables (out of 94):
## nC, CIC0, X5v, IVDM, Wap
plot(Profile)
![RFE cross-validated accuracy vs. subset size](https://img.haomeiwen.com/i24502658/b80b8d1b3a0ae444.png)
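The accuracy curve is nearly flat across subset sizes, so a much smaller subset may do almost as well. caret ships a helper for exactly this trade-off (a sketch, not in the original post; `tol` is the percent loss in accuracy tolerated relative to the best size):

# smallest subset size whose accuracy is within 1% of the best
pickSizeTolerance(Profile$results, metric = "Accuracy", tol = 1, maximize = TRUE)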
Profile$optVariables # the predictors retained in the final model
## [1] "nC" "CIC0" "X5v" "IVDM" "Wap" "IDDE"
## [7] "D.Dr06" "CIC1" "BIC0" "VEA2" "nAB" "QZZp"
## [13] "QXXm" "Xt" "VRA1" "IC3" "PCD" "G1"
## [19] "MPC09" "Yindex" "nBnz" "VEA1" "G.N..O." "H3D"
## [25] "DISPm" "BIC1" "Jhetm" "Jhetp" "SPAM" "Ms"
## [31] "DISPe" "DISPv" "TIE" "BLI" "TI2" "PCR"
## [37] "PW3" "X0Av" "T.O..O." "nR07" "Rww" "piPC10"
## [43] "Ram" "nCIR" "X3A" "IC2" "IC0" "X5Av"
## [49] "Mp" "X2A" "MAXDN" "Lop" "PW4" "G.N..N."
## [55] "nR10" "MSD" "PW5" "nO" "SEigp" "Me"
## [61] "SPI" "FDI" "J3D" "ASP" "X1A" "PJI3"
## [67] "SPH" "nCIC" "L.Bw" "BIC2" "BIC3" "MAXDP"
## [73] "RBN" "HNar" "BAC" "BIC5" "AMW" "G.N..F."
## [79] "IVDE" "nR06" "SEigm" "nX" "RBF" "PW2"
## [85] "nDB" "nF" "nS" "PJI2" "nR09" "G.N..Cl."
## [91] "nCL" "nR05" "G.N..S." "G.O..Cl."
3. Model building and parameter tuning
newdata4=newdata3[,Profile$optVariables]
inTrain = createDataPartition(mdrrClass, p = 3/4, list = FALSE) # note: a fresh random partition, independent of the one in section 1
trainx = newdata4[inTrain,]
testx = newdata4[-inTrain,]
trainy = mdrrClass[inTrain]
testy = mdrrClass[-inTrain]
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 3, returnResamp = "all")
gbmGrid = expand.grid(interaction.depth = c(1, 3),
                      n.trees = c(50, 100, 150, 200, 250, 300),
                      shrinkage = 0.1,
                      n.minobsinnode = 5) # the original post did not supply this last argument, so its figure cannot be reproduced exactly
Tip: a random seed should probably be set here so that the repeated cross-validation folds, and hence the tuning results, are reproducible.
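A minimal way to do that (the seed value 42 is arbitrary):

set.seed(42)  # fixes the fold assignments generated inside train()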
gbmFit1 = train(trainx,trainy,method = "gbm",trControl = fitControl,tuneGrid = gbmGrid,verbose = FALSE)
plot(gbmFit1)
![gbm tuning results: accuracy vs. number of trees and interaction depth](https://img.haomeiwen.com/i24502658/8b3bd3ea75c1499a.png)
4. Model prediction and evaluation
predict(gbmFit1, newdata = testx)[1:5]
## [1] Active Active Active Active Active
## Levels: Active Inactive
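caret's predict method can also return class probabilities instead of hard labels (standard `type = "prob"` interface):

predict(gbmFit1, newdata = testx, type = "prob")[1:5, ]  # P(Active) and P(Inactive) per compound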
gbmFit2= train(trainx, trainy,method = "treebag",trControl = fitControl)
models = list(gbmFit1, gbmFit2)
predValues = extractPrediction(models,testX = testx, testY = testy)
head(predValues)
## obs pred model dataType object
## 1 Active Active gbm Training Object1
## 2 Inactive Inactive gbm Training Object1
## 3 Active Active gbm Training Object1
## 4 Active Active gbm Training Object1
## 5 Active Active gbm Training Object1
## 6 Active Active gbm Training Object1
testValues = subset(predValues, dataType == "Test")
probValues = extractProb(models,testX = testx, testY = testy)
testProbs = subset(probValues, dataType == "Test")
Pred1 = subset(testValues, model == "gbm")
Pred2 = subset(testValues, model == "treebag")
confusionMatrix(Pred1$pred, Pred1$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Active Inactive
## Active 66 13
## Inactive 8 44
##
## Accuracy : 0.8397
## 95% CI : (0.7655, 0.8979)
## No Information Rate : 0.5649
## P-Value [Acc > NIR] : 1.841e-11
##
## Kappa : 0.6706
##
## Mcnemar's Test P-Value : 0.3827
##
## Sensitivity : 0.8919
## Specificity : 0.7719
## Pos Pred Value : 0.8354
## Neg Pred Value : 0.8462
## Prevalence : 0.5649
## Detection Rate : 0.5038
## Detection Prevalence : 0.6031
## Balanced Accuracy : 0.8319
##
## 'Positive' Class : Active
confusionMatrix(Pred2$pred, Pred2$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Active Inactive
## Active 67 20
## Inactive 7 37
##
## Accuracy : 0.7939
## 95% CI : (0.7145, 0.8596)
## No Information Rate : 0.5649
## P-Value [Acc > NIR] : 3.134e-08
##
## Kappa : 0.5694
##
## Mcnemar's Test P-Value : 0.02092
##
## Sensitivity : 0.9054
## Specificity : 0.6491
## Pos Pred Value : 0.7701
## Neg Pred Value : 0.8409
## Prevalence : 0.5649
## Detection Rate : 0.5115
## Detection Prevalence : 0.6641
## Balanced Accuracy : 0.7773
##
## 'Positive' Class : Active
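A complementary, resampling-based comparison of the two models uses caret's resamples helper (a sketch; note that for a strictly paired comparison the two train calls should share resampling indices, e.g. via a common seed or an explicit `index` in trainControl):

resamps = resamples(list(gbm = gbmFit1, treebag = gbmFit2))
summary(resamps)  # accuracy and kappa distributions over the 30 resamples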
prob1 = subset(testProbs, model == "gbm")
prob2 = subset(testProbs, model == "treebag")
library(ROCR)
prob1$label = ifelse(prob1$obs == 'Active', yes = 1, no = 0) # numeric labels for ROCR
pred1 = prediction(prob1$Active, prob1$label)
perf1 = performance(pred1, measure = "tpr", x.measure = "fpr")
plot(perf1)
![ROC curve for the gbm model on the test set](https://img.haomeiwen.com/i24502658/1dcb9fd3f199c181.png)
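The same ROCR objects also yield the area under the curve (standard ROCR performance measure):

auc1 = performance(pred1, measure = "auc")
auc1@y.values[[1]]  # AUC for the gbm model on the test set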