The following are my notes from working through RFE (recursive feature elimination) in caret; the original post is at: http://xccds1977.blogspot.com/2011/09/caret.html
RFE_demo
NordicForrest
2/17/2021
library(caret)
data(mdrr)
head(mdrrDescr[1:4,1:4])
## MW AMW Sv Se
## METHOPROMAZINE 314.49 7.15 27.56 43.50
## ACEPROMAZINE 326.50 7.26 28.56 44.50
## TRIMEPRAZINE 298.49 6.94 27.05 42.17
## PROMETHAZINE 284.46 7.11 25.46 39.28
head(mdrrClass)
## [1] Active Active Inactive Active Active Active
## Levels: Active Inactive
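Before preprocessing, it is worth a quick look at the class balance (a one-line base-R check, not in the original post):

table(mdrrClass)  # counts of Active vs. Inactive compounds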
1. Data preprocessing
zerovar = nearZeroVar(mdrrDescr)             # drop near-zero-variance predictors
newdata1 = mdrrDescr[, -zerovar]
descrCorr = cor(newdata1)
highCorr = findCorrelation(descrCorr, 0.90)  # drop predictors with pairwise correlation > 0.90
newdata2 = newdata1[, -highCorr]
# comboInfo = findLinearCombos(newdata2)
# newdata2 = newdata2[, -comboInfo$remove]   # not applicable to this example; running it throws an error
Process = preProcess(newdata2)               # defaults to centering and scaling
newdata3 = predict(Process, newdata2)        # standardized predictors
inTrain = createDataPartition(mdrrClass, p = 3/4, list = FALSE)
trainx = newdata3[inTrain,]
testx = newdata3[-inTrain,]
trainy = mdrrClass[inTrain]
testy = mdrrClass[-inTrain]
featurePlot(trainx[, 1:2], trainy, plot = 'box') # inspect the first two predictors
![box plots of the first two predictors by class](https://img.haomeiwen.com/i24502658/d96f7b22ed456f99.png)
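As a quick sanity check (a minimal sketch not in the original post; it assumes the objects from the chunk above), the standardized predictors should now have mean ≈ 0 and standard deviation ≈ 1:

round(range(colMeans(newdata3)), 3)       # all column means should be ~0
round(range(apply(newdata3, 2, sd)), 3)   # all column sds should be ~1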
2. Feature selection
subsets = c(20,30,40,50,60,70,80)
ctrl= rfeControl(functions = rfFuncs, method = "cv",verbose = FALSE, returnResamp = "final")
Define the control parameters. `functions` determines which model is used to rank the predictors; here we use random forests, i.e. rfFuncs. Other built-in options are lmFuncs (linear regression), nbFuncs (naive Bayes), treebagFuncs (bagged trees), and caretFuncs (a custom model tuned through caret's own train interface). `method` sets the resampling scheme; here we use cv, i.e. cross-validation. Bootstrapping (boot) and leave-one-out cross-validation (LOOCV) are also available.
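For illustration, a control object that instead ranks predictors with bagged trees under 25 bootstrap resamples might look like this (a hedged sketch, not run here; `number` sets the resample count):

ctrl_boot = rfeControl(functions = treebagFuncs, method = "boot",
                       number = 25, verbose = FALSE)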
Profile = rfe(newdata3, mdrrClass, sizes = subsets, rfeControl = ctrl) # time-consuming!
print(Profile)
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 20 0.8258 0.6418 0.05387 0.11260
## 30 0.8333 0.6570 0.05095 0.10598
## 40 0.8295 0.6488 0.05046 0.10573
## 50 0.8352 0.6607 0.05781 0.12053
## 60 0.8295 0.6481 0.06109 0.12861
## 70 0.8333 0.6568 0.03554 0.07423
## 80 0.8353 0.6612 0.03183 0.06732
## 94 0.8372 0.6642 0.04454 0.09277 *
##
## The top 5 variables (out of 94):
## nC, CIC0, X5v, IVDM, Wap
plot(Profile)
![RFE cross-validated accuracy vs. subset size](https://img.haomeiwen.com/i24502658/b80b8d1b3a0ae444.png)
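The accuracy curve is nearly flat across subset sizes, so a much smaller subset may do almost as well. caret ships a helper for exactly this trade-off (a sketch, not in the original post; `tol` is the percent loss in accuracy tolerated relative to the best size):

# smallest subset size whose accuracy is within 1% of the best
pickSizeTolerance(Profile$results, metric = "Accuracy", tol = 1, maximize = TRUE)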
Profile$optVariables # the predictors retained in the final model
## [1] "nC" "CIC0" "X5v" "IVDM" "Wap" "IDDE"
## [7] "D.Dr06" "CIC1" "BIC0" "VEA2" "nAB" "QZZp"
## [13] "QXXm" "Xt" "VRA1" "IC3" "PCD" "G1"
## [19] "MPC09" "Yindex" "nBnz" "VEA1" "G.N..O." "H3D"
## [25] "DISPm" "BIC1" "Jhetm" "Jhetp" "SPAM" "Ms"
## [31] "DISPe" "DISPv" "TIE" "BLI" "TI2" "PCR"
## [37] "PW3" "X0Av" "T.O..O." "nR07" "Rww" "piPC10"
## [43] "Ram" "nCIR" "X3A" "IC2" "IC0" "X5Av"
## [49] "Mp" "X2A" "MAXDN" "Lop" "PW4" "G.N..N."
## [55] "nR10" "MSD" "PW5" "nO" "SEigp" "Me"
## [61] "SPI" "FDI" "J3D" "ASP" "X1A" "PJI3"
## [67] "SPH" "nCIC" "L.Bw" "BIC2" "BIC3" "MAXDP"
## [73] "RBN" "HNar" "BAC" "BIC5" "AMW" "G.N..F."
## [79] "IVDE" "nR06" "SEigm" "nX" "RBF" "PW2"
## [85] "nDB" "nF" "nS" "PJI2" "nR09" "G.N..Cl."
## [91] "nCL" "nR05" "G.N..S." "G.O..Cl."
3. Model building and parameter tuning
newdata4=newdata3[,Profile$optVariables]
inTrain = createDataPartition(mdrrClass, p = 3/4, list = FALSE) # note: a fresh random partition, independent of the one in section 1
trainx = newdata4[inTrain,]
testx = newdata4[-inTrain,]
trainy = mdrrClass[inTrain]
testy = mdrrClass[-inTrain]
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 3, returnResamp = "all")
gbmGrid = expand.grid(interaction.depth = c(1, 3),
                      n.trees = c(50, 100, 150, 200, 250, 300),
                      shrinkage = 0.1,
                      n.minobsinnode = 5) # the original post did not supply this last argument, so its figure cannot be reproduced exactly
Tip: a random seed should probably be set here so that the repeated cross-validation folds, and hence the tuning results, are reproducible.
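A minimal way to do that (the seed value 42 is arbitrary):

set.seed(42)  # fixes the fold assignments generated inside train()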
gbmFit1 = train(trainx,trainy,method = "gbm",trControl = fitControl,tuneGrid = gbmGrid,verbose = FALSE)
plot(gbmFit1)
![gbm tuning results: accuracy vs. number of trees and interaction depth](https://img.haomeiwen.com/i24502658/8b3bd3ea75c1499a.png)
4. Model prediction and evaluation
predict(gbmFit1, newdata = testx)[1:5]
## [1] Active Active Active Active Active
## Levels: Active Inactive
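caret's predict method can also return class probabilities instead of hard labels (standard `type = "prob"` interface):

predict(gbmFit1, newdata = testx, type = "prob")[1:5, ]  # P(Active) and P(Inactive) per compound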
gbmFit2= train(trainx, trainy,method = "treebag",trControl = fitControl)
models = list(gbmFit1, gbmFit2)
predValues = extractPrediction(models,testX = testx, testY = testy)
head(predValues)
## obs pred model dataType object
## 1 Active Active gbm Training Object1
## 2 Inactive Inactive gbm Training Object1
## 3 Active Active gbm Training Object1
## 4 Active Active gbm Training Object1
## 5 Active Active gbm Training Object1
## 6 Active Active gbm Training Object1
testValues = subset(predValues, dataType == "Test")
probValues = extractProb(models,testX = testx, testY = testy)
testProbs = subset(probValues, dataType == "Test")
Pred1 = subset(testValues, model == "gbm")
Pred2 = subset(testValues, model == "treebag")
confusionMatrix(Pred1$pred, Pred1$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Active Inactive
## Active 66 13
## Inactive 8 44
##
## Accuracy : 0.8397
## 95% CI : (0.7655, 0.8979)
## No Information Rate : 0.5649
## P-Value [Acc > NIR] : 1.841e-11
##
## Kappa : 0.6706
##
## Mcnemar's Test P-Value : 0.3827
##
## Sensitivity : 0.8919
## Specificity : 0.7719
## Pos Pred Value : 0.8354
## Neg Pred Value : 0.8462
## Prevalence : 0.5649
## Detection Rate : 0.5038
## Detection Prevalence : 0.6031
## Balanced Accuracy : 0.8319
##
## 'Positive' Class : Active
confusionMatrix(Pred2$pred, Pred2$obs)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Active Inactive
## Active 67 20
## Inactive 7 37
##
## Accuracy : 0.7939
## 95% CI : (0.7145, 0.8596)
## No Information Rate : 0.5649
## P-Value [Acc > NIR] : 3.134e-08
##
## Kappa : 0.5694
##
## Mcnemar's Test P-Value : 0.02092
##
## Sensitivity : 0.9054
## Specificity : 0.6491
## Pos Pred Value : 0.7701
## Neg Pred Value : 0.8409
## Prevalence : 0.5649
## Detection Rate : 0.5115
## Detection Prevalence : 0.6641
## Balanced Accuracy : 0.7773
##
## 'Positive' Class : Active
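A complementary, resampling-based comparison of the two models uses caret's resamples helper (a sketch; note that for a strictly paired comparison the two train calls should share resampling indices, e.g. via a common seed or an explicit `index` in trainControl):

resamps = resamples(list(gbm = gbmFit1, treebag = gbmFit2))
summary(resamps)  # accuracy and kappa distributions over the 30 resamples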
prob1 = subset(testProbs, model == "gbm")
prob2 = subset(testProbs, model == "treebag")
library(ROCR)
prob1$label = ifelse(prob1$obs == 'Active', yes = 1, no = 0) # numeric labels for ROCR
pred1 = prediction(prob1$Active, prob1$label)
perf1 = performance(pred1, measure = "tpr", x.measure = "fpr")
plot(perf1)
![ROC curve for the gbm model on the test set](https://img.haomeiwen.com/i24502658/1dcb9fd3f199c181.png)
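The same ROCR objects also yield the area under the curve (standard ROCR performance measure):

auc1 = performance(pred1, measure = "auc")
auc1@y.values[[1]]  # AUC for the gbm model on the test set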