单个模型对对象的评估可能存在偏差,因此我们往往综合多个模型的预测结果来得到最终的评估。由多个模型集成而成的模型叫做集成评估器,组成集成评估器的每个模型都叫做基评估器。通常来说,有三类集成算法:袋装法(bagging)、提升法(boosting)和堆叠法(stacking)。
bagging:以随机森林为代表,各基模型相互独立,互不影响;决策树越多,分类结果通常越稳定,但提升会逐渐趋于饱和。
boosting:以AdaBoost为代表,基模型之间相互影响:先训练出一个弱分类器,然后一步步迭代,使分类器的性能逐渐得到提升。
R random Forest
library(randomForest)
library(pROC)
library(caret)
library(rpart)
# Load the methylation table and the sample annotation table.
load("CUP2_methyl.rda")
load("CUP2_anno.rda")

# Drop the first three columns and transpose, so that the former column
# names (used below as sample identifiers) become the row names.
CUP2_methyl_1 <- as.data.frame(t(CUP2_methyl[-(1:3)]))

# Derive the sample identifier by stripping the "methy_" prefix from the
# row names, then join the annotation onto the methylation values.
CUP2_methyl_1[["Sample"]] <- gsub("methy_", "", rownames(CUP2_methyl_1))
CUP2_methyl_anno <- merge(x = CUP2_methyl_1, y = CUP2_anno, by = "Sample")

# Use the sample identifier as row names instead of a regular column.
rownames(CUP2_methyl_anno) <- CUP2_methyl_anno[["Sample"]]
CUP2_methyl_anno[["Sample"]] <- NULL

# Peek at columns 6140-6148 of the merged table and at the annotation table.
head(CUP2_methyl_anno[6140:6148])
head(CUP2_anno)

# Remove the liver cancer (LIHC) samples and fix the remaining class levels.
CUP2_methyl_anno <- CUP2_methyl_anno[
  which(CUP2_methyl_anno[["Group"]] != "LIHC"),
]
CUP2_methyl_anno[["Group"]] <- factor(
  CUP2_methyl_anno[["Group"]],
  levels = c("CRC", "LC", "Normal", "STAD")
)
dim(CUP2_methyl_anno)
# Create a stratified split on Group: 3/4 of the samples for training,
# the remaining 1/4 for testing. The seed makes the partition reproducible.
set.seed(101)
sets <- createDataPartition(CUP2_methyl_anno$Group, p = 0.75, list = FALSE)

# Columns 6145:6147 are removed by position from both subsets.
# NOTE(review): presumably these are annotation columns other than Group;
# confirm the positions if the input tables change.
training <- CUP2_methyl_anno[sets, -(6145:6147)]
dim(training)

testing <- CUP2_methyl_anno[-sets, -(6145:6147)]
dim(testing)
# Fit a random forest classifier for Group on all other columns of `training`.
# The response is given as the bare column name so that `.` expands to every
# column except Group. With `as.factor(training$Group) ~ .` the left-hand side
# was an expression rather than a column name, so Group itself remained among
# the predictors and leaked the label into the model.
# Group is already a factor, so no conversion is needed for classification.
cup2_train <- randomForest(
  Group ~ .,
  data = training,
  importance = TRUE,
  na.action = na.pass
)
cup2_train
plot(cup2_train, main = "random Forest origin")

# Predicted class labels for the held-out samples, compared with the truth.
cup2_test <- predict(cup2_train, newdata = testing, type = "class")
head(cup2_test)
head(testing$Group)
cup2.cf <- caret::confusionMatrix(
  as.factor(cup2_test),
  as.factor(testing$Group)
)
cup2.cf

# Per-class probabilities for the held-out samples; the full matrix is
# passed to the multi-class ROC analysis.
cup2_test2 <- predict(cup2_train, newdata = testing, type = "prob")
cup2_test2
roc.rf <- multiclass.roc(testing$Group, cup2_test2)
# Fit a single decision tree on the same training data as a baseline and
# get its per-class probabilities for the held-out samples.
fit1 <- rpart(Group ~ ., data = training)
pre1 <- predict(fit1, newdata = testing, type = "prob")

# Multi-class ROC analysis on the full probability matrix (one column per
# class). Previously only the first column was passed, so the probability
# of a single class was used to separate every pair of classes.
roc <- multiclass.roc(testing$Group, pre1)

# Draw one one-vs-rest ROC curve per class, so that each colour corresponds
# to a named class instead of an arbitrary subset of pairwise comparisons.
class_levels <- levels(testing$Group)
class_cols <- c("red", "blue", "green", "orange")[seq_along(class_levels)]
for (i in seq_along(class_levels)) {
  cls <- class_levels[i]
  cls_roc <- pROC::roc(
    response = as.integer(testing$Group == cls),
    predictor = pre1[, cls],
    levels = c(0, 1),   # 0 = other classes (controls), 1 = this class (cases)
    direction = "<"     # higher probability indicates the class of interest
  )
  # The first curve opens the plot; later curves are added on top of it.
  plot(cls_roc, col = class_cols[i], add = i > 1)
}
legend("bottomright", legend = class_levels, col = class_cols, lwd = 2)
download.png
网友评论