随机森林分析第一波

作者: Dayueban | 来源:发表于2019-06-18 20:56 被阅读47次

随机森林分析第一波
生存分析至尊套餐
何为决策树和随机森林？
集成学习之Bagging和RF
（十四、）极限森林
待学清单
机器学习集成学习与树家族汇总
随机森林
随机森林
随机森林

随机森林分析在宏基因组数据分析中还是蛮常见的，所以这里整理一个常规的分析案例

1 | 数据准备

1.1 菌群数据

图一菌群数据-OTU

1.2 样品信息数据

图二样品信息数据

2 | 直接贴代码

#For windows system#
##ramdomforest.crossvalidation.r##
rm(list=ls())
# loading packages if necessary
library(randomForest)
library(ggplot2)
library(caret)
library(tidyverse)
# setting your working directory
setwd("E:\\F6_castrate_analysis\\total\\Duroc 16S\\fsm\\OTU\\稀疏后的OTU表")
set.seed(123)
## group male before and after castration

# loading Your datasets
dat1 <- read.table("fsm_otu_profile.csv",header=T,sep=',',row.names = 1)
conf <- read.table("fsm_metadata.csv",header=T,sep=',',row.names = 1)
#calculate the relative abundance of dat1
#dat1 <- sweep(dat1,2,colSums(dat1),'/')*100
dat1 <- log10(dat1 + 1)/log10(max(dat1))
cN.prof <- colnames(dat1)
#cN.prof <- sub("X","",cN.prof)
rN.conf <- rownames(conf)
gid <- intersect(cN.prof ,rN.conf) #求A和B的交集，根据样品名
dat1 <- dat1[,pmatch(gid,cN.prof)] # pmatch部分匹配
conf <- conf[pmatch(gid,rN.conf),]
dat2<-dat1[,conf$sex=="Gilts" | conf$sex=="Entire boars"]
conf2<-conf[conf$sex=="Gilts" | conf$sex=="Entire boars",]
conf2$sex=as.factor(as.character(conf2$sex))
outcome=conf2$sex
outcome<-sub("Gilts","0",outcome)
outcome<-sub("Entire boars","1",outcome)
outcome<-as.factor(outcome)
dat<-dat2
X <- as.data.frame(t(dat))
X$outcome = outcome

# 随机1-2有放回抽样293次，概率分别为0.67和0.33，用于获取训练集
ind = sample(2,nrow(X),replace=TRUE, prob=c(0.67,0.33))  
# 2/3作预测建模
ind.train <- X[ind == 1,]
ind.test <- X[ind == 2,]
# dev.rf = randomForest(outcome ~ ., data=X[ind==1,], ntree=500, 
#                       importance = TRUE, proximity=TRUE)  
# print(dev.rf)  
# 1/3验证预测模型
# dev.pred = predict(dev.rf, X[ind==2,] )
# print(dev.pred)
# 输出预测与观测对应表
# table(observed=X[ind==2,"outcome"], predicted=dev.pred) 
set.seed(123)

rf.train <- randomForest(outcome ~ ., data = ind.train,
                   importance = TRUE,proximity=TRUE,ntree = 1000)
print(rf.train)
#summary(rf)
#' 交叉验证选择features
# set.seed(123)
#' rfcv是随机森林交叉验证函数：Random Forest Cross Validation
# result <- rfcv(X[,-ncol(X)],X$outcome,cv.fold = 10)
# result$error.cv #' 查看错误率表，21时错误率最低，为最佳模型
#' 绘制验证结果
# with(result,plot(n.var,error.cv,log="x",type = "o",lwd=2))
# 使用replicate进行多次交叉验证，可选
result <- replicate(5, rfcv(ind.train[,-ncol(ind.train)],ind.train$outcome,cv.fold = 20), simplify=FALSE)
error.cv <- sapply(result, "[[", "error.cv")
error.cv <- cbind(rowMeans(error.cv),error.cv)
n.var = rownames(error.cv) %>% as.numeric()
error.cv = error.cv[,2:6]
colnames(error.cv) = paste('err',1:5,sep='.')
err.mean = apply(error.cv,1,mean)
allerr = data.frame(num=n.var,err.mean=err.mean,error.cv)
# number of features selected
optimal = 82


# 表格输出
write.table(allerr, file = "family_rfcv.txt", sep = "\t", quote = F, row.names = T, col.names = T)
# the pre-setted parameters used for ploting afterwards
main_theme = theme(panel.background=element_blank(),
                   panel.grid=element_blank(),
                   axis.line.x=element_line(size=.5, colour="black"),
                   axis.line.y=element_line(size=.5, colour="black"),
                   axis.ticks=element_line(color="black"),
                   axis.text=element_text(color="black", size=7),
                   legend.position="right",
                   legend.background=element_blank(),
                   legend.key=element_blank(),
                   legend.text= element_text(size=7),
                   text=element_text(family="sans", size=7))

p = ggplot() + 
  geom_line(aes(x = allerr$num, y = allerr$err.1), colour = 'grey') + 
  geom_line(aes(x = allerr$num, y = allerr$err.2), colour = 'grey') + 
  geom_line(aes(x = allerr$num, y = allerr$err.3), colour = 'grey') + 
  geom_line(aes(x = allerr$num, y = allerr$err.4), colour = 'grey') + 
  geom_line(aes(x = allerr$num, y = allerr$err.5), colour = 'grey') + 
  geom_line(aes(x = allerr$num, y = allerr$err.mean), colour = 'black') + 
  geom_vline(xintercept = optimal, colour='black', lwd=0.36, linetype="dashed") + 
  #  geom_hline(yintercept = min(allerr$err.mean), colour='black', lwd=0.36, linetype="dashed") + 
  coord_trans(x = "log2") +
  scale_x_continuous(breaks = c(1, 3, 5, 10, 20, 40, 60, 100, 200, 300)) + # , max(allerr$num)
  labs(title=paste('Training set (n = ', dim(dat)[2],')', sep = ''), 
       x='Number of OTUs ', y='Cross-validation error rate') + 
  annotate("text", x = optimal, y = max(allerr$err.mean), label=paste("optimal = ", optimal, sep="")) + 
  main_theme
ggsave(p, file = "duroc_otu_rfcv.pdf", width = 89, height = 50, unit = 'mm')
p


###--------------------------------------######
###----       将最佳的那82个OTUs         ######

imp <- as.data.frame(rf.train$importance)
imp <- imp[order(imp[,1],decreasing = T),]
#head(imp)
#' 输出重要性排序的表格
#write.table(imp,file = "importance_class_20180719.txt",quote = F,sep = '\t',row.names = T,col.names = T)
#' 简单可视化
varImpPlot(rf,main = "Top 82-feature OTU importance",n.var = 82,bg=par("bg"),
           color = par("fg"),gcolor=par("fg"),lcolor="gray")

#' 美化feature贡献度柱状图
#' 分析选择top5效果最好
imp = head(imp, n=82)
write.table(imp,file = "importance_class_selected.txt",quote = F,sep = '\t',row.names = T,col.names = T)
#' 反向排序X轴，让柱状图从上往下画
imp = imp[order(imp[,3],decreasing = F),]
imp$name <- rownames(imp)
imp$name <- factor(imp$name,levels = imp$name)
OTU_attribution <- read.table("OTU_phylum.csv",header = T,sep = ",",check.names = F)
rownames(OTU_attribution) <- OTU_attribution$OTUID
OTU_attribution <- OTU_attribution[rownames(imp),]
imp$phylum <- OTU_attribution$phylum
imp2 <- imp
rownames(imp2) <- paste(rownames(imp2),OTU_attribution$class,sep = " ")
imp2$name <- rownames(imp2)
imp2$name <- factor(imp2$name,levels = imp2$name)
#' load ggplot2 package
library(ggplot2)
p <- ggplot(imp2,aes(x=name,y=MeanDecreaseAccuracy,fill=phylum)) +
  geom_bar(stat = "identity") + coord_flip() 
p <- p+theme_bw()
p <- p+theme(legend.position = c(0.75,0.15))
p <- p + theme(plot.background=element_blank(),panel.background=element_blank())
p <- p + theme(axis.text.x=element_text(face="bold",color="black",size=14),
               axis.text.y=element_text(face = "bold",color="black",size = 14),
               axis.title.y=element_text(size = 16,face = "bold"),axis.title.x=element_text(size = 16))
p <- p + scale_fill_manual(values=c("red2", "blue4","yellow2"))
p + theme(legend.background = element_blank())
p <- p + theme(
  legend.key.height=unit(0.8,"cm"),
  legend.key.width=unit(0.8,"cm"),
  legend.text=element_text(lineheight=0.8,face="bold",size=16),
  legend.title=element_text(size=18,face = "bold"))
p <- p+labs(x="OTU rank")
ggsave(paste("rf_imp_feature" , ".tiff", sep=""), p, width = 10, height = 6)

###---------------------------------###
###---     热图展示其丰度      -----###
# ' 筛选82个feature 展示
sub_abu = X[,rownames(imp)]
#' transposition the data
sub_abu <- as.data.frame(t(sub_abu))
#' table(X$outcome)
#'  0   1 
#' 108 185
#sub_abu$class <- X$outcome
#sub_abu$class <- sub("0","Gilts",sub_abu$class)
#sub_abu$class <- sub("1","Entire_boars",sub_abu$class)
sub_abu_matrix <- as.matrix(sub_abu)
library(pheatmap)
annotation_col <- data.frame(Group=factor(rep(c("Gilts","Entire_boars"),c(108,185))))
rownames(annotation_col) <- colnames(sub_abu_matrix)
ann_colors <- list(Group=c(Gilts="#4DAF4A",Entire_boars="#984EA3"))
#plot.new()
tiff("OTU_rf_pheatmap.tiff",width = 6000,height = 1200,res = 300)
pheatmap(sub_abu_matrix,treeheight_col = 20,annotation_colors = ann_colors,cluster_cols = FALSE,
         cluster_rows = FALSE,show_colnames = FALSE,annotation_col=annotation_col,
         gaps_col = c(108,108),border_color = "NA",cellwidth = 4.3,cellheight = 20)
dev.off()

#' use the selected OTUs to fit the model

ind.test <- cbind(ind.test[,rownames(imp)], ind.test$outcome)
names(ind.test)[ncol(ind.test)] <- "class"
#sub_abu_tr <- as.data.frame(t(sub_abu))
#sub_abu_tr$outcome <- X$outcome
rf.test <- randomForest(class ~ ., data = ind.test,
                          importance = TRUE,proximity=TRUE,ntree = 1000)
rf.test.pre <- predict(rf.test,type="prob")
p.train<-rf.test.pre[,2]
########ROC in train######
library(pROC)
roc(ind.test$class,p.train)
###################
roc1 <- roc(ind.test$class,
            p.train,
            percent=TRUE,
            partial.auc.correct=TRUE,
            ci=TRUE, boot.n=100, ci.alpha=0.9, stratified=FALSE,
            plot=F, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE)
#####################
tiff("fsm_variable_pROC.tiff",width=2200,height=1800,res=300)
roc1 <- roc(ind.test$class, p.train,
            ci=TRUE, boot.n=100, ci.alpha=0.9, stratified=FALSE,font=2, font.lab=2,
            plot=TRUE, percent=roc1$percent,col="#F781BF",cex.lab=1.2, cex.axis=1.2,cex.main=1.5)
sens.ci <- ci.se(roc1, specificities=seq(0, 100, 5))
plot(sens.ci, type="shape", col="#FF69B4")
plot(sens.ci, type="bars")
plot(roc1,col="black",add=T)
legend("bottomright",cex=1.4,c(paste("AUC=",round(roc1$ci[2],2),"%"),
                               paste("95% CI:",round(roc1$ci[1],2),"%-",round(roc1$ci[3],2),"%")))
dev.off()