美文网首页生信文章software转录组数据分析
差异分析+火山图+COX模型构建

差异分析+火山图+COX模型构建

作者: 医学小白学生信 | 来源:发表于2021-03-30 11:22 被阅读0次

    简书:没有生物学重复的转录组数据怎么进行差异分析?

    edgeR需要的数据是reads数,可以设置BCV值,做单样本的差异分析。
    edgeR包可以做无重复的差异分析,不过需要认为指定一个dispersion值(设置BCV值),这样得到的结果比较主观,不同的人就可以有不同的结果。通常如果是实验控制的好的人类数据,那么选择BCV=0.4,比较好的模式生物选择BCV=0.1。

    6 LUAD验证

    6.2 差异基因分析和功能分析

    差异基因分析

    library(limma)
    rt=read.table("OV.txt",header=T,sep="\t",check.names=F)
    #如果一个基因占了多行,取均值
    rt=as.matrix(rt)
    rownames(rt)=rt[,1]
    exp=rt[,2:ncol(rt)]
    dimnames=list(rownames(exp),colnames(exp))
    data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
    data=avereps(data)
    data<-as.data.frame(data)
    nrow(data)
    data=data[rowMeans(data)>0.5,]
    
    group_list=c(rep("high",177),rep("low",176))  
    table(group_list)
    
    
    
    library(edgeR)
    
    dge <- DGEList(counts=data,group=group_list)
    dge$samples$lib.size <- colSums(dge$counts)
    dge <- calcNormFactors(dge) 
    
    design <- model.matrix(~0+group_list)
    rownames(design)<-colnames(dge)
    colnames(design)<-levels(group_list)
    
    dge <- estimateGLMCommonDisp(dge,design)
    dge <- estimateGLMTrendedDisp(dge, design)
    dge <- estimateGLMTagwiseDisp(dge, design)
    
    fit <- glmFit(dge, design)
    fit2 <- glmLRT(fit, contrast=c(-1,1)) 
    
    DEG=topTags(fit2, n=nrow(data))
    DEG=as.data.frame(DEG)
    #logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
    logFC_cutoff <- 2
    DEG$change = as.factor(
      ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
             ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT'))
    head(DEG)
    table(DEG$change)
    edgeR_DEG <- DEG
    
    #提取差异基因表达矩阵
    DEG<-DEG[DEG$change!="NOT",]
    library(data.table)
    #fintersect求交集
    a<-data.table(rownames(DEG))
    diff<-data[match(a$V1,row.names(data)),]
    write.csv(diff,file = "diff.csv",quote=F,row.names = T)
    
    source("3-plotfunction.R")
    logFC_cutoff <- 2
    dat = log(data+1)
    pca.plot = draw_pca(dat,group_list)
    
    cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT"]
    h2 = draw_heatmap(data[cg2,],group_list)
    v2 = draw_volcano(test = edgeR_DEG[,c(1,4,6)],pkg = 2)
    v2
    library(patchwork)
    (h2) / (v2) +plot_layout(guides = 'collect')
    ggsave("heat_volcano.png",width = 7,height = 5)
    
    火山图

    功能分析

    library("org.Hs.eg.db")
    rt=read.csv("diff.csv",sep=",",check.names=F,header=T)
    rt<-rt[,1]
    rt<-as.data.frame(rt)
    
    genes=as.vector(rt[,1])
    entrezIDs <- mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
    entrezIDs <- as.character(entrezIDs)
    out=cbind(rt,entrezID=entrezIDs)
    write.table(out,file="id.txt",sep="\t",quote=F,row.names=F)
    
    
    ###############GO分析
    library("clusterProfiler")
    library("org.Hs.eg.db")
    library("enrichplot")
    library("ggplot2")
    
    rt=read.table("id.txt",sep="\t",header=T,check.names=F)
    rt=rt[is.na(rt[,"entrezID"])==F,]
    
    cor=rt$cor
    gene=rt$entrezID
    names(cor)=gene
    
    #GO富集分析
    kk <- enrichGO(gene = gene,
                   OrgDb = org.Hs.eg.db, 
                   pvalueCutoff =0.05, 
                   qvalueCutoff = 0.05,
                   ont="all",
                   readable =T)
    write.table(kk,file="GO.txt",sep="\t",quote=F,row.names = F)
    
    #柱状图
    tiff(file="GO.tiff",width = 26,height = 20,units ="cm",compression="lzw",bg="white",res=600)
    barplot(kk, drop = TRUE, showCategory =10,split="ONTOLOGY") + facet_grid(ONTOLOGY~., scale='free')
    dev.off()
    
    
    #####################KEGG分析
    library("clusterProfiler")
    library("org.Hs.eg.db")
    library("enrichplot")
    library("ggplot2")
    
    setwd("C:\\Users\\lexb4\\Desktop\\CCLE\\09.KEGG")
    rt=read.table("id.txt",sep="\t",header=T,check.names=F)
    rt=rt[is.na(rt[,"entrezID"])==F,]
    
    cor=rt$cor
    gene=rt$entrezID
    names(cor)=gene
    
    #kegg富集分析
    kk <- enrichKEGG(gene = gene, organism = "hsa", pvalueCutoff =0.05, qvalueCutoff =0.05)
    write.table(kk,file="KEGG.txt",sep="\t",quote=F,row.names = F)
    
    #柱状图
    tiff(file="barplot.tiff",width = 20,height = 12,units ="cm",compression="lzw",bg="white",res=600)
    barplot(kk, drop = TRUE, showCategory = 20)
    dev.off()
    

    6.3 批量KM曲线

    输入数据KM.TXT,数据是FPKM.


    KM.txt
    library(survival)
    rt=read.table("KM.txt",header=T,sep="\t",check.names=F)
    outTab=data.frame()
    
    for(gene in colnames(rt[,4:ncol(rt)])){
      a=rt[,gene]<=median(rt[,gene])
      diff=survdiff(Surv(futime, fustat) ~a,data = rt)
      pValue=1-pchisq(diff$chisq,df=1)
      outTab=rbind(outTab,cbind(gene=gene,pvalue=pValue))
      
      fit <- survfit(Surv(futime, fustat) ~ a, data = rt)
      summary(fit)
      #只将p<0.05的基因画出来。
      if(pValue<0.05){
        if(pValue<0.001){
          pValue="<0.001"
        }else{
          pValue=round(pValue,3)
          pValue=paste0("=",pValue)
        }
        pdf(file=paste(gene,".survival.pdf",sep=""),
            width=6,
            height=6)
        plot(fit, 
             lwd=2,
             col=c("red","blue"),
             xlab="Time (year)",
             mark.time=T, #显示删失数据
             ylab="Survival rate",
             main=paste(gene,"(p", pValue ,")",sep=""))
        legend("topright", 
               c("High","Low"), 
               lwd=2, 
               col=c("red","blue"))
        dev.off()
      }
    }
    write.table(outTab,file="survival.xls",sep="\t",row.names=F,quote=F)
    

    6.4 差异基因进行COX模型构建

    用FPKM数据做COX模型的构建,用OS相关的基因做分析。
    输入数据exptime.txt

    exptime.txt
    单因素cox
    pFilter=0.01                            #定义单因素显著性
    library(survival)                                                  #引用包
    rt=read.table("expTime.txt",header=T,sep="\t",check.names=F,row.names=1)       #读取输入文件
    
    sigGenes=c("futime","fustat")
    outTab=data.frame()
    for(i in colnames(rt[,3:ncol(rt)])){
     cox <- coxph(Surv(futime, fustat) ~ rt[,i], data = rt)
     coxSummary = summary(cox)
     coxP=coxSummary$coefficients[,"Pr(>|z|)"]
     outTab=rbind(outTab,
                  cbind(id=i,
                  HR=coxSummary$conf.int[,"exp(coef)"],
                  HR.95L=coxSummary$conf.int[,"lower .95"],
                  HR.95H=coxSummary$conf.int[,"upper .95"],
                  pvalue=coxSummary$coefficients[,"Pr(>|z|)"])
                  )
      if(coxP<pFilter){
          sigGenes=c(sigGenes,i)
      }
    }
    write.table(outTab,file="uniCox.xls",sep="\t",row.names=F,quote=F)
    uniSigExp=rt[,sigGenes]
    uniSigExp=cbind(id=row.names(uniSigExp),uniSigExp)
    write.table(uniSigExp,file="uniSigExp.txt",sep="\t",row.names=F,quote=F)
    

    LASSO回归

    library("glmnet")
    library("survival")
    rt=read.table("uniSigExp.txt",header=T,sep="\t",row.names=1,check.names=F)     #读取文件
    rt$futime[rt$futime<=0]=1
    
    x=as.matrix(rt[,c(3:ncol(rt))])
    y=data.matrix(Surv(rt$futime,rt$fustat))
    
    fit <- glmnet(x, y, family = "cox", maxit = 1000)
    pdf("lambda.pdf")
    plot(fit, xvar = "lambda", label = TRUE)
    dev.off()
    
    cvfit <- cv.glmnet(x, y, family="cox", maxit = 1000)
    pdf("cvfit.pdf")
    plot(cvfit)
    abline(v=log(c(cvfit$lambda.min,cvfit$lambda.1se)),lty="dashed")
    dev.off()
    
    coef <- coef(fit, s = cvfit$lambda.min)
    index <- which(coef != 0)
    actCoef <- coef[index]
    lassoGene=row.names(coef)[index]
    lassoGene=c("futime","fustat",lassoGene)
    lassoSigExp=rt[,lassoGene]
    lassoSigExp=cbind(id=row.names(lassoSigExp),lassoSigExp)
    write.table(lassoSigExp,file="lassoSigExp.txt",sep="\t",row.names=F,quote=F)
    

    多因素COX模型的构建

    library(survival)
    library(survminer)
    
    rt=read.table("lassoSigExp.txt",header=T,sep="\t",check.names=F,row.names=1)  #读取train输入文件
    rt[,"futime"]=rt[,"futime"]/365
    
    #使用train组构建COX模型
    multiCox=coxph(Surv(futime, fustat) ~ ., data = rt)
    multiCox=step(multiCox,direction = "both")
    multiCoxSum=summary(multiCox)
    
    #输出模型相关信息
    outTab=data.frame()
    outTab=cbind(
                 coef=multiCoxSum$coefficients[,"coef"],
                 HR=multiCoxSum$conf.int[,"exp(coef)"],
                 HR.95L=multiCoxSum$conf.int[,"lower .95"],
                 HR.95H=multiCoxSum$conf.int[,"upper .95"],
                 pvalue=multiCoxSum$coefficients[,"Pr(>|z|)"])
    outTab=cbind(id=row.names(outTab),outTab)
    write.table(outTab,file="multiCox.xls",sep="\t",row.names=F,quote=F)
    
    #绘制森林图
    pdf(file="forest.pdf",
           width = 8,             #图片的宽度
           height = 5,            #图片的高度
           )
    ggforest(multiCox,
             main = "Hazard ratio",
             cpositions = c(0.02,0.22, 0.4), 
             fontsize = 0.7, 
             refLabel = "reference", 
             noDigits = 2)
    dev.off()
    
    #输出train组风险文件
    riskScore=predict(multiCox,type="risk",newdata=rt)           #利用train得到模型预测train样品风险
    coxGene=rownames(multiCoxSum$coefficients)
    coxGene=gsub("`","",coxGene)
    outCol=c("futime","fustat",coxGene)
    medianTrainRisk=median(riskScore)
    risk=as.vector(ifelse(riskScore>medianTrainRisk,"high","low"))
    write.table(cbind(id=rownames(cbind(rt[,outCol],riskScore,risk)),cbind(rt[,outCol],riskScore,risk)),
        file="riskTrain.txt",
        sep="\t",
        quote=F,
        row.names=F)
    ####################下面是验证集的风险文件输出#################
    #输出test组风险文件
    rtTest=read.table("test.txt",header=T,sep="\t",check.names=F,row.names=1)          #读取test输入文件
    rtTest[,"futime"]=rtTest[,"futime"]/365
    riskScoreTest=predict(multiCox,type="risk",newdata=rtTest)      #利用train得到模型预测test样品风险
    riskTest=as.vector(ifelse(riskScoreTest>medianTrainRisk,"high","low"))
    write.table(cbind(id=rownames(cbind(rtTest[,outCol],riskScoreTest,riskTest)),cbind(rtTest[,outCol],riskScore=riskScoreTest,risk=riskTest)),
        file="riskTest.txt",
        sep="\t",
        quote=F,
        row.names=F)
    

    6.5 风险图形绘制

    风险生存曲线

    library(survival)
    
    #绘制train组生存曲线
    rt=read.table("riskTrain.txt",header=T,sep="\t")
    diff=survdiff(Surv(futime, fustat) ~risk,data = rt)
    pValue=1-pchisq(diff$chisq,df=1)
    pValue=signif(pValue,4)
    pValue=format(pValue, scientific = TRUE)
    fit <- survfit(Surv(futime, fustat) ~ risk, data = rt)
    summary(fit)    #查看五年生存率
    pdf(file="survivalTrain.pdf",width=5.5,height=5)
    plot(fit, 
         lwd=2,
         col=c("red","blue"),
         xlab="Time (year)",
         ylab="Survival rate",
         main=paste("Survival curve (p=", pValue ,")",sep=""),
         mark.time=T)
    legend("topright", 
           c("high risk", "low risk"),
           lwd=2,
           col=c("red","blue"))
    dev.off()
    
    #绘制test组生存曲线
    rt=read.table("riskTest.txt",header=T,sep="\t")
    diff=survdiff(Surv(futime, fustat) ~risk,data = rt)
    pValue=1-pchisq(diff$chisq,df=1)
    pValue=signif(pValue,4)
    pValue=format(pValue, scientific = TRUE)
    fit <- survfit(Surv(futime, fustat) ~ risk, data = rt)
    summary(fit)    #查看五年生存率
    pdf(file="survivalTest.pdf",width=5.5,height=5)
    plot(fit, 
         lwd=2,
         col=c("red","blue"),
         xlab="Time (year)",
         ylab="Survival rate",
         main=paste("Survival curve (p=", pValue ,")",sep=""),
         mark.time=T)
    legend("topright", 
           c("high risk", "low risk"),
           lwd=2,
           col=c("red","blue"))
    dev.off()
    
    KM曲线

    风险ROC曲线

    library(survivalROC)
    
    rt=read.table("riskTrain.txt",header=T,sep="\t",check.names=F,row.names=1)
    pdf(file="rocTrain.pdf",width=6,height=6)
    par(oma=c(0.5,1,0,1),font.lab=1.5,font.axis=1.5)
    roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore, 
          predict.time =1, method="KM")
    plot(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col='red', 
      xlab="False positive rate", ylab="True positive rate",
      main=paste("ROC curve (", "AUC = ",round(roc$AUC,3),")"),
      lwd = 2, cex.main=1.3, cex.lab=1.2, cex.axis=1.2, font=1.2)
    abline(0,1)
    dev.off()
    
    rt=read.table("riskTest.txt",header=T,sep="\t",check.names=F,row.names=1)
    pdf(file="rocTest.pdf",width=6,height=6)
    par(oma=c(0.5,1,0,1),font.lab=1.5,font.axis=1.5)
    roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore, 
          predict.time =1, method="KM")
    plot(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col='red', 
      xlab="False positive rate", ylab="True positive rate",
      main=paste("ROC curve (", "AUC = ",round(roc$AUC,3),")"),
      lwd = 2, cex.main=1.3, cex.lab=1.2, cex.axis=1.2, font=1.2)
    abline(0,1)
    dev.off()
    
    ROC曲线

    相关文章

      网友评论

        本文标题:差异分析+火山图+COX模型构建

        本文链接:https://www.haomeiwen.com/subject/ddolrktx.html