GEO差异分析

作者: 萍智医信 | 来源:发表于2021-08-07 11:08 被阅读0次
    rt.png
    logFoldChange=1
    adjustP=0.05
    
    library(limma)
    setwd("C:\\Users\\lexb4\\Desktop\\geoBatch\\07.diff")
    rt=read.table("lncRNA.txt",sep="\t",header=T,check.names=F)
    #重复基因取均值,去重复
    rt=as.matrix(rt)
    rownames(rt)=rt[,1]
    exp=rt[,2:ncol(rt)]
    dimnames=list(rownames(exp),colnames(exp))
    rt=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
    rt=avereps(rt)
    rt=rt[rowMeans(rt)>0,]
    #数据集矫正,标准化
    rt=normalizeBetweenArrays(as.matrix(rt))
    boxplot(rt)
    #rt=log2(rt+1)
    #差异分析
    modType=c(rep("con",182),rep("treat",32))
    design <- model.matrix(~0+factor(modType))
    colnames(design) <- c("con","treat")
    exprSet=rt #exprSet 为表达矩阵
    rownames(design)=colnames(exprSet) #更改分组信息design行名为分组名,也就是样本名字
    #构造了design矩阵,得出每一个样本属于哪一个组,属于为1,不属于为0
    fit <- lmFit(rt,design)
    #自定义比较元素
    contrast.matrix<-makeContrasts(treat-con,levels=design)
    #自定义函数
    deg = function(exprSet,design,contrast.matrix){#定义一个叫deg的函数
      ##step1#logFC后是前的多少倍,进行差异化比较
      fit <- lmFit(exprSet,design)
      ##step2
      fit2 <- contrasts.fit(fit, contrast.matrix) 
      ##这一步很重要,大家可以自行看看效果
      
      fit2 <- eBayes(fit2)  ## default no trend !!!
      ##eBayes() with trend=TRUE
      ##step3
      tempOutput = topTable(fit2, coef=1, n=Inf)
      nrDEG = na.omit(tempOutput) 
      #write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
      head(nrDEG)
      return(nrDEG)
    }
    deg = deg(exprSet,design,contrast.matrix)
    head(deg)#查看前几行文件
    save(deg,file = 'deg.Rdata')#存储文件
    write.table(deg,file="limmaTab.xls",sep="\t",quote=F)
    
    #write table
    diffSig <- deg[with(deg, (abs(logFC)>logFoldChange & adj.P.Val < adjustP )), ]
    write.table(diffSig,file="diff.xls",sep="\t",quote=F,row.names=T)
    diffUp <- deg[with(deg, (logFC>logFoldChange & adj.P.Val < adjustP )), ]
    write.table(diffUp,file="up.xls",sep="\t",quote=F,row.names=T)
    diffDown <- deg[with(deg, (logFC<(-logFoldChange) & adj.P.Val < adjustP )), ]
    write.table(diffDown,file="down.xls",sep="\t",quote=F,row.names=T)
    
    #write expression level of diff gene
    hmExp=rt[as.vector(diffSig[,1]),]
    diffExp=rbind(id=colnames(hmExp),hmExp)
    write.table(diffExp,file="diffExp.txt",sep="\t",quote=F,col.names=F)
    
    #差异热图
    if(T){ 
      # 每次都要检测数据,rt为表达数据,取过log的
      rt[1:4,1:4]
      #分组
      group_list<-c(rep("noTcells",182),rep("Tcells",32))
      #查看分组
      table(group_list)
      x=deg$logFC #deg取logFC这列并将其重新赋值给x
      names(x)=rownames(deg) #deg取probe_id这列,并将其作为名字给x
      cg=c(names(head(sort(x),100)),#对x进行从小到大排列,取前100及后100,并取其对应的探针名,作为向量赋值给cg
           names(tail(sort(x),100)))
      library(pheatmap)
      pheatmap(rt[cg,],show_colnames =F,show_rownames = F) #对dat按照cg取行,所得到的矩阵来画热图
      n=t(scale(t(rt[cg,])))#通过“scale”对log-ratio数值进行归一化,现在的dat是行名为探针,列名为样本名,由于scale这个函数应用在不同组数据间存在差异时,需要行名为样本,因此需要用t(dat[cg,])来转换,最后再转换回来
      
      n[n>2]=2
      n[n< -2]= -2
      n[1:4,1:4]
      pheatmap(n,show_colnames =F,show_rownames = F)
      ac=data.frame(g=group_list)
      rownames(ac)=colnames(n) #将ac的行名也就分组信息(是‘no TNBC’还是‘TNBC’)给到n的列名,即热图中位于上方的分组信息
      pheatmap(n,show_colnames =F,
               show_rownames = F,
               cluster_cols = F, 
               annotation_col=ac,filename = 'heatmap_top.pdf') #列名注释信息为ac即分组信息
      
      
    }
    #deg.csv是所有基因的差异表达情况
    write.csv(deg,file = 'deg.csv')
    dev.new()
    
    #火山图
    if(T){
      nrDEG=deg
      head(nrDEG)
      attach(nrDEG)
      plot(logFC,-log10(P.Value))
      library(ggpubr)
      df=nrDEG
      df$v= -log10(P.Value) #df新增加一列'v',值为-log10(P.Value)
      ggscatter(df, x = "logFC", y = "v",size=0.5)
      
      df$g=ifelse(df$P.Value>0.05,'stable
                  ', #if 判断:如果这一基因的P.Value>0.05,则为stable基因
                  ifelse( df$logFC >1,'up', #接上句else 否则:接下来开始判断那些P.Value<0.01的基因,再if 判断:如果logFC >1,则为up(上调)基因
                          ifelse( df$logFC < -1,'down','stable') )#接上句else 否则:接下来开始判断那些logFC <1 的基因,再if 判断:如果logFC <1,则为down(下调)基因,否则为stable基因
      )
      table(df$g)
      df$name=rownames(df)
      head(df)
      ggscatter(df, x = "logFC", y = "v",size=0.5,color = 'g')
      ggscatter(df, x = "logFC", y = "v", color = "g",size = 0.5,
                label = "name", repel = T,
                #label.select = rownames(df)[df$g != 'stable'] ,
                #label.select = c('TTC9', 'AQP3', 'CXCL11','PTGS2'), #挑选一些基因在图中显示出来
                palette = c("#00AFBB", "#E7B800", "#FC4E07") )
      ggsave('volcano.png')
      
      ggscatter(df, x = "AveExpr", y = "logFC",size = 0.2)
      df$p_c = ifelse(df$P.Value<0.001,'p<0.001',
                      ifelse(df$P.Value<0.01,'0.001<p<0.01','p>0.01'))
      table(df$p_c )
      ggscatter(df,x = "AveExpr", y = "logFC", color = "p_c",size=0.2, 
                palette = c("green", "red", "black") )
      ggsave('MA.pdf')
    }
    

    相关文章

      网友评论

        本文标题:GEO差异分析

        本文链接:https://www.haomeiwen.com/subject/dvyivltx.html