美文网首页生信学习笔记集生信绘图
#多个数据集差异基因整合#

#多个数据集差异基因整合#

作者: 辛晓红 | 来源:发表于2021-03-23 23:15 被阅读0次
    image.png

    加载包,设置差异倍数logFC和P值

    #=======================================================
    
    #=======================================================
    setwd('D:\\SCIwork\\F34TNBC\\s5deg')
    library(dplyr)
    library(tidyr)
    library(tibble)
    #清除环境变量
    rm(list=ls()) 
    padj=0.05
    logFC=0.5
    

    读取各个数据集的差异基因

    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE18864')
    a <- read.csv("allDiff.csv", header = T, row.names = 1)
    
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE58812')
    b <- read.csv("allDiff.csv", header = T, row.names = 1)
    
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76124')
    c <-read.csv("allDiff.csv", header = T, row.names = 1)
    
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76250')
    d <- read.csv("allDiff.csv", header = T, row.names = 1)
    
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE83937')
    e <- read.csv("allDiff.csv", header = T, row.names = 1)
    
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE95700')
    f <- read.csv("allDiff.csv", header = T, row.names = 1)
    
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE106977')
    g  <- read.csv("allDiff.csv", header = T, row.names = 1)
    
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\TCGA')
    h  <- read.csv("allDiff.csv", header = T, row.names = 1)
    setwd('D:\\SCIwork\\F34TNBC\\s5deg')
    

    读取各个数据集的差异基因

    image.png

    读取这些差异基因的数据框,其必须含有的列是logfc和p值两类

    image.png

    保存符合我们设置的差异基因标准基因

    # --------------------------------------------------------
    # 
    # --------------------------------------------------------
    
    a <- a %>% 
      rownames_to_column(var='gene')%>% 
      dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
      write.csv(a, file = "allDiff1.csv", row.names = F)
    
    b <- b %>% 
      rownames_to_column(var='gene')%>% 
      dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
      write.csv(b, file = "allDiff2.csv", row.names = F)
    
    c <- c %>% 
      rownames_to_column(var='gene')%>% 
      dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
      write.csv(c, file = "allDiff3.csv", row.names = F)
    
    
    d <- d%>% 
      rownames_to_column(var='gene')%>% 
      dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
      write.csv(d, file = "allDiff4.csv", row.names = F)
    
    
    e <- e%>% 
      rownames_to_column(var='gene')%>% 
      dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
      write.csv(e, file = "allDiff5.csv", row.names = F)
    
    
    
    f <- f%>%
      rownames_to_column(var='gene')%>% 
      dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
      write.csv(f, file = "allDiff6.csv", row.names = F)
    
    
    g <- g%>% 
      rownames_to_column(var='gene')%>% 
      dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
      write.csv(g, file = "allDiff7.csv", row.names = F)
    
    
    h <- h%>% 
      rownames_to_column(var='gene')%>% 
      dplyr::filter((logFC > 0.5 & P.Value < 0.05) | (logFC < -0.5 & P.Value < 0.05))
      write.csv(h, file = "allDiff8.csv", row.names = F)
    

    得到合并后的差异基因

    # --------------------------------------------------------
    # 
    # --------------------------------------------------------
    #读取差异基因分析结果,其必须包含两列:基因名和差异倍数#
    files=c("allDiff1.csv","allDiff2.csv",
            "allDiff3.csv","allDiff4.csv",
            "allDiff5.csv","allDiff6.csv",
            "allDiff7.csv", "allDiff8.csv")
    upList=list()
    downList=list()
    allFCList=list()
    
    for(i in 1:length(files)){
      inputFile=files[i]
      rt=read.csv(inputFile,header=T,sep = ',', row.names = 1) # 注意文件读取
      rt$Gene <- rownames(rt)
      rt <- rt %>%
        distinct(Gene, .keep_all = T)%>%
        arrange(logFC)%>%
        dplyr::select(Gene, logFC)
      header=unlist(strsplit(inputFile,"_"))
      downList[[header[1]]]=as.vector(rt[,1])
      upList[[header[1]]]=rev(as.vector(rt[,1]))
      fcCol=rt[,1:2]
      colnames(fcCol)=c("Gene",header[[1]])
      allFCList[[header[1]]]=fcCol}
    
    #合并文件
    mergeLe=function(x,y){
      merge(x,y,by="Gene",all=T)}
    newTab=Reduce(mergeLe,allFCList)
    rownames(newTab)=newTab[,1]
    newTab=newTab[,2:ncol(newTab)]
    newTab[is.na(newTab)]=0
    
    #计算共同上调基因
    library(RobustRankAggreg)
    upMatrix = rankMatrix(upList)
    upAR = aggregateRanks(rmat=upMatrix)
    colnames(upAR)=c("Name","Pvalue")
    upAdj=p.adjust(upAR$Pvalue,method="bonferroni")
    upXls=cbind(upAR,adjPvalue=upAdj)
    upFC=newTab[as.vector(upXls[,1]),]
    upXls=cbind(upXls,logFC=rowMeans(upFC))
    write.table(upXls,file="up.xls",sep="\t",quote=F,row.names=F)
    upSig=upXls[(upXls$Pvalue<padj & upXls$logFC>logFC),]
    write.table(upSig,file="upSig.xls",sep="\t",quote=F,row.names=F)
    
    #计算共同下调基因
    downMatrix = rankMatrix(downList)
    downAR = aggregateRanks(rmat=downMatrix)
    colnames(downAR)=c("Name","Pvalue")
    downAdj=p.adjust(downAR$Pvalue,method="bonferroni")
    downXls=cbind(downAR,adjPvalue=downAdj)
    downFC=newTab[as.vector(downXls[,1]),]
    downXls=cbind(downXls,logFC=rowMeans(downFC))
    write.table(downXls,file="down.xls",sep="\t",quote=F,row.names=F)
    downSig=downXls[(downXls$Pvalue<padj & downXls$logFC< -logFC),]
    write.table(downSig,file="downSig.xls",sep="\t",quote=F,row.names=F)
    
    #合并上调与下调基因
    allSig = rbind(upSig,downSig)
    colnames(allSig)
    write.csv(allSig,file = 'allSign.csv')
    

    绘图准备:仅仅保留那些在所有数据存在的基因,以及删除TCGA和GEO趋势不一致的基因

    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE18864')
    a <- read.csv("allDiff.csv", header = T, row.names = 1) %>% rownames_to_column(var='gene')
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE58812')
    b <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76124')
    c <-read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE76250')
    d <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE83937')
    e <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE95700')
    f <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\GSE106977')
    g  <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
    setwd('D:\\SCIwork\\F34TNBC\\s5deg\\TCGA')
    h  <- read.csv("allDiff.csv", header = T, row.names = 1)%>% rownames_to_column(var='gene')
    setwd('D:\\SCIwork\\F34TNBC\\s5deg')
    target <- allSig$Name
    length(target)
    a <- subset(a, select =c("gene",   "logFC" ))
    b <- subset(b, select =c("gene",   "logFC" ))
    c <- subset(c, select =c("gene",   "logFC" ))
    d <- subset(d, select =c("gene",   "logFC" ))
    e <- subset(e, select =c("gene",   "logFC" ))
    f <- subset(f, select =c("gene",   "logFC" ))
    g <- subset(g, select =c("gene",   "logFC" ))
    h <- subset(h, select =c("gene",   "logFC" ))
    gene <- intersect(a$gene, b$gene)
    gene <- intersect(c$gene, gene)
    gene <- intersect(d$gene, gene)
    gene <- intersect(e$gene, gene)
    gene <- intersect(f$gene, gene)
    gene <- intersect(g$gene, gene)
    gene <- intersect(h$gene, gene)
    gene <- intersect(allSig$Name, gene)
    dat <-  data.frame()
    for (i in 1:length(gene)) {   
      dat[i,1] <- a[which(a$gene %in% gene[i]),'logFC']
      dat[i,2] <- b[which(b$gene %in% gene[i]),'logFC']
      dat[i,3] <- c[which(c$gene %in% gene[i]),'logFC']
      dat[i,4] <- d[which(d$gene %in% gene[i]),'logFC']
      dat[i,5] <- e[which(e$gene %in% gene[i]),'logFC']
      dat[i,6] <- f[which(f$gene %in% gene[i]),'logFC']
      dat[i,7] <- g[which(g$gene %in% gene[i]),'logFC']
      dat[i,8] <- h[which(g$gene %in% gene[i]),'logFC']
    }
    colnames(dat) <- c("GSE18864",  "GSE58812",
                       "GSE76124",  "GSE76250",
                       "GSE83937",  "GSE95700",
                       "GSE106977",'TCGA'
    )
    rownames(dat) <- gene
    dat <- dat[which(rownames(dat) %in% rownames(allSig)),]
    hminput <- dat
    for (i in 1:dim(hminput)[1]) {  
      hminput$re[i] <- hminput[i,1]*hminput[i,2]*hminput[i,3]*hminput[i,4]*hminput[i,5]*hminput[i,6]*hminput[i,7]*hminput[i,8]
      hminput$re1[i] <- hminput[i,1]*hminput[i,2]*hminput[i,3]*hminput[i,4]*hminput[i,5]*hminput[i,6]*hminput[i,7]  
    }
    table(hminput$re )
    hminput <- subset(hminput, hminput$re != 0)
    hminput$re <- NULL
    hminput$re1 <- hminput$re1*hminput$TCGA
    hminput <- subset(hminput, hminput$re1 >0)
    hminput$re1 <- NULL
    for (x in 1:nrow(hminput )) {  
      for ( y in 1:ncol(hminput )) {    
        if(hminput[x,y] > 2){
          hminput[x,y] <- 2}
        if(hminput[x,y] < -2){
          hminput[x,y] <- -2}   
        
      }
      
    }
    bk = unique(c(seq(-2,2, length=100)))
    setwd('D:\\SCIwork\\F34TNBC\\s5deg')
    library(pheatmap)
    pdf(file="logFC.pdf",width = 4,height = 4)
    pheatmap(hminput,display_numbers = F,
             cluster_cols = T,
             cluster_row = T,
             border_color  = NA,
             breaks = bk,
             show_colnames     = T,
             show_rownames     = T,
             color = colorRampPalette(c("#20B6E2",
                                        "#020303",
                                        "#F5EB17"))(100), 
             fontsize_row=1,
             fontsize_col=5 )
    dev.off()
    
    

    代码数据私信我

    相关文章

      网友评论

        本文标题:#多个数据集差异基因整合#

        本文链接:https://www.haomeiwen.com/subject/pfgqhltx.html