美文网首页ggplot集锦
刚上研一,复现blood(二)

刚上研一,复现blood(二)

作者: 生命数据科学 | 来源:发表于2022-12-07 23:27 被阅读0次

    Article name: A comprehensive transcriptome signature of murine hematopoietic stem cell aging

    Journal: blood

    Doi: 10.1182/blood.2020009729

    IF: 23.629

    Position: suppleFigure 1D/E

    图1 图2

    可以先猜一下哪张是R语言做出来的图,哪张是blood里的原图~


    腮边打网的话

    当想要学习作图的时候,到底是先买书还是在网上边学边做,个人看法:

    1. 可以考虑先买书,一本书也就几百页,经过系统、深入的学习,再通过大量的R语言程序检验之后,学练结合,理论与实践的完美交互,少则1小时,多则1年,ggplot大法必然修炼的炉火纯青,再然后......

      图片 图片
    2. 当然也可以考虑边学边练,看一点点资料,一天搞定

    图3

    还有上一篇复现中,高中同学问我,现在都有软件了origin或者graphpad,为啥还要用代码,嫌头发太多吗。。。

    图4

    这确实也是我一开始的问题,但是在做了今天这次复现后,这个疑虑被打消了,为什么呢?(浅谈这个热图数据分析思路)

    1. 我们目前有一个5000行*12列,其中有随机分布的NA数值

    2. 需要遍历12列中每两列之间交集情况,并且以12*12矩阵的格式保存

    3. 将这12*12矩阵去掉右上角一半部分

    4. 然后作图

    5. 软件其实只能由12*12的矩阵来绘制图形,而对于最难的前两步处理只能说爱莫能助

    综上,作图软件只能作图,而靠代码的话,一是可以自己分析,自由空间大,二是效率超指数型上升,第一次作图需要1天,第二次作图,只需要1分钟,ggplot2,香~


    具体代码

    代码其实延续了之前的代码,数据的话参考[刚上研一,复现blood],后台回复相应关键词即可获取。顺带着还做了一张图(图4,只做了re-analysis的),拟合曲线的效果比blood还好

    图4 图5
    
    library(ggplot2)
    library(dplyr)
    library(tidyr)
    library(reshape)
    library(corrplot)
    library(ggpubr)
    
    # rm(list = ls())
    # setwd("./file")
    # Bersenev <- read.table("Bersenev_GSE39553.csv",sep = "\t",
    #                                               header = T,skip = 1,quote = "")%>%
    #   .[,c("Gene.symbol","logFC")]
    # colnames(Bersenev)<-c("genes","Bersenev")
    # 
    # Chambers <- read.table("Chambers_GSE6503.csv",sep = "\t",
    #                                               header = T,skip = 1,quote = "")%>%.[,c("Gene.symbol","logFC")]
    # colnames(Chambers)<-c("genes","Chambers")
    # 
    # Flach <- read.table("Flach_GSE48893.csv",sep = "\t",
    #                                         header = T,skip = 1,quote = "")%>%.[,c("Gene.symbol","logFC")]
    # colnames(Flach)<-c("genes","Flach")
    # 
    # Grover <- read.table("Grover_GSE70657.csv",sep = "\t",
    #                                           header = T,skip = 1,quote = "")%>%.[,c("Gene","avg_logFC")]
    # colnames(Grover)<-c("genes","Grover")
    # 
    # Kirschner <- read.table("Kirschner_GSE87631.csv",sep = "\t",
    #                                                 header = T,skip = 1,quote = "")%>%.[,c("Gene","avg_logFC")]
    # colnames(Kirschner)<-c("genes","Kirschner")
    # 
    # Kowalczyk <- read.table("Kowalczyk_GSE59114.csv",sep = "\t",
    #                                                 header = T,skip = 1,quote = "")%>%.[,c("Gene","avg_logFC")]
    # colnames(Kowalczyk)<-c("genes","Kowalczyk")
    # 
    # Lazare <- read.table("Lazare_GSE128050.csv",sep = "\t",
    #                                           header = T,skip = 1,quote = "")%>%.[,c("external_gene_name","logFC")]
    # colnames(Lazare)<-c("genes","Lazare")
    # 
    # Mann <- read.table("Mann_GSE1004426.csv",sep = "\t",
    #                                       header = T,skip = 1,quote = "")%>%.[,c("Gene","avg_logFC")]
    # colnames(Mann)<-c("genes","Mann")
    # 
    # Maryanovich <- read.table("Maryanovich_GSE109546.csv",sep = "\t",
    #                                                     header = T,skip = 1,quote = "")%>%.[,c("external_gene_name","logFC")]
    # colnames(Maryanovich)<-c("genes","Maryanovich")
    # 
    # Norddahl <- read.table("Norddahl_GSE27686.csv",sep = "\t",
    #                                               header = T,skip = 1,quote = "")%>%.[,c("Gene.symbol","logFC")]
    # colnames(Norddahl)<-c("genes","Norddahl")
    # 
    # Sun <- read.table("Sun_GSE47817.csv",sep = "\t",
    #                                     header = T,skip = 1,quote = "")%>%.[,c("external_gene_name","logFC")]
    # colnames(Sun)<-c("genes","Sun")
    # 
    # Wahlestedt <- read.table("Wahlestedt_GSE44923.csv",sep = "\t",
    #                                                   header = T,skip = 1,quote = "")%>%.[,c("Gene.symbol","logFC")]
    # colnames(Wahlestedt)<-c("genes","Wahlestedt")
    # 
    # all_data <- list(Bersenev,Chambers,Flach,Grover,Kirschner,
    #                                   Kowalczyk,Lazare, Mann,Maryanovich,Norddahl,
    #                                   Sun,Wahlestedt)
    # 
    # all_pub <- purrr::reduce(.x = all_data,.f = full_join,by="genes")
    # blood_output<-separate(all_pub,genes,sep = "///",remove = T,into = "genes")
    # blood_output <- blood_output %>% group_by(genes) %>% filter (!duplicated(genes))
    # blood_output<-blood_output[(blood_output[,1]!=""&!is.na(blood_output[,1])),]
    # # blood_output<-separate_rows(geneMatrix,genes,sep = "///")
    # write.table(blood_output,"output_remove.txt",sep = "\t",quote = F,row.names = F,col.names = T)
    blood_output <- read.table("output.txt",sep = "\t",header = T)
    data <- blood_output[,2:13]
    
    plot_matrix <- matrix(nrow = ncol(data),ncol = 3,dimnames = list(NULL,c("name","Upregulated","Downregulated")))
    for (x in 1:ncol(data)) {
      data_name <- colnames(data)[x]
      non_na_data <- na.omit(data[,x])
      Upregulated <- length(non_na_data[non_na_data>0])
      Downregulated <- length(non_na_data[non_na_data<0])
      plot_matrix[x,] <- c(data_name,Upregulated,Downregulated)
    }
    plot_matrix <- as.data.frame(plot_matrix)
    plot_matrix <- melt(plot_matrix,id.vars = c("name"))
    
    plot_matrix$value <- as.numeric(plot_matrix$value) 
    
    pl <- ggplot(data=plot_matrix, aes(x=reorder(name,-value), y=value)) +
      geom_bar(stat = "identity",aes(fill=variable))+
      scale_fill_manual(values=c("#005187","#e5082c"))+
      scale_y_continuous(breaks=c(1000,2000,3000),
                                            labels=c("1000", "2000", "3000"))+
      theme_bw()+
      theme(panel.grid=element_blank())+
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))+
      labs(x="",y="# of reported DE genes",title = "Reanalysis")+
      theme(text = element_text(family = "Arial",face = "bold"))
    pl
    ggsave(pl, filename = "blood_figure_1c.pdf", device = cairo_pdf, 
                  width = 8, height = 7, units = "in")
    
    # supple Fig1D
    multidata <- read.table("output.txt",sep = "\t",header = T,check.names = F)
    length_compute <- function(x){
      length(x[!is.na(x)])-1
    }
    
    genematrix_count <- apply(multidata,1,length_compute)
    multidata <- cbind(multidata,count=genematrix_count)
    
    point_plot_data <- as.data.frame(table(multidata$count))
    point_plot_data[,1] <- as.numeric(point_plot_data[,1])
    supple_Fig1D <- ggplot(data = point_plot_data, aes(x=Var1,y=Freq)) +
      geom_smooth(mapping = aes(x=Var1,y=Freq),color="#4baf49",
                              method = "loess",se = F,span=1,formula = y~I(1/x),size=1.5) +
      geom_point(shape=1,size=3,color="#4baf49",stroke=2)+
      labs(x="number of overlaps across studies",y="DE genes")+
     ylim(0,5000)+
      scale_x_continuous(breaks = as.numeric(point_plot_data$Var1))+
      theme_bw()+
      theme(panel.grid=element_blank())+
      theme(axis.title = element_text(size = 15,face = "bold",colour = "black"),
                  axis.text = element_text(size=15,face = "bold",colour = "black"),
                  panel.border = element_rect(fill=NA,color="black", size=3, linetype="solid"),
                  axis.ticks = element_line(size = 2))
    ggsave(supple_Fig1D, filename = "supple Fig1D.PDF", device = cairo_pdf, 
                  width = 5, height = 4, units = "in")
    #supple Fig1F
    rm(list = ls())
    Fig1F_data <- read.table("output_remove.txt",sep = "\t",header = T,check.names = F,row.names = 1)
    Fig1F_data <- Fig1F_data[,c(5,3,4,6,2,9,11,1,10,12,7,8)]
    heatmap_data<-matrix(data = NA,nrow = ncol(Fig1F_data),
                                              ncol = ncol(Fig1F_data),
                                              dimnames = list(colnames(Fig1F_data),colnames(Fig1F_data)))
    
    two_inter <- list()
    for (i in colnames(Fig1F_data)) {
      for (x in colnames(Fig1F_data)) {
        two_inter[[i]][[x]] <- rownames(na.omit(Fig1F_data[,c(i,x)]))
        heatmap_data[i,x] <- nrow(na.omit(Fig1F_data[,c(i,x)]))
      }
    }
    heatmap_data[upper.tri(heatmap_data)]<-NA
    
    fraction <- c()
    for (n in names(two_inter)) {
      inter_names <- names(two_inter)[names(two_inter)!=n]
      rname <- c()
      for (n2 in inter_names) {
        rname <- unique(c(rname,two_inter[[n]][[n2]]))
      }
      fraction <- c(fraction,length(rname)/length(Fig1F_data[,n][!is.na(Fig1F_data[,n])]))
    }
    heatmap_data_gg<-reshape2::melt(heatmap_data)
    heatmap_plot<-
      ggplot(heatmap_data_gg,aes(x=Var1,y=Var2,fill=log2(value)))+
      geom_tile(color = "white",na.rm = T,size=1.5)+
      scale_fill_gradient2(low = "#800080", high = "#faed5d", mid = "#65aeb0", 
                                                midpoint = 6,  space = "Lab",na.value = "white",guide = "colourbar") +
      labs(x="",y="",title = "Re-analysis")+
      theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1),
                  panel.border = element_rect(fill = NA,color = "black",size = 2),
                  axis.ticks = element_line(size = 1,linetype = 1),
                  legend.text = element_blank(),
                  plot.title = element_text(size=18,face = "bold",colour = "black",vjust = 2,hjust = 0.5),
                  axis.text = element_text(size=15,face = "bold",colour = "black"),
                  legend.title = element_blank(),legend.key.width = unit(0.7,"cm") 
      )+
      coord_fixed(1)+
      geom_text(aes(label=value),colour= "white")
      
    
    fraction_data <- as.data.frame(cbind(name=rep("col",length(fraction)),fraction,name_true = colnames(Fig1F_data)))
    fraction_data$fraction <- factor(fraction_data$fraction,levels=fraction)
    
    inter_ratio <- ggplot(data = fraction_data,aes(x=name,y=fraction,fill=as.numeric(as.character(fraction))))+
      geom_tile(color = "white",size = 1.5)+
      theme_bw()+
      scale_fill_gradient2(low = "#800080", high = "#faed5d", mid = "#65aeb0", 
                                                midpoint = 0.5,  space = "Lab",na.value = "white")+
     coord_fixed(ratio=1)+
      theme(axis.text = element_blank(),
                  axis.title = element_blank(),
                  legend.position="none",
                  panel.border = element_rect(fill = NA,color = "black",size = 2),
                  axis.ticks.x=element_blank(),
                  axis.ticks.y = element_line(size = 2),
                  )+
      geom_text(aes(label=signif(as.numeric(as.character(fraction)), 2),color="white"))
      # scale_y_discrete(breaks=fraction,labels=fraction_data$name_true,position = "right")
    
    all_plot <- ggarrange(heatmap_plot, inter_ratio,
                        ncol = 2, nrow = 1, 
                        widths = c(12, 1), heights = 9,
                        common.legend = TRUE,hjust = 0)  
    ggsave(all_plot, filename = "supple Fig1F2.PDF", device = cairo_pdf, 
                  width = 8, height = 7, units = "in")
    

    相关文章

      网友评论

        本文标题:刚上研一,复现blood(二)

        本文链接:https://www.haomeiwen.com/subject/tpuofdtx.html