美文网首页四象限bioinformaticsR语言做图
R可视化:组间差异结果的可视化

R可视化:组间差异结果的可视化

作者: 生信学习者2 | 来源:发表于2021-06-29 09:48 被阅读0次

    前言

    对质谱数据做单变量的组间差异检验后,我们获得了检验的结果。常用展示结果的方法有图和表的方式,我们在文献中常用图来展示,而表可作为附表补充图。更多知识分享请到 https://zouhua.top/

    本文讲使用火山图,韦恩图和扩展火山图展示组间差异结果,最后再结合热图展示所有的组间差异蛋白的丰度分布情况。

    Importing data

    library(dplyr)
    library(tibble)
    library(convert)
    library(ggplot2)
    library(ggrepel)
    library(data.table)
    
    phen <- read.csv("phenotype_20210625.csv")
    NC_PC_DEP <- fread("NC_PC_limma_Mass_LOG2Impute.csv")
    NC_PT_DEP <- fread("NC_PT_limma_Mass_LOG2Impute.csv")
    ExprSet <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")
    
    subgrp <- c("NC", "PC", "PT")
    grp.col <- c("#568875", "#73FAFC", "#EE853D")
    

    Volcano Function

    # function
    volcanofun <- function(datset=NC_PC_DEP,
                           genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1"),
                           group_name=subgrp[1:2],
                           group_col=grp.col[1:2],
                           pval=0.05, 
                           fc=1){
      
      # datset=NC_PC_DEP
      # genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1")
      # group_name=subgrp[1:2]
      # group_col=grp.col[1:2]
      # pval=0.05
      # fc=1
      
      dat <- datset %>% 
        mutate(color=factor(Enrichment, 
                            levels = c(group_name, "Nonsignif")))  
      # print(table(dat$color))
      dat_status <- table(dat$color)
      dat_status_number <- as.numeric(dat_status)
      dat_status_name <- names(dat_status)
      legend_label <- c(paste0(dat_status_name[1], " (", dat_status_number[1], ")"),
                        paste0(dat_status_name[2], " (", dat_status_number[2], ")"),
                        paste0("Nonsignif", " (", dat_status_number[3], ")"))
      
      dat.signif <- subset(dat, adj.P.Val < pval & abs(logFC) > fc) %>%
        filter(GeneID%in%genelist)
      print(table(dat.signif$color))
      
      group_col_new <- c(group_col, "#979797")
      group_name_new <- levels(dat$color)
      
      xlabel <- paste0("log2(", paste(group_name, collapse="/"), ")")
      
      # Make a basic ggplot2 object with x-y values
      pl <- ggplot(dat, aes(x = -logFC, y = -log10(adj.P.Val), color = color))+ 
              geom_point(size = 2, alpha = 1, stroke = 1)+
              scale_color_manual(name = NULL,
                                 values = group_col_new,
                                 labels = c(legend_label, "Nonsignif"))+
              xlab(xlabel) + 
              ylab(expression(-log[10]("adjusted p-value")))+ 
              geom_hline(yintercept=-log10(pval), alpha=.8, linetype=2, size=.7)+
              geom_vline(xintercept=fc, alpha=.8, linetype=2, size=.7)+
              geom_vline(xintercept=-fc, alpha=.8, linetype=2, size=.7)+ 
              geom_text_repel(data = dat.signif,
                              aes(label = GeneID),
                              size = 4,
                              max.overlaps = getOption("ggrepel.max.overlaps", default = 80),
                              segment.linetype = 1,
                              segment.curvature = -1e-20,
                              box.padding = unit(0.35, "lines"),
                              point.padding = unit(0.3, "lines"),
                              arrow = arrow(length = unit(0.005, "npc")),
                              color = "black",     # text color
                              bg.color = "white", # shadow color
                              bg.r = 0.15)+
              annotate("text", x=min(dat$logFC), y=-log10(pval), label=pval, size=6, color="red")+
              annotate("text", x=fc, y=0, label=fc, size=6, color="red")+
              annotate("text", x=-fc, y=0, label=-fc, size=6, color="red")+
              scale_y_continuous(trans = "log1p")+
              guides(color=guide_legend(override.aes = list(size = 3)))+
              theme_bw()+ 
              theme(axis.title = element_text(color = "black", size = 12),
                    axis.text = element_text(color = "black", size = 10),
                    text = element_text(size = 8, color = "black", family="serif"),
                    panel.grid = element_blank(),
                    #legend.position = "right",
                    legend.position = c(.15, .1),
                    legend.key.height = unit(0.6,"cm"),
                    legend.text = element_text(face = "bold", color = "black", size = 8),
                    strip.text = element_text(face = "bold", size = 14))
      return(pl)
    }
    
    Gene_boxplot <- function(datset=ExprSet,
            genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1"),
            group_name=subgrp[1:2],
            group_col=grp.col[1:2]){
    
      # datset=ExprSet
      # genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1")
      # group_name=subgrp[1:2]
      # group_col=grp.col[1:2]
      
      pheno <- pData(datset) %>%
        filter(SubGroup%in%group_name)
      pheno$SubGroup <- factor(as.character(pheno$SubGroup), levels = group_name)
    
      
      edata <- data.frame(exprs(datset)) %>%
        dplyr::select(rownames(pheno)) %>%
        rownames_to_column("GeneID") %>%
        filter(GeneID%in%genelist) %>%
        column_to_rownames("GeneID")
      
      mdat <- pheno %>% dplyr::select(SubGroup) %>%
        rownames_to_column("SampleID") %>%
        inner_join(t(edata) %>% data.frame() %>% rownames_to_column("SampleID"), by = "SampleID") %>%
        column_to_rownames("SampleID")
      
      plotdata <- mdat %>% tidyr::gather(key="geneID", value="value", -SubGroup) %>%
        mutate(SubGroup=factor(SubGroup, levels = group_name))
      
      plotdata$geneID <- factor(plotdata$geneID, levels = genelist)  
      
      pl <- ggplot(plotdata, aes(x = SubGroup, y = value, fill= SubGroup))+
        stat_boxplot(geom = "errorbar", width = 0.15,
                     position = position_dodge(0.4)) +    
        geom_boxplot(width = 0.4, 
                     outlier.colour = "black", 
                     outlier.shape = 21, 
                     outlier.size = .5)+
        scale_fill_manual(values = group_col)+
        #scale_y_continuous(labels = scales::scientific)+
        facet_wrap(facets = "geneID", scales = "free_y", nrow = 2)+    
        labs(x="", y="Relative Abundance (Mass Spectrometry)")+
        guides(fill=F)+
        theme_classic()+
        theme(axis.title = element_text(color = "black", size = 12),
              axis.text.x = element_text(color = "black", size = 10, hjust = .5, vjust = .5),
              text = element_text(size = 8, color = "black", family="serif"),
              panel.grid = element_blank(),
              strip.text = element_text(face = "bold", size = 12))    
        
      return(pl)
    }
    
    NC_PC_volcano <- volcanofun(datset=NC_PC_DEP,
               genelist=c("FIBB","FIBA","FA5","OIT3","IBP2","GANAB","RELN","LYSC","CADH1","EGF"),
               group_name=subgrp[1:2],
               group_col=grp.col[1:2])
    
    NC_PC_volcano
    ggsave("NC_PC_Mass_volcano.pdf", NC_PC_volcano, width = 8, height = 6)
    NC_PC_boxplot <- Gene_boxplot(datset=ExprSet,
            genelist=c("FIBB","FIBA","FA5","OIT3","IBP2","GANAB","RELN","LYSC","CADH1","EGF"),
            group_name=subgrp[1:2],
            group_col=grp.col[1:2])
    NC_PC_boxplot
    ggsave("NC_PC_Mass_SpecificDEP_boxplot.pdf", NC_PC_boxplot, width = 8, height = 6)
    

    Venn plot for Overlap DEGs

    get_DEG_List <- function(datset1=NC_PC_DEP,
                             datset2=NC_PT_DEP){
      # datset1=NC_PC_DEP
      # datset2=NC_PT_DEP
      
      diff_gene1 <- datset1 %>% filter(Enrichment != "Nonsignif")
      diff_gene2 <- datset2 %>% filter(Enrichment != "Nonsignif")
      res <- list(diff1=diff_gene1$GeneID,
                             diff2=diff_gene2$GeneID)
      return(res)
    }
    
    diff_gene_list <- get_DEG_List(datset1=NC_PC_DEP, datset2=NC_PT_DEP)
    
    # eulerr
    require(eulerr)
    diff_venn <- euler(diff_gene_list, shape = "ellipse")
    
    # pdf(file = "Mass_DEPs_Venn.pdf", width = 5, height = 4)
    plot(diff_venn, 
         fills = alpha(grp.col[2:3], 0.5),
         labels = c("NC vs PC", "NC vs PT"),
         quantities = TRUE,
         col = "black")
    # dev.off()
    

    quadrant plot

    A quadrant plot shows the common or different Proteins between two comparisons.

    • 1st quadrant: the Proteins up-regulate in both PC and PT

    • 2nd quadrant: the Proteins up-regulate in PC but down-regulate in PT

    • 3rd quadrant: the Proteins down-regulate in both PC and PT

    • 4th quadrant: the Proteins down-regulate in PC but up-regulate in PT

    quadrant_plot <- function(datset1=NC_PC_DEP, 
                              datset2=NC_PT_DEP,
                              thresholdFC=3,
                              group_name=subgrp,
                              axis_len=12){
      
      # datset1=NC_PC_DEP
      # datset2=NC_PT_DEP
      # thresholdFC=1
      # group_name=subgrp
      # axis_len=5
      
      overlap_gene <- union(datset1$GeneID, datset2$GeneID)
      dat_x <- datset1 %>% filter(GeneID%in%overlap_gene)
      dat_y <- datset2 %>% filter(GeneID%in%overlap_gene)
      
      # the Differential Genes 
      dat_x_signif <- subset(dat_x, Enrichment != "Nonsignif")
      dat_y_signif <- subset(dat_y, Enrichment != "Nonsignif")
      
      # enriched in NC and Disease in both  
      common_signif <- intersect(dat_x_signif$GeneID, dat_y_signif$GeneID)
      
      # enriched in NC and Disease in each
      dat_x_sig_only <- setdiff(dat_x_signif$GeneID, common_signif)
      dat_y_sig_only <- setdiff(dat_y_signif$GeneID, common_signif)
      non_signif <- setdiff(overlap_gene, c(common_signif, dat_x_sig_only, dat_y_sig_only))
      
      
      gene_type_name <- c(paste(group_name[2:3], collapse = "/"),
                          paste(group_name[2], "only"),
                          paste(group_name[3], "only"),
                          "Nonsignif")
      
      gene_type_df <- rbind(data.frame(GeneID=common_signif, group=gene_type_name[1]),
                            data.frame(GeneID=dat_x_sig_only, group=gene_type_name[2]),
                            data.frame(GeneID=dat_y_sig_only, group=gene_type_name[3]),
                            data.frame(GeneID=non_signif, group=gene_type_name[4]))
      mdat <- inner_join(dat_x %>% dplyr::select(GeneID, logFC) %>% dplyr::rename(xvalue=logFC),
                         dat_y %>% dplyr::select(GeneID, logFC) %>% dplyr::rename(yvalue=logFC),
                         by = "GeneID") %>%
        inner_join(gene_type_df, by="GeneID") %>%
        mutate(group=factor(group, levels = rev(gene_type_name)))
      
      print(table(mdat$group))
      
      common_signif_gene <- mdat %>% filter(GeneID%in%common_signif)
      
      common_signif_gene <- mdat %>% filter(GeneID%in%common_signif) %>%
        mutate(GeneID_v2=ifelse(abs(xvalue) > thresholdFC | abs(yvalue) > thresholdFC, GeneID, NA))
      
      common_signif_gene_v2 <- na.omit(common_signif_gene)
      print(table(common_signif_gene_v2$group))
    
      require(magrittr)
      # constants
      axis_begin  <- -axis_len
      axis_end    <- axis_len
      total_ticks <- 8
      
      # point to plot
      my_point <- data.frame(x=1, y=1)
      # chart junk data
      tick_frame <- data.frame(ticks = seq(axis_begin, axis_end, by=2), zero=0) %>%
        subset(ticks != 0)
      tick_frame <- data.frame(ticks = seq(axis_begin, axis_end, by=2), zero=0) %>%
        subset(ticks != 0)
      lab_frame <- data.frame(lab = seq(axis_begin, axis_end, 2), zero = 0) %>%
        subset(lab != 0)
      tick_sz <- (tail(lab_frame$lab, 1) -  lab_frame$lab[1]) / 128
      
      x_title <- paste(group_name[1], "vs", group_name[2])
      y_title <- paste(group_name[1], "vs", group_name[3])
      
      pl <- ggplot(mdat)+
        geom_segment(x = 0, xend = 0, 
                     y = lab_frame$lab[1], yend = tail(lab_frame$lab, 1),
                     size = 0.5) +
        geom_segment(y = 0, yend = 0, 
                     x = lab_frame$lab[1], xend = tail(lab_frame$lab, 1),
                     size = 0.5) +
        geom_segment(data = tick_frame, 
                     aes(x = ticks, xend = ticks, 
                         y = zero, yend = zero + tick_sz)) +
        geom_segment(data = tick_frame, 
                     aes(x = zero, xend = zero + tick_sz, 
                         y = ticks, yend = ticks)) + 
        geom_text(data=lab_frame, aes(x=lab, y=zero, label=lab),
                  family = 'Times', vjust=1.5) +
        geom_text(data=lab_frame, aes(x=zero, y=lab, label=lab),
                  family = 'Times', hjust=1.5) +
        annotate("text", x = axis_len-1, y = -.7, color = "black", size=3, 
                 label = paste0("log2(", x_title, ")"))+
        annotate("text", x = -.7, y = axis_len-1, color = "black", size=3, angle=90, 
                 label = paste0("log2(", y_title, ")"))+
        geom_point(aes(x = xvalue, y = yvalue, color=group), size = 1.5)+
        geom_text_repel(data = common_signif_gene,  
                        aes(x=xvalue, y=yvalue, label = GeneID_v2),
                        size = 5,
                        max.overlaps = getOption("ggrepel.max.overlaps", default = 80),
                        segment.linetype = 1,
                        segment.curvature = -1e-20,
                        box.padding = unit(0.35, "lines"),
                        point.padding = unit(0.3, "lines"),
                        arrow = arrow(length = unit(0.005, "npc")),
                        # color = "white",     # text color
                        # bg.color = "grey30", # shadow color
                        bg.r = 0.15)+
        scale_color_manual(values = c("#A6A6A6", "#7BBDE0", "#B67FD0", "#FDC361"))+
        guides(color=guide_legend(title = NULL, keywidth=.9, keyheight=.9, linetype=2))+
        theme_void()+
        theme(panel.grid = element_blank(),
              text = element_text(size = 8, color = "black", family="serif"),      
              legend.text=element_text(size=11, color = "black"),
              legend.position = c(.7, .2),
              legend.justification = c(0, 0),
              legend.background = element_rect(linetype=2, color = "black", fill="white"))
      
      return(pl)
    }
    
    quadrantpl <- quadrant_plot(datset1=NC_PC_DEP, 
                                datset2=NC_PT_DEP,
                                thresholdFC=3,
                                group_name=subgrp,
                                axis_len=8)
    quadrantpl
    ggsave("Mass_DEPs_quadrant.pdf", quadrantpl, width = 6, height = 6)
    

    Heatmap Of DEGs

    heatFun <- function(datset1=NC_PC_DEP, 
                        datset2=NC_PT_DEP,
                        thresholdFC=1,
                        ExprSet=ExprSet,
                        group_name=subgrp){
      
      # datset1=NC_PC_DEP
      # datset2=NC_PT_DEP
      # thresholdFC=1
      # ExprSet=ExprSet
      # group_name=subgrp
      
      
      diff_gene1 <- datset1 %>% filter(Enrichment != "Nonsignif") %>%
        filter(abs(logFC) > thresholdFC)
      diff_gene2 <- datset2 %>% filter(Enrichment != "Nonsignif") %>%
        filter(abs(logFC) > thresholdFC)
      
      union_gene <- Reduce(union, list(diff_gene1$GeneID, diff_gene2$GeneID))
      
      pheno <- pData(ExprSet) %>% data.frame() %>%
        rownames_to_column("SampleID") %>%
        filter(SubGroup%in%group_name) %>%
        mutate(SubGroup=factor(SubGroup, levels = group_name)) %>%
        arrange(SubGroup) %>%
        column_to_rownames("SampleID")
      
      edata <- exprs(ExprSet) %>% data.frame() %>%
        rownames_to_column("geneid") %>%
        filter(geneid%in%union_gene) %>%
        dplyr::select(c("geneid", rownames(pheno))) %>%
        column_to_rownames("geneid")
      
      # scale data: z-score
      scale_rows <- function (x) {
          m = apply(x, 1, mean, na.rm = T)
          s = apply(x, 1, sd, na.rm = T)
          return((x - m)/s)
      }  
      edata_scaled <- t(scale_rows(edata))
      require(circlize)
      col_fun <- colorRamp2(c(round(range(edata_scaled)[1]), 0, 
                              round(range(edata_scaled)[2])),
                            c("blue", "white", "red")) 
      # row split 
      dat_status <- table(pheno$SubGroup)
      dat_status_number <- as.numeric(dat_status)
      dat_status_name <- names(dat_status)
      row_split <- c()
      for (i in 1:length(dat_status_number)) {
        row_split <- c(row_split, rep(i, dat_status_number[i]))
      }
      require(ComplexHeatmap)
      pl <- Heatmap(
              edata_scaled, 
              #col = col_fun,
              cluster_rows = FALSE,
              row_order = rownames(pheno),
              show_column_names = FALSE,
              show_row_names = FALSE,
              row_names_gp = gpar(fontsize = 12),
              row_names_side = "right",
              row_dend_side = "left",
              column_title = NULL, 
              heatmap_legend_param = list(
                title = "Relative Abundance\nZscore",
                title_position = "topcenter",
                border = "black",
                legend_height = unit(10, "cm"),
                direction = "horizontal"),
             row_split = row_split,
            left_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = 2:4),
                labels = group_name, 
                labels_gp = gpar(col = "black", fontsize = 12))),         
             column_km = 3
        )
      return(pl)
    }
    
    heatFun(datset1=NC_PC_DEP, 
            datset2=NC_PT_DEP,
            thresholdFC=1,
            ExprSet=ExprSet,
            group_name=subgrp)
    

    结果效果图类似如下截图

    systemic information

    sessionInfo()
    
    R version 4.0.2 (2020-06-22)
    Platform: x86_64-conda_cos6-linux-gnu (64-bit)
    Running under: CentOS Linux 8 (Core)
    
    Matrix products: default
    BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so
    
    locale:
     [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
     [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
     [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
    
    attached base packages:
    [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     
    
    other attached packages:
     [1] ggrepel_0.9.1.9999          SummarizedExperiment_1.20.0 GenomicRanges_1.42.0        GenomeInfoDb_1.26.4        
     [5] IRanges_2.24.1              S4Vectors_0.28.1            MatrixGenerics_1.2.1        matrixStats_0.58.0         
     [9] data.table_1.14.0           umap_0.2.7.0                vegan_2.5-6                 lattice_0.20-41            
    [13] permute_0.9-5               Rtsne_0.15                  convert_1.64.0              marray_1.68.0              
    [17] limma_3.46.0                Biobase_2.50.0              BiocGenerics_0.36.0         ggplot2_3.3.3              
    [21] tibble_3.1.0                dplyr_1.0.5                
    
    loaded via a namespace (and not attached):
     [1] nlme_3.1-150           bitops_1.0-6           tools_4.0.2            backports_1.2.1        utf8_1.2.1            
     [6] R6_2.5.0               DBI_1.1.1              mgcv_1.8-34            colorspace_2.0-0       withr_2.4.1           
    [11] tidyselect_1.1.0       curl_4.3               compiler_4.0.2         DelayedArray_0.16.3    scales_1.1.1          
    [16] askpass_1.1            digest_0.6.27          foreign_0.8-81         rmarkdown_2.7          rio_0.5.16            
    [21] XVector_0.30.0         pkgconfig_2.0.3        htmltools_0.5.1.1      rlang_0.4.10           readxl_1.3.1          
    [26] generics_0.1.0         jsonlite_1.7.2         zip_2.1.1              car_3.0-10             RCurl_1.98-1.3        
    [31] magrittr_2.0.1         GenomeInfoDbData_1.2.4 Matrix_1.3-2           Rcpp_1.0.6             munsell_0.5.0         
    [36] fansi_0.4.2            abind_1.4-5            reticulate_1.18        lifecycle_1.0.0        stringi_1.4.6         
    [41] yaml_2.2.1             edgeR_3.32.1           carData_3.0-4          MASS_7.3-53.1          zlibbioc_1.36.0       
    [46] grid_4.0.2             forcats_0.5.0          crayon_1.4.1           haven_2.3.1            splines_4.0.2         
    [51] hms_1.0.0              locfit_1.5-9.4         knitr_1.31             pillar_1.6.0           ggpubr_0.4.0          
    [56] ggsignif_0.6.0         glue_1.4.2             evaluate_0.14          vctrs_0.3.7            cellranger_1.1.0      
    [61] gtable_0.3.0           openssl_1.4.3          purrr_0.3.4            tidyr_1.1.3            assertthat_0.2.1      
    [66] xfun_0.20              openxlsx_4.2.3         broom_0.7.6            RSpectra_0.16-0        rstatix_0.7.0  
    

    Reference

    1. Core functional nodes and sex-specific pathways in human ischaemic and dilated cardiomyopathy

    相关文章

      网友评论

        本文标题:R可视化:组间差异结果的可视化

        本文链接:https://www.haomeiwen.com/subject/xpyhultx.html