R可视化:时序连线点图

作者: 生信学习者2 | 来源:发表于2020-12-07 09:29 被阅读0次

    最近处理一批小鼠衰老数据,如何展示不同时间点的基因表达数据比较好玩,因此记录一下。更多知识分享请到 https://zouhua.top/

    加载R包

    library(dplyr)
    library(tibble)
    library(ggplot2)
    library(data.table)
    library(convert)
    library(ggpubr)
    
    age <- c("1", "3", "6", "9", "12", "15", "18", "21", "24", "27")
    age.col <- c("#283891", "#EED3AC", "#C1272D", "#9DCEDC", 
                 "#ED1C24", "#9DCEDC", "#F89C31", "#B6B6BA", "#6DC06A", "#2D6BB4")
    

    导入数据

    phen <- read.csv("phenotype.csv")
    prof <- fread("TPM.tsv") %>%
      column_to_rownames("V1")
    mouse2human <- read.table("gene_mouse2human.tsv", 
                              sep = "\t", header = T)
    genelist <- read.table("gene_list.tsv", header = T)
    #mouse2human %>% filter(HGNC_symbol%in%genelist$gene2)
    

    处理数据

    get_expr_Set <- function(x, y, gene_list=gene_list,
                             ncount=10, occurrence=0.2){
      # x <- phen
      # y <- prof
      # ncount=10
      # occurrence=0.2
      
      prf <- y[rowSums(y) > ncount, ]
      prf <- prf[, colSums(prf) > 0]
      sid <- intersect(x$SampleID, colnames(y))
      phe <- x %>% filter(SampleID%in%sid) %>% 
        column_to_rownames("SampleID_v2") 
      prf.cln <- prf %>% dplyr::select(as.character(phe$SampleID)) %>% 
        rownames_to_column("tmp") %>% 
        filter(apply(dplyr::select(., -one_of("tmp")), 1, function(x) {
                sum(x != 0)/length(x)}) > occurrence) %>%
        column_to_rownames("tmp")
      
      # determine the right order between profile and phenotype 
      for(i in 1:ncol(prf.cln)){ 
        if (!(colnames(prf.cln)[i] == phe$SampleID[i])) {
          stop(paste0(i, " Wrong"))
        }
      }
      
      # change ensemble id into HGNC symbol
      mmu_hsa_gene <- inner_join(
        prf.cln %>% rownames_to_column("geneid"), 
        mouse2human %>% dplyr::select(ensembl_id_mouse, HGNC_symbol), 
        by = c("geneid" = "ensembl_id_mouse")) %>%
        dplyr::select(-geneid)
      
      mmu_hsa_gene$median <- apply(mmu_hsa_gene[,-ncol(mmu_hsa_gene)], 1, median)
      mmu_hsa_gene <- with(mmu_hsa_gene, 
                           mmu_hsa_gene[order(HGNC_symbol, median, decreasing = T), ])
      mmu_hsa_gene.new <- mmu_hsa_gene[!duplicated(mmu_hsa_gene$HGNC_symbol), ] %>%
        dplyr::select(-median)
      rownames(mmu_hsa_gene.new) <- NULL
      mmu_hsa_gene.new <- mmu_hsa_gene.new  %>% column_to_rownames("HGNC_symbol")
      
      mmu_hsa_gene.new.cln <- mmu_hsa_gene.new %>% rownames_to_column("gene") %>%
        filter(gene%in%gene_list$gene2) %>%
        column_to_rownames("gene")
      exprs <- as.matrix(mmu_hsa_gene.new.cln)
      metadata <-  new("AnnotatedDataFrame", data=phe)
      experimentData <- new("MIAME",
            name="Mouse aging", lab="Lab",
            contact="hua zou@outlook.com",
            title="Mouse Aging Experiment",
            abstract="The gene ExpressionSet",
            url="www.zouhua.top",
            other=list(notes="Created from text files"))
      expressionSet <- new("ExpressionSet", exprs=exprs,
                           phenoData=metadata, 
                           experimentData=experimentData)
      
      return(expressionSet)
    }
    gene_expr_set <- get_expr_Set(phen, prof, gene_list=genelist)
    

    获取画图数据

    dat_expr <- exprs(gene_expr_set) %>% data.frame()
    dat_expr.cln <- dat_expr[, colSums(dat_expr) > 0]
    dat_phen <- data.frame(SampleID=gene_expr_set$SampleID, Age=gene_expr_set$Age)
    mdat <- inner_join(dat_phen, 
                       dat_expr.cln %>% t() %>% data.frame() %>% rownames_to_column("SampleID"),
                       by = "SampleID") %>% 
            tidyr::gather(key="Gene", value="TMP_Value", -c("SampleID", "Age")) %>%
            mutate(Age=factor(Age, levels = age))
    

    画图

    pl <- ggplot(mdat, aes(x=Age, y=TMP_Value))+
      geom_dotplot(aes(fill=Age), binaxis='y', stackdir='center', dotsize=0.9,
                   bins = 30)+ 
      # stat_summary(fun.data="mean_sdl", fun.args = list(mult=1), 
      #                geom="crossbar", width=0.5)+
      stat_summary(fun.data=mean_sdl, fun.args = list(mult=1), 
            geom="errorbar", color="red", width=0.2)+
      stat_summary(fun.y=mean, geom="point", color="black", fill="white", size=2)+
      stat_summary(fun.y=mean, geom="line", group=1, linetype=1, size=0.7)+
      stat_compare_means(comparisons = list(c("1", "3")),
                         method = "wilcox.test")+
      scale_fill_manual(values = age.col)+
      scale_x_discrete(breaks=age,
            labels=paste0(age, "\nMonth"))+
      labs(x="", y="Gene expression value (TPM)")+
      facet_wrap(facets = "Gene", scales = "free")+
      theme_bw()+ 
      theme(axis.ticks.length = unit(0.2, "cm"),
              axis.title = element_text(face = "bold", size = 12),
              axis.text.x = element_text(face = "bold", size = 10),
              axis.text.y = element_text(size = 10, face = "bold"),
              strip.text = element_text(color = 'red', face = 'bold', size = rel(1.5)),
              strip.background = element_rect(colour = 'black', size = rel(2)))
    pl
    

    保存图形

    ggsave("test.pdf", width = 14, height = 8, dpi = 300)
    #ggsave("test.png", width = 14, height = 8, dpi = 300)
    

    参考

    参考文章如引起任何侵权问题,可以与我联系,谢谢。

    相关文章

      网友评论

        本文标题:R可视化:时序连线点图

        本文链接:https://www.haomeiwen.com/subject/ukszwktx.html