转录组DEGs聚类热图和功能富集分析

作者: kkkkkkang | 来源:发表于2022-04-18 16:29 被阅读0次

    写在前面:经常做转录组分析,就是把差异基因搞个火山图和Venn图看各组差异基因的共有和特有情况。看见有个比较好的选择,能直观比较各种处理带来的影响,如下:

    image.png
    来自Nature plants的一篇文章
    Ref:
    https://github.com/YulongNiu/MPIPZ_microbe-host_homeostasis
    https://www.nature.com/articles/s41477-021-00920-2
    这个图很牛逼啊,表示的信息量很全,值得学习。去扒作者的代码,复现出了大部分

    所需文件:
    总的基因丰度表,即各个基因在每个样品中的丰度


    image.png

    各个样品的基因差异表达情况
    举一个例子,这是Deseq2算出来的:


    image.png

    1. 导入数据并做一些处理

    setwd("C:/Users/yjk/Desktop/cluster_DEGs")
    library("dplyr")
    library("ComplexHeatmap")
    library("tibble")
    library("RColorBrewer")
    library("ggplot2")
    my_theme()
    all_genes <- read.table("all_genes.txt", header = TRUE) # fpkm
    
    # DESeq2获得的差异表达基因(DEGs), |log2foldchange| > 1, padj < 0.05
    HF12N_Rs <- read.table("HF12N_Rs.txt", header = TRUE, sep = "\t")
    HF12N_S <- read.table("HF12N_S.txt", header = TRUE, sep = "\t")
    HF12Rs_N_S <- read.table("HF12Rs_N_S.txt", header = TRUE, sep = "\t")
    HF12S_Rs <- read.table("HF12S_Rs.txt", header = TRUE, sep = "\t")
    HG64N_Rs <- read.table("HG64N_Rs.txt", header = TRUE, sep = "\t")
    HG64N_S <- read.table("HG64N_S.txt", header = TRUE, sep = "\t")
    HG64Rs_N_S <- read.table("HG64Rs_N_S.txt", header = TRUE, sep = "\t")
    HG64S_Rs <- read.table("HG64S_Rs.txt", header = TRUE, sep = "\t")
    # 合并差异表达和基因丰度
    all <- HF12N_Rs %>% 
        full_join(HF12N_S) %>% 
        full_join(HF12Rs_N_S) %>% 
        full_join(HF12S_Rs) %>% 
        full_join(HG64N_Rs) %>% 
        full_join(HG64N_S) %>% 
        full_join(HG64Rs_N_S) %>% 
        full_join(HG64S_Rs) %>% 
        left_join(all_genes)
    all[is.na(all)] <- "No"
    ## mean func
    meanGp <- function(v) {
        require('magrittr')
        res <- v %>%
            split(rep(1 : 8, each = 3)) %>%
            sapply(mean, na.rm = TRUE)
        return(res)
    }
    
    all_for_cluster <- select(all, -contains("vs")) # 只选择原始fpkm数据
    rownames(all_for_cluster) <- all_for_cluster$id
    all_for_cluster <- all_for_cluster[-1]
    
    ## sample name
    sampleN <- c("HG64NRs","HG64SRs","HG64N","HG64S",
                 "HF12NRs","HF12SRs","HF12N","HF12S")
    
    meanCount <- all_for_cluster %>%
        apply(1, meanGp) %>%
        t
    
    colnames(meanCount) <- sampleN
    ## scale
    scaleCount <- meanCount %>%
        t %>%
        scale %>%
        t
    scaleCount %<>% .[complete.cases(.), ]
    

    2. 然后要先计算多少个cluster合适:

    # set.seed(123) 聚类结果一致
    set.seed(123)
    ## choose cluster num
    ## 1. sum of squared error
    wss <- (nrow(scaleCount) - 1) * sum(apply(scaleCount, 2, var))
    
    for (i in 2:20) {
        wss[i] <- sum(kmeans(scaleCount,
                             centers=i,
                             algorithm = 'MacQueen')$withinss)
    }
    
    ggplot(tibble(k = 1:20, wss = wss), aes(k, wss)) +
        geom_point(colour = '#D55E00', size = 3) +
        geom_line(linetype = 'dashed') +
        xlab('Number of clusters') +
        ylab('Sum of squared error')
    
    # ggsave("Sum_of_squared_error.pdf", height = 3, width = 4)
    ## 2. Akaike information criterion
    kmeansAIC = function(fit){
        m = ncol(fit$centers)
        n = length(fit$cluster)
        k = nrow(fit$centers)
        D = fit$tot.withinss
        return(D + 2*m*k)
    }
    
    aic <- numeric(20)
    for (i in 1:20) {
        fit <- kmeans(x = scaleCount, centers = i, algorithm = 'MacQueen')
        aic[i] <- kmeansAIC(fit)
    }
    
    ggplot(tibble(k = 1:20, aic = aic), aes(k, wss)) +
        geom_point(colour = '#009E73', size = 3) +
        geom_line(linetype = 'dashed') +
        xlab('Number of clusters') +
        ylab('Akaike information criterion')
    
    # ggsave("Akaike_information_criterion.pdf", height = 3, width = 4)
    # choose cluster 10
    

    withinss: Vector of within-cluster sum of squares, one component per cluster.

    我们上面计算的是withinss,即cluster内的误差平方和,同一个cluster趋势越一致则越小。所以cluster越多则这个值越小,这和我们认知一致。但是,cluster太多了信息很杂,太少了就成了生拉硬扯成cluster了

    image.png
    可以看出选则10比较合适
    利用另外一个参数:赤池信息量准则(Akaike information criterion,简称AIC)来衡量

    AIC介绍,https://www.biaodianfu.com/aic-bic.html
    理论上来讲,增加自由参数的数目可以提高拟合的优良性,但是过多,模型过于复杂容易造成过拟合现象。因此,AIC不仅要提高模型拟合度(极大似然),而且引入了惩罚项,使模型参数尽可能少,有助于降低过拟合的可能性。

    image.png
    可以看出,两种方法,同样结果。选择10

    3. 然后聚类:

    kclust10 <- kmeans(scaleCount, centers = 10, algorithm = "MacQueen", nstart = 1000, iter.max = 20)
    cl <- as.data.frame(kclust10$cluster) %>% 
        rownames_to_column("id") %>% 
        set_colnames(c("id","cl"))
    
    heat_all <- all %>% left_join(cl)
    
    
    # 把所有DEGs的cluster信息保存,为后面富集分析所用
    degs_cl <- heat_all %>%
        select(c("id","cl"))
    write.table(degs_cl, "./enrichment/degs_cl.txt", sep = "\t", quote = FALSE)
    
    
    scaleC <- heat_all %>% 
        select(-contains("vs")) %>% 
        select(-c("id","cl")) %>% 
        t %>% 
        scale %>%
        t %>%
        as_tibble %>%
        bind_cols(heat_all %>% select(id, cl))
    # 排序
    scaleC <- scaleC[,c("HG64S1", "HG64S2", "HG64S3",
                     "HG64N1", "HG64N2", "HG64N3",
                     "HG64SRs1", "HG64SRs2", "HG64SRs3",
                     "HG64NRs1", "HG64NRs2", "HG64NRs3",
                     "HF12S1", "HF12S2", "HF12S3",
                     "HF12N1", "HF12N2", "HF12N3",
                     "HF12SRs1", "HF12SRs2", "HF12SRs3",
                     "HF12NRs1", "HF12NRs2", "HF12NRs3",
                     "id", "cl")]
    
    
    ## col annotation
    NatSoil <- HeatmapAnnotation(NatSoil = c(rep(rep(c("No", "Yes", "No", "Yes"), each = 3),2)),
                                 col = list(NatSoil = c("Yes" = "grey", "No" = "white")),
                                 gp = gpar(col = "black"))
    Rs <- HeatmapAnnotation(Rs = c(rep(rep(c("No", "Yes"), each = 6),2)),
                                   col = list(Rs = c("Yes" = "grey", "No" = "white")),
                            gp = gpar(col = "black"))
    
    ## DEG annotation
    deg <- heat_all %>% select(matches("vs"))
    # order
    deg <- deg[,c("HG64N_vs_HG64S", "HF12N_vs_HF12S", "HG64NRs_vs_HG64SRs", "HF12RsN_vs_HF12RsS", 
                  "HG64NRs_vs_HG64N", "HF12NRs_vs_HF12N", "HG64SRs_vs_HG64S", "HF12SRs_vs_HF12S")]
    Heatmap(matrix = scaleC[1:24],
            name = 'Scaled Counts',
            row_split = scaleC$cl,
            row_gap = unit(2, "mm"),
            column_order = 1:24,
            # column_split = rep(c("HG64S","HG64N","HG64SRs","HG64NRs",
                                 # "HF12S","HF12N","HF12SRs","HF12NRs"), each = 3),
            column_split = factor(rep(c("HG64","HF12"), each = 12), levels = c("HG64","HF12")),
            show_column_names = FALSE,
            col = colorRampPalette(rev(brewer.pal(n = 10, name = 'Spectral'))[c(-3,-8)])(100),
            top_annotation = c(NatSoil, Rs),
            row_title_gp = gpar(fontsize = 10),
            use_raster = FALSE) +
            Heatmap(deg,
                    col = c('Down' = '#00bbf9', 'No' = 'white', 'Up' = '#f94144'),
                    column_names_gp = gpar(fontsize = 6),
                    heatmap_legend_param = list(title = 'DEGs'),
                    cluster_columns = FALSE,
                    column_names_rot = 45,
                    use_raster = FALSE)
    
    
    image.png

    4. 然后每个cluster有什么功能呢?富集分析

    #####################
    setwd("C:/Users/yjk/Desktop/cluster_DEGs/enrichment")
    library("clusterProfiler")
    library("magrittr")
    library("tidyverse")
    library("RColorBrewer")
    library("AnnotationHub")
    my_theme()
    
    # > packageVersion("AnnotationHub")
    # [1] ‘3.0.2’
    # packageVersion("clusterProfiler")
    # [1] ‘4.0.5’
    
    hub <- AnnotationHub()
    # snapshotDate(): 2021-05-18
    query(hub, "solanum")
    # AnnotationHub with 9 records
    # # snapshotDate(): 2021-05-18
    # # $dataprovider: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/, Inparanoid8, WikiPathways
    # # $species: Solanum lycopersicum, Solanum tuberosum, Solanum pennellii, Solanum pennelli, Sola...
    # # $rdataclass: OrgDb, Inparanoid8Db, Tibble
    # # additional mcols(): taxonomyid, genome, description, coordinate_1_based, maintainer,
    # #   rdatadateadded, preparerclass, tags, rdatapath, sourceurl, sourcetype 
    # # retrieve records with, e.g., 'object[["AH10593"]]' 
    # 
    # title                                             
    # AH10593 | hom.Solanum_lycopersicum.inp8.sqlite              
    # AH10606 | hom.Solanum_tuberosum.inp8.sqlite                 
    # AH91815 | wikipathways_Solanum_lycopersicum_metabolites.rda 
    # AH94101 | org.Solanum_pennellii.eg.sqlite                   
    # AH94102 | org.Solanum_pennelli.eg.sqlite                    
    # AH94160 | org.Solanum_tuberosum.eg.sqlite                   
    # AH94246 | org.Solanum_esculentum.eg.sqlite                  
    # AH94247 | org.Solanum_lycopersicum.eg.sqlite                
    # AH94248 | org.Solanum_lycopersicum_var._humboldtii.eg.sqlite
    
    sly.db <- hub[["AH94247"]]
    

    这里如果遇到提示

    hub <- AnnotationHub()
    snapshotDate(): 2021-05-18
    Warning message:DEPRECATION: As of AnnotationHub (>2.23.2), default caching location has changed.
    Problematic cache: C:\Users\yjk\AppData\Local/AnnotationHub/AnnotationHub/Cache
    See https://bioconductor.org/packages/devel/bioc/vignettes/AnnotationHub/inst/doc/TroubleshootingTheCache.html#default-caching-location-update

    运行下面的命令即可

     moveFiles<-function(package){
          olddir <- path.expand(rappdirs::user_cache_dir(appname=package))
          newdir <- tools::R_user_dir(package, which="cache")
          dir.create(path=newdir, recursive=TRUE)
          files <- list.files(olddir, full.names =TRUE)
          moveres <- vapply(files,
                            FUN=function(fl){
                                filename = basename(fl)
                                newname = file.path(newdir, filename)
                                file.rename(fl, newname)
                            },
                            FUN.VALUE = logical(1))
          if(all(moveres)) unlink(olddir, recursive=TRUE)
      }
    

    下面就可以进行GO和KEGG富集分析了

    kmeansRes <- read.table("degs_cl.txt")
    
    prefix <- 'kmeans10'
    savepath <- "C:/Users/yjk/Desktop/cluster_DEGs/enrichment"
    
    for (i in kmeansRes$cl %>% unique) {
        ## BP
        goBP <- enrichGO(gene = kmeansRes %>% filter(cl == i) %>% .$id,
                         OrgDb = sly.db,
                         keyType= 'SYMBOL',
                         ont = 'BP',
                         pAdjustMethod = 'BH',
                         pvalueCutoff = 0.05,
                         qvalueCutoff = 0.1)
        
        goBPSim <- clusterProfiler::simplify(goBP,
                                             cutoff = 0.5,
                                             by = 'p.adjust',
                                             select_fun = min)
        ## check and plot
        write.table(as.data.frame(goBPSim),
                  paste0(prefix, '_cl', i, '_cp_BP.txt') %>% file.path(savepath, .),
                  quote = FALSE,
                  sep = "\t")
        
        ## KEGG
        kk2 <- enrichKEGG(gene = kmeansRes %>% filter(cl == i) %>% .$id %>%
                          bitr(.,"SYMBOL", "ENTREZID", sly.db) %>% dplyr::select("ENTREZID") %>%
                              unlist(),
                          organism = 'sly',
                          pvalueCutoff = 0.05)
        
        write.table(as.data.frame(kk2),
                  paste0(prefix, '_cl', i, '_cp_KEGG.txt') %>% file.path(savepath, .),
                  quote = FALSE,
                  sep = "\t")
    }
    
    kall <- lapply(kmeansRes$cl %>% unique, function(x) {
        
        eachG <- kmeansRes %>% filter(cl == x) %>% .$id
        
        return(eachG)
        
    }) %>%
        set_names(kmeansRes$cl %>% unique %>% paste0('cl', .))
    
    kallGOBP <- compareCluster(geneCluster = kall,
                               fun = 'enrichGO',
                               OrgDb = sly.db,
                               keyType= 'SYMBOL',
                               ont = 'BP',
                               pAdjustMethod = 'BH',
                               pvalueCutoff=0.01,
                               qvalueCutoff=0.1)
    
    kallGOBPSim <- clusterProfiler::simplify(kallGOBP,
                                             cutoff = 0.9,
                                             by = 'p.adjust',
                                             select_fun = min)
    dotplot(kallGOBPSim, showCategory = 10) + 
        ggtitle("Biological process")
    # ggsave("kallGOBPSim.pdf", width = 6.4, height = 5.4)
    
    kallGOCC <- compareCluster(geneCluster = kall,
                               fun = 'enrichGO',
                               OrgDb = sly.db,
                               keyType= 'SYMBOL',
                               ont = "CC",
                               pAdjustMethod = 'BH',
                               pvalueCutoff=0.01,
                               qvalueCutoff=0.1)
    
    kallGOCCSim <- clusterProfiler::simplify(kallGOCC,
                                             cutoff = 0.9,
                                             by = 'p.adjust',
                                             select_fun = min)
    dotplot(kallGOCCSim, showCategory = 20) + 
        ggtitle("Cellular component")
    # ggsave("kallGOCCSim.pdf", width = 6.4, height = 4)
    
    
    kallGOMF <- compareCluster(geneCluster = kall,
                               fun = 'enrichGO',
                               OrgDb = sly.db,
                               keyType= 'SYMBOL',
                               ont = "MF",
                               pAdjustMethod = 'BH',
                               pvalueCutoff=0.01,
                               qvalueCutoff=0.1)
    
    kallGOMFSim <- clusterProfiler::simplify(kallGOMF,
                                             cutoff = 0.9,
                                             by = 'p.adjust',
                                             select_fun = min)
    dotplot(kallGOMFSim, showCategory = 10) + 
        ggtitle("Molecular function")
    ggsave("kallGOMFSim.pdf", width = 6.9, height = 4)
    
    kallGOBP %>%
        as.data.frame %>%
        write.table('kmeans10_GOBP.txt', quote = FALSE, sep = "\t")
    
    emapplot(kallGOBP,
             showCategory = 5,
             pie='count',
             pie_scale=1,
             layout='kk')
    
    
    kallKEGG_input <- lapply(kmeansRes$cl %>% unique, function(x) {
        
        eachG <- kmeansRes %>% filter(cl == x) %>% 
            .$id %>% 
            bitr(.,"SYMBOL", "ENTREZID", sly.db) %>% 
            dplyr::select("ENTREZID") %>% 
            unlist()
        
        return(eachG)
        
    }) %>%
        set_names(kmeansRes$cl %>% unique %>% paste0('cl', .))
    kallKEGG <- compareCluster(geneCluster = kallKEGG_input,   
                               fun = 'enrichKEGG',
                               organism = "sly",
                               pvalueCutoff = 0.05)
    
    dotplot(kallKEGG)
    # ggsave('kmeans10_KEGGALL.pdf', width = 8, height = 4)
    
    kallKEGG %>% 
        as.data.frame %>%
        write.table('kmeans10_KEGG.txt', quote = FALSE, sep = "\t")
    
    

    列一个例子,GO富集的生物学过程


    image.png

    到此结束

    相关文章

      网友评论

        本文标题:转录组DEGs聚类热图和功能富集分析

        本文链接:https://www.haomeiwen.com/subject/ivfdertx.html