美文网首页R-作图
数据分析:基于GSVA的通路富集分析

数据分析:基于GSVA的通路富集分析

作者: 生信学习者2 | 来源:发表于2021-01-29 14:40 被阅读0次

    与传统的Over Representation Analysis: the up- or down-regulated DEGsGene Set Enrichment Analysis: All the ranked genes相比,Gene Set Variation Analysis分析直接使用gene expression或RNA-seq profile matrix计算基因在通路中的得分,并且这种计算可以基于单样本处理。更多知识分享请到 https://zouhua.top/

    • GSVA方法图解


    输入表达谱数据和基因集数据。计算其得分。

    The GSVA package implements a non-parametric unsupervised method, called Gene Set Variation
    Analysis (GSVA), for assessing gene set enrichment (GSE) in gene expression microarray and RNA-
    seq data. In contrast to most GSE methods, GSVA performs a change in coordinate systems,
    transforming the data from a gene by sample matrix to a gene set by sample matrix. Thereby
    allowing for the evaluation of pathway enrichment for each sample. This transformation is done
    without the use of a phenotype, thus facilitating very powerful and open-ended analyses in a now
    pathway centric manner.

    加载R包

    knitr::opts_chunk$set(warning = F, message = F)
    library(dplyr)
    library(tibble)
    library(ggplot2)
    library(data.table)
    library(GSVA)
    
    rm(list = ls())
    options(stringsAsFactors = F)
    options(future.globals.maxSize = 1000 * 1024^2)
    
    grp <- c("S1", "S2")
    grp.col <- c("#6C326C", "#77A2D1")
    

    load data

    • gene Expression DESeq2 object
    phen <- fread("../../Result/phenotype/phenotype_cluster.csv")
    geneExp <- fread("../../Result/profile/geneExp_filter.tsv")
    # 处理数据成整型
    geneExp <- geneExp %>% column_to_rownames("V1") %>% as.matrix()
    geneExp <- round(geneExp) %>% data.frame() %>% rownames_to_column("V1")
    
    pathways_hallmark_kegg <- fgsea::gmtPathways("../../Result/GeneID/msigdb.v7.1.symbols_KEGG.gmt")
    pathways_hallmark_GO <- fgsea::gmtPathways("../../Result/GeneID/msigdb.v7.1.symbols_GO.gmt")
    

    Curation Function

    get_ExprSet <- function(x=phen, 
                            y=geneExp){
      # x=phen
      # y=geneExp
    
      sid <- intersect(x$Barcode, colnames(y))
      # phenotype
      phe <- x %>% filter(Barcode%in%sid) %>%
        mutate(Cluster=factor(as.character(Cluster))) %>%
        column_to_rownames("Barcode") 
      
      # profile 
      prf <- y %>% column_to_rownames("V1") %>%
        dplyr::select(all_of(sid))
      
      # determine the right order between profile and phenotype 
      for(i in 1:ncol(prf)){ 
        if (!(colnames(prf)[i] == rownames(phe)[i])) {
          stop(paste0(i, " Wrong"))
        }
      }
      require(convert)
      exprs <- as.matrix(prf)
      adf <-  new("AnnotatedDataFrame", data=phe)
      experimentData <- new("MIAME",
            name="Hua Zou", lab="Hua Lab",
            contact="zouhua1@outlook.com",
            title="KRIC Experiment",
            abstract="The ExpressionSet",
            url="www.zouhua.top",
            other=list(notes="Created from text files"))
      expressionSet <- new("ExpressionSet", exprs=exprs,
                           phenoData=adf, 
                           experimentData=experimentData)
      
      return(expressionSet)
    }
    
    get_score <- function(dataset=get_ExprSet(x=phen, y=geneExp),
                          geneset=pathways_hallmark_kegg,
                          methods="ssgsea"){
      
      # dataset=get_ExprSet(x=phen, y=geneExp)
      # geneset=pathways_hallmark_kegg
      # methods="ssgsea"  
      
      dat_fit <- gsva(expr = dataset, 
                  gset.idx.list = geneset,
                  method = methods, 
                  min.sz = 5, 
                  max.sz = 500,
                  kcdf = "Poisson")
      res <- exprs(dat_fit) %>% t() %>% 
        data.frame() %>%
        rownames_to_column("SampleID")
      
      return(res)
    }
    
    # Differential methylation/copyNumber/methylation
    get_limma <- function(dataset=methylation_set,
                          group_col=grp,
                          tag="methylation",
                          fc=1,
                          pval=0.05){
      
      # dataset=methylation_set
      # group_col=grp
      # tag="methylation"  
      # fc=1
      # pval=0.05
      
      pheno <- pData(dataset) 
      
      if(tag == "methylation"){
        # transform the beta value into M values via lumi package
        require(lumi)
        edata <- beta2m(exprs(dataset))    
      }else{
        edata <- exprs(dataset)
      }
    
      
      require(limma)
      design <- model.matrix(~0 + pheno$Cluster)
      rownames(design) <- rownames(pheno)
      colnames(design) <- group_col
      exprSet <- edata  
      
      # show distribution
      boxplot(exprSet)
      plotDensities(exprSet) 
      
      # linear fitting 
      fit <- lmFit(exprSet, design, method = 'ls')
      contrast <- makeContrasts("S1-S2", levels = design) 
      
      # eBayes
      fit2 <- contrasts.fit(fit, contrast)
      fit2 <- eBayes(fit2)
      
      qqt(fit2$t, df = fit2$df.prior + fit2$df.residual, pch = 16, cex = 0.2)
      abline(0, 1)  
    
      # differential features
      diff_feature <- topTable(fit2, number = Inf, adjust.method = 'BH') %>%
        rownames_to_column("Feature") 
      
      # prof[rownames(prof)%in%"EVI2A", ] %>% data.frame() %>% setNames("EVI2A") %>%
      #   rownames_to_column("SampleID") %>%
      #   inner_join(pheno %>%rownames_to_column("SampleID"), by = "SampleID") -> a1
      # wilcox.test(EVI2A~Cluster, data = a1)
      # ggplot(a1, aes(x=Cluster, y=EVI2A))+geom_boxplot() 
      
      diff_feature[which(diff_feature$logFC >= fc & 
                           diff_feature$adj.P.Val < pval), "Enrichment"] <- group_col[1]
      diff_feature[which(diff_feature$logFC <= -fc & 
                           diff_feature$adj.P.Val < pval), "Enrichment"] <- group_col[2]
      diff_feature[which(abs(diff_feature$logFC) < fc |
                           diff_feature$adj.P.Val >= pval), "Enrichment"] <- "Nonsignif"
      
      diff_res <- diff_feature %>% 
        setNames(c("Feature", "log2FoldChange", "baseMean", "t", 
                   "pvalue", "padj", "B", "Enrichment")) %>% 
        dplyr::select(Feature, everything()) %>%
        arrange(padj, log2FoldChange) %>%
        column_to_rownames("Feature")
      
      res <- list(fit=fit2, diff_res=diff_res)
      
      return(res)
    }
    

    gsva score

    kegg_gsva <- get_score(dataset=get_ExprSet(x=phen, y=geneExp),
                           geneset=pathways_hallmark_kegg,
                           methods="gsva")
    
    if(!dir.exists("../../Result/pathway/")){
      dir.create("../../Result/pathway/", recursive = T)
    }
    write.csv(kegg_gsva, "../../Result/pathway/KEGG_gsva.csv", row.names = F)
    
    # Differential Test
    kegg_gsva_exprset <-  get_ExprSet(x=phen, y=kegg_gsva %>% column_to_rownames("SampleID") %>% t() %>% data.frame() %>%rownames_to_column("V1"))  
    Diff_gsva <- get_limma(dataset =kegg_gsva_exprset, tag = "gsva")
    DT::datatable(Diff_gsva$diff_res)
    

    网盘地址: https://pan.baidu.com/s/1AGsvOD6klfljvu0T2A-akA ;提取码:vw7z

    version

    sessionInfo()
    
    R version 4.0.2 (2020-06-22)
    Platform: x86_64-w64-mingw32/x64 (64-bit)
    Running under: Windows 10 x64 (build 19042)
    
    Matrix products: default
    
    locale:
    [1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
    [3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
    [5] LC_TIME=English_United States.1252    
    system code page: 936
    
    attached base packages:
    [1] stats     graphics  grDevices utils     datasets  methods   base     
    
    other attached packages:
    [1] GSVA_1.36.3       data.table_1.13.6 ggplot2_3.3.3     tibble_3.0.3      dplyr_1.0.2      
    
    loaded via a namespace (and not attached):
      [1] fgsea_1.14.0                colorspace_1.4-1            ellipsis_0.3.1             
      [4] ggridges_0.5.2              qvalue_2.20.0               XVector_0.28.0             
      [7] GenomicRanges_1.40.0        rstudioapi_0.11             farver_2.0.3               
     [10] urltools_1.7.3              graphlayouts_0.7.1          ggrepel_0.9.1.9999         
     [13] DT_0.16                     bit64_0.9-7                 AnnotationDbi_1.50.3       
     [16] scatterpie_0.1.5            xml2_1.3.2                  splines_4.0.2              
     [19] GOSemSim_2.14.2             knitr_1.30                  shinythemes_1.1.2          
     [22] polyclip_1.10-0             jsonlite_1.7.1              annotate_1.66.0            
     [25] GO.db_3.11.4                graph_1.66.0                ggforce_0.3.2              
     [28] shiny_1.5.0                 BiocManager_1.30.10         compiler_4.0.2             
     [31] httr_1.4.2                  rvcheck_0.1.8               Matrix_1.3-2               
     [34] fastmap_1.0.1               later_1.1.0.1               tweenr_1.0.1               
     [37] htmltools_0.5.0             prettyunits_1.1.1           tools_4.0.2                
     [40] igraph_1.2.5                gtable_0.3.0                glue_1.4.2                 
     [43] GenomeInfoDbData_1.2.3      reshape2_1.4.4              DO.db_2.9                  
     [46] fastmatch_1.1-0             Rcpp_1.0.5                  enrichplot_1.8.1           
     [49] Biobase_2.48.0              vctrs_0.3.4                 ggraph_2.0.3               
     [52] xfun_0.19                   stringr_1.4.0               mime_0.9                   
     [55] lifecycle_0.2.0             XML_3.99-0.5                DOSE_3.14.0                
     [58] europepmc_0.4               MASS_7.3-53                 zlibbioc_1.34.0            
     [61] scales_1.1.1                tidygraph_1.2.0             hms_0.5.3                  
     [64] promises_1.1.1              parallel_4.0.2              SummarizedExperiment_1.18.2
     [67] RColorBrewer_1.1-2          yaml_2.2.1                  memoise_1.1.0              
     [70] gridExtra_2.3               triebeard_0.3.0             stringi_1.5.3              
     [73] RSQLite_2.2.0               S4Vectors_0.26.1            BiocGenerics_0.34.0        
     [76] BiocParallel_1.22.0         GenomeInfoDb_1.24.2         rlang_0.4.7                
     [79] pkgconfig_2.0.3             bitops_1.0-6                matrixStats_0.57.0         
     [82] evaluate_0.14               lattice_0.20-41             purrr_0.3.4                
     [85] htmlwidgets_1.5.2           cowplot_1.1.1               bit_4.0.3                  
     [88] tidyselect_1.1.0            GSEABase_1.50.1             plyr_1.8.6                 
     [91] magrittr_1.5                R6_2.5.0                    IRanges_2.22.2             
     [94] generics_0.1.0              DelayedArray_0.14.1         DBI_1.1.0                  
     [97] pillar_1.4.6                withr_2.3.0                 RCurl_1.98-1.2             
    [100] crayon_1.3.4                rmarkdown_2.5               viridis_0.5.1              
    [103] progress_1.2.2              grid_4.0.2                  blob_1.2.1                 
    [106] digest_0.6.25               xtable_1.8-4                tidyr_1.1.2                
    [109] httpuv_1.5.4                gridGraphics_0.5-0          stats4_4.0.2               
    [112] munsell_0.5.0               viridisLite_0.3.0           ggplotify_0.0.5   
    

    Reference

    1. GSVA: gene set variation analysis for microarray and RNA-Seq data

    参考文章如引起任何侵权问题,可以与我联系,谢谢。

    相关文章

      网友评论

        本文标题:数据分析:基于GSVA的通路富集分析

        本文链接:https://www.haomeiwen.com/subject/gkvqtltx.html