美文网首页基因组数据绘图生物统计
数据分析:数据降维方法的实现

数据分析:数据降维方法的实现

作者: 生信学习者2 | 来源:发表于2021-06-28 09:46 被阅读0次

    前言

    随着大数据时代的到来,我们常常需要对高纬度数据进行降维处理后再对数据整体结构进行可视化,进而发现数据内在的区别,为后续数据分析提供初步的参考。更多知识分享请到 https://zouhua.top/

    这里主要介绍三类数据分析方法的R实现过程:

    1.PCA主成分分析;

    2.tSNE,基于概率密度分布计算能保留更多组间和个体差异的方法;

    3.UMAP,非线性降维方法。

    导入数据

    options(warn = 0)
    library(dplyr)
    library(tibble)
    library(ggplot2)
    options(warn = 0)
    
    ExprSet_LOG2Impute <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")
    subgrp <- c("NC", "PC", "PT")
    grp.col <- c("#568875", "#73FAFC", "#EE853D")
    

    Principal Component Analysis

    PCAFun <- function(dataset = ExprSet_LOG2Impute ){
      
      # dataset = ExprSet_LOG2Impute 
      
      require(convert)
      metadata <- pData(dataset)
      profile <- exprs(dataset)
      
      # pca 
      pca <- prcomp(scale(t(profile), center = T, scale = T))
      require(factoextra)
      eig <- get_eig(pca)
      # explains variable 
      explains <- paste0(paste0("PC", seq(2)), "(", paste0(round(eig[1:2, 2], 2), "%"), ")")
      # principal component score of each sample
      score <- inner_join(pca$x %>% data.frame() %>% 
                            dplyr::select(c(1:2)) %>% 
                            rownames_to_column("SampleID"), 
                          metadata %>% rownames_to_column("SampleID"),
                          by = "SampleID") %>%
        mutate(Group=factor(Group, levels = grp))
      
      
      # PERMANOVA
      require(vegan)
      set.seed(123)
      if(any(profile < 0)){
        res_adonis <- adonis(vegdist(t(profile), method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
      }else{
        res_adonis <- adonis(vegdist(t(profile), method = "bray") ~ metadata$SubGroup, permutations = 999)    
      }
      adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
      adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
      #use the bquote function to format adonis results to be annotated on the ordination plot.
      signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
      adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                      atop("p-value="~.(adn_pvalue)~.(signi_label), phantom()))) 
      
      
      pl <- ggplot(score, aes(x=PC1, y=PC2))+
                  geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
                  stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
                  labs(x=explains[1], y=explains[2])+
                  scale_color_manual(values = grp.col)+
                  scale_fill_manual(name = "Condition", 
                                    values = grp.col)+
                  annotate("text", x = max(score$PC1) - 8,
                           y = min(score$PC1),
                           label = adn_res_format,
                           size = 6)+ 
                  guides(color=F)+
                  theme_classic()+
                  theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                        axis.text = element_text(size = 9, color = "black"),
                        text = element_text(size = 8, color = "black", family = "serif"),
                        strip.text = element_text(size = 9, color = "black", face = "bold"), 
                        panel.grid = element_blank(),
                        legend.title = element_text(size = 11, color = "black", family = "serif"),
                        legend.text = element_text(size = 10, color = "black", family = "serif"),
                        legend.position = c(0, 0),
                        legend.justification = c(0, 0),
                        legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
      return(pl)
    }
    
    PCA_LOG2Impute <- PCAFun(dataset = ExprSet_LOG2Impute)
    PCA_LOG2Impute 
    ggsave("Mass_ProteinsLOG2Impute_PCA.pdf", PCA_LOG2Impute, width = 5, height = 5, dpi = 300)
    

    Rtsne

    RtsneFun <- function(dataset = ExprSet_LOG2Impute,
                         perpl = 10){
      
      # dataset = ExprSet_LOG2Impute
      # perpl = 10
      
      require(convert)
      metadata <- pData(dataset)
      profile <- t(exprs(dataset))
      
      # Rtsne
      require(Rtsne)
      #set.seed(123)
      Rtsne <- Rtsne(profile, 
                     dims=2, 
                     perplexity=perpl,
                     verbose=TRUE, 
                     max_iter=500, 
                     eta=200)
      point <- Rtsne$Y %>% data.frame() %>% 
        dplyr::select(c(1:2)) %>%
        setNames(c("tSNE1", "tSNE2"))
      rownames(point) <- rownames(profile)
      score <- inner_join(point %>% rownames_to_column("SampleID"), 
                          metadata %>% rownames_to_column("SampleID"),
                          by = "SampleID") %>%
        mutate(Group=factor(Group, levels = grp))
      
      # PERMANOVA
      require(vegan)
      set.seed(123)
      if(any(profile < 0)){
        res_adonis <- adonis(vegdist(profile, method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
      }else{
        res_adonis <- adonis(vegdist(profile, method = "bray") ~ metadata$SubGroup, permutations = 999)    
      }
      adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
      adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
      #use the bquote function to format adonis results to be annotated on the ordination plot.
      signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
      adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                      atop("p-value="~.(adn_pvalue)~.(signi_label), phantom()))) 
      
      pl <- ggplot(score, aes(x=tSNE1, y=tSNE2))+
                  geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
                  stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
                  scale_color_manual(values = grp.col)+
                  scale_fill_manual(name = "Condition", 
                                    values = grp.col)+
                  annotate("text", x = max(score$tSNE1) - 8,
                           y = max(score$tSNE2)-5,
                           label = adn_res_format,
                           size = 6)+ 
                  guides(color=F)+
                  theme_classic()+
                  theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                        axis.text = element_text(size = 9, color = "black"),
                        text = element_text(size = 8, color = "black", family = "serif"),
                        strip.text = element_text(size = 9, color = "black", face = "bold"), 
                        panel.grid = element_blank(),
                        legend.title = element_text(size = 11, color = "black", family = "serif"),
                        legend.text = element_text(size = 10, color = "black", family = "serif"),
                        legend.position = c(0, 0),
                        legend.justification = c(0, 0),
                        legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
      return(pl)
    }
    
    Rtsne_LOG2Impute <- RtsneFun(dataset = ExprSet_LOG2Impute, perpl = 10)
    Rtsne_LOG2Impute
    ggsave("Mass_ProteinsLOG2Impute_Rtsne.pdf", Rtsne_LOG2Impute, width = 5, height = 5, dpi = 300)
    

    UMAP: a non-linear dimensionality reduction algorithm

    UMAPFun <- function(dataset = ExprSet_LOG2Impute){
      
      # dataset = ExprSet_LOG2Impute
      
      require(convert)
      metadata <- pData(dataset)
      profile <- t(exprs(dataset))
      
      # umap 
      require(umap)
      umap <- umap::umap(profile)
      
      point <- umap$layout %>% data.frame() %>%
        setNames(c("UMAP1", "UMAP2"))
      rownames(point) <- rownames(profile)
      score <- inner_join(point %>% rownames_to_column("SampleID"), 
                          metadata %>% rownames_to_column("SampleID"),
                          by = "SampleID") %>%
        mutate(Group=factor(Group, levels = grp))
      
      # PERMANOVA
      require(vegan)
      set.seed(123)
      if(any(profile < 0)){
        res_adonis <- adonis(vegdist(profile, method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
      }else{
        res_adonis <- adonis(vegdist(profile, method = "bray") ~ metadata$SubGroup, permutations = 999)    
      }
      adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
      adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
      #use the bquote function to format adonis results to be annotated on the ordination plot.
      signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
      adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                      atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))   
      
      pl <- ggplot(score, aes(x=UMAP1, y=UMAP2))+
                  geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
                  stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
                  scale_color_manual(values = grp.col)+
                  scale_fill_manual(name = "Condition", 
                                    values = grp.col)+
                  annotate("text", x = max(score$UMAP1),
                           y = min(score$UMAP2),
                           label = adn_res_format,
                           size = 6)+ 
                  guides(color=F)+
                  theme_classic()+
                  theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                        axis.text = element_text(size = 9, color = "black"),
                        text = element_text(size = 8, color = "black", family = "serif"),
                        strip.text = element_text(size = 9, color = "black", face = "bold"), 
                        panel.grid = element_blank(),
                        legend.title = element_text(size = 11, color = "black", family = "serif"),
                        legend.text = element_text(size = 10, color = "black", family = "serif"),
                        legend.position = c(0, 0),
                        legend.justification = c(0, 0),
                        legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
      return(pl)
    }
    
    UMAP_LOG2Impute <- UMAPFun(dataset = ExprSet_LOG2Impute)
    UMAP_LOG2Impute 
    ggsave("Mass_ProteinsLOG2Impute_UMAP.pdf", UMAP_LOG2Impute, width = 5, height = 5, dpi = 300)
    

    Notes: 三组数据的整体性差异如果较为明显,一般可以在不同的降维方法上都有所体现。从分组看,NC组和PC、PT组均存在明显的差异,后续数据分析应该着重关注 NC vs PC, NC vs PT,但也可以对PC vs PT做比较分析。

    Systemic information

    sessionInfo()
    
    R version 4.0.2 (2020-06-22)
    Platform: x86_64-conda_cos6-linux-gnu (64-bit)
    Running under: CentOS Linux 8 (Core)
    
    Matrix products: default
    BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so
    
    locale:
     [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
     [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
     [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
    
    attached base packages:
    [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     
    
    other attached packages:
     [1] umap_0.2.7.0                                        Rtsne_0.15                                         
     [3] vegan_2.5-6                                         lattice_0.20-41                                    
     [5] permute_0.9-5                                       factoextra_1.0.7                                   
     [7] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0 IlluminaHumanMethylationEPICmanifest_0.3.0         
     [9] minfi_1.36.0                                        bumphunter_1.32.0                                  
    [11] locfit_1.5-9.4                                      iterators_1.0.13                                   
    [13] foreach_1.5.1                                       Biostrings_2.58.0                                  
    [15] XVector_0.30.0                                      SummarizedExperiment_1.20.0                        
    [17] MatrixGenerics_1.2.1                                matrixStats_0.58.0                                 
    [19] GenomicRanges_1.42.0                                GenomeInfoDb_1.26.4                                
    [21] IRanges_2.24.1                                      S4Vectors_0.28.1                                   
    [23] eulerr_6.1.0                                        UpSetR_1.4.0                                       
    [25] ggplot2_3.3.3                                       DEP_1.12.0                                         
    [27] convert_1.64.0                                      marray_1.68.0                                      
    [29] limma_3.46.0                                        Biobase_2.50.0                                     
    [31] BiocGenerics_0.36.0                                 data.table_1.14.0                                  
    [33] tibble_3.1.0                                        dplyr_1.0.5                                        
    
    loaded via a namespace (and not attached):
      [1] utf8_1.2.1                shinydashboard_0.7.1      reticulate_1.18           gmm_1.6-5                
      [5] tidyselect_1.1.0          RSQLite_2.2.5             AnnotationDbi_1.52.0      htmlwidgets_1.5.3        
      [9] grid_4.0.2                BiocParallel_1.24.1       norm_1.0-9.5              munsell_0.5.0            
     [13] codetools_0.2-18          preprocessCore_1.52.1     DT_0.18                   withr_2.4.1              
     [17] colorspace_2.0-0          knitr_1.31                rstudioapi_0.13           mzID_1.28.0              
     [21] labeling_0.4.2            GenomeInfoDbData_1.2.4    farver_2.1.0              bit64_4.0.5              
     [25] rhdf5_2.34.0              vctrs_0.3.7               generics_0.1.0            xfun_0.20                
     [29] BiocFileCache_1.14.0      R6_2.5.0                  doParallel_1.0.16         illuminaio_0.32.0        
     [33] clue_0.3-58               bitops_1.0-6              rhdf5filters_1.2.0        cachem_1.0.4             
     [37] reshape_0.8.8             DelayedArray_0.16.3       assertthat_0.2.1          promises_1.2.0.1         
     [41] scales_1.1.1              gtable_0.3.0              Cairo_1.5-12.2            affy_1.68.0              
     [45] sandwich_3.0-0            rlang_0.4.10              genefilter_1.72.0         mzR_2.24.1               
     [49] GlobalOptions_0.1.2       splines_4.0.2             rtracklayer_1.50.0        GEOquery_2.58.0          
     [53] impute_1.64.0             yaml_2.2.1                reshape2_1.4.4            BiocManager_1.30.12      
     [57] GenomicFeatures_1.42.2    httpuv_1.5.5              tools_4.0.2               nor1mix_1.3-0            
     [61] affyio_1.60.0             ellipsis_0.3.1            jquerylib_0.1.3           RColorBrewer_1.1-2       
     [65] siggenes_1.64.0           MSnbase_2.16.1            Rcpp_1.0.6                plyr_1.8.6               
     [69] sparseMatrixStats_1.2.1   progress_1.2.2            zlibbioc_1.36.0           purrr_0.3.4              
     [73] RCurl_1.98-1.3            prettyunits_1.1.1         openssl_1.4.3             GetoptLong_1.0.5         
     [77] cowplot_1.1.0             zoo_1.8-8                 ggrepel_0.9.1.9999        cluster_2.1.0            
     [81] tinytex_0.31              magrittr_2.0.1            RSpectra_0.16-0           circlize_0.4.10          
     [85] pcaMethods_1.80.0         mvtnorm_1.1-1             ProtGenerics_1.22.0       evaluate_0.14            
     [89] hms_1.0.0                 mime_0.10                 xtable_1.8-4              XML_3.99-0.6             
     [93] mclust_5.4.7              gridExtra_2.3             shape_1.4.5               compiler_4.0.2           
     [97] biomaRt_2.46.3            ncdf4_1.17                crayon_1.4.1              htmltools_0.5.1.1        
    [101] mgcv_1.8-34               later_1.1.0.1             tidyr_1.1.3               DBI_1.1.1                
    [105] dbplyr_2.1.1              ComplexHeatmap_2.6.2      MASS_7.3-53.1             tmvtnorm_1.4-10          
    [109] rappdirs_0.3.3            Matrix_1.3-2              readr_1.4.0               cli_2.4.0                
    [113] vsn_3.58.0                imputeLCMD_2.0            quadprog_1.5-8            pkgconfig_2.0.3          
    [117] GenomicAlignments_1.26.0  MALDIquant_1.19.3         xml2_1.3.2                annotate_1.68.0          
    [121] bslib_0.2.4               rngtools_1.5              multtest_2.46.0           beanplot_1.2             
    [125] doRNG_1.8.2               scrime_1.3.5              stringr_1.4.0             digest_0.6.27            
    [129] rmarkdown_2.7             base64_2.0                DelayedMatrixStats_1.12.3 curl_4.3                 
    [133] shiny_1.6.0               Rsamtools_2.6.0           rjson_0.2.20              jsonlite_1.7.2           
    [137] nlme_3.1-150              lifecycle_1.0.0           Rhdf5lib_1.12.1           askpass_1.1              
    [141] fansi_0.4.2               pillar_1.6.0              fastmap_1.1.0             httr_1.4.2               
    [145] survival_3.2-10           glue_1.4.2                png_0.1-7                 bit_4.0.4                
    [149] sass_0.3.1                stringi_1.4.6             HDF5Array_1.18.1          blob_1.2.1               
    [153] memoise_2.0.0   
    

    Reference

    1. How to change Legend of ggplot2

    2. How to change ggplot facet labels

    相关文章

      网友评论

        本文标题:数据分析:数据降维方法的实现

        本文链接:https://www.haomeiwen.com/subject/rjxqultx.html