数据分析：数据降维方法的实现

作者: 生信学习者2 | 来源:发表于2021-06-28 09:46 被阅读0次

数据分析：数据降维方法的实现
PCA算法推导
人脸识别基本原理
详解主成分分析PCA
机器学习实战Py3.x填坑记10—利用PCA来简化数据
算法笔记（14）PCA主成分分析及Python代码实现
PCA
数据降维
线性判别分析（LDA）
PCA简述

前言

随着大数据时代的到来，我们常常需要对高纬度数据进行降维处理后再对数据整体结构进行可视化，进而发现数据内在的区别，为后续数据分析提供初步的参考。更多知识分享请到 https://zouhua.top/。

这里主要介绍三类数据分析方法的R实现过程：

1.PCA主成分分析；

2.tSNE，基于概率密度分布计算能保留更多组间和个体差异的方法；

3.UMAP，非线性降维方法。

导入数据

options(warn = 0)
library(dplyr)
library(tibble)
library(ggplot2)
options(warn = 0)

ExprSet_LOG2Impute <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")
subgrp <- c("NC", "PC", "PT")
grp.col <- c("#568875", "#73FAFC", "#EE853D")

Principal Component Analysis

PCAFun <- function(dataset = ExprSet_LOG2Impute ){
  
  # dataset = ExprSet_LOG2Impute 
  
  require(convert)
  metadata <- pData(dataset)
  profile <- exprs(dataset)
  
  # pca 
  pca <- prcomp(scale(t(profile), center = T, scale = T))
  require(factoextra)
  eig <- get_eig(pca)
  # explains variable 
  explains <- paste0(paste0("PC", seq(2)), "(", paste0(round(eig[1:2, 2], 2), "%"), ")")
  # principal component score of each sample
  score <- inner_join(pca$x %>% data.frame() %>% 
                        dplyr::select(c(1:2)) %>% 
                        rownames_to_column("SampleID"), 
                      metadata %>% rownames_to_column("SampleID"),
                      by = "SampleID") %>%
    mutate(Group=factor(Group, levels = grp))
  
  
  # PERMANOVA
  require(vegan)
  set.seed(123)
  if(any(profile < 0)){
    res_adonis <- adonis(vegdist(t(profile), method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
  }else{
    res_adonis <- adonis(vegdist(t(profile), method = "bray") ~ metadata$SubGroup, permutations = 999)    
  }
  adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  #use the bquote function to format adonis results to be annotated on the ordination plot.
  signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                  atop("p-value="~.(adn_pvalue)~.(signi_label), phantom()))) 
  
  
  pl <- ggplot(score, aes(x=PC1, y=PC2))+
              geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
              stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
              labs(x=explains[1], y=explains[2])+
              scale_color_manual(values = grp.col)+
              scale_fill_manual(name = "Condition", 
                                values = grp.col)+
              annotate("text", x = max(score$PC1) - 8,
                       y = min(score$PC1),
                       label = adn_res_format,
                       size = 6)+ 
              guides(color=F)+
              theme_classic()+
              theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                    axis.text = element_text(size = 9, color = "black"),
                    text = element_text(size = 8, color = "black", family = "serif"),
                    strip.text = element_text(size = 9, color = "black", face = "bold"), 
                    panel.grid = element_blank(),
                    legend.title = element_text(size = 11, color = "black", family = "serif"),
                    legend.text = element_text(size = 10, color = "black", family = "serif"),
                    legend.position = c(0, 0),
                    legend.justification = c(0, 0),
                    legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  return(pl)
}

PCA_LOG2Impute <- PCAFun(dataset = ExprSet_LOG2Impute)
PCA_LOG2Impute 
ggsave("Mass_ProteinsLOG2Impute_PCA.pdf", PCA_LOG2Impute, width = 5, height = 5, dpi = 300)

Rtsne

RtsneFun <- function(dataset = ExprSet_LOG2Impute,
                     perpl = 10){
  
  # dataset = ExprSet_LOG2Impute
  # perpl = 10
  
  require(convert)
  metadata <- pData(dataset)
  profile <- t(exprs(dataset))
  
  # Rtsne
  require(Rtsne)
  #set.seed(123)
  Rtsne <- Rtsne(profile, 
                 dims=2, 
                 perplexity=perpl,
                 verbose=TRUE, 
                 max_iter=500, 
                 eta=200)
  point <- Rtsne$Y %>% data.frame() %>% 
    dplyr::select(c(1:2)) %>%
    setNames(c("tSNE1", "tSNE2"))
  rownames(point) <- rownames(profile)
  score <- inner_join(point %>% rownames_to_column("SampleID"), 
                      metadata %>% rownames_to_column("SampleID"),
                      by = "SampleID") %>%
    mutate(Group=factor(Group, levels = grp))
  
  # PERMANOVA
  require(vegan)
  set.seed(123)
  if(any(profile < 0)){
    res_adonis <- adonis(vegdist(profile, method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
  }else{
    res_adonis <- adonis(vegdist(profile, method = "bray") ~ metadata$SubGroup, permutations = 999)    
  }
  adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  #use the bquote function to format adonis results to be annotated on the ordination plot.
  signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                  atop("p-value="~.(adn_pvalue)~.(signi_label), phantom()))) 
  
  pl <- ggplot(score, aes(x=tSNE1, y=tSNE2))+
              geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
              stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
              scale_color_manual(values = grp.col)+
              scale_fill_manual(name = "Condition", 
                                values = grp.col)+
              annotate("text", x = max(score$tSNE1) - 8,
                       y = max(score$tSNE2)-5,
                       label = adn_res_format,
                       size = 6)+ 
              guides(color=F)+
              theme_classic()+
              theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                    axis.text = element_text(size = 9, color = "black"),
                    text = element_text(size = 8, color = "black", family = "serif"),
                    strip.text = element_text(size = 9, color = "black", face = "bold"), 
                    panel.grid = element_blank(),
                    legend.title = element_text(size = 11, color = "black", family = "serif"),
                    legend.text = element_text(size = 10, color = "black", family = "serif"),
                    legend.position = c(0, 0),
                    legend.justification = c(0, 0),
                    legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  return(pl)
}

Rtsne_LOG2Impute <- RtsneFun(dataset = ExprSet_LOG2Impute, perpl = 10)
Rtsne_LOG2Impute
ggsave("Mass_ProteinsLOG2Impute_Rtsne.pdf", Rtsne_LOG2Impute, width = 5, height = 5, dpi = 300)

UMAP: a non-linear dimensionality reduction algorithm

UMAPFun <- function(dataset = ExprSet_LOG2Impute){
  
  # dataset = ExprSet_LOG2Impute
  
  require(convert)
  metadata <- pData(dataset)
  profile <- t(exprs(dataset))
  
  # umap 
  require(umap)
  umap <- umap::umap(profile)
  
  point <- umap$layout %>% data.frame() %>%
    setNames(c("UMAP1", "UMAP2"))
  rownames(point) <- rownames(profile)
  score <- inner_join(point %>% rownames_to_column("SampleID"), 
                      metadata %>% rownames_to_column("SampleID"),
                      by = "SampleID") %>%
    mutate(Group=factor(Group, levels = grp))
  
  # PERMANOVA
  require(vegan)
  set.seed(123)
  if(any(profile < 0)){
    res_adonis <- adonis(vegdist(profile, method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
  }else{
    res_adonis <- adonis(vegdist(profile, method = "bray") ~ metadata$SubGroup, permutations = 999)    
  }
  adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
  adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
  #use the bquote function to format adonis results to be annotated on the ordination plot.
  signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
  adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                  atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))   
  
  pl <- ggplot(score, aes(x=UMAP1, y=UMAP2))+
              geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
              stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
              scale_color_manual(values = grp.col)+
              scale_fill_manual(name = "Condition", 
                                values = grp.col)+
              annotate("text", x = max(score$UMAP1),
                       y = min(score$UMAP2),
                       label = adn_res_format,
                       size = 6)+ 
              guides(color=F)+
              theme_classic()+
              theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                    axis.text = element_text(size = 9, color = "black"),
                    text = element_text(size = 8, color = "black", family = "serif"),
                    strip.text = element_text(size = 9, color = "black", face = "bold"), 
                    panel.grid = element_blank(),
                    legend.title = element_text(size = 11, color = "black", family = "serif"),
                    legend.text = element_text(size = 10, color = "black", family = "serif"),
                    legend.position = c(0, 0),
                    legend.justification = c(0, 0),
                    legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
  return(pl)
}

UMAP_LOG2Impute <- UMAPFun(dataset = ExprSet_LOG2Impute)
UMAP_LOG2Impute 
ggsave("Mass_ProteinsLOG2Impute_UMAP.pdf", UMAP_LOG2Impute, width = 5, height = 5, dpi = 300)

Notes: 三组数据的整体性差异如果较为明显，一般可以在不同的降维方法上都有所体现。从分组看，NC组和PC、PT组均存在明显的差异，后续数据分析应该着重关注 NC vs PC, NC vs PT，但也可以对PC vs PT做比较分析。

Systemic information

sessionInfo()

R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)

Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] umap_0.2.7.0                                        Rtsne_0.15                                         
 [3] vegan_2.5-6                                         lattice_0.20-41                                    
 [5] permute_0.9-5                                       factoextra_1.0.7                                   
 [7] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0 IlluminaHumanMethylationEPICmanifest_0.3.0         
 [9] minfi_1.36.0                                        bumphunter_1.32.0                                  
[11] locfit_1.5-9.4                                      iterators_1.0.13                                   
[13] foreach_1.5.1                                       Biostrings_2.58.0                                  
[15] XVector_0.30.0                                      SummarizedExperiment_1.20.0                        
[17] MatrixGenerics_1.2.1                                matrixStats_0.58.0                                 
[19] GenomicRanges_1.42.0                                GenomeInfoDb_1.26.4                                
[21] IRanges_2.24.1                                      S4Vectors_0.28.1                                   
[23] eulerr_6.1.0                                        UpSetR_1.4.0                                       
[25] ggplot2_3.3.3                                       DEP_1.12.0                                         
[27] convert_1.64.0                                      marray_1.68.0                                      
[29] limma_3.46.0                                        Biobase_2.50.0                                     
[31] BiocGenerics_0.36.0                                 data.table_1.14.0                                  
[33] tibble_3.1.0                                        dplyr_1.0.5                                        

loaded via a namespace (and not attached):
  [1] utf8_1.2.1                shinydashboard_0.7.1      reticulate_1.18           gmm_1.6-5                
  [5] tidyselect_1.1.0          RSQLite_2.2.5             AnnotationDbi_1.52.0      htmlwidgets_1.5.3        
  [9] grid_4.0.2                BiocParallel_1.24.1       norm_1.0-9.5              munsell_0.5.0            
 [13] codetools_0.2-18          preprocessCore_1.52.1     DT_0.18                   withr_2.4.1              
 [17] colorspace_2.0-0          knitr_1.31                rstudioapi_0.13           mzID_1.28.0              
 [21] labeling_0.4.2            GenomeInfoDbData_1.2.4    farver_2.1.0              bit64_4.0.5              
 [25] rhdf5_2.34.0              vctrs_0.3.7               generics_0.1.0            xfun_0.20                
 [29] BiocFileCache_1.14.0      R6_2.5.0                  doParallel_1.0.16         illuminaio_0.32.0        
 [33] clue_0.3-58               bitops_1.0-6              rhdf5filters_1.2.0        cachem_1.0.4             
 [37] reshape_0.8.8             DelayedArray_0.16.3       assertthat_0.2.1          promises_1.2.0.1         
 [41] scales_1.1.1              gtable_0.3.0              Cairo_1.5-12.2            affy_1.68.0              
 [45] sandwich_3.0-0            rlang_0.4.10              genefilter_1.72.0         mzR_2.24.1               
 [49] GlobalOptions_0.1.2       splines_4.0.2             rtracklayer_1.50.0        GEOquery_2.58.0          
 [53] impute_1.64.0             yaml_2.2.1                reshape2_1.4.4            BiocManager_1.30.12      
 [57] GenomicFeatures_1.42.2    httpuv_1.5.5              tools_4.0.2               nor1mix_1.3-0            
 [61] affyio_1.60.0             ellipsis_0.3.1            jquerylib_0.1.3           RColorBrewer_1.1-2       
 [65] siggenes_1.64.0           MSnbase_2.16.1            Rcpp_1.0.6                plyr_1.8.6               
 [69] sparseMatrixStats_1.2.1   progress_1.2.2            zlibbioc_1.36.0           purrr_0.3.4              
 [73] RCurl_1.98-1.3            prettyunits_1.1.1         openssl_1.4.3             GetoptLong_1.0.5         
 [77] cowplot_1.1.0             zoo_1.8-8                 ggrepel_0.9.1.9999        cluster_2.1.0            
 [81] tinytex_0.31              magrittr_2.0.1            RSpectra_0.16-0           circlize_0.4.10          
 [85] pcaMethods_1.80.0         mvtnorm_1.1-1             ProtGenerics_1.22.0       evaluate_0.14            
 [89] hms_1.0.0                 mime_0.10                 xtable_1.8-4              XML_3.99-0.6             
 [93] mclust_5.4.7              gridExtra_2.3             shape_1.4.5               compiler_4.0.2           
 [97] biomaRt_2.46.3            ncdf4_1.17                crayon_1.4.1              htmltools_0.5.1.1        
[101] mgcv_1.8-34               later_1.1.0.1             tidyr_1.1.3               DBI_1.1.1                
[105] dbplyr_2.1.1              ComplexHeatmap_2.6.2      MASS_7.3-53.1             tmvtnorm_1.4-10          
[109] rappdirs_0.3.3            Matrix_1.3-2              readr_1.4.0               cli_2.4.0                
[113] vsn_3.58.0                imputeLCMD_2.0            quadprog_1.5-8            pkgconfig_2.0.3          
[117] GenomicAlignments_1.26.0  MALDIquant_1.19.3         xml2_1.3.2                annotate_1.68.0          
[121] bslib_0.2.4               rngtools_1.5              multtest_2.46.0           beanplot_1.2             
[125] doRNG_1.8.2               scrime_1.3.5              stringr_1.4.0             digest_0.6.27            
[129] rmarkdown_2.7             base64_2.0                DelayedMatrixStats_1.12.3 curl_4.3                 
[133] shiny_1.6.0               Rsamtools_2.6.0           rjson_0.2.20              jsonlite_1.7.2           
[137] nlme_3.1-150              lifecycle_1.0.0           Rhdf5lib_1.12.1           askpass_1.1              
[141] fansi_0.4.2               pillar_1.6.0              fastmap_1.1.0             httr_1.4.2               
[145] survival_3.2-10           glue_1.4.2                png_0.1-7                 bit_4.0.4                
[149] sass_0.3.1                stringi_1.4.6             HDF5Array_1.18.1          blob_1.2.1               
[153] memoise_2.0.0

Reference

数据分析：数据降维方法的实现
前言随着大数据时代的到来，我们常常需要对高纬度数据进行降维处理后再对数据整体结构进行可视化，进而发现数据内在的区...
PCA算法推导
一、PCA降维 1.PCA简介 PCA（主成分分析）是一种数据降维的方法，即用较少特征地数据表达较多特征地数据（数...
人脸识别基本原理
特征脸特征脸方法利用主分量分析进行降维和提取特征。主分量分析是一种应用十分广泛的数据降维技术，该方法选择与原数据协...
详解主成分分析PCA
主成分分析（ Principal components analysis），简称PCA，是最主要的数据降维方法之一...
机器学习实战Py3.x填坑记10—利用PCA来简化数据
本章内容：降维技术主成分分析对半导体数据进行降维处理
算法笔记（14）PCA主成分分析及Python代码实现
降维就是一种对高维度特征数据预处理方法。降维的算法有很多，比如奇异值分解(SVD)、主成分分析(PCA）、因子分析...
PCA
降维算法 PCA主成分分析, 是无监督的降维方法, 可以将你的数据降低到n-1维. 它有几种不同的方式去解释原理,...
数据降维
#数据降维 #python 数据分析与数据化运营，宋天龙.2018.1 import numpy as np fr...
线性判别分析（LDA）
线性判别分析（Linear Discriminant Analysis，简称LDA）是一种经典的有监督数据降维方法...
PCA简述
简介 PCA全称Principal Component Analysis，即主成分分析，是一种常用的数据降维方法。...