美文网首页
R语言可视化9: PCA图 - ggplot2/factoext

R语言可视化9: PCA图 - ggplot2/factoext

作者: 小程的学习笔记 | 来源:发表于2023-04-21 21:13 被阅读0次

    PCA分析,全称Principal Components Analysis,即主成分分析,这是降维中最常见的一种方法。其是一种无监督算法,不需要标签即可对数据进行降维;降维后,由于失去了标签,可能无法理解每个维度的含义;但至少减少了数据维度,使得计算机能更好的识别和计算。

    1. 使用\color{green}{ggplot2}包绘制PCA图

    1.1 逐步计算PCA分析中的参数

    # 安装并加载所需的R包
    # install.packages("ggplot2")
    library(ggplot2)
    library(tidyverse)
    
    # 使用R语言自带的iris数据集
    head(iris)
    ##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    ## 1          5.1         3.5          1.4         0.2  setosa
    ## 2          4.9         3.0          1.4         0.2  setosa
    ## 3          4.7         3.2          1.3         0.2  setosa
    ## 4          4.6         3.1          1.5         0.2  setosa
    ## 5          5.0         3.6          1.4         0.2  setosa
    ## 6          5.4         3.9          1.7         0.4  setosa
    
    # 特征分解
    eigen <- scale(iris[, -5]) %>% # scale()对数据进行标准化
      cor() %>% # cor()对矩阵进行相关性分析
      eigen() # eign()对数据矩阵进行光谱分解,得到特征值和特征向量
    eigen
    ## eigen() decomposition
    ## $values
    ## [1] 2.91849782 0.91403047 0.14675688 0.02071484
    ## 
    ## $vectors
    ##            [,1]        [,2]       [,3]       [,4]
    ## [1,]  0.5210659 -0.37741762  0.7195664  0.2612863
    ## [2,] -0.2693474 -0.92329566 -0.2443818 -0.1235096
    ## [3,]  0.5804131 -0.02449161 -0.1421264 -0.8014492
    ## [4,]  0.5648565 -0.06694199 -0.6342727  0.5235971
    
    # 提取特征值(如上)
    eigen$values
    
    # 特征向量提取(如上)
    eigen$vectors
    
    # 计算标准化的主成分得分
    scale(iris[,-5]) %*% eigen$vectors %>% head()
    ##           [,1]       [,2]        [,3]         [,4]
    ## [1,] -2.257141 -0.4784238  0.12727962  0.024087508
    ## [2,] -2.074013  0.6718827  0.23382552  0.102662845
    ## [3,] -2.356335  0.3407664 -0.04405390  0.028282305
    ## [4,] -2.291707  0.5953999 -0.09098530 -0.065735340
    ## [5,] -2.381863 -0.6446757 -0.01568565 -0.035802870
    ## [6,] -2.068701 -1.4842053 -0.02687825  0.006586116
    

    1.2 \color{orange}{prcomp}函数和\color{orange}{princomp}函数

    1.2.1 prcomp函数

    与常规的求取特征值和特征向量不同的是,prcomp函数是对变量矩阵(相关矩阵)采用SVD方法计算其奇异值(原理上是特征值的平方根)

    # 相关矩阵分解
    iris.pca <- prcomp(iris[,-5],
                       scale. = T, # scale.=T表示标准化
                       rank. = 4, # rank.指定最大秩的数字
                       retx = T) # retx一个逻辑值,指示返回已旋转的变量
    iris.pca
    ## Standard deviations (1, .., p=4):
    ## [1] 1.7083611 0.9560494 0.3830886 0.1439265
    ## 
    ## Rotation (n x k) = (4 x 4):
    ##                     PC1         PC2        PC3        PC4
    ## Sepal.Length  0.5210659 -0.37741762  0.7195664  0.2612863
    ## Sepal.Width  -0.2693474 -0.92329566 -0.2443818 -0.1235096
    ## Petal.Length  0.5804131 -0.02449161 -0.1421264 -0.8014492
    ## Petal.Width   0.5648565 -0.06694199 -0.6342727  0.5235971
    
    # 查看结果
    summary(iris.pca) # 方差解释度(PCA分析结果)
    ## Importance of components:
    ##                           PC1    PC2     PC3     PC4
    ## Standard deviation     1.7084 0.9560 0.38309 0.14393
    ## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
    ## Cumulative Proportion  0.7296 0.9581 0.99482 1.00000
    
    head(iris.pca$x) # 所有样本每个轴的得分
    ##            PC1        PC2         PC3          PC4
    ## [1,] -2.257141 -0.4784238  0.12727962  0.024087508
    ## [2,] -2.074013  0.6718827  0.23382552  0.102662845
    ## [3,] -2.356335  0.3407664 -0.04405390  0.028282305
    ## [4,] -2.291707  0.5953999 -0.09098530 -0.065735340
    ## [5,] -2.381863 -0.6446757 -0.01568565 -0.035802870
    ## [6,] -2.068701 -1.4842053 -0.02687825  0.006586116
    

    ★ PCA分析结果:
    \ \ \ \ Standard deviation: 标准差 其平方为方差=特征值
    \ \ \ \ Proportion of Variance: 方差贡献率
    \ \ \ \ Cumulative Proportion: 方差累计贡献率

    1.2.2 princomp函数

    princomp以计算相关矩阵或者协方差矩阵的特征值为主。

    iris.pc <- princomp(iris[, -5], cor = T, scores = T)
    iris.pc
    ## Call:
    ## princomp(x = iris[, -5], cor = T, scores = T)
    ## 
    ## Standard deviations:
    ##    Comp.1    Comp.2    Comp.3    Comp.4 
    ## 1.7083611 0.9560494 0.3830886 0.1439265 
    ## 
    ##  4  variables and  150 observations.
    
    summary(iris.pc) # 各主成份的解释量
    ## Importance of components:
    ##                           Comp.1    Comp.2     Comp.3      Comp.4
    ## Standard deviation     1.7083611 0.9560494 0.38308860 0.143926497
    ## Proportion of Variance 0.7296245 0.2285076 0.03668922 0.005178709
    ## Cumulative Proportion  0.7296245 0.9581321 0.99482129 1.000000000
    
    head(iris.pc$score) # 所有样本各轴的得分
    ##         Comp.1     Comp.2      Comp.3      Comp.4
    ## [1,] -2.264703  0.4800266  0.12770602  0.02416820
    ## [2,] -2.080961 -0.6741336  0.23460885  0.10300677
    ## [3,] -2.364229 -0.3419080 -0.04420148  0.02837705
    ## [4,] -2.299384 -0.5973945 -0.09129011 -0.06595556
    ## [5,] -2.389842  0.6468354 -0.01573820 -0.03592281
    ## [6,] -2.075631  1.4891775 -0.02696829  0.00660818
    
    screeplot(iris.pc,type="lines") # 方差分布图
    biplot(iris.pc,scale=F) # 双标图直接把x与rotation绘图,而非标准化
    
    princomp-1

    1.3 ggplot2可视化

    1.3.1 prcomp + ggplot2
    # 提取PC score; 
    df1 <- iris.pca$x 
    
    # 将iris数据集的第5列数据合并进来;
    df1 <- data.frame(df1, iris$Species) 
    head(df1) 
    ##         PC1        PC2         PC3          PC4 iris.Species
    ## 1 -2.257141 -0.4784238  0.12727962  0.024087508       setosa
    ## 2 -2.074013  0.6718827  0.23382552  0.102662845       setosa
    ## 3 -2.356335  0.3407664 -0.04405390  0.028282305       setosa
    ## 4 -2.291707  0.5953999 -0.09098530 -0.065735340       setosa
    ## 5 -2.381863 -0.6446757 -0.01568565 -0.035802870       setosa
    ## 6 -2.068701 -1.4842053 -0.02687825  0.006586116       setosa
    
    summ <- summary(iris.pca)
    
    xlab <- paste0("PC1(", round(summ$importance[2, 1] * 100, 2),"%)")
    ylab <- paste0("PC2(", round(summ$importance[2, 2] * 100, 2),"%)")
    
    ggplot(data = df1, aes(x = PC1, y = PC2, color = iris.Species)) +
      stat_ellipse(aes(fill = iris.Species), type = "norm", geom ="polygon", alpha = 0.2, color = NA) + 
      geom_point() + labs(x = xlab, y = ylab, color = "") + 
      guides(fill = F) +
      scale_fill_manual(values = c("purple","orange","blue")) + 
      scale_colour_manual(values = c("purple","orange","blue"))
    
    1.3.2 princomp + ggplot2
    # 查看主成分的结果
    pca.scores <- as.data.frame(iris.pc$scores)
    head(pca.scores)
    ##      Comp.1     Comp.2      Comp.3      Comp.4
    ## 1 -2.264703  0.4800266  0.12770602  0.02416820
    ## 2 -2.080961 -0.6741336  0.23460885  0.10300677
    ## 3 -2.364229 -0.3419080 -0.04420148  0.02837705
    ## 4 -2.299384 -0.5973945 -0.09129011 -0.06595556
    ## 5 -2.389842  0.6468354 -0.01573820 -0.03592281
    ## 6 -2.075631  1.4891775 -0.02696829  0.00660818
    
    # 绘制PCA图
    ggplot(pca.scores, aes(Comp.1, Comp.2, 
                           col = iris$Species,
                           shape = iris$Species)
                           ) + 
      scale_shape_manual(values = c(15, 19, 17)) + 
      scale_color_manual(values = c('#999999','#E69F00', '#56B4E9')) +
      geom_point(size = 3) + 
      # geom_text(aes(label = rownames(pca.scores)), vjust = "outward") + 
      geom_hline(yintercept = 0, lty = 2, col = "red") + 
      geom_vline(xintercept = 0, lty = 2, col = "blue", lwd = 1) +
      theme_bw() + theme(legend.position = "none") + 
      theme(plot.title = element_text(hjust = 0.5)) + 
      labs(x = "PCA_1", y  ="PCA_2", title = "PCA analysis")
    
    ggplot2-1

    2. 使用\color{green}{scatterplot3d}包绘制PCA图

    # 安装并加载所需的R包
    # install.packages("scatterplot3d")
    library(scatterplot3d)
    
    # 绘制三维PCA图
    scatterplot3d(iris.pca$x[, 1:3],
                  pch = c(rep(16, 50), rep(17, 50), rep(18, 50)),
                  color = c(rep("purple", 50), rep("orange", 50), rep("blue", 50)),
                  angle = 45, main= "3D PCA plot",
                  cex.symbols = 1.5, mar = c(5, 4, 4, 5))
    
    # 添加图例
    legend("topright", title = "Sample",
           xpd = TRUE, inset = -0.15,
           legend = c("setosa", "versicolor", "virginica"),
           col = c("purple", "orange", "blue"),
           pch = c(16, 17, 18))
    
    scatterplot3d-1

    3. 使用\color{green}{factoextra}包绘制PCA图

    3.1 空间分布图

    # 安装并加载所需的R包
    # install.packages("factoextra")
    # install.packages("FactoMineR")
    library(FactoMineR)
    library(factoextra)
    
    # PCA分析
    iris.pca <- PCA(iris[,-5], graph = T)
    
    # 获取样本的主成分分析结果
    var <- get_pca_var(iris.pca)
    
    # 绘制主成分碎石图,查看每一个主成分能在多大程度上代表原来的特征
    fviz_screeplot(iris.pca, addlabels = TRUE, ylim = c(0, 80))
    
    # PCA图
    fviz_pca_ind(iris.pca,
                 mean.point = F, # 去除分组的中心点,否则每个群中间会有一个比较大的点
                 label = "none", # 隐藏每一个样本的标签
                 habillage = iris$Species, # 根据样本类型来着色
                 palette = c("purple","orange","blue"), # 三个组设置三种颜色
                 addEllipses = TRUE, # 添加边界线
                 ellipse.type = "convex" # 设置边界线为多边形
    )
    
    factoextra-1

    3.2 特征分布图

    # 查看每一个特征对每一个主成分的贡献程度
    var$contrib
    ##                  Dim.1       Dim.2     Dim.3     Dim.4
    ## Sepal.Length 27.150969 14.24440565 51.777574  6.827052
    ## Sepal.Width   7.254804 85.24748749  5.972245  1.525463
    ## Petal.Length 33.687936  0.05998389  2.019990 64.232089
    ## Petal.Width  31.906291  0.44812296 40.230191 27.415396
    
    fviz_pca_var(iris.pca,   
                 col.var = "contrib", #根据贡献度着色
                 gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
    )
    
    library("corrplot")
    corrplot(var$contrib, is.corr = FALSE)
    

    ★ Sepal.Width这个特征对Dim.2的贡献最大


    factoextra-2

    3.3 同时展示样本和特征

    fviz_pca_biplot(iris.pca, 
                    label = "var", # 只标注变量,不标注样本
                    habillage = iris$Species, # 根据样本类型着色
                    addEllipses = TRUE, # 添加边界线
                    ellipse.type = "convex", # 设置边界线为多边形
                    ggtheme = theme_minimal() # 白色背景,浅灰色网格线,无坐标轴,无边框
                    )
    
    factoextra-3

    参考:

    1. https://blog.csdn.net/qq_42830713/article/details/127545000
    2. https://zhuanlan.zhihu.com/p/443843102
    3. https://www.jianshu.com/p/51de30fd18e7

    相关文章

      网友评论

          本文标题:R语言可视化9: PCA图 - ggplot2/factoext

          本文链接:https://www.haomeiwen.com/subject/qduhjdtx.html