美文网首页R语言模型
数量生态学笔记||聚类外部验证

数量生态学笔记||聚类外部验证

作者: 周运来就是我 | 来源:发表于2018-08-02 22:49 被阅读0次

    对每一种聚类算法的结果我们都会遇到一个真实的问题:就是这个结果能否反映真实的生态情况。我们已经看到内部准则(如轮廓法或其他聚类质量指数) 都仅仅是依赖物种数据,还不足以选择最佳聚类结果。选择最终的聚类结果有时也需要基于生态学解释。生态学解释可视为样方聚类的外部验证。

    利用外部独立的解释变量验证聚类结果(作为响应数据)可以用判别式分析。当然也可以将样方分组当做因子对解释变量(作为响应数据)进行方差分析,了解解释变量在各组间是否有显著差异。

    # 加载所需的程序包
    library(ade4)
    library(vegan)  # 应该先加载ade4后加载vegan以避免冲突
    library(gclus)
    library(cluster)
    library(RColorBrewer)
    library(labdsv)
    library(mvpart)
    library(MVPARTwrap) # MVPARTwrap这个程序包必须从本地zip文件安装
    # 导入CSV格式的数据
    
    rm(list = ls())
    setwd("D:\\Users\\Administrator\\Desktop\\RStudio\\数量生态学\\DATA")
    
    spe <- read.csv("DoubsSpe.csv", row.names=1)
    env <- read.csv("DoubsEnv.csv", row.names=1)
    spa <- read.csv("DoubsSpa.csv", row.names=1)
    # 删除无物种数据的样方8
    spe <- spe[-8,]
    env <- env[-8,]
    spa <- spa[-8,]
    # 物种多度数据:先计算样方之间的弦距离矩阵,然后进行单连接聚合聚类
    # **************************************************************
    spe.norm <- decostand(spe, "normalize")
    spe.ch <- vegdist(spe.norm, "euc")
    spe.ch.single <- hclust(spe.ch, method="single")
    # 预转化后物种数据k-均值划分
    # ****************************
    spe.kmeans <- kmeans(spe.norm, centers=4, nstart=100)
    

    下面的案例展示一样方聚类簇为因子去对解释变量进行方差分析。首先检验某一环境变量是否符合方差分析假设(残差正态性和方差齐性),然后用传统的单因素方差分析或非参数的Kruskal-Wallis检验解释变量在组间是否有显著差异。

    # 基于k-均值划分结果(分4组)的样方聚类簇与4个环境变量的关系
    # *************************************************************
    attach(env)
    # 定量环境变量箱线图
    # 海拔、坡度、氧含量和铵浓度
    par(mfrow=c(2,2))
    boxplot(sqrt(alt) ~ spe.kmeans$cluster, main="海拔", las=1, 
      ylab="sqrt(alt)", col=2:5, varwidth=TRUE)
    boxplot(log(pen) ~ spe.kmeans$cluster, main="坡度", las=1, 
      ylab="log(pen)", col=2:5, varwidth=TRUE)
    boxplot(oxy ~ spe.kmeans$cluster, main="氧含量", las=1, 
      ylab="oxy", col=2:5, varwidth=TRUE)
    boxplot(sqrt(amm) ~ spe.kmeans$cluster, main="铵浓度", las=1, 
      ylab="sqrt(amm)", col=2:5, varwidth=TRUE)
    
    > # 检验方差分析的假设
    > # 检验残差的正态性
    > shapiro.test(resid(lm(sqrt(alt) ~ as.factor(spe.kmeans$cluster))))
    
        Shapiro-Wilk normality test
    
    data:  resid(lm(sqrt(alt) ~ as.factor(spe.kmeans$cluster)))
    W = 0.96238, p-value = 0.3756
    
    > shapiro.test(resid(lm(log(pen) ~ as.factor(spe.kmeans$cluster))))
    
        Shapiro-Wilk normality test
    
    data:  resid(lm(log(pen) ~ as.factor(spe.kmeans$cluster)))
    W = 0.95182, p-value = 0.204
    
    > shapiro.test(resid(lm(log(pen) ~ as.factor(spe.kmeans$cluster))))
    
        Shapiro-Wilk normality test
    
    data:  resid(lm(log(pen) ~ as.factor(spe.kmeans$cluster)))
    W = 0.95182, p-value = 0.204
    
    > shapiro.test(resid(lm(oxy ~ as.factor(spe.kmeans$cluster))))
    
        Shapiro-Wilk normality test
    
    data:  resid(lm(oxy ~ as.factor(spe.kmeans$cluster)))
    W = 0.93941, p-value = 0.09674
    
    > shapiro.test(resid(lm(sqrt(amm) ~ as.factor(spe.kmeans$cluster))))
    
        Shapiro-Wilk normality test
    
    data:  resid(lm(sqrt(amm) ~ as.factor(spe.kmeans$cluster)))
    W = 0.95398, p-value = 0.2319
    
    > #检验结果表明sqrt(alt)、log(pen)、oxy和sqrt(amm)的残差是正态分布。
    > #也尝试为其他的环境变量寻找好的标准化转化。
    > # 检验方差齐性
    > bartlett.test(sqrt(alt), as.factor(spe.kmeans$cluster))
    
        Bartlett test of homogeneity of variances
    
    data:  sqrt(alt) and as.factor(spe.kmeans$cluster)
    Bartlett's K-squared = 16.953, df = 3, p-value = 0.0007227
    
    > bartlett.test(log(pen), as.factor(spe.kmeans$cluster))
    
        Bartlett test of homogeneity of variances
    
    data:  log(pen) and as.factor(spe.kmeans$cluster)
    Bartlett's K-squared = 2.9832, df = 3, p-value = 0.3942
    
    > bartlett.test(oxy, as.factor(spe.kmeans$cluster))
    
        Bartlett test of homogeneity of variances
    
    data:  oxy and as.factor(spe.kmeans$cluster)
    Bartlett's K-squared = 1.8258, df = 3, p-value = 0.6093
    
    > bartlett.test(sqrt(amm), as.factor(spe.kmeans$cluster))
    
        Bartlett test of homogeneity of variances
    
    data:  sqrt(amm) and as.factor(spe.kmeans$cluster)
    Bartlett's K-squared = 5.7203, df = 3, p-value = 0.126
    
    > #变量sqrt(alt)的方差不齐,所以参数检验的方差分析不适用
    > #可被参数检验的变量的方差分析 
    > summary(aov(log(pen) ~ as.factor(spe.kmeans$cluster)))
                                  Df Sum Sq Mean Sq F value  Pr(>F)   
    as.factor(spe.kmeans$cluster)  3  18.05   6.018   7.283 0.00114 **
    Residuals                     25  20.66   0.826                   
    ---
    Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    > summary(aov(oxy ~ as.factor(spe.kmeans$cluster)))
                                  Df Sum Sq Mean Sq F value   Pr(>F)    
    as.factor(spe.kmeans$cluster)  3 103.54   34.51   26.27 6.75e-08 ***
    Residuals                     25  32.84    1.31                     
    ---
    Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    > summary(aov(sqrt(amm) ~ as.factor(spe.kmeans$cluster)))
                                  Df Sum Sq Mean Sq F value   Pr(>F)    
    as.factor(spe.kmeans$cluster)  3 2.6126  0.8709   49.55 1.15e-10 ***
    Residuals                     25 0.4394  0.0176                     
    ---
    Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    > #坡度、含氧量和铵浓度在不同聚类簇之间是否差异显著?
    > # 变量alt的Kruskal-Wallis检验
    > kruskal.test(alt ~ as.factor(spe.kmeans$cluster))
    
        Kruskal-Wallis rank sum test
    
    data:  alt by as.factor(spe.kmeans$cluster)
    Kruskal-Wallis chi-squared = 21.526, df = 3, p-value = 8.186e-05
    
    > #海拔在不同聚类簇之间是否有显著差异?
    > detach(env)
    
    双类型比较(列联表分析)
    > # 两种聚类的列联表 
    > # ******************
    > # 基于环境变量(见第2章)的样方聚类
    > env2 <- env[,-1]
    > env.de <- vegdist(scale(env2), "euc")
    > env.kmeans <- kmeans(env.de, centers=4, nstart=100)
    > env.KM.4 <- env.kmeans$cluster
    > # 比较两种聚类的结果
    > table(as.factor(spe.kmeans$cluster), as.factor(env.kmeans$cluster))
       
        1 2 3 4
      1 0 4 7 1
      2 1 0 0 2
      3 5 3 0 0
      4 0 4 2 0
    > #两种聚类结果是否相同?
    > # 用卡方检验分析两种聚类之间的差异 
    > chisq.test(as.factor(spe.kmeans$cluster), as.factor(env.kmeans$cluster))
    
        Pearson's Chi-squared test
    
    data:  as.factor(spe.kmeans$cluster) and as.factor(env.kmeans$cluster)
    X-squared = 30.227, df = 9, p-value = 0.0004014
    
    Warning message:
    In chisq.test(as.factor(spe.kmeans$cluster), as.factor(env.kmeans$cluster)) :
      Chi-squared近似算法有可能不准
    > # 改用置换的方法进行卡方检验分析 
    > chisq.test(as.factor(spe.kmeans$cluster), as.factor(env.kmeans$cluster), 
    +   simulate.p.value=TRUE)
    
        Pearson's Chi-squared test with simulated p-value (based on 2000 replicates)
    
    data:  as.factor(spe.kmeans$cluster) and as.factor(env.kmeans$cluster)
    X-squared = 30.227, df = NA, p-value = 0.0004998
    
    > # k-均值样方聚类簇平均多度
    > # ***********************
    > groups <- as.factor(spe.kmeans$cluster)
    > as.factor(spe.kmeans$cluster)
     1  2  3  4  5  6  7  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 
     1  1  1  1  4  1  1  4  1  1  1  1  1  1  4  4  4  4  3  3  3  2  2  2  3  3  3  3  3 
    Levels: 1 2 3 4
    > as.factor(env.kmeans$cluster)
     1  2  3  4  5  6  7  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 
     4  3  3  3  3  3  3  3  3  2  2  2  2  3  2  2  2  2  2  2  2  4  1  4  1  1  1  1  1 
    Levels: 1 2 3 4
    

    相关文章

      网友评论

        本文标题:数量生态学笔记||聚类外部验证

        本文链接:https://www.haomeiwen.com/subject/mvxjvftx.html