美文网首页统计与科研
如何对多变量数据批量进行t test和anova test并标注

如何对多变量数据批量进行t test和anova test并标注

作者: Jason数据分析生信教室 | 来源:发表于2022-02-18 10:52 被阅读0次

    前言

    用R语言对单独的变量数据进行t test或者anova test大家肯定耳熟能详。就分两步走

    1. ggplot 或者基础函数画出boxplot进行可视化
    2. t.test oneway.test 等函数进行统计分析
    3. 重复1和2

    这种方法应付少量的变量还可以,当变量是几十个甚至几百个的时候就有点力不从心了。特别是转录组分析,几十个几百个差异基因那可是家常便饭。和这次的主题无关,多变量的时候别忘了Bonferroni矫正(a=0.05/m)去除伪阳。

    一次性批量t test

    dat<-iris
    ## 因为是t test,所以要去掉一组数据
    dat<-subset(dat,Species !="setosa")
    dat$Species<-factor(dat$Species)
    ## 简单的for循环就可以解决批量鉴定
    for(i in 1:4){
      boxplot(dat[,i]~dat$Species,
              ylab=names(dat[I]),
              xlab="Species"
              )  
      print(t.test(dat[,i]~dat$Species))
    }
    
    
    Welch Two Sample t-test
    data: dat[, i] by dat$Species
    t = -5.6292, df = 94.025, p-value = 1.866e-07
    alternative hypothesis: true difference in means is not equal to 0
    95 percent confidence interval:
    -0.8819731 -0.4220269
    sample estimates:
    mean in group versicolor mean in group virginica
    5.936 6.588
    
    
    Welch Two Sample t-test
    data: dat[, i] by dat$Species
    t = -3.2058, df = 97.927, p-value = 0.001819
    alternative hypothesis: true difference in means is not equal to 0
    95 percent confidence interval:
    -0.33028364 -0.07771636
    sample estimates:
    mean in group versicolor mean in group virginica
    2.770 2.974
    
    
    Welch Two Sample t-test
    data: dat[, i] by dat$Species
    t = -12.604, df = 95.57, p-value < 2.2e-16
    alternative hypothesis: true difference in means is not equal to 0
    95 percent confidence interval:
    -1.49549 -1.08851
    sample estimates:
    mean in group versicolor mean in group virginica
    4.260 5.552
    
    
    Welch Two Sample t-test
    data: dat[, i] by dat$Species
    t = -14.625, df = 89.043, p-value < 2.2e-16
    alternative hypothesis: true difference in means is not equal to 0
    95 percent confidence interval:
    -0.7951002 -0.6048998
    sample estimates:
    mean in group versicolor mean in group virginica
    1.326 2.026
    
    

    使用ggpubr画出更直观的图

    还是用刚才的两组数据。

    library(ggpubr)
    x <- which(names(dat) == "Species") # 组名
    y <- which(names(dat) == "Sepal.Length" # 需要测试的变量名
               | names(dat) == "Sepal.Width"
               | names(dat) == "Petal.Length"
               | names(dat) == "Petal.Width")
    method <- "t.test" # 选择test种类 
    paired <- FALSE 
    # 根据数据是否一一对应写一个ifelse循环
    for (i in y) {
      for (j in x) {
        ifelse(paired == TRUE,
               p <- ggpaired(dat,
                             x = colnames(dat[j]), y = colnames(dat[I]),
                             color = colnames(dat[j]), line.color = "gray", line.size = 0.4,
                             palette = "npg",
                             legend = "none",
                             xlab = colnames(dat[j]),
                             ylab = colnames(dat[I]),
                             add = "jitter"
               ),
               p <- ggboxplot(dat,
                              x = colnames(dat[j]), y = colnames(dat[I]),
                              color = colnames(dat[j]),
                              palette = "npg",
                              legend = "none",
                              add = "jitter"
               )
        )
        #  添加P值 
        print(p + stat_compare_means(aes(label = paste0(..method.., ", p-value = ", ..p.format..)),
                                     method = method,
                                     paired = paired,
                                     # group.by = NULL,
                                     ref.group = NULL
        ))
      }
    }
    
    



    批量P值调整

    多组比较的时候需要进行bonferroni等调整。同样可以写一段代码来实现批量处理。

    raw_pvalue <- numeric(length = length(1:4))
    for (i in (1:4)) {
      raw_pvalue[i] <- t.test(dat[, i] ~ dat$Species,
        paired = FALSE,
        alternative = "two.sided"
      )$p.value
    }
    df <- data.frame(
      Variable = names(dat[, 1:4]),
      raw_pvalue = round(raw_pvalue, 3)
    )
    df$Bonferroni <-
      p.adjust(df$raw_pvalue,
        method = "bonferroni"
      )
    df$BH <-
      p.adjust(df$raw_pvalue,
        method = "BH"
      )
    df$Holm <-
      p.adjust(df$raw_pvalue,
        method = "holm"
      )
    df$Hochberg <-
      p.adjust(df$raw_pvalue,
        method = "hochberg"
      )
    df$Hommel <-
      p.adjust(df$raw_pvalue,
        method = "hommel"
      )
    df$BY <-
      round(p.adjust(df$raw_pvalue,
        method = "BY"
      ), 3)
    df
    
    
    Variable raw_pvalue Bonferroni BH Holm Hochberg Hommel BY
    1 Sepal.Length 0.000 0.000 0.000 0.000 0.000 0.000 0.000
    2 Sepal.Width 0.002 0.008 0.002 0.002 0.002 0.002 0.004
    3 Petal.Length 0.000 0.000 0.000 0.000 0.000 0.000 0.000
    4 Petal.Width 0.000 0.000 0.000 0.000 0.000 0.000 0.000
    
    

    也可以自己写一个function,完了以后直接套数据就好了。

    t_table <- function(data, dvs, iv,
                        var_equal = TRUE,
                        p_adj = "none",
                        alpha = 0.05,
                        paired = FALSE,
                        wilcoxon = FALSE) {
      if (!inherits(data, "data.frame")) {
        stop("data must be a data.frame")
      }  if (!all(c(dvs, iv) %in% names(data))) {
        stop("at least one column given in dvs and iv are not in the data")
      }  if (!all(sapply(data[, dvs], is.numeric))) {
        stop("all dvs must be numeric")
      }  if (length(unique(na.omit(data[[iv]]))) != 2) {
        stop("independent variable must only have two unique values")
      }  
        out <- lapply(dvs, function(x) {
        if (paired == FALSE & wilcoxon == FALSE) {
          tres <- t.test(data[[x]] ~ data[[iv]], var.equal = var_equal)
        }    
          else if (paired == FALSE & wilcoxon == TRUE) {
          tres <- wilcox.test(data[[x]] ~ data[[iv]])
        }
          else if (paired == TRUE & wilcoxon == FALSE) {
          tres <- t.test(data[[x]] ~ data[[iv]],
            var.equal = var_equal,
            paired = TRUE
          )
        }    else {
          tres <- wilcox.test(data[[x]] ~ data[[iv]],
            paired = TRUE
          )
        }
        c(
          p_value = tres$p.value
        )
      })  
      out <- as.data.frame(do.call(rbind, out))
      out <- cbind(variable = dvs, out)
      names(out) <- gsub("[^0-9A-Za-z_]", "", names(out))
      out$p_value <- ifelse(out$p_value < 0.001,
        "<0.001",
        round(p.adjust(out$p_value, p_adj), 3)
      )
      out$conclusion <- ifelse(out$p_value < alpha,
        paste0("Reject H0 at ", alpha * 100, "%"),
        paste0("Do not reject H0 at ", alpha * 100, "%")
      )  
    return(out)
    }
    
    

    然后就出来了这个结果

    result <- t_table(
      data = dat,
      c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
      "Species"
    )result
    ##       variable p_value      conclusion
    ## 1 Sepal.Length  <0.001 Reject H0 at 5%
    ## 2  Sepal.Width   0.002 Reject H0 at 5%
    ## 3 Petal.Length  <0.001 Reject H0 at 5%
    ## 4  Petal.Width  <0.001 Reject H0 at 5%
    
    

    ANOVA方差分析

    把方差分析和1对1的t.test整合到一起

    dat <- iris
    # Edit from here
    x <- which(names(dat) == "Species") # name of grouping variable
    y <- which(names(dat) == "Sepal.Length" # names of variables to test
    | names(dat) == "Sepal.Width"
    | names(dat) == "Petal.Length"
    | names(dat) == "Petal.Width")
    method1 <- "anova" # one of "anova" or "kruskal.test"
    method2 <- "t.test" # one of "wilcox.test" or "t.test"
    my_comparisons <- list(c("setosa", "versicolor"), c("setosa", "virginica"), c("versicolor", "virginica")) # comparisons for post-hoc tests
    # Edit until here
    # Edit at your own risk
    for (i in y) {
      for (j in x) {
        p <- ggboxplot(dat,
          x = colnames(dat[j]), y = colnames(dat[I]),
          color = colnames(dat[j]),
          legend = "none",
          palette = "npg",
          add = "jitter"
        )
        print(
          p + stat_compare_means(aes(label = paste0(..method.., ", p-value = ", ..p.format..)),
            method = method1, label.y = max(dat[, i], na.rm = TRUE)
          )
          + stat_compare_means(comparisons = my_comparisons, method = method2, label = "p.format") # remove if p-value of ANOVA or Kruskal-Wallis test >= alpha
        )
      }
    }
    
    



    相关文章

      网友评论

        本文标题:如何对多变量数据批量进行t test和anova test并标注

        本文链接:https://www.haomeiwen.com/subject/ugaelrtx.html