美文网首页
数据分析:数据预处理--批次校正(四)

数据分析:数据预处理--批次校正(四)

作者: 生信学习者2 | 来源:发表于2022-01-12 16:25 被阅读0次

    前言

    批次校正是数据预处理常见的步骤,在整合多批次数据的时候,需要考虑到这些非生物学因素对treatment的影响。更多知识分享请到 https://zouhua.top/

    加载R包

    library(dplyr)
    library(tibble)
    library(Biobase)
    library(limma)
    library(ggplot2)
    
    # rm(list = ls())
    options(stringsAsFactors = F)
    options(future.globals.maxSize = 1000 * 1024^2)
    
    subgrp <- c("HC", "CP", "PDAC")
    grp.col <- c("#568875", "#73FAFC", "#EE853D")
    
    ExprSet <- readRDS("MS_Protein_ImputeExprSet.RDS")
    

    Function

    PCAFun <- function(dataset=ExprSet){
      
      # dataset=ExprSet
      
      metadata <- pData(dataset)
      profile <- exprs(dataset)
      
      # pca 
      pca <- prcomp(scale(t(profile), center = T, scale = T))
      require(factoextra)
      eig <- get_eig(pca)
      # explains variable 
      explains <- paste0(paste0("PC", seq(2)), "(", paste0(round(eig[1:2, 2], 2), "%"), ")")
      # principal component score of each sample
      score <- inner_join(pca$x %>% data.frame() %>% 
                            dplyr::select(c(1:2)) %>% 
                            rownames_to_column("SampleID"), 
                          metadata %>% rownames_to_column("SampleID"),
                          by = "SampleID") %>%
        mutate(SubGroup=factor(SubGroup, levels = subgrp))
      
      
      # PERMANOVA
      require(vegan)
      set.seed(123)
      if(any(profile < 0)){
        res_adonis <- adonis(vegdist(t(profile), method = "manhattan") ~ metadata$SubGroup, permutations = 999) 
      }else{
        res_adonis <- adonis(vegdist(t(profile), method = "bray") ~ metadata$SubGroup, permutations = 999)    
      }
      adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
      adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
      signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
      adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
                                      atop("p-value="~.(adn_pvalue)~.(signi_label), phantom()))) 
      
      pl <- ggplot(score, aes(x=PC1, y=PC2))+
                  #geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
                  geom_point(aes(color=SubGroup, shape=Omics), size=2.5)+
                  stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
                  labs(x=explains[1], y=explains[2])+
                  scale_color_manual(values = grp.col)+
                  # scale_fill_manual(name = "Condition", 
                  #                   values = grp.col)+
                  scale_shape_manual(name = "Batch", 
                                     values = c(15, 16, 17))+    
                  annotate("text", x = max(score$PC1) - 8,
                           y = min(score$PC1),
                           label = adn_res_format,
                           size = 6)+ 
                  #guides(color="none")+
                  theme_bw()+
                  theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
                        axis.text = element_text(size = 9, color = "black"),
                        text = element_text(size = 8, color = "black", family = "serif"),
                        strip.text = element_text(size = 9, color = "black", face = "bold"), 
                        panel.grid = element_blank(),
                        legend.title = element_text(size = 11, color = "black", family = "serif"),
                        legend.text = element_text(size = 10, color = "black", family = "serif"),
                        legend.position = c(0, 0),
                        legend.justification = c(0, 0),
                        legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
      return(pl)
    }
    
    RemoveBatchExprSet <- function(datset=ExprSet){
      
      # datset=ExprSet
      
      pheno <- pData(datset)
      expr <- exprs(datset)
      feature <- fData(datset)
      
      # Remove batch effects
      phen <- pheno %>% arrange(Omics)
      prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
      if(!any(rownames(phen) == colnames(prof))){
        stop("The Order of SampleID in DiscoverySet is wrong")
      }
      
      # CA19-9 should be removed batch effects
      prof_CA199 <- prof[rownames(prof) == "CA199", , F]
      prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
      prof_rbe <- removeBatchEffect(as.matrix(prof_NoCA199), batch = phen$Omics)
      prof_merge <- rbind(prof_rbe, prof_CA199)
      
      fdf <- new("AnnotatedDataFrame", data=feature)
      exprs_dis <- as.matrix(prof_merge)
      adf_dis <-  new("AnnotatedDataFrame", data=phen)
      experimentData <- new("MIAME",
              name="Hua", lab="Lab",
              title="tumor Experiment",
              abstract="The Mass Spectrometry ExpressionSet with overlap samples",
              url="www.zouhua.top",
              other=list(notes="The Intersect Proteins and Samples"))
      res <- new("ExpressionSet", 
                    exprs=exprs_dis,
                    phenoData=adf_dis,
                    featureData=fdf,
                    experimentData=experimentData)
      
      return(res)  
    }
    
    
    RemoveBatchExprSet_Design <- function(datset=ExprSet){
      
      # datset=ExprSet
      
      pheno <- pData(datset)
      expr <- exprs(datset)
      feature <- fData(datset)
    
      # design
      Design <- model.matrix(~SubGroup, data = pheno)
      
      # Remove batch effects
      phen <- pheno %>% arrange(Omics)
      prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
      if(!any(rownames(phen) == colnames(prof))){
        stop("The Order of SampleID in DiscoverySet is wrong")
      }
      
      # CA19-9 should be removed batch effects
      prof_CA199 <- prof[rownames(prof) == "CA199", , F]
      prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
      prof_rbe <- removeBatchEffect(as.matrix(prof_NoCA199), batch = phen$Omics, design = Design)
      prof_merge <- rbind(prof_rbe, prof_CA199)
      
      fdf <- new("AnnotatedDataFrame", data=feature)
      exprs_dis <- as.matrix(prof_merge)
      adf_dis <-  new("AnnotatedDataFrame", data=phen)
      experimentData <- new("MIAME",
              name="Hua", lab="Lab",
              title="tumor Experiment",
              abstract="The Mass Spectrometry ExpressionSet with overlap samples",
              url="www.zouhua.top",
              other=list(notes="The Intersect Proteins and Samples"))
      res <- new("ExpressionSet", 
                    exprs=exprs_dis,
                    phenoData=adf_dis,
                    featureData=fdf,
                    experimentData=experimentData)
      
      return(res)  
    }
    
    
    # Batch mean centering
    RemoveBatchExprSet_BMC <- function(datset=ExprSet){
      
      # datset=ExprSet
      
      pheno <- pData(datset)
      expr <- exprs(datset)
      feature <- fData(datset)
    
      # Remove batch effects
      phen <- pheno %>% arrange(Omics)
      prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
      # CA19-9 should be removed batch effects
      prof_CA199 <- prof[rownames(prof) == "CA199", , F]
      prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
    
      prof_list <- list()
      batchlevels <- unique(phen$Omics)
      for(i in 1:length(batchlevels)){
        prof_list[[i]] <- scale(t(prof_NoCA199 %>% 
                                    dplyr::select(rownames(phen[phen$Omics == batchlevels[i], ]))),
                                center = TRUE, scale = FALSE)
      }
      prof_merge <- rbind(t(do.call(rbind, prof_list)), prof_CA199)
      if(!any(rownames(phen) == colnames(prof_merge))){
        stop("The Order of SampleID in DiscoverySet is wrong")
      }  
      
      fdf <- new("AnnotatedDataFrame", data=feature)
      exprs_dis <- as.matrix(prof_merge)
      adf_dis <-  new("AnnotatedDataFrame", data=phen)
      experimentData <- new("MIAME",
              name="Hua", lab="Lab",
              title="tumor Experiment",
              abstract="The Mass Spectrometry ExpressionSet with overlap samples",
              url="www.zouhua.top",
              other=list(notes="The Intersect Proteins and Samples"))
      res <- new("ExpressionSet", 
                 exprs=exprs_dis,
                 phenoData=adf_dis,
                 featureData=fdf,
                 experimentData=experimentData)
      
      return(res)  
    }
    
    
    # ComBat
    RemoveBatchExprSet_ComBat <- function(datset=ExprSet){
      
      # datset=ExprSet
      
      pheno <- pData(datset)
      expr <- exprs(datset)
      feature <- fData(datset)
      
      # design
      Design <- model.matrix(~SubGroup, data = pheno)  
    
      # Remove batch effects
      phen <- pheno %>% arrange(Omics)
      prof <- data.frame(expr) %>% dplyr::select(rownames(phen))
      # CA19-9 should be removed batch effects
      prof_CA199 <- prof[rownames(prof) == "CA199", , F]
      prof_NoCA199 <- prof[rownames(prof) != "CA199", , F]
    
      prof_combat <- sva::ComBat(prof_NoCA199, batch = phen$Omics, mod = Design, par.prior = F, prior.plots = F)
      prof_merge <- rbind(prof_combat, prof_CA199)
      if(!any(rownames(phen) == colnames(prof_merge))){
        stop("The Order of SampleID in DiscoverySet is wrong")
      }  
      
      fdf <- new("AnnotatedDataFrame", data=feature)
      exprs_dis <- as.matrix(prof_merge)
      adf_dis <-  new("AnnotatedDataFrame", data=phen)
      experimentData <- new("MIAME",
              name="Hua", lab="Lab",
              title="tumor Experiment",
              abstract="The Mass Spectrometry ExpressionSet with overlap samples",
              url="www.zouhua.top",
              other=list(notes="The Intersect Proteins and Samples"))
      res <- new("ExpressionSet", 
                 exprs=exprs_dis,
                 phenoData=adf_dis,
                 featureData=fdf,
                 experimentData=experimentData)
      
      return(res)  
    }
    

    Run

    ExprSet_RBE <- RemoveBatchExprSet(datset=ExprSet)
    ExprSet_RBE_Design <- RemoveBatchExprSet_Design(datset=ExprSet)
    ExprSet_RBE_BMC <- RemoveBatchExprSet_BMC(datset=ExprSet)
    ExprSet_RBE_ComBat <- RemoveBatchExprSet_ComBat(datset=ExprSet)
    
    cowplot::plot_grid(#PCAFun(dataset=ExprSet), 
                       PCAFun(dataset=ExprSet_RBE),
                       PCAFun(dataset=ExprSet_RBE_Design),
                       PCAFun(dataset=ExprSet_RBE_BMC),
                       PCAFun(dataset=ExprSet_RBE_ComBat),
                       align = "hv", 
                       ncol = 2,
                       labels = c(#"Origin", 
                                  "Batch corrected", 
                                  "Batch corrected with Design", 
                                  "Batch corrected with BMC",
                                  "Batch corrected with ComBat"))
    

    systemic information

    sessionInfo()
    
    R version 4.0.2 (2020-06-22)
    Platform: x86_64-conda_cos6-linux-gnu (64-bit)
    Running under: CentOS Linux 8 (Core)
    
    Matrix products: default
    BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so
    
    locale:
     [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
     [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
     [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
    
    attached base packages:
    [1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
    
    other attached packages:
     [1] DEP_1.12.0          gg.gap_1.4          ggpubr_0.4.0        gtsummary_1.4.0     vegan_2.5-6        
     [6] lattice_0.20-45     permute_0.9-5       factoextra_1.0.7    ggrepel_0.9.1.9999  sampling_2.8       
    [11] data.table_1.14.2   convert_1.64.0      marray_1.68.0       limma_3.46.0        Biobase_2.50.0     
    [16] BiocGenerics_0.36.0 ggplot2_3.3.5       tibble_3.1.6        dplyr_1.0.7        
    
    loaded via a namespace (and not attached):
      [1] utf8_1.2.2                  shinydashboard_0.7.1        reticulate_1.18            
      [4] gmm_1.6-5                   tidyselect_1.1.1            htmlwidgets_1.5.4          
      [7] grid_4.0.2                  BiocParallel_1.24.1         lpSolve_5.6.15             
     [10] norm_1.0-9.5                munsell_0.5.0               codetools_0.2-18           
     [13] preprocessCore_1.52.1       DT_0.20                     umap_0.2.7.0               
     [16] withr_2.4.3                 colorspace_2.0-2            knitr_1.37                 
     [19] rstudioapi_0.13             stats4_4.0.2                robustbase_0.93-6          
     [22] bayesm_3.1-4                ggsignif_0.6.0              mzID_1.28.0                
     [25] MatrixGenerics_1.2.1        labeling_0.4.2              GenomeInfoDbData_1.2.4     
     [28] farver_2.1.0                vctrs_0.3.8                 generics_0.1.1             
     [31] xfun_0.29                   R6_2.5.1                    doParallel_1.0.16          
     [34] GenomeInfoDb_1.26.4         clue_0.3-60                 bitops_1.0-7               
     [37] DelayedArray_0.16.3         assertthat_0.2.1            promises_1.2.0.1           
     [40] scales_1.1.1                gtable_0.3.0                Cairo_1.5-12.2             
     [43] affy_1.68.0                 sandwich_3.0-0              rlang_0.4.12               
     [46] mzR_2.24.1                  GlobalOptions_0.1.2         splines_4.0.2              
     [49] rstatix_0.7.0               impute_1.64.0               broom_0.7.9                
     [52] BiocManager_1.30.16         yaml_2.2.1                  abind_1.4-5                
     [55] crosstalk_1.2.0             backports_1.4.1             httpuv_1.6.4               
     [58] tensorA_0.36.2              tools_4.0.2                 affyio_1.60.0              
     [61] ellipsis_0.3.2              jquerylib_0.1.4             RColorBrewer_1.1-2         
     [64] MSnbase_2.16.1              Rcpp_1.0.7                  plyr_1.8.6                 
     [67] zlibbioc_1.36.0             purrr_0.3.4                 RCurl_1.98-1.3             
     [70] openssl_1.4.6               GetoptLong_1.0.5            cowplot_1.1.0              
     [73] S4Vectors_0.28.1            zoo_1.8-8                   SummarizedExperiment_1.20.0
     [76] haven_2.3.1                 cluster_2.1.0               tinytex_0.36               
     [79] magrittr_2.0.1              RSpectra_0.16-0             openxlsx_4.2.3             
     [82] circlize_0.4.10             pcaMethods_1.80.0           mvtnorm_1.1-1              
     [85] ProtGenerics_1.22.0         matrixStats_0.61.0          xtable_1.8-4               
     [88] hms_1.1.1                   mime_0.12                   evaluate_0.14              
     [91] XML_3.99-0.6                rio_0.5.16                  readxl_1.3.1               
     [94] IRanges_2.24.1              shape_1.4.5                 compiler_4.0.2             
     [97] gt_0.2.2                    ncdf4_1.17                  crayon_1.4.2               
    [100] htmltools_0.5.2             mgcv_1.8-38                 later_1.2.0                
    [103] tzdb_0.2.0                  tidyr_1.1.4                 DBI_1.1.2                  
    [106] ComplexHeatmap_2.6.2        MASS_7.3-54                 tmvtnorm_1.4-10            
    [109] broom.helpers_1.3.0         compositions_2.0-2          Matrix_1.4-0               
    [112] car_3.0-10                  readr_2.1.1                 cli_3.1.0                  
    [115] vsn_3.58.0                  imputeLCMD_2.0              GenomicRanges_1.42.0       
    [118] forcats_0.5.0               pkgconfig_2.0.3             foreign_0.8-81             
    [121] MALDIquant_1.19.3           foreach_1.5.1               bslib_0.3.1                
    [124] XVector_0.30.0              stringr_1.4.0               digest_0.6.29              
    [127] rmarkdown_2.11              cellranger_1.1.0            curl_4.3.2                 
    [130] shiny_1.7.1                 rjson_0.2.20                lifecycle_1.0.1            
    [133] nlme_3.1-150                jsonlite_1.7.2              carData_3.0-4              
    [136] askpass_1.1                 fansi_0.5.0                 pillar_1.6.4               
    [139] fastmap_1.1.0               DEoptimR_1.0-8              survival_3.2-13            
    [142] glue_1.6.0                  zip_2.1.1                   png_0.1-7                  
    [145] iterators_1.0.13            stringi_1.4.6               sass_0.4.0
    

    Reference

    参考文章如引起任何侵权问题,可以与我联系,谢谢。

    相关文章

      网友评论

          本文标题:数据分析:数据预处理--批次校正(四)

          本文链接:https://www.haomeiwen.com/subject/lhbwcrtx.html