美文网首页js css html
封装函数-用R包Seurat跑单套数据

封装函数-用R包Seurat跑单套数据

作者: Aji | 来源:发表于2022-10-24 21:09 被阅读0次

    呜呜最近发现我工作效率低的一个原因就是重复性工作没有流程化,一气之下,把seurat分析单套数据的流程封装了起来,步骤包含数据质控、数据标准化、聚类以及初步的细胞类型鉴定。细胞类型鉴定是用每个cluster的top marker来标注的。之后再更新整合多套数据的流程,希望与君分享

    1. 用到的所有函数放在了SeuratWrapperFunction.R中了

    这个需要用source()函数导入到下面封装好的代码中的

    ### Time: 20221025
    ###  Author: zhengyiyi
    ## load function 
    library(Seurat)
    library(SingleCellExperiment)
    library(scater)
    library(MAST)
    library(ggplot2)
    library(patchwork)
    library(RColorBrewer)
    library(Seurat)
    library(openxlsx)
    library("grid")
    library("ggplotify")
    library(magrittr)
    library(dplyr)
    library(ggsignif)
    library("ggplot2")
    library(cowplot)
    library(openxlsx)
    
    CreateFirstFilterSeuratObject <- function(exp_count, pr_name, Or_ident, 
                                              min_cell, min_gene, mt_pattern,rb_pattern){
      # CreateSeuratObject : filter gene and cell by min.cells(Include features detected in at least this many cells) and min.features(Include cells where at least this many features are detected.)
      exp_count.seurat <- CreateSeuratObject(exp_count, 
                                             project = pr_name, 
                                             min.cells = min_cell,
                                             min.features = min_gene)
      exp_count.seurat@meta.data$orig.ident <- as.factor(Or_ident)
      exp_count.seurat@active.ident <- exp_count.seurat$orig.ident
      ## Check the mt gene and rp gene
      exp_count.seurat[["percent.mt"]] <- PercentageFeatureSet(object = exp_count.seurat, pattern = mt_pattern)
      exp_count.seurat[["percent.rb"]] <- PercentageFeatureSet(object = exp_count.seurat, pattern = rb_pattern)
      
      print(levels(exp_count.seurat@active.ident))
      print(table(exp_count.seurat$orig.ident))
      print(head(x = exp_count.seurat@meta.data, 5))
      print(dim(exp_count.seurat@meta.data))
      return(exp_count.seurat)
    }
    
    SeuratQC <- function(exp_count.seurat, name_pre){
      folder_name="1.QC"
      if (file.exists(folder_name)){
        print("1.QC existed.")
      }
      else{
        dir.create(folder_name)
      } 
      file_name=paste(name_pre,"Violinplot.pdf",sep="_")
      
      reads.drop <- isOutlier(as.numeric(exp_count.seurat$nCount_RNA), nmads = 3, type = "lower")
      feature.drop <- isOutlier(as.numeric(exp_count.seurat$nFeature_RNA), nmads = 3, type = "lower")
      qc.mito2 <- isOutlier(exp_count.seurat$percent.mt, nmads = 3, type="higher")
      qc.ribo2 <- isOutlier(exp_count.seurat$percent.rb, nmads = 3, type="higher")
      
      ### 保留下那些质量差的细胞,看看这些细胞的情况 
      low_q_cells <- (reads.drop | feature.drop | qc.mito2 | qc.ribo2)
      exp_count.seurat$lowquality_cells <- "High_qualitycells"
      exp_count.seurat$lowquality_cells[low_q_cells] <- "Low_qualitycells"
      
      plot1=FeatureScatter(object = exp_count.seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "lowquality_cells")
      plot2=FeatureScatter(object = exp_count.seurat, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "lowquality_cells")
      plot3=FeatureScatter(object = exp_count.seurat, feature1 = "nCount_RNA", feature2 = "percent.rb", group.by = "lowquality_cells")
      
      pdf(file = file_name, height = 10, width = 20)
      print(VlnPlot(object = exp_count.seurat, features = c("nFeature_RNA", "nCount_RNA", 
                                                            "percent.mt","percent.rb"), ncol = 2, pt.size=0)) 
      
      opar <- par(no.readonly = TRUE)
      par(mfrow=c(2,2))
      print(hist(exp_count.seurat$nCount_RNA, breaks = 100, main = "Reads Count Distribution", col = "grey80", 
                 xlab = "library size", ylab = "frequency",
                 cex.lab = 1.4, cex.axis = 1.4))
      print(abline(v = 10000, col = "red", lwd = 2, lty = 2))
      
      print(hist(exp_count.seurat$nFeature_RNA, breaks = 100, main = "Gene Count Distribution",
                 col = "grey80", xlab = "total mRNA", ylab = "frequency", cex.lab = 1.4, cex.axis = 1.4))
      print(abline(v = 1500, col = "red", lwd = 2, lty = 2))
      
      print(hist(exp_count.seurat$percent.mt, breaks = 10, main = "Mitochondrial Percentage Distribution",
                 col = "grey80", xlab = "total mRNA", ylab = "frequency", cex.lab = 1.4, cex.axis = 1.4))
      print(abline(v = 20, col = "red", lwd = 2, lty = 2))
      
      print(hist(exp_count.seurat$percent.rb, breaks = 10, main = "Ribosome Percentage Distribution",
                 col = "grey80", xlab = "total mRNA", ylab = "frequency", cex.lab = 1.4, cex.axis = 1.4))
      print(abline(v = 20, col = "red", lwd = 2, lty = 2))
      par(opar)
      
      print(plot1 + plot2 + plot3)
      dev.off()
      
      #copy files to 1.QC
      result_files <- c(file_name)
      file.copy(result_files, folder_name ,overwrite = T)
      file.remove(result_files) 
      print("Based on the Violin plot, you can filter the cell by the nFeature_RNA, nCount_RNA and/or  percent.mt rb")
    }
    
    
    SubsetSeuratData <- function(exp_count.seurat, min_gene, max_gene,
                                 min_ncountRNA,  max_ncountRNA,
                                 max_mt_percent, max_rb_percent){
      exp_count.seurat <- subset(exp_count.seurat,  subset = nCount_RNA < max_ncountRNA &  nCount_RNA > min_ncountRNA & nFeature_RNA < max_gene & nFeature_RNA > min_gene  & percent.mt < max_mt_percent & percent.rb < max_rb_percent )
      # QC
      qc_pdf_name <- paste("filtered_by_gene", min_gene, "UMI", min_ncountRNA, "percent.mt", max_mt_percent, "percent.rb", max_rb_percent, sep="_")
      SeuratQC(exp_count.seurat, qc_pdf_name)
      dim(exp_count.seurat)
      print(levels(exp_count.seurat$orig.ident))
      return(exp_count.seurat)
    }
    
    
    SelectPCsByLogNormalize <- function(exp_count.seurat, file_prefix, scale_factor, 
                                        nfeatures, npc_plot){
      
      folder_name="2.RunPCA"
      if (file.exists(folder_name)){
        print("2.RunPCA existed.")
      }
      else{
        dir.create(folder_name)
      } 
      
      DefaultAssay(exp_count.seurat) <- "RNA"
      # Normalize Find VariableGene and Scale Data
      exp_count.seurat <- NormalizeData(exp_count.seurat, normalization.method = "LogNormalize",
                                        scale.factor = scale_factor)
      exp_count.seurat <- FindVariableFeatures(exp_count.seurat, selection.method = "vst", nfeatures = nfeatures)
      exp_count.seurat <- ScaleData(exp_count.seurat, vars.to.regress = c("percent.rb","percent.mt"))
      
      # RunPCA
      exp_count.seurat <- RunPCA(exp_count.seurat, pc.genes = exp_count.seurat@var.genes, 
                                 npcs = npc_plot, verbose = FALSE,)
      
      # Select PCs:1:30pcs
      selected_pcs_name <- paste0(file_prefix, "_Selected_PCs.pdf")
      pdf(file = selected_pcs_name, height = 8, width = 8)
      print(ElbowPlot(object = exp_count.seurat, ndims = npc_plot, reduction = "pca"))
      dev.off()
      
      # Plot 
      pca_plot_name <- paste0(file_prefix, "_PCA_plot",".pdf")
      pdf(pca_plot_name,8,8)
      print(DimPlot(exp_count.seurat, reduction="pca", label = TRUE, pt.size=0.5, group.by="ident"))
      dev.off()
      
      #Copy files to 2.RunPCA
      result_files <- c(selected_pcs_name,pca_plot_name)
      file.copy(result_files, folder_name ,overwrite = T)#拷贝文件
      file.remove(result_files) #移除拷贝完的文件
      
      return(exp_count.seurat)
    }
    
    SelectPCsBySCTransform <- function(exp_count.seurat, file_prefix, npc_plot){
      
      folder_name="2.RunPCA"
      if (file.exists(folder_name)){
        print("2.RunPCA existed.")
      }
      else{
        dir.create(folder_name)
      } 
      
      # Normalize Find VariableGene and Scale Data
      exp_count.seurat <- SCTransform(exp_count.seurat, vars.to.regress = c("percent.rb","percent.mt"))
      # RunPCA
      exp_count.seurat <- RunPCA(exp_count.seurat, verbose = FALSE,  npcs = npc_plot)
      
      # Select PCs:1:30pcs
      selected_pcs_name <- paste0(file_prefix, "_Selected_PCs.pdf")
      pdf(file = selected_pcs_name, height = 8, width = 8)
      print(ElbowPlot(object = exp_count.seurat, ndims = npc_plot, reduction = "pca"))
      dev.off()
      
      # Plot 
      pca_plot_name <- paste0(file_prefix, "_PCA_plot",".pdf")
      pdf(pca_plot_name, 8, 8)
      print(DimPlot(exp_count.seurat,reduction="pca", label = TRUE, pt.size= 0.5,group.by="ident",shape.by="orig.ident"))
      dev.off()
      
      #Copy files to 2.RunPCA
      result_files <- c(selected_pcs_name,pca_plot_name)
      file.copy(result_files, folder_name ,overwrite = T)#拷贝文件
      file.remove(result_files) #移除拷贝完的文件
      
      return(exp_count.seurat)
    }
    
    # RunUMAP and RunTSNE and Then find cluster
    PlotCluster <- function(exp_count.seurat, file_prefix, npc_used, k_param, resolution_number){
      
      folder_name="3.PlotCluster"
      if (file.exists(folder_name)){
        print("3.PlotCluster existed.")
      }
      else{
        dir.create(folder_name)
      } 
      
      # FinderNeighbors 
      exp_count.seurat <- FindNeighbors(exp_count.seurat, dims = 1:npc_used, verbose = FALSE, k.param = k_param)
      exp_count.seurat <- FindClusters(exp_count.seurat, verbose = FALSE, resolution = resolution_number)
      
      # RunTSNE and RunUMAP
      exp_count.seurat <- RunTSNE(exp_count.seurat, dims = 1:npc_used, verbose = FALSE, check_duplicates = FALSE)
      exp_count.seurat <- RunUMAP(exp_count.seurat, dims = 1:npc_used, verbose = FALSE)
      
      # RenameIdents from zero to one
      levels_define <- as.numeric(levels(exp_count.seurat))
      new.cluster.ids <- levels_define + 1
      names(new.cluster.ids) <- levels(exp_count.seurat)
      exp_count.seurat <- RenameIdents(exp_count.seurat, new.cluster.ids)
      
      combined_cluster_plotname <- paste0(file_prefix, "_combined_cluster_resolution_",resolution_number,".pdf")
      pdf(combined_cluster_plotname,7,7)
      print(DimPlot(exp_count.seurat,reduction="umap",label = TRUE, pt.size = 0.5, 
                    label.size = 4.5, repel=TRUE, group.by="ident"))
      print(DimPlot(exp_count.seurat,reduction="tsne", label = TRUE, pt.size = 0.5, 
                    label.size = 4.5, repel=TRUE, group.by="ident"))
      dev.off()
      
      split_clustering_plot_name <- paste0(file_prefix, "_split_clustering_resolution_", resolution_number, ".pdf")
      pdf(split_clustering_plot_name,14,7)
      print(DimPlot(exp_count.seurat,reduction="umap",label = TRUE, pt.size = 0.5, 
                    label.size = 4.5, group.by="ident", repel=TRUE, split.by="orig.ident"))
      print(DimPlot(exp_count.seurat,reduction="tsne", label = TRUE, pt.size = 0.5, 
                    label.size = 4.5, group.by="ident", repel=TRUE, split.by="orig.ident")) 
      dev.off()
      
      # Copy files to 2.Cluster
      file.copy(combined_cluster_plotname, folder_name ,overwrite = T)
      file.remove(combined_cluster_plotname) 
      file.copy(split_clustering_plot_name, folder_name ,overwrite = T)
      file.remove(split_clustering_plot_name) 
      return(exp_count.seurat)
    }
    
    # Find markers for each cluster
    FindmarkerForCluster <- function(exp_count.seurat, file_prefix, min.pct, logfc.threshold, p_val_adj, mt_rb_pattern){
      folder_name="4.MarkersInCluster"
      if (file.exists(folder_name)){
        print("4.MarkersInCluster file existed.")
      }
      else{
        dir.create(folder_name)
      } 
      
      cluster.markers <- FindAllMarkers(object = exp_count.seurat, only.pos = TRUE, 
                                        min.pct = min.pct, logfc.threshold = logfc.threshold)
      index <- cluster.markers$p_val_adj < p_val_adj
      cluster.markers <- cluster.markers[index,]
      # remove mt and rb gene
      index <- grep(mt_rb_pattern, cluster.markers$gene)
      if (length(index)  > 0){
        cluster.markers <- cluster.markers[-index, ]
      }
      
      save_name <- paste0(file_prefix,"_MarkersInClusters.csv")
      write.csv(cluster.markers,save_name)
      
      file.copy(save_name, folder_name,overwrite = T)
      file.remove(save_name)
      return(cluster.markers)
    }
    
    # Top markers for each cluster
    TopMarkersInCluster <- function(cluster.markers, file_prefix, top_num){
      library(dplyr)
      folder_name="4.MarkersInCluster"
      if (file.exists(folder_name)){
        print("4.MarkersInCluster file existed.")
      }
      else{
        dir.create(folder_name)
      } 
      #将readsCountSM.markers传给group_by,group_by按cluster 排序,再将结果传给top_n,top_n按avg_logFC排序,显示每个类中的前两个
      top_marker <- cluster.markers %>% group_by(cluster) %>% top_n(n = top_num, wt = avg_log2FC)
      file_name=paste(file_prefix, "_top_marker", top_num,".csv",sep="")
      write.csv(top_marker, file =file_name)
      file.copy(file_name, folder_name,overwrite = T)
      return(top_marker)
      file.remove(file_name)
    }
    
    # Rename each cluster with top2 markers
    MapTop2MarkerEachCluster <- function(exp_count.seurat, cluster.markers, file_prefix){
      library(dplyr)
      library(plyr)
      exp_count.seurat$seurat_clusters <- exp_count.seurat@active.ident
      
      top2 <- cluster.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
      top2 <- top2 %>%  group_by(cluster) %>%  
        dplyr::mutate(markers = paste0(gene, collapse = "/")) %>% dplyr::slice(1)  
      marker.names <- top2$markers
      current.cluster.ids <- as.character(1:(length(unique(Idents(exp_count.seurat))))) 
      new.cluster.ids <- marker.names
      Idents(exp_count.seurat) <- plyr::mapvalues(Idents(exp_count.seurat),
                                                  from = current.cluster.ids, 
                                                  to = new.cluster.ids)
      return(exp_count.seurat)
    }
    

    2.DataQualityOverviewSingle.R: 数据质控的函数是放在了这个脚本

    使用的话需要把source()那行代码改成自己的路径
    这边有个特别注意的点就是这个脚本里的第一行
    “#!/home/zhengjh/miniconda3/envs/r403/bin/Rscript”
    之前我是把脚本放环境变量里了,每次source环境的时候总是就不能直接使用,必须全路径,只要把这个Rscript所在位置换成你用的conda环境中的Rscript就可以了。另外需要chmod +x DataQualityOverviewSingle.R 这个代码,才可以执行

    #!/home/zhengjh/miniconda3/envs/r403/bin/Rscript
    # parameter
    
    library(optparse)
    library(getopt)
    
    option_list <- list(
      make_option(c("-o", "--output"), type = "character", default = FALSE,
                  action = "store", help = "This is the output directory."
      ),
      make_option(c("-t", "--Type"), type = "character", default = "10X",
                  action = "store", help = "This is type of exp matrix, default is 10X, the other is .csv which row is gene, column is cell."
      ),
      make_option(c("--datapath"), type = "character", default = FALSE,
                  action = "store", help = "This is the path of exp matrix."
      ),
      
      make_option(c("--project_name"), type = "character", default = FALSE,
                  action = "store", help = "This is project name built in seurat object and the pre_fix of files generated by this function."
      ),
      make_option(c("--mt_pattern"), type = "character", default = "^mt-",
                  action = "store", help = "This is mitochondria pattern used to calculate its percentage, default is the ^mt-."
      ),
      make_option(c("--rb_pattern"), type = "character", default = "^Rpl|^Rps",
                  action = "store", help = "This is ribosome pattern used to calculate its percentage, default is ^Rpl|^Rps"
      ),
      
      make_option(c("--min_cell"), type = "integer", default = 3,
                  action = "store", help = "This is min cells number to filter, default is 3."
      ),
      make_option(c("--min_gene"), type = "integer", default = 0,
                  action = "store", help = "This is min gene number to filter, default is 0."
      )
      )
    
    # -help 
    opt = parse_args(OptionParser(option_list = option_list, 
                                  usage = "This Script is general overview data quality of single cell exp matrix!"))
    print(opt)
    
    #### step0.load data ####
    source("/home/zhengjh/scripts/seurat/function/SeuratWrapperFunction.R")
    
    print("Step0: Load Data")
    ##### step00. load functions ####
    # read in the functions
    timestart<-Sys.time() 
    # set the output path
    cat("Current working dir: ", opt$output)
    cat("\n")
    setwd(opt$output)
    # load ref object
    
    #### step01. load exp matrix ####
    
    print("Step0: Load exp matrix data")
    if(opt$Type=="10X"){
      print("Read exp matrix from 10X cellranger output")
      exp_count <- Read10X(data.dir = opt$datapath)
      print("The rownum and colnum of  exp_count is ")
      print(dim(exp_count))
    }else{
      print("Read exp matrix from csv file")
      exp_count <- read.csv(opt$datapath, header = T, row.names = 1) 
      print("The rownum and colnum of normal exp is ")
      print(dim(exp_count))
    }
    
    
    #### step1: Create_seurat_filter_cell ######
    print("Step1: Creat Seuart Object with filtering lower genes and lower cells")
    Or_ident <- rep(opt$project_name, ncol(exp_count))
    exp_count.seurat <- CreateFirstFilterSeuratObject(exp_count, opt$project_name, Or_ident,
                                                       opt$min_cell, opt$min_gene,
                                                       opt$mt_pattern, opt$rb_pattern)
    
    print("The row_num and col_num of the first filtered matrix:")
    print(dim(exp_count.seurat))
    print("The cellnumber of samples:")
    print(table(exp_count.seurat$orig.ident))
    name_pre <- paste0("filter_gene_expressed_", opt$min_cell, "cells", opt$min_gene, "genes") 
    
    #### step2: Overview of the data quality ######
    print("Step2: Overview of the data quality")
    SeuratQC(exp_count.seurat, name_pre)
    rds_name <- paste0(opt$project_name, "_dataquality_overview_seurat.rds")
    saveRDS(exp_count.seurat, file = rds_name)
    
    print("Happy~Finished!!!")
    timeend<-Sys.time()
    runningtime<-timeend-timestart
    print(runningtime)
    

    3.LogNormalizeClusteringSingle.R脚本: LogNormalize数据并聚类

    #!/home/zhengjh/miniconda3/envs/r403/bin/Rscript
    # parameter
    
    library(optparse)
    library(getopt)
    
    option_list <- list(
      make_option(c("-o", "--output"), type = "character", default = FALSE,
                  action = "store", help = "This is the output directory."
      ),
      make_option(c( "--seuratobject"), type = "character", default = "10X",
                  action = "store", help = "This is absoulte path of seuratobject."
      ),
      
      make_option(c("--min_gene"), type = "integer", default = 50,
                  action = "store", help = "This is min gene number to filter, default is 50."
      ),
      make_option(c("--max_gene"), type = "integer", default = 10000,
                  action = "store", help = "This is max gene number to filter, default is 10000."
      ),
      make_option(c("--min_ncountRNA"), type = "integer", default = 500,
                  action = "store", help = "This is min ncountRNA to filter, default is 5000."
      ),
      make_option(c("--max_ncountRNA"), type = "integer", default = 100000,
                  action = "store", help = "This is min ncountRNA to filter, default is 100000."
      ),
      make_option(c("--max_mt_percent"), type = "integer", default = 60,
                  action = "store", help = "This is min mitochondria percentage  to filter, default is 60."
      ),
      make_option(c("--max_rb_percent"), type = "integer", default = 60,
                  action = "store", help = "This is min ribosome percentage to filter, default is 60."
      ),
      make_option(c("--file_prefix"), type = "character",
                  action = "store", help = "This is number of file prefix  to used in the following analysis."
      ),
      make_option(c("--scale_factor"), type = "integer", default = 10000,
                  action = "store", help = "This is scale factor used in Seurat NormalizeData function, default is 10000."
      ),
      make_option(c("--nfeatures"), type = "integer", default = 2000,
                  action = "store", help = "This is number of features used in Seurat FindVariableFeatures function, default is 2000."
      ),
      make_option(c("--npc_plot"), type = "integer", default = 50,
                  action = "store", help = "This is number of principles  to plot, default is 50."
      ),
      make_option(c("--npc_used"), type = "integer", default = 20,
                  action = "store", help = "This is number of principles to use in RunTSNE and RunUMAP function, default is 20."
      ),
      
      make_option(c("--k_parameter"), type = "integer", default = 20,
                  action = "store", help = "This is k parameter used in FindNeighbors function which will influence the clusering number, default is 20."
      ),
      make_option(c("--resolution_number"), type = "double", default = 0.5,
                  action = "store", help = "This is resolution number used in FindClusters function , default is 0.5"
      ),
      make_option(c("--min_pct"), type = "double", default = 0.25,
                  action = "store", help = "This is min.pct used in FindAllMarkers function , default is 0.25"
      ),
      make_option(c("--logfc_threshold"), type = "double", default = 0.25,
                  action = "store", help = "This is logfc.threshold used in FindClusters function , default is 0.25"
      ),
      make_option(c("--p_val_adj"), type = "double", default = 0.05,
                  action = "store", help = "This is p_val_adj used to filter marker , default is 0.05"
      ),
      make_option(c("--top_num"), type = "integer", default = 10,
                  action = "store", help = "This is top num marker of each cluster, default is 10"
      ),
      make_option(c("--mt_rb_pattern"), type = "character", default = "^mt-|^Rpl-|^Rps-",
                  action = "store", help = "This is mt_rb_pattern used to filter mt/rb genes found in cluster markers, default is ^mt-|^Rpl-|^Rps-"
      )
      
    )
    
    # -help 
    opt = parse_args(OptionParser(option_list = option_list, usage = "This Script is general processing of single cell exp matrix!"))
    print(opt)
    
    source("/home/zhengjh/scripts/seurat/function/SeuratWrapperFunction.R")
    
    ###### step0. subset seurat object ####
    timestart<-Sys.time() 
    
    # set the output path
    setwd(opt$output)
    
    exp_count.seurat <- readRDS(opt$seuratobject)
    print("Step0: Subset Seurat Object")
    cat(opt$min_gene, "< gene number <", opt$max_gene, "\n")
    cat(opt$min_ncountRNA, "< ncountRNA <", opt$max_ncountRNA, "\n")
    cat("max_mt_percent ", opt$max_mt_percent,  "\n")
    cat("max_rb_percent ", opt$max_rb_percent,  "\n")
    exp_count.seurat <- SubsetSeuratData(exp_count.seurat = exp_count.seurat, 
                                         min_gene = opt$min_gene, max_gene = opt$max_gene, 
                                          min_ncountRNA = opt$min_ncountRNA, max_ncountRNA = opt$max_ncountRNA, 
                                          max_mt_percent = opt$max_mt_percent, max_rb_percent = opt$max_rb_percent)
    
    #### step1. normalize data and run pca ####
    print("Step1: LogNormalize Data and Run PCA")
    cat("scale factor used in NormalizeData function is ", opt$scale_factor, "\n")
    cat("nfeatures used in Seurat FindVariableFeatures function is ", opt$nfeatures, "\n")
    cat("npc_plot used in ElbowPlot is ", opt$npc_plot, "\n")
    exp_count.seurat <- SelectPCsByLogNormalize(exp_count.seurat = exp_count.seurat, 
                                                file_prefix = opt$file_prefix, 
                                                scale_factor = opt$scale_factor,
                                                nfeatures =  opt$nfeatures,
                                                npc_plot = opt$npc_plot)
    
    #### step2. find clusters ####
    print("Step2: FindNeighbors and clusters and then run tsne and umap reduction")
    cat("k_param used in Seurat FindNeighbors function is ", opt$k_param, "\n")
    cat("resolution_number used in Seurat FindClusters function is ", opt$resolution_number, "\n")
    cat("npc_used used in RunTSNE and RUNUMAP is ", opt$npc_used, "\n")
    exp_count.seurat <- PlotCluster(exp_count.seurat = exp_count.seurat, 
                                 file_prefix = opt$file_prefix, 
                                 npc_used = opt$npc_used, 
                                 k_param = opt$k_param,
                                 resolution_number = opt$resolution_number)
      
    
    #### step3. find top markers #####
    print("Step3: find top markers and then map cluster with top2 marke")
    cat("min.pct used in Seurat FindAllMarkers function is ", opt$min.pct, "\n")
    cat("logfc.threshold used in Seurat FindAllMarkers function is ", opt$logfc.threshold, "\n")
    cat("p_val_adj used to filter markers is ", opt$p_val_adj, "\n")
    
    cluster.markers <- FindmarkerForCluster(exp_count.seurat = exp_count.seurat, 
                                             file_prefix = opt$file_prefix, 
                                             min.pct = opt$min_pct, 
                                             logfc.threshold = opt$logfc_threshold, 
                                             p_val_adj = opt$p_val_adj,
                                             mt_rb_pattern = opt$mt_rb_pattern)
    
    TopMarker <- TopMarkersInCluster(cluster.markers = cluster.markers, 
                                     file_prefix = opt$file_prefix, 
                                     top_num = opt$top_num)
    # Top markers heatmap
    P_heatmap_top_marker <- DoHeatmap(exp_count.seurat, features = TopMarker$gene)
    filename <- paste0(opt$file_prefix, "_Heatmap_plot_top_markers.pdf")
    folder_name <- "4.MarkersInCluster"
    pdf(filename, 30, 15)
    print(P_heatmap_top_marker)
    dev.off()
    
    file.copy(filename, folder_name, overwrite = T) #copy files
    file.remove(filename)
    
    
    #### step4. map top2 marker to each cluster #####
    print("step4: map top2 marker to each cluster")
    exp_count.seurat <- MapTop2MarkerEachCluster(exp_count.seurat = exp_count.seurat, 
                                                 cluster.markers = cluster.markers,
                                                 file_prefix = opt$file_prefix)
    
    #### step5. save the clustering plots aftering assign top2 markers ###
    print("step5. save the clustering plots aftering assign top2 markers")
    
    p1 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
                  pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident") + NoLegend()
    p1 <- p1 + labs(title="Clustering UMAP Reduction")
    p1 <- p1 + theme(plot.title = element_text(hjust = 0.5)) 
    
    p2 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
                  pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident")
    p2 <- p2 + labs(title="Clustering TSNE Reduction")
    p2 <- p2 + theme(plot.title = element_text(hjust = 0.5)) 
    
    plots <- plot_grid(p1, p2, ncol=2,rel_widths = c(1.7, 2.2))
    
    cluster_pdfname <- paste0(opt$file_prefix, "_Clustering_TSNE_UMAP.pdf")
    folder_name <- "3.PlotCluster"
    pdf(cluster_pdfname, width = 18, height = 6)
    print(plots)
    dev.off()
    
    file.copy(cluster_pdfname, folder_name, overwrite = T)#拷贝文件
    file.remove(cluster_pdfname)
    
    rds_name <- paste0(opt$file_prefix, "_lognormalize_clustering_seurat.rds.rds")
    saveRDS(exp_count.seurat, file = rds_name)
    
    print("Happy~Finished!!")
    timeend<-Sys.time()
    runningtime<-timeend-timestart
    print(runningtime)
    

    4. SCTransformClusteringSingle.R 脚本: SCTtransform数据并聚类

    #!/home/zhengjh/miniconda3/envs/r403/bin/Rscript
    # parameter
    
    library(optparse)
    library(getopt)
    
    option_list <- list(
      make_option(c("-o", "--output"), type = "character", default = FALSE,
                  action = "store", help = "This is the output directory."
      ),
      make_option(c( "--seuratobject"), type = "character", default = "10X",
                  action = "store", help = "This is absoulte path of seuratobject."
      ),
      
      make_option(c("--min_gene"), type = "integer", default = 50,
                  action = "store", help = "This is min gene number to filter, default is 50."
      ),
      make_option(c("--max_gene"), type = "integer", default = 10000,
                  action = "store", help = "This is max gene number to filter, default is 10000."
      ),
      make_option(c("--min_ncountRNA"), type = "integer", default = 500,
                  action = "store", help = "This is min ncountRNA to filter, default is 5000."
      ),
      make_option(c("--max_ncountRNA"), type = "integer", default = 100000,
                  action = "store", help = "This is min ncountRNA to filter, default is 100000."
      ),
      make_option(c("--max_mt_percent"), type = "integer", default = 60,
                  action = "store", help = "This is min mitochondria percentage  to filter, default is 60."
      ),
      make_option(c("--max_rb_percent"), type = "integer", default = 60,
                  action = "store", help = "This is min ribosome percentage to filter, default is 60."
      ),
      make_option(c("--file_prefix"), type = "character",
                  action = "store", help = "This is number of file prefix  to used in the following analysis."
      ),
      make_option(c("--npc_plot"), type = "integer", default = 50,
                  action = "store", help = "This is number of principles  to plot, default is 50."
      ),
      make_option(c("--npc_used"), type = "integer", default = 20,
                  action = "store", help = "This is number of principles to use in RunTSNE and RunUMAP function, default is 20."
      ),
      
      make_option(c("--k_parameter"), type = "integer", default = 20,
                  action = "store", help = "This is k parameter used in FindNeighbors function which will influence the clusering number, default is 20."
      ),
      make_option(c("--resolution_number"), type = "double", default = 0.5,
                  action = "store", help = "This is resolution number used in FindClusters function , default is 0.5"
      ),
      make_option(c("--min_pct"), type = "double", default = 0.25,
                  action = "store", help = "This is min.pct used in FindAllMarkers function , default is 0.25"
      ),
      make_option(c("--logfc_threshold"), type = "double", default = 0.25,
                  action = "store", help = "This is logfc.threshold used in FindClusters function , default is 0.25"
      ),
      make_option(c("--p_val_adj"), type = "double", default = 0.05,
                  action = "store", help = "This is p_val_adj used to filter marker , default is 0.05"
      ),
      make_option(c("--top_num"), type = "integer", default = 10,
                  action = "store", help = "This is top num marker of each cluster, default is 10"
      ),
      make_option(c("--mt_rb_pattern"), type = "character", default = "^mt-|^Rpl-|^Rps-",
                  action = "store", help = "This is mt_rb_pattern used to filter mt/rb genes found in cluster markers, default is ^mt-|^Rpl-|^Rps-"
      )
      
    )
    
    # -help 
    opt = parse_args(OptionParser(option_list = option_list, usage = "This Script is general processing of single cell exp matrix!"))
    print(opt)
    
    source("/home/zhengjh/scripts/seurat/function/SeuratWrapperFunction.R")
    
    ###### step0. subset seurat object ####
    timestart<-Sys.time() 
    
    # set the output path
    setwd(opt$output)
    
    exp_count.seurat <- readRDS(opt$seuratobject)
    print("Step0: Subset Seurat Object")
    cat(opt$min_gene, "< gene number <", opt$max_gene, "\n")
    cat(opt$min_ncountRNA, "< ncountRNA <", opt$max_ncountRNA, "\n")
    cat("max_mt_percent ", opt$max_mt_percent,  "\n")
    cat("max_rb_percent ", opt$max_rb_percent,  "\n")
    exp_count.seurat <- SubsetSeuratData(exp_count.seurat = exp_count.seurat, 
                                         min_gene = opt$min_gene, max_gene = opt$max_gene, 
                                          min_ncountRNA = opt$min_ncountRNA, max_ncountRNA = opt$max_ncountRNA, 
                                          max_mt_percent = opt$max_mt_percent, max_rb_percent = opt$max_rb_percent)
    
    #### step1. normalize data and run pca ####
    print("Step1: SCTtransform Data and Run PCA")
    cat("scale factor used in NormalizeData function is ", opt$scale_factor, "\n")
    cat("nfeatures used in Seurat FindVariableFeatures function is ", opt$nfeatures, "\n")
    cat("npc_plot used in ElbowPlot is ", opt$npc_plot, "\n")
    exp_count.seurat <- SelectPCsBySCTransform(exp_count.seurat = exp_count.seurat, 
                                                file_prefix = opt$file_prefix, 
                                                npc_plot = opt$npc_plot)
    
    #### step2. find clusters ####
    print("Step2: FindNeighbors and clusters and then run tsne and umap reduction")
    cat("k_param used in Seurat FindNeighbors function is ", opt$k_param, "\n")
    cat("resolution_number used in Seurat FindClusters function is ", opt$resolution_number, "\n")
    cat("npc_used used in RunTSNE and RUNUMAP is ", opt$npc_used, "\n")
    exp_count.seurat <- PlotCluster(exp_count.seurat = exp_count.seurat, 
                                 file_prefix = opt$file_prefix, 
                                 npc_used = opt$npc_used, 
                                 k_param = opt$k_param,
                                 resolution_number = opt$resolution_number)
      
    
    #### step3. find top markers #####
    print("Step3: find top markers and then map cluster with top2 marke")
    cat("min.pct used in Seurat FindAllMarkers function is ", opt$min.pct, "\n")
    cat("logfc.threshold used in Seurat FindAllMarkers function is ", opt$logfc.threshold, "\n")
    cat("p_val_adj used to filter markers is ", opt$p_val_adj, "\n")
    
    cluster.markers <- FindmarkerForCluster(exp_count.seurat = exp_count.seurat, 
                                             file_prefix = opt$file_prefix, 
                                             min.pct = opt$min_pct, 
                                             logfc.threshold = opt$logfc_threshold, 
                                             p_val_adj = opt$p_val_adj,
                                             mt_rb_pattern = opt$mt_rb_pattern)
    
    TopMarker <- TopMarkersInCluster(cluster.markers = cluster.markers, 
                                     file_prefix = opt$file_prefix, 
                                     top_num = opt$top_num)
    
    # Top markers heatmap
    P_heatmap_top_marker <- DoHeatmap(exp_count.seurat, features = TopMarker$gene)
    filename <- paste0(opt$file_prefix, "_Heatmap_plot_top_markers.pdf")
    folder_name <- "4.MarkersInCluster"
    pdf(filename, 30, 15)
    print(P_heatmap_top_marker)
    dev.off()
    
    file.copy(filename, folder_name, overwrite = T)# copy files
    file.remove(filename)
    
    #### step4. map top2 marker to each cluster #####
    print("step4: map top2 marker to each cluster")
    exp_count.seurat <- MapTop2MarkerEachCluster(exp_count.seurat = exp_count.seurat, 
                                                 cluster.markers = cluster.markers,
                                                 file_prefix = opt$file_prefix)
    
    #### step5. save the clustering plots aftering assign top2 markers ###
    print("step5. save the clustering plots aftering assign top2 markers")
    
    p1 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
                  pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident") + NoLegend()
    p1 <- p1 + labs(title="Clustering UMAP Reduction")
    p1 <- p1 + theme(plot.title = element_text(hjust = 0.5)) 
    
    p2 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
                  pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident")
    p2 <- p2 + labs(title="Clustering TSNE Reduction")
    p2 <- p2 + theme(plot.title = element_text(hjust = 0.5)) 
    
    plots <- plot_grid(p1, p2, ncol=2,rel_widths = c(1.7, 2.2))
    
    cluster_pdfname <- paste0(opt$file_prefix, "_Clustering_TSNE_UMAP.pdf")
    folder_name <- "3.PlotCluster"
    pdf(cluster_pdfname, width = 18, height = 6)
    print(plots)
    dev.off()
    
    file.copy(cluster_pdfname, folder_name, overwrite = T)#拷贝文件
    file.remove(cluster_pdfname)
    
    rds_name <- paste0(opt$file_prefix, "_sctransform_clustering_seurat.rds.rds")
    saveRDS(exp_count.seurat, file = rds_name)
    
    print("Happy~Finished!!")
    
    timeend<-Sys.time()
    runningtime<-timeend-timestart
    print(runningtime)
    

    5.附上使用说明

    1) 把SeuratWrapperFunction.R模块下面的代码复制到文件SeuratWrapperFunction.R,并存储
    2) 把DataQualityOverviewSingle.R模块下面的代码复制到文件DataQualityOverviewSingle.R,并存储
    3) 把LogNormalizeClusteringSingle.R模块下面的代码复制到文件LogNormalizeClusteringSingle.R,并存储
    4) 把SCTransformClusteringSingle.R模块下面的代码复制到文件SCTransformClusteringSingle.R,并存储
    5) 可执行权限
    chmod +x DataQualityOverviewSingle.R
    chmod +x LogNormalizeClusteringSingle.R
    chmod +x SCTransformClusteringSingle.R
    
    6)使用
    ## 质量控制
    Rscript ~/scripts/seurat/pipeline/DataQualityOverviewSingle.R -o /home/zhengjh/other/PinealGland/Results/Zebrafish -t 10X --datapath /home/zhengjh/other/PinealGland/Data/Zebrafish_GSE123778_RAW/GSM3511192_scSeq1 \
    --project_name "zebrafish_scseq1" --mt_pattern "^mt-" --rb_pattern "^rpl|^rps" --min_cell 3 --min_gene 100
    
    ## LogNormalize
    Rscript ~/scripts/seurat/pipeline/LogNormalizeClusteringSingle.R -o ~/other/PinealGland/Results/Zebrafish --seuratobject ~/other/PinealGland/Results/Zebrafish/zebrafish_scseq1_dataquality_overview_seurat.rds \
    --min_gene 200 --max_gene 6500 --min_ncountRNA 200 --max_ncountRNA 75000 \
    --max_mt_percent 20 --max_rb_percent 20 --file_prefix zebrafish_scseq1_logtransform \
    --scale_factor 10000 --nfeatures 2000 --npc_plot 50 --npc_used 30 \
    --k_parameter 20 --resolution_number 0.5 --min_pct 0.25 --logfc_threshold 0.25  \
    --p_val_adj 0.05 --top_num 10 --mt_rb_pattern "^mt-|rpl|rps"
    
    ## SCTransform
    Rscript ~/scripts/seurat/pipeline/SCTransformClusteringSingle.R -o ~/other/PinealGland/Results/Zebrafish --seuratobject ~/other/PinealGland/Results/Zebrafish/zebrafish_scseq1_dataquality_overview_seurat.rds --min_gene 200 --max_gene 6500 --min_ncountRNA 200 --max_ncountRNA 75000 \
    --max_mt_percent 20 --max_rb_percent 20 --file_prefix zebrafish_scseq1_scttransform \
    --npc_plot 50 --npc_used 30 \
    --k_parameter 20 --resolution_number 0.5 --min_pct 0.25 --logfc_threshold 0.25 \
    --p_val_adj 0.05 --top_num 10 --mt_rb_pattern "^mt-|rpl|rps" 
    
    7) 结果展示
    结果展示

    最后放个图片
    祝君开心快乐 健康!
    也希望自己一直保持学习 不郁闷呜呜u


    image.png

    相关文章

      网友评论

        本文标题:封装函数-用R包Seurat跑单套数据

        本文链接:https://www.haomeiwen.com/subject/tiyozrtx.html