美文网首页
2022-01-17 CCA校正批次效应

2022-01-17 CCA校正批次效应

作者: 徐添添 | 来源:发表于2022-01-18 00:03 被阅读0次

    批次效应强时,用CCA进行校正

    先用PCA图查看不整合前的批次效应
    原理在每个批次中找到相同的一组表达差异低的基因,用这组基因在每个亚群中的差异代表整体基因的差异,按照这个差异调整整体基因表达水平

    # 该代码用于理解并分析批次差异比较大的样本,通过CCA算法将批次效应进行校正
    # 并基于校正之后的数据进行下游分析
    
    #加载分析使用的包
    library(Seurat)
    library(monocle)
    library(devtools)
    devtools::install_github("cole-trapnell-lab/monocle-release@develop")
    
    library(monocle)
    library(ggplot2)
    library(cowplot)
    library(Matrix)
    library(dplyr)
    
    #读入原始表达数据
    exp1<-read.table("./CCA_monocle2_data/01.CRC01-CRC02_500sc_Read_Counts.txt",
                     sep="\t",header=T,row.names=1)
    colnames(exp1)<-paste(colnames(exp1),"read",sep="_")#第一个样本叫read
    
    exp2<-read.table("./CCA_monocle2_data/02.CRC03-04-06-09-10-11_688sc_UMI_Counts.txt",
                     sep="\t",header=T,row.names=1)
    colnames(exp2)<-paste(colnames(exp2),"umi",sep="_") #第二个样本叫umi
    
    #不使用CCA去批次效应,直接合并分析
    combined<-cbind(exp1,exp2)
    
    #创建一个文件夹用于写分析结果
    result.name <- "cca_monocle_result"
    if(!dir.exists(result.name)){
      dir.create(result.name)
    }
    
    #合并的样本分析看批次效应
    combined.seurat<- CreateSeuratObject(
      combined,
      project = "combined", 
      min.cells = 5,
      min.features = 200,
      names.field = 4,
      names.delim = "_")
    combined.seurat[["percent.mt"]] <- PercentageFeatureSet(combined.seurat,pattern = "^mt-")
    combined.seurat<-NormalizeData(combined.seurat,verbose = FALSE)
    combined.seurat<- FindVariableFeatures(combined.seurat, selection.method = "vst",nfeatures = 2000)
    combined.seurat <- ScaleData(combined.seurat, verbose = FALSE)
    combined.seurat <- RunPCA(combined.seurat, npcs = 50, verbose = FALSE)
    pdf(paste0("./",result.name,"/PCA-DimPlot-noCCA.pdf"),width = 5,height = 4)
    DimPlot(combined.seurat, reduction = "pca", )
    dev.off()
    

    接下来用CCA校正,需要校正几个就创建几个seurat对象

    
    #创建Seurat对象,标准化及寻找高表达变异基因-exp1
    exp1.seurat<- CreateSeuratObject(
      exp1,
      project = "exp1", 
      min.cells = 5,
      min.features = 200,
      names.field = 4,
      names.delim = "_")
    exp1.seurat[["percent.mt"]] <- PercentageFeatureSet(exp1.seurat,pattern = "^mt-")
    exp1.seurat<-NormalizeData(exp1.seurat,verbose = FALSE)
    exp1.seurat<- FindVariableFeatures(exp1.seurat, selection.method = "vst",nfeatures = 2000)
    
    #创建Seurat对象,标准化寻找高表达变异基因-exp2
    exp2.seurat<- CreateSeuratObject(
      exp2,
      project = "exp2", 
      min.cells = 5,
      min.features = 200,
      names.field = 4,
      names.delim = "_")
    exp2.seurat[["percent.mt"]] <- PercentageFeatureSet(exp2.seurat,pattern = "^mt-")
    exp2.seurat<-NormalizeData(exp2.seurat,verbose = FALSE)
    exp2.seurat<- FindVariableFeatures(exp2.seurat, selection.method = "vst",nfeatures = 2000)
    
    #整合2批数据(k.filter默认为200,一定要小于最小细胞数,不然会报错)
    exp.anchors<- FindIntegrationAnchors(object.list = c(exp1.seurat,exp2.seurat), dims = 1:30)
    exp.seurat <- IntegrateData(anchorset = exp.anchors, dims = 1:30)
    
    #设置当前使用的Assay(不运行也行,整合完以后默认使用整合后的Assay)
    DefaultAssay(exp.seurat)<-"integrated"
    
    #均一化
    exp.seurat <- ScaleData(exp.seurat, verbose = FALSE)
    
    #运行PCA
    exp.seurat <- RunPCA(exp.seurat, npcs = 50, verbose = FALSE)
    
    #PCA结果展示
    pdf(paste0("./",result.name,"/PCA-DimPlot-CCA.pdf"),width = 5,height = 4)
    DimPlot(exp.seurat, reduction = "pca")
    dev.off()
    
    #PCA结果展示 - 碎石图
    pdf(paste0("./",result.name,"/PCA-ElbowPlot.pdf"),width = 5,height = 4)
    ElbowPlot(exp.seurat, ndim=50,reduction="pca")
    dev.off()
    
    #增加样本以及批次的信息到metadata中
    cell.name<-row.names(exp.seurat@meta.data)
    cell.name<-strsplit(cell.name,"_")
    sample<-c()
    batch=c()
    for (i in 1:length(cell.name)){sample<-c(sample,substring(cell.name[[i]][2],1,2));batch<-c(batch,cell.name[[i]][4])}
    exp.seurat@meta.data[["sample"]]<-sample
    exp.seurat@meta.data[["batch"]]<-batch
    rm(cell.name,sample,batch,i)
    View(exp.seurat@meta.data)
    
    #确定用于细胞分群的PC
    dim.use <- 1:20
    
    #细胞分群TSNE算法
    exp.seurat <- FindNeighbors(exp.seurat, dims = dim.use)
    exp.seurat <- FindClusters(exp.seurat, resolution = 0.5)
    exp.seurat <- RunTSNE(exp.seurat, dims = dim.use, do.fast = TRUE)
    
    #展示TSNE分类结果
    pdf(paste0("./",result.name,"/CellCluster-TSNEPlot_",max(dim.use),"PC.pdf"),width = 5,height = 4)
    DimPlot(object = exp.seurat, pt.size=1,label = T)
    dev.off()
    
    #按照数据来源分组展示细胞异同-batch
    pdf(paste0("./",result.name,"/CellCluster-TSNEPlot_batch_",max(dim.use),"PC.pdf"),width = 5,height = 4)
    DimPlot(object = exp.seurat, group.by="batch", pt.size=1)
    dev.off()
    
    #按照数据来源分组展示细胞异同-sample
    pdf(paste0("./",result.name,"/CellCluster-TSNEPlot_sample_",max(dim.use),"PC.pdf"),width = 5,height = 4)
    DimPlot(object = exp.seurat, group.by="sample", pt.size=1)
    dev.off()
    
    #plot在同一张图中
    plot1 <- DimPlot(object = exp.seurat, pt.size=1,label = T)
    plot2 <- DimPlot(object = exp.seurat, group.by="batch", pt.size=1)
    plot3 <- DimPlot(object = exp.seurat, group.by="sample", pt.size=1)
    pdf(paste0("./",result.name,"/Combined_plot_exp.seurat_",max(dim.use),"PC.pdf"),width = 14,height = 4)
    CombinePlots(plots = list(plot1,plot2,plot3),ncol=3)
    dev.off()
    rm(plot1,plot2,plot3)
    
    #计算marker基因
    all.markers<-FindAllMarkers(exp.seurat, only.pos = TRUE, 
                                min.pct = 0.25, logfc.threshold = 0.25)
    
    # 遍历每一个cluster然后展示其中前4个基因
    marker.sig <- all.markers %>% filter(p_val_adj <= 0.05)
    for(cluster in unique(marker.sig$cluster)){
      cluster.markers <- FindMarkers(exp.seurat, ident.1 = cluster, min.pct = 0.25)
      cluster.markers <- as.data.frame(cluster.markers) %>% 
        mutate(Gene = rownames(cluster.markers))
      cl4.genes <- cluster.markers %>% arrange(desc(avg_logFC))
      cl4.genes <- cl4.genes[1:min(nrow(cl4.genes),4),"Gene"]
      
      #RidgePlot
      pvn<-RidgePlot(exp.seurat, features = cl4.genes, ncol = 2)
      pdf(paste0("./",result.name,"/MarkerGene-RidgePlot_cluster",cluster,"_tsne_",max(dim.use),"PC.pdf"),width = 7,height = 6)
      print(pvn)
      dev.off()
      
      #VlnPlot
      pvn <- VlnPlot(exp.seurat, features = cl4.genes)
      pdf(paste0("./",result.name,"/MarkerGene-VlnPlot_cluster",cluster,"_tsne_",max(dim.use),"PC.pdf"),width = 7,height = 6)
      print(pvn)
      dev.off()
      
      #Feather plot 
      pvn <- FeaturePlot(exp.seurat,features=cl4.genes)
      pdf(paste0("./",result.name,"/MarkerGene-FeaturePlot_cluster",cluster,"_tsne_",max(dim.use),"PC.pdf"),width = 7,height = 6)
      print(pvn)
      dev.off()
      
    }
    rm(cl4.genes,cluster.markers,pvn)
    
    #热图展示Top marker基因
    top10 <- marker.sig %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
    
    pdf(paste0("./",result.name,"/MarkerGene-Heatmap_all_cluster_tsne_",max(dim.use),"PC.pdf"),width = 8,height = 5)
    DoHeatmap(exp.seurat, features = top10$gene,size = 2)
    dev.off()
    write.table(top10,file=paste0("./",result.name,"_top10_marker_genes_tsne_",max(dim.use),"PC.txt"),sep="\t",quote = F)
    
    pdf(paste0("./",result.name,"/MarkerGene-DotPlot_all_cluster_tsne_",max(dim.use),"PC.pdf"),width = 100,height = 10)
    DotPlot(exp.seurat, features = unique(top10$gene))+RotatedAxis()
    dev.off()
    

    相关文章

      网友评论

          本文标题:2022-01-17 CCA校正批次效应

          本文链接:https://www.haomeiwen.com/subject/rfxhhrtx.html