美文网首页
2021-06-16 PBMC入门与sctransform

2021-06-16 PBMC入门与sctransform

作者: 学习生信的小兔子 | 来源:发表于2021-06-16 01:06 被阅读0次

    参考:公众号:生信会客厅

    library(Seurat)
    library(tidyverse)
    library(patchwork)
    # 下载并解压数据
    dir.create("Vignette01")
    download.file("https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz",
                  "Vignette01/pbmc.tar.gz")
    untar("Vignette01/pbmc.tar.gz", exdir="Vignette01")
    # 创建表达矩阵
    pbmc.data <- Read10X(data.dir = "Vignette01/filtered_gene_bc_matrices/hg19/")
    #查看基因数和细胞数
    dim(pbmc.data)
    #[1] 32738  2700
    # 创建seurat对象,保留最少在3个细胞中表达的基因以及最少表达200个基因的细胞
    pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
    #查看基因数和细胞数
    dim(pbmc)
    #[1] 13714  2700
    #可以发现过滤了一大半的基因
    head(pbmc@meta.data, 2)
    #               orig.ident nCount_RNA nFeature_RNA
    #AAACATACAACCAC     pbmc3k       2419          779
    #AAACATTGAGCTAC     pbmc3k       4903         1352
    #可以发现CreateSeuratObject函数中的project参数与orig.ident是对应关系
    #细胞质控
    pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")  
    p=VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), pt.size=0.1, ncol = 3)
    
    pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
    p=VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), pt.size=0.1, ncol = 3)
    
    
    pbmc.sc <- pbmc
    
    数据标准化
    pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
    pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 3000)
    pbmc <- ScaleData(pbmc, features = rownames(pbmc))            #对所有基因执行scale
    pbmc <- ScaleData(pbmc, features = VariableFeatures(pbmc))     #只对高变基因执行scale
    library(sctransform)
    pbmc.sc <- SCTransform(pbmc.sc, verbose = FALSE)
    ##对比它们选择的高变基因
    library(VennDiagram)
    library(gplots)
    v1 <- VariableFeatures(pbmc)
    v2 <- VariableFeatures(pbmc.sc)
    p <- venn.diagram(list(v1,v2), col=c("red","green"), alpha = c(0.5, 0.5),
                      category.names=c('pbmc','pbmc.sc'), filename=NULL, margin=0.15)    
    png(file='Vignette01/compare_variablefeatures.png', width=300, height=300)
    grid.draw(p)
    dev.off()
    ##降维结果对比
    #作者介绍三步法(Log-normalization)的后续分析选择的pc轴数量增加对结果影响不大,一步法则不同(sctransform)
    #我们尝试三种不同的参数nPCs=10,nPCs=20,nPCs=30来验证一下
    #先画出ElbowPlot
    pbmc <- RunPCA(pbmc, features = VariableFeatures(pbmc)) 
    pbmc.sc<- RunPCA(pbmc.sc, features = VariableFeatures(pbmc.sc)) 
    p1 = ElbowPlot(pbmc, ndims=30,reduction = 'pca')
    p2 = ElbowPlot(pbmc.sc, ndims=30,reduction = 'pca')
    plotc = p1+p2
    ggsave('Vignette01/compare_ElbowPlot.png', plotc, width=8, height=4)
    
    
    #三个参数都分析看看
    nPC.list=list(1:10,1:20,1:30)
    set.seed(822)
    for(nPCs in nPC.list){
      pbmc <- RunPCA(pbmc, verbose = FALSE) %>% RunUMAP(dims = nPCs) %>% FindNeighbors(dims = nPCs) %>% 
        FindClusters()
      pbmc.sc <- RunPCA(pbmc.sc, verbose = FALSE) %>% RunUMAP(dims = nPCs) %>% FindNeighbors(dims = nPCs) %>% 
        FindClusters()
      p1 <- DimPlot(object = pbmc, label = TRUE) + NoLegend() + ggtitle(paste0('Log-normalization_',max(nPCs)))
      p2 <- DimPlot(object = pbmc.sc, label = TRUE) + NoLegend() + ggtitle(paste0('sctransform_',max(nPCs)))
      plotc <- p1 + p2
      ggsave(paste0('Vignette01/compare_umap_',max(nPCs),'.png'), plotc, width=8, height=4)
    }
    plotc
    #pc.num=1:30
    #pbmc.sc <- RunPCA(pbmc.sc, verbose = FALSE) %>% RunUMAP(dims = pc.num) %>% FindNeighbors(dims = pc.num) %>% 
      FindClusters()
    


    补充说明:官方介绍sctransform挑选出来的高变基因和标准化之后的数据,更能去除技术噪音并保留细微的生物学差异。三步法标准化之后的数据进行后续分析,选择的pc轴达到一定数量(拐点)之后再增加就没有意义了;但是一步法标准化之后的数据用于后续分析,选择的pc轴越多越有利于发现细微的生物学差异。实测聚类效果都没有明显的差异,详见上图!
    sctransform函数运行之后的结果保存在pbmc@ASSAY@SCT,分析结果是@scale.data,@counts和@data都是用@scale.data反向计算回去的,@counts相当于细胞数据量相同情况下的值(也就是均一化了的)。差异表达分析和数据整合最好用@scale.data,但是seurat目前不支持,因此还得用@data的数据

    
    
    
    
    ##后续分析采用sctransform处理过的数据,log_normalization的结果备份
    saveRDS(pbmc, 'pbmc_log_normalization.rds')
    pbmc = pbmc.sc
    pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
    colnames(pbmc.markers)[2] <- 'avg_logFC'
    top10markers <- subset(pbmc.markers, p_val<0.05&p_val_adj<0.05) %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
    write.csv(top10markers,'Vignette01/top10markers.scv', row.names=F)
    
    ##细胞类型的鉴定我综合了SingleR和cellmarker数据库,以及原教程的biomarker,过程就不赘述了
    
    ##将判断结果输入pbmc
    celltype <- c('CD4+ T','CD14+ Mono','CD4+ T','B','CD8+ T','NK','FCGR3A+ Mono',
                   'NK','CD8+ T','DC','CD4+ T','Platelet')
    
    celltype <- as.data.frame(celltype)
    
    pbmc@meta.data$celltype = "NA"
    for(i in 1:nrow(celltype)){
      pbmc@meta.data[which(pbmc@meta.data$seurat_clusters == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
    table(pbmc@meta.data$celltype)
    
    
    library(SingleR)
    load("D:\\genetic_r\\singer-cell-learning\\SingleR_ref\\SingleR_ref\\ref_Human_all.RData")
    
    refdata <- ref_Human_all
    testdata <- GetAssayData(pbmc, slot="data")
    ###把scRNA数据中的seurat_clusters提取出来,注意这里是因子类型的
    clusters <- pbmc@meta.data$seurat_clusters
    ###开始用singler分析
    cellpred <- SingleR(test = testdata, ref = refdata, labels = refdata$label.main, 
                        method = "cluster", clusters = clusters, 
                        assay.type.test = "logcounts", assay.type.ref = "logcounts")
    ###制作细胞类型的注释文件
    celltype = data.frame(ClusterID=rownames(cellpred), celltype=cellpred$labels, stringsAsFactors = FALSE)
    celltype$celltype <- c('CD4+ T','CD4+ T','CD14+ Mono','CD4+ T','B','CD8+ T','NK','FCGR3A+ Mono',
                                       'NK','CD8+ T','DC','CD4+ T','Platelet')
    names(cell.type) <- levels(pbmc)
    pbmc <- RenameIdents(pbmc, cell.type)  #改变了Idents变量,没有改变meta.data
    pbmc$celltype <- Idents(pbmc)          #将变量Idents赋值给了meta.data新的列
    p1=DimPlot(pbmc, group.by="seurat_clusters", repel=T, label=T, 
               label.size=5, reduction='umap') + NoLegend()
    p2=DimPlot(pbmc, group.by="celltype", repel=T, label=T, label.size=3, reduction='umap') + NoLegend()
    p3=p1+p2
    #ggsave("Vignette01/celltype_UMAP.png", p2, width=7 ,height=6)
    ggsave("Vignette01/celltype.png", p3, width=10 ,height=5)
    ##保存pbmc
    saveRDS(pbmc, 'pbmc_Vignette01.rds')
    后面的代码有点问题 暂时还不知带怎么解决 菜狗哭泣。。。。
    
    
    

    相关文章

      网友评论

          本文标题:2021-06-16 PBMC入门与sctransform

          本文链接:https://www.haomeiwen.com/subject/nxtxyltx.html