美文网首页
数据质控-单细胞转录组分析的数据质控scRNA-seq03

数据质控-单细胞转录组分析的数据质控scRNA-seq03

作者: 一车小面包人 | 来源:发表于2023-08-31 11:54 被阅读0次

背景:单细胞转录组数据的质量控制,质量控制去除(1)低质量的细胞(2)双胞(3)RNA污染的细胞

  • 去除低质量的细胞
    什么样的细胞是低质量的细胞呢?通常是检测到的基因数目少、计数深度低或者线粒体等基因的比例比较高的细胞,那么就根据相关的特征把这类低质量细胞过滤掉。
    去除低质量细胞的阈值对象:
    (1)每个条形码(细胞)的计数数量(计数深度、测序深度)
    (2)每个条形码的基因数量
    (3)每个条形码的线粒体基因(核糖体、血红蛋白基因)比例
library(Seurat)
library(ggplot2)
library(ggpubr)
sc<-readRDS("./sc.merge.rds")
sc<-CreateSeuratObject(sc$RNA@counts,meta.data=sc@meta.data,min.cells=3) #'筛选基因,保留在大于3个细胞中表达的基因
sc[["percent.mt"]] <- PercentageFeatureSet(sc, pattern="^MT-") #'人类中线粒体基因以MT开头,因此计算MT开头的基因比例
sc[["percent.Ribo"]] <- PercentageFeatureSet(sc, pattern="^RP[SL]") #'计算以RPS或者RPL开头的基因比例(核糖体基因)
HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
HB.genes <- CaseMatch(HB.genes, rownames(sc))
sc[["percent.HB"]] <- PercentageFeatureSet(sc, features=HB.genes) #'血红蛋白基因比例

qc_before<- VlnPlot(sc, group.by="orig.ident",
                              features=c("nFeature_RNA","nCount_RNA","percent.mt","percent.Ribo","percent.HB"),
                              pt.size=0,
                              ncol=2)
ggsave(qc_before,file="qc_before.png", width=12, height=6)
#'质控前的相关阈值情况
Feature_ber1 <- FeatureScatter(sc, feature1="nCount_RNA",
                                   feature2="nFeature_RNA",
                                   group.by="orig.ident")
Feature_ber2 <- FeatureScatter(sc, feature1 = 'nCount_RNA',
                                   feature2="percent.mt",
                                   group.by="orig.ident")
Feature_ber3 <- FeatureScatter(sc, feature1 = 'nCount_RNA',
                                   feature2="percent.Ribo",
                                   group.by="orig.ident")
Feature_ber4 <- FeatureScatter(sc, feature1 = 'nCount_RNA',
                                   feature2="percent.HB",
                                   group.by="orig.ident")
Feature_ber1 <- Feature_ber1 +theme(legend.position="none")
Feature_ber2 <- Feature_ber2 +theme(legend.position="none")
Feature_ber3 <- Feature_ber3 +theme(legend.position="none")
Feature_ber4 <- Feature_ber4 +theme(legend.position="none")
Feature_ber <- ggarrange(Feature_ber1, Feature_ber2, Feature_ber3,Feature_ber4, ncol=4, nrow=1, widths=c(1,1,1,1))
ggsave(Feature_ber, file="Feature_ber_before.png", width=12, height=6)
qc_before.png
Feature_ber_before.png

开始质控:

sc.sub=subset(sc,nFeature_RNA > 200 & nFeature_RNA < 2500 & nCount_RNA > 500 & nCount_RNA < 10000 & percent.mt < 5 &  percent.HB<5 & percent.Ribo<10) #'筛选细胞,规则是细胞的基因数目在200-2500之间,测序深度(基因表达总量)在500-10000,线粒体比例低于5%,血红蛋白基因低>于5%,核糖体基因低于10%
qc_after<- VlnPlot(sc.sub, group.by="orig.ident",
                              features=c("nFeature_RNA","nCount_RNA","percent.mt","percent.Ribo","percent.HB"),
                              pt.size=0,
                              ncol=2)
ggsave(qc_after,file="qc_after.png", width=12, height=6)
Feature_ber1 <- FeatureScatter(sc.sub, feature1="nCount_RNA",
                                   feature2="nFeature_RNA",
                                   group.by="orig.ident")
Feature_ber2 <- FeatureScatter(sc.sub, feature1 = 'nCount_RNA',
                                   feature2="percent.mt",
                                   group.by="orig.ident")
Feature_ber3 <- FeatureScatter(sc.sub, feature1 = 'nCount_RNA',
                                   feature2="percent.Ribo",
                                   group.by="orig.ident")
Feature_ber4 <- FeatureScatter(sc.sub, feature1 = 'nCount_RNA',
                                   feature2="percent.HB",
                                   group.by="orig.ident")
Feature_ber1 <- Feature_ber1 +theme(legend.position="none")
Feature_ber2 <- Feature_ber2 +theme(legend.position="none")
Feature_ber3 <- Feature_ber3 +theme(legend.position="none")
Feature_ber4 <- Feature_ber4 +theme(legend.position="none")
Feature_ber <- ggarrange(Feature_ber1, Feature_ber2, Feature_ber3,Feature_ber4, ncol=4, nrow=1, widths=c(1,1,1,1))
ggsave(Feature_ber, file="Feature_ber_after.png", width=12, height=6)
qc_after.png

可以看到留下来的细胞的基因数目都在200-2500之间,基因表达总量在300-10000之间,线粒体比例低于5%,核糖体基因比例低于10%。


Feature_ber_after.png
  • 去除双胞
    “双细胞被定义为在相同的细胞条形码(barcode)下进行测序的两个细胞,例如,两个细胞被捕获在同一个液滴(droplet)中。这也是为什么我们一直使用barcode而不是cells的原因。”
#'去除双胞
library(DoubletFinder)
library(Seurat)
library(dplyr)
sc<-readRDS("sc.merge.rds")
DefaultAssay(sc)='RNA'
data <- sc %>%NormalizeData() %>% FindVariableFeatures(nfeatures=2000) %>% ScaleData() %>% RunPCA()%>%RunTSNE(dim.use=1:10) #'数据预处
理

sweep.res.list <- paramSweep_v3(data, PCs = 1:30, sct = F) #'使用log标准化,sct参数设置为 sct = F(默认),如使用SCT标准化方法,设置为T
sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
bcmvn <- find.pK(sweep.stats) #'可以看到最佳参数的点
pK_bcmvn <- bcmvn$pK[which.max(bcmvn$BCmetric)] %>% as.character() %>% as.numeric() #'提取最佳pk值

DoubletRate = ncol(data)*8*1e-6 #'每增加1000个细胞,双胞概率增加0.008
homotypic.prop <- modelHomotypic(data$groups)
nExp_poi <- round(DoubletRate*ncol(data))
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
data <- doubletFinder_v3(data, PCs = 1:30, pN = 0.25, pK = pK_bcmvn,
                         nExp = nExp_poi.adj, reuse.pANN = F, sct = F)

data@meta.data[,"DF_hi.lo"] <- data@meta.data[,length(colnames(data@meta.data))]
#data@meta.data$DF_hi.lo[which(data@meta.data$DF_hi.lo == "Doublet" & data@meta.data$DF.classifications_0.25_0.01_198 == "Singlet")] <- "Doublet-Low Confidience"
#data@meta.data$DF_hi.lo[which(data@meta.data$DF_hi.lo == "Doublet")] <- "Doublet-High Confidience"
table(data@meta.data$DF_hi.lo) #'查看细胞类型(双胞或者单胞)分布
#'结果可视化
png("./doubletFinder.png",2500,1800,res=300)
DimPlot(data, reduction = "umap", group.by ="DF_hi.lo",cols =c("black","red","gold"))
dev.off()
saveRDS(data,'sc_rmdoublet.rds') #'保存数据的rds文件
doubletfinder.png

注意:理论上去除双胞应该在上一步细胞的基因数目等阈值质控之前进行。

  • 去除周期相关的基因
#'去除周期基因的影响
library(Seurat)
library(dplyr)
sc<-readRDS("./sc.merge.rds")
sc<-sc%>%NormalizeData()%>%ScaleData(features=rownames(sc))%>%FindVariableFeatures() %>% RunPCA() #'数据预处理
pc.genes.heatmap<-DimHeatmap(sc, dims = c(8, 10))
ggsave(pc.genes.heatmap,file="pc_genes_heatmap.png", width=12, height=6)
g2m_genes <- cc.genes$g2m.genes
g2m_genes <- CaseMatch(search=g2m_genes, match=rownames(sc))
s_genes <- cc.genes$s.genes
s_genes <- CaseMatch(search=s_genes, match=rownames(sc))
scRNA_cycle <- CellCycleScoring(object=sc, g2m.features=g2m_genes, s.features=rownames(sc)) #'计算细胞周期
cycleplot <- DimPlot(sc, reduction="pca", group.by="Phase")
ggsave(cycleplot, file="cycleplot_before.png", width=12, height=6)
scRNA_cycle <- RunPCA(scRNA_cycle, features=c(s_genes,g2m_genes))
cycleplot <- DimPlot(scRNA_cycle, reduction="pca", group.by="Phase")
ggsave(cycleplot, file="cycleplot_after.png", width=12, height=6)

注意:谨慎使用。

  • 去除RNA污染
library(SoupX)
library(Seurat)
library(DropletUtils)
sc<-readRDS("./sc.merge.rds")
#'准备将矩阵标准化作为背景矩阵
sc.nor<- NormalizeData(sc, normalization.method = "LogNormalize", scale.factor = 10000)
sc.nor<- FindVariableFeatures(sc.nor, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(sc.nor)
sc.nor<- ScaleData(sc.nor, features = all.genes)
sc.nor<- RunPCA(sc.nor, features = VariableFeatures(sc.nor), npcs = 40, verbose = F)
sc.nor<- FindNeighbors(sc.nor, dims = 1:30)
sc.nor<- FindClusters(sc.nor, resolution = 0.5)
sc.nor<- RunUMAP(sc.nor, dims = 1:30)
#'参考标准化的矩阵进行对比和校正
meta<- sc.nor@meta.data
sc = SoupChannel(sc, sc.nor)
sc = setClusters(sc, setNames(meta$seurat_clusters, rownames(meta)))
sc = autoEstCont(sc)
sc = setContaminationFraction(sc, 0.2)
out = adjustCounts(sc) #矩阵校正
saveRDS(sc,"sc.rds")

注意:谨慎使用。

相关文章

网友评论

      本文标题:数据质控-单细胞转录组分析的数据质控scRNA-seq03

      本文链接:https://www.haomeiwen.com/subject/htdamdtx.html