练习数据集的GEO编号:GSE139324 (Immune Landscape of Viral- and Carcinogen-Driven Head and Neck Cancer)。原数据集有63个scRNA的数据,本次选取10个练习。
library(Seurat)
library(harmony)
library(tidyverse)
library(patchwork)
rm(list = ls())
一. 多个单细胞样本的合并
1. 读取并合并数据
- 1.1 读取数据
## 批量读取数据
### 设置数据路径与样本名称
assays <- dir("GSE139324/")
dir <- paste0("GSE139324/", assays)
# 按文件顺序给样本命名,名称不要以数字开头,中间不能有空格
samples_name = c('HNC01PBMC', 'HNC01TIL', 'HNC10PBMC', 'HNC10TIL', 'HNC20PBMC',
'HNC20TIL', 'PBMC1', 'PBMC2', 'Tonsil1', 'Tonsil2')
- 1.2 批量创建seurat对象
scRNAlist <- list()
for(i in 1:length(dir)){
counts <- Read10X(data.dir = dir[i])
#不设置min.cells过滤基因会导致CellCycleScoring报错:
#Insufficient data values to produce 24 bins.
scRNAlist[[i]] <- CreateSeuratObject(counts, project=samples_name[i],
min.cells=3, min.features = 200)
#给细胞barcode加个前缀,防止合并后barcode重名
scRNAlist[[i]] <- RenameCells(scRNAlist[[i]], add.cell.id = samples_name[i])
#计算线粒体基因比例
if(T){
scRNAlist[[i]][["percent.mt"]] <- PercentageFeatureSet(scRNAlist[[i]], pattern = "^MT-")
}
#计算核糖体基因比例
if(T){
scRNAlist[[i]][["percent.rb"]] <- PercentageFeatureSet(scRNAlist[[i]], pattern = "^RP[SL]")
}
#计算红细胞基因比例
if(T){
HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
HB.genes <- CaseMatch(HB.genes, rownames(scRNAlist[[i]]))
scRNAlist[[i]][["percent.HB"]]<-PercentageFeatureSet(scRNAlist[[i]], features=HB.genes)
}
}
### 给列表命名并保存数据
dir.create("Integrate")
setwd("./Integrate")
names(scRNAlist) <- samples_name
#system.time(save(scRNAlist, file = "Integrate/scRNAlist0.Rdata"))
system.time(saveRDS(scRNAlist, file = "scRNAlist0.rds"))
这一步得到了一个包含了十个样本Seurat对象的list
- 1.3 使用
merge
函数将scRNAlist合成一个Seurat对象
scRNA <- merge(scRNAlist[[1]], scRNAlist[2:length(scRNAlist)])
scRNA
# An object of class Seurat
# 18818 features across 19738 samples within 1 assay
# Active assay: RNA (18818 features, 0 variable features)
table(scRNA$orig.ident)
# HNC01PBMC HNC01TIL HNC10PBMC HNC10TIL HNC20PBMC
# 1721 1298 1750 1383 1525
# HNC20TIL PBMC1 PBMC2 Tonsil1 Tonsil2
# 1148 2444 2436 3324 2709
save(scRNA,file = 'scRNA_orig.Rdata')
#scRNAlist <- SplitObject(scRNA, split.by = "orig.ident") #分割Seurat对象
2. 数据质控
### 绘制质控小提琴图
# 设置可能用到的主题
theme.set2 = theme(axis.title.x=element_blank())
# 设置绘图元素
plot.featrures = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.rb", "percent.HB")
group = "orig.ident"
# 质控前小提琴图
plots = list()
for(i in seq_along(plot.featrures)){
plots[[i]] = VlnPlot(scRNA, group.by=group, pt.size = 0,
features = plot.featrures[i]) + theme.set2 + NoLegend()}
violin <- wrap_plots(plots = plots, nrow=2)
dir.create("QC")
ggsave("QC/vlnplot_before_qc.pdf", plot = violin, width = 9, height = 8)
### 设置质控标准
minGene=500
maxGene=4000
maxUMI=15000
pctMT=10
pctHB=1
### 数据质控并绘制小提琴图
scRNA <- subset(scRNA, subset = nCount_RNA < maxUMI & nFeature_RNA > minGene &
nFeature_RNA < maxGene & percent.mt < pctMT & percent.HB < pctHB)
plots = list()
for(i in seq_along(plot.featrures)){
plots[[i]] = VlnPlot(scRNA, group.by=group, pt.size = 0,
features = plot.featrures[i]) + theme.set2 + NoLegend()}
violin <- wrap_plots(plots = plots, nrow=2)
ggsave("QC/vlnplot_after_qc.pdf", plot = violin, width = 10, height = 8)
3. 查看批次效应(对merge后的Seurat对象进行标准化和降维)
# 降维聚类
scRNA <- NormalizeData(scRNA) %>% FindVariableFeatures(nfeatures = 3000) %>% ScaleData()
scRNA <- RunPCA(scRNA, verbose = F)
ElbowPlot(scRNA, ndims = 50)
pc.num=1:30
scRNA <- scRNA %>% RunTSNE(dims=pc.num) %>% RunUMAP(dims=pc.num)
scRNA <- FindNeighbors(scRNA, dims=pc.num) %>% FindClusters()
DimPlot(scRNA, label = T)
p <- DimPlot(scRNA, group.by = "orig.ident")
ggsave("UMAP_Samples.pdf", p, width = 8, height = 6)
可以看出,不同样本间还是有批次效应的存在。结合这张图和上面那张图来看,比如上面那张图中的6和7cluster在整合之后应该就是一个cluster
p <- DimPlot(scRNA, group.by = "orig.ident", split.by = "orig.ident", ncol = 4)
ggsave("UMAP_Samples_Split.pdf", p, width = 18, height = 12)
saveRDS(scRNA, "scRNA.rds")
二. 数据整合
数据整合的目的:
1. 画出tsne/umap图,以辅助细胞类型鉴定(把不同组之间的一类细胞整合在一起)
2. 轨迹分析可能需要用到umap图
⚠️数据整合只影响降维聚类,不影响差异分析(原始count矩阵没有改变)。
1. seurat锚点整合
参考:https://satijalab.org/seurat/articles/integration_large_datasets.html
Seurat2的整合主要用的是CCA
(canonical correlation analysis,典型关联分析)的方法,Seurat3和Seurat4用的是CCA
+MNN
(mutual nearest neighbor,互近邻)
锚点整合操作速度很慢,且常常会过度整合,因此在实际操作中,跨物种整合的时候或不同的数据类型如ATAC、蛋白组的数据和单细胞的数据整合的时候,可以使用锚点整合。单纯单细胞数据的整合,可以使用Harmony。
- 1.1 读入数据,拆分样本
rm(list=ls())
library(future) #Seurat并行计算的一个包,不加载这个包不能进行并行计算
options(future.globals.maxSize = 50 * 1024^3) #将全局变量上限调至50G(锚点整合很占内存)
##重新创建没有处理的经过降维等处理的数据
scRNA <- readRDS("scRNA.rds")
cellinfo <- subset(scRNA@meta.data, select = c("orig.ident", "percent.mt", "percent.rb", "percent.HB"))
scRNA <- CreateSeuratObject(scRNA@assays$RNA@counts, meta.data = cellinfo)
#做锚点整合需要把样本处理成单个的Seurat对象来两两组合
scRNAlist <- SplitObject(scRNA, split.by = "orig.ident")
#也可以按别的指标(metadata中的)来进行拆分,比如可以按不同的分组来拆分样本,再进行整合。
- 1.2 标准化(由于锚点整合会把单个样本两两组合,所以需要单独做标准化)
#SCTransform标准化(⚠️使用log标准化还是SCT标准化差别不大)
#如果用log标准化,后面FindIntegrationAnchors和IntegrateData函数的normalization.method参数选'LogNormalize'
scRNAlist <- parallel::mclapply(scRNAlist, FUN=function(x) SCTransform(x), mc.cores = 10) #10个对象最好写10个核,没有10个核少写几个也可以。top命令可以查看服务器有几个核,mc.core设置为1就每次处理一个对象。
# mclapply是lapply的多核版本
- 1.3 选择用于整合的高变基因(三步)
### FindAnchors
### 每个样本的高变基因不完全一样,SelectIntegrationFeatures可以整合这些高变基因,选出3000个
scRNA.features <- SelectIntegrationFeatures(scRNAlist, nfeatures = 3000)
### 将每个样本的高变基因都调整成上一步选出的3000个
scRNAlist <- PrepSCTIntegration(scRNAlist, anchor.features = scRNA.features)
##寻找锚点,运行速度非常慢,至少需要1-2小时
plan("multisession", workers = 10)
scRNA.anchors <- FindIntegrationAnchors(object.list = scRNAlist,
normalization.method = "SCT", #如果前面是log标准化,这里改成LogNormalize
anchor.features = scRNA.features)
- 1.4 锚点整合
### Integrate 运行速度慢
scRNA.sct.int <- IntegrateData(scRNA.anchors, normalization.method="SCT") #速度慢
plan("sequential") #把并行计算改为单核计算
- 1.5 降维,可视化
### redunction
scRNA <- RunPCA(scRNA.sct.int, npcs = 50, verbose = FALSE)
ElbowPlot(scRNA, ndims=50)
pc.num=1:20
scRNA <- scRNA %>% RunTSNE(dims=pc.num) %>% RunUMAP(dims=pc.num)
### Visual
p <- DimPlot(scRNA, group.by = "orig.ident")
ggsave("UMAP_Samples_integr.pdf", p, width = 8, height = 6)
p <- DimPlot(scRNA, group.by = "orig.ident", split.by = "orig.ident", ncol = 4)
ggsave("UMAP_Samples_Split_integr.pdf", p, width = 18, height = 12)
##save seurat object
saveRDS(scRNA, "scRNA_SCT_int.rds")
2. harmony整合
Harmony整合的官网教程及其原理此前已经介绍过:Harmony
- 2.1 准备数据
rm(list = ls())
scRNA <- readRDS("scRNA.rds")
cellinfo <- subset(scRNA@meta.data, select = c("orig.ident", "percent.mt", "percent.rb", "percent.HB"))
scRNA <- CreateSeuratObject(scRNA@assays$RNA@counts, meta.data = cellinfo)
- 2.2 数据标准化(和锚点整合不同,不需拆分样本,直接标准化)
### SCT标准化数据
scRNA <- SCTransform(scRNA)
- 2.3 使用harmony整合数据
### PCA
scRNA <- RunPCA(scRNA, npcs=50, verbose=FALSE)
### 整合方法1:单个样本间进行整合(推荐,效果更好)
scRNA <- RunHarmony(scRNA, group.by.vars="orig.ident", assay.use="SCT", max.iter.harmony = 20)
# group.by.vars参数是设置按哪个分组来整合
# max.iter.harmony设置迭代次数,默认是10。运行RunHarmony结果会提示在迭代多少次后完成了收敛。
#⚠️RunHarmony函数中有个lambda参数,默认值是1,决定了Harmony整合的力度。lambda值调小,整合力度变大,反之。(只有这个参数影响整合力度,调整范围一般在0.5-2之间)
###整合方法2:按其他分类如不同分组来校正(实测效果不如方法1)
if(F){
scRNA$batches <- scRNA$orig.ident
scRNA$batches <- recode(scRNA$batches,
"HNC01PBMC" = "batch1",
"HNC01TIL" = "batch2",
"HNC10PBMC" = "batch1",
"HNC10TIL" = "batch2",
"HNC20PBMC" = "batch1",
"HNC20TIL" = "batch2",
"PBMC1" = "batch1",
"PBMC2" = "batch1",
"Tonsil1" = "batch3",
"Tonsil2" = "batch3")
scRNA2 <- RunHarmony(scRNA, group.by.vars="batches", assay.use="SCT")
}
- 2.4 降维聚类
ElbowPlot(scRNA, ndims = 50)
pc.num=1:30
scRNA <- RunTSNE(scRNA, reduction="harmony", dims=pc.num) %>% RunUMAP(reduction="harmony", dims=pc.num)
#scRNA2 <- RunTSNE(scRNA2, reduction="harmony", dims=pc.num) %>% RunUMAP(reduction="harmony", dims=pc.num)
p <- DimPlot(scRNA, group.by = "orig.ident")
ggsave("UMAP_Samples_harmony.pdf", p, width = 8, height = 6)
p <- DimPlot(scRNA, group.by = "orig.ident", split.by = "orig.ident", ncol = 4)
ggsave("UMAP_Samples_Split_harmony.pdf", p, width = 18, height = 12)
##save seurat object
saveRDS(scRNA, "scRNA_SCT_harmony.rds")
3. 整合结果评估
library(SingleR)
source("sc_function.R")
- 3.1 Seurat锚点整合结果评估
scRNA <- readRDS("scRNA_SCT_int.rds")
scRNA <- FindNeighbors(scRNA, dims = 1:20) %>% FindClusters()
load("ref_Hematopoietic.RData")
DefaultAssay(scRNA) <- "RNA"
scRNA <- cell_identify(scRNA, ref_Hematopoietic) #cell_identify是自己写的函数,ref_Hematopoietic是SingleR中的参考数据集
p <- DimPlot(scRNA, group.by = "SingleR", label = T)
ggsave("SingleR_Seurat.pdf", p, width = 8, height = 6)
DefaultAssay(scRNA) <- "integrated"
p <- FeaturePlot(scRNA, features = c('CD3E', 'CD3G', 'CD79B', 'MS4A1', 'GNLY', 'NKG7',
'CD14', 'FCGR3A', 'CD68', 'S100A12','CD1C', 'CST3',
'FCER1A', 'GZMB', 'IL3RA', 'PPBP'), ncol = 4)
ggsave("Features_Seurat_int.pdf", p, width = 18, height = 16)
这些基因是各种免疫细胞的marker基因,用的是整合之后的表达值(scRNA@assays$integrated@data),可以这个矩阵对表达值的校正有点过,因此$integrated中的数据不建议拿来做后续分析。
DefaultAssay(scRNA) <- "SCT"
p <- FeaturePlot(scRNA, features = c('CD3E', 'CD3G', 'CD79B', 'MS4A1', 'GNLY', 'NKG7',
'CD14', 'FCGR3A', 'CD68', 'S100A12','CD1C', 'CST3',
'FCER1A', 'GZMB', 'IL3RA', 'PPBP'), ncol = 4)
ggsave("Features_Seurat_sct.pdf", p, width = 18, height = 16)
用data的数据绘图显示一些细胞群同时有多种特征性marker,说明可能存在着过度整合。
- 3.2 harmony整合结果评估
scRNA <- readRDS("scRNA_SCT_harmony.rds")
scRNA <- FindNeighbors(scRNA, dims = 1:30) %>% FindClusters()
load("ref_Hematopoietic.RData")
scRNA <- cell_identify(scRNA, ref_Hematopoietic)
p <- DimPlot(scRNA, group.by = "SingleR", label = T)
ggsave("SingleR_Harmony.pdf", p, width = 8, height = 6)
p <- FeaturePlot(scRNA, features = c('CD3E', 'CD3G', 'CD79B', 'MS4A1', 'GNLY', 'NKG7',
'CD14', 'FCGR3A', 'CD68', 'S100A12','CD1C', 'CST3',
'FCER1A', 'GZMB', 'IL3RA', 'PPBP'), ncol = 4)
ggsave("Features_Harmony.pdf", p, width = 18, height = 16)
和上一张图对比可以看出来Harmony整合的效果比较好,细胞群的marker基因分的也比较开。
saveRDS(scRNA, "scRNA.classified.rds")
4. 最后的吐槽
锚点整合是真的很容易过整合啊
同样的数据,左边是Harmony整合,右边是锚点整合。锚点整合完Ccr2+的细胞群都没了呢🤷♀️
网友评论