摘要:对Nature Methods上发表的文章Single-cell landscape of bronchoalveolar immune cells in patients with COVID-19进行文章结果复现。
1 数据来源
GSE145926中包含了12例BALF样本的单细胞测序数据,其中包括6例重症感染患者样本,3例中症感染患者样本,3例健康对照样本,另GSM3660650中包含了1例健康对照样本。
2 数据质控并整合
所有数据准备好后,读入数据进行质控,之后对多样本来源的数据进行整合并去除批次效应,主要R语言代码如下:
library(Seurat)
library(Matrix)
library(dplyr)
rm(list=ls())
# 读入元数据信息
samples <- read.delim2("meta.txt",header = TRUE,
stringsAsFactors = FALSE,check.names = FALSE, sep = "\t")
### 读入表达数据并进行质控
nCoV.list <- list()
for(sample_s in samples$sample){
print(sample_s)
sample_i = samples %>% dplyr::filter(.,sample == sample_s)
# GSM3660650是从GEO下载的表达矩阵文件,其他文件是10X的h5文件
if (sample_s=="GSM3660650"){
datadir = paste("GSE145926_RAW/GSM3660650/",sep="")
sample.tmp <- Read10X(data.dir = datadir)
}else{
datadir = paste("GSE145926_RAW/",sample_s,"_filtered_feature_bc_matrix.h5",sep="")
sample.tmp <- Read10X_h5(datadir)
}
sample.tmp.seurat <- CreateSeuratObject(counts = sample.tmp, min.cells = 3, min.features = 200,project = sample_s)
sample.tmp.seurat[['percent.mito']] <- PercentageFeatureSet(sample.tmp.seurat, pattern = "^MT-")
#去除表达基因数量低于200或超过6000,UMI数量超过1000,线粒体RNA比例超过10%的细胞
sample.tmp.seurat <- subset(x = sample.tmp.seurat, subset = nFeature_RNA > 200 &
nFeature_RNA < 6000 &
nCount_RNA > 1000 &
percent.mito < 10)
sample.tmp.seurat <- NormalizeData(sample.tmp.seurat, verbose = FALSE)
sample.tmp.seurat <- FindVariableFeatures(sample.tmp.seurat, selection.method = "vst",
nfeatures = 2000,verbose = FALSE)
nCoV.list[sample_s] = sample.tmp.seurat
}
### 使用Seurat3自带的IntegrateData函数进行整合并去除批次效应,具体算法原理不详
nCoV <- FindIntegrationAnchors(object.list = nCoV.list, dims = 1:50)
nCoV.integrated <- IntegrateData(anchorset = nCoV, dims = 1:50,features.to.integrate = rownames(nCoV))
### 添加样本信息,主要包括病症类型与对照等信息
sample_info = as.data.frame(colnames(nCoV.integrated))
colnames(sample_info) = c('ID')
rownames(sample_info) = sample_info$ID
sample_info$sample = nCoV.integrated@meta.data$orig.ident
sample_info = dplyr::left_join(sample_info,samples)
rownames(sample_info) = sample_info$ID
nCoV.integrated = AddMetaData(object = nCoV.integrated, metadata = sample_info)
### 对RNA矩阵进行Normalize和Scale,以便进行差异表达和可视化
DefaultAssay(nCoV.integrated) <- "RNA"
nCoV.integrated[['percent.mito']] <- PercentageFeatureSet(nCoV.integrated, pattern = "^MT-")
nCoV.integrated <- NormalizeData(nCoV.integrated, normalization.method = "LogNormalize", scale.factor = 1e4)
nCoV.integrated <- FindVariableFeatures(nCoV.integrated, selection.method = "vst", nfeatures = 2000,verbose = FALSE)
nCoV.integrated <- ScaleData(nCoV.integrated, verbose = FALSE, vars.to.regress = c("nCount_RNA", "percent.mito"))
3 聚类与降维
这部分程序主要实现表达数据的PCA分析、聚类分析、降维展示,以及差异表达分析鉴定细胞群的marker gene,R语言代码如下:
### DefaultAssay设置为“integrated”矩阵并进行下游分析,
DefaultAssay(nCoV.integrated) <- "integrated"
### 以下进入Seurat标准分析流程
nCoV.integrated <- ScaleData(nCoV.integrated, verbose = FALSE, vars.to.regress = c("nCount_RNA", "percent.mito"))
nCoV.integrated <- RunPCA(nCoV.integrated, verbose = FALSE,npcs = 100)
nCoV.integrated <- ProjectDim(object = nCoV.integrated)
### 通过ElbowPlot选择PC维度进行下游降维和聚类
dpi = 300
png(file="pca.png", width = dpi*10, height = dpi*6, units = "px",res = dpi,type='cairo')
ElbowPlot(object = nCoV.integrated,ndims = 100)
dev.off()
Figure 1. PCA ElbowPlot.png
图1显示,大约前20个PCA维度就可以捕获数据的真实信号,文章中作者使用了前50个PCA维度进行降维与聚类分析。
### 聚类
nCoV.integrated <- FindNeighbors(object = nCoV.integrated, dims = 1:50)
nCoV.integrated <- FindClusters(object = nCoV.integrated, resolution = 1.2)
### 分别采用tSNE和umap两种方法进行降维展示
nCoV.integrated <- RunTSNE(object = nCoV.integrated, dims = 1:50)
nCoV.integrated <- RunUMAP(nCoV.integrated, reduction = "pca", dims = 1:50)
dpi=300
# umap展示
png(file="1-umap_1.png", width = dpi*8, height = dpi*6, units = "px",res = dpi,type='cairo')
pp = DimPlot(nCoV.integrated, reduction = 'umap',pt.size = 0.5,label = T,label.size = 5)
pp = pp + theme(axis.title = element_text(size = 15),
axis.text = element_text(size = 15,family = 'sans'),
legend.text = element_text(size = 15),
axis.line = element_line(size = 0.8))
print(pp)
dev.off()
# tsne展示
png(file="1-tsne_1.png", width = dpi*8, height = dpi*6, units = "px",res = dpi,type='cairo')
pp = DimPlot(nCoV.integrated, reduction = 'tsne',pt.size = 0.5,label = T,label.size = 5)
pp = pp + theme(axis.title = element_text(size = 15),
axis.text = element_text(size = 15,family = 'sans'),
legend.text = element_text(size = 15),
axis.line = element_line(size = 0.8))
print(pp)
dev.off()
Figure 2. UMAP and tSNE embedding.png
Figure2显示,一共鉴定到31群细胞,与文章中Figure1A基本一致。
### 使用RNA矩阵进行marker gene的鉴定,而不是批次矫正后的integrated矩阵,
###此处需要注意,除Seurat之外,其他多种差异表达分析的算法也推荐使用原始表达值,而不是批次矫正后的数值;
###此外,RNA矩阵用作文章中作图数据来源。
###据此来看,integrated矩阵仅仅是在去除批次效应、降维和聚类过程中用到,差异表达与数据可视化使用的都是RNA矩阵。
DefaultAssay(nCoV.integrated) <- "RNA"
nCoV.integrated@misc$markers <- FindAllMarkers(object = nCoV.integrated, assay = 'RNA',only.pos = TRUE, test.use = 'MAST')
write.table(nCoV.integrated@misc$markers,file='marker_MAST.txt',row.names = FALSE,quote = FALSE,sep = '\t')
### 挑选出每种细胞类型的marker gene,以及每个细胞群前30个表达最高的marker gene,加上variable feature,进行scaleData分析,
###目的是为了在scale.data中添加部分感兴趣的基因
markers = c('AGER','SFTPC','SCGB3A2','TPPP3','KRT5', 'FCGR3A','TREM2','KRT18', #上皮系细胞
'CD68','FCN1','CD1C','TPSB2','CD14','MARCO','CXCR2','CLEC9A','IL3RA', #树突细胞
'CD3D','CD8A','KLRF1', #T/NK细胞
'CD79A','IGHG4','MS4A1', #B细胞
'VWF','DCN' #内皮细胞)
hc.markers = read.delim2("marker_MAST.txt",header = TRUE, stringsAsFactors = FALSE,check.names = FALSE, sep = "\t")
hc.markers %>% group_by(cluster) %>% top_n(n = 30, wt = avg_logFC) -> top30
var.genes = c(nCoV.integrated@assays$RNA@var.features,top30$gene,markers)
nCoV.integrated <- ScaleData(nCoV.integrated, verbose = FALSE, vars.to.regress = c("nCount_RNA", "percent.mito"),features = var.genes)
saveRDS(nCoV.integrated, file = "nCoV.rds")
4 细胞群注释
这一步主要根据各个细胞群的marker gene表达情况确定其所属的细胞类型,R语言代码如下:
### marker gene umap热图
markers = c('TPPP3','KRT18','CD68','FCGR3B','CD1C','CLEC9A','LILRA4','TPSB2','CD3D','KLRD1','MS4A1','IGHG4')
png(file="1-umap_marker_1.png", width = dpi*16, height = dpi*10, units = "px",res = dpi,type='cairo')
pp_temp = FeaturePlot(object = nCoV.integrated, features = markers,cols = c("lightgrey","#ff0000"),combine = FALSE)
plots <- lapply(X = pp_temp, FUN = function(p) p + theme(axis.title = element_text(size = 22),
axis.text = element_text(size = 22),
plot.title = element_text(family = 'sans',face='italic',size=22),
axis.line = element_line(size = 1.5),
axis.ticks = element_line(size = 1.2),
legend.text = element_text(size = 22),
legend.key.height = unit(1.8,"line"),
legend.key.width = unit(1.2,"line")))
pp = CombinePlots(plots = plots,ncol = 4,legend = 'right')
print(pp)
dev.off()
Figure 3. UMAP projection of canonical markers for different cell types.png
#根据以上注释结果对细胞群添加注释信息
nCoV.integrated1 <- RenameIdents(nCoV.integrated, '12'='Epithelial','18'='Epithelial','26'='Epithelial','28'='Epithelial',
'0'='Macrophages','1'='Macrophages','2'='Macrophages','3'='Macrophages','4'='Macrophages','5'='Macrophages','6'='Macrophages','8'='Macrophages',
'10'='Macrophages','11'='Macrophages','15'='Macrophages','16'='Macrophages','20'='Macrophages','22'='Macrophages','23'='Macrophages','30'='Macrophages',
'29'='Mast','7'='T','9'='T','13'='T','19'='NK','24'='Plasma','25'='Plasma',
'14'='Neutrophil','17'='mDC','27'='pDC','21'='Doublets')
#计算各类细胞比例
nCoV.integrated1[["cluster"]] <- Idents(object = nCoV.integrated1)
big.cluster = nCoV.integrated1@meta.data
organ.summary = as.data.frame.matrix(table(big.cluster$sample,big.cluster$cluster))
organ.summary1 = organ.summary %>% select(c('Macrophages','Neutrophil','mDC','pDC','T','NK','Plasma','Mast'))
sample.percent <- round(organ.summary1 / rowSums(organ.summary1),3)
###添加分组信息
samples = read.delim2("../meta.txt",header = TRUE, stringsAsFactors = FALSE,check.names = FALSE, sep = "\t")
rownames(samples)=samples$sample
sample.percent$sample <- samples[rownames(sample.percent),'sample']
sample.percent$group <- samples[rownames(sample.percent),'group']
###作图展示
pplist = list()
nCoV_groups = c('Macrophages','Neutrophil','mDC','pDC','T','NK','Plasma','Mast')
for(group_ in nCoV_groups){
sample.percent_ = sample.percent %>% select(one_of(c('sample','group',group_)))
colnames(sample.percent_) = c('sample','group','percent')
sample.percent_$percent = as.numeric(sample.percent_$percent)
sample.percent_ <- sample.percent_ %>% group_by(group) %>% mutate(upper = quantile(percent, 0.75), lower = quantile(percent, 0.25),mean = mean(percent),median = median(percent))
print(group_)
print(sample.percent_$median)
pp1 = ggplot(sample.percent_,aes(x=group,y=percent)) + geom_jitter(shape = 21,aes(fill=group),width = 0.25) + stat_summary(fun=mean, geom="point", color="grey60") +
theme_cowplot() +
theme(axis.text = element_text(size = 6),axis.title = element_text(size = 6),legend.text = element_text(size = 6),
legend.title = element_text(size = 6),plot.title = element_text(size = 6,face = 'plain'),legend.position = 'none') + labs(title = group_,y='Percentage') +
geom_errorbar(aes(ymin = lower, ymax = upper),col = "grey60",width = 0.25)
###组间t检验分析
labely = max(sample.percent_$percent)
compare_means(percent ~ group, data = sample.percent_)
my_comparisons <- list( c("HC", "M"), c("M", "S"), c("HC", "S") )
pp1 = pp1 + stat_compare_means(comparisons = my_comparisons,size = 1,method = "t.test")
pplist[[group_]] = pp1
}
pdf(file="sample-percentage.pdf", width = 6, height = 3)
print(plot_grid(pplist[['Macrophages']],pplist[['Neutrophil']],pplist[['mDC']],pplist[['pDC']],pplist[['T']],pplist[['NK']],pplist[['Plasma']],pplist[['Mast']],ncol = 4, nrow = 2))
dev.off()
Figure 4. The proportions of the major BALF immune cell types in different groups.png
5 巨噬细胞分析
这一部分主要是对巨噬细胞进行分析,R语言代码如下:
###按照前面细胞marker gene,我们鉴定到16群巨噬细胞,首先提取巨噬细胞子集
nCoV.integrated = readRDS(file = "../nCoV.rds")
Macrophage = subset(nCoV.integrated,idents = c('0','1','2','3','4','5','6','8','10','11','15','16','20','22','23','30'))
###将不同样本的巨噬细胞分开,之后通过IntegrateData矫正批次效应
DefaultAssay(Macrophage) <- "RNA"
SubseMacrophages.list <- SplitObject(Macrophage, split.by = "sample")
for (i in 1:length(SubseMacrophages.list)) {
SubseMacrophages.list[[i]] <- NormalizeData(SubseMacrophages.list[[i]], verbose = FALSE)
SubseMacrophages.list[[i]] <- FindVariableFeatures(SubseMacrophages.list[[i]], selection.method = "vst", nfeatures = 2000,verbose = FALSE)
}
samples_name = c('C51','C52','C100','GSM3660650','C141','C142','C144','C143','C145','C146','C148','C149','C152')
reference.list <- SubseMacrophages.list[samples_name]
Macrophage.temp <- FindIntegrationAnchors(object.list = reference.list, dims = 1:50,k.filter = 115)
Macrophage.Integrated <- IntegrateData(anchorset = Macrophage.temp, dims = 1:50)
### 这部分代码与前面相同,先对整合后的RNA矩阵进行Normalize和Scale
DefaultAssay(nCoV.integrated) <- "RNA"
nCoV.integrated[['percent.mito']] <- PercentageFeatureSet(nCoV.integrated, pattern = "^MT-")
nCoV.integrated <- NormalizeData(object = nCoV.integrated, normalization.method = "LogNormalize", scale.factor = 1e4)
nCoV.integrated <- FindVariableFeatures(object = nCoV.integrated, selection.method = "vst", nfeatures = 2000,verbose = FALSE)
nCoV.integrated <- ScaleData(nCoV.integrated, verbose = FALSE, vars.to.regress = c("nCount_RNA", "percent.mito"))
### DefaultAssay设置为“integrated”矩阵并进行下游分析,包括降维与聚类等
DefaultAssay(Macrophage.Integrated) <- "integrated"
Macrophage.Integrated <- ScaleData(Macrophage.Integrated, verbose = FALSE, vars.to.regress = c("nCount_RNA", "percent.mito"))
Macrophage.Integrated <- RunPCA(Macrophage.Integrated, verbose = FALSE)
#visulaization pca result
Macrophage.Integrated <- ProjectDim(object = Macrophage.Integrated)
###选择前50个PCA维度进行聚类,并采用tSNE和UMAP进行降维展示
Macrophage.Integrated <- FindNeighbors(object = Macrophage.Integrated, dims = 1:50)
Macrophage.Integrated <- FindClusters(object = Macrophage.Integrated, resolution = 0.8)
Macrophage.Integrated <- RunTSNE(object = Macrophage.Integrated, dims = 1:50)
Macrophage.Integrated <- RunUMAP(Macrophage.Integrated, reduction = "pca", dims = 1:50)
###巨噬细胞UMAP展示
Macrophage.Integrated <- RunUMAP(Macrophage.Integrated, reduction = "pca", dims = 1:50)
png(file="2-umap-1.png", width = dpi*8, height = dpi*6, units = "px",res = dpi,type='cairo')
pp = DimPlot(object = Macrophage.Integrated, reduction = 'umap',label = T,pt.size = 0.8,label.size = 6,repel = TRUE)
pp = pp + theme(axis.title = element_text(size = 16),axis.text = element_text(size = 16),
legend.text = element_text(size = 16),axis.line = element_line(size = 1))
print(pp)
dev.off()
Figure 5. The UMAP presentation of macrophages..png
在此我们共鉴定到18群巨噬细胞,与文章中的Supplementary Figure 2A基本一致。
###巨噬细胞marker gene热图
dpi = 300
markers = c('SPP1','FCN1','IL1B','MAFB','FABP4','MARCO','INHBA','TREM2')
png(file="umap_marker.png", width = dpi*13, height = dpi*6, units = "px",res = dpi,type='cairo')
pp_temp = FeaturePlot(object = Macrophage.Integrated, ncol = 4, features = markers,cols = c("lightgrey","#ff0000"),combine = FALSE,pt.size = 0.8)
plots <- lapply(X = pp_temp, FUN = function(p) p + theme(plot.title = element_text(family = 'sans',face='italic',size=16),legend.position = 'right') +
theme(axis.line = element_line(size = 0.8),axis.ticks = element_line(size = 0.8),
axis.text = element_text(size = 16),axis.title = element_text(size = 16),
legend.text = element_text(size =16)))
pp = CombinePlots(plots = plots,ncol = 4,legend = 'right')
print(pp)
dev.off()
Figure 6. UMAP plots showing the expression of several markers on BALF macrophages.png
据Figure 6可将巨噬细胞分为四组:FCN1hi(group 1),FCN1loSPP1+(group 2),SPP1+(group 3),FABP4+(group 4),在此基础上进行下游分析。
网友评论