- 读取数据(10X)
pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19/")
pryr::otype(pbmc.data)
结果就是S4类,是一个dgCMatrix
- 初始化raw data数据,变为Seurat对象,仍为S4类
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
- 过滤线粒体基因(用所有以MT-开头的基因作为一组线粒体基因),增加到metadata中去
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
pbmc@meta.data
- 红细胞比例基因
c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # 人类血液常见红细胞基因
HB.genes_total <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # 人类血液常见红细胞基因
HB_m <- match(HB.genes_total,rownames(pbmc@assays$RNA))
HB.genes <- rownames(pbmc@assays$RNA)[HB_m]
HB.genes <- HB.genes[!is.na(HB.genes)]
pbmc[["percent.HB"]]<-PercentageFeatureSet(pbmc,features=HB.genes)

- Feature、count、线粒体基因、红细胞基因占比可视化
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.HB"), ncol = 4) + scale_fill_aaas()

- 特征之间的相关性
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))

- 过滤细胞和基因
We filter cells that have unique feature counts over 2,500 or less than 200
We filter cells that have >5% mitochondrial counts
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
- 归一化数据,采用默认参数 NormalizeData 函数,采用归一化方法为LogNormalize, scale.factor = 10000,数据存放在
pbmc[["RNA"]]@data
,运行前存放的是count 文件,运行之后变为归一化之后的数据
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
- 高度可变特征的识别(特征选择)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10)
CombinePlots(plots = list(plot1, plot2))

- Scaling the data 缩放数据 (标准化),数据存储在
pbmc[["RNA"]]@scale.data
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
- 去除不想要的来源的变异,例如线粒体污染造成的异质性,细胞循环阶段造成的异质性等
pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt")
- 线性降维 PCA
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
- 每个细胞在PC轴上的坐标 ------------------------------------------------------------
head(pbmc@reductions$pca@cell.embeddings)
- 每个基因对每个PC轴的贡献度(loading值) ---------------------------------------------
head(pbmc@reductions$pca@feature.loadings)
- Get the feature loadings for a given DimReduc -------------------------
t(Loadings(object = pbmc[["pca"]])[1:5,1:5])
- Get the feature loadings for a specified DimReduc in a Seurat object--------
t(Loadings(object = pbmc, reduction = "pca")[1:5,1:5])

- 可视化
VizDimLoadings(pbmc, dims = 1:3, reduction = "pca")
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

- 选择合适的pc成分,有两种方法,一种是JackStraw函数实现(耗时最长),一种是ElbowPlot函数实现
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
plot1<-JackStrawPlot(pbmc, dims = 1:15)
# 第二种方法 -------------------------------------------------------------------
plot2<-ElbowPlot(pbmc)
CombinePlots(plots = list(plot1, plot2),legend="bottom")

- 每个轴显著相关的基因,这个也可以作为后面分析选择基因的一个参考
head(PCASigGenes(pbmc,pcs.use=1:2,pval.cut = 0.1))
- 聚类分析(UMAP/TSNE 两种方法)
> pbmc <- FindNeighbors(pbmc, dims = 1:10)
Computing nearest neighbor graph
Computing SNN
> pbmc <- FindClusters(pbmc, resolution = 0.5)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 95930
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8737
Number of communities: 9
Elapsed time: 0 seconds
>
> # 查看每一类有多少个细胞
>
> table(pbmc@active.ident)
0 1 2 3 4 5 6 7 8
709 480 432 342 313 162 154 32 14
>
- 提取某一类细胞也就是所谓的提取某一cluster内细胞
> head(subset(as.data.frame(pbmc@active.ident),pbmc@active.ident=="2"))
pbmc@active.ident
AAACATTGATCAGC 2
AAACGCACTGGTAC 2
AAAGCCTGTATGCG 2
AAATCAACTCGCAA 2
AAATGTTGCCACAA 2
AAATTGACTCGCTC 2
> dim(subset(as.data.frame(pbmc@active.ident),pbmc@active.ident=="2"))
[1] 432 1
- 提取某一cluster细胞。
subpbmc<-subset(x = pbmc,idents="2")
subpbmc
head(WhichCells(pbmc,idents="2"))
WhichCells(pbmc,idents="2")
- 查找前5个细胞的cluster ID
> head(Idents(pbmc), 5)
AAACATACAACCAC AAACATTGAGCTAC AAACATTGATCAGC AAACCGTGCTTCCG AAACCGTGTATGCG
0 3 2 1 6
Levels: 0 1 2 3 4 5 6 7 8
- 系统发育分析
pbmc<-BuildClusterTree(pbmc)
Tool(object = pbmc, slot = 'BuildClusterTree')
PlotClusterTree(pbmc)
- 非线性降维UMAP
pbmc <- RunUMAP(pbmc, dims = 1:10)
- 非线性降维TSNE
pbmc <- RunTSNE(pbmc, dims = 1:10)

- 比较一下两个可视化的结果
> plot1<-DimPlot(pbmc, reduction = "umap",label = TRUE)+scale_color_npg()
> plot2<-DimPlot(pbmc, reduction = "tsne",label = TRUE)+scale_color_npg()
> CombinePlots(plots = list(plot1, plot2),legend="bottom")

pbmc[["RNA"]]@var.features[1:10] # 高变基因
pbmc[["RNA"]]@counts ###raw data count
pbmc[["RNA"]]@data ### normalize data
pbmc[["RNA"]]@scale.data ### scale data
pbmc[["RNA"]]@meta.features
pbmc[["RNA"]]@misc
网友评论