Seurat V3 学习(一)

作者: 一路向前_莫问前程_前程似锦 | 来源:发表于2019-10-28 16:33 被阅读0次

Seurat V3 学习(一)
Harmony整合单细胞数
Seurat V3 学习(二)
Seurat对象中NicheNet分析以及circos可视化
Seurat 基于celltype类型进行层次聚类
单细胞转录组3大R包系列学习笔记
使用Shiny搭建基于Seurat包的单细胞数据可视化平台
Seurat::DoHeatmap 的谜之报错
单细胞组学系列学习笔记
单细胞多组学 —— R包 Seurat 4.0 初探（Guide

读取数据（10X）

pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19/")
pryr::otype(pbmc.data)

结果就是S4类，是一个dgCMatrix

初始化raw data数据,变为Seurat对象，仍为S4类

pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)

过滤线粒体基因（用所有以MT-开头的基因作为一组线粒体基因）,增加到metadata中去

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
pbmc@meta.data

红细胞比例基因

c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # 人类血液常见红细胞基因

HB.genes_total <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # 人类血液常见红细胞基因
HB_m <- match(HB.genes_total,rownames(pbmc@assays$RNA))
HB.genes <- rownames(pbmc@assays$RNA)[HB_m]
HB.genes <- HB.genes[!is.na(HB.genes)]
pbmc[["percent.HB"]]<-PercentageFeatureSet(pbmc,features=HB.genes)

image.png

Feature、count、线粒体基因、红细胞基因占比可视化

VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.HB"), ncol = 4) + scale_fill_aaas()

image.png

特征之间的相关性

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))

image.png

过滤细胞和基因

We filter cells that have unique feature counts over 2,500 or less than 200
We filter cells that have >5% mitochondrial counts

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

归一化数据，采用默认参数 NormalizeData 函数，采用归一化方法为LogNormalize, scale.factor = 10000，数据存放在pbmc[["RNA"]]@data,运行前存放的是count 文件，运行之后变为归一化之后的数据

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

高度可变特征的识别(特征选择)

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10)
CombinePlots(plots = list(plot1, plot2))

image.png

Scaling the data 缩放数据 (标准化),数据存储在 pbmc[["RNA"]]@scale.data

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)

去除不想要的来源的变异，例如线粒体污染造成的异质性，细胞循环阶段造成的异质性等

pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt")

线性降维 PCA

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

每个细胞在PC轴上的坐标 ------------------------------------------------------------

head(pbmc@reductions$pca@cell.embeddings)

每个基因对每个PC轴的贡献度（loading值） ---------------------------------------------

head(pbmc@reductions$pca@feature.loadings)

Get the feature loadings for a given DimReduc -------------------------

t(Loadings(object = pbmc[["pca"]])[1:5,1:5])

Get the feature loadings for a specified DimReduc in a Seurat object--------

t(Loadings(object = pbmc, reduction = "pca")[1:5,1:5])

image.png

可视化

VizDimLoadings(pbmc, dims = 1:3, reduction = "pca")

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

image.png

选择合适的pc成分，有两种方法，一种是JackStraw函数实现(耗时最长)，一种是ElbowPlot函数实现

pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)

plot1<-JackStrawPlot(pbmc, dims = 1:15)


# 第二种方法 -------------------------------------------------------------------

plot2<-ElbowPlot(pbmc)

CombinePlots(plots = list(plot1, plot2),legend="bottom")

image.png

每个轴显著相关的基因，这个也可以作为后面分析选择基因的一个参考

head(PCASigGenes(pbmc,pcs.use=1:2,pval.cut = 0.1))

聚类分析(UMAP/TSNE 两种方法)

> pbmc <- FindNeighbors(pbmc, dims = 1:10)
Computing nearest neighbor graph
Computing SNN
> pbmc <- FindClusters(pbmc, resolution = 0.5)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 95930

Running Louvain algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8737
Number of communities: 9
Elapsed time: 0 seconds
> 
> # 查看每一类有多少个细胞
> 
> table(pbmc@active.ident) 

  0   1   2   3   4   5   6   7   8 
709 480 432 342 313 162 154  32  14 
>

提取某一类细胞也就是所谓的提取某一cluster内细胞

> head(subset(as.data.frame(pbmc@active.ident),pbmc@active.ident=="2"))
               pbmc@active.ident
AAACATTGATCAGC                 2
AAACGCACTGGTAC                 2
AAAGCCTGTATGCG                 2
AAATCAACTCGCAA                 2
AAATGTTGCCACAA                 2
AAATTGACTCGCTC                 2
> dim(subset(as.data.frame(pbmc@active.ident),pbmc@active.ident=="2"))
[1] 432   1

提取某一cluster细胞。

subpbmc<-subset(x = pbmc,idents="2")
subpbmc

head(WhichCells(pbmc,idents="2"))

WhichCells(pbmc,idents="2")

查找前5个细胞的cluster ID

> head(Idents(pbmc), 5)
AAACATACAACCAC AAACATTGAGCTAC AAACATTGATCAGC AAACCGTGCTTCCG AAACCGTGTATGCG 
             0              3              2              1              6 
Levels: 0 1 2 3 4 5 6 7 8

系统发育分析

pbmc<-BuildClusterTree(pbmc)
Tool(object = pbmc, slot = 'BuildClusterTree')

PlotClusterTree(pbmc)

非线性降维UMAP

pbmc <- RunUMAP(pbmc, dims = 1:10)

非线性降维TSNE

pbmc <- RunTSNE(pbmc, dims = 1:10)

image.png

比较一下两个可视化的结果

> plot1<-DimPlot(pbmc, reduction = "umap",label = TRUE)+scale_color_npg()
> plot2<-DimPlot(pbmc, reduction = "tsne",label = TRUE)+scale_color_npg()
> CombinePlots(plots = list(plot1, plot2),legend="bottom")

image.png

pbmc[["RNA"]]@var.features[1:10] # 高变基因
pbmc[["RNA"]]@counts ###raw data count
pbmc[["RNA"]]@data ### normalize data
pbmc[["RNA"]]@scale.data ### scale data
pbmc[["RNA"]]@meta.features
pbmc[["RNA"]]@misc