官网教程:https://satijalab.org/signac/articles/monocle.html
在这一篇官网教程里,主要是演示了如何利用Monocle3来构建单细胞ATAC-seq数据的细胞“轨迹”。
为了方便Seurat (Signac使用)和CellDataSet (Cicero使用)格式之间的转换,我们将使用GitHub上提供的SeuratWrappers 包中的一个转换函数 。
数据下载
我们使用的单细胞ATAC-seq数据包含人类CD34+造血干细胞和祖细胞,这个数据发表在Satpathy and Granja et al. (2019, Nature Biotechnology)上。
处理好的dataset可以在NCBI GEO下载: 这里
你需要下载的是下面红色框里的:
![](https://img.haomeiwen.com/i18922188/eea70b3f1890b43f.png)
还需要下载:
![](https://img.haomeiwen.com/i18922188/68094a00b3d32986.png)
![](https://img.haomeiwen.com/i18922188/74fa84663ec7265b.png)
都下载完成后,可以开始进行练习了~
数据预处理
> library(Signac)
> library(Seurat)
> library(SeuratWrappers)
> library(monocle3)
> library(Matrix)
> library(ggplot2)
> library(patchwork)
> set.seed(1234)
上面要安装的SeuratWrappers包的安装方法见:https://github.com/satijalab/seurat-wrappers,如果没有报错当然更好,如果报错了按照报错的提示,缺哪个包,就安装哪个包就行了。
> peaks <- read.table(paste0("./GSE129785_scATAC-Hematopoiesis-CD34.peaks.txt"), header = TRUE)
> cells <- read.table(paste0( "./GSE129785_scATAC-Hematopoiesis-CD34.cell_barcodes.txt"), header = TRUE, stringsAsFactors = FALSE)
> head(peaks)
Feature
1 chr1_10238_10738
2 chr1_115476_115976
3 chr1_236856_237356
4 chr1_237511_238011
5 chr1_240801_241301
6 chr1_250084_250584
> head(cells)
UMAP1 UMAP2 Clusters Group depth FRIP Barcodes Internal_Name Group_Barcode
1 -2.419656 7.603472 Cluster6 Bone_Marrow_Rep1 29301 0.3471895 CAAGAAAGTCAAGACG-1 SUHealthy_BM_B1_50 Bone_Marrow_Rep1
2 -7.153399 -5.512960 Cluster11 Bone_Marrow_Rep1 29608 0.4491860 AAAGGGCAGTACCCAT-1 SUHealthy_BM_B1_52 Bone_Marrow_Rep1
3 -8.475636 -5.377475 Cluster9 Bone_Marrow_Rep1 16810 0.4986318 CACAACATCGTGGGTC-1 SUHealthy_BM_B1_53 Bone_Marrow_Rep1
4 -6.706686 12.041433 Cluster2 Bone_Marrow_Rep1 47369 0.3847031 TTACCCGTCTGATCCC-1 SUHealthy_BM_B1_55 Bone_Marrow_Rep1
5 -7.591989 -3.001234 Cluster9 Bone_Marrow_Rep1 23342 0.4957159 CTTTGCGGTACGGTTT-1 SUHealthy_BM_B1_56 Bone_Marrow_Rep1
6 -6.898403 11.516414 Cluster2 Bone_Marrow_Rep1 78378 0.4175853 GGTTGCGCAGGGAGTT-1 SUHealthy_BM_B1_57 Bone_Marrow_Rep1
> rownames(cells) <- make.unique(cells$Barcodes)
> head(cells)
UMAP1 UMAP2 Clusters Group depth FRIP Barcodes Internal_Name
CAAGAAAGTCAAGACG-1 -2.419656 7.603472 Cluster6 Bone_Marrow_Rep1 29301 0.3471895 CAAGAAAGTCAAGACG-1 SUHealthy_BM_B1_50
AAAGGGCAGTACCCAT-1 -7.153399 -5.512960 Cluster11 Bone_Marrow_Rep1 29608 0.4491860 AAAGGGCAGTACCCAT-1 SUHealthy_BM_B1_52
CACAACATCGTGGGTC-1 -8.475636 -5.377475 Cluster9 Bone_Marrow_Rep1 16810 0.4986318 CACAACATCGTGGGTC-1 SUHealthy_BM_B1_53
TTACCCGTCTGATCCC-1 -6.706686 12.041433 Cluster2 Bone_Marrow_Rep1 47369 0.3847031 TTACCCGTCTGATCCC-1 SUHealthy_BM_B1_55
CTTTGCGGTACGGTTT-1 -7.591989 -3.001234 Cluster9 Bone_Marrow_Rep1 23342 0.4957159 CTTTGCGGTACGGTTT-1 SUHealthy_BM_B1_56
GGTTGCGCAGGGAGTT-1 -6.898403 11.516414 Cluster2 Bone_Marrow_Rep1 78378 0.4175853 GGTTGCGCAGGGAGTT-1 SUHealthy_BM_B1_57
Group_Barcode
CAAGAAAGTCAAGACG-1 Bone_Marrow_Rep1
AAAGGGCAGTACCCAT-1 Bone_Marrow_Rep1
CACAACATCGTGGGTC-1 Bone_Marrow_Rep1
TTACCCGTCTGATCCC-1 Bone_Marrow_Rep1
CTTTGCGGTACGGTTT-1 Bone_Marrow_Rep1
GGTTGCGCAGGGAGTT-1 Bone_Marrow_Rep1
> mtx <- readMM(file = paste0("./GSE129785_scATAC-Hematopoiesis-CD34.mtx.gz"))
> mtx <- as(object = mtx, Class = "dgCMatrix")
> colnames(mtx) <- rownames(cells)
> rownames(mtx) <- peaks$Feature
> bone_assay <- CreateChromatinAssay(
counts = mtx,
min.cells = 5,
fragments = "./GSM3722029_CD34_Progenitors_Rep1_fragments.tsv.gz",
sep = c("_", "_"),
genome = "hg19"
)
Computing hash
Checking for 18489 cell barcodes
> bone <- CreateSeuratObject(
counts = bone_assay,
meta.data = cells,
assay = "ATAC"
)
# 这个dataset包含多种细胞类型
# 我们可以只保留Group_barcode里是CD34_Progenitors_Rep1的数据
> bone <- bone[, bone$Group_Barcode == "CD34_Progenitors_Rep1"]
# 添加细胞类型注释(来自文献)
> cluster_names <- c("HSC", "MEP", "CMP-BMP", "LMPP", "CLP", "Pro-B", "Pre-B", "GMP",
"MDP", "pDC", "cDC", "Monocyte-1", "Monocyte-2", "Naive-B", "Memory-B",
"Plasma-cell", "Basophil", "Immature-NK", "Mature-NK1", "Mature-NK2", "Naive-CD4-T1",
"Naive-CD4-T2", "Naive-Treg", "Memory-CD4-T", "Treg", "Naive-CD8-T1", "Naive-CD8-T2",
"Naive-CD8-T3", "Central-memory-CD8-T", "Effector-memory-CD8-T", "Gamma delta T")
> num.labels <- length(cluster_names)
> num.labels
[1] 31
> names(cluster_names) <- paste0(rep("Cluster",num.labels),seq(num.labels))
> cluster_names #这样每一个细胞类型都有一个cluster编号
Cluster1 Cluster2 Cluster3
"HSC" "MEP" "CMP-BMP"
Cluster4 Cluster5 Cluster6
"LMPP" "CLP" "Pro-B"
Cluster7 Cluster8 Cluster9
"Pre-B" "GMP" "MDP"
Cluster10 Cluster11 Cluster12
"pDC" "cDC" "Monocyte-1"
Cluster13 Cluster14 Cluster15
"Monocyte-2" "Naive-B" "Memory-B"
Cluster16 Cluster17 Cluster18
"Plasma-cell" "Basophil" "Immature-NK"
Cluster19 Cluster20 Cluster21
"Mature-NK1" "Mature-NK2" "Naive-CD4-T1"
Cluster22 Cluster23 Cluster24
"Naive-CD4-T2" "Naive-Treg" "Memory-CD4-T"
Cluster25 Cluster26 Cluster27
"Treg" "Naive-CD8-T1" "Naive-CD8-T2"
Cluster28 Cluster29 Cluster30
"Naive-CD8-T3" "Central-memory-CD8-T" "Effector-memory-CD8-T"
Cluster31
"Gamma delta T"
> bone$celltype <- cluster_names[as.character(bone$Clusters)]
> bone[["ATAC"]]
ChromatinAssay data with 570154 features for 9913 cells
Variable features: 0
Genome: hg19
Annotation present: FALSE
Motifs present: FALSE
Fragment files: 1
下一步就是加基因注释到object里了:
#add gene annotations for the hg19 genome to the object
> library(EnsDb.Hsapiens.v75)
# extract gene annotations from EnsDb
> annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v75)
# change to UCSC style since the data was mapped to hg19
> seqlevelsStyle(annotations) <- 'UCSC'
> genome(annotations) <- "hg19"
# add the gene information to the object
> Annotation(bone) <- annotations
质量控制
现在要计算转录起始位点的reads富集,核小体信号分数,以及落在基因组“黑名单”区域内的count数,利用这些参数来过滤我们的低质量细胞。
> bone <- TSSEnrichment(bone)
Extracting TSS positions
Extracting fragments at TSSs
|========================================================================| 100%
Computing TSS enrichment score
> bone <- NucleosomeSignal(bone)
Found 18489 cell barcodes
Done Processing 99 million lines
> bone$blacklist_fraction <- FractionCountsInRegion(bone, regions = blacklist_hg19)
> VlnPlot(
object = bone,
features = c("nCount_ATAC", "TSS.enrichment", "nucleosome_signal", "blacklist_fraction"),
pt.size = 0.1,
ncol = 4
)
![](https://img.haomeiwen.com/i18922188/2f53c855c4fa7c69.png)
#过滤细胞
> bone <- bone[, (bone$nCount_ATAC < 50000) &
(bone$TSS.enrichment > 2) &
(bone$nucleosome_signal < 5)]
数据处理
现在运行标准的scATAC-seq分析流程(使用Signac包)进行降维处理、聚类以及细胞类型的注释。
> bone <- FindTopFeatures(bone, min.cells = 10)
> bone <- RunTFIDF(bone)
Performing TF-IDF normalization
Warning message:
In RunTFIDF.default(object = GetAssayData(object = object, slot = "counts"), :
Some features contain 0 total counts
> bone <- RunSVD(bone, n = 100)
Running SVD
Scaling cell embeddings
> DepthCor(bone)
![](https://img.haomeiwen.com/i18922188/bffed11888b99553.png)
> bone <- RunUMAP(
bone,
reduction = "lsi",
dims = 2:100,
reduction.name = "UMAP"
)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
12:25:45 UMAP embedding parameters a = 0.9922 b = 1.112
12:25:46 Read 9765 rows and found 99 numeric columns
12:25:46 Using Annoy for neighbor search, n_neighbors = 30
12:25:46 Building Annoy index with metric = cosine, n_trees = 50
12:25:57 Writing NN index file to temp file
12:25:57 Searching Annoy index using 1 thread, search_k = 3000
12:26:01 Annoy recall = 100%
12:26:14 Commencing smooth kNN distance calibration using 1 thread
12:26:20 Initializing from normalized Laplacian + noise
12:26:21 Commencing optimization for 500 epochs, with 461970 positive edges
12:27:00 Optimization finished
> bone <- FindNeighbors(bone, dims = 2:100, reduction = "lsi")
Computing nearest neighbor graph
Computing SNN
> bone <- FindClusters(bone, resolution = 0.8, algorithm = 3)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 9765
Number of edges: 633636
Running smart local moving algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8206
Number of communities: 61
Elapsed time: 11 seconds
45 singletons identified. 16 final clusters.
> DimPlot(bone, label = TRUE) + NoLegend()
![](https://img.haomeiwen.com/i18922188/2100482dab43408d.png)
根据文献,给每一个cluster分配most common cell type:
> for(i in levels(bone)) {
cells_to_reid <- WhichCells(bone, idents = i)
newid <- names(sort(table(bone$celltype[cells_to_reid]),decreasing=TRUE))[1]
Idents(bone, cells = cells_to_reid) <- newid
}
> bone$assigned_celltype <- Idents(bone)
> DimPlot(bone, label = TRUE)
![](https://img.haomeiwen.com/i18922188/212e798083fba5d8.png)
下一步我们取不同的谱系子集,并且构建每一个谱系的轨迹。另一个方法是构建一个轨迹,这个轨迹可以用于整体的dataset,然后再分别构建不同细胞群的伪时间轨迹。
#取不同的子集
> DefaultAssay(bone) <- "ATAC"
> erythroid <- bone[, bone$assigned_celltype %in% c("HSC", "MEP", "CMP-BMP")]
> lymphoid <- bone[, bone$assigned_celltype %in% c("HSC", "LMPP", "GMP", "CLP", "Pro-B", "pDC", "MDP", "GMP")]
利用Monocle3构建“轨迹”
我们可以利用[as.cell_data_set()]
功能把seurat对象转换成CellDataSet对象,再构建轨迹。这里我们将分别处理红系和淋巴细胞谱系:
> erythroid.cds <- as.cell_data_set(erythroid)
> erythroid.cds <- cluster_cells(cds = erythroid.cds, reduction_method = "UMAP")
> erythroid.cds <- learn_graph(erythroid.cds, use_partition = TRUE)
|==================================================================| 100%
> lymphoid.cds <- as.cell_data_set(lymphoid)
> lymphoid.cds <- cluster_cells(cds = lymphoid.cds, reduction_method = "UMAP")
> lymphoid.cds <- learn_graph(lymphoid.cds, use_partition = TRUE)
|==================================================================| 100%
为了计算每个轨迹的伪时间,我们需要决定每个轨迹的起点是什么。在我们的例子中,我们知道造血干细胞是轨迹中其他类型细胞的祖细胞,所以我们可以把这些细胞设为轨迹的根(root)。Monocle 3包括一个交互功能,在图中选择细胞作为根节点。调用order_cells()
但不指定root_cells参数,将启动此函数自动寻找根节点。在这里,我们预先选择了一些细胞作为root,并将它们保存到一个文件中,以便重现结果。这个文件可以在 这里下载。
# load the pre-selected HSCs
> hsc <- readLines("./hsc_cells.txt")
# order cells
> erythroid.cds <- order_cells(erythroid.cds, reduction_method = "UMAP", root_cells = hsc)
> lymphoid.cds <- order_cells(lymphoid.cds, reduction_method = "UMAP", root_cells = hsc)
# plot trajectories colored by pseudotime
> plot_cells(
cds = erythroid.cds,
color_cells_by = "pseudotime",
show_trajectory_graph = TRUE
)
![](https://img.haomeiwen.com/i18922188/44d21491681b6889.png)
> plot_cells(
cds = lymphoid.cds,
color_cells_by = "pseudotime",
show_trajectory_graph = TRUE
)
![](https://img.haomeiwen.com/i18922188/1374fee63dbe104e.png)
提取“伪时间”数值,添加到Seurat对象里:
> bone <- AddMetaData(
object = bone,
metadata = erythroid.cds@principal_graph_aux@listData$UMAP$pseudotime,
col.name = "Erythroid"
)
> bone <- AddMetaData(
object = bone,
metadata = lymphoid.cds@principal_graph_aux@listData$UMAP$pseudotime,
col.name = "Lymphoid"
)
> FeaturePlot(bone, c("Erythroid", "Lymphoid"), pt.size = 0.1) & scale_color_viridis_c()
![](https://img.haomeiwen.com/i18922188/854a9fa5e22fca8a.png)
网友评论