美文网首页
[scATAC-seq]利用Monocle3构建细胞“轨迹”

[scATAC-seq]利用Monocle3构建细胞“轨迹”

作者: 生信start_site | 来源:发表于2020-09-25 11:34 被阅读0次

官网教程:https://satijalab.org/signac/articles/monocle.html

在这一篇官网教程里,主要是演示了如何利用Monocle3来构建单细胞ATAC-seq数据的细胞“轨迹”。

为了方便Seurat (Signac使用)和CellDataSet (Cicero使用)格式之间的转换,我们将使用GitHub上提供的SeuratWrappers 包中的一个转换函数 。

数据下载

我们使用的单细胞ATAC-seq数据包含人类CD34+造血干细胞和祖细胞,这个数据发表在Satpathy and Granja et al. (2019, Nature Biotechnology)上。

处理好的dataset可以在NCBI GEO下载: 这里

你需要下载的是下面红色框里的:

还需要下载:

点击custom后,会弹出里面包含的所有文件

都下载完成后,可以开始进行练习了~

数据预处理

> library(Signac)
> library(Seurat)
> library(SeuratWrappers)
> library(monocle3)
> library(Matrix)
> library(ggplot2)
> library(patchwork)
> set.seed(1234)

上面要安装的SeuratWrappers包的安装方法见:https://github.com/satijalab/seurat-wrappers,如果没有报错当然更好,如果报错了按照报错的提示,缺哪个包,就安装哪个包就行了。

> peaks <- read.table(paste0("./GSE129785_scATAC-Hematopoiesis-CD34.peaks.txt"), header = TRUE)
> cells <- read.table(paste0( "./GSE129785_scATAC-Hematopoiesis-CD34.cell_barcodes.txt"), header = TRUE, stringsAsFactors = FALSE)
> head(peaks)
             Feature
1   chr1_10238_10738
2 chr1_115476_115976
3 chr1_236856_237356
4 chr1_237511_238011
5 chr1_240801_241301
6 chr1_250084_250584
> head(cells)
      UMAP1     UMAP2  Clusters            Group depth      FRIP           Barcodes      Internal_Name    Group_Barcode
1 -2.419656  7.603472  Cluster6 Bone_Marrow_Rep1 29301 0.3471895 CAAGAAAGTCAAGACG-1 SUHealthy_BM_B1_50 Bone_Marrow_Rep1
2 -7.153399 -5.512960 Cluster11 Bone_Marrow_Rep1 29608 0.4491860 AAAGGGCAGTACCCAT-1 SUHealthy_BM_B1_52 Bone_Marrow_Rep1
3 -8.475636 -5.377475  Cluster9 Bone_Marrow_Rep1 16810 0.4986318 CACAACATCGTGGGTC-1 SUHealthy_BM_B1_53 Bone_Marrow_Rep1
4 -6.706686 12.041433  Cluster2 Bone_Marrow_Rep1 47369 0.3847031 TTACCCGTCTGATCCC-1 SUHealthy_BM_B1_55 Bone_Marrow_Rep1
5 -7.591989 -3.001234  Cluster9 Bone_Marrow_Rep1 23342 0.4957159 CTTTGCGGTACGGTTT-1 SUHealthy_BM_B1_56 Bone_Marrow_Rep1
6 -6.898403 11.516414  Cluster2 Bone_Marrow_Rep1 78378 0.4175853 GGTTGCGCAGGGAGTT-1 SUHealthy_BM_B1_57 Bone_Marrow_Rep1
> rownames(cells) <- make.unique(cells$Barcodes)
> head(cells)
                       UMAP1     UMAP2  Clusters            Group depth      FRIP           Barcodes      Internal_Name
CAAGAAAGTCAAGACG-1 -2.419656  7.603472  Cluster6 Bone_Marrow_Rep1 29301 0.3471895 CAAGAAAGTCAAGACG-1 SUHealthy_BM_B1_50
AAAGGGCAGTACCCAT-1 -7.153399 -5.512960 Cluster11 Bone_Marrow_Rep1 29608 0.4491860 AAAGGGCAGTACCCAT-1 SUHealthy_BM_B1_52
CACAACATCGTGGGTC-1 -8.475636 -5.377475  Cluster9 Bone_Marrow_Rep1 16810 0.4986318 CACAACATCGTGGGTC-1 SUHealthy_BM_B1_53
TTACCCGTCTGATCCC-1 -6.706686 12.041433  Cluster2 Bone_Marrow_Rep1 47369 0.3847031 TTACCCGTCTGATCCC-1 SUHealthy_BM_B1_55
CTTTGCGGTACGGTTT-1 -7.591989 -3.001234  Cluster9 Bone_Marrow_Rep1 23342 0.4957159 CTTTGCGGTACGGTTT-1 SUHealthy_BM_B1_56
GGTTGCGCAGGGAGTT-1 -6.898403 11.516414  Cluster2 Bone_Marrow_Rep1 78378 0.4175853 GGTTGCGCAGGGAGTT-1 SUHealthy_BM_B1_57
                      Group_Barcode
CAAGAAAGTCAAGACG-1 Bone_Marrow_Rep1
AAAGGGCAGTACCCAT-1 Bone_Marrow_Rep1
CACAACATCGTGGGTC-1 Bone_Marrow_Rep1
TTACCCGTCTGATCCC-1 Bone_Marrow_Rep1
CTTTGCGGTACGGTTT-1 Bone_Marrow_Rep1
GGTTGCGCAGGGAGTT-1 Bone_Marrow_Rep1
> mtx <- readMM(file = paste0("./GSE129785_scATAC-Hematopoiesis-CD34.mtx.gz"))
> mtx <- as(object = mtx, Class = "dgCMatrix")
> colnames(mtx) <- rownames(cells)
> rownames(mtx) <- peaks$Feature
> bone_assay <- CreateChromatinAssay(
   counts = mtx,
   min.cells = 5,
   fragments = "./GSM3722029_CD34_Progenitors_Rep1_fragments.tsv.gz",
   sep = c("_", "_"),
   genome = "hg19"
 )

Computing hash
Checking for 18489 cell barcodes

> bone <- CreateSeuratObject(
   counts = bone_assay,
   meta.data = cells,
   assay = "ATAC"
 )
# 这个dataset包含多种细胞类型
# 我们可以只保留Group_barcode里是CD34_Progenitors_Rep1的数据
> bone <- bone[, bone$Group_Barcode == "CD34_Progenitors_Rep1"]
# 添加细胞类型注释(来自文献)
> cluster_names <- c("HSC",   "MEP",  "CMP-BMP",  "LMPP", "CLP",  "Pro-B",    "Pre-B",    "GMP",
                   "MDP",    "pDC",  "cDC",  "Monocyte-1",   "Monocyte-2",   "Naive-B",  "Memory-B",
                   "Plasma-cell",    "Basophil", "Immature-NK",  "Mature-NK1",   "Mature-NK2",   "Naive-CD4-T1",
                   "Naive-CD4-T2",   "Naive-Treg",   "Memory-CD4-T", "Treg", "Naive-CD8-T1", "Naive-CD8-T2",
                   "Naive-CD8-T3",   "Central-memory-CD8-T", "Effector-memory-CD8-T",    "Gamma delta T")
> num.labels <- length(cluster_names)
> num.labels
[1] 31
> names(cluster_names) <- paste0(rep("Cluster",num.labels),seq(num.labels))
> cluster_names #这样每一个细胞类型都有一个cluster编号
               Cluster1                Cluster2                Cluster3 
                  "HSC"                   "MEP"               "CMP-BMP" 
               Cluster4                Cluster5                Cluster6 
                 "LMPP"                   "CLP"                 "Pro-B" 
               Cluster7                Cluster8                Cluster9 
                "Pre-B"                   "GMP"                   "MDP" 
              Cluster10               Cluster11               Cluster12 
                  "pDC"                   "cDC"            "Monocyte-1" 
              Cluster13               Cluster14               Cluster15 
           "Monocyte-2"               "Naive-B"              "Memory-B" 
              Cluster16               Cluster17               Cluster18 
          "Plasma-cell"              "Basophil"           "Immature-NK" 
              Cluster19               Cluster20               Cluster21 
           "Mature-NK1"            "Mature-NK2"          "Naive-CD4-T1" 
              Cluster22               Cluster23               Cluster24 
         "Naive-CD4-T2"            "Naive-Treg"          "Memory-CD4-T" 
              Cluster25               Cluster26               Cluster27 
                 "Treg"          "Naive-CD8-T1"          "Naive-CD8-T2" 
              Cluster28               Cluster29               Cluster30 
         "Naive-CD8-T3"  "Central-memory-CD8-T" "Effector-memory-CD8-T" 
              Cluster31 
        "Gamma delta T" 
> bone$celltype <- cluster_names[as.character(bone$Clusters)]
> bone[["ATAC"]]
ChromatinAssay data with 570154 features for 9913 cells
Variable features: 0 
Genome: hg19 
Annotation present: FALSE 
Motifs present: FALSE 
Fragment files: 1 

下一步就是加基因注释到object里了:

#add gene annotations for the hg19 genome to the object
> library(EnsDb.Hsapiens.v75)
# extract gene annotations from EnsDb
> annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v75)
# change to UCSC style since the data was mapped to hg19
> seqlevelsStyle(annotations) <- 'UCSC'
> genome(annotations) <- "hg19"
# add the gene information to the object
> Annotation(bone) <- annotations

质量控制

现在要计算转录起始位点的reads富集,核小体信号分数,以及落在基因组“黑名单”区域内的count数,利用这些参数来过滤我们的低质量细胞。

> bone <- TSSEnrichment(bone)
Extracting TSS positions
Extracting fragments at TSSs
  |========================================================================| 100%
Computing TSS enrichment score
> bone <- NucleosomeSignal(bone)
Found 18489 cell barcodes
Done Processing 99 million lines                  
> bone$blacklist_fraction <- FractionCountsInRegion(bone, regions = blacklist_hg19)
> VlnPlot(
   object = bone,
   features = c("nCount_ATAC", "TSS.enrichment", "nucleosome_signal", "blacklist_fraction"),
   pt.size = 0.1,
   ncol = 4
 )
#过滤细胞
> bone <- bone[, (bone$nCount_ATAC < 50000) &
               (bone$TSS.enrichment > 2) &
               (bone$nucleosome_signal < 5)]

数据处理

现在运行标准的scATAC-seq分析流程(使用Signac包)进行降维处理、聚类以及细胞类型的注释。

> bone <- FindTopFeatures(bone, min.cells = 10)
> bone <- RunTFIDF(bone)
Performing TF-IDF normalization
Warning message:
In RunTFIDF.default(object = GetAssayData(object = object, slot = "counts"),  :
  Some features contain 0 total counts
> bone <- RunSVD(bone, n = 100)
Running SVD
Scaling cell embeddings
> DepthCor(bone)
> bone <- RunUMAP(
  bone,
  reduction = "lsi",
  dims = 2:100,
  reduction.name = "UMAP"
)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
12:25:45 UMAP embedding parameters a = 0.9922 b = 1.112
12:25:46 Read 9765 rows and found 99 numeric columns
12:25:46 Using Annoy for neighbor search, n_neighbors = 30
12:25:46 Building Annoy index with metric = cosine, n_trees = 50
12:25:57 Writing NN index file to temp file
12:25:57 Searching Annoy index using 1 thread, search_k = 3000
12:26:01 Annoy recall = 100%
12:26:14 Commencing smooth kNN distance calibration using 1 thread
12:26:20 Initializing from normalized Laplacian + noise
12:26:21 Commencing optimization for 500 epochs, with 461970 positive edges
12:27:00 Optimization finished

> bone <- FindNeighbors(bone, dims = 2:100, reduction = "lsi")
Computing nearest neighbor graph
Computing SNN
> bone <- FindClusters(bone, resolution = 0.8, algorithm = 3)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 9765
Number of edges: 633636

Running smart local moving algorithm...
0%   10   20   30   40   50   60   70   80   90   100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8206
Number of communities: 61
Elapsed time: 11 seconds
45 singletons identified. 16 final clusters.
> DimPlot(bone, label = TRUE) + NoLegend()

根据文献,给每一个cluster分配most common cell type:

> for(i in levels(bone)) {
  cells_to_reid <- WhichCells(bone, idents = i)
  newid <- names(sort(table(bone$celltype[cells_to_reid]),decreasing=TRUE))[1]
  Idents(bone, cells = cells_to_reid) <- newid
}
> bone$assigned_celltype <- Idents(bone)
> DimPlot(bone, label = TRUE)

下一步我们取不同的谱系子集,并且构建每一个谱系的轨迹。另一个方法是构建一个轨迹,这个轨迹可以用于整体的dataset,然后再分别构建不同细胞群的伪时间轨迹。

#取不同的子集
> DefaultAssay(bone) <- "ATAC"
> erythroid <- bone[,  bone$assigned_celltype %in% c("HSC", "MEP", "CMP-BMP")]
> lymphoid <- bone[, bone$assigned_celltype %in% c("HSC", "LMPP", "GMP", "CLP", "Pro-B", "pDC", "MDP", "GMP")]

利用Monocle3构建“轨迹”

我们可以利用[as.cell_data_set()]功能把seurat对象转换成CellDataSet对象,再构建轨迹。这里我们将分别处理红系和淋巴细胞谱系:

> erythroid.cds <- as.cell_data_set(erythroid)
> erythroid.cds <- cluster_cells(cds = erythroid.cds, reduction_method = "UMAP")
> erythroid.cds <- learn_graph(erythroid.cds, use_partition = TRUE)
  |==================================================================| 100%

> lymphoid.cds <- as.cell_data_set(lymphoid)
> lymphoid.cds <- cluster_cells(cds = lymphoid.cds, reduction_method = "UMAP")
> lymphoid.cds <- learn_graph(lymphoid.cds, use_partition = TRUE)
  |==================================================================| 100%

为了计算每个轨迹的伪时间,我们需要决定每个轨迹的起点是什么。在我们的例子中,我们知道造血干细胞是轨迹中其他类型细胞的祖细胞,所以我们可以把这些细胞设为轨迹的根(root)。Monocle 3包括一个交互功能,在图中选择细胞作为根节点。调用order_cells()但不指定root_cells参数,将启动此函数自动寻找根节点。在这里,我们预先选择了一些细胞作为root,并将它们保存到一个文件中,以便重现结果。这个文件可以在 这里下载。

# load the pre-selected HSCs
> hsc <- readLines("./hsc_cells.txt")
# order cells
> erythroid.cds <- order_cells(erythroid.cds, reduction_method = "UMAP", root_cells = hsc)
> lymphoid.cds <- order_cells(lymphoid.cds, reduction_method = "UMAP", root_cells = hsc)

# plot trajectories colored by pseudotime
> plot_cells(
  cds = erythroid.cds,
  color_cells_by = "pseudotime",
  show_trajectory_graph = TRUE
)
> plot_cells(
   cds = lymphoid.cds,
   color_cells_by = "pseudotime",
   show_trajectory_graph = TRUE
 )

提取“伪时间”数值,添加到Seurat对象里:

> bone <- AddMetaData(
  object = bone,
  metadata = erythroid.cds@principal_graph_aux@listData$UMAP$pseudotime,
  col.name = "Erythroid"
)

> bone <- AddMetaData(
  object = bone,
  metadata = lymphoid.cds@principal_graph_aux@listData$UMAP$pseudotime,
  col.name = "Lymphoid"
)
> FeaturePlot(bone, c("Erythroid", "Lymphoid"), pt.size = 0.1) & scale_color_viridis_c()

相关文章

网友评论

      本文标题:[scATAC-seq]利用Monocle3构建细胞“轨迹”

      本文链接:https://www.haomeiwen.com/subject/nrhnyktx.html