使用monocle3进行拟时序分析（从Seurat对象开始）20

作者: 黄甫一 | 来源:发表于2023-08-30 22:06 被阅读0次

Monocle3：拟时序分析
使用monocle 2进行拟时序分析
单细胞转录组之Scanpy - 轨迹推断/拟时序分析
scRNA-seq拟时间分析数据处理
拟时序分析
拟时序分析
2022-01-20 拟时序分析-monocle2包
seurat对象处理
Monocle拟时间分析
SCPA（三）：具有拟时序特征的通路活性评估

适用背景

单细胞转录组分析中如果涉及到发育或具有时间序列的细胞谱系，往往需要进行拟时序分析，而最最最常用的拟时序分析软件就是Monocle，如今它已经出到了第三个版本monocle3。关于monocle3与monocle2之间的区别，可以参考这篇博客，有兴趣可以查看monocle3的官网教程。

安装

建议使用conda 安装

conda install -c conda-forge r-seurat
conda install -c conda-forge r-seuratobject
conda install -c conda-forge r-terra==1.5.12
conda install -c conda-forge r-units
conda install -c conda-forge r-raster
conda install -c conda-forge r-spdata
conda install -c conda-forge r-sf
conda install -c conda-forge r-spdep
conda install -c conda-forge r-ragg
conda install -c conda-forge r-ggrastr
conda install -c conda-forge libgit2
conda install -c conda-forge r-devtools
install.packages("BiocManager")
conda install -c anaconda cmake

在R里面进行最后的安装

BiocManager::install(c("nloptr", "lme4"))
if(!require(monocle3))devtools::install_github('cole-trapnell-lab/monocle3')

需要加载的R包

library(stringr)
library(ggplot2)
library(Seurat)
library(future)
library(rhdf5)
library(dplyr)
library(data.table)
library(Matrix)
library(rjson)
library(monocle3)
library(patchwork)
library(ComplexHeatmap)
library(RColorBrewer)
library(circlize)

运行Monocle3的主函数

partition.umap.jpg

int.umap.jpg

obj是Seurat的对象，注意需要有pca和umap降维信息，assay选择使用的assay，ex.umap默认为F，使用Monocle3软件运行得到的umap，如果为T/TRUE，则使用Seurat对象的umap，group.by是细胞分组，label是输出文件的前缀，默认为’out’。

run_monocle3 <- function(obj,assay='RNA',ex.umap=F,group.by='seurat_clusters',label='out') {
all <- obj
expression_matrix <- all@assays[assay][[1]]@counts
cell_metadata <- all@meta.data
gene_annotation <- data.frame(rownames(all), rownames(all))
rownames(gene_annotation) <- rownames(all)
colnames(gene_annotation) <- c("GeneSymbol", "gene_short_name")

NucSeq_cds <- new_cell_data_set(expression_matrix,
                         cell_metadata = cell_metadata,
                         gene_metadata = gene_annotation)

NucSeq_cds@int_colData$reducedDims$PCA<- all@reductions$pca@cell.embeddings
NucSeq_cds <- reduce_dimension(NucSeq_cds, reduction_method = 'UMAP', preprocess_method = "PCA")

NucSeq_cds$celltype <- NucSeq_cds

jpeg('cds.umap.jpg')
p1 <- plot_cells(NucSeq_cds, reduction_method="UMAP", color_cells_by="celltype",show_trajectory_graph=F) + ggtitle('cds.umap')
print(p1)
dev.off()
NucSeq.coembed <- all

##import  seurat umap
if(ex.umap){
  cds.embed <- NucSeq_cds@int_colData$reducedDims$UMAP
  int.embed <- Embeddings(NucSeq.coembed, reduction = "umap")
  int.embed <- int.embed[rownames(cds.embed),]
  NucSeq_cds@int_colData$reducedDims$UMAP <- int.embed
jpeg('int.umap.jpg')
  p1 <- plot_cells(NucSeq_cds, reduction_method="UMAP", color_cells_by="celltype", ,show_trajectory_graph=F) + ggtitle('int.umap')
print(p1)
dev.off()

}

NucSeq_cds <- cluster_cells(NucSeq_cds, reduction_method='UMAP')
print(length(unique(partitions(NucSeq_cds))))

jpeg('partition.umap.jpg')
p1 <- plot_cells(NucSeq_cds, show_trajectory_graph = FALSE,group_label_size = 5,color_cells_by = "partition")
print(p1)
dev.off()

NucSeq_cds <- learn_graph(NucSeq_cds)


jpeg('celltype.umap.jpg')
p1 <- plot_cells(NucSeq_cds, color_cells_by = "celltype",
           label_groups_by_cluster = FALSE, label_leaves = TRUE,
           label_branch_points = TRUE,group_label_size = 5)
print(p1)
dev.off()

saveRDS(NucSeq_cds,paste0(label,".rds"))
meta <- data.frame(NucSeq_cds@colData@listData)
write.table(meta,paste0(label,"_metadata.xls"),sep='\t',quote=F)
}

选择轨迹起点的函数

celltype.umap.jpg

首先获取细胞名列表，以下是两个例子：

cell_ids <- which(NucSeq_cds@clusters@listData[["UMAP"]][["clusters"]] == 5)
cell_ids <- which(cds$Celltypes == "OPCs")

NucSeq_cds是monocle3运行完上面run_monocle3主函数得到的monocle3对象，cell_ids是轨迹起点细胞名向量，label是输出文件的前缀，label_cell_groups、label_leaves、label_branch_points、graph_label_size和show_trajectory_graph是作图设置的参数，可自调。

select_root <- function(NucSeq_cds,cell_ids,label='out', label_cell_groups=FALSE,label_leaves=FALSE,label_branch_points=FALSE,
                        graph_label_size=1.5,show_trajectory_graph = F) {
closest_vertex <-  NucSeq_cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
closest_vertex <- as.matrix(closest_vertex[colnames(NucSeq_cds), ])
root_pr_nodes <- igraph::V(principal_graph(NucSeq_cds)[["UMAP"]])$name[as.numeric(names(which.max(table(closest_vertex[cell_ids,]))))]

NucSeq_cds <- order_cells(NucSeq_cds,root_pr_nodes=root_pr_nodes)

jpeg(paste0(label,'_select_root_umap.jpg'))
p1 <- plot_cells(NucSeq_cds,color_cells_by = "pseudotime",
           label_cell_groups=label_cell_groups,label_leaves=label_leaves,label_branch_points=label_branch_points,
           graph_label_size=graph_label_size, show_trajectory_graph = show_trajectory_graph)
print(p1)
dev.off()
}

基因的拟时序作图函数

NucSeq_cds是monocle3运行完上面run_monocle3主函数得到的monocle3对象，ingene是需要画的基因向量。

select_gene_plot <- function(NucSeq_cds,ingene=NULL) {
jpeg(paste0('select_gene_umap.jpg'))

p1 <- plot_cells(NucSeq_cds,color_cells_by = "pseudotime",label_cell_groups=F,label_leaves=T,label_branch_points=T,graph_label_size=1.5)
print(p1)
dev.off()

for (g in ingene) {
jpeg(paste0('gene_',g,'_select_gene_umap.jpg'))

p1 <- plot_cells(NucSeq_cds,
           genes=g,
           label_cell_groups=FALSE,
           show_trajectory_graph=FALSE)
print(p1)
dev.off()
}
}

计算拟时序轨迹的相关基因及热图可视化

heatmap_sig_genes.jpg

运行graph_test时会出现‘Error rBind“报错

#去除不需要的细胞亚群
NucSeq_cds <- NucSeq_cds[,names(NucSeq_cds@clusters@listData[["UMAP"]][["clusters"]])[NucSeq_cds@clusters@listData[["UMAP"]][["clusters"]] != 10]]
#这里有个bug，需要将calculateLW中的Matrix::rBind改为rbind才能运行graph_test
trace('calculateLW', edit = T, where = asNamespace("monocle3"))
sig_gene <- graph_test(NucSeq_cds, neighbor_graph="principal_graph", cores=8)

如果不想每次手动改，可以以下步骤
1、运行trace('calculateLW', edit = T, where = asNamespace("monocle3"))后会生成一个临时的R脚本，将其复制到一个新的R脚本文件
2、新的R脚本文件中修改tmp <- Matrix::rBind(tmp, cur_tmp) 为 tmp <- rbind(tmp, cur_tmp)（https://github.com/cole-trapnell-lab/monocle3/issues/512#issuecomment-1004224006）
3、新的R脚本文件中将jaccard_coeff全部替换为monocle3:::jaccard_coeff
4、运行trace('graph_test', edit = T, where = asNamespace("monocle3"))后复制代码到新的R脚本文件中并把名字改为graph_test_new。
5、加载这个新脚本后，运行graph_test_new函数。
6、calculateLW函数还需要将my.moran.test改为monocle3:::my.moran.test，my.geary.test改为monocle3:::my.geary.test。

函数obtain_sigene中，NucSeq_cds是monocle3运行完上面run_monocle3主函数得到的monocle3对象，obj之前输入的Seurat对象，neighbor_graph选择使用的graph，一般使用principal_graph就好，q_value.cutoff是q_value阈值，用于筛选基因，一般设为0.01，group.by是热图的细胞注释。热图可根据这个[网址]（https://jokergoo.github.io/ComplexHeatmap-reference/book/heatmap-annotations.html)进行修改。

obtain_sigene <- function(NucSeq_cds, obj, neighbor_graph="principal_graph", cores=8,q_value.cutoff=0.01,group.by=NULL) {
NucSeq.coembed <- obj
source('~/graph_test_new.R')
sig_gene <- graph_test_new(NucSeq_cds, neighbor_graph=neighbor_graph, cores=cores)
write.table(sig_gene,'sig_gene.xls',sep='\t',quote=F)
sig_gene <- subset(sig_gene, q_value < q_value.cutoff)
genes <- row.names(subset(sig_gene, morans_I >= quantile(sig_gene$morans_I)[4]))

pt.matrix <- GetAssayData(NucSeq.coembed,slot='data', assay='RNA')[match(genes,rownames(NucSeq.coembed)),order(pseudotime(NucSeq_cds))]
pt.matrix.raw <- pt.matrix
pt.matrix <- t(apply(pt.matrix,1,function(x){smooth.spline(x,df=3)$y}))

pt.matrix = pt.matrix[!apply(pt.matrix, 1, sd) == 0, ]
pt.matrix = Matrix::t(scale(Matrix::t(pt.matrix), center = TRUE))
pt.matrix = pt.matrix[is.na(row.names(pt.matrix)) == FALSE, ]
# pt.matrix[is.nan(pt.matrix)] = 0
pt.matrix[pt.matrix > 3] = 3
pt.matrix[pt.matrix < -3] = -3
row_dist <- as.dist((1 - cor(Matrix::t(pt.matrix)))/2)
# row_dist[is.na(row_dist)] <- 1

library(ComplexHeatmap)
library(RColorBrewer)
library(circlize)
lab2=HeatmapAnnotation(foo = NucSeq.coembed@meta.data[,group.by])
hthc <- Heatmap(
  pt.matrix,
  name                         = "z-score",
  col                          = colorRamp2(seq(from=-3,to=3,length=11),rev(brewer.pal(11, "RdYlGn"))),
  show_row_names               = FALSE,
  show_column_names            = FALSE,
  #row_names_gp                 = gpar(fontsize = 6),
  clustering_distance_rows = row_dist,
  clustering_method_rows = "ward.D2",
  clustering_method_columns = "ward.D2",
  row_title_rot                = 0,
  cluster_rows                 = TRUE,
  cluster_row_slices           = FALSE,
  cluster_columns              = FALSE,
  right_annotation = NULL,
  top_annotation = lab2)
jpeg('heatmap_sig_genes.jpg')
print(hthc)
dev.off()
}

graph_test_new.R的代码

calculateLW <- function (cds, k, neighbor_graph, reduction_method, verbose = FALSE)
{
    if (verbose) {
        message("retrieve the matrices for Moran's I test...")
    }
    knn_res <- NULL
    principal_g <- NULL
    cell_coords <- reducedDims(cds)[[reduction_method]]
    if (neighbor_graph == "knn") {
        knn_res <- RANN::nn2(cell_coords, cell_coords, min(k +
            1, nrow(cell_coords)), searchtype = "standard")[[1]]
    }
    else if (neighbor_graph == "principal_graph") {
        pr_graph_node_coords <- cds@principal_graph_aux[[reduction_method]]$dp_mst
        principal_g <- igraph::get.adjacency(cds@principal_graph[[reduction_method]])[colnames(pr_graph_node_coords),
            colnames(pr_graph_node_coords)]
    }
    exprs_mat <- exprs(cds)
    if (neighbor_graph == "knn") {
        if (is.null(knn_res)) {
            knn_res <- RANN::nn2(cell_coords, cell_coords, min(k +
                1, nrow(cell_coords)), searchtype = "standard")[[1]]
        }
        links <- monocle3:::jaccard_coeff(knn_res[, -1], F)
        links <- links[links[, 1] > 0, ]
        relations <- as.data.frame(links)
        colnames(relations) <- c("from", "to", "weight")
        knn_res_graph <- igraph::graph.data.frame(relations,
            directed = T)
        knn_list <- lapply(1:nrow(knn_res), function(x) knn_res[x,
            -1])
        region_id_names <- colnames(cds)
        id_map <- 1:ncol(cds)
        names(id_map) <- id_map
        points_selected <- 1:nrow(knn_res)
        knn_list <- lapply(points_selected, function(x) id_map[as.character(knn_res[x,
            -1])])
    }
    else if (neighbor_graph == "principal_graph") {
        cell2pp_map <- cds@principal_graph_aux[[reduction_method]]$pr_graph_cell_proj_closest_vertex
        if (is.null(cell2pp_map)) {
            stop(paste("Error: projection matrix for each cell to principal",
                "points doesn't exist, you may need to rerun learn_graph"))
        }
        cell2pp_map <- cell2pp_map[row.names(cell2pp_map) %in%
            row.names(colData(cds)), , drop = FALSE]
        cell2pp_map <- cell2pp_map[colnames(cds), ]
        if (verbose) {
            message("Identify connecting principal point pairs ...")
        }
        knn_res <- RANN::nn2(cell_coords, cell_coords, min(k +
            1, nrow(cell_coords)), searchtype = "standard")[[1]]
        principal_g_tmp <- principal_g
        diag(principal_g_tmp) <- 1
        cell_membership <- as.factor(cell2pp_map)
        uniq_member <- sort(unique(cell_membership))
        membership_matrix <- Matrix::sparse.model.matrix(~cell_membership +
            0)
        colnames(membership_matrix) <- levels(uniq_member)
        feasible_space <- membership_matrix %*% Matrix::tcrossprod(principal_g_tmp[as.numeric(levels(uniq_member)),
            as.numeric(levels(uniq_member))], membership_matrix)
        links <- monocle3:::jaccard_coeff(knn_res[, -1], F)
        links <- links[links[, 1] > 0, ]
        relations <- as.data.frame(links)
        colnames(relations) <- c("from", "to", "weight")
        knn_res_graph <- igraph::graph.data.frame(relations,
            directed = T)
        tmp_a <- igraph::get.adjacency(knn_res_graph)
        block_size <- 10000
        num_blocks = ceiling(nrow(tmp_a)/block_size)
        if (verbose) {
            message("start calculating valid kNN graph ...")
        }
        tmp <- NULL
        for (j in 1:num_blocks) {
            if (j < num_blocks) {
                block_a <- tmp_a[((((j - 1) * block_size) + 1):(j *
                  block_size)), ]
                block_b <- feasible_space[((((j - 1) * block_size) +
                  1):(j * block_size)), ]
            }
            else {
                block_a <- tmp_a[((((j - 1) * block_size) + 1):(nrow(tmp_a))),
                  ]
                block_b <- feasible_space[((((j - 1) * block_size) +
                  1):(nrow(tmp_a))), ]
            }
            cur_tmp <- block_a * block_b
            if (is.null(tmp)) {
                tmp <- cur_tmp
            }
            else {
                tmp <- rbind(tmp, cur_tmp)
            }
        }
        if (verbose) {
            message("Calculating valid kNN graph, done ...")
        }
        region_id_names <- colnames(cds)
        id_map <- 1:ncol(cds)
        names(id_map) <- id_map
        knn_list <- slam::rowapply_simple_triplet_matrix(slam::as.simple_triplet_matrix(tmp),
            function(x) {
                res <- which(as.numeric(x) > 0)
                if (length(res) == 0)
                  res <- 0L
                res
            })
    }
    else {
        stop("Error: unrecognized neighbor_graph option")
    }
    names(knn_list) <- id_map[names(knn_list)]
    class(knn_list) <- "nb"
    attr(knn_list, "region.id") <- region_id_names
    attr(knn_list, "call") <- match.call()
    lw <- spdep::nb2listw(knn_list, zero.policy = TRUE)
    lw
}
graph_test_new <- function (cds, neighbor_graph = c("knn", "principal_graph"),
    reduction_method = "UMAP", k = 25, method = c("Moran_I"),
    alternative = "greater", expression_family = "quasipoisson",
    cores = 1, verbose = FALSE)
{
    neighbor_graph <- match.arg(neighbor_graph)
    lw <- calculateLW(cds, k = k, verbose = verbose, neighbor_graph = neighbor_graph,
        reduction_method = reduction_method)
    if (verbose) {
        message("Performing Moran's I test: ...")
    }
    exprs_mat <- SingleCellExperiment::counts(cds)[, attr(lw,
        "region.id"), drop = FALSE]
    sz <- size_factors(cds)[attr(lw, "region.id")]
    wc <- spdep::spweights.constants(lw, zero.policy = TRUE,
        adjust.n = TRUE)
    test_res <- pbmcapply::pbmclapply(row.names(exprs_mat), FUN = function(x,
        sz, alternative, method, expression_family) {
        exprs_val <- exprs_mat[x, ]
        if (expression_family %in% c("uninormal", "binomialff")) {
            exprs_val <- exprs_val
        }
        else {
            exprs_val <- log10(exprs_val/sz + 0.1)
        }
        test_res <- tryCatch({
            if (method == "Moran_I") {
                mt <- suppressWarnings(monocle3:::my.moran.test(exprs_val,
                  lw, wc, alternative = alternative))
                data.frame(status = "OK", p_value = mt$p.value,
                  morans_test_statistic = mt$statistic, morans_I = mt$estimate[["Moran I statistic"]])
            }
            else if (method == "Geary_C") {
                gt <- suppressWarnings(monocle3:::my.geary.test(exprs_val,
                  lw, wc, alternative = alternative))
                data.frame(status = "OK", p_value = gt$p.value,
                  geary_test_statistic = gt$statistic, geary_C = gt$estimate[["Geary C statistic"]])
            }
        }, error = function(e) {
            data.frame(status = "FAIL", p_value = NA, morans_test_statistic = NA,
                morans_I = NA)
        })
    }, sz = sz, alternative = alternative, method = method, expression_family = expression_family,
        mc.cores = cores, ignore.interactive = TRUE)
    if (verbose) {
        message("returning results: ...")
    }
    test_res <- do.call(rbind.data.frame, test_res)
    row.names(test_res) <- row.names(cds)
    test_res <- merge(test_res, rowData(cds), by = "row.names")
    row.names(test_res) <- test_res[, 1]
    test_res[, 1] <- NULL
    test_res$q_value <- 1
    test_res$q_value[which(test_res$status == "OK")] <- stats::p.adjust(subset(test_res,
        status == "OK")[, "p_value"], method = "BH")
    test_res$status = as.character(test_res$status)
    test_res[row.names(cds), ]
}

Monocle3：拟时序分析
写在前面：今天突然想写这个教程，花了一下午看资料，其实我前面是有一些基础的，但是看的还是会很乱，所以想把这个包的脉...
使用monocle 2进行拟时序分析
monocle做拟时序分析首先要构建CDS需要3个矩阵：expr.matrix、pd、fd，其次将Seurat中的...
单细胞转录组之Scanpy - 轨迹推断/拟时序分析
什么是拟时序分析？拟时序（pseudotime）分析，又称细胞轨迹（cell trajectory）分析，通过拟时...
scRNA-seq拟时间分析数据处理
拟时间分析简介拟时序（pseudotime）分析，又称细胞轨迹（cell trajectory）分析，通过拟时分...
拟时序分析
简要内容--拟时序分析的定义--结果解读--------做出结果（cluster、state）--------确定...
拟时序分析
单细胞测序可以检测一块组织中每个细胞的转录组情况，但其实一块组织中存在着不同发育状态的细胞，因此通过基因的表达情况...
2022-01-20 拟时序分析-monocle2包
安装创建文件夹存储结果用CCA后的数据进行拟时序分析结果展示
seurat对象处理
Seurat是单细胞分析经常使用的分析包。seurat对象的处理是分析的一个难点，这里我根据我自己的理解整理了下常...
Monocle拟时间分析
Monocle可以进行下面三种类型的分析，可以与seurat无缝连接： 1 对细胞进行聚类、分组和计算；2.进行拟...
SCPA（三）：具有拟时序特征的通路活性评估
前言单细胞的拟时序分析是利用算法模拟不同细胞之间的分化趋势，目前已经是一种相对常规化的分析手段。通过拟时序分析，...