美文网首页
使用monocle3进行拟时序分析(从Seurat对象开始)20

使用monocle3进行拟时序分析(从Seurat对象开始)20

作者: 黄甫一 | 来源:发表于2023-08-30 22:06 被阅读0次

    适用背景

    单细胞转录组分析中如果涉及到发育或具有时间序列的细胞谱系,往往需要进行拟时序分析,而最最最常用的拟时序分析软件就是Monocle,如今它已经出到了第三个版本monocle3。关于monocle3与monocle2之间的区别,可以参考这篇博客,有兴趣可以查看monocle3的官网教程

    安装

    建议使用conda 安装

    conda install -c conda-forge r-seurat
    conda install -c conda-forge r-seuratobject
    conda install -c conda-forge r-terra==1.5.12
    conda install -c conda-forge r-units
    conda install -c conda-forge r-raster
    conda install -c conda-forge r-spdata
    conda install -c conda-forge r-sf
    conda install -c conda-forge r-spdep
    conda install -c conda-forge r-ragg
    conda install -c conda-forge r-ggrastr
    conda install -c conda-forge libgit2
    conda install -c conda-forge r-devtools
    install.packages("BiocManager")
    conda install -c anaconda cmake
    

    在R里面进行最后的安装

    BiocManager::install(c("nloptr", "lme4"))
    if(!require(monocle3))devtools::install_github('cole-trapnell-lab/monocle3')
    
    需要加载的R包
    library(stringr)
    library(ggplot2)
    library(Seurat)
    library(future)
    library(rhdf5)
    library(dplyr)
    library(data.table)
    library(Matrix)
    library(rjson)
    library(monocle3)
    library(patchwork)
    library(ComplexHeatmap)
    library(RColorBrewer)
    library(circlize)
    
    运行Monocle3的主函数
    partition.umap.jpg
    int.umap.jpg

    obj是Seurat的对象,注意需要有pca和umap降维信息,assay选择使用的assay,ex.umap默认为F,使用Monocle3软件运行得到的umap,如果为T/TRUE,则使用Seurat对象的umap,group.by是细胞分组,label是输出文件的前缀,默认为’out’。

    run_monocle3 <- function(obj,assay='RNA',ex.umap=F,group.by='seurat_clusters',label='out') {
    all <- obj
    expression_matrix <- all@assays[assay][[1]]@counts
    cell_metadata <- all@meta.data
    gene_annotation <- data.frame(rownames(all), rownames(all))
    rownames(gene_annotation) <- rownames(all)
    colnames(gene_annotation) <- c("GeneSymbol", "gene_short_name")
    
    NucSeq_cds <- new_cell_data_set(expression_matrix,
                             cell_metadata = cell_metadata,
                             gene_metadata = gene_annotation)
    
    NucSeq_cds@int_colData$reducedDims$PCA<- all@reductions$pca@cell.embeddings
    NucSeq_cds <- reduce_dimension(NucSeq_cds, reduction_method = 'UMAP', preprocess_method = "PCA")
    
    NucSeq_cds$celltype <- NucSeq_cds
    
    jpeg('cds.umap.jpg')
    p1 <- plot_cells(NucSeq_cds, reduction_method="UMAP", color_cells_by="celltype",show_trajectory_graph=F) + ggtitle('cds.umap')
    print(p1)
    dev.off()
    NucSeq.coembed <- all
    
    ##import  seurat umap
    if(ex.umap){
      cds.embed <- NucSeq_cds@int_colData$reducedDims$UMAP
      int.embed <- Embeddings(NucSeq.coembed, reduction = "umap")
      int.embed <- int.embed[rownames(cds.embed),]
      NucSeq_cds@int_colData$reducedDims$UMAP <- int.embed
    jpeg('int.umap.jpg')
      p1 <- plot_cells(NucSeq_cds, reduction_method="UMAP", color_cells_by="celltype", ,show_trajectory_graph=F) + ggtitle('int.umap')
    print(p1)
    dev.off()
    
    }
    
    NucSeq_cds <- cluster_cells(NucSeq_cds, reduction_method='UMAP')
    print(length(unique(partitions(NucSeq_cds))))
    
    jpeg('partition.umap.jpg')
    p1 <- plot_cells(NucSeq_cds, show_trajectory_graph = FALSE,group_label_size = 5,color_cells_by = "partition")
    print(p1)
    dev.off()
    
    NucSeq_cds <- learn_graph(NucSeq_cds)
    
    
    jpeg('celltype.umap.jpg')
    p1 <- plot_cells(NucSeq_cds, color_cells_by = "celltype",
               label_groups_by_cluster = FALSE, label_leaves = TRUE,
               label_branch_points = TRUE,group_label_size = 5)
    print(p1)
    dev.off()
    
    saveRDS(NucSeq_cds,paste0(label,".rds"))
    meta <- data.frame(NucSeq_cds@colData@listData)
    write.table(meta,paste0(label,"_metadata.xls"),sep='\t',quote=F)
    }
    
    选择轨迹起点的函数
    celltype.umap.jpg

    首先获取细胞名列表,以下是两个例子:

    cell_ids <- which(NucSeq_cds@clusters@listData[["UMAP"]][["clusters"]] == 5)
    cell_ids <- which(cds$Celltypes == "OPCs")
    

    NucSeq_cds是monocle3运行完上面run_monocle3主函数得到的monocle3对象,cell_ids是轨迹起点细胞名向量,label是输出文件的前缀,label_cell_groups、label_leaves、label_branch_points、graph_label_size和show_trajectory_graph是作图设置的参数,可自调。

    select_root <- function(NucSeq_cds,cell_ids,label='out', label_cell_groups=FALSE,label_leaves=FALSE,label_branch_points=FALSE,
                            graph_label_size=1.5,show_trajectory_graph = F) {
    closest_vertex <-  NucSeq_cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
    closest_vertex <- as.matrix(closest_vertex[colnames(NucSeq_cds), ])
    root_pr_nodes <- igraph::V(principal_graph(NucSeq_cds)[["UMAP"]])$name[as.numeric(names(which.max(table(closest_vertex[cell_ids,]))))]
    
    NucSeq_cds <- order_cells(NucSeq_cds,root_pr_nodes=root_pr_nodes)
    
    jpeg(paste0(label,'_select_root_umap.jpg'))
    p1 <- plot_cells(NucSeq_cds,color_cells_by = "pseudotime",
               label_cell_groups=label_cell_groups,label_leaves=label_leaves,label_branch_points=label_branch_points,
               graph_label_size=graph_label_size, show_trajectory_graph = show_trajectory_graph)
    print(p1)
    dev.off()
    }
    
    基因的拟时序作图函数

    NucSeq_cds是monocle3运行完上面run_monocle3主函数得到的monocle3对象,ingene是需要画的基因向量。

    select_gene_plot <- function(NucSeq_cds,ingene=NULL) {
    jpeg(paste0('select_gene_umap.jpg'))
    
    p1 <- plot_cells(NucSeq_cds,color_cells_by = "pseudotime",label_cell_groups=F,label_leaves=T,label_branch_points=T,graph_label_size=1.5)
    print(p1)
    dev.off()
    
    for (g in ingene) {
    jpeg(paste0('gene_',g,'_select_gene_umap.jpg'))
    
    p1 <- plot_cells(NucSeq_cds,
               genes=g,
               label_cell_groups=FALSE,
               show_trajectory_graph=FALSE)
    print(p1)
    dev.off()
    }
    }
    
    计算拟时序轨迹的相关基因及热图可视化
    heatmap_sig_genes.jpg

    运行graph_test时会出现‘Error rBind“报错

    #去除不需要的细胞亚群
    NucSeq_cds <- NucSeq_cds[,names(NucSeq_cds@clusters@listData[["UMAP"]][["clusters"]])[NucSeq_cds@clusters@listData[["UMAP"]][["clusters"]] != 10]]
    #这里有个bug,需要将calculateLW中的Matrix::rBind改为rbind才能运行graph_test
    trace('calculateLW', edit = T, where = asNamespace("monocle3"))
    sig_gene <- graph_test(NucSeq_cds, neighbor_graph="principal_graph", cores=8)
    

    如果不想每次手动改,可以以下步骤
    1、运行trace('calculateLW', edit = T, where = asNamespace("monocle3"))后会生成一个临时的R脚本,将其复制到一个新的R脚本文件
    2、新的R脚本文件中修改tmp <- Matrix::rBind(tmp, cur_tmp) 为 tmp <- rbind(tmp, cur_tmp)(https://github.com/cole-trapnell-lab/monocle3/issues/512#issuecomment-1004224006
    3、新的R脚本文件中将jaccard_coeff全部替换为monocle3:::jaccard_coeff
    4、运行trace('graph_test', edit = T, where = asNamespace("monocle3"))后复制代码到新的R脚本文件中并把名字改为graph_test_new。
    5、加载这个新脚本后,运行graph_test_new函数。
    6、calculateLW函数还需要将my.moran.test改为monocle3:::my.moran.test,my.geary.test改为monocle3:::my.geary.test。

    函数obtain_sigene中,NucSeq_cds是monocle3运行完上面run_monocle3主函数得到的monocle3对象,obj之前输入的Seurat对象,neighbor_graph选择使用的graph,一般使用principal_graph就好,q_value.cutoff是q_value阈值,用于筛选基因,一般设为0.01,group.by是热图的细胞注释。热图可根据这个[网址](https://jokergoo.github.io/ComplexHeatmap-reference/book/heatmap-annotations.html)进行修改。

    obtain_sigene <- function(NucSeq_cds, obj, neighbor_graph="principal_graph", cores=8,q_value.cutoff=0.01,group.by=NULL) {
    NucSeq.coembed <- obj
    source('~/graph_test_new.R')
    sig_gene <- graph_test_new(NucSeq_cds, neighbor_graph=neighbor_graph, cores=cores)
    write.table(sig_gene,'sig_gene.xls',sep='\t',quote=F)
    sig_gene <- subset(sig_gene, q_value < q_value.cutoff)
    genes <- row.names(subset(sig_gene, morans_I >= quantile(sig_gene$morans_I)[4]))
    
    pt.matrix <- GetAssayData(NucSeq.coembed,slot='data', assay='RNA')[match(genes,rownames(NucSeq.coembed)),order(pseudotime(NucSeq_cds))]
    pt.matrix.raw <- pt.matrix
    pt.matrix <- t(apply(pt.matrix,1,function(x){smooth.spline(x,df=3)$y}))
    
    pt.matrix = pt.matrix[!apply(pt.matrix, 1, sd) == 0, ]
    pt.matrix = Matrix::t(scale(Matrix::t(pt.matrix), center = TRUE))
    pt.matrix = pt.matrix[is.na(row.names(pt.matrix)) == FALSE, ]
    # pt.matrix[is.nan(pt.matrix)] = 0
    pt.matrix[pt.matrix > 3] = 3
    pt.matrix[pt.matrix < -3] = -3
    row_dist <- as.dist((1 - cor(Matrix::t(pt.matrix)))/2)
    # row_dist[is.na(row_dist)] <- 1
    
    library(ComplexHeatmap)
    library(RColorBrewer)
    library(circlize)
    lab2=HeatmapAnnotation(foo = NucSeq.coembed@meta.data[,group.by])
    hthc <- Heatmap(
      pt.matrix,
      name                         = "z-score",
      col                          = colorRamp2(seq(from=-3,to=3,length=11),rev(brewer.pal(11, "RdYlGn"))),
      show_row_names               = FALSE,
      show_column_names            = FALSE,
      #row_names_gp                 = gpar(fontsize = 6),
      clustering_distance_rows = row_dist,
      clustering_method_rows = "ward.D2",
      clustering_method_columns = "ward.D2",
      row_title_rot                = 0,
      cluster_rows                 = TRUE,
      cluster_row_slices           = FALSE,
      cluster_columns              = FALSE,
      right_annotation = NULL,
      top_annotation = lab2)
    jpeg('heatmap_sig_genes.jpg')
    print(hthc)
    dev.off()
    }
    
    graph_test_new.R的代码
    calculateLW <- function (cds, k, neighbor_graph, reduction_method, verbose = FALSE)
    {
        if (verbose) {
            message("retrieve the matrices for Moran's I test...")
        }
        knn_res <- NULL
        principal_g <- NULL
        cell_coords <- reducedDims(cds)[[reduction_method]]
        if (neighbor_graph == "knn") {
            knn_res <- RANN::nn2(cell_coords, cell_coords, min(k +
                1, nrow(cell_coords)), searchtype = "standard")[[1]]
        }
        else if (neighbor_graph == "principal_graph") {
            pr_graph_node_coords <- cds@principal_graph_aux[[reduction_method]]$dp_mst
            principal_g <- igraph::get.adjacency(cds@principal_graph[[reduction_method]])[colnames(pr_graph_node_coords),
                colnames(pr_graph_node_coords)]
        }
        exprs_mat <- exprs(cds)
        if (neighbor_graph == "knn") {
            if (is.null(knn_res)) {
                knn_res <- RANN::nn2(cell_coords, cell_coords, min(k +
                    1, nrow(cell_coords)), searchtype = "standard")[[1]]
            }
            links <- monocle3:::jaccard_coeff(knn_res[, -1], F)
            links <- links[links[, 1] > 0, ]
            relations <- as.data.frame(links)
            colnames(relations) <- c("from", "to", "weight")
            knn_res_graph <- igraph::graph.data.frame(relations,
                directed = T)
            knn_list <- lapply(1:nrow(knn_res), function(x) knn_res[x,
                -1])
            region_id_names <- colnames(cds)
            id_map <- 1:ncol(cds)
            names(id_map) <- id_map
            points_selected <- 1:nrow(knn_res)
            knn_list <- lapply(points_selected, function(x) id_map[as.character(knn_res[x,
                -1])])
        }
        else if (neighbor_graph == "principal_graph") {
            cell2pp_map <- cds@principal_graph_aux[[reduction_method]]$pr_graph_cell_proj_closest_vertex
            if (is.null(cell2pp_map)) {
                stop(paste("Error: projection matrix for each cell to principal",
                    "points doesn't exist, you may need to rerun learn_graph"))
            }
            cell2pp_map <- cell2pp_map[row.names(cell2pp_map) %in%
                row.names(colData(cds)), , drop = FALSE]
            cell2pp_map <- cell2pp_map[colnames(cds), ]
            if (verbose) {
                message("Identify connecting principal point pairs ...")
            }
            knn_res <- RANN::nn2(cell_coords, cell_coords, min(k +
                1, nrow(cell_coords)), searchtype = "standard")[[1]]
            principal_g_tmp <- principal_g
            diag(principal_g_tmp) <- 1
            cell_membership <- as.factor(cell2pp_map)
            uniq_member <- sort(unique(cell_membership))
            membership_matrix <- Matrix::sparse.model.matrix(~cell_membership +
                0)
            colnames(membership_matrix) <- levels(uniq_member)
            feasible_space <- membership_matrix %*% Matrix::tcrossprod(principal_g_tmp[as.numeric(levels(uniq_member)),
                as.numeric(levels(uniq_member))], membership_matrix)
            links <- monocle3:::jaccard_coeff(knn_res[, -1], F)
            links <- links[links[, 1] > 0, ]
            relations <- as.data.frame(links)
            colnames(relations) <- c("from", "to", "weight")
            knn_res_graph <- igraph::graph.data.frame(relations,
                directed = T)
            tmp_a <- igraph::get.adjacency(knn_res_graph)
            block_size <- 10000
            num_blocks = ceiling(nrow(tmp_a)/block_size)
            if (verbose) {
                message("start calculating valid kNN graph ...")
            }
            tmp <- NULL
            for (j in 1:num_blocks) {
                if (j < num_blocks) {
                    block_a <- tmp_a[((((j - 1) * block_size) + 1):(j *
                      block_size)), ]
                    block_b <- feasible_space[((((j - 1) * block_size) +
                      1):(j * block_size)), ]
                }
                else {
                    block_a <- tmp_a[((((j - 1) * block_size) + 1):(nrow(tmp_a))),
                      ]
                    block_b <- feasible_space[((((j - 1) * block_size) +
                      1):(nrow(tmp_a))), ]
                }
                cur_tmp <- block_a * block_b
                if (is.null(tmp)) {
                    tmp <- cur_tmp
                }
                else {
                    tmp <- rbind(tmp, cur_tmp)
                }
            }
            if (verbose) {
                message("Calculating valid kNN graph, done ...")
            }
            region_id_names <- colnames(cds)
            id_map <- 1:ncol(cds)
            names(id_map) <- id_map
            knn_list <- slam::rowapply_simple_triplet_matrix(slam::as.simple_triplet_matrix(tmp),
                function(x) {
                    res <- which(as.numeric(x) > 0)
                    if (length(res) == 0)
                      res <- 0L
                    res
                })
        }
        else {
            stop("Error: unrecognized neighbor_graph option")
        }
        names(knn_list) <- id_map[names(knn_list)]
        class(knn_list) <- "nb"
        attr(knn_list, "region.id") <- region_id_names
        attr(knn_list, "call") <- match.call()
        lw <- spdep::nb2listw(knn_list, zero.policy = TRUE)
        lw
    }
    graph_test_new <- function (cds, neighbor_graph = c("knn", "principal_graph"),
        reduction_method = "UMAP", k = 25, method = c("Moran_I"),
        alternative = "greater", expression_family = "quasipoisson",
        cores = 1, verbose = FALSE)
    {
        neighbor_graph <- match.arg(neighbor_graph)
        lw <- calculateLW(cds, k = k, verbose = verbose, neighbor_graph = neighbor_graph,
            reduction_method = reduction_method)
        if (verbose) {
            message("Performing Moran's I test: ...")
        }
        exprs_mat <- SingleCellExperiment::counts(cds)[, attr(lw,
            "region.id"), drop = FALSE]
        sz <- size_factors(cds)[attr(lw, "region.id")]
        wc <- spdep::spweights.constants(lw, zero.policy = TRUE,
            adjust.n = TRUE)
        test_res <- pbmcapply::pbmclapply(row.names(exprs_mat), FUN = function(x,
            sz, alternative, method, expression_family) {
            exprs_val <- exprs_mat[x, ]
            if (expression_family %in% c("uninormal", "binomialff")) {
                exprs_val <- exprs_val
            }
            else {
                exprs_val <- log10(exprs_val/sz + 0.1)
            }
            test_res <- tryCatch({
                if (method == "Moran_I") {
                    mt <- suppressWarnings(monocle3:::my.moran.test(exprs_val,
                      lw, wc, alternative = alternative))
                    data.frame(status = "OK", p_value = mt$p.value,
                      morans_test_statistic = mt$statistic, morans_I = mt$estimate[["Moran I statistic"]])
                }
                else if (method == "Geary_C") {
                    gt <- suppressWarnings(monocle3:::my.geary.test(exprs_val,
                      lw, wc, alternative = alternative))
                    data.frame(status = "OK", p_value = gt$p.value,
                      geary_test_statistic = gt$statistic, geary_C = gt$estimate[["Geary C statistic"]])
                }
            }, error = function(e) {
                data.frame(status = "FAIL", p_value = NA, morans_test_statistic = NA,
                    morans_I = NA)
            })
        }, sz = sz, alternative = alternative, method = method, expression_family = expression_family,
            mc.cores = cores, ignore.interactive = TRUE)
        if (verbose) {
            message("returning results: ...")
        }
        test_res <- do.call(rbind.data.frame, test_res)
        row.names(test_res) <- row.names(cds)
        test_res <- merge(test_res, rowData(cds), by = "row.names")
        row.names(test_res) <- test_res[, 1]
        test_res[, 1] <- NULL
        test_res$q_value <- 1
        test_res$q_value[which(test_res$status == "OK")] <- stats::p.adjust(subset(test_res,
            status == "OK")[, "p_value"], method = "BH")
        test_res$status = as.character(test_res$status)
        test_res[row.names(cds), ]
    }
    

    相关文章

      网友评论

          本文标题:使用monocle3进行拟时序分析(从Seurat对象开始)20

          本文链接:https://www.haomeiwen.com/subject/trmoydtx.html