美文网首页scRNAseq
单细胞转录组之拟时序分析

单细胞转录组之拟时序分析

作者: 青青青山 | 来源:发表于2022-07-08 22:38 被阅读0次

    1.数据的准备

    1.1选择想要进行时间轨迹分析的seurat亚群,,并将亚群提取出来

    #走一遍seurat标准流程后,得到数据,提取亚群
    table( Idents(sce ))
    table(sce@meta.data$seurat_clusters) 
    table(sce@meta.data$orig.ident) 
    
    # 取子集
    levels(Idents(sce))
    sce = sce[, Idents(sce) %in% c( "FCGR3A+ Mono", "CD14+ Mono"  )] 
    #sce[基因,细胞]
    levels(Idents(sce))
    
    markers_df <- FindMarkers(object = sce, ident.1 = 'FCGR3A+ Mono',ident.2 = 'CD14+ Mono', #logfc.threshold = 0,min.pct = 0.25)
    head(markers_df)
    挑选差异基因
    cg_markers_df=markers_df[abs(markers_df$avg_log2FC) >1,]
    

    1.2 创建CellDataSet对象及monocle标准流程

    详见此链接:https://www.jianshu.com/p/34c23dbd9dc1

    创建CellDataSet对象

    library(monocle)
    
    sample_ann <-  sce@meta.data  
    
    sample_ann$celltype=Idents(sce)
    
    #准备newCellDataSet函数的输入文件
    gene_ann <- data.frame(gene_short_name = rownames(sce@assays$RNA,  row.names =  rownames(sce@assays$RNA))
    pd <- new("AnnotatedDataFrame",
              data=sample_ann)
    fd <- new("AnnotatedDataFrame",
              data=gene_ann)
    ct=as.data.frame(sce@assays$RNA@counts)
    
    sc_cds <- newCellDataSet(as.matrix(ct), phenoData = pd,featureData =fd,expressionFamily = negbinomial.size(),lowerDetectionLimit=1)
    

    monocle标准流程

    sc_cds <- detectGenes(sc_cds, min_expr = 1) 
    
    sc_cds <- sc_cds[fData(sc_cds$num_cells_expressed > 10, ]
    
    cds <- estimateSizeFactors(cds)
    cds <- estimateDispersions(cds) 
    
    disp_table <- dispersionTable(cds)
    
    unsup_clustering_genes <- subset(disp_table,mean_expression >= 0.1)
    
    cds <- setOrderingFilter(cds,unsup_clustering_genes$gene_id)
    
    cds <- reduceDimension(cds, max_components = 2, num_dim = 6,reduction_method = 'tSNE', verbose = T)
    
    cds <- clusterCells(cds, num_clusters = 6) 
    

    2.查找差异基因及时间轨迹分析

    2.1 查找差异基因

    根据自己生物学意图,选择查看哪个性状的轨迹

    将monocle的分群改为seurat分群
    pData(cds)$Cluster=pData(cds)$celltype
    table(pData(cds1)$Cluster)
    
    #测试每个基因作为伪时间函数或根据指定的其他协变量的差异表达。fullModelFormulaStr 选择按照什么找差异
    diff_test_res <- differentialGeneTest(cds,fullModelFormulaStr = "~Cluster")
    
    # 选择 FDR < 10% 的基因作为差异基因
    sig_genes <- subset(diff_test_res, qval < 0.1)
    
    sig_genes<-sig_genes[order(sig_genes$pval),]
    
    head(sig_genes[,c("gene_short_name", "pval", "qval")] ) 
    
    cg=as.character(head(sig_genes$gene_short_name)) 
    #  挑选差异最显著的基因可视化,将一个或多个基因的表达绘制点图
    
    plot_genes_jitter(cds[cg,], grouping = "Cluster", color_by = "Cluster",nrow= 3,ncol = NULL )
    
    cg2=as.character(tail(sig_genes$gene_short_name)) 
    plot_genes_jitter(cds[cg2,],grouping = "Cluster",color_by = "Cluster",nrow= 3,ncol = NULL )
    

    2.2 时间轨迹分析

    第一步: 挑选合适的基因. 有多个方法,例如提供已知的基因集,这里选取统计学显著的差异基因列表。

    ordering_genes <- row.names (subse(diff_test_res, qval < 0.01))
    
    ordering_genes
    
    #准备聚类基因名单
    cds <- setOrderingFilter(cds, ordering_genes)
    plot_ordering_genes
    

    第二步: 降维。降维的目的是为了更好的展示数据。函数里提供了很多种方法,不同方法的最后展示的图都不太一样, 其中“DDRTree”是Monocle2使用的默认方法。

    cds <- reduceDimension(cds, max_components = 2, method = 'DDRTree')
    

    第三步: 对细胞进行排序 学习描述细胞正在经历的生物过程的“轨迹”,并计算每个细胞在该轨迹内的位置。

    cds <- orderCells(cds)
    

    最后: 可视化
    注意,伪时序的解读需要结合生物学意义

    plot_cell_trajectory(cds, color_by = "Cluster")  
    
    #绘制一个或多个基因的表达作为伪时序。
    plot_genes_in_pseudotime(cds[cg,],color_by = "Cluster") 
    

    前面根据差异基因,推断好了拟时序,也就是说把差异基因动态化了,后面就可以具体推断哪些基因随着拟时序如何的变化

    my_cds_subset=cds
    # 拟时序数据和细胞位置在pData 中
    head(pData(my_cds_subset))
    
    # 这个differentialGeneTest会比较耗费时间,测试每个基因的拟时序表达
    my_pseudotime_de <- differentialGeneTest(my_cds_subset,fullModelFormulaStr = "~sm.ns(Pseudotime)",cores = 4 )#cores调用的核心数
    
    head(my_pseudotime_de)
    

    3.分析结果的精致可视化

    library(Seurat)
    library(gplots)
    library(ggplot2)
    library(monocle)
    library(dplyr)
    
    cds=my_cds_subset
    phe=pData(cds)
    colnames(phe)
    library(ggsci)
    
    #绘制最小生成树
    p1=plot_cell_trajectory(cds, color_by = "Cluster")  + scale_color_nejm() 
    p1
    ggsave('trajectory_by_cluster.pdf')
    plot_cell_trajectory(cds, color_by = "celltype")  
    
    p2=plot_cell_trajectory(cds, color_by = "Pseudotime")  
    p2
    ggsave('trajectory_by_Pseudotime.pdf')
    
    p3=plot_cell_trajectory(cds, color_by = "State")  + scale_color_npg()
    p3
    ggsave('trajectory_by_State.pdf')
    library(patchwork)#拼图
    p1+p2/p3
    
    

    以qval前六的基因做图

    phe=pData(cds)
    head(phe)
    table(phe$State,phe$Cluster) 
    
    library(dplyr)  
    #%>% 管道函数 把左边的值发送给右边的表达式,并作为右件表达式函数的第一个参数
    my_pseudotime_de %>% arrange(qval) %>% head() 
    
    # 保存前六的基因
    my_pseudotime_de %>% arrange(qval) %>% head() %>% select(gene_short_name) -> my_pseudotime_gene
    my_pseudotime_gene=my_pseudotime_gene[,1]
    my_pseudotime_gene
    
    #绘制一个或多个基因的拟时序
    plot_genes_in_pseudotime(my_cds_subset[my_pseudotime_gene,])+ scale_color_npg()
    ggsave('monocle_top6_pseudotime_by_state.pdf')
    

    将一个或多个基因的表达绘制点图

    plot_genes_jitter(my_cds_subset[my_pseudotime_gene,],grouping = "Cluster",color_by = "Cluster", nrow= 3,ncol = NULL )+ scale_color_nejm()
    ggsave('monocle_top6_pseudotime_by_cluster.pdf')
    

    将前50个随拟时序变化的基因做聚类热图

    # cluster the top 50 genes that vary as a function of pseudotime
    my_pseudotime_de %>% arrange(qval) %>% head(50) %>% select(gene_short_name) -> gene_to_cluster
    gene_to_cluster <- gene_to_cluster[,1]
    gene_to_cluster
    colnames(pData(my_cds_subset))
    table(pData(my_cds_subset)$Cluster,pData(my_cds_subset)$State) 
    ac=pData(my_cds_subset)[c('celltype','State','Pseudotime')]
    head(ac)
    
    # 这个热图绘制的并不是纯粹的细胞基因表达量矩阵,而是被 Pseudotime 好了的100列,50行的矩阵
    
    my_pseudotime_cluster <- plot_pseudotime_heatmap(my_cds_subset[gene_to_cluster,],# num_clusters = 2, # add_annotation_col = ac,show_rownames = TRUE,
    return_heatmap = TRUE)
    
    my_pseudotime_cluster
    
    pdf('monocle_top50_heatmap.pdf')
    print(my_pseudotime_cluster)
    dev.off()
    

    分支表达分析建模 识别具有分支依赖性表达的基因。

    #计算建模分支节点
    BEAM_branch1 <- BEAM(my_cds_subset, branch_point = 1, cores = 4)
    
    head(BEAM_branch1)
    colnames(BEAM_branch1)
    
    BEAM_branch1 <- BEAM_branch1[order(BEAM_branch1$qval),]
    
    BEAM_branch1 <- BEAM_branch1[,c("gene_short_name", "pval", "qval")]
    head(BEAM_branch1) 
      
    BEAM_branch2 <- BEAM(my_cds_subset, branch_point = 2, cores = 20)
    
    BEAM_branch2 <- BEAM_branch2[order(BEAM_branch2$qval),]
    BEAM_branch2 <- BEAM_branch2[,c("gene_short_name", "pval", "qval")]
    head(BEAM_branch2)
    

    使用全部的基因进行绘图 创建一个热图来展示基因表达沿两个分支的分叉

    BEAM_res = BEAM_branch1
    my_branched_heatmap <- plot_genes_branched_heatmap( my_cds_subset[row.names(subset(BEAM_res, qval < 1e-4)),], branch_point = 1,num_clusters = 4, e_short_name = TRUE,show_rownames = F,return_heatmap = TRUE)
    
    pdf('monocle_BEAM_branch1_heatmap.pdf')
    print(my_branched_heatmap$ph)
    dev.off()
    
    BEAM_res = BEAM_branch2
    
    my_branched_heatmap <- plot_genes_branched_heatmap(my_cds_subset[row.names(subset(BEAM_res, qval < 1e-4)),],branch_point = 1,num_clusters = 4, use_gene_short_name = TRUE,show_rownames = F,return_heatmap = TRUE)
    
    pdf('monocle_BEAM_branch2_heatmap.pdf')
    print(my_branched_heatmap$ph)
    dev.off()
    
    #将所做热图的基因和cluster提取出来
    head(my_branched_heatmap$annotation_row)
    table(my_branched_heatmap$annotation_row$Cluster) 
    my_row <- my_branched_heatmap$annotation_row
    my_row <- data.frame(cluster = my_row$Cluster,
                         gene = row.names(my_row),
                         stringsAsFactors = FALSE)
    
    head(my_row[my_row$cluster == 3,'gene']) 
    
    my_gene <- row.names(subset(fData(my_cds_subset),
                                gene_short_name %in% head(my_row[my_row$cluster == 2,'gene'])))
    my_gene
    
    # 绘制分支处的基因拟时序轨迹
    #分支1
    plot_genes_branched_pseudotime(my_cds_subset[my_gene,],
                                   branch_point = 1,
                                   ncol = 1)
    #分支2
    plot_genes_branched_pseudotime(my_cds_subset[my_gene,],
                                   branch_point = 2,
                                   ncol = 1)
    
    分支1 分支2

    做热图查看拟时序基因在两个亚群的表达

    cds=my_cds_subset
    phe=pData(cds)
    colnames(phe)
    plot_cell_trajectory(cds)
    counts = Biobase::exprs(cds)
    dim(counts)
    
    library(dplyr) 
    my_pseudotime_de %>% arrange(qval) %>% head(100) %>% select(gene_short_name) -> my_pseudotime_gene
    my_pseudotime_gene=my_pseudotime_gene[,1]
    my_pseudotime_gene
    
    
    library(pheatmap)
    #数据中心化和归一化
    n=t(scale(t( counts[my_pseudotime_gene,] ))) # 'scale'可以对log-ratio数值进行归一化
    n[n>2]=2 
    n[n< -2]= -2
    n[1:4,1:4]
    pheatmap(n,show_colnames =F,show_rownames = F)
    ac=phe[,c(10,16,17)]
    head(ac)
    rownames(ac)=colnames(n)
    dim(n)
    n[1:4,1:4]
    pheatmap(n,show_colnames =F,
             show_rownames = F,
             annotation_col=ac)
    od=order(ac$Pseudotime)
    pheatmap(n[,od],show_colnames =F,
             show_rownames = F,cluster_cols = F,
             annotation_col=ac[od,])
    
    

    参考来源

    #section 3已更新#「生信技能树」单细胞公开课2021_哔哩哔哩_bilibili

    致谢

    I thank Dr.Jianming Zeng(University of Macau), and all the members of his bioinformatics team, biotrainee, for generously sharing their experience and codes.

    THE END

    相关文章

      网友评论

        本文标题:单细胞转录组之拟时序分析

        本文链接:https://www.haomeiwen.com/subject/oliibrtx.html