引言: 在学习并实践了单细胞转录组分析的Seurat工具包之后,下一步分析就是细胞发育轨迹重建的Monocle包了,在此记录下学习过程。因为服务器系统不能安装Monocle3, 因此所有的学习都是基于Monocle2。
Monocle
Monocle 拟时分析
主要步骤
- 创建CellDataSet数据对象,官方建议不能使用已经标准化之后的数据,(或者使用归一化后的数据,也服从正态分布?)注意选择合适的数据分布函数
- Estimate size factors and dispersions,仅当数据符合negbinomial() or negbinomial.size() 分布时
- 选择能定义细胞发育过程的基因
- 降维
- 根据轨迹排序细胞
主要分析步骤代码
library(monocle)
library(Seurat)
library(clusterProfiler)
library(dplyr)
#1. 从Seurat 对象中提取monocle 对象基本元素
load("Rdata/subara_integrated_seurat.RData") #subara_integrated
head(subara_integrated[[]])
levels(subara_integrated)
ara.m.pd = subara_integrated@meta.data[,c(1,4,7)]
ara.m.expr = subara_integrated@assays$RNA@counts[,rownames(ara.m.pd)]
ara.m.fd = rownames(ara.m.expr) %>% bitr(,fromType = "TAIR", toType =c("SYMBOL") ,OrgDb= "org.At.tair.db")
m.fd = ara.m.fd[!duplicated(ara.m.fd$TAIR),]
m.expr = left_join(m.fd, data.frame(TAIR = rownames(ara.m.expr),as.data.frame(ara.m.expr)),by = "TAIR")
# 几点注意:expr 的行与 fd 的行数一致,expr的列和pd的行数一致。且fd, pd 数据集的行名对应到expr的行名和列名,且fd 中必须有一列“gene_short_name”, pd 数据集中信息越多,后续分析点越多
#2. 创建monocle数据对象并 Estimate size factors and dispersions
counts.data <- as(ara.m.matrix, 'sparseMatrix') # sparseMatrix is suitable for large dataset
pheno.data <- new('AnnotatedDataFrame', data = sub.m.pd)
feature.data <- new('AnnotatedDataFrame', data = sub.m.fd)
cds <- newCellDataSet(counts.data,
into mRNA counts,
phenoData = pheno.data,
featureData = feature.data,
lowerDetectionLimit = 0.5,
expressionFamily = negbinomial.size()) #negbinomial.size(), tobit(), gaussianff()
cds <- estimateSizeFactors(cds) # give Size_Factor values
cds <- estimateDispersions(cds) # time costly
#3. 构建轨迹,拟时分析
# 选择基因
# second methods: using algorithm that based on average expression level
disp_table <- dispersionTable(cds) # based on average expression level,
ordering_genes <- subset(disp_table, mean_expression >= 0.1)
cds <- setOrderingFilter(cds, ordering_genes) #none gene can be used for ordering?????
save_plot("subara_integrated_out_results/ordering_genes_dispersion.png",plot_ordering_genes(cds),base_height = 8,base_asp = 1.3)
# 降维
cds <- reduceDimension(cds, max_components = 2,method = 'DDRTree') # run very long time
# 排序,构建轨迹
cds <- orderCells(cds) # out of memory
head(pData(cds))
save_plot("subara_integrated_out_results/cell_trajectory.png",plot_cell_trajectory(cds, color_by = "Hours"),base_height = 8,base_asp = 1.3)
## branch point analysis
BEAM_res <- BEAM(cds, branch_point = 1, cores = 1)
save(BEAM_res, file = "Rdata/list61_sub_cluster_seurat/BEMA_res_object.Rdata")
BEAM_res <- BEAM_res[order(BEAM_res$qval),]
BEAM_res <- BEAM_res[,c("gene_short_name", "pval", "qval")]
cds.symbol.unique = cds[row.names(subset(BEAM_res,qval < 1e-4 & !duplicated(BEAM_res$gene_short_name))),]
save_plot("subara_integrated_out_results/branch_gene_heatmap_symbol.png",plot_genes_branched_heatmap(cds.symbol.unique,
branch_point = 1,
num_clusters = 3,
cores = 1,
use_gene_short_name = T,
show_rownames = T),base_height = 8,base_asp = 1.3)
其他数据结构查看代码
### 主要函数
head(pData(cds))
head(fData(cds))
网友评论