1. 准备数据

rm(list = ls())
options(warn=-1)
suppressMessages(library(Seurat))
load('./patient1.PBMC.output.Rdata')

# 根据文章的要求，取PBMC_RespD376这个时间点的第4，第10群
PBMC_RespD376 = SubsetData(PBMC,TimePoints ='PBMC_RespD376')

# 曾老师给的代码像下面是这样取的，但不知道是包版本不一样还是其他原因，这样并不能取出来子集，可以看到依旧有10000多个细胞，实际上应该是1000多个细胞
PBMC_RespD376_for_DEG = SubsetData(PBMC_RespD376,PBMC_RespD376@active.ident %in% c(4,10))
count_matrix=PBMC_RespD376@assays[[1]]@data
> dim(count_matrix)
[1] 17712 12874
> table(PBMC_RespD376@active.ident)
   0    1    2    3    4    5    6    7    8    9   10   11   12   13
1883 1610 1598 1537 1497 1177  921  671  399  386  384  327  256  228

# 因此我绕开出问题的Subset，选择用比较基础的代码来取子集
# 首先是验证一下count_matrix的列名和active.ident的name是一致的，以保证用active.ident作为索引取count_matrix的子集也没问题（实际上如果对Seurat对象足够熟悉应该不需要验证，但我还在探索阶段，各位见笑）
> identical(colnames(count_matrix),names(PBMC_RespD376@active.ident))
[1] TRUE

# 然后把表达量数据子集取出来
tmp=count_matrix[,PBMC_RespD376@active.ident %in% c(4,10)]
> dim(tmp)
[1] 17712  1881 #没有问题了
count_matrix=tmp

# 另外我们还需要细胞类型这个分组信息，以及基因名信息
cluster=PBMC_RespD376@active.ident[PBMC_RespD376@active.ident %in% c(4,10)]
gene_annotation <- as.data.frame(rownames(count_matrix))

# 这样就绕开了取Seurat对象子集的问题，同样也得到了我们后续分析要用的3个信息：表达量、分组、基因名

2. 构建monocle对象

library(monocle)
> packageVersion('monocle')
[1] ‘2.14.0’

# 1.表达矩阵
expr_matrix <- as.matrix(count_matrix)
# 2.细胞信息
sample_ann <- data.frame(cells=colnames(count_matrix),
                         cellType=cluster)
rownames(sample_ann)<- colnames(count_matrix)
# 3.基因信息
gene_ann <- as.data.frame(rownames(count_matrix))
rownames(gene_ann)<- rownames(count_matrix)
colnames(gene_ann)<- "genes"

# 然后转换为AnnotatedDataFrame对象
pd <- new("AnnotatedDataFrame",
          data=sample_ann)
fd <- new("AnnotatedDataFrame",
          data=gene_ann)

# 最后构建CDS对象
sc_cds <- newCellDataSet(
  expr_matrix,
  phenoData = pd,
  featureData =fd,
  expressionFamily = negbinomial.size(),
  lowerDetectionLimit=1)

save(sc_cds,file = "monocle_object.Rdata") #其实上面的步骤都挺快的，这里不存问题也不大。

3. 过滤质控

# 去掉一些只在很少的细胞中表达的基因
cds=sc_cds
cds <- detectGenes(cds, min_expr = 0.1)
expressed_genes <- row.names(subset(cds@featureData@data,
                                    num_cells_expressed >= 5))
cds <- cds[expressed_genes,]

4. 聚类

4.1. 判断使用哪些基因进行细胞分群

cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)
disp_table <- dispersionTable(cds) # 挑有差异的
unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1) # 挑表达量不太低的
cds <- setOrderingFilter(cds, unsup_clustering_genes$gene_id)  # 准备聚类基因名单
plot_ordering_genes(cds)

黑色的点是会被选用的基因

4.2. 可视化选出的主成分

plot_pc_variance_explained(cds, return_all = F) # norm_method='log'

4.3. 降维聚类

# 2. 进行降维
cds <- reduceDimension(cds, max_components = 2, num_dim = 6,
                       reduction_method = 'tSNE', verbose = T)
# 3. 进行聚类
cds <- clusterCells(cds, num_clusters = 4)
# 4. Distance cutoff calculated to 1.812595
plot_cell_clusters(cds, 1, 2, color = "cellType")

save(cds, file = "monocle_object2.Rdata")

5. 差异分析

start=Sys.time()
diff_test_res <- differentialGeneTest(cds,
                                      fullModelFormulaStr = "~cellType")
end=Sys.time()
end-start
# Time difference of 3.139717 mins

5.1. 挑选显著的差异基因

sig_genes <- subset(diff_test_res, qval < 0.1)
> nrow(sig_genes)
[1] 910

5.2. 选取文章中的基因画热图

htmapGenes=c(
  'GAPDH','CD52','TRAC','IL32','ACTB','ACTG1','COTL1',
  'GZMA','GZMB','GZMH','GNLY'
)
> htmapGenes %in% rownames(sig_genes)
 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

library(pheatmap)
dat=count_matrix[htmapGenes,]
n=t(scale(t(dat)))
n[n>2]=2
n[n< -1]= -1
ac=data.frame(group=cluster)
rownames(ac)=colnames(n)
pheatmap(n,annotation_col = ac,
         show_colnames =F,
         show_rownames = T,
         cluster_cols = F,
         cluster_rows = F)