美文网首页ggplot集锦
【单细胞转录组 实战】十、复现文章数据——scater、mono

【单细胞转录组 实战】十、复现文章数据——scater、mono

作者: 佳奥 | 来源:发表于2022-09-03 15:37 被阅读0次

    这里是佳奥,我们继续复现文章数据。

    1 scater Package

    rm(list = ls()) 
    Sys.setenv(R_MAX_NUM_DLLS=999)
    ## 首先载入文章的数据
    load(file='../input.Rdata')
    counts=a
    counts[1:4,1:4];dim(counts)
    library(stringr) 
    meta=df
    head(meta) 
    
    options(warn=-1) # turn off warning message globally
    suppressMessages(library(scater))
    ## 创建 scater 要求的对象
    sce <- SingleCellExperiment(
      assays = list(counts = as.matrix(counts)), 
      colData = meta
    )
    sce
    exprs(sce) <- log2(  calculateCPM(sce ) + 1)
    ## 只有运行了下面的函数后才有各式各样的过滤指标
    genes=rownames(rowData(sce))
    genes[grepl('^MT-',genes)]
    genes[grepl('^ERCC-',genes)]
    sce <- calculateQCMetrics(sce, 
                              feature_controls = list(ERCC = grep('^ERCC',genes)))
    
    
    sce <- addPerCellQC(sce,
                        subsets=list(ERCC = grep('^ERCC',genes)))
    
    keep_feature <- rowSums(exprs(sce) > 0) > 5
    table(keep_feature)
    sce <- sce[keep_feature,]
    tf=sce$detected
    boxplot(tf)
    fivenum(tf)
    table(tf>2000)
    sce=sce[,tf > 2000 ]
    sce
    head(meta)
    
    QQ截图20220902202443.png
    ## 基因表达,理论上应该是跟384孔板 这个变量无关
    plotExpression(sce, rownames(sce)[1:6],
                   x = "plate", 
                   exprs_values = "logcounts") 
    
    QQ截图20220902202601.png
    # 展示高表达量基因, 绘图有点耗时
    plotHighestExprs(sce, exprs_values = "counts")
    
    QQ截图20220902203002.png
    sce <- runPCA(sce)
    plotPCA(sce)
    reducedDimNames(sce)
    
    QQ截图20220902203036.png
    # colnames(as.data.frame(colData(sce)))
    head(colData(sce))
    ## PCA分布图上面添加临床信息--------------
    plotReducedDim(sce, "PCA", 
                    shape_by= "plate", 
                    colour_by= "g")
    
    QQ截图20220902203244.png
    ##包更新了,没有feature_set功能了
    
    ## 考虑ERCC 影响后继续PCA
    #sce2 <- runPCA(sce, 
                   feature_set = rowData(sce)$is_feature_control)
    #plotPCA(sce2)
    ## PCA分布图上面添加临床信息--------------
    #plotReducedDim(sce2, use_dimred = "PCA", 
                   shape_by= "plate", 
                   colour_by= "g")
    
    ## 运行 tSNE 降维算法
    set.seed(1000)
    sce <- runTSNE(sce, perplexity=10)
    plotTSNE(sce, 
             shape_by= "plate", 
             colour_by= "g")
    
    QQ截图20220902210133.png
    ## 对tSNE降维后结果进行不同的聚类
    ##原始代码跑不了
    #colData(sce)$tSNE_kmeans <- as.character(kmeans(sce@reducedDims$TSNE,
                                                    centers = 4)$clust)
    ##蛮试了一个
    colData(sce)$tSNE_kmeans <- as.character(kmeans(sce@colData$detected,
                                                    centers = 4)$clust)
    
    head(sce@colData$detected)
    hc=hclust(dist( sce@colData$detected ))
    clus = cutree(hc, 4) 
    colData(sce)$tSNE_hc <-  as.character(clus)
    plotTSNE(sce,  colour_by = "tSNE_kmeans")
    plotTSNE(sce,  colour_by = "tSNE_hc")
    table(colData(sce)$tSNE_hc , colData(sce)$tSNE_kmeans)##查看两种分组情况
          1   2   3   4
      1 107  51   0   0
      2   0 188   0  30
      3   0   0 110 179
      4   0   0  28   0
    
    QQ截图20220902210933.png QQ截图20220902210943.png
    ##'runDiffusionMap'不再有用。
    ## 同样是一直降维方式,不同的算法
    # sce <- runDiffusionMap(sce)
    # plotDiffusionMap(sce,  
                     shape_by= "plate", 
                     colour_by= "g")
    
    library(SC3) # BiocManager::install('SC3')
    sce <- sc3_estimate_k(sce)
    metadata(sce)$sc3$k_estimation
    rowData(sce)$feature_symbol=rownames(rowData(sce))
    # 耗费时间
    kn=4
    sc3_cluster="sc3_4_clusters"
    # 非常耗时
    sce <- sc3(sce, ks = kn, biology = TRUE)
    
    sc3_plot_consensus(sce, k = kn, show_pdata = c("g",sc3_cluster))
    sc3_plot_expression(sce, k = kn, show_pdata =  c("g",sc3_cluster))
    sc3_plot_markers(sce, k = kn, show_pdata =  c("g",sc3_cluster))
    plotPCA(sce, shape_by= "g" , colour_by =  sc3_cluster )
    sc3_interactive(sce)
    
    QQ截图20220902212049.png QQ截图20220902212104.png QQ截图20220902212152.png QQ截图20220902212209.png QQ截图20220902212321.png

    2 monocle Package

    参考代码:

    https://cloud.tencent.com/developer/article/1606574
    
    rm(list = ls()) 
    Sys.setenv(R_MAX_NUM_DLLS=999)
    ## 首先载入文章的数据
    load(file='../input.Rdata')
    
    # 原始表达矩阵
    counts=a
    # using raw counts is the easiest way to process data through Seurat.
    counts[1:4,1:4];dim(counts)
    library(stringr) 
    
    # 样本信息
    meta=df
    head(meta) 
    
    # 按行/基因检查:每个基因在多少细胞中有表达量
    fivenum(apply(counts,1,function(x) sum(x>0) ))
    boxplot(apply(counts,1,function(x) sum(x>0) ))
    # 按列/样本检查:每个细胞中存在多少表达的基因
    fivenum(apply(counts,2,function(x) sum(x>0) ))
    hist(apply(counts,2,function(x) sum(x>0) ))
               
    # 上面检测了 counts 和 meta 两个变量,后面需要使用
    

    有50%的基因只在低于20个细胞中有表达量,还有许多没有表达量的

    QQ截图20220903102950.png QQ截图20220903102959.png
    table(apply(counts,1,function(x) sum(x>0) )==0)
    
    FALSE  TRUE 
    17429  7153 
    # 存在7000多个基因在任何一个细胞中都没表达
    
    ##开始使用newCellDataSet()构建monocle对象
    # ---------首先构建基因的注释信息(feature_data)-----------
                
    gene_ann <- data.frame(
      gene_short_name = row.names(counts), 
      row.names = row.names(counts)
    )
                      
    sample_ann=meta
                
    fd <- new("AnnotatedDataFrame",
              data=gene_ann)  
                
    # ---------然后构建样本的注释信息(sample_data)---------
                
    pd <- new("AnnotatedDataFrame",
              data=sample_ann)
    
    # ---------开始构建对象---------
                
    #sc_cds <- newCellDataSet(
      as.matrix(counts), 
      phenoData = pd,
      featureData =fd,
      expressionFamily = negbinomial.size(),
      lowerDetectionLimit=1)
    
    counts<-as(as.matrix(counts),"sparseMatrix")
    sc_cds <- newCellDataSet(
      counts, 
      phenoData = pd,
      featureData =fd,
      lowerDetectionLimit = 1,
      expressionFamily = VGAM::negbinomial.size())            
                
    sc_cds
    
    > sc_cds
    CellDataSet (storageMode: environment)
    assayData: 24582 features, 768 samples 
      element names: exprs 
    protocolData: none
    phenoData
      sampleNames: SS2_15_0048_A3 SS2_15_0048_A6 ... SS2_15_0049_P24 (768 total)
      varLabels: g plate ... Size_Factor (5 total)
      varMetadata: labelDescription
    featureData
      featureNames: 0610005C13Rik 0610007P14Rik ... ERCC-00171 (24582 total)
      fvarLabels: gene_short_name
      fvarMetadata: labelDescription
    experimentData: use 'experimentData(object)'
    Annotation:  
    
    ##接下来质控过滤
    cds=sc_cds
    cds
    ## 起初是 24582 features, 768 samples 
    
    #---------首先是对基因的过滤-------------
    
    cds <- detectGenes(cds, min_expr = 0.1)
    print(head(cds@featureData@data))
    expressed_genes <- row.names(subset(cds@featureData@data,
                                        num_cells_expressed >= 5))
    length(expressed_genes)
    # 14442
    # 这里需要去掉ERCC基因
    # 去掉ERCC基因
    is.ercc <- grepl("ERCC",expressed_genes)
    length(expressed_genes[!is.ercc])
    # 14362(看到去掉了80个ERCC)
    cds <- cds[expressed_genes[!is.ercc],]
    cds
    # 过滤基因后是 14362 features, 768 samples 
    
    #---------然后是对细胞的过滤-------------
    
    # 如果不支持使用pData()函数,可以使用cds@phenoData@data来获得各种细胞注释信息
    cell_anno <- cds@phenoData@data
    > head(cell_anno)
                   g plate  n_g all Size_Factor num_genes_expressed
    SS2_15_0048_A3 1  0048 2624 all   0.6693919                3069
    SS2_15_0048_A6 1  0048 2664 all   0.6820532                3040
    SS2_15_0048_A5 2  0048 3319 all   1.0759418                3743
    SS2_15_0048_A4 3  0048 4447 all   0.7294812                5014
    SS2_15_0048_A1 2  0048 4725 all   1.5658507                5128
    SS2_15_0048_A2 3  0048 5263 all   1.6187569                5693
    
    # 这里简单过滤细胞
    valid_cells <- row.names(cell_anno[cell_anno$num_genes_expressed>2000,] )
    cds <- cds[,valid_cells]
    cds 
    # 最后剩下:14362 features, 693 samples
    
    library(dplyr)
    colnames(phenoData(sc_cds)@data)
    
    ## 接下来的分析,都是基于sc_cds对象
    
    cds=sc_cds
    cds
    ## 起初是 24582 features, 768 samples 
    cds <- detectGenes(cds, min_expr = 0.1)
    
    #print(head(fData(cds)))
    #expressed_genes <- row.names(subset(fData(cds),
                                        num_cells_expressed >= 5))
    
    print(head(cds@featureData@data))
    expressed_genes <- row.names(subset(cds@featureData@data,
                                        num_cells_expressed >= 5))
    
    length(expressed_genes)
    cds <- cds[expressed_genes,]
    cds
    # 过滤基因后是  14442 features, 768 samples 
    
    ##然后进行归一化
    library(dplyr)
    colnames(phenoData(cds)@data)
    ## 必要的归一化 
    cds <- estimateSizeFactors(cds)
    cds <- estimateDispersions(cds)
    
    ##降维聚类
    disp_table <- dispersionTable(cds)
    unsup_clustering_genes <- subset(disp_table, 
                                     mean_expression >= 0.1)
    cds <- setOrderingFilter(cds, unsup_clustering_genes$gene_id)
    cds
    plot_ordering_genes(cds) 
    # 图中黑色的点就是被标记出来一会要进行聚类的基因
    
    QQ截图20220903134337.png
    plot_pc_variance_explained(cds, return_all = F) # norm_method='log'
    
    QQ截图20220903134941.png
    关于聚类:monocle一共有三种方法:densityPeak", "louvain", "DDRTree"

    默认使用density peak的方法,但是对于大型数据(例如有5万细胞)推荐louvain方法

    # ---------进行降维---------
    cds <- reduceDimension(cds, max_components = 2, num_dim = 6,
                            reduction_method = 'tSNE', verbose = T)
    # ---------进行聚类---------
    # (这里先设置num_clusters,不一定最后真就分成5群;我们这里设置5,最后只能得到4群;如果设成4,结果就得到3群)
    cds <- clusterCells(cds, num_clusters = 5) 
    # Distance cutoff calculated to 1.818667 
    
    # color使用的这些数据就在:cds$Cluster
    plot_cell_clusters(cds, 1, 2, color = "Cluster")
    
    QQ截图20220903135039.png

    因为之前还做过层次聚类,所以还可以对比一下:

    plot_cell_clusters(cds, 1, 2, color = "g")
    

    看到monocle使用其他的聚类算法确实不如使用自己的结果得到的效果好


    QQ截图20220903135059.png

    再次对比不同分群结果的基因数量差异:

    boxplot(cds@phenoData@data$num_genes_expressed~cds@phenoData@data$Cluster)
    boxplot(cds@phenoData@data$num_genes_expressed~cds@phenoData@data$g)
    
    QQ截图20220903135122.png QQ截图20220903135143.png

    去除一些影响因素

    因为这几群的细胞中基因表达数量是有差别的,因此我们可以在聚类之前先去掉这部分影响,从而关注它们真正的生物学影响

    ## 去除检测到基因数量效应
    cds <- reduceDimension(cds, max_components = 2, num_dim = 2,
                           reduction_method = 'tSNE',
                           residualModelFormulaStr = "~num_genes_expressed",
                           verbose = T)
    cds <- clusterCells(cds, num_clusters = 5)
    plot_cell_clusters(cds, 1, 2, color = "Cluster")
    
    QQ截图20220903135252.png
    ##差异分析
    start=Sys.time()
    diff_test_res <- differentialGeneTest(cds,
                                          fullModelFormulaStr = "~Cluster")
    end=Sys.time()
    end-start
    # Time difference of 4.724045 mins
    
    ##挑差异基因
    # 选择FDR-adjusted p-value(也就是q值) < 10%的基因作为差异基因
    sig_genes <- subset(diff_test_res, qval < 0.1)
    dim(sig_genes)
    > head(sig_genes[,c("gene_short_name", "pval", "qval")] )
                  gene_short_name         pval         qval
    0610007P14Rik   0610007P14Rik 3.377244e-03 1.277429e-02
    0610010F05Rik   0610010F05Rik 1.243943e-02 3.924761e-02
    0610011F06Rik   0610011F06Rik 2.338530e-03 9.285587e-03
    1110004E09Rik   1110004E09Rik 2.487600e-02 7.003903e-02
    1110020A21Rik   1110020A21Rik 2.318327e-02 6.618129e-02
    1110059E24Rik   1110059E24Rik 5.193533e-09 4.856089e-08
    
    ##作图
    cg=as.character(head(sig_genes$gene_short_name))
    # 还能上色
    plot_genes_jitter(cds[cg,],
                      grouping = "Cluster",
                      color_by = "Cluster",
                      nrow= 3,
                      ncol = NULL )
    
    QQ截图20220903141430.png
    ##推断发育轨迹
    
    ##step1: 选合适基因
    ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))
    cds <- setOrderingFilter(cds, ordering_genes)
    plot_ordering_genes(cds)
    
    QQ截图20220903141500.png
    ##step2: 降维
    # 默认使用DDRTree的方法 
    cds <- reduceDimension(cds, max_components = 2,
                                method = 'DDRTree')
    
    ##step3: 细胞排序
    cds <- orderCells(cds)
    head(pData(cds))
    
    ##最后可视化
    plot_cell_trajectory(cds, color_by = "Cluster")  
    
    QQ截图20220903141709.png

    写在后面

    orderCells会出现报错:
    Error in if (class(projection) != "matrix") projection <- as.matrix(projection) : 
      the condition has length > 1
    In addition: Warning message:
    In graph.dfs(dp_mst, root = root_cell, neimode = "all", unreachable = FALSE,  :
      Argument `neimode' is deprecated; use `mode' instead
    
    GitHub上解决办法是修改该包的代码:
    https://github.com/cole-trapnell-lab/monocle-release/issues/434
    
    from
    if(class(projection) != 'matrix')
    projection <- as.matrix(projection)
    
    to
    projection <- as.matrix(projection)
    
    尚不确定是否有用。
    

    本流程花了很多时间把所有代码都调试到可以顺利运行最新版本的package。

    希望能有所帮助。

    下一篇我们看一下作者的原始代码。

    我们下一篇再见!

    相关文章

      网友评论

        本文标题:【单细胞转录组 实战】十、复现文章数据——scater、mono

        本文链接:https://www.haomeiwen.com/subject/ksbunrtx.html