美文网首页单细胞测序rna_seq单细胞
使用scran分析单细胞数据

使用scran分析单细胞数据

作者: Hayley笔记 | 来源:发表于2021-05-30 21:47 被阅读0次

    官网教程:Scran1.20.1(最新)。本教程使用的是Scran1.18.7,有几个函数不太一样。

    # 查看安装的scran的网页版教程:
    browseVignettes("scran")
    
    1. 介绍

    The scran package implements methods to perform low-level processing of scRNA-seq data, including cell cycle phase assignment, variance modelling and testing for marker genes and gene-gene correlations. This vignette provides brief descriptions of these methods and some toy examples to demonstrate their use.
    更多教程

    2. 数据准备

    这个数据集是CEL-seq测的人类胰腺数据集,由SingleCellExperiment包提供。

    library(scRNAseq)
    sce <- GrunPancreasData()
    sce
    ## class: SingleCellExperiment 
    ## dim: 20064 1728 
    ## metadata(0):
    ## assays(1): counts
    ## rownames(20064): A1BG-AS1__chr19 A1BG__chr19 ... ZZEF1__chr17
    ##   ZZZ3__chr1
    ## rowData names(2): symbol chr
    ## colnames(1728): D2ex_1 D2ex_2 ... D17TGFB_95 D17TGFB_96
    ## colData names(2): donor sample
    ## reducedDimNames(0):
    ## mainExpName: endogenous
    ## altExpNames(1): ERCC
    View(sce)
    

    关于sce对象,参考: https://www.jianshu.com/p/533aa6c50a38

    首先进行一些(粗略的)质控以去除counts过低或spike-in过高的细胞。

    library(scuttle)
    qcstats <- perCellQCMetrics(sce) 
    qcfilter <- quickPerCellQC(qcstats, percent_subsets="altexps_ERCC_percent")
    sce <- sce[,!qcfilter$discard]
    summary(qcfilter$discard)
    ##    Mode   FALSE    TRUE 
    ## logical    1291     437
    
    3. 标准化(Normalizing cell-specific biases)

    scran中的标准化使用computeSumFactors()函数,该过程是基于反卷积方法:先合并许多细胞的计数以避免drop-out问题,然后将基于池的量化因子(scale factor)“去卷积”为cell-based factors,以进行特定细胞的标准化。而标准化之前对细胞进行聚类(quickCluster)不是必需的,不过减少cluster中细胞之间的差异表达基因数量确实可以提高标准化的准确性。

    library(scran)
    clusters <- quickCluster(sce)
    sce <- computeSumFactors(sce, clusters=clusters)
    summary(sizeFactors(sce))
    ##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    ## 0.006722 0.442629 0.801312 1.000000 1.295809 9.227575
    

    也可以使用spike-in来进行标准化

    sce2 <- computeSpikeFactors(sce, "ERCC")
    summary(sizeFactors(sce2))
    ##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    ## 0.01041 0.57760 0.88667 1.00000 1.27679 7.43466
    

    不管使用哪种方法来计算量化因子,计算标准化的表达值都是使用scuttle包的logNormCounts()函数。标准化后的表达矩阵可以用于下游的聚类和降维等分析。

    sce <- logNormCounts(sce)
    
    4. 差异建模(Variance modelling)
    dec <- modelGeneVar(sce)
    plot(dec$mean, dec$total, xlab="Mean log-expression", ylab="Variance")
    curve(metadata(dec)$trend(x), col="blue", add=TRUE)
    
    dec2 <- modelGeneVarWithSpikes(sce, 'ERCC')
    plot(dec2$mean, dec2$total, xlab="Mean log-expression", ylab="Variance")
    points(metadata(dec2)$mean, metadata(dec2)$var, col="red")
    curve(metadata(dec2)$trend(x), col="blue", add=TRUE)
    
    # Turned off weighting to avoid overfitting for each donor.
    dec3 <- modelGeneVar(sce, block=sce$donor, density.weights=FALSE)
    per.block <- dec3$per.block
    par(mfrow=c(3, 2))
    for (i in seq_along(per.block)) {
        decX <- per.block[[i]]
        plot(decX$mean, decX$total, xlab="Mean log-expression", 
            ylab="Variance", main=names(per.block)[i])
        curve(metadata(decX)$trend(x), col="blue", add=TRUE)
    }
    
    # Get the top 10% of genes.
    top.hvgs <- getTopHVGs(dec, prop=0.1)
    
    # Get the top 2000 genes.
    top.hvgs2 <- getTopHVGs(dec, n=2000)
    
    # Get all genes with positive biological components.
    top.hvgs3 <- getTopHVGs(dec, var.threshold=0)
    
    # Get all genes with FDR below 5%.
    top.hvgs4 <- getTopHVGs(dec, fdr.threshold=0.05)
    
    # Running the PCA with the 10% of HVGs.
    library(scater)
    sce <- runPCA(sce, subset_row=top.hvgs)
    reducedDimNames(sce)
    
    5. 自动化PC选择
    sced <- denoisePCA(sce, dec2, subset.row=getTopHVGs(dec2, prop=0.1))
    ncol(reducedDim(sced, "PCA"))
    ## [1] 50
    
    output <- getClusteredPCs(reducedDim(sce))
    npcs <- metadata(output)$chosen
    reducedDim(sce, "PCAsub") <- reducedDim(sce, "PCA")[,1:npcs,drop=FALSE]
    npcs
    ## [1] 14
    
    6. Graph-based clustering
    # In this case, using the PCs that we chose from getClusteredPCs().
    g <- buildSNNGraph(sce, use.dimred="PCAsub")
    cluster <- igraph::cluster_walktrap(g)$membership
    
    # Assigning to the 'colLabels' of the 'sce'.
    colLabels(sce) <- factor(cluster)
    table(colLabels(sce))
    ## 
    ##   1   2   3   4   5   6   7   8   9  10  11  12 
    ##  79 285  64 170 166 164 124  32  57  63  63  24
    
    sce <- runTSNE(sce, dimred="PCAsub")
    plotTSNE(sce, colour_by="label", text_by="label")
    
    ratio <- clusterModularity(g, cluster, as.ratio=TRUE)
    
    library(pheatmap)
    pheatmap(log10(ratio+1), cluster_cols=FALSE, cluster_rows=FALSE,
        col=rev(heat.colors(100)))
    
    ass.prob <- bootstrapCluster(sce, FUN=function(x) {
        g <- buildSNNGraph(x, use.dimred="PCAsub")
        igraph::cluster_walktrap(g)$membership
    }, clusters=sce$cluster)
    
    pheatmap(ass.prob, cluster_cols=FALSE, cluster_rows=FALSE,
        col=colorRampPalette(c("white", "blue"))(100))
    
    subout <- quickSubCluster(sce, groups=colLabels(sce))
    table(metadata(subout)$subcluster) # formatted as '<parent>.<subcluster>'
    ## 
    ##  1.1  1.2 10.1 10.2 10.3 11.1 11.2   12  2.1  2.2  2.3  2.4  3.1  3.2  3.3  4.1 
    ##   38   41   16   19   28   29   34   24   96   35   81   73   26   19   19   53 
    ##  4.2  4.3  4.4  5.1  5.2  6.1  6.2  6.3  7.1  7.2  7.3    8  9.1  9.2 
    ##   67   33   17   73   93   64   65   35   35   35   54   32   33   24
    
    7. marker基因鉴定
    # Uses clustering information from 'colLabels(sce)' by default:
    markers <- findMarkers(sce)
    markers[[1]][,1:3]
    ## DataFrame with 20064 rows and 3 columns
    ##                          Top      p.value          FDR
    ##                    <integer>    <numeric>    <numeric>
    ## CPE__chr4                  1  1.72176e-62  1.43939e-59
    ## KCNQ1OT1__chr11            1  2.07621e-61  1.60220e-58
    ## LOC100131257__chr7         1  2.66948e-43  7.87654e-41
    ## PGM5P2__chr9               1  2.67376e-51  1.27730e-48
    ## SCG2__chr2                 1 1.41986e-104 2.84880e-100
    ## ...                      ...          ...          ...
    ## TLX1NB__chr10          19918            1            1
    ## TNFRSF17__chr16        19946            1            1
    ## TSPAN32__chr11         19973            1            1
    ## WFDC11__chr20          20015            1            1
    ## XKR7__chr20            20025            1  
    
    wmarkers <- findMarkers(sce, test.type="wilcox", direction="up", lfc=1)
    wmarkers[[1]][,1:3]
    ## DataFrame with 20064 rows and 3 columns
    ##                          Top     p.value         FDR
    ##                    <integer>   <numeric>   <numeric>
    ## FBLIM1__chr1               1 7.55507e-30 2.16550e-26
    ## KCNQ1OT1__chr11            1 1.30379e-37 1.51912e-33
    ## PGM5P2__chr9               1 1.46543e-35 9.80080e-32
    ## UGDH-AS1__chr4             1 1.51428e-37 1.51912e-33
    ## LOC100131257__chr7         2 7.66839e-33 3.07717e-29
    ## ...                      ...         ...         ...
    ## TLX1NB__chr10          19918           1           1
    ## TNFRSF17__chr16        19946           1           1
    ## TSPAN32__chr11         19973           1           1
    ## WFDC11__chr20          20015           1           1
    ## XKR7__chr20            20025           1    
    
    markers <- findMarkers(sce, pval.type="all")
    markers[[1]][,1:2]
    ## DataFrame with 20064 rows and 2 columns
    ##                        p.value         FDR
    ##                      <numeric>   <numeric>
    ## UGDH-AS1__chr4     8.36195e-20 1.67774e-15
    ## KCNQ1OT1__chr11    2.72694e-19 2.42652e-15
    ## LOC100131257__chr7 3.62817e-19 2.42652e-15
    ## TFDP2__chr3        5.98639e-19 3.00277e-15
    ## LOC643406__chr20   1.09251e-18 4.38402e-15
    ## ...                        ...         ...
    ## ZP1__chr11                   1           1
    ## ZP3__chr7                    1           1
    ## ZPBP__chr7                   1           1
    ## ZSCAN1__chr19                1           1
    ## ZSCAN20__chr1                1           1
    
    8. Detecting correlated genes
    # Using the first 200 HVs, which are the most interesting anyway.
    # Also turning down the number of iterations for speed.
    of.interest <- top.hvgs[1:200]
    cor.pairs <- correlatePairs(sce, subset.row=of.interest, iters=1e5)
    cor.pairs
    ## DataFrame with 19900 rows and 6 columns
    ##                 gene1           gene2         rho     p.value         FDR
    ##           <character>     <character>   <numeric>   <numeric>   <numeric>
    ## 1      UGDH-AS1__chr4 KCNQ1OT1__chr11    0.830410 1.99998e-05 3.83241e-05
    ## 2           GCG__chr2      TTR__chr18    0.798357 1.99998e-05 3.83241e-05
    ## 3         REG1A__chr2     PRSS1__chr7    0.784339 1.99998e-05 3.83241e-05
    ## 4         REG1A__chr2    SPINK1__chr5    0.780994 1.99998e-05 3.83241e-05
    ## 5         REG1A__chr2     REG1B__chr2    0.760322 1.99998e-05 3.83241e-05
    ## ...               ...             ...         ...         ...         ...
    ## 19896    TM4SF1__chr3    HYDIN2__chr1 0.000115737     0.99877    0.998971
    ## 19897   SLC30A8__chr8     MUC13__chr3 0.000169530     0.99919    0.999341
    ## 19898     KRT7__chr12 FLJ30403__chr19 0.000159983     0.99957    0.999670
    ## 19899 C15orf48__chr15 FLJ30403__chr19 0.000144531     0.99973    0.999780
    ## 19900     RBP4__chr10     DUSP1__chr5 0.000153488     0.99993    0.999930
    ##         limited
    ##       <logical>
    ## 1          TRUE
    ## 2          TRUE
    ## 3          TRUE
    ## 4          TRUE
    ## 5          TRUE
    ## ...         ...
    ## 19896     FALSE
    ## 19897     FALSE
    ## 19898     FALSE
    ## 19899     FALSE
    ## 19900     FALSE
    
    cor.pairs2 <- correlatePairs(sce, subset.row=of.interest, 
        block=sce$donor, iters=1e5)
    
    cor.genes <- correlateGenes(cor.pairs)
    cor.genes
    ## DataFrame with 200 rows and 5 columns
    ##                gene       rho     p.value         FDR   limited
    ##         <character> <numeric>   <numeric>   <numeric> <logical>
    ## 1    UGDH-AS1__chr4  0.830410 7.23629e-05 9.39778e-05      TRUE
    ## 2         GCG__chr2  0.798357 2.94812e-05 6.17910e-05      TRUE
    ## 3       REG1A__chr2  0.784339 2.88403e-05 6.17910e-05      TRUE
    ## 4        TTR__chr18  0.798357 2.72600e-05 6.17910e-05      TRUE
    ## 5       CHGB__chr20  0.751585 2.70746e-05 6.17910e-05      TRUE
    ## ...             ...       ...         ...         ...       ...
    ## 196       FN1__chr2  0.164284 1.59198e-04 1.60806e-04      TRUE
    ## 197      MAFA__chr8  0.362235 6.12302e-05 8.33063e-05      TRUE
    ## 198        F3__chr1  0.378165 3.82688e-05 6.31239e-05      TRUE
    ## 199    SRXN1__chr20  0.340372 2.09472e-04 2.09472e-04      TRUE
    ## 200 KDM4A-AS1__chr1  0.330622 9.47610e-05 1.03564e-04      TRUE
    
    9. 转换成其他数据形式
    y <- convertTo(sce, type="edgeR")
    

    相关文章

      网友评论

        本文标题:使用scran分析单细胞数据

        本文链接:https://www.haomeiwen.com/subject/esbpsltx.html