美文网首页单细胞测序专题集合
9、DEG(Differential expressed gen

9、DEG(Differential expressed gen

作者: 小贝学生信 | 来源:发表于2020-10-05 15:29 被阅读0次

    原文链接
    10、DA(Differential abundance) - 简书

    1、overview

    • the design of replicated multi-condition experiments to detect changes in composition or expression between conditions.
      即通过设计多个实验样本(不同conition的多次重复,一般来说 two conditions and three replications)
    • 本节主要学习DE(differential expression),基因表达差异。这与之前学习的RNAseq的差异分析最主要的区别在于single cell DE一般是基于细胞注释之后,进行不同细胞之间的差异基因分析。

    2、setup data

    • 示例数据来自MouseGastrulationData包,我们从中提取6个sample
    • design:a paired design with three replicate batches of two samples each.
    • conditions(two samples):td-Tomato positive cells VS negative cells

    2.1 下载

    library(MouseGastrulationData)
    sce.chimera <- WTChimeraData(samples=5:10)
    sce.chimera
    
    #将近一个G大小,赶紧保存为Rdata
    save(sce.chimera, file="sce.chimera.Rdata")
    rm(list=ls())
    load(sce.chimera.Rdata)
    

    2.2初步探索数据

    #two conditions
    table(colData(sce.chimera)$tomato)
    # three replication
    table(colData(sce.chimera)$tomato,
          colData(sce.chimera)$pool)
    #six samples
    table(colData(sce.chimera)$sample)
    #annotated celltype
    table(colData(sce.chimera)$celltype.mapped,
          colData(sce.chimera)$tomato)
    

    2.3 前处理 preprocessing

    if (T){
    #--- feature-annotation ---# ensemble→symbol
      library(scater)
      rownames(sce.chimera) <- uniquifyFeatureNames(
        rowData(sce.chimera)$ENSEMBL, rowData(sce.chimera)$SYMBOL)
      
      #--- quality-control ---#
      drop <- sce.chimera$celltype.mapped %in% c("stripped", "Doublet")
      sce.chimera <- sce.chimera[,!drop] #dim 29453 19426 
      
      #--- normalization ---#
      sce.chimera <- logNormCounts(sce.chimera)
      
      #--- variance-modelling ---#
      library(scran)
      dec.chimera <- modelGeneVar(sce.chimera, block=sce.chimera$sample)
      chosen.hvgs <- dec.chimera$bio > 0
      
      #--- merging ---#
      library(batchelor)
      set.seed(01001001)
      #Apply a correction to multiple SingleCellExperiment objects, 
      #while also combining the assay data and column metadata for easy use.
      merged <- correctExperiments(sce.chimera, 
                                   batch=sce.chimera$sample,  #共六个样本(3*2)
                                   subset.row=chosen.hvgs,
                                   PARAM=FastMnnParam( #MNN算法去除指定两组的批次效应
                                     merge.order=list(
                                       list(1,3,5), # WT (3 replicates)
                                       list(2,4,6)  # td-Tomato (3 replicates)
                                     )
                                   )
      )
      
      #--- clustering ---#
      g <- buildSNNGraph(merged, use.dimred="corrected")
      clusters <- igraph::cluster_louvain(g)
      merged$new.clusters <- factor(clusters$membership
      
      #--- dimensionality-reduction ---#
      merged <- runTSNE(merged, dimred="corrected", external_neighbors=TRUE)
      merged <- runUMAP(merged, dimred="corrected", external_neighbors=TRUE)
    }
    merged
    
    2-1

    2.4 可视化

    • 图1:二维可视化,有无明显批次效应
    gridExtra::grid.arrange(
      plotTSNE(merged, colour_by="tomato", text_by="new.clusters"),
      plotTSNE(merged, colour_by=data.frame(pool=factor(merged$pool))),
      ncol=2
    )
    
    2-2
    • 图2:new.clusters and annotated celltype
    by.label <- table(merged$new.clusters, merged$celltype.mapped)
    pheatmap::pheatmap(log2(by.label+1), cluster_cols=FALSE, cluster_rows=FALSE,
                       color=viridis::viridis(101))
    

    Ordinarily, we would be obliged to perform marker detection to assign biological meaning to these clusters.
    For simplicity, we will skip this step by directly using the cell type labels have annotated.


    2-3

    3、detailed DE with one celltype

    3.1 Creating pseudo-bulk samples

    简单来说就是在已经注释celltype的基础上,将每个样本中聚类到一个celltype中的gene counts加在一起。

    summed <- aggregateAcrossCells(merged, 
                                   id=colData(merged)[,c("celltype.mapped", "sample")])
    #将六个样本中各个样本所有cell的各个celltype.mapped的cell gene counts加在一起
    dim(summed)
    counts(summed)[1:4,1:4]
    head(colData(summed),2)
    
    • 如下图注释,样本5的第一列counts即为注释到在Mesenchyme celltype上的cell所有表达情况概况。


      3-1

    3.2 添加ncells

    • 教程示例中,colnames(colData(summed))有一列ncells,为统计相应样本里,注释到相应细胞类型的total cells。在后面DE中,过滤时会用到。
    • 但在操作中并没有发现,推测可能是版本更新的原因,我的R还是3.6,而现在教程应该是基于R4.0的对应的R包才可以。有时间去升级一下。
    • 现在就手动添加吧
    dim(colData(summed))
    #各个sample的celltype共有186个
    
    table(merged$celltype.mapped,merged$sample)
    #转换成表格
    tmp <- as.data.frame(table(merged$celltype.mapped,merged$sample))
    tmp <- tmp[!(tmp$Freq==0),]
    dim(tmp)
    #也是186个,应该对应上了,Freq就是目标ncells
    tmp1 <- as.data.frame(colData(summed))[,c("celltype.mapped","sample")]
    
    • 先paste确定公共列,再merge
    #paste
    tmp$x <- paste(tmp$Var1,tmp$Var2,sep = "-")
    tmp1$x <- paste(tmp1$celltype.mapped,tmp1$sample,sep = "-")
    #merge
    tmp1 <- merge(tmp1,tmp,by="x",sort=F)
    #注意sort=F,不要改变顺序
    head(tmp1)
    #添加到colData
    summed$ncells <- tmp1$Freq
    head(colData(summed))
    
    3-2

    3.3 以一个celltype进行差异分析

    (1)筛选指定celltype数据
    label <- "Mesenchyme"
    current <- summed[,label==summed$celltype.mapped]
    head(colData(current),10)
    
    (3) edgeR
    • DEGList
    library(edgeR)
    y <- DGEList(counts(current), samples=colData(current))
    y
    #将counts(current)矩阵与colData(current)组成一个list
    

    ####### 小插曲:质控
    remove samples with very low library sizes due to failed library preparation or sequencing.

    • 思路1:remove combinations containing fewer than 20 cells(筛选cells)
    discarded <- current$ncells < 20
    summary(discarded)
    #丢弃sample里只有不到20个cell的sample,这里的数据都大于20
    y <- y[,!discarded]
    
    • 思路2:remove genes that are lowly expressed(筛选gene)
    keep <- filterByExpr(y, group=current$tomato)
    summary(keep)
    y <- y[keep,]
    #丢弃了9000+个gene
    
    • matrics design
    design <- model.matrix(~factor(pool) + factor(tomato), y$samples)
    factor(y$samples$pool)
    factor(y$samples$tomato)
    design
    
    • 差异分析
    y <- estimateDisp(y, design)
    summary(y$trended.dispersion)
    fit <- glmQLFit(y, design, robust=TRUE)
    summary(fit$var.prior)
    res <- glmQLFTest(fit, coef=ncol(design))
    summary(decideTests(res))
    topTags(res)
    
    3-3

    3.4 all DE

    • 上述是以具体的一个celltype进行以edgeR的差异分析。
    • scran包也提供了一次性的封装好的函数以供使用。
      同样可能是因为版本的原因,未找到pseudoBulkDGE()这个函数。故仅将代码录于此处,之后再尝试一下

    在写上述草稿的第二天后,下载了最新版的R4.02,再安装相关Bioconductor包进行分析,的确就成功了(但主要还是使用3.6,需要用到4.0版本时再切换到该版本即可)


    3-4
    save(merged,file="merged.Rdata")
    rm(list=ls())
    #切换R4.0
    load("merged.Rdata")
    library(scater)
    summed <- aggregateAcrossCells(merged, 
                                   id=colData(merged)[,c("celltype.mapped", "sample")])
    summed$ncells #果然基于4.0版本的scater会添加ncells列
    summed.filt <- summed[,summed$ncells >= 20]
    summed.filt$ncells
    dim(summed.filt)
    # Pulling out a sample-level 'targets' data.frame:
    targets <- colData(merged)[!duplicated(merged$sample),]
    
    # Constructing the design matrix:
    design <-  model.matrix(~factor(pool) + factor(tomato), data=targets)
    rownames(design) <- targets$sample
    design
    library(scran)
    de.results <- pseudoBulkDGE(summed.filt, 
                                sample=summed.filt$sample,
                                label=summed.filt$celltype.mapped,
                                design=design,
                                coef=ncol(design),
                                
                                # 'condition' sets the group size for filterByExpr(),
                                # to perfectly mimic our previous manual analysis.
                                condition=targets$tomato 
    )
    
    3-5
    is.de <- decideTestsPerLabel(de.results, threshold=0.05)
    summarizeTestsPerLabel(is.de)
    
    # Upregulated across most cell types.
    up.de <- is.de > 0 & !is.na(is.de)
    head(sort(rowMeans(up.de), decreasing=TRUE), 10)
    
    # Downregulated across cell types.
    down.de <- is.de < 0 & !is.na(is.de)
    head(sort(rowMeans(down.de), decreasing=TRUE), 10)
    
    3-6

    以上是第十五章differential-expression-between-conditions第一部分的简单流程笔记,主要学习了single cell DEG详见Chapter 14 Multi-sample comparisons
    本系列笔记基于OSCA全流程的大致流程梳理,详细原理可参考原文。如有错误,恳请指正!
    此外还有刘小泽老师整理的全文翻译笔记,详见目录


    最后本来想绘制个火山图,但结果并没有预想的那么好,草稿先放在这,以后再看看吧。有什么想法的朋友,欢迎留言。

    tmp3 <- as.data.frame(do.call(rbind, lapply(tmp, as.matrix)),stringsAsFactors=F)
    cell_26 <- names(de.results)
    cell_26
    cells <- rep(cell_26,each=nrow(tmp3)/26)
    tmp3 <- cbind(tmp3,cells,stringsAsFactors=F)
    test <- na.omit(tmp3)
    test$change=ifelse(test$PValue>0.01, 'stable', 
             ifelse(test$logFC>1,'UP',
                    ifelse(test$logFC< -1,'DOWN','stable'))
      )
    table(test$cells,test$change)
    
    head(test)
    test$color <- test$cells
    test$color[which(test$change=="stable")] <- "stable"
    test$color=factor(test$color)
    length(unique(test$cells))
    test$'-log10(pvalue)' <- -log10(test$PValue)
    p <- ggpubr::ggscatter(test, x="logFC", y="-log10(pvalue)",
              color="color",size = 0.7)
    save(p,file = "1.pdf")
    rm(merged)
    pdf("1.pdf") 
    ggpubr::ggscatter(test, x="logFC", y="-log10(pvalue)",
                      color="color",size = 0.7)
    dev.off()
    

    相关文章

      网友评论

        本文标题:9、DEG(Differential expressed gen

        本文链接:https://www.haomeiwen.com/subject/vcmnuktx.html