三种差异分析的整理

作者: 生信小鹏 | 来源:发表于2021-05-07 23:10 被阅读0次
    volcano plot

    针对测序数据和芯片数据,目前常用差异分析的R包有edgeR、limma、DESeq2,做一简单比较,方便平时分析。内容多为搬运,主要方便下次寻找。

    1. 三种分析方法的比较

    三种分析比较
    image.png

    三种packages的比较

    1.limma包做差异分析要求数据满足正态分布或近似正态分布,如基因芯片、TPM格式的高通量测序数据。
    2.通常认为Count数据不符合正态分布,而近似服从泊松分布(存在过度离散时常用负二项分布描述)。对于count数据来说,直接用limma包做差异分析,误差较大,需要先经过voom转换(见2.5)。
    3.DESeq2 和 edgeR 都是基于count,两者都采用NB(negative binomial,负二项分布)模型,但是在估计dispersion parameter的方法上不一样。
    4.limma,edgeR,DESeq2三大包基本是做转录组差异分析的金标准,大多数转录组的文章都是用这三个R包进行差异分析。
    5.edgeR差异分析速度快,得到的基因数目比较多,假阳性高(实际不差异,结果差异)。DESeq2差异分析速度慢,得到的基因数目比较少,假阴性高(实际差异,结果不差异)。
    6.需要注意的是,制作分组信息的因子向量时,要留意因子水平的前后顺序:在R的很多模型中,默认将因子向量的第一个水平看作对照组。

    2.实战

    2.1数据准备

    # Load the three differential-expression packages used in this walkthrough.
    # The workspace is deliberately NOT wiped with rm(list = ls()): that call
    # deletes the user's objects without resetting loaded packages or options,
    # so it gives a false sense of a "clean" session. Restart R instead.
    library("DESeq2")
    library("limma")
    library("edgeR")

    # Read the expression table (comma-separated, with a header row).
    # The gene identifiers are expected in the first column, which is later
    # accessed as `expr$X`; the remaining columns are samples.
    expr <- read.csv("mRNA_exprSet.csv", header = TRUE)
    head(expr)
    

    读取的基因矩阵文件,行为基因名,列为样本名


    2.2 表达数据整理

    # Average the expression of duplicated gene names, then use the gene names
    # (column `X`) as row names of the resulting matrix.
    expr <- avereps(expr[, -1], ID = expr$X) # adjust to your data
    
    # Remove lowly expressed genes (mean expression across samples <= 1).
    # drop = FALSE keeps a matrix even if a single gene survives the filter.
    expr <- expr[rowMeans(expr) > 1, , drop = FALSE] # adjust to your data
    
    # Split the expression matrix into tumor and adjacent-normal samples using
    # characters 14-15 of each column name: codes < 10 are treated as tumor,
    # codes >= 10 as normal (presumably the TCGA barcode sample-type field --
    # confirm for your data).
    library(stringr)
    sample_type <- as.integer(substr(colnames(expr), 14, 15))
    if (anyNA(sample_type)) {
      # Fail early with a readable message instead of an obscure subscript
      # error when a column name does not carry a numeric code.
      stop(
        "Cannot parse a sample-type code from characters 14-15 of: ",
        paste(colnames(expr)[is.na(sample_type)], collapse = ", "),
        call. = FALSE
      )
    }
    tumor <- colnames(expr)[sample_type < 10]
    normal <- colnames(expr)[sample_type >= 10]
    
    # drop = FALSE: with exactly one sample in a group, plain `expr[, x]` would
    # collapse to a vector, ncol() would return NULL and rep() below would fail.
    tumor_sample <- expr[, tumor, drop = FALSE]
    normal_sample <- expr[, normal, drop = FALSE]
    
    # Tumor columns first, then normal columns; group_list follows that order.
    exprSet_by_group <- cbind(tumor_sample, normal_sample)
    group_list <- c(
      rep("tumor", ncol(tumor_sample)),
      rep("normal", ncol(normal_sample))
    )
    
    save(exprSet_by_group, group_list, file = "exprSet_by_group_list.Rdata")
    

    2.3 edgeR包进行差异分析

    # ---- edgeR: tumor vs. normal ----
    # Expression matrix (tumor columns first, then normal columns).
    data <- exprSet_by_group
    
    # Group design matrix without intercept: one indicator column per level.
    group_list <- factor(group_list)
    design <- model.matrix(~ 0 + group_list)
    rownames(design) <- colnames(data)
    colnames(design) <- levels(group_list)
    
    # Build the DGEList and keep genes with CPM > 1 in at least 2 samples
    # (CPM = counts per million).
    DGElist <- DGEList(counts = data, group = group_list)
    keep_gene <- rowSums(cpm(DGElist) > 1) >= 2 # adjust to your data
    table(keep_gene)
    DGElist <- DGElist[keep_gene, , keep.lib.sizes = FALSE]
    
    # Normalisation factors, then common / trended / tagwise dispersions.
    DGElist <- calcNormFactors(DGElist)
    DGElist <- estimateGLMCommonDisp(DGElist, design)
    DGElist <- estimateGLMTrendedDisp(DGElist, design)
    DGElist <- estimateGLMTagwiseDisp(DGElist, design)
    
    # GLM fit and likelihood-ratio test. The contrast weights follow the design
    # columns; factor levels sort alphabetically (normal, tumor), so c(-1, 1)
    # tests tumor - normal.
    fit <- glmFit(DGElist, design)
    results <- glmLRT(fit, contrast = c(-1, 1))
    nrDEG_edgeR <- as.data.frame(topTags(results, n = nrow(DGElist)))
    head(nrDEG_edgeR)
    
    # Keep significantly changed genes: FDR below `padj` and |logFC| above
    # `foldChange` (the threshold is applied on the logFC scale).
    padj <- 0.01 # adjust to your data
    foldChange <- 2 # adjust to your data
    nrDEG_edgeR_signif <- nrDEG_edgeR[
      nrDEG_edgeR$FDR < padj & abs(nrDEG_edgeR$logFC) > foldChange,
    ]
    # Sort ascending by logFC.
    nrDEG_edgeR_signif <- nrDEG_edgeR_signif[order(nrDEG_edgeR_signif$logFC), ]
    save(nrDEG_edgeR_signif, file = "nrDEG_edgeR_signif.Rdata")
    

    2.4 DESeq2包做差异表达

    # ---- DESeq2: tumor vs. normal ----
    # DESeq2 only accepts integer counts. Averaging duplicated gene names
    # upstream (avereps) can leave fractional values, so round them first;
    # this is a no-op when the matrix already holds integers.
    data <- round(exprSet_by_group)
    
    # Sample annotation: one row per sample, a single `condition` factor.
    condition <- factor(group_list)
    coldata <- data.frame(row.names = colnames(data), condition)
    dds <- DESeqDataSetFromMatrix(
      countData = data,
      colData = coldata,
      design = ~ condition
    )
    # Use the normal group as the reference (control) level.
    dds$condition <- relevel(dds$condition, ref = "normal")
    
    # Fit the model and extract the tumor-vs-normal comparison. The contrast
    # is spelled out so the fold-change direction does not depend on defaults.
    dds <- DESeq(dds)
    allDEG2 <- as.data.frame(
      results(dds, contrast = c("condition", "tumor", "normal"))
    )
    
    # Keep significantly changed genes: adjusted p-value below `padj` and
    # |log2FoldChange| above `foldChange` (threshold is on the log2 scale).
    # DESeq2 sets padj to NA for genes removed by independent filtering or
    # flagged as outliers; which() drops those instead of letting the NA
    # logical index produce all-NA rows in the result.
    padj <- 0.01 # adjust to your data
    foldChange <- 2 # adjust to your data
    nrDEG_DESeq2_signif <- allDEG2[
      which(allDEG2$padj < padj & abs(allDEG2$log2FoldChange) > foldChange),
    ]
    # Sort ascending by log2 fold change.
    nrDEG_DESeq2_signif <- nrDEG_DESeq2_signif[
      order(nrDEG_DESeq2_signif$log2FoldChange),
    ]
    save(nrDEG_DESeq2_signif, file = "nrDEG_DESeq2_signif.Rdata")
    

    2.5 limma包分析过程

    # ---- limma-voom: tumor vs. normal ----
    # Expression matrix (tumor columns first, then normal columns).
    data <- exprSet_by_group
    
    # Group design matrix without intercept: one indicator column per level,
    # named after the factor levels so they can be used in makeContrasts().
    group_list <- factor(group_list)
    design <- model.matrix(~ 0 + group_list)
    rownames(design) <- colnames(data)
    colnames(design) <- levels(group_list)
    
    # Build the DGEList and keep genes with CPM > 1 in at least 2 samples.
    DGElist <- DGEList(counts = data, group = group_list)
    keep_gene <- rowSums(cpm(DGElist) > 1) >= 2 # adjust to your data
    table(keep_gene)
    DGElist <- DGElist[keep_gene, , keep.lib.sizes = FALSE]
    
    # Normalisation factors, then the voom transformation with quantile
    # normalisation. Argument names are written in full (`normalize.method`,
    # `number`) rather than relying on partial argument matching.
    DGElist <- calcNormFactors(DGElist)
    v <- voom(DGElist, design, plot = TRUE, normalize.method = "quantile")
    fit <- lmFit(v, design)
    cont.matrix <- makeContrasts(contrasts = c("tumor-normal"), levels = design)
    
    # Apply the tumor - normal contrast and moderate the statistics.
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    
    # Full result table for the contrast, with incomplete rows removed.
    nrDEG_limma_voom <- topTable(fit2, coef = "tumor-normal", number = Inf)
    nrDEG_limma_voom <- na.omit(nrDEG_limma_voom)
    head(nrDEG_limma_voom)
    
    # Keep significantly changed genes: adjusted p-value below `padj` and
    # |logFC| above `foldChange` (threshold is applied on the logFC scale).
    padj <- 0.01 # adjust to your data
    foldChange <- 2 # adjust to your data
    nrDEG_limma_voom_signif <- nrDEG_limma_voom[
      nrDEG_limma_voom$adj.P.Val < padj &
        abs(nrDEG_limma_voom$logFC) > foldChange,
    ]
    # Sort ascending by logFC.
    nrDEG_limma_voom_signif <- nrDEG_limma_voom_signif[
      order(nrDEG_limma_voom_signif$logFC),
    ]
    save(nrDEG_limma_voom_signif, file = "nrDEG_limma_voom_signif.RDATA")
    

    最后,参考文档也很重要。可以翻阅这三个包的说明文档

    参考:
    https://www.jianshu.com/p/cf2ec58e5361
    https://blog.csdn.net/weixin_43700050/article/details/98085127

    相关文章

      网友评论

        本文标题:三种差异分析的整理

        本文链接:https://www.haomeiwen.com/subject/tpoedltx.html