04. EdegeR Limma DESeq

作者: 吴十三和小可爱的札记 | 来源:发表于2021-05-12 19:54 被阅读0次

    Introduction

    除了利用ballgown流程,目前比较流行的方法还是通过序列比对和计数获得的计数矩阵,然后用广义线性模型去评估和处理这些复杂实验条件下的数据;常见的R语言工具包有edgeR, DESeq2, and limma+voom(Law et al. 2014Stark, Grzelak, and Hadfield 2019).

    值得注意的是:计数矩阵必须使用原始计数矩阵,便于评估和标准化。

    EdgeR

    1. load matirx and build DEGlist

    library(edgeR)
    
    # Load gene(/transcript) count matrix and labels
    gene_count <- as.matrix(read.csv("RNAseq/gene_count_matrix.csv", row.names = "gene_id"))
    
    pheno_data <- read.csv("RNAseq/geuvadis_phenodata.csv", header = TRUE)
    
    # colnames(gene_count) <- rownames(pheno_data)
    # head(pheno_data, 3)
    
    # Check sample IDs are match to each other in orders
    all(rownames(pheno_data) %in% colnames(gene_count))
    #> [1] FALSE
    
    all(pheno_data[,1] %in% colnames(gene_count))
    #> [1] TRUE
    
    # Create a DGEList
    dge_list <- DGEList(counts = gene_count, group = pheno_data$group)
    

    2. Removing genes that are lowly expressed

    Plotting the distribution log-CPM values shows that a sizeable proportion of genes within each sample are either unexpressed or lowly-expressed with log-CPM values that are small or negative.

    # Transformations from the raw-scale
    log_cpm <- cpm(dge_list, log = TRUE, prior.count = 2)
    # summary(lcpm)
    
    # set group index
    mygroup <- "sex"
    
    # filtering uses CPM values  
    keep_exprs <- filterByExpr(dge_list, group = pheno_data[ ,mygroup])
    dge_clean <- dge_list[keep_exprs, , keep.lib.sizes=FALSE]
    
    # Transformation
    clean_lcpm <- cpm(dge_clean, log = TRUE, prior.count = 2)
    
    # 
    library(RColorBrewer)
    nsamples <- ncol(dge_list)
    col <- brewer.pal(nsamples, "Paired")
    samplenames <- colnames(dge_list)
    
    # set new plot plate
    dev.new()
    par(mfrow = c(1,2))
    # density of raw  data
    plot(density(log_cpm[,1]), col=col[1], main="", xlab="")
    title(main="A. Raw data", xlab="Log-cpm")
    for (i in 2:nsamples){
      den <- density(log_cpm[,i])
      lines(den$x, den$y, col=col[i])
    }
    legend("topright", samplenames, text.col=col, bty="n")
    
    # density of filtered  data
    plot(density(clean_lcpm[,1]), col = col[1], main = '', xlab = '')
    title(main="B. Filtered data", xlab="Log-cpm")
    for (i in 2:nsamples){
      den <- density(clean_lcpm[,i])
      lines(den$x, den$y, col = col[i])
    }
    legend("topright", samplenames, text.col=col, bty="n")
    

    3. Normalising gene expression distributions

    dge_clean <- calcNormFactors(dge_clean, method = "TMM")
    
    # dge_clean$samples$norm.factors
    clean_lcpm <- cpm(dge_clean, log=TRUE)
    
    par(mfrow=c(1,2))
    boxplot(log_cpm, las=2,main="")
    title(main="A. Example: Unnormalised data", ylab="Log-cpm")
    
    boxplot(clean_lcpm, las=2, main="")
    title(main="B. Example: Normalised data",ylab="Log-cpm")
    
    image.png

    4. Distances

    1. heat map
    library(pheatmap)
    require(gridExtra)
    
    p1 <- pheatmap(mat = log_cpm,
             show_rownames = FALSE,
             color = palette,
             main = "raw lcpm")
    
    p2 <- pheatmap(mat = clean_lcpm,
             show_rownames = FALSE,
             color = palette,
             main = "clean lcpm")
    
    # extract plot list
    plot_list <- list(p1[[4]], p2[[4]])
    
    grid.arrange(arrangeGrob(grobs= plot_list,ncol=2))
    
    image.png
    1. Pearson correlation
    library(corrplot)
    mat <- cor(clean_lcpm)
    
    corrplot(corr = mat, is.corr = FALSE, type = "upper", 
             col = palette, 
             diag = FALSE)
    
    image.png
    1. MDS Plot
    library(ggplot2)
    dist_mat <- dist(t(clean_lcpm))
    mds_data <- cmdscale(dist_mat)
    
    # scale the plot data
    mds_data <- as.data.frame(scale(mds_data))
    
    colnames(mds_data) <- c("MDS 1", "MDS 2")
    mds_data$name <- rownames(mds_data)
    
    ggplot(data = mds_data) + 
      geom_point(aes(x = `MDS 1`, y = `MDS 2`), size = 3) +
      geom_text(aes(x = `MDS 1` + 0.12, y =`MDS 2`, 
                    label = name), size = 2) + 
      theme_bw()
    
    image.png
    1. PCA Analysis
    # library(ggplot2)
    pca_fit <- prcomp(t(clean_lcpm))
    
    components <- summary(pca_fit )$importance[2,c(1,2)]
    plot_data <- as.data.frame(pca_fit$x)
    plot_data$name <- row.names(plot_data)
    
    ggplot(data = plot_data) + geom_point(aes(x = PC1, y = PC2), 
                                          size = 2) +
      geom_text(aes(x = PC1*1.2, y = PC2, label = name)) +
      xlab(label = paste0("PC 1 (", 
                          round(components[1], 4)*100, "%)")) + 
      ylab(label = paste0("PC 2 (", 
                          round(components[2], 4)*100, "%)")) +
      theme_bw()
    
    image.png

    5. Differential expression analysis

    1. Removing heteroscedascity from count data
    design <- model.matrix(~0 + factor(pheno_data[ ,mygroup]))
    colnames(design) <- c("group1", "group2")
    contrast_matrix <- makeContrasts(group1-group2, levels=design)
    
    v <- voom(dge_clean, design, plot=TRUE)
    
    image.png
    1. Fitting linear models
    
    vfit <- lmFit(v, design)
    vfit <- contrasts.fit(vfit, contrasts=contrast_matrix)
    efit <- eBayes(vfit)
    
    plotSA(efit, main="Final model: Mean-variance trend")
    
    DE_gene <- topTable(efit, coef = 1, adjust="BH", n=Inf)
    
    # head map
    library(pheatmap)
    first <- colorRampPalette(c('#999999','#ffffff'))(10)
    second <- colorRampPalette(c('#99d8c9','#66c2a4'))(100)
    third <- colorRampPalette(c('#fc8d59','#d53e4f'))(100)
    
    palette <- c(first, second, third)
    
    pheatmap(mat = DE_gene,
             show_rownames = FALSE,
             #color = palette,
             main = "raw lcpm")
    
    # volcano plot
    ## adding threshold
    attach(DE_gene)
    DE_gene$threshold <- ifelse(`adj.P.Val` < 0.05 & abs(logFC) > 5,ifelse(logFC > 5, 'up', 'down'), 'no')
    detach()
    
    ggplot(DE_gene, aes(logFC, 
                              -log10(adj.P.Val))) +
      geom_point(aes(color = threshold)) +
      scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
      theme_bw()+
      xlab(label = "log(fold-change)") + 
      ylab(label = expression(-log[10] (adj_Pvalue)))
    
    # head map
    gene_names <- rownames(DE_gene)[1:100]
    
    i <- which(rownames(v) %in% gene_names)
    
    pheatmap(mat = clean_lcpm[i,],
             show_rownames = FALSE,
    
             main = "clean lcpm")
    
    image.png
    1. Stricter definition on significance

    在某些时候,不仅仅需要用到校正p值阈值,还需要差异倍数的对数(lfc=1)也高于某个最小值来更为严格地定义显著性。

    tfit <- treat(vfit, lfc=1)
    dt <- decideTests(tfit)
    DE_gene <- topTreat(efit, coef = 1, adjust="BH", n=Inf)
    
    # adding threshold
    attach(DE_gene)
    DE_gene$threshold <- ifelse(`adj.P.Val` < 0.05 & abs(logFC) > 5,ifelse(logFC > 5, 'up', 'down'), 'no')
    detach()
    
    ggplot(DE_gene, aes(logFC, 
                              -log10(adj.P.Val))) +
      geom_point(aes(color = threshold)) +
      scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
      theme_bw()+
      xlab(label = "log(fold-change)") + 
      ylab(label = expression(-log[10] (adj_Pvalue)))
    
    image.png
    1. 差异基因可视化2
    DE_gene <- DE_gene[order(DE_gene$logFC, decreasing = FALSE), ]
    DE_gene$xaixs <- seq(1, nrow(DE_gene), 1)
    
    ggplot(DE_gene, aes(xaixs, logFC)) +
      geom_point(aes(color = threshold)) + 
      geom_hline(yintercept=c(2,-2),linetype=2,size=0.25)+
      geom_hline(yintercept = c(0),linetype=1,size=0.5) + 
      xlab('rank of differentially expressed genes')+ 
      theme_bw()
    
    image.png

    6. GO enrichment

    1. ID trasformation

    GO和KEGG 分析比较烦的一点就是ID转换,由于数据是人类测序结果,我们就以 org.Hs.eg.db 包为例子,看看数据转换的情况。

    key types 是里面最需要小心的信息:
    1. Ensemble id: ENSG开头
    2. Entrez id: 纯数字
    3. Symbol id: 基因名称
    4. Refseq id: NG, NM, NP等

    数据间映射关系(总的来说,ENTREZID是进行GO分析最好的ID类型):

    1. org.Hs.egACCNUM:将Entrez ID标识符映射到GenBank的登录号
    2. org.Hs.egCHRLOC:获取映射到染色体位置的Entrez ID标识符
    3. org.Hs.egENSEMBL:用于Entrez ID与EnsemblID之间的映射
    4. org.Hs.egENSEMBLPROT:用于Entrez ID与Ensembl蛋白ID的映射
    5. org.Hs.egENSEMBLTRANS:用于Entrez ID与Ensembl transcript编号
    6. org.Hs.egENZYME:Entrez基因id和酶活性(EC)之间的图谱
    7. org.Hs.egGENENAME:Entrez ID与基因名称之间的图谱
    8. org.Hs.egGO:Entrez ID与基因本体论(GO) id之间的映射
    9. org.Hs.egMAP:Entrez ID和细胞遗传学图谱/条带之间的映射
    10. org.Hs.egOMIM:Map between Entrez Gene Identifiers and Mendelian Inheritance in Man (MIM) identifiers
    11. org.Hs.egPATH:Entrez ID和KEGG通路标识符之间的映射
    12. org.Hs.egREFSEQ:Entrez ID与RefSeq标识符之间的映射
    13. org.Hs.egSYMBOL:Entrez ID和基因符号之间的映射
    14. org.Hs.egUNIPROT:Map Uniprot accession numbers with Entrez Gene identifiers

    2. Exercise

    # RSQLite(v2.2.5) has some problems
    options(connectionObserver = NULL)
    library(org.Hs.eg.db)
    library(clusterProfiler)
    
    #
    keytypes(org.Hs.eg.db)
    temp_names <- rownames(DE_gene)[1:100]
    
    # temp_names <- gsub('NR[0-9]*[|]', '', temp_names)
    gene_names <- gsub('[|].*', '', temp_names)
    
    # keytype =  org.Hs.eg(XXX)
    ## stand form
    head(keys(org.Hs.eg.db, keytype = "SYMBOL"))
    head(keys(org.Hs.eg.db, keytype = "REFSEQ"))
    
    # ID transformation
    gene_list <- mapIds(org.Hs.eg.db, keys = gene_names,
                     column= "ENTREZID",
                     keytype = "REFSEQ")
    
    gene_list <- clusterProfiler::bitr(gene_names, fromType = "ENSEMBL" , toType = "ENTREZID", OrgDb = "org.Hs.eg.db", drop = TRUE)
    
    go_all <- clusterProfiler::enrichGO(gene_list, OrgDb = org.Hs.eg.db,
                   ont = 'ALL', pAdjustMethod = 'BH',
                   pvalueCutoff = 0.2, qvalueCutoff = 0.5,
                   keyType ='ENTREZID')
    
    go_result <- go_all@result
    
    write.table(x = go_result,
                file = "RNAseq/GO_result.txt",
                sep = "\t",
                col.names =  TRUE,
                row.names = FALSE)
    

    3. Visualization

    3.1 dot plot

    # dot plot
    go_result <- read.table(file = "RNAseq/GO_result.txt", 
                            sep = "\t",
                            header = TRUE, 
                            stringsAsFactors = F)
    
    plot_data <- go_result[order(go_result$Count, decreasing = T),]
    plot_data$Description <- forcats::fct_reorder(plot_data$Description, 
                                                  plot_data$Count)
    
    ggplot(data = plot_data[1:10, ]) + 
      geom_point(aes(x = Count, y = Description, size = Count)) + 
      theme_bw() + 
      scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 40)) + 
      xlab(label = NULL) + 
      ylab(label = NULL)
    
    image.png

    3.2 bar plot

    result1 <- read.table(file = "RNAseq/GO_result.txt", 
                            sep = "\t",
                            header = TRUE, 
                            stringsAsFactors = F)
    
    result2 <- read.table(file = "RNAseq/GO_result2.txt", 
                             sep = "\t",
                             header = TRUE, 
                             stringsAsFactors = F)
    
    set_bar_plot <- function(result1, result2, myrank){
      myrank = as.numeric(myrank)
      ## setting df1
      temp1 <- subset(result1, select = c("Description", "qvalue", 
                                             "Count"))
      temp1 <- na.omit(temp1)
      temp1 <- unique(temp1)
      temp1$qvalue <- as.numeric(temp1$qvalue)
      df1 <- temp1[order(temp1$qvalue, 
                         decreasing = F), ][c(1:myrank ), ]
    
      ## setting df2
    
      temp2 <- subset(result2, select = c("Description", "qvalue", 
                                             "Count"))
      temp2 <- na.omit(temp2)
      temp2 <- unique(temp2)
      df2 <- temp2[order(temp2$qvalue, 
                         decreasing = F), ][c(1:myrank ), ]
    
      ## setting bar order for df1
      df1_1 <- df1[!df1$Description %in% df2$Description, ]
      df1_2 <- df1[df1$Description %in% df2$Description, ]
    
      df1_1 <- df1_1[order(df1_1$Count, decreasing = FALSE), ]
      df1_1$xlab <- seq(1,nrow(df1_1),1)
      df1_1$Count <- df1_1$Count* -1
    
      df1_2 <- df1[df1$Description %in% df2$Description, ]
      df1_2 <- df1_2[order(df1_2$Count, decreasing = FALSE), ]
    
      df1_2$xlab <- seq(nrow(df1_1)+1, (nrow(df1_1) + nrow(df1_2)), 1)
      df1_2$Count <- df1_2$Count* -1
    
      ## setting bar order for df2
      df2_1 <- df2[!df2$Description %in% df1$Description, ]
      df2_2 <- df2[df2$Description %in% df1$Description, ]
    
      df2_1 <- df2_1[order(df2_1$Count, decreasing = FALSE), ]
    
      df2_1$xlab <- seq((myrank  + nrow(df2_2) + 1),
                        (myrank  + nrow(df2_2) + nrow(df2_1)),1)
      df2_2 <- df2_2[order(df2_2$Count, decreasing = FALSE), ]
      df2_2$xlab <- seq((myrank   + 1), (myrank  +  nrow(df2_2)), 1)
      df2_2$Count <- df2_2$Count
      mylist <- list(df1_1, df1_2, df2_1, df2_2)
      mydf <- dplyr::bind_rows(mylist)
      return(mydf)
    }
    
    library(ggplot2)
    ggplot()+ 
      geom_col(data = df1, aes(x = xlab, 
                                y =  Count,
                                fill = qvalue)) +
      geom_text(data = subset(df1, Count < 0), 
                aes(x = xlab, y = 0.1, label = Description),
                hjust = "outward", color = "#c02c38",
                size = 3) + 
      geom_text(data = subset(df1, Count > 0), 
                aes(x = xlab, y = -0.1, label = Description),
                hjust = "inward",  color = "#15559a",
                size = 3) +
      scale_fill_gradient(low = "#046582", 
                          high = "#fa1e0e") + 
      scale_y_continuous(breaks = seq(-5,20,5)) + 
      coord_flip() +
      theme_void() +
      theme(legend.position = c(0.96,0.3),
            axis.line.x = element_line(size = 1),
            axis.ticks.x = element_line(size = 1),
            axis.ticks.length=unit(0.06, "cm"),
            axis.text.x = element_text())
    
    image.png

    References

    Law, C. W., Y. Chen, W. Shi, and G. K. Smyth. 2014. “Voom: Precision Weights Unlock Linear Model Analysis Tools for RNA-Seq Read Counts.” Journal Article. Genome Biol 15 (2): R29. https://doi.org/10.1186/gb-2014-15-2-r29.

    Stark, R., M. Grzelak, and J. Hadfield. 2019. “RNA Sequencing: The Teenage Years.” Journal Article. Nat Rev Genet 20 (11): 631–56. https://doi.org/10.1038/s41576-019-0150-2.

    Bioconductor: RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR

    相关文章

      网友评论

        本文标题:04. EdegeR Limma DESeq

        本文链接:https://www.haomeiwen.com/subject/ylbudltx.html