04. EdegeR Limma DESeq

作者: 吴十三和小可爱的札记 | 来源:发表于2021-05-12 19:54 被阅读0次

Introduction

除了利用ballgown流程，目前比较流行的方法还是通过序列比对和计数获得的计数矩阵，然后用广义线性模型去评估和处理这些复杂实验条件下的数据；常见的R语言工具包有edgeR, DESeq2, and limma+voom(Law et al. 2014 Stark, Grzelak, and Hadfield 2019).

值得注意的是：计数矩阵必须使用原始计数矩阵，便于评估和标准化。

EdgeR

1. load matirx and build DEGlist

library(edgeR)

# Load gene(/transcript) count matrix and labels
gene_count <- as.matrix(read.csv("RNAseq/gene_count_matrix.csv", row.names = "gene_id"))

pheno_data <- read.csv("RNAseq/geuvadis_phenodata.csv", header = TRUE)

# colnames(gene_count) <- rownames(pheno_data)
# head(pheno_data, 3)

# Check sample IDs are match to each other in orders
all(rownames(pheno_data) %in% colnames(gene_count))
#> [1] FALSE

all(pheno_data[,1] %in% colnames(gene_count))
#> [1] TRUE

# Create a DGEList
dge_list <- DGEList(counts = gene_count, group = pheno_data$group)

2. Removing genes that are lowly expressed

Plotting the distribution log-CPM values shows that a sizeable proportion of genes within each sample are either unexpressed or lowly-expressed with log-CPM values that are small or negative.

# Transformations from the raw-scale
log_cpm <- cpm(dge_list, log = TRUE, prior.count = 2)
# summary(lcpm)

# set group index
mygroup <- "sex"

# filtering uses CPM values  
keep_exprs <- filterByExpr(dge_list, group = pheno_data[ ,mygroup])
dge_clean <- dge_list[keep_exprs, , keep.lib.sizes=FALSE]

# Transformation
clean_lcpm <- cpm(dge_clean, log = TRUE, prior.count = 2)

# 
library(RColorBrewer)
nsamples <- ncol(dge_list)
col <- brewer.pal(nsamples, "Paired")
samplenames <- colnames(dge_list)

# set new plot plate
dev.new()
par(mfrow = c(1,2))
# density of raw  data
plot(density(log_cpm[,1]), col=col[1], main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
for (i in 2:nsamples){
  den <- density(log_cpm[,i])
  lines(den$x, den$y, col=col[i])
}
legend("topright", samplenames, text.col=col, bty="n")

# density of filtered  data
plot(density(clean_lcpm[,1]), col = col[1], main = '', xlab = '')
title(main="B. Filtered data", xlab="Log-cpm")
for (i in 2:nsamples){
  den <- density(clean_lcpm[,i])
  lines(den$x, den$y, col = col[i])
}
legend("topright", samplenames, text.col=col, bty="n")

3. Normalising gene expression distributions

dge_clean <- calcNormFactors(dge_clean, method = "TMM")

# dge_clean$samples$norm.factors
clean_lcpm <- cpm(dge_clean, log=TRUE)

par(mfrow=c(1,2))
boxplot(log_cpm, las=2,main="")
title(main="A. Example: Unnormalised data", ylab="Log-cpm")

boxplot(clean_lcpm, las=2, main="")
title(main="B. Example: Normalised data",ylab="Log-cpm")

image.png

4. Distances

heat map

library(pheatmap)
require(gridExtra)

p1 <- pheatmap(mat = log_cpm,
         show_rownames = FALSE,
         color = palette,
         main = "raw lcpm")

p2 <- pheatmap(mat = clean_lcpm,
         show_rownames = FALSE,
         color = palette,
         main = "clean lcpm")

# extract plot list
plot_list <- list(p1[[4]], p2[[4]])

grid.arrange(arrangeGrob(grobs= plot_list,ncol=2))

image.png

Pearson correlation

library(corrplot)
mat <- cor(clean_lcpm)

corrplot(corr = mat, is.corr = FALSE, type = "upper", 
         col = palette, 
         diag = FALSE)

image.png

MDS Plot

library(ggplot2)
dist_mat <- dist(t(clean_lcpm))
mds_data <- cmdscale(dist_mat)

# scale the plot data
mds_data <- as.data.frame(scale(mds_data))

colnames(mds_data) <- c("MDS 1", "MDS 2")
mds_data$name <- rownames(mds_data)

ggplot(data = mds_data) + 
  geom_point(aes(x = `MDS 1`, y = `MDS 2`), size = 3) +
  geom_text(aes(x = `MDS 1` + 0.12, y =`MDS 2`, 
                label = name), size = 2) + 
  theme_bw()

image.png

PCA Analysis

# library(ggplot2)
pca_fit <- prcomp(t(clean_lcpm))

components <- summary(pca_fit )$importance[2,c(1,2)]
plot_data <- as.data.frame(pca_fit$x)
plot_data$name <- row.names(plot_data)

ggplot(data = plot_data) + geom_point(aes(x = PC1, y = PC2), 
                                      size = 2) +
  geom_text(aes(x = PC1*1.2, y = PC2, label = name)) +
  xlab(label = paste0("PC 1 (", 
                      round(components[1], 4)*100, "%)")) + 
  ylab(label = paste0("PC 2 (", 
                      round(components[2], 4)*100, "%)")) +
  theme_bw()

image.png

5. Differential expression analysis

Removing heteroscedascity from count data

design <- model.matrix(~0 + factor(pheno_data[ ,mygroup]))
colnames(design) <- c("group1", "group2")
contrast_matrix <- makeContrasts(group1-group2, levels=design)

v <- voom(dge_clean, design, plot=TRUE)

image.png

Fitting linear models


vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contrast_matrix)
efit <- eBayes(vfit)

plotSA(efit, main="Final model: Mean-variance trend")

DE_gene <- topTable(efit, coef = 1, adjust="BH", n=Inf)

# head map
library(pheatmap)
first <- colorRampPalette(c('#999999','#ffffff'))(10)
second <- colorRampPalette(c('#99d8c9','#66c2a4'))(100)
third <- colorRampPalette(c('#fc8d59','#d53e4f'))(100)

palette <- c(first, second, third)

pheatmap(mat = DE_gene,
         show_rownames = FALSE,
         #color = palette,
         main = "raw lcpm")

# volcano plot
## adding threshold
attach(DE_gene)
DE_gene$threshold <- ifelse(`adj.P.Val` < 0.05 & abs(logFC) > 5,ifelse(logFC > 5, 'up', 'down'), 'no')
detach()

ggplot(DE_gene, aes(logFC, 
                          -log10(adj.P.Val))) +
  geom_point(aes(color = threshold)) +
  scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
  theme_bw()+
  xlab(label = "log(fold-change)") + 
  ylab(label = expression(-log[10] (adj_Pvalue)))

# head map
gene_names <- rownames(DE_gene)[1:100]

i <- which(rownames(v) %in% gene_names)

pheatmap(mat = clean_lcpm[i,],
         show_rownames = FALSE,

         main = "clean lcpm")

image.png

Stricter definition on significance

在某些时候，不仅仅需要用到校正p值阈值，还需要差异倍数的对数(lfc=1)也高于某个最小值来更为严格地定义显著性。

tfit <- treat(vfit, lfc=1)
dt <- decideTests(tfit)
DE_gene <- topTreat(efit, coef = 1, adjust="BH", n=Inf)

# adding threshold
attach(DE_gene)
DE_gene$threshold <- ifelse(`adj.P.Val` < 0.05 & abs(logFC) > 5,ifelse(logFC > 5, 'up', 'down'), 'no')
detach()

ggplot(DE_gene, aes(logFC, 
                          -log10(adj.P.Val))) +
  geom_point(aes(color = threshold)) +
  scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
  theme_bw()+
  xlab(label = "log(fold-change)") + 
  ylab(label = expression(-log[10] (adj_Pvalue)))

image.png

差异基因可视化2

DE_gene <- DE_gene[order(DE_gene$logFC, decreasing = FALSE), ]
DE_gene$xaixs <- seq(1, nrow(DE_gene), 1)

ggplot(DE_gene, aes(xaixs, logFC)) +
  geom_point(aes(color = threshold)) + 
  geom_hline(yintercept=c(2,-2),linetype=2,size=0.25)+
  geom_hline(yintercept = c(0),linetype=1,size=0.5) + 
  xlab('rank of differentially expressed genes')+ 
  theme_bw()

image.png

6. GO enrichment

1. ID trasformation

GO和KEGG 分析比较烦的一点就是ID转换，由于数据是人类测序结果，我们就以 org.Hs.eg.db 包为例子，看看数据转换的情况。

key types 是里面最需要小心的信息：
1. Ensemble id: ENSG开头
2. Entrez id: 纯数字
3. Symbol id: 基因名称
4. Refseq id: NG, NM, NP等

数据间映射关系(总的来说，ENTREZID是进行GO分析最好的ID类型)：

org.Hs.egACCNUM：将Entrez ID标识符映射到GenBank的登录号
org.Hs.egCHRLOC：获取映射到染色体位置的Entrez ID标识符
org.Hs.egENSEMBL：用于Entrez ID与EnsemblID之间的映射
org.Hs.egENSEMBLPROT：用于Entrez ID与Ensembl蛋白ID的映射
org.Hs.egENSEMBLTRANS：用于Entrez ID与Ensembl transcript编号
org.Hs.egENZYME：Entrez基因id和酶活性(EC)之间的图谱
org.Hs.egGENENAME：Entrez ID与基因名称之间的图谱
org.Hs.egGO：Entrez ID与基因本体论(GO) id之间的映射
org.Hs.egMAP：Entrez ID和细胞遗传学图谱/条带之间的映射
org.Hs.egOMIM：Map between Entrez Gene Identifiers and Mendelian Inheritance in Man (MIM) identifiers
org.Hs.egPATH：Entrez ID和KEGG通路标识符之间的映射
org.Hs.egREFSEQ：Entrez ID与RefSeq标识符之间的映射
org.Hs.egSYMBOL：Entrez ID和基因符号之间的映射
org.Hs.egUNIPROT：Map Uniprot accession numbers with Entrez Gene identifiers

2. Exercise

# RSQLite(v2.2.5) has some problems
options(connectionObserver = NULL)
library(org.Hs.eg.db)
library(clusterProfiler)

#
keytypes(org.Hs.eg.db)
temp_names <- rownames(DE_gene)[1:100]

# temp_names <- gsub('NR[0-9]*[|]', '', temp_names)
gene_names <- gsub('[|].*', '', temp_names)

# keytype =  org.Hs.eg(XXX)
## stand form
head(keys(org.Hs.eg.db, keytype = "SYMBOL"))
head(keys(org.Hs.eg.db, keytype = "REFSEQ"))

# ID transformation
gene_list <- mapIds(org.Hs.eg.db, keys = gene_names,
                 column= "ENTREZID",
                 keytype = "REFSEQ")

gene_list <- clusterProfiler::bitr(gene_names, fromType = "ENSEMBL" , toType = "ENTREZID", OrgDb = "org.Hs.eg.db", drop = TRUE)

go_all <- clusterProfiler::enrichGO(gene_list, OrgDb = org.Hs.eg.db,
               ont = 'ALL', pAdjustMethod = 'BH',
               pvalueCutoff = 0.2, qvalueCutoff = 0.5,
               keyType ='ENTREZID')

go_result <- go_all@result

write.table(x = go_result,
            file = "RNAseq/GO_result.txt",
            sep = "\t",
            col.names =  TRUE,
            row.names = FALSE)

3. Visualization

3.1 dot plot

# dot plot
go_result <- read.table(file = "RNAseq/GO_result.txt", 
                        sep = "\t",
                        header = TRUE, 
                        stringsAsFactors = F)

plot_data <- go_result[order(go_result$Count, decreasing = T),]
plot_data$Description <- forcats::fct_reorder(plot_data$Description, 
                                              plot_data$Count)

ggplot(data = plot_data[1:10, ]) + 
  geom_point(aes(x = Count, y = Description, size = Count)) + 
  theme_bw() + 
  scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 40)) + 
  xlab(label = NULL) + 
  ylab(label = NULL)

image.png

3.2 bar plot

result1 <- read.table(file = "RNAseq/GO_result.txt", 
                        sep = "\t",
                        header = TRUE, 
                        stringsAsFactors = F)

result2 <- read.table(file = "RNAseq/GO_result2.txt", 
                         sep = "\t",
                         header = TRUE, 
                         stringsAsFactors = F)

set_bar_plot <- function(result1, result2, myrank){
  myrank = as.numeric(myrank)
  ## setting df1
  temp1 <- subset(result1, select = c("Description", "qvalue", 
                                         "Count"))
  temp1 <- na.omit(temp1)
  temp1 <- unique(temp1)
  temp1$qvalue <- as.numeric(temp1$qvalue)
  df1 <- temp1[order(temp1$qvalue, 
                     decreasing = F), ][c(1:myrank ), ]

  ## setting df2

  temp2 <- subset(result2, select = c("Description", "qvalue", 
                                         "Count"))
  temp2 <- na.omit(temp2)
  temp2 <- unique(temp2)
  df2 <- temp2[order(temp2$qvalue, 
                     decreasing = F), ][c(1:myrank ), ]

  ## setting bar order for df1
  df1_1 <- df1[!df1$Description %in% df2$Description, ]
  df1_2 <- df1[df1$Description %in% df2$Description, ]

  df1_1 <- df1_1[order(df1_1$Count, decreasing = FALSE), ]
  df1_1$xlab <- seq(1,nrow(df1_1),1)
  df1_1$Count <- df1_1$Count* -1

  df1_2 <- df1[df1$Description %in% df2$Description, ]
  df1_2 <- df1_2[order(df1_2$Count, decreasing = FALSE), ]

  df1_2$xlab <- seq(nrow(df1_1)+1, (nrow(df1_1) + nrow(df1_2)), 1)
  df1_2$Count <- df1_2$Count* -1

  ## setting bar order for df2
  df2_1 <- df2[!df2$Description %in% df1$Description, ]
  df2_2 <- df2[df2$Description %in% df1$Description, ]

  df2_1 <- df2_1[order(df2_1$Count, decreasing = FALSE), ]

  df2_1$xlab <- seq((myrank  + nrow(df2_2) + 1),
                    (myrank  + nrow(df2_2) + nrow(df2_1)),1)
  df2_2 <- df2_2[order(df2_2$Count, decreasing = FALSE), ]
  df2_2$xlab <- seq((myrank   + 1), (myrank  +  nrow(df2_2)), 1)
  df2_2$Count <- df2_2$Count
  mylist <- list(df1_1, df1_2, df2_1, df2_2)
  mydf <- dplyr::bind_rows(mylist)
  return(mydf)
}

library(ggplot2)
ggplot()+ 
  geom_col(data = df1, aes(x = xlab, 
                            y =  Count,
                            fill = qvalue)) +
  geom_text(data = subset(df1, Count < 0), 
            aes(x = xlab, y = 0.1, label = Description),
            hjust = "outward", color = "#c02c38",
            size = 3) + 
  geom_text(data = subset(df1, Count > 0), 
            aes(x = xlab, y = -0.1, label = Description),
            hjust = "inward",  color = "#15559a",
            size = 3) +
  scale_fill_gradient(low = "#046582", 
                      high = "#fa1e0e") + 
  scale_y_continuous(breaks = seq(-5,20,5)) + 
  coord_flip() +
  theme_void() +
  theme(legend.position = c(0.96,0.3),
        axis.line.x = element_line(size = 1),
        axis.ticks.x = element_line(size = 1),
        axis.ticks.length=unit(0.06, "cm"),
        axis.text.x = element_text())

image.png

References

Law, C. W., Y. Chen, W. Shi, and G. K. Smyth. 2014. “Voom: Precision Weights Unlock Linear Model Analysis Tools for RNA-Seq Read Counts.” Journal Article. Genome Biol 15 (2): R29. https://doi.org/10.1186/gb-2014-15-2-r29.

Stark, R., M. Grzelak, and J. Hadfield. 2019. “RNA Sequencing: The Teenage Years.” Journal Article. Nat Rev Genet 20 (11): 631–56. https://doi.org/10.1038/s41576-019-0150-2.

Bioconductor: RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR

04. EdegeR Limma DESeq

Introduction

EdgeR

1. load matirx and build DEGlist

2. Removing genes that are lowly expressed

3. Normalising gene expression distributions

4. Distances

5. Differential expression analysis

6. GO enrichment

1. ID trasformation

2. Exercise

3. Visualization

3.2 bar plot

References

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读

生信分析

组学数据分析

转录组数据分析

走进转录组

RNA