Loading data

ballgown 通过正则表达和自带函数可以实现RNAseq文件夹下所有/部分样本stringtie结果读取。

library(ballgown)
library(tidyverse)
file <- "path\\to\\RNAseq\\"
# load the sample information
pheno_data <- read.table(paste0(file, "geuvadis_phenodata.csv"), header = TRUE, sep =',')

# create a ballgown object
ballgown_obj <- ballgown(dataDir = paste0(file, "ballgown"),  samplePattern = "", pData = pheno_data)

# gexpr(), texpr(), eexpr(), and iexpr()
gene_data <- gexpr(ballgown_obj)

# Load all attributes including gene name
trans_data <-  texpr(ballgown_obj, 'all')

Exploratory Analysis

通过查看转录本数量、分布、结构和样本间重复性，对数据进行探索性分析，了解数据结构和内容。

转录本数量分布

counts = data.frame(table(trans_data[, "gene_id"]))
colnames(counts) <- c("gene_id", "numbers")
ggplot(data = counts) + 
 geom_histogram(aes(x = numbers)) + 
 labs(title = "Distribution of transcript count per gene") +
 xlab(label = "Transcripts per gene") +
 theme_bw()

转录本长度分布

ggplot() + 
 geom_histogram(aes(x = trans_data$length)) + 
 labs(title = "Distribution of transcript lengths") +
 xlab(label = "Transcript length (bp)") +
 theme_bw()

# Summarize statistics of FPKM values 
# summary(trans_data[, length(trans_data)])

转录本在基因上的结构

可视化转录本或外显子在基因组上的位置和表达水平。

plotTranscripts(gene = 'NM_000489', gown = ballgown_obj, samples = 'ERR188044_chrX', 
 meas = 'FPKM', colorby = 'transcript', 
 main='transcripts from gene NM_000489\nsample ERR188044, FPKM')

处理低表达量

一般有两种方法处理处理低表达量：

set minimum non-zero FPKM
- grabbing a copy of all data values,
- coverting 0's to NA
- calculating the minimum or all non NA values
- min_nonzero = minimum
set min_nonzero = 1

数据重复性

选择技术重复的样本做散点图，一般来说重复性好的数据是按照对角线对称分割的。

min_nonzero = 1
fpkm = texpr(ballgown_obj, meas="FPKM")
colnames(fpkm) <- gsub("_chrX", "", pheno_data$ids)

ERR188044 <- log2(fpkm[, "ERR188044"] + min_nonzero)
ERR188104 <- log2(fpkm[, "ERR188104"] + min_nonzero)

ggplot() + 
 geom_hex(aes(x = ERR188044, y = ERR188104)) + 
 geom_abline(intercept = 0, slope = 1, size = 1, color = "red") +
 xlab(label = "ERR188044") + 
 ylab(label = "ERR188104") + 
 theme_bw()

处理低表达

转录组数据存在大量低表达和0值，一般可以直接设定阈值过滤或者结合MatrixGenerics 包进行variance过滤。

# variance filter
# library(MatrixGenerics)
# ballgown_filtted <- subset(ballgown_obj, "rowVars(texpr(ballgown_obj)) >1", genomesubset=TRUE)

# or remove the lower transcripts with a threshold value
fpkm = data.frame(fpkm)
fpkm[ ,"Row_Sum"] <- rowSums(fpkm)

judge_sum <- fpkm[ ,"Row_Sum"] > 5
ballgown_filtted <- subset(ballgown_obj, "judge_sum", genomesubset=TRUE)

index = which(fpkm[ ,"Row_Sum"] > 5)

距离分析

热图

data_mat <-  fpkm[index, -ncol(fpkm)]

pheatmap(mat = log10(data_mat + 1), show_rownames = FALSE)

共有转录本数量

library(UpSetR)

judge_data <- function(x){
  ifelse(x == 0, 0, 1)
}

data_mat <- fpkm[index, -ncol(fpkm)]
upset_data <- data.frame(apply(data_mat, 2, judge_data))
col_names <- colnames(upset_data)


upset(upset_data, sets = col_names, 
      order.by = c("freq"),
      main.bar.color = "#009688")

样本间相关性

library(corrplot)
data_mat <- fpkm[index, -ncol(fpkm)]
mat <- cor(data_mat)

corrplot(corr = mat, is.corr = FALSE, type = "upper", 
         diag = FALSE)

MDS Plot

data_mat <- fpkm[index, -ncol(fpkm)]
dist_mat <- dist(t(data_mat))
mds_data <- cmdscale(dist_mat)

# scale the plot data
mds_data <- as.data.frame(scale(mds_data))

colnames(mds_data) <- c("MDS 1", "MDS 2")
mds_data$name <- rownames(mds_data)

ggplot(data = mds_data) + 
  geom_point(aes(x = `MDS 1`, y = `MDS 2`), size = 3) +
  geom_text(aes(x = `MDS 1` + 0.2, y =`MDS 2`, 
                label = name), size = 2) + 
  theme_bw()

PCA Analysis

data_mat <- fpkm[index, -ncol(fpkm)]
pca_fit <- prcomp(t(data_mat))

components <- summary(pca_fit )$importance[2,c(1,2)]
plot_data <- as.data.frame(pca_fit$x)
plot_data$name <- row.names(plot_data)

ggplot(data = plot_data) + geom_point(aes(x = PC1, y = PC2), 
                                      size = 2) +
  ggrepel::geom_text_repel(aes(x = PC1, y = PC2, label = name)) +
  xlab(label = paste0("PC 1 (", 
                      round(components[1], 4)*100, "%)")) + 
  ylab(label = paste0("PC 2 (", 
                      round(components[2], 4)*100, "%)")) +
  theme_bw()

差异表达分析

用于差异表达的矩阵是不经过数据转换的，一般是原始矩阵或者是筛选过低表达基因的原始矩阵。

ballgown_filtted <- subset(ballgown_obj, "judge_sum", genomesubset=TRUE)

results_transcripts <- stattest(ballgown_filtted,
                                feature = "transcript",
                                covariate = "population",
                                getFC = TRUE, 
                                meas = "FPKM")
# combine gene ids
results_transcripts <- data.frame(
  geneNames = geneNames(ballgown_filtted),
  geneIDs = geneIDs(ballgown_filtted), 
  results_transcripts)
  
attach(results_transcripts)
results_transcripts$threshold <- ifelse(pval < 0.05 & abs(log2(fc)) > 5,ifelse(log2(fc) > 5, 'up', 'down'), 'no')
detach()

ggplot(results_transcripts, aes(log2(fc), -log10(pval))) +
  geom_point(aes(color = threshold)) +
  scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
  theme_bw()+
  xlab(label = expression(log[2] (fold-change))) + 
  ylab(label = expression(-log[10] (p-value)))