Loading data
ballgown 通过正则表达和自带函数可以实现RNAseq文件夹下所有/部分样本stringtie结果读取。
library(ballgown)
library(tidyverse)
file <- "path\\to\\RNAseq\\"
# load the sample information
pheno_data <- read.table(paste0(file, "geuvadis_phenodata.csv"), header = TRUE, sep =',')
# create a ballgown object
ballgown_obj <- ballgown(dataDir = paste0(file, "ballgown"), samplePattern = "", pData = pheno_data)
# gexpr(), texpr(), eexpr(), and iexpr()
gene_data <- gexpr(ballgown_obj)
# Load all attributes including gene name
trans_data <- texpr(ballgown_obj, 'all')
Exploratory Analysis
通过查看转录本数量、分布、结构和样本间重复性,对数据进行探索性分析,了解数据结构和内容。
转录本数量分布
counts = data.frame(table(trans_data[, "gene_id"]))
colnames(counts) <- c("gene_id", "numbers")
ggplot(data = counts) +
geom_histogram(aes(x = numbers)) +
labs(title = "Distribution of transcript count per gene") +
xlab(label = "Transcripts per gene") +
theme_bw()
转录本长度分布
ggplot() +
geom_histogram(aes(x = trans_data$length)) +
labs(title = "Distribution of transcript lengths") +
xlab(label = "Transcript length (bp)") +
theme_bw()
# Summarize statistics of FPKM values
# summary(trans_data[, length(trans_data)])
转录本在基因上的结构
可视化转录本或外显子在基因组上的位置和表达水平。
plotTranscripts(gene = 'NM_000489', gown = ballgown_obj, samples = 'ERR188044_chrX',
meas = 'FPKM', colorby = 'transcript',
main='transcripts from gene NM_000489\nsample ERR188044, FPKM')
处理低表达量
一般有两种方法处理处理低表达量:
-
set minimum non-zero FPKM
-
grabbing a copy of all data values,
-
coverting 0's to NA
-
calculating the minimum or all non NA values
-
min_nonzero = minimum
-
-
set min_nonzero = 1
数据重复性
选择技术重复的样本做散点图,一般来说重复性好的数据是按照对角线对称分割的。
min_nonzero = 1
fpkm = texpr(ballgown_obj, meas="FPKM")
colnames(fpkm) <- gsub("_chrX", "", pheno_data$ids)
ERR188044 <- log2(fpkm[, "ERR188044"] + min_nonzero)
ERR188104 <- log2(fpkm[, "ERR188104"] + min_nonzero)
ggplot() +
geom_hex(aes(x = ERR188044, y = ERR188104)) +
geom_abline(intercept = 0, slope = 1, size = 1, color = "red") +
xlab(label = "ERR188044") +
ylab(label = "ERR188104") +
theme_bw()
处理低表达
转录组数据存在大量低表达和0值,一般可以直接设定阈值过滤或者结合MatrixGenerics 包进行variance过滤。
# variance filter
# library(MatrixGenerics)
# ballgown_filtted <- subset(ballgown_obj, "rowVars(texpr(ballgown_obj)) >1", genomesubset=TRUE)
# or remove the lower transcripts with a threshold value
fpkm = data.frame(fpkm)
fpkm[ ,"Row_Sum"] <- rowSums(fpkm)
judge_sum <- fpkm[ ,"Row_Sum"] > 5
ballgown_filtted <- subset(ballgown_obj, "judge_sum", genomesubset=TRUE)
index = which(fpkm[ ,"Row_Sum"] > 5)
距离分析
热图
data_mat <- fpkm[index, -ncol(fpkm)]
pheatmap(mat = log10(data_mat + 1), show_rownames = FALSE)
共有转录本数量
library(UpSetR)
judge_data <- function(x){
ifelse(x == 0, 0, 1)
}
data_mat <- fpkm[index, -ncol(fpkm)]
upset_data <- data.frame(apply(data_mat, 2, judge_data))
col_names <- colnames(upset_data)
upset(upset_data, sets = col_names,
order.by = c("freq"),
main.bar.color = "#009688")
样本间相关性
library(corrplot)
data_mat <- fpkm[index, -ncol(fpkm)]
mat <- cor(data_mat)
corrplot(corr = mat, is.corr = FALSE, type = "upper",
diag = FALSE)
MDS Plot
data_mat <- fpkm[index, -ncol(fpkm)]
dist_mat <- dist(t(data_mat))
mds_data <- cmdscale(dist_mat)
# scale the plot data
mds_data <- as.data.frame(scale(mds_data))
colnames(mds_data) <- c("MDS 1", "MDS 2")
mds_data$name <- rownames(mds_data)
ggplot(data = mds_data) +
geom_point(aes(x = `MDS 1`, y = `MDS 2`), size = 3) +
geom_text(aes(x = `MDS 1` + 0.2, y =`MDS 2`,
label = name), size = 2) +
theme_bw()
PCA Analysis
data_mat <- fpkm[index, -ncol(fpkm)]
pca_fit <- prcomp(t(data_mat))
components <- summary(pca_fit )$importance[2,c(1,2)]
plot_data <- as.data.frame(pca_fit$x)
plot_data$name <- row.names(plot_data)
ggplot(data = plot_data) + geom_point(aes(x = PC1, y = PC2),
size = 2) +
ggrepel::geom_text_repel(aes(x = PC1, y = PC2, label = name)) +
xlab(label = paste0("PC 1 (",
round(components[1], 4)*100, "%)")) +
ylab(label = paste0("PC 2 (",
round(components[2], 4)*100, "%)")) +
theme_bw()
差异表达分析
用于差异表达的矩阵是不经过数据转换的,一般是原始矩阵或者是筛选过低表达基因的原始矩阵。
ballgown_filtted <- subset(ballgown_obj, "judge_sum", genomesubset=TRUE)
results_transcripts <- stattest(ballgown_filtted,
feature = "transcript",
covariate = "population",
getFC = TRUE,
meas = "FPKM")
# combine gene ids
results_transcripts <- data.frame(
geneNames = geneNames(ballgown_filtted),
geneIDs = geneIDs(ballgown_filtted),
results_transcripts)
attach(results_transcripts)
results_transcripts$threshold <- ifelse(pval < 0.05 & abs(log2(fc)) > 5,ifelse(log2(fc) > 5, 'up', 'down'), 'no')
detach()
ggplot(results_transcripts, aes(log2(fc), -log10(pval))) +
geom_point(aes(color = threshold)) +
scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
theme_bw()+
xlab(label = expression(log[2] (fold-change))) +
ylab(label = expression(-log[10] (p-value)))
Reference
Ballgown:https://www.bioconductor.org/packages/release/bioc/vignettes/ballgown/inst/doc/ballgown.html
网友评论