前言
对质谱数据做单变量的组间差异检验后,我们获得了检验的结果。常用展示结果的方法有图和表的方式,我们在文献中常用图来展示,而表可作为附表补充图。更多知识分享请到 https://zouhua.top/。
本文讲使用火山图,韦恩图和扩展火山图展示组间差异结果,最后再结合热图展示所有的组间差异蛋白的丰度分布情况。
Importing data
library(dplyr)
library(tibble)
library(convert)
library(ggplot2)
library(ggrepel)
library(data.table)
phen <- read.csv("phenotype_20210625.csv")
NC_PC_DEP <- fread("NC_PC_limma_Mass_LOG2Impute.csv")
NC_PT_DEP <- fread("NC_PT_limma_Mass_LOG2Impute.csv")
ExprSet <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")
subgrp <- c("NC", "PC", "PT")
grp.col <- c("#568875", "#73FAFC", "#EE853D")
Volcano Function
# function
volcanofun <- function(datset=NC_PC_DEP,
genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1"),
group_name=subgrp[1:2],
group_col=grp.col[1:2],
pval=0.05,
fc=1){
# datset=NC_PC_DEP
# genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1")
# group_name=subgrp[1:2]
# group_col=grp.col[1:2]
# pval=0.05
# fc=1
dat <- datset %>%
mutate(color=factor(Enrichment,
levels = c(group_name, "Nonsignif")))
# print(table(dat$color))
dat_status <- table(dat$color)
dat_status_number <- as.numeric(dat_status)
dat_status_name <- names(dat_status)
legend_label <- c(paste0(dat_status_name[1], " (", dat_status_number[1], ")"),
paste0(dat_status_name[2], " (", dat_status_number[2], ")"),
paste0("Nonsignif", " (", dat_status_number[3], ")"))
dat.signif <- subset(dat, adj.P.Val < pval & abs(logFC) > fc) %>%
filter(GeneID%in%genelist)
print(table(dat.signif$color))
group_col_new <- c(group_col, "#979797")
group_name_new <- levels(dat$color)
xlabel <- paste0("log2(", paste(group_name, collapse="/"), ")")
# Make a basic ggplot2 object with x-y values
pl <- ggplot(dat, aes(x = -logFC, y = -log10(adj.P.Val), color = color))+
geom_point(size = 2, alpha = 1, stroke = 1)+
scale_color_manual(name = NULL,
values = group_col_new,
labels = c(legend_label, "Nonsignif"))+
xlab(xlabel) +
ylab(expression(-log[10]("adjusted p-value")))+
geom_hline(yintercept=-log10(pval), alpha=.8, linetype=2, size=.7)+
geom_vline(xintercept=fc, alpha=.8, linetype=2, size=.7)+
geom_vline(xintercept=-fc, alpha=.8, linetype=2, size=.7)+
geom_text_repel(data = dat.signif,
aes(label = GeneID),
size = 4,
max.overlaps = getOption("ggrepel.max.overlaps", default = 80),
segment.linetype = 1,
segment.curvature = -1e-20,
box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines"),
arrow = arrow(length = unit(0.005, "npc")),
color = "black", # text color
bg.color = "white", # shadow color
bg.r = 0.15)+
annotate("text", x=min(dat$logFC), y=-log10(pval), label=pval, size=6, color="red")+
annotate("text", x=fc, y=0, label=fc, size=6, color="red")+
annotate("text", x=-fc, y=0, label=-fc, size=6, color="red")+
scale_y_continuous(trans = "log1p")+
guides(color=guide_legend(override.aes = list(size = 3)))+
theme_bw()+
theme(axis.title = element_text(color = "black", size = 12),
axis.text = element_text(color = "black", size = 10),
text = element_text(size = 8, color = "black", family="serif"),
panel.grid = element_blank(),
#legend.position = "right",
legend.position = c(.15, .1),
legend.key.height = unit(0.6,"cm"),
legend.text = element_text(face = "bold", color = "black", size = 8),
strip.text = element_text(face = "bold", size = 14))
return(pl)
}
Gene_boxplot <- function(datset=ExprSet,
genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1"),
group_name=subgrp[1:2],
group_col=grp.col[1:2]){
# datset=ExprSet
# genelist=c("FIBB","FIBA","FA5","EGF","OIT3","IBP2","GANAB","RELN","LYSC","CADH1")
# group_name=subgrp[1:2]
# group_col=grp.col[1:2]
pheno <- pData(datset) %>%
filter(SubGroup%in%group_name)
pheno$SubGroup <- factor(as.character(pheno$SubGroup), levels = group_name)
edata <- data.frame(exprs(datset)) %>%
dplyr::select(rownames(pheno)) %>%
rownames_to_column("GeneID") %>%
filter(GeneID%in%genelist) %>%
column_to_rownames("GeneID")
mdat <- pheno %>% dplyr::select(SubGroup) %>%
rownames_to_column("SampleID") %>%
inner_join(t(edata) %>% data.frame() %>% rownames_to_column("SampleID"), by = "SampleID") %>%
column_to_rownames("SampleID")
plotdata <- mdat %>% tidyr::gather(key="geneID", value="value", -SubGroup) %>%
mutate(SubGroup=factor(SubGroup, levels = group_name))
plotdata$geneID <- factor(plotdata$geneID, levels = genelist)
pl <- ggplot(plotdata, aes(x = SubGroup, y = value, fill= SubGroup))+
stat_boxplot(geom = "errorbar", width = 0.15,
position = position_dodge(0.4)) +
geom_boxplot(width = 0.4,
outlier.colour = "black",
outlier.shape = 21,
outlier.size = .5)+
scale_fill_manual(values = group_col)+
#scale_y_continuous(labels = scales::scientific)+
facet_wrap(facets = "geneID", scales = "free_y", nrow = 2)+
labs(x="", y="Relative Abundance (Mass Spectrometry)")+
guides(fill=F)+
theme_classic()+
theme(axis.title = element_text(color = "black", size = 12),
axis.text.x = element_text(color = "black", size = 10, hjust = .5, vjust = .5),
text = element_text(size = 8, color = "black", family="serif"),
panel.grid = element_blank(),
strip.text = element_text(face = "bold", size = 12))
return(pl)
}
NC_PC_volcano <- volcanofun(datset=NC_PC_DEP,
genelist=c("FIBB","FIBA","FA5","OIT3","IBP2","GANAB","RELN","LYSC","CADH1","EGF"),
group_name=subgrp[1:2],
group_col=grp.col[1:2])
NC_PC_volcano
ggsave("NC_PC_Mass_volcano.pdf", NC_PC_volcano, width = 8, height = 6)
NC_PC_boxplot <- Gene_boxplot(datset=ExprSet,
genelist=c("FIBB","FIBA","FA5","OIT3","IBP2","GANAB","RELN","LYSC","CADH1","EGF"),
group_name=subgrp[1:2],
group_col=grp.col[1:2])
NC_PC_boxplot
ggsave("NC_PC_Mass_SpecificDEP_boxplot.pdf", NC_PC_boxplot, width = 8, height = 6)
Venn plot for Overlap DEGs
get_DEG_List <- function(datset1=NC_PC_DEP,
datset2=NC_PT_DEP){
# datset1=NC_PC_DEP
# datset2=NC_PT_DEP
diff_gene1 <- datset1 %>% filter(Enrichment != "Nonsignif")
diff_gene2 <- datset2 %>% filter(Enrichment != "Nonsignif")
res <- list(diff1=diff_gene1$GeneID,
diff2=diff_gene2$GeneID)
return(res)
}
diff_gene_list <- get_DEG_List(datset1=NC_PC_DEP, datset2=NC_PT_DEP)
# eulerr
require(eulerr)
diff_venn <- euler(diff_gene_list, shape = "ellipse")
# pdf(file = "Mass_DEPs_Venn.pdf", width = 5, height = 4)
plot(diff_venn,
fills = alpha(grp.col[2:3], 0.5),
labels = c("NC vs PC", "NC vs PT"),
quantities = TRUE,
col = "black")
# dev.off()
quadrant plot
A quadrant plot shows the common or different Proteins between two comparisons.
-
1st quadrant: the Proteins up-regulate in both PC and PT
-
2nd quadrant: the Proteins up-regulate in PC but down-regulate in PT
-
3rd quadrant: the Proteins down-regulate in both PC and PT
-
4th quadrant: the Proteins down-regulate in PC but up-regulate in PT
quadrant_plot <- function(datset1=NC_PC_DEP,
datset2=NC_PT_DEP,
thresholdFC=3,
group_name=subgrp,
axis_len=12){
# datset1=NC_PC_DEP
# datset2=NC_PT_DEP
# thresholdFC=1
# group_name=subgrp
# axis_len=5
overlap_gene <- union(datset1$GeneID, datset2$GeneID)
dat_x <- datset1 %>% filter(GeneID%in%overlap_gene)
dat_y <- datset2 %>% filter(GeneID%in%overlap_gene)
# the Differential Genes
dat_x_signif <- subset(dat_x, Enrichment != "Nonsignif")
dat_y_signif <- subset(dat_y, Enrichment != "Nonsignif")
# enriched in NC and Disease in both
common_signif <- intersect(dat_x_signif$GeneID, dat_y_signif$GeneID)
# enriched in NC and Disease in each
dat_x_sig_only <- setdiff(dat_x_signif$GeneID, common_signif)
dat_y_sig_only <- setdiff(dat_y_signif$GeneID, common_signif)
non_signif <- setdiff(overlap_gene, c(common_signif, dat_x_sig_only, dat_y_sig_only))
gene_type_name <- c(paste(group_name[2:3], collapse = "/"),
paste(group_name[2], "only"),
paste(group_name[3], "only"),
"Nonsignif")
gene_type_df <- rbind(data.frame(GeneID=common_signif, group=gene_type_name[1]),
data.frame(GeneID=dat_x_sig_only, group=gene_type_name[2]),
data.frame(GeneID=dat_y_sig_only, group=gene_type_name[3]),
data.frame(GeneID=non_signif, group=gene_type_name[4]))
mdat <- inner_join(dat_x %>% dplyr::select(GeneID, logFC) %>% dplyr::rename(xvalue=logFC),
dat_y %>% dplyr::select(GeneID, logFC) %>% dplyr::rename(yvalue=logFC),
by = "GeneID") %>%
inner_join(gene_type_df, by="GeneID") %>%
mutate(group=factor(group, levels = rev(gene_type_name)))
print(table(mdat$group))
common_signif_gene <- mdat %>% filter(GeneID%in%common_signif)
common_signif_gene <- mdat %>% filter(GeneID%in%common_signif) %>%
mutate(GeneID_v2=ifelse(abs(xvalue) > thresholdFC | abs(yvalue) > thresholdFC, GeneID, NA))
common_signif_gene_v2 <- na.omit(common_signif_gene)
print(table(common_signif_gene_v2$group))
require(magrittr)
# constants
axis_begin <- -axis_len
axis_end <- axis_len
total_ticks <- 8
# point to plot
my_point <- data.frame(x=1, y=1)
# chart junk data
tick_frame <- data.frame(ticks = seq(axis_begin, axis_end, by=2), zero=0) %>%
subset(ticks != 0)
tick_frame <- data.frame(ticks = seq(axis_begin, axis_end, by=2), zero=0) %>%
subset(ticks != 0)
lab_frame <- data.frame(lab = seq(axis_begin, axis_end, 2), zero = 0) %>%
subset(lab != 0)
tick_sz <- (tail(lab_frame$lab, 1) - lab_frame$lab[1]) / 128
x_title <- paste(group_name[1], "vs", group_name[2])
y_title <- paste(group_name[1], "vs", group_name[3])
pl <- ggplot(mdat)+
geom_segment(x = 0, xend = 0,
y = lab_frame$lab[1], yend = tail(lab_frame$lab, 1),
size = 0.5) +
geom_segment(y = 0, yend = 0,
x = lab_frame$lab[1], xend = tail(lab_frame$lab, 1),
size = 0.5) +
geom_segment(data = tick_frame,
aes(x = ticks, xend = ticks,
y = zero, yend = zero + tick_sz)) +
geom_segment(data = tick_frame,
aes(x = zero, xend = zero + tick_sz,
y = ticks, yend = ticks)) +
geom_text(data=lab_frame, aes(x=lab, y=zero, label=lab),
family = 'Times', vjust=1.5) +
geom_text(data=lab_frame, aes(x=zero, y=lab, label=lab),
family = 'Times', hjust=1.5) +
annotate("text", x = axis_len-1, y = -.7, color = "black", size=3,
label = paste0("log2(", x_title, ")"))+
annotate("text", x = -.7, y = axis_len-1, color = "black", size=3, angle=90,
label = paste0("log2(", y_title, ")"))+
geom_point(aes(x = xvalue, y = yvalue, color=group), size = 1.5)+
geom_text_repel(data = common_signif_gene,
aes(x=xvalue, y=yvalue, label = GeneID_v2),
size = 5,
max.overlaps = getOption("ggrepel.max.overlaps", default = 80),
segment.linetype = 1,
segment.curvature = -1e-20,
box.padding = unit(0.35, "lines"),
point.padding = unit(0.3, "lines"),
arrow = arrow(length = unit(0.005, "npc")),
# color = "white", # text color
# bg.color = "grey30", # shadow color
bg.r = 0.15)+
scale_color_manual(values = c("#A6A6A6", "#7BBDE0", "#B67FD0", "#FDC361"))+
guides(color=guide_legend(title = NULL, keywidth=.9, keyheight=.9, linetype=2))+
theme_void()+
theme(panel.grid = element_blank(),
text = element_text(size = 8, color = "black", family="serif"),
legend.text=element_text(size=11, color = "black"),
legend.position = c(.7, .2),
legend.justification = c(0, 0),
legend.background = element_rect(linetype=2, color = "black", fill="white"))
return(pl)
}
quadrantpl <- quadrant_plot(datset1=NC_PC_DEP,
datset2=NC_PT_DEP,
thresholdFC=3,
group_name=subgrp,
axis_len=8)
quadrantpl
ggsave("Mass_DEPs_quadrant.pdf", quadrantpl, width = 6, height = 6)
Heatmap Of DEGs
heatFun <- function(datset1=NC_PC_DEP,
datset2=NC_PT_DEP,
thresholdFC=1,
ExprSet=ExprSet,
group_name=subgrp){
# datset1=NC_PC_DEP
# datset2=NC_PT_DEP
# thresholdFC=1
# ExprSet=ExprSet
# group_name=subgrp
diff_gene1 <- datset1 %>% filter(Enrichment != "Nonsignif") %>%
filter(abs(logFC) > thresholdFC)
diff_gene2 <- datset2 %>% filter(Enrichment != "Nonsignif") %>%
filter(abs(logFC) > thresholdFC)
union_gene <- Reduce(union, list(diff_gene1$GeneID, diff_gene2$GeneID))
pheno <- pData(ExprSet) %>% data.frame() %>%
rownames_to_column("SampleID") %>%
filter(SubGroup%in%group_name) %>%
mutate(SubGroup=factor(SubGroup, levels = group_name)) %>%
arrange(SubGroup) %>%
column_to_rownames("SampleID")
edata <- exprs(ExprSet) %>% data.frame() %>%
rownames_to_column("geneid") %>%
filter(geneid%in%union_gene) %>%
dplyr::select(c("geneid", rownames(pheno))) %>%
column_to_rownames("geneid")
# scale data: z-score
scale_rows <- function (x) {
m = apply(x, 1, mean, na.rm = T)
s = apply(x, 1, sd, na.rm = T)
return((x - m)/s)
}
edata_scaled <- t(scale_rows(edata))
require(circlize)
col_fun <- colorRamp2(c(round(range(edata_scaled)[1]), 0,
round(range(edata_scaled)[2])),
c("blue", "white", "red"))
# row split
dat_status <- table(pheno$SubGroup)
dat_status_number <- as.numeric(dat_status)
dat_status_name <- names(dat_status)
row_split <- c()
for (i in 1:length(dat_status_number)) {
row_split <- c(row_split, rep(i, dat_status_number[i]))
}
require(ComplexHeatmap)
pl <- Heatmap(
edata_scaled,
#col = col_fun,
cluster_rows = FALSE,
row_order = rownames(pheno),
show_column_names = FALSE,
show_row_names = FALSE,
row_names_gp = gpar(fontsize = 12),
row_names_side = "right",
row_dend_side = "left",
column_title = NULL,
heatmap_legend_param = list(
title = "Relative Abundance\nZscore",
title_position = "topcenter",
border = "black",
legend_height = unit(10, "cm"),
direction = "horizontal"),
row_split = row_split,
left_annotation = rowAnnotation(foo = anno_block(gp = gpar(fill = 2:4),
labels = group_name,
labels_gp = gpar(col = "black", fontsize = 12))),
column_km = 3
)
return(pl)
}
heatFun(datset1=NC_PC_DEP,
datset2=NC_PT_DEP,
thresholdFC=1,
ExprSet=ExprSet,
group_name=subgrp)
结果效果图类似如下截图
systemic information
sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)
Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggrepel_0.9.1.9999 SummarizedExperiment_1.20.0 GenomicRanges_1.42.0 GenomeInfoDb_1.26.4
[5] IRanges_2.24.1 S4Vectors_0.28.1 MatrixGenerics_1.2.1 matrixStats_0.58.0
[9] data.table_1.14.0 umap_0.2.7.0 vegan_2.5-6 lattice_0.20-41
[13] permute_0.9-5 Rtsne_0.15 convert_1.64.0 marray_1.68.0
[17] limma_3.46.0 Biobase_2.50.0 BiocGenerics_0.36.0 ggplot2_3.3.3
[21] tibble_3.1.0 dplyr_1.0.5
loaded via a namespace (and not attached):
[1] nlme_3.1-150 bitops_1.0-6 tools_4.0.2 backports_1.2.1 utf8_1.2.1
[6] R6_2.5.0 DBI_1.1.1 mgcv_1.8-34 colorspace_2.0-0 withr_2.4.1
[11] tidyselect_1.1.0 curl_4.3 compiler_4.0.2 DelayedArray_0.16.3 scales_1.1.1
[16] askpass_1.1 digest_0.6.27 foreign_0.8-81 rmarkdown_2.7 rio_0.5.16
[21] XVector_0.30.0 pkgconfig_2.0.3 htmltools_0.5.1.1 rlang_0.4.10 readxl_1.3.1
[26] generics_0.1.0 jsonlite_1.7.2 zip_2.1.1 car_3.0-10 RCurl_1.98-1.3
[31] magrittr_2.0.1 GenomeInfoDbData_1.2.4 Matrix_1.3-2 Rcpp_1.0.6 munsell_0.5.0
[36] fansi_0.4.2 abind_1.4-5 reticulate_1.18 lifecycle_1.0.0 stringi_1.4.6
[41] yaml_2.2.1 edgeR_3.32.1 carData_3.0-4 MASS_7.3-53.1 zlibbioc_1.36.0
[46] grid_4.0.2 forcats_0.5.0 crayon_1.4.1 haven_2.3.1 splines_4.0.2
[51] hms_1.0.0 locfit_1.5-9.4 knitr_1.31 pillar_1.6.0 ggpubr_0.4.0
[56] ggsignif_0.6.0 glue_1.4.2 evaluate_0.14 vctrs_0.3.7 cellranger_1.1.0
[61] gtable_0.3.0 openssl_1.4.3 purrr_0.3.4 tidyr_1.1.3 assertthat_0.2.1
[66] xfun_0.20 openxlsx_4.2.3 broom_0.7.6 RSpectra_0.16-0 rstatix_0.7.0
Reference
- Core functional nodes and sex-specific pathways in human ischaemic and dilated cardiomyopathy
网友评论