FILTER, Exploratory Analysis, Normalization(mediancenter , scale), Expression Analysis, Clustering and Profile Plots, the data is download form Proteomic and transcriptomic profiling of aerial organ development in Arabidopsis(Mergner et al. 2020).
6.1 Filter
Subset all rows where column Reverse, Potential contaminant
, and Only identified by site
matches blank.
library(data.table)
raw_prot <- fread("maxquant/proteinGroups.txt",
stringsAsFactors = F, integer64 = 'double')
# set all keys
setkey(raw_prot, Reverse,
`Potential contaminant`,
`Only identified by site`)
valid_prot <- raw_prot[.('', '', '')]
6.2 Normalization
Reporter intensity 进行标准化。
# mediancenter normalization
mediancenter <- function(x) {
get_median <- apply(x, 2, median, na.rm = T)
x <- sweep(x, 2, get_median , `-`) + median(x, na.rm = T)
return(x)
}
# with = F: restoring the "data.frame" mode
tmt_RI <- as.matrix(valid_prot[,grep('Reporter intensity [0-9]{1,2} DV_', colnames(valid_prot)), with=F])
col_names <- colnames(tmt_RI)
tmt_RI[tmt_RI==0] <- NA
tmt_RI_log2 <- log2(tmt_RI)
tmt_RI_norm <- mediancenter(tmt_RI_log2)
# head(tmt_flower_log2, 3)
# mapping the data back
valid_prot[,eval(col_names):=as.data.table(tmt_RI_norm)]
6.3 Normalization Protein IDs
- remove attach num(‘AT111.1’)
- remove nonsense versions of reversed sequences(’REV__’)
- remove contaminants (‘CON_’)
- Unique the protein IDs: if duplicated, select the one with hte largest
Razor + unique peptides
value.
library(stringr)
valid_prot[,AGI_code:=gsub('\\.[0-9]*;', ';', `Protein IDs`)]
# clean Protein IDs
## transform AT1G01030.1 to AT1G01030
valid_prot[,AGI_code:=gsub('\\.[0-9]*$', '', AGI_code)]
valid_prot[,AGI_code:=gsub('REV__AT[0-9|C|M]G[0-9]*.[0-9];', '', AGI_code)]
valid_prot[,AGI_code:=gsub('REV__AT[0-9|C|M]G[0-9]*.[0-9]', '', AGI_code)]
valid_prot[,AGI_code:=gsub('CON__.*;', '', AGI_code)]
valid_prot[,AGI_code:=gsub('CON__.*', '', AGI_code)]
# Unique the protein IDs
valid_prot[,AGI_code:=sapply(strsplit(valid_prot$AGI_code,";"), function(x) paste(unique(x), collapse = ";"))]
valid_prot[,'Number of genes':=str_count(valid_prot$AGI_code, pattern="AT")]
# if duplicated, select be largest one
valid_prot <- valid_prot[order(valid_prot$AGI_code,
-abs(valid_prot$`Razor + unique peptides`)),]
valid_prot <- valid_prot[!duplicated(valid_prot$AGI_code),]
# fwrite(valid_prot, paste0("/path/to/", "valid_prot.txt"), sep = '\t', col.names = T, row.names = F)
6.4 Data Analysis and Visualization
可视化策略(Mergner et al. 2020):火山图,韦恩图,热图,相关性热图,PCA,富集气泡图,准确性评估图,spatial proteomics data
# valid_prot <- fread(paste0(file, valid_prot.txt), stringsAsFactors = F, integer64 = 'double')
intensity_matrix <-valid_prot[,c(154, grep("Reporter intensity [0-9] DV_FL", colnames(valid_prot))), with=F]
# colnames(valid_prot)
col_names <- c('AGI_cods', 'FL9', 'FL10', 'FL11',
'FL12', 'FL13', 'FL15')
colnames(intensity_matrix) <- col_names
# Fileter normalized data
intnsity_matrix <- intensity_matrix[rowSums(intensity_matrix[,c(2:7)], na.rm = TRUE) > 0]
## protein groups all
proteingroups_all <- nrow(intensity_matrix)
## unambiguous protein groups
unambiguous_proteingroups <- nrow(intensity_matrix) - length(grep(";", intensity_matrix$AGI_cods))
6.5 Distribution of Score
Score of identification(The larger the Score , the more certain is the identification of a proteingroup. )
# Score plot
library(ggplot2)
ggplot(data = valid_prot) +
geom_point(aes(x = Score,
y = `Reporter intensity 5 DV_FL`),
alpha = 0.5) +
ylab(label = "Reporter intensity") +
theme_bw()
image.png
6.6 heat map
library(pheatmap)
plot_mat <- data.frame(intnsity_matrix[,-1])
plot_mat[is.na(plot_mat)] = 0
first <- colorRampPalette(c('#4575b4','#91bfdb'))(100)
second <- colorRampPalette(c('#e0f3f8','#fee090'))(100)
third <- colorRampPalette(c('#fc8d59','#d73027'))(25)
palette <- c(first, second, third)
pheatmap(mat = plot_mat, show_rownames = FALSE,
color = palette, cluster_rows = FALSE)
image.png
6.7 Pearson correlation
library(corrplot)
cor_data <- intnsity_matrix[,-1]
cor_data[is.na(cor_data)] <- 0
mat <- cor(cor_data)
# color palette
first <- colorRampPalette(c("black", "white"))(200)
second <- colorRampPalette(c("green", "black"))(100)
third <- colorRampPalette(c("black", "yellow"))(100)
palette <- c(first, second, third)
corrplot(corr = mat, is.corr = FALSE, type = "upper",
col = palette, cl.lim = c(0.85, 1),
diag = FALSE)
image.png
6.8 PCA
library(ggplot2)
PCA_data <- intnsity_matrix[,-1]
PCA_data[is.na(PCA_data)] <- 0
PCA_FL <- prcomp(t(PCA_data))
components <- summary(PCA_FL)$importance[2,c(1,2)]
plot_data <- as.data.frame(PCA_FL$x)
plot_data$name <- row.names(plot_data)
ggplot(data = plot_data) + geom_point(aes(x = PC1, y = PC2)) +
geom_text(aes(x = PC1, y = PC2 + 3, label = name)) +
xlab(label = paste0("component 1 (",
round(components[1], 4)*100, "%)")) +
ylab(label = paste0("component 2 (",
round(components[2], 4)*100, "%)")) +
theme_bw()
image.png
6.9 Differential intensity Analysis
limma + voom
6.10 GO enrichment analysis
library(clusterProfiler)
library(AnnotationHub)
# library(biomaRt) 是另一个数据库
# library("org.Hs.eg.db") 人类
ah <- AnnotationHub()
# check the database
#subset(ah, species == 'Arabidopsis thaliana' & rdataclass == 'OrgDb')
# make OrgDB
A_thaliana_DB <- ah[['AH84114']]
# 提取等分为前100
GO_data <- valid_prot[order(valid_prot$Score,
decreasing = TRUE)][c(1:100), ]
# id 转换
SYMBOL <- mapIds(A_thaliana_DB, keys = GO_data$AGI_code,
column= "ENTREZID",
keytype = "TAIR")
# Go 富集
GO_FL <- enrichGO(SYMBOL, OrgDb = A_thaliana_DB, ont='ALL',
pAdjustMethod = 'BH', pvalueCutoff = 0.05,
qvalueCutoff = 0.2, keyType ='ENTREZID')
# head(GO_FL)
dotplot(GO_FL, showCategory=5)
barplot(GO_FL, showCategory=5)
image.png
image.png
6.11 蛋白质功能注释
It is recommended to use proteins found in the ‘Majority protein > IDs’ column for subsequent functional or enrichment analysis, as > they are protected against accidental hits to a protein group.
通过搜库对蛋白质进行鉴定后,接着就是对这些搜到的蛋白进行功能注释,这有助于了解蛋白的功能,从而解析样本相关表型,常用于功能注释的数据库有:GO、COG、KEGG、NR、Pfam、Swiss-Prot。
References
“Identification of Protein Clusters Predictive of Tumor Response in Rectal Cancer Patients Receiving Neoadjuvant Chemoradiotherapy.” n.d. Journal Article.
Mergner, J., M. Frejno, M. Messerer, D. Lang, P. Samaras, M. Wilhelm, K. F. X. Mayer, C. Schwechheimer, and B. Kuster. 2020. “Proteomic and Transcriptomic Profiling of Aerial Organ Development in Arabidopsis.” Journal Article. Sci Data 7 (1): 334. https://doi.org/10.1038/s41597-020-00678-w.
网友评论