美文网首页质谱蛋白代谢
MaxQuant Results with R

MaxQuant Results with R

作者: 吴十三和小可爱的札记 | 来源:发表于2020-11-27 22:30 被阅读0次

FILTER, Exploratory Analysis, Normalization(mediancenter , scale), Expression Analysis, Clustering and Profile Plots, the data is download form Proteomic and transcriptomic profiling of aerial organ development in Arabidopsis(Mergner et al. 2020).

6.1 Filter

Subset all rows where column Reverse, Potential contaminant, and Only identified by site matches blank.

library(data.table)

raw_prot <- fread("maxquant/proteinGroups.txt", 
                 stringsAsFactors = F, integer64 = 'double')

# set all keys
setkey(raw_prot, Reverse, 
       `Potential contaminant`, 
       `Only identified by site`)

valid_prot <- raw_prot[.('', '', '')]

6.2 Normalization

Reporter intensity 进行标准化。

# mediancenter normalization
mediancenter <- function(x) {
    get_median <- apply(x,  2,  median,  na.rm = T)
    x <- sweep(x,  2,  get_median ,  `-`)  +  median(x,  na.rm = T)
  return(x)
}

# with = F: restoring the "data.frame" mode
tmt_RI <- as.matrix(valid_prot[,grep('Reporter intensity [0-9]{1,2} DV_', colnames(valid_prot)), with=F])

col_names <- colnames(tmt_RI)

tmt_RI[tmt_RI==0] <- NA
tmt_RI_log2 <- log2(tmt_RI)
tmt_RI_norm <- mediancenter(tmt_RI_log2)
# head(tmt_flower_log2, 3)

# mapping the data back 
valid_prot[,eval(col_names):=as.data.table(tmt_RI_norm)]

6.3 Normalization Protein IDs

  1. remove attach num(‘AT111.1’)
  2. remove nonsense versions of reversed sequences(’REV__’)
  3. remove contaminants (‘CON_’)
  4. Unique the protein IDs: if duplicated, select the one with hte largest Razor + unique peptides value.
library(stringr)
valid_prot[,AGI_code:=gsub('\\.[0-9]*;', ';', `Protein IDs`)]

# clean Protein IDs
## transform AT1G01030.1 to AT1G01030
valid_prot[,AGI_code:=gsub('\\.[0-9]*$', '', AGI_code)]

valid_prot[,AGI_code:=gsub('REV__AT[0-9|C|M]G[0-9]*.[0-9];', '',  AGI_code)]

valid_prot[,AGI_code:=gsub('REV__AT[0-9|C|M]G[0-9]*.[0-9]', '',  AGI_code)]

valid_prot[,AGI_code:=gsub('CON__.*;', '', AGI_code)]
valid_prot[,AGI_code:=gsub('CON__.*', '', AGI_code)]

# Unique the protein IDs 
valid_prot[,AGI_code:=sapply(strsplit(valid_prot$AGI_code,";"), function(x) paste(unique(x), collapse = ";"))]

valid_prot[,'Number of genes':=str_count(valid_prot$AGI_code, pattern="AT")]

# if duplicated, select be largest one
valid_prot <- valid_prot[order(valid_prot$AGI_code, 
                         -abs(valid_prot$`Razor + unique peptides`)),]

valid_prot <- valid_prot[!duplicated(valid_prot$AGI_code),]

# fwrite(valid_prot, paste0("/path/to/", "valid_prot.txt"), sep = '\t', col.names = T, row.names = F)

6.4 Data Analysis and Visualization

可视化策略(Mergner et al. 2020):火山图,韦恩图,热图,相关性热图,PCA,富集气泡图,准确性评估图,spatial proteomics data

# valid_prot <- fread(paste0(file, valid_prot.txt),  stringsAsFactors = F, integer64 = 'double')

intensity_matrix <-valid_prot[,c(154, grep("Reporter intensity [0-9] DV_FL", colnames(valid_prot))), with=F]
# colnames(valid_prot)
col_names <- c('AGI_cods', 'FL9', 'FL10', 'FL11', 
               'FL12', 'FL13', 'FL15')
colnames(intensity_matrix) <- col_names

# Fileter normalized data 

intnsity_matrix <- intensity_matrix[rowSums(intensity_matrix[,c(2:7)],  na.rm = TRUE) > 0]

## protein groups all
proteingroups_all <- nrow(intensity_matrix)

## unambiguous protein groups 
unambiguous_proteingroups <- nrow(intensity_matrix) - length(grep(";", intensity_matrix$AGI_cods))

6.5 Distribution of Score

Score of identification(The larger the Score , the more certain is the identification of a proteingroup. )

# Score plot 
library(ggplot2)
ggplot(data = valid_prot)  + 
  geom_point(aes(x = Score, 
                 y = `Reporter intensity 5 DV_FL`), 
             alpha = 0.5) +
  ylab(label = "Reporter intensity") + 
  theme_bw()
image.png

6.6 heat map

library(pheatmap)

plot_mat <- data.frame(intnsity_matrix[,-1])
plot_mat[is.na(plot_mat)] = 0

first <- colorRampPalette(c('#4575b4','#91bfdb'))(100)
second <- colorRampPalette(c('#e0f3f8','#fee090'))(100)
third <- colorRampPalette(c('#fc8d59','#d73027'))(25)

palette <- c(first, second, third)

pheatmap(mat = plot_mat, show_rownames = FALSE,
         color = palette, cluster_rows = FALSE)
image.png

6.7 Pearson correlation

library(corrplot)

cor_data <- intnsity_matrix[,-1]
cor_data[is.na(cor_data)]  <-  0

mat <- cor(cor_data)

# color palette
first <- colorRampPalette(c("black", "white"))(200)
second <- colorRampPalette(c("green", "black"))(100)
third <- colorRampPalette(c("black", "yellow"))(100)

palette <- c(first, second, third)

corrplot(corr = mat, is.corr = FALSE, type = "upper", 
         col = palette, cl.lim = c(0.85, 1), 
         diag = FALSE)
image.png

6.8 PCA

library(ggplot2)

PCA_data <- intnsity_matrix[,-1]
PCA_data[is.na(PCA_data)]  <-  0

PCA_FL <- prcomp(t(PCA_data))

components <- summary(PCA_FL)$importance[2,c(1,2)]

plot_data <- as.data.frame(PCA_FL$x)
plot_data$name <- row.names(plot_data)

ggplot(data = plot_data) + geom_point(aes(x = PC1, y = PC2)) +
  geom_text(aes(x = PC1, y = PC2 + 3, label = name)) +
  xlab(label = paste0("component 1 (", 
                      round(components[1], 4)*100, "%)")) + 
  ylab(label = paste0("component 2 (", 
                      round(components[2], 4)*100, "%)")) +
  theme_bw()
image.png

6.9 Differential intensity Analysis

limma + voom

6.10 GO enrichment analysis

library(clusterProfiler)
library(AnnotationHub)
# library(biomaRt) 是另一个数据库
# library("org.Hs.eg.db") 人类
ah <- AnnotationHub()

# check the database
#subset(ah, species == 'Arabidopsis thaliana' & rdataclass == 'OrgDb')

# make OrgDB
A_thaliana_DB <- ah[['AH84114']]

# 提取等分为前100
GO_data <- valid_prot[order(valid_prot$Score, 
                         decreasing = TRUE)][c(1:100), ]

# id 转换
SYMBOL <- mapIds(A_thaliana_DB, keys = GO_data$AGI_code, 
                 column= "ENTREZID", 
                 keytype = "TAIR")
# Go 富集
GO_FL <- enrichGO(SYMBOL, OrgDb = A_thaliana_DB, ont='ALL', 
                   pAdjustMethod = 'BH', pvalueCutoff = 0.05, 
                   qvalueCutoff = 0.2, keyType ='ENTREZID')
# head(GO_FL)

dotplot(GO_FL, showCategory=5)
barplot(GO_FL, showCategory=5)
image.png image.png

6.11 蛋白质功能注释

It is recommended to use proteins found in the ‘Majority protein > IDs’ column for subsequent functional or enrichment analysis, as > they are protected against accidental hits to a protein group.

通过搜库对蛋白质进行鉴定后,接着就是对这些搜到的蛋白进行功能注释,这有助于了解蛋白的功能,从而解析样本相关表型,常用于功能注释的数据库有:GO、COG、KEGG、NR、Pfam、Swiss-Prot。

References

“Identification of Protein Clusters Predictive of Tumor Response in Rectal Cancer Patients Receiving Neoadjuvant Chemoradiotherapy.” n.d. Journal Article.

Mergner, J., M. Frejno, M. Messerer, D. Lang, P. Samaras, M. Wilhelm, K. F. X. Mayer, C. Schwechheimer, and B. Kuster. 2020. “Proteomic and Transcriptomic Profiling of Aerial Organ Development in Arabidopsis.” Journal Article. Sci Data 7 (1): 334. https://doi.org/10.1038/s41597-020-00678-w.

相关文章

网友评论

    本文标题:MaxQuant Results with R

    本文链接:https://www.haomeiwen.com/subject/okffwktx.html