MaxQuant Results with R

作者: 吴十三和小可爱的札记 | 来源:发表于2020-11-27 22:30 被阅读0次

MaxQuant Results with R
R包：PTXQC蛋白质组学质控评估
CMOT中对跟踪结果results.mat的理解
CGBlendMode
OKR问题总结
我的2019OKR
Results
数据分析：质谱数据预处理
2、Stata入门---如何保证分析的可重复性
results 5

FILTER, Exploratory Analysis, Normalization(mediancenter , scale), Expression Analysis, Clustering and Profile Plots, the data is download form Proteomic and transcriptomic profiling of aerial organ development in Arabidopsis(Mergner et al. 2020).

6.1 Filter

Subset all rows where column Reverse, Potential contaminant, and Only identified by site matches blank.

library(data.table)

raw_prot <- fread("maxquant/proteinGroups.txt", 
                 stringsAsFactors = F, integer64 = 'double')

# set all keys
setkey(raw_prot, Reverse, 
       `Potential contaminant`, 
       `Only identified by site`)

valid_prot <- raw_prot[.('', '', '')]

6.2 Normalization

Reporter intensity 进行标准化。

# mediancenter normalization
mediancenter <- function(x) {
    get_median <- apply(x,  2,  median,  na.rm = T)
    x <- sweep(x,  2,  get_median ,  `-`)  +  median(x,  na.rm = T)
  return(x)
}

# with = F: restoring the "data.frame" mode
tmt_RI <- as.matrix(valid_prot[,grep('Reporter intensity [0-9]{1,2} DV_', colnames(valid_prot)), with=F])

col_names <- colnames(tmt_RI)

tmt_RI[tmt_RI==0] <- NA
tmt_RI_log2 <- log2(tmt_RI)
tmt_RI_norm <- mediancenter(tmt_RI_log2)
# head(tmt_flower_log2, 3)

# mapping the data back 
valid_prot[,eval(col_names):=as.data.table(tmt_RI_norm)]

6.3 Normalization Protein IDs

remove attach num(‘AT111.1’)
remove nonsense versions of reversed sequences(’REV__’)
remove contaminants (‘CON_’)
Unique the protein IDs: if duplicated, select the one with hte largest Razor + unique peptides value.

library(stringr)
valid_prot[,AGI_code:=gsub('\\.[0-9]*;', ';', `Protein IDs`)]

# clean Protein IDs
## transform AT1G01030.1 to AT1G01030
valid_prot[,AGI_code:=gsub('\\.[0-9]*$', '', AGI_code)]

valid_prot[,AGI_code:=gsub('REV__AT[0-9|C|M]G[0-9]*.[0-9];', '',  AGI_code)]

valid_prot[,AGI_code:=gsub('REV__AT[0-9|C|M]G[0-9]*.[0-9]', '',  AGI_code)]

valid_prot[,AGI_code:=gsub('CON__.*;', '', AGI_code)]
valid_prot[,AGI_code:=gsub('CON__.*', '', AGI_code)]

# Unique the protein IDs 
valid_prot[,AGI_code:=sapply(strsplit(valid_prot$AGI_code,";"), function(x) paste(unique(x), collapse = ";"))]

valid_prot[,'Number of genes':=str_count(valid_prot$AGI_code, pattern="AT")]

# if duplicated, select be largest one
valid_prot <- valid_prot[order(valid_prot$AGI_code, 
                         -abs(valid_prot$`Razor + unique peptides`)),]

valid_prot <- valid_prot[!duplicated(valid_prot$AGI_code),]

# fwrite(valid_prot, paste0("/path/to/", "valid_prot.txt"), sep = '\t', col.names = T, row.names = F)

6.4 Data Analysis and Visualization

可视化策略(Mergner et al. 2020)：火山图，韦恩图，热图，相关性热图，PCA，富集气泡图，准确性评估图，spatial proteomics data

# valid_prot <- fread(paste0(file, valid_prot.txt),  stringsAsFactors = F, integer64 = 'double')

intensity_matrix <-valid_prot[,c(154, grep("Reporter intensity [0-9] DV_FL", colnames(valid_prot))), with=F]
# colnames(valid_prot)
col_names <- c('AGI_cods', 'FL9', 'FL10', 'FL11', 
               'FL12', 'FL13', 'FL15')
colnames(intensity_matrix) <- col_names

# Fileter normalized data 

intnsity_matrix <- intensity_matrix[rowSums(intensity_matrix[,c(2:7)],  na.rm = TRUE) > 0]

## protein groups all
proteingroups_all <- nrow(intensity_matrix)

## unambiguous protein groups 
unambiguous_proteingroups <- nrow(intensity_matrix) - length(grep(";", intensity_matrix$AGI_cods))

6.5 Distribution of Score

Score of identification(The larger the Score , the more certain is the identification of a proteingroup. )

# Score plot 
library(ggplot2)
ggplot(data = valid_prot)  + 
  geom_point(aes(x = Score, 
                 y = `Reporter intensity 5 DV_FL`), 
             alpha = 0.5) +
  ylab(label = "Reporter intensity") + 
  theme_bw()

image.png

6.6 heat map

library(pheatmap)

plot_mat <- data.frame(intnsity_matrix[,-1])
plot_mat[is.na(plot_mat)] = 0

first <- colorRampPalette(c('#4575b4','#91bfdb'))(100)
second <- colorRampPalette(c('#e0f3f8','#fee090'))(100)
third <- colorRampPalette(c('#fc8d59','#d73027'))(25)

palette <- c(first, second, third)

pheatmap(mat = plot_mat, show_rownames = FALSE,
         color = palette, cluster_rows = FALSE)

image.png

6.7 Pearson correlation

library(corrplot)

cor_data <- intnsity_matrix[,-1]
cor_data[is.na(cor_data)]  <-  0

mat <- cor(cor_data)

# color palette
first <- colorRampPalette(c("black", "white"))(200)
second <- colorRampPalette(c("green", "black"))(100)
third <- colorRampPalette(c("black", "yellow"))(100)

palette <- c(first, second, third)

corrplot(corr = mat, is.corr = FALSE, type = "upper", 
         col = palette, cl.lim = c(0.85, 1), 
         diag = FALSE)

image.png

6.8 PCA

library(ggplot2)

PCA_data <- intnsity_matrix[,-1]
PCA_data[is.na(PCA_data)]  <-  0

PCA_FL <- prcomp(t(PCA_data))

components <- summary(PCA_FL)$importance[2,c(1,2)]

plot_data <- as.data.frame(PCA_FL$x)
plot_data$name <- row.names(plot_data)

ggplot(data = plot_data) + geom_point(aes(x = PC1, y = PC2)) +
  geom_text(aes(x = PC1, y = PC2 + 3, label = name)) +
  xlab(label = paste0("component 1 (", 
                      round(components[1], 4)*100, "%)")) + 
  ylab(label = paste0("component 2 (", 
                      round(components[2], 4)*100, "%)")) +
  theme_bw()

image.png

6.9 Differential intensity Analysis

limma + voom

6.10 GO enrichment analysis

library(clusterProfiler)
library(AnnotationHub)
# library(biomaRt) 是另一个数据库
# library("org.Hs.eg.db") 人类
ah <- AnnotationHub()

# check the database
#subset(ah, species == 'Arabidopsis thaliana' & rdataclass == 'OrgDb')

# make OrgDB
A_thaliana_DB <- ah[['AH84114']]

# 提取等分为前100
GO_data <- valid_prot[order(valid_prot$Score, 
                         decreasing = TRUE)][c(1:100), ]

# id 转换
SYMBOL <- mapIds(A_thaliana_DB, keys = GO_data$AGI_code, 
                 column= "ENTREZID", 
                 keytype = "TAIR")
# Go 富集
GO_FL <- enrichGO(SYMBOL, OrgDb = A_thaliana_DB, ont='ALL', 
                   pAdjustMethod = 'BH', pvalueCutoff = 0.05, 
                   qvalueCutoff = 0.2, keyType ='ENTREZID')
# head(GO_FL)

dotplot(GO_FL, showCategory=5)
barplot(GO_FL, showCategory=5)

image.png

6.11 蛋白质功能注释

It is recommended to use proteins found in the ‘Majority protein > IDs’ column for subsequent functional or enrichment analysis, as > they are protected against accidental hits to a protein group.

通过搜库对蛋白质进行鉴定后，接着就是对这些搜到的蛋白进行功能注释，这有助于了解蛋白的功能，从而解析样本相关表型，常用于功能注释的数据库有：GO、COG、KEGG、NR、Pfam、Swiss-Prot。

References

“Identification of Protein Clusters Predictive of Tumor Response in Rectal Cancer Patients Receiving Neoadjuvant Chemoradiotherapy.” n.d. Journal Article.

Mergner, J., M. Frejno, M. Messerer, D. Lang, P. Samaras, M. Wilhelm, K. F. X. Mayer, C. Schwechheimer, and B. Kuster. 2020. “Proteomic and Transcriptomic Profiling of Aerial Organ Development in Arabidopsis.” Journal Article. Sci Data 7 (1): 334. https://doi.org/10.1038/s41597-020-00678-w.

网友评论

质谱蛋白代谢

本文标题：MaxQuant Results with R

本文链接：https://www.haomeiwen.com/subject/okffwktx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

MaxQuant Results with R

6.1 Filter

6.2 Normalization

6.3 Normalization Protein IDs

6.4 Data Analysis and Visualization

6.5 Distribution of Score

6.6 heat map

6.7 Pearson correlation

6.8 PCA

6.9 Differential intensity Analysis

6.10 GO enrichment analysis

6.11 蛋白质功能注释

References

相关文章

MaxQuant Results with R

R包：PTXQC蛋白质组学质控评估

CMOT中对跟踪结果results.mat的理解

CGBlendMode

OKR问题总结

我的2019OKR

Results

数据分析：质谱数据预处理

2、Stata入门---如何保证分析的可重复性

results 5

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读

质谱蛋白代谢