2024-08-07突变数据整理

作者: 小胡同学ime | 来源:发表于2024-08-06 11:28 被阅读0次

TCGA数据挖掘五：突变数据画图
把maf格式的somatic突变数据导入annovar去除人群频
R代码合并TCGA体细胞突变数据
新版TCGA的突变SNP数据下载和整理
使用oncotator做突变注释
综合性突变危害性预测软件
突变数据下载
突变过滤--构建实验室内部突变过滤数据库
TCGA突变数据挖掘：把样本整理为突变型或野生型
突变数据的处理及柱状图的使用

##############
#参考链接：[https://mp.weixin.qq.com/s/0VPAy8f9u5ol8ryFZbmk7Q]
BiocManager::install("PoisonAlien/TCGAmutations")
library(TCGAmutations)
library(maftools) 
library(dplyr)
proj='TCGA-LIHC'
laml = tcga_load(study = "LIHC") 
laml 

###############
#载入分组的信息Cluster.rda
#将两个表的样本ID统一

#>   Hugo_Symbol Chromosome Start_Position End_Position
#> 1        NMT2         10       15154934     15154934
#> 2    ARHGAP42         11      100845194    100845194
#> 3        RAG1         11       36597064     36597064
#> 4      NLRP14         11        7064166      7064166
library(stringr)
length(unique(str_sub(mut$Tumor_Sample_Barcode,1,16)))

# 以病人为中心，表达矩阵按病人ID去重复
#exprSet <- exp_id
#k = !duplicated(str_sub(colnames(exprSet),1,12));table(k)
#exprSet = exprSet[,k]
#调整meta的ID顺序与exprSet列名一致
#meta=meta[match(str_sub(colnames(exprSet),1,12),meta$ID),]
#identical(meta$ID,str_sub(colnames(exprSet),1,12))
#colnames(exprSet) <- str_sub(colnames(exprSet),1,16)
#k = colnames(exprSet) %in% unique(str_sub(mut$Tumor_Sample_Barcode,1,16));table(k)
#expm = exprSet[,k]

#重新赋值避免影响原始数据
maf_object<- laml
a=laml@data
#############整理分组信息
ex2 = read.csv("SASP_group.csv",
               row.names = 1, check.names = F)
# 选择 group 列并保留行名

Cluster <- ex2[, "group", drop = FALSE] # drop = FALSE提取一列时保证提取后还是数据框
table(Cluster$group)
Cluster$Cluster=ifelse(Cluster$group=="high MRGPI","high MRGPI",'low MRGPI')
Cluster$Cluster =factor(Cluster$Cluster,levels=c("high MRGPI",'low MRGPI'))
#Cluster <- Cluster[, "Cluster", drop = FALSE]
save(Cluster,file = "Cluster.Rda")
#确定分组信息的ID完全在突变数据里面，此时需要对突变患者编码进行截取前12位
Cluster=Cluster[rownames(Cluster)%in%substr(maf_object@data$Tumor_Sample_Barcode, 1, 12),,drop = FALSE] 
save(Cluster,file = "Cluster.Rda")

load("Cluster.Rda")
#group列转换成因子
Cluster$group =factor(Cluster$group,levels=c("high MRGPI",'low MRGPI'))
#删除cluster列，行名为患者ID，列为group，且为因子
Cluster=Cluster[,-2,drop=FALSE]
class(Cluster$group)
#”factor“
table(Cluster$group)
#high MRGPI  low MRGPI 
#178        179 
save(Cluster,file = "Cluster.Rda")

#设置分组信息
#手动读取分组Cluster.rda
#将行名改为列
Cluster=rownames_to_column(Cluster,var = "symbol")
#根据分组信息对患者ID进行重新排序
Cluster=Cluster[order(Cluster$group),]
#行名为空
rownames(Cluster)=NULL
#行名转换为患者ID
Cluster=column_to_rownames(Cluster,var="symbol")
save(Cluster,file = "Cluster.Rda") 

#载入分组的信息Cluster.rda
#将两个表的样本ID统一
#把突变样本ID转化为分组样本的患者ID
a$Tumor_Sample_Barcode = substring(a$Tumor_Sample_Barcode,1,12)
group=rownames_to_column(Cluster,var="sample")
colnames(group)[1]="Tumor_Sample_Barcode"
#把分组文件各自拆分
group_high=group[group$group=="high MRGPI",]#178个
group_low=group[group$group=="low MRGPI",]#179个

#提取cluster的maf文件
maf_1=a[a$Tumor_Sample_Barcode%in%group_high$Tumor_Sample_Barcode,]
maf_2=a[a$Tumor_Sample_Barcode%in%group_low$Tumor_Sample_Barcode,]

#构建两组的maf文件
maf.coad_high=read.maf(maf=maf_1)
maf.coad_low=read.maf(maf=maf_2)

# 3.2作图
#绘制ICD-high瀑布图
oncoplot(maf = maf.coad_high,
         top = 20,   #显示前10个的突变基因信息
         fontSize = 0.6,   #设置字体大小
         showTumorSampleBarcodes = F)  


#绘制ICD-low瀑布图
oncoplot(maf = maf.coad_low,
         top = 20,   #显示前30个的突变基因信息
         fontSize = 0.6,   #设置字体大小
         showTumorSampleBarcodes = F)

#maftools自带可视化函数plotmaf总结，可以比较分析统计maf文件的数据。
#if (as.numeric(dev.cur()) != 1) graphics.off()
plotmafSummary(maf = maf.coad_high, rmOutlier = TRUE,
               #showBarcodes = FALSE,
               addStat = 'median', dashboard = TRUE, titvRaw = FALSE)
plotmafSummary(maf = maf.coad_low, rmOutlier = TRUE,
               #showBarcodes = FALSE,
               addStat = 'median', dashboard = TRUE, titvRaw = FALSE)



###########3
#归纳每个基因的突变
getGeneSummary(laml)

lollipopPlot(maf = laml, gene = 'LIPG', 
             AACol = 'HGVSp_Short', showMutationRate = TRUE)
#指定基因的突变
g <- c("ADRA1A","TRPM8","AR","EPHX2","WEE1","PFKFB3","PIM3","ESRRA","RORC","LIPG","P2RY1","SFRP1","PLK3","AHCY","PPARD","ADCY1")
oncoplot(maf = laml,genes = g, fontSize = 0.7)