题目来源:https://mp.weixin.qq.com/s/jzDD05M5AuhCkiavkoLiHg
- 使用TCGA数据库的数据对PTP4A3基因做生存分析
1.1 从UCSC Xena下载TCGA BRCA的表达矩阵HiSeqV2.gz
,临床信息BRCA_clinicalMatrix
,生存信息BRCA_survival.txt.gz
,读入R。
rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)
library(data.table)
##读入TCGA_BRCA表达信息
exprSet <- fread("HiSeqV2.gz",header = T,sep = '\t')
exprSet <- as.data.frame(exprSet)
rownames(exprSet) <- exprSet[,1]
exprSet <- exprSet[,-1]
## 从exprSet中提取PTP4A3的表达情况
dat <- exprSet["PTP4A3",]
##读入TCGA_BRCA生存信息
surdata <- read.table("BRCA_survival.txt.gz",header = T,sep = '\t')
rownames(surdata) <- surdata$sample
surdata <- surdata[,-1]
##读入TCGA_BRCA临床信息
pheno <- read.table("BRCA_clinicalMatrix",header = T,sep = '\t')
rownames(pheno) <- pheno$sampleID
pheno <- pheno[,-1]
1.2 数据清洗
1)病人数据去重
table(duplicated(surdata$X_PATIENT))
#FALSE TRUE
# 1097 139
choose_samples <- rownames(surdata[!duplicated(surdata$X_PATIENT),])
choose_samples[1:3]
length(choose_samples)
surdata <- surdata[choose_samples,]
pheno <- pheno[choose_samples,]
dat <- dat[,(colnames(dat) %in% choose_samples)]
choose_samples <- colnames(dat)
surdata <- surdata[choose_samples,]
pheno <- pheno[choose_samples,]
dat <- dat[,(colnames(dat) %in% choose_samples)]
2)选择Primary Tumor
样本
pheno[1:3,1:3]
table(pheno$sample_type)
# Metastatic Primary Tumor Solid Tissue Normal
# 7 1086 2
choose_samples <- rownames(pheno[pheno$sample_type=='Primary Tumor',])
surdata <- surdata[choose_samples,]
pheno <- pheno[choose_samples,]
dat <- dat[,(colnames(dat) %in% choose_samples)]
1.3 生存分析
参考:【生信技能树】TCGA的28篇教程- 对TCGA数据库的任意癌症中任意基因做生存分析
dat_bak <- dat
dat <- t(dat)
dat <- as.data.frame(dat)
dat$sampleID <- rownames(dat)
surdata$sampleID <- rownames(surdata)
surdata <- merge(surdata,dat,by='sampleID')
surdata$PTP4A3 <- as.numeric(surdata$PTP4A3)
surdata$g <- ifelse(surdata$PTP4A3 > median(surdata$PTP4A3),'high','low')
table(surdata$g)
#high low
# 543 543
library(survival)
table(surdata$OS)
# 0 1
#939 147
table(surdata$OS.time>0)
#FALSE TRUE
# 13 1072
my.surv <- Surv(surdata$OS.time,surdata$OS==1)
kmfit2 <- survfit(my.surv~surdata$g,data = surdata)
library("survminer")
ggsurvplot(kmfit2,conf.int =F, pval = T,risk.table =F, ncensor.plot = F)
BRCA_PTP4A3_survplot.png
p=0.91,结果不显著。按照文献里写的用三阴性乳腺癌样本分析。
colnames_num_tnbc <- grep('receptor_status',colnames(pheno))
colnames(pheno)[colnames_num_tnbc]
eph <- pheno[,colnames_num_tnbc[1:3]]
eph$tnbc_row_num <- apply(eph, 1, function(x) sum(x =='Negative')) ## 均为阴性的为三阴性乳腺癌
tnbc_samples <- rownames(pheno)[eph$tnbc_row_num == 3]
length(tnbc_samples)
#[1] 115 ##TCGA中tnbc病人只有115个
tnbc_surdata <- surdata[surdata$sampleID %in% tnbc_samples,]
tnbc.surv <- Surv(tnbc_surdata$OS.time,tnbc_surdata$OS==1)
tnbc.kmfit <- survfit(tnbc.surv~tnbc_surdata$g,data=tnbc_surdata)
ggsurvplot(tnbc.kmfit,conf.int = F,pval = T)
tnbc_PTP4A3_survplot.png
p=0.47。还是不显著,参考公众号【生信技能树】文章《生存分析就是一个任人打扮的小姑凉》继续折腾。
surv.cut <- surv_cutpoint(
surdata,
time = "OS.time",
event = "OS",
variables = "PTP4A3"
)
summary(surv.cut)
plot(surv.cut, "PTP4A3", palette = "npg")
cutpoint_PTP4A3.png
surv.cat <- surv_categorize(surv.cut)
surv.fit <- survfit(Surv(OS.time, OS) ~ PTP4A3,
data = surv.cat)
ggsurvplot(
surv.fit, # survfit object with calculated statistics.
risk.table = TRUE, # show risk table.
pval = TRUE, # show p-value of log-rank test.
conf.int = TRUE, # show confidence intervals for
# point estimaes of survival curves.
#xlim = c(0,3000), # present narrower X axis, but not affect
# survival estimates.
break.time.by = 1000, # break X axis in time intervals by 500.
risk.table.y.text.col = T, # colour risk table text annotations.
risk.table.y.text = FALSE # show bars instead of names in text annotations
# in legend of risk table
)
surv_plot.png
p=0.01!!!
再来分析一遍三阴性乳腺癌样本。
tnbc.surv.cut <- surv_cutpoint(
tnbc_surdata,
time = "OS.time",
event = "OS",
variables = "PTP4A3"
)
summary(tnbc.surv.cut)
plot(tnbc.surv.cut, "PTP4A3", palette = "npg")
cutpoint_tnbc_PTP4A3.png
tnbc.surv.cat <- surv_categorize(tnbc.surv.cut)
tnbc.surv.fit <- survfit(Surv(OS.time, OS) ~ PTP4A3,
data = tnbc.surv.cat)
ggsurvplot(
tnbc.surv.fit, # survfit object with calculated statistics.
risk.table = TRUE, # show risk table.
pval = TRUE, # show p-value of log-rank test.
conf.int = TRUE, # show confidence intervals for
# point estimaes of survival curves.
# xlim = c(0,3000), # present narrower X axis, but not affect
# survival estimates.
break.time.by = 1000, # break X axis in time intervals by 500.
risk.table.y.text.col = T, # colour risk table text annotations.
risk.table.y.text = FALSE # show bars instead of names in text annotations
# in legend of risk table
)
tnbc_surv_plot.png
p=0.078。大概数据量太少了吧(尬笑)
1.4 网页工具分析TCGA BRCA中PTP4A3基因的生存分析
1.4.1 oncoln(http://www.oncolnc.org)
1.4.2 kmplot(http://kmplot.com/analysis)
kmplot.png
1.4.3 GEPIA (http://gepia.cancer-pku.cn/detail.php?gene=PTP4A3)
GEPIA.png
1.4.4 UCSC Xena(https://xenabrowser.net/)
UCSC_Xena.png
网友评论