1.准备输入数据
生信星球公众号回复风险,即可获得输入数据。也可参照前面的教程自己获取。
options(stringsAsFactors = F)
load("risk.Rdata")
exprSet = expr[,group_list=="tumor"]
dim(exprSet) ## remove the nomral
#> [1] 673 515
head(phe)
#> ID event race age gender stage days age_group
#> 52.70.0 TCGA-05-4244 0 <NA> 70 male iv 0 older
#> 52.70.0.2 TCGA-05-4249 0 <NA> 67 male ib 1158 older
#> 52.70.0.3 TCGA-05-4250 1 <NA> 79 female iiia 121 older
#> 58.73.0 TCGA-05-4382 0 <NA> 68 male ib 607 older
#> 58.73.0.1 TCGA-05-4389 0 <NA> 70 male ia 1369 older
#> 58.73.0.2 TCGA-05-4395 1 <NA> 76 male iiib 0 older
#> time
#> 52.70.0 0.000000
#> 52.70.0.2 38.600000
#> 52.70.0.3 4.033333
#> 58.73.0 20.233333
#> 58.73.0.1 45.633333
#> 58.73.0.2 0.000000
exprSet[1:4,1:4]
#> TCGA-05-4244-01A-01T-1108-13 TCGA-05-4249-01A-01T-1108-13
#> hsa-let-7a-1 3985 8916
#> hsa-let-7a-2 7947 17800
#> hsa-let-7a-3 4128 9079
#> hsa-let-7b 9756 32960
#> TCGA-05-4250-01A-01T-1108-13 TCGA-05-4382-01A-01T-1207-13
#> hsa-let-7a-1 3472 6332
#> hsa-let-7a-2 6822 12885
#> hsa-let-7a-3 3459 6558
#> hsa-let-7b 14223 16781
head(colnames(exprSet))
#> [1] "TCGA-05-4244-01A-01T-1108-13" "TCGA-05-4249-01A-01T-1108-13"
#> [3] "TCGA-05-4250-01A-01T-1108-13" "TCGA-05-4382-01A-01T-1207-13"
#> [5] "TCGA-05-4389-01A-01T-1207-13" "TCGA-05-4395-01A-01T-1207-13"
head(phe$ID)
#> [1] "TCGA-05-4244" "TCGA-05-4249" "TCGA-05-4250" "TCGA-05-4382"
#> [5] "TCGA-05-4389" "TCGA-05-4395"
## 必须保证生存资料和表达矩阵,两者一致
all(substring(colnames(exprSet),1,12)==phe$ID)
#> [1] TRUE
library(survival)
library(survminer)
library(ggplotify)
library(cowplot)
library(Hmisc)
options(scipen=200)
# http://dni-institute.in/blogs/cox-regression-interpret-result-and-predict/
library(pheatmap)
2. 挑选感兴趣的基因构建coxph模型
这里的基因是另一篇文章里的,只是举例子。
e=t(exprSet[c('hsa-mir-31','hsa-mir-196b','hsa-mir-766','hsa-mir-519a-1','hsa-mir-375','hsa-mir-187','hsa-mir-331','hsa-mir-101-1'),])
e=log2(e+1)
colnames(e)=c('miR31','miR196b','miR766','miR519a1','miR375','miR187','miR331','miR101')
dat=cbind(phe,e)
dat$gender=factor(dat$gender)
dat$stage=factor(dat$stage)
colnames(dat)
#> [1] "ID" "event" "race" "age" "gender"
#> [6] "stage" "days" "age_group" "time" "miR31"
#> [11] "miR196b" "miR766" "miR519a1" "miR375" "miR187"
#> [16] "miR331" "miR101"
s=Surv(time, event) ~ miR31+miR196b+miR766+miR519a1+miR375+miR187+miR331+miR101
model <- coxph(s, data = dat )
summary(model,data=dat)
#> Call:
#> coxph(formula = s, data = dat)
#>
#> n= 515, number of events= 125
#>
#> coef exp(coef) se(coef) z Pr(>|z|)
#> miR31 0.05104 1.05237 0.04002 1.275 0.202
#> miR196b 0.05617 1.05778 0.03621 1.551 0.121
#> miR766 0.13498 1.14451 0.10259 1.316 0.188
#> miR519a1 -0.02977 0.97067 0.04477 -0.665 0.506
#> miR375 -0.04004 0.96075 0.04976 -0.805 0.421
#> miR187 -0.04231 0.95858 0.04375 -0.967 0.334
#> miR331 -0.08455 0.91892 0.12715 -0.665 0.506
#> miR101 -0.12002 0.88690 0.09694 -1.238 0.216
#>
#> exp(coef) exp(-coef) lower .95 upper .95
#> miR31 1.0524 0.9502 0.9730 1.138
#> miR196b 1.0578 0.9454 0.9853 1.136
#> miR766 1.1445 0.8737 0.9360 1.399
#> miR519a1 0.9707 1.0302 0.8891 1.060
#> miR375 0.9607 1.0409 0.8715 1.059
#> miR187 0.9586 1.0432 0.8798 1.044
#> miR331 0.9189 1.0882 0.7162 1.179
#> miR101 0.8869 1.1275 0.7334 1.072
#>
#> Concordance= 0.652 (se = 0.031 )
#> Likelihood ratio test= 17.99 on 8 df, p=0.02
#> Wald test = 18.86 on 8 df, p=0.02
#> Score (logrank) test = 19.12 on 8 df, p=0.01
options(scipen=1)
ggforest(model, data =dat,
main = "Hazard ratio",
cpositions = c(0.10, 0.22, 0.4),
fontsize = 1.0,
refLabel = "1", noDigits = 4)
3.自己预测自己
查看?predict.coxph的帮助文档会发现:type参数有不同的取值,其中
the expected number of events given the covariates and follow-up time ("expected").本来预测生存时间应该是用type="expected",这里用了默认值,是因为文章中对应的图预测值取值范围是-1到1,只是为了保持一致。
#fp <- predict(model,dat,type="risk");boxplot(fp)
#fp <- predict(model,dat,type="expected") ;boxplot(fp)
#names(fp) = rownames(phe)
fp <- predict(model,dat) ;boxplot(fp)
4.三图联动
这里的难点在于三个图的横坐标顺序是一致的。本强迫症还想让三个图严格对齐,然而无功而返,由于热图拼图本来就比较难弄,导致patchwork、cowplot都搞不定,多次尝试后只能通过对grid.arrange的调整对齐一下,应该是有别的办法的,如有后续,将在简书中更新
fp_dat=data.frame(patientid=1:length(fp),fp=as.numeric(sort(fp)))
sur_dat=data.frame(patientid=1:length(fp),
time=phe[names(sort(fp)),'time'] ,
event=phe[names(sort(fp )),'event'] )
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),(ncol(dat)-7):ncol(dat)]
###第一个图----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+geom_point()+theme_bw();p1
#第二个图
p2=ggplot(sur_dat,aes(x=patientid,y=time))+geom_point(aes(col=event))+theme_bw();p2
#第三个图
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = F)
#拼图实现三图联动
plots = list(p1,p2,as.ggplot(as.grob(p3)))
library(gridExtra)
lay1 = rbind(c(rep(1,7),NA),
c(rep(2,7)),
c(rep(3,7))) #布局矩阵
#> Warning in rbind(c(rep(1, 7), NA), c(rep(2, 7)), c(rep(3, 7))): number of
#> columns of result is not a multiple of vector length (arg 2)
grid.arrange(grobs = plots,
layout_matrix = lay1,
heigths = c(1, 2,3))
从上向下三个图分别表示:
1.每个病人的预测值,按照从小到大排序
2.每个病人的生存时间,颜色区分生死
3.热图,挑出的基因在每个样本中的表达量
本图标志着TCGA系列完结,等我的目录敖~
网友评论