TCGA-13.TCGA风险因子关联-三图联动

作者: 小洁忘了怎么分身 | 来源:发表于2020-02-11 13:51 被阅读0次

TCGA-13.TCGA风险因子关联-三图联动
TCGA+biomarker——风险因子关联图
TCGA的28篇教程-风险因子关联图-一个价值1000但是迟到的
Lasso法构建多基因预测模型-01
TCGA数据挖掘十：风险因子联动图
因子模型
纠正线上数据的方式
芯片探针序列的基因注释已经无需你自己亲自做了
香帅的北大金融学课123：资产定价理论
【实用】11个逆天技巧，看完变数据管理绝顶高手！

1.准备输入数据

生信星球公众号回复风险，即可获得输入数据。也可参照前面的教程自己获取。

options(stringsAsFactors = F)
load("risk.Rdata")
exprSet = expr[,group_list=="tumor"]
dim(exprSet) ## remove the nomral
#> [1] 673 515
head(phe)
#>                     ID event race age gender stage days age_group
#> 52.70.0   TCGA-05-4244     0 <NA>  70   male    iv    0     older
#> 52.70.0.2 TCGA-05-4249     0 <NA>  67   male    ib 1158     older
#> 52.70.0.3 TCGA-05-4250     1 <NA>  79 female  iiia  121     older
#> 58.73.0   TCGA-05-4382     0 <NA>  68   male    ib  607     older
#> 58.73.0.1 TCGA-05-4389     0 <NA>  70   male    ia 1369     older
#> 58.73.0.2 TCGA-05-4395     1 <NA>  76   male  iiib    0     older
#>                time
#> 52.70.0    0.000000
#> 52.70.0.2 38.600000
#> 52.70.0.3  4.033333
#> 58.73.0   20.233333
#> 58.73.0.1 45.633333
#> 58.73.0.2  0.000000
exprSet[1:4,1:4]
#>              TCGA-05-4244-01A-01T-1108-13 TCGA-05-4249-01A-01T-1108-13
#> hsa-let-7a-1                         3985                         8916
#> hsa-let-7a-2                         7947                        17800
#> hsa-let-7a-3                         4128                         9079
#> hsa-let-7b                           9756                        32960
#>              TCGA-05-4250-01A-01T-1108-13 TCGA-05-4382-01A-01T-1207-13
#> hsa-let-7a-1                         3472                         6332
#> hsa-let-7a-2                         6822                        12885
#> hsa-let-7a-3                         3459                         6558
#> hsa-let-7b                          14223                        16781
head(colnames(exprSet))
#> [1] "TCGA-05-4244-01A-01T-1108-13" "TCGA-05-4249-01A-01T-1108-13"
#> [3] "TCGA-05-4250-01A-01T-1108-13" "TCGA-05-4382-01A-01T-1207-13"
#> [5] "TCGA-05-4389-01A-01T-1207-13" "TCGA-05-4395-01A-01T-1207-13"
head(phe$ID)
#> [1] "TCGA-05-4244" "TCGA-05-4249" "TCGA-05-4250" "TCGA-05-4382"
#> [5] "TCGA-05-4389" "TCGA-05-4395"
## 必须保证生存资料和表达矩阵，两者一致
all(substring(colnames(exprSet),1,12)==phe$ID)
#> [1] TRUE

library(survival)
library(survminer)
library(ggplotify)
library(cowplot)
library(Hmisc)
options(scipen=200)
# http://dni-institute.in/blogs/cox-regression-interpret-result-and-predict/
library(pheatmap)

2. 挑选感兴趣的基因构建coxph模型

这里的基因是另一篇文章里的，只是举例子。

e=t(exprSet[c('hsa-mir-31','hsa-mir-196b','hsa-mir-766','hsa-mir-519a-1','hsa-mir-375','hsa-mir-187','hsa-mir-331','hsa-mir-101-1'),])
e=log2(e+1)
colnames(e)=c('miR31','miR196b','miR766','miR519a1','miR375','miR187','miR331','miR101')
dat=cbind(phe,e)

dat$gender=factor(dat$gender)
dat$stage=factor(dat$stage)

colnames(dat) 
#>  [1] "ID"        "event"     "race"      "age"       "gender"   
#>  [6] "stage"     "days"      "age_group" "time"      "miR31"    
#> [11] "miR196b"   "miR766"    "miR519a1"  "miR375"    "miR187"   
#> [16] "miR331"    "miR101"
s=Surv(time, event) ~ miR31+miR196b+miR766+miR519a1+miR375+miR187+miR331+miR101
model <- coxph(s, data = dat )
summary(model,data=dat)
#> Call:
#> coxph(formula = s, data = dat)
#> 
#>   n= 515, number of events= 125 
#> 
#>              coef exp(coef) se(coef)      z Pr(>|z|)
#> miR31     0.05104   1.05237  0.04002  1.275    0.202
#> miR196b   0.05617   1.05778  0.03621  1.551    0.121
#> miR766    0.13498   1.14451  0.10259  1.316    0.188
#> miR519a1 -0.02977   0.97067  0.04477 -0.665    0.506
#> miR375   -0.04004   0.96075  0.04976 -0.805    0.421
#> miR187   -0.04231   0.95858  0.04375 -0.967    0.334
#> miR331   -0.08455   0.91892  0.12715 -0.665    0.506
#> miR101   -0.12002   0.88690  0.09694 -1.238    0.216
#> 
#>          exp(coef) exp(-coef) lower .95 upper .95
#> miR31       1.0524     0.9502    0.9730     1.138
#> miR196b     1.0578     0.9454    0.9853     1.136
#> miR766      1.1445     0.8737    0.9360     1.399
#> miR519a1    0.9707     1.0302    0.8891     1.060
#> miR375      0.9607     1.0409    0.8715     1.059
#> miR187      0.9586     1.0432    0.8798     1.044
#> miR331      0.9189     1.0882    0.7162     1.179
#> miR101      0.8869     1.1275    0.7334     1.072
#> 
#> Concordance= 0.652  (se = 0.031 )
#> Likelihood ratio test= 17.99  on 8 df,   p=0.02
#> Wald test            = 18.86  on 8 df,   p=0.02
#> Score (logrank) test = 19.12  on 8 df,   p=0.01
options(scipen=1)
ggforest(model, data =dat, 
         main = "Hazard ratio", 
         cpositions = c(0.10, 0.22, 0.4), 
         fontsize = 1.0, 
         refLabel = "1", noDigits = 4)

3.自己预测自己

查看?predict.coxph的帮助文档会发现：type参数有不同的取值，其中
the expected number of events given the covariates and follow-up time ("expected").本来预测生存时间应该是用type="expected"，这里用了默认值，是因为文章中对应的图预测值取值范围是-1到1，只是为了保持一致。

#fp <- predict(model,dat,type="risk");boxplot(fp)
#fp <- predict(model,dat,type="expected") ;boxplot(fp)
#names(fp) = rownames(phe)
fp <- predict(model,dat) ;boxplot(fp)

4.三图联动

这里的难点在于三个图的横坐标顺序是一致的。本强迫症还想让三个图严格对齐，然而无功而返，由于热图拼图本来就比较难弄，导致patchwork、cowplot都搞不定，多次尝试后只能通过对grid.arrange的调整对齐一下，应该是有别的办法的，如有后续，将在简书中更新

fp_dat=data.frame(patientid=1:length(fp),fp=as.numeric(sort(fp)))
sur_dat=data.frame(patientid=1:length(fp),
                   time=phe[names(sort(fp)),'time'] ,
                   event=phe[names(sort(fp )),'event']  ) 
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),(ncol(dat)-7):ncol(dat)]
###第一个图----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+geom_point()+theme_bw();p1

#第二个图
p2=ggplot(sur_dat,aes(x=patientid,y=time))+geom_point(aes(col=event))+theme_bw();p2

#第三个图
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = F)
#拼图实现三图联动
plots = list(p1,p2,as.ggplot(as.grob(p3)))

library(gridExtra)
lay1 = rbind(c(rep(1,7),NA),
             c(rep(2,7)),
             c(rep(3,7))) #布局矩阵
#> Warning in rbind(c(rep(1, 7), NA), c(rep(2, 7)), c(rep(3, 7))): number of
#> columns of result is not a multiple of vector length (arg 2)
grid.arrange(grobs = plots, 
             layout_matrix = lay1,
             heigths = c(1, 2,3))