美文网首页生物信息学R语言源码TCGA数据挖掘
TCGA-13.TCGA风险因子关联-三图联动

TCGA-13.TCGA风险因子关联-三图联动

作者: 小洁忘了怎么分身 | 来源:发表于2020-02-11 13:51 被阅读0次

    1.准备输入数据

    生信星球公众号回复风险,即可获得输入数据。也可参照前面的教程自己获取。

    options(stringsAsFactors = F)
    load("risk.Rdata")
    exprSet = expr[,group_list=="tumor"]
    dim(exprSet) ## remove the nomral
    #> [1] 673 515
    head(phe)
    #>                     ID event race age gender stage days age_group
    #> 52.70.0   TCGA-05-4244     0 <NA>  70   male    iv    0     older
    #> 52.70.0.2 TCGA-05-4249     0 <NA>  67   male    ib 1158     older
    #> 52.70.0.3 TCGA-05-4250     1 <NA>  79 female  iiia  121     older
    #> 58.73.0   TCGA-05-4382     0 <NA>  68   male    ib  607     older
    #> 58.73.0.1 TCGA-05-4389     0 <NA>  70   male    ia 1369     older
    #> 58.73.0.2 TCGA-05-4395     1 <NA>  76   male  iiib    0     older
    #>                time
    #> 52.70.0    0.000000
    #> 52.70.0.2 38.600000
    #> 52.70.0.3  4.033333
    #> 58.73.0   20.233333
    #> 58.73.0.1 45.633333
    #> 58.73.0.2  0.000000
    exprSet[1:4,1:4]
    #>              TCGA-05-4244-01A-01T-1108-13 TCGA-05-4249-01A-01T-1108-13
    #> hsa-let-7a-1                         3985                         8916
    #> hsa-let-7a-2                         7947                        17800
    #> hsa-let-7a-3                         4128                         9079
    #> hsa-let-7b                           9756                        32960
    #>              TCGA-05-4250-01A-01T-1108-13 TCGA-05-4382-01A-01T-1207-13
    #> hsa-let-7a-1                         3472                         6332
    #> hsa-let-7a-2                         6822                        12885
    #> hsa-let-7a-3                         3459                         6558
    #> hsa-let-7b                          14223                        16781
    head(colnames(exprSet))
    #> [1] "TCGA-05-4244-01A-01T-1108-13" "TCGA-05-4249-01A-01T-1108-13"
    #> [3] "TCGA-05-4250-01A-01T-1108-13" "TCGA-05-4382-01A-01T-1207-13"
    #> [5] "TCGA-05-4389-01A-01T-1207-13" "TCGA-05-4395-01A-01T-1207-13"
    head(phe$ID)
    #> [1] "TCGA-05-4244" "TCGA-05-4249" "TCGA-05-4250" "TCGA-05-4382"
    #> [5] "TCGA-05-4389" "TCGA-05-4395"
    ## 必须保证生存资料和表达矩阵,两者一致
    all(substring(colnames(exprSet),1,12)==phe$ID)
    #> [1] TRUE
    
    library(survival)
    library(survminer)
    library(ggplotify)
    library(cowplot)
    library(Hmisc)
    options(scipen=200)
    # http://dni-institute.in/blogs/cox-regression-interpret-result-and-predict/
    library(pheatmap)
    

    2. 挑选感兴趣的基因构建coxph模型

    这里的基因是另一篇文章里的,只是举例子。

    e=t(exprSet[c('hsa-mir-31','hsa-mir-196b','hsa-mir-766','hsa-mir-519a-1','hsa-mir-375','hsa-mir-187','hsa-mir-331','hsa-mir-101-1'),])
    e=log2(e+1)
    colnames(e)=c('miR31','miR196b','miR766','miR519a1','miR375','miR187','miR331','miR101')
    dat=cbind(phe,e)
    
    dat$gender=factor(dat$gender)
    dat$stage=factor(dat$stage)
    
    colnames(dat) 
    #>  [1] "ID"        "event"     "race"      "age"       "gender"   
    #>  [6] "stage"     "days"      "age_group" "time"      "miR31"    
    #> [11] "miR196b"   "miR766"    "miR519a1"  "miR375"    "miR187"   
    #> [16] "miR331"    "miR101"
    s=Surv(time, event) ~ miR31+miR196b+miR766+miR519a1+miR375+miR187+miR331+miR101
    model <- coxph(s, data = dat )
    summary(model,data=dat)
    #> Call:
    #> coxph(formula = s, data = dat)
    #> 
    #>   n= 515, number of events= 125 
    #> 
    #>              coef exp(coef) se(coef)      z Pr(>|z|)
    #> miR31     0.05104   1.05237  0.04002  1.275    0.202
    #> miR196b   0.05617   1.05778  0.03621  1.551    0.121
    #> miR766    0.13498   1.14451  0.10259  1.316    0.188
    #> miR519a1 -0.02977   0.97067  0.04477 -0.665    0.506
    #> miR375   -0.04004   0.96075  0.04976 -0.805    0.421
    #> miR187   -0.04231   0.95858  0.04375 -0.967    0.334
    #> miR331   -0.08455   0.91892  0.12715 -0.665    0.506
    #> miR101   -0.12002   0.88690  0.09694 -1.238    0.216
    #> 
    #>          exp(coef) exp(-coef) lower .95 upper .95
    #> miR31       1.0524     0.9502    0.9730     1.138
    #> miR196b     1.0578     0.9454    0.9853     1.136
    #> miR766      1.1445     0.8737    0.9360     1.399
    #> miR519a1    0.9707     1.0302    0.8891     1.060
    #> miR375      0.9607     1.0409    0.8715     1.059
    #> miR187      0.9586     1.0432    0.8798     1.044
    #> miR331      0.9189     1.0882    0.7162     1.179
    #> miR101      0.8869     1.1275    0.7334     1.072
    #> 
    #> Concordance= 0.652  (se = 0.031 )
    #> Likelihood ratio test= 17.99  on 8 df,   p=0.02
    #> Wald test            = 18.86  on 8 df,   p=0.02
    #> Score (logrank) test = 19.12  on 8 df,   p=0.01
    options(scipen=1)
    ggforest(model, data =dat, 
             main = "Hazard ratio", 
             cpositions = c(0.10, 0.22, 0.4), 
             fontsize = 1.0, 
             refLabel = "1", noDigits = 4)
    

    3.自己预测自己

    查看?predict.coxph的帮助文档会发现:type参数有不同的取值,其中
    the expected number of events given the covariates and follow-up time ("expected").本来预测生存时间应该是用type="expected",这里用了默认值,是因为文章中对应的图预测值取值范围是-1到1,只是为了保持一致。

    #fp <- predict(model,dat,type="risk");boxplot(fp)
    #fp <- predict(model,dat,type="expected") ;boxplot(fp)
    #names(fp) = rownames(phe)
    fp <- predict(model,dat) ;boxplot(fp)
    

    4.三图联动

    这里的难点在于三个图的横坐标顺序是一致的。本强迫症还想让三个图严格对齐,然而无功而返,由于热图拼图本来就比较难弄,导致patchwork、cowplot都搞不定,多次尝试后只能通过对grid.arrange的调整对齐一下,应该是有别的办法的,如有后续,将在简书中更新

    fp_dat=data.frame(patientid=1:length(fp),fp=as.numeric(sort(fp)))
    sur_dat=data.frame(patientid=1:length(fp),
                       time=phe[names(sort(fp)),'time'] ,
                       event=phe[names(sort(fp )),'event']  ) 
    sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
    sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
    exp_dat=dat[names(sort(fp)),(ncol(dat)-7):ncol(dat)]
    ###第一个图----
    p1=ggplot(fp_dat,aes(x=patientid,y=fp))+geom_point()+theme_bw();p1
    
    #第二个图
    p2=ggplot(sur_dat,aes(x=patientid,y=time))+geom_point(aes(col=event))+theme_bw();p2
    
    #第三个图
    mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
    tmp=t(scale(exp_dat))
    tmp[tmp > 1] = 1
    tmp[tmp < -1] = -1
    #p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
    p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = F)
    #拼图实现三图联动
    plots = list(p1,p2,as.ggplot(as.grob(p3)))
    
    library(gridExtra)
    lay1 = rbind(c(rep(1,7),NA),
                 c(rep(2,7)),
                 c(rep(3,7))) #布局矩阵
    #> Warning in rbind(c(rep(1, 7), NA), c(rep(2, 7)), c(rep(3, 7))): number of
    #> columns of result is not a multiple of vector length (arg 2)
    grid.arrange(grobs = plots, 
                 layout_matrix = lay1,
                 heigths = c(1, 2,3))
    

    从上向下三个图分别表示:

    1.每个病人的预测值,按照从小到大排序
    2.每个病人的生存时间,颜色区分生死
    3.热图,挑出的基因在每个样本中的表达量

    本图标志着TCGA系列完结,等我的目录敖~

    相关文章

      网友评论

        本文标题:TCGA-13.TCGA风险因子关联-三图联动

        本文链接:https://www.haomeiwen.com/subject/kwjvxhtx.html