美文网首页科研信息学实例
非肿瘤疾病预测模型

非肿瘤疾病预测模型

作者: 养猪场小老板 | 来源:发表于2020-07-14 18:18 被阅读0次

    PMID: 30237698 本文的例文

    (一)课程介绍

    image.png
    image.png
    image.png
    image.png

    (二)例文思路

    注意顺序

    (三)R和Rstudio软件的下载和安装(略)

    尽量用R

    (四)临床资料数据整理

    image.png

    (五)lasso回归筛选变量

    目的: 防止模型过度拟合
    
    代码如下
    #install.packages("glmnet")
    
    library(glmnet)
    
    setwd("C:\\Users\\scikuangren\\Desktop\\logistics\\1_lasso") #需要改为自己工作目录
    mydata<-read.table("input.txt",header=T,sep="\t") #需要修改
    #View(mydata)
    v1<-as.matrix(mydata[,c(3:7)]) #需要修改,选择因素
    v2 <-mydata[,2]#结局
    myfit <- glmnet(v1, v2, family = "binomial")
    pdf("lambda.pdf")
    plot(myfit, xvar = "lambda", label = TRUE)
    dev.off()
    
    myfit2 <- cv.glmnet(v1, v2, family="binomial")
    pdf("min.pdf")
    plot(myfit2)
    abline(v=log(c(myfit2$lambda.min,myfit2$lambda.1se)),lty="dashed")
    dev.off()
    
    myfit2$lambda.min#最小lambda
    ##下面求出因素的项目是什么
    coe <- coef(myfit, s = myfit2$lambda.min)
    act_index <- which(coe != 0)
    act_coe <- coe[act_index]
    row.names(coe)[act_index]
    [1] "(Intercept)"         "Use_of_NSAIDs"       "Number_of_questions"
    [4] "Education_level"     "Distance"           
    
    ##纳入的因素多的话,可以最后只纳入lasso回归的项目,如果比较少,可以全部纳入,需要考虑临床意义和统计学结果。以上有四个
    
    结果如下
    image.png
    image.png

    (六)基于lasso回归筛选出的变量进行logistic回归分析或者有的论文直接将其认为重要的变量纳入logistics回归

    > library(rms)#加载包
    > non_tumor<-read.table("input.txt",header=T,sep="\t")
    > #将因素转为因子,为列线图准备
    > non_tumor$Use_of_GC<-factor(non_tumor$Use_of_GC,labels=c("No","Yes"))
    > non_tumor$Use_of_NSAIDs<-factor(non_tumor$Use_of_NSAIDs,labels=c("No","Yes"))
    > non_tumor$Number_of_questions<-factor(non_tumor$Number_of_questions,labels=c("0","1",">=2"))
    > non_tumor$Education_level<-factor(non_tumor$Education_level,labels=c("Primary","Secondary","Higher"))
    > non_tumor$Distance<-factor(non_tumor$Distance,labels=c("<3km",">=3km"))
    #加载数据
    > ddist <- datadist(non_tumor)
    > options(datadist="ddist") 
    #构建多因素logistic回归
    #Status为因变量结局事件,其余为自变量
    #多因素“+”
    > mylog<- glm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,family=binomial(link = "logit"),data = non_tumor)  
    > summary(mylog)
    
    Call:
    glm(formula = Status ~ Use_of_GC + Use_of_NSAIDs + Number_of_questions + 
        Education_level + Distance, family = binomial(link = "logit"), 
        data = non_tumor)
    
    Deviance Residuals: 
        Min       1Q   Median       3Q      Max  
    -2.6280  -0.5903  -0.4371   0.7981   2.1889  
    
    Coefficients:
                             Estimate Std. Error z value Pr(>|z|)    
    (Intercept)              -2.23991    0.19213 -11.659  < 2e-16 ***
    Use_of_GCYes             -0.06019    0.19894  -0.303  0.76221    
    Use_of_NSAIDsYes          0.64123    0.23777   2.697  0.00700 ** 
    Number_of_questions1      2.57943    0.20398  12.645  < 2e-16 ***
    Number_of_questions>=2   -0.55084    1.14881  -0.479  0.63159    
    Education_levelSecondary  1.43120    0.29699   4.819 1.44e-06 ***
    Education_levelHigher    -1.53754    1.10229  -1.395  0.16306    
    Distance>=3km             1.00899    0.30764   3.280  0.00104 ** 
    ---
    Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    
    (Dispersion parameter for binomial family taken to be 1)
    
        Null deviance: 908.71  on 734  degrees of freedom
    Residual deviance: 653.88  on 727  degrees of freedom
    AIC: 669.88
    
    Number of Fisher Scoring iterations: 5
    
    > coefficients(mylog)
                 (Intercept)             Use_of_GCYes         Use_of_NSAIDsYes 
                 -2.23990663              -0.06019402               0.64122681 
        Number_of_questions1   Number_of_questions>=2 Education_levelSecondary 
                  2.57942983              -0.55084386               1.43120253 
       Education_levelHigher            Distance>=3km 
                 -1.53753800               1.00899490 
    > exp(coefficients(mylog))  #计算OR值
                 (Intercept)             Use_of_GCYes         Use_of_NSAIDsYes 
                   0.1064684                0.9415818                1.8988089 
        Number_of_questions1   Number_of_questions>=2 Education_levelSecondary 
                  13.1896157                0.5764632                4.1837272 
       Education_levelHigher            Distance>=3km 
                   0.2149096                2.7428428 
    > exp(confint(mylog))  #OR值得置信区间
    Waiting for profiling to be done...
                                  2.5 %     97.5 %
    (Intercept)              0.07211103  0.1533035
    Use_of_GCYes             0.63692511  1.3908062
    Use_of_NSAIDsYes         1.19016996  3.0275911
    Number_of_questions1     8.90991341 19.8414271
    Number_of_questions>=2   0.02846893  4.1170238
    Education_levelSecondary 2.34800900  7.5386868
    Education_levelHigher    0.01110045  1.3034572
    Distance>=3km            1.50146947  5.0269071
    

    (七)列线图的绘制

    #install.packages("rms")#加载R包
    library(rms)
    non_tumor<-read.table("input.txt",header=T,sep="\t")#读取数据
    #多因素回归因子化
    non_tumor$Use_of_GC<-factor(non_tumor$Use_of_GC,labels=c("No","Yes"))
    non_tumor$Use_of_NSAIDs<-factor(non_tumor$Use_of_NSAIDs,labels=c("No","Yes"))
    non_tumor$Number_of_questions<-factor(non_tumor$Number_of_questions,labels=c("0","1",">=2"))
    non_tumor$Education_level<-factor(non_tumor$Education_level,labels=c("Primary","Secondary","Higher"))
    non_tumor$Distance<-factor(non_tumor$Distance,labels=c("<3km",">=3km"))
    #加载差异数据
    ddist <- datadist(non_tumor)
    options(datadist="ddist")
    #多因素回归分析
    mylog<-lrm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data=non_tumor,x=T,y=T)
    #画列线图
    mynom<- nomogram(mylog, fun=plogis,fun.at=c(0.0001,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.9999),lp=F, funlabel="Risk of nonadherence")#"Risk of nonadherence",可以改名,具体论文具体分析
    #保存为PDF格式的文件
    pdf("Nom.pdf",10,8)
    plot(mynom)
    dev.off()
    
    列线图结果
    image.png

    (八)C_index

    library(rms)
    
    setwd("C:\\Users\\scikuangren\\Desktop\\logistics\\4_C-index")
    
    non_tumor<-read.table("input.txt",header=T,sep="\t")
     
    non_tumor$Use_of_GC<-factor(non_tumor$Use_of_GC,labels=c("No","Yes"))
    non_tumor$Use_of_NSAIDs<-factor(non_tumor$Use_of_NSAIDs,labels=c("No","Yes"))
    non_tumor$Number_of_questions<-factor(non_tumor$Number_of_questions,labels=c("0","1",">=2"))
    non_tumor$Education_level<-factor(non_tumor$Education_level,labels=c("Primary","Secondary","Higher"))
    non_tumor$Distance<-factor(non_tumor$Distance,labels=c("<3km",">=3km"))
    
    ddist <- datadist(non_tumor)
    options(datadist="ddist")
    
    mylog<-lrm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data=non_tumor,x=T,y=T)
    
    mylog
    
    #install.packages("Hmisc")
    library(Hmisc)
    ###这一节,主要看这里的代码,其余和之前差别不大
    Cindex <- rcorrcens(non_tumor$Status~predict(mylog))
    Cindex
    #################c-index#######
    ####0.5,模型没有任何预测能力
    ####0.5-0.7,比差的准确性
    ####0.71-0.9,中等的准确性
    ####>0.9,高准确性
    

    (九)校准 Calibration

    #install.packages("rms")
    
    library(rms)
    
    setwd("C:\\Users\\scikuangren\\Desktop\\logistics\\5_Calibration")
    
    non_tumor<-read.table("input.txt",header=T,sep="\t")
     
    non_tumor$Use_of_GC<-factor(non_tumor$Use_of_GC,labels=c("No","Yes"))
    non_tumor$Use_of_NSAIDs<-factor(non_tumor$Use_of_NSAIDs,labels=c("No","Yes"))
    non_tumor$Number_of_questions<-factor(non_tumor$Number_of_questions,labels=c("0","1",">=2"))
    non_tumor$Education_level<-factor(non_tumor$Education_level,labels=c("Primary","Secondary","Higher"))
    non_tumor$Distance<-factor(non_tumor$Distance,labels=c("No","Yes"))
    
    ddist <- datadist(non_tumor)
    options(datadist="ddist")
    
    mylog<-lrm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data=non_tumor,x=T,y=T)
    #
    mycal<-calibrate(mylog,method="boot",B=1000) #B=1000,医院的数据一般不会大于1000,seer数据库挖掘可能比较大,因为越大,电脑运行的越久。这里建议1000
    
    pdf("Calibration.pdf")
    plot(mycal,xlab="Nomogram-predicted probability of nonadherence",ylab="Actual diagnosed nonadherence (proportion)",sub=F)
    dev.off()
    

    结果如图

    校准线越贴近理想线,说明结果就越好。

    校准

    十(ROC曲线)

    ROC的解读类似于C指数,二者仍有不同

    #install.packages("ROCR")
    library(ROCR)
    library(rms)
    #读取数据
    non_tumor<-read.table("input.txt",header=T,sep="\t")
    #建模
    mylog<-lrm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data=non_tumor,x=T,y=T)
    pre_rate<-predict(mylog)
    ROC1<- prediction(pre_rate,non_tumor$Status) 
    ROC2<- performance(ROC1,"tpr","fpr")
    AUC <- performance(ROC1,"auc")
    
    AUC
    
    AUC<-0.8175882 #这里一定要改为自己的AUC
    
    pdf("ROC.pdf")
    plot(ROC2,col="blue", xlab="False positive rate",ylab="True positive rate",lty=1,lwd=3,main=paste("AUC=",AUC))
    abline(0,1,lty=2,lwd=3)
    dev.off() 
    
    auc

    (十一)决策曲线 DCA

    #为考虑病人的获益情况(有文章也没有做)
    install.packages("rmda")
    
    
    library(rms)
    library(rmda)
    
    setwd("C:\\Users\\scikuangren\\Desktop\\logistics\\7_DCA")
    
    non_tumor<-read.table("input.txt",header=T,sep="\t")
    
    modul<- decision_curve(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data= non_tumor, 
    family = binomial(link ='logit'),
    thresholds= seq(0,1, by = 0.01),
    confidence.intervals = 0.95)
    
    pdf("DCA.pdf")
    plot_decision_curve(modul,
    curve.names="Nonadherence prediction nomogram",xlab="Threshold probability",
    cost.benefit.axis =FALSE,col= "blue",
    confidence.intervals=FALSE,
    standardize = FALSE)
    dev.off()
    
    modul
    净收益     只有在范围内才是收益的
    
    
    获益区间阈值底线0.12 获益区间阈值上线0.68
    获益图

    (十二)模型验证

    12.1 C-index

    #本文利用随机抽样的方法进行验证
    library(rms)
    
    non_tumor<-read.table("input.txt",header=T,sep="\t")
     
    non_tumor$Use_of_GC<-factor(non_tumor$Use_of_GC,labels=c("No","Yes"))
    non_tumor$Use_of_NSAIDs<-factor(non_tumor$Use_of_NSAIDs,labels=c("No","Yes"))
    non_tumor$Number_of_questions<-factor(non_tumor$Number_of_questions,labels=c("0","1",">=2"))
    non_tumor$Education_level<-factor(non_tumor$Education_level,labels=c("Primary","Secondary","Higher"))
    non_tumor$Distance<-factor(non_tumor$Distance,labels=c("No","Yes"))
    
    ddist <- datadist(non_tumor)
    options(datadist="ddist")
    
    mylog<-lrm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data=non_tumor,x=T,y=T)
    #验证部分
    set.seed(300)
    myc<-validate(mylog,method="b",B = 1000,pr=T,dxy=T)
    c_index<-(myc[1,5]+1)/2
    c_index
    

    12.2

    #一部分建模(70%),一部分验证(30%)
    #如果有两/多中心数据,可以分开;否则只能内部抽样
    #之后,我们可以计算验证部分的c指数
    #install.packages("caret")
    
    library(foreign)
    library(survival)
    library(caret)
    
    #加载数据
    non_tumor<-read.table("input.txt",header=T,sep="\t")
    set.seed(300)
    non_tumord<-createDataPartition(y=non_tumor$id,p=0.70,list=F)
    non_tumordev<-non_tumor[non_tumord, ]
    non_tumorv<-non_tumor[-non_tumord,] 
    #生成两个文件,一个为建模人群,第二个为验证人群
    write.csv(non_tumordev, "non_tumordev.csv")
    write.csv(non_tumorv, "non_tumorv.csv")
    
    
    #验证
    tcga<-read.table("ver.txt",header=T,sep="\t")
    #从non_tumorv.csv数据来的,删除了第一项,保留id和其它因素
    
    #将数据转换成因子格式 
    tcga$age<-factor(tcga$age,labels=c("<50","50-59","60-69",">=70"))
    tcga$sex<-factor(tcga$sex,labels=c("FEMALE","MALE"))
    tcga$race<-factor(tcga$race,labels=c("WHITE","BLACK OR AFRICAN AMERICAN","ASIAN"))
    tcga$smoking<-factor(tcga$smoking,labels=c("1","2","3","4","5"))
    tcga$radiation<-factor(tcga$radiation,labels=c("YES","NO"))
    tcga$pharmaceutical<-factor(tcga$pharmaceutical,labels=c("YES","NO"))
    tcga$stage_T<-factor(tcga$stage_T,labels=c("T1","T1a","T1b","T1c","T2","T2a","T2b","T3","T4","TX"))
    tcga$stage_M<-factor(tcga$stage_M,labels=c("M0","M1","M1b","MX"))
    tcga$stage_N<-factor(tcga$stage_N,labels=c("N0","N1","N2","NX"))
    tcga$surgery<-factor(tcga$surgery,labels=c("0","1"))
    
    #将数据打包好
    ddist <- datadist(tcga)
    options(datadist='ddist')
    
    #构建多因素的Cox回归模型
    fmla1 <- as.formula(Surv(survival_time,status) ~ age + sex + race + smoking + radiation + pharmaceutical + stage_T + stage_N + stage_M)
    cox2 <- coxph(fmla1,data=tcga)
    summary(cox2)
    
    #c-index 0.714
    
    #画校准图
    cox1 <- cph(Surv(survival_time,status) ~ age + sex + race + smoking + radiation + pharmaceutical + stage_T + stage_N + stage_M,surv=T,x=T, y=T,time.inc = 1*365*5,data=tcga) 
    cal <- calibrate(cox1, cmethod="KM", method="boot", u=1*365*5, m=50, B=1000)
    pdf("calibrate_ver.pdf",12,8)
    par(mar = c(10,5,3,2),cex = 1.0)
    plot(cal,lwd=3,lty=2,errbar.col="black",xlim = c(0,1),ylim = c(0,1),xlab ="Nomogram Predicted Survival",ylab="Actual Survival",col="blue")
    lines(cal,c('mean.predicted',‘KM'),type = ‘a',lwd = 3,col ="black" ,pch = 16)
    mtext(“ ”)
    box(lwd = 1)
    abline(0,1,lty = 3,lwd = 3,col = "black")
    dev.off()
    

    例1 临床指标和肾动脉下载的关系

    例2 临床指标和术后并发症的关系

    例3 预测糖尿病发生概率(包括检测的有关基因表达高低)

    例4 预测发生动脉粥样硬化的风险(检测发生基因突变否)

    例5 预测发生耐药的几率的风险(基因是否发生甲基化)

    相关文章

      网友评论

        本文标题:非肿瘤疾病预测模型

        本文链接:https://www.haomeiwen.com/subject/mqiacktx.html