美文网首页
02基于NHANES数据库的加权基线表及加权模型

02基于NHANES数据库的加权基线表及加权模型

作者: Jachin111 | 来源:发表于2023-05-22 22:26 被阅读0次

    权重背景:

    nhanes的权重由3部分组成:
    基础权重basic weight:与常规抽样数据类似,对应样本被抽中的概率。
    无应答调整 non-response adjustments
    分层后调整 post-stratification adjustments

    基础权重:
    普通的问卷调查,为了使抽样样本具有一定的人群代表性,会赋予其一定的权重,相当于样本中的各个观测(本研究为每个研究对象)在总体中的重要程度,或样本中每个观测所能代表的总体中的个体的数目。
    无应答调整:
    之所以有这个权重,其原因是nhanes每个周期的调查中,participants参与项目的完整度不一致,且一些特定项目只有部分人员参加。
    分层后调整:
    权重还进行后分层,以匹配每个抽样子域的总体控制总数。这一额外调整使加权计数与美国非机构化平民人口的独立估计相同。

    权重的计算:

    1,基于合并周期数计算
    2,基于最小子集结合研究目的选择合适权重
    本质:不管怎样挑选样本都能够代表美国整体人群

    基线表

    library(haven)
    library(Hmisc)
    library(tableone)
    library(survey)
    
    bcSvy2 <- svydesign(ids=~SDMVPSU,   #集群,抽样单元PSU
                        strata=~SDMVSTRA,   #分层
                        weights=~WEIGHT,
                        nest=TRUE,
                        survey.lonely.psu="adjust",  #抽样单元为1时不报错
                        data=out.put)
    dput(names(out.put))
    allVars <- c("RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDMARTL", "DMDEDUC2", "DMDFMSIZ", 
                 "INDFMIN2","BMXBMI", "SMQ020", "ALQ", "DIQ010", "BPQ020", "PA", "LBXBCD",
                 "LBXBPB", "LBXBSE", "LBXBMN")
    fvars <- c("RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDMARTL", "DMDEDUC2", "DMDFMSIZ", 
               "INDFMIN2","BMXBMI", "SMQ020", "ALQ", "DIQ010", "BPQ020", "PA")
    Svytab2 <- svyCreateTableOne(vars=allVars, strata="PHQ9", data=bcSvy2, factorVars=fvars)
    
    
    # 正态性检验(样本超5000)
    data <- scale(out.put$LBXBCD)
    ks.test(data,"pnorm")  #<0.05,非正态分布
    data <- scale(out.put$LBXBPB)
    ks.test(data,"pnorm")  #<0.05,非正态分布
    data <- scale(out.put$LBXBSE)
    ks.test(data,"pnorm")  #<0.05,非正态分布
    data <- scale(out.put$LBXBMN)
    ks.test(data,"pnorm")  #<0.05,非正态分布
    
    bb <- c("LBXBCD", "LBXBPB", "LBXBSE", "LBXBMN")
    tab3 <- print(Svytab2, factorVars=fvars, nonnormal=bb, quote=T, showAllLevels=T, smd=T, missing=F)
    

    logistic回归分析

    svy.fit <- svyglm(LUXSMED ~ ratio + factor(RIAGENDR) + factor(RIDRETH1) + factor(DMDFMSIZ) + factor(INDFMIN2) + factor(BMXBMI) + RIDAGEYR + factor(Smoke) + factor(ALQ111) + factor(diabetes) + LBXSATSI + LBXSASSI + LBXSGTSI + LBXSTR + LBXSCH, family="quasibinomial", design=bcSvy2)
    
    tbl_regression(svy.fit, exponentiate=TRUE) %>%
      add_global_p(anova_fun=gtsummary::tidy_wald_test) %>%
      add_glance_table(include=c(nobs, AIC, BIC))
    

    基于logistic回归限制性立方样条

    分析逻辑:survey包无法进行加权限制性立方样条分析,需要借助rms包。先利用survey包进行加权logistic回归分析,并提取出标准化权重然后使用rms包进行分析

    library(gtsummary)
    library(survey)
    library(haven)
    library(rms)
    
    svy.fit <- svyglm(PHQ9 ~ rcs(LBXBCD,4) + factor(RIAGENDR) + factor(RIDAGEYR) + factor(RIDRETH1) + factor(DMDMARTL) + factor(DMDEDUC2) + factor(DMDFMSIZ) + factor(INDFMIN2) + factor(BMXBMI) + factor(SMQ020) + factor(ALQ) + factor(DIQ010) + factor(BPQ020), family="quasibinomial", design=bcSvy2)
    
    data <- svy.fit$survey.design$variables
    ori.weight <- 1/(svy.fit$survey.design$prob)
    mean.weight <- mean(ori.weight)
    data$weights <- ori.weight/mean.weight
    
    dd <- rms::datadist(data)
    options(datadist="dd")
    
    data$PHQ9 <- factor(data$PHQ9)
    data$RIAGENDR <- factor(data$RIAGENDR)
    data$RIDAGEYR <- factor(data$RIDAGEYR)
    data$RIDRETH1 <- factor(data$RIDRETH1)
    data$DMDMARTL <- factor(data$DMDMARTL)
    data$DMDEDUC2 <- factor(data$DMDEDUC2)
    data$DMDFMSIZ <- factor(data$DMDFMSIZ)
    data$INDFMIN2 <- factor(data$INDFMIN2)
    data$BMXBMI <- factor(data$BMXBMI)
    data$SMQ020 <- factor(data$SMQ020)
    data$ALQ <- factor(data$ALQ)
    data$DIQ010 <- factor(data$DIQ010)
    data$BPQ020 <- factor(data$BPQ020)
    
    
    fit.rcs <- rms::Glm(PHQ9 ~ rcs(LBXBCD,4) + RIAGENDR + RIDAGEYR + RIDRETH1 + DMDMARTL + DMDEDUC2 + DMDFMSIZ + INDFMIN2 + BMXBMI + SMQ020 + ALQ + DIQ010 + BPQ020, data=data, family="quasibinomial", weights=weights, normwt=TRUE, control=list(eps=1e-8))
    # weights=weights,normwt=TRUE
    
    # AIC的值
    extractAIC(fit.rcs)
    # 非线性检验
    anova(fit.rcs)
    
    OR <- rms::Predict(fit.rcs, LBXBCD, type="predictions", fun=exp, ref.zero=TRUE)
    
    pdf("Mn_rcs.pdf")
    ggplot() +
      geom_line(data=OR, aes(x=LBXBCD, y=yhat), linetype="solid", size=1, alpha=0.7, color="red") +
      geom_ribbon(data=OR, aes(x=LBXBCD, ymin=lower, ymax=upper), alpha=0.1, fill="blue") +
      geom_hline(yintercept=1, linetype=2, color="grey") +
      scale_x_continuous("Cd(ug/L)") +
      scale_y_continuous("OR (95% CI)") +
      geom_text(aes(x=0.5, y=3,
                    label=paste0("P-overall = 0.0352", "\nP-non-linear = 0.0771")), hjust=0) +
      theme_bw() +
      theme(axis.line=element_line(),
            panel.grid=element_blank(),
            panel.border=element_blank())
    dev.off()
    

    logistic回归列线图(患者风险得分)

    trl <- sample(nrow(out.put), 0.7*nrow(out.put))
    train_data <- out.put[trl,]
    test_data <- out.put[-trl,]
    
    # 生成复杂抽样NHANES_design
    bcSvy2 <- svydesign(data=train_data,
                        ids=~SDMVPSU,
                        strata=~SDMVSTRA,
                        nest=TRUE,
                        weights=~WTINT2YR,
                        survey.lonely.psu="adjust")
    summary(bcSvy2)
    
    svy.fit <- svyglm(MORTSTAT ~ factor(condition) + factor(RIAGENDR) + factor(DMDEDUC2) + factor(RIDRETH1) + RIDAGEYR + INDFMPIR, family="quasibinomial", design=bcSvy2)
    
    # 生成预测变量,这里生成的是log(p),如果要p的话需要转换
    library(readr)
    library(dplyr)
    library(plyr)
    library(tidyverse)
    library(gtsummary)
    library(tidyr)
    library(survey)
    library(haven)
    library(rms)
    library(SvyNom)
    
    pr1 <- predict(svy.fit)
    pr1 <- as.numeric(pr1)
    
    # 以pr1为结局变量建立一个线性回归方程
    f <- ols(pr1 ~ condition + RIAGENDR + DMDEDUC2 + RIDRETH1 + RIDAGEYR + INDFMPIR, sigma=1, x=TRUE, y=TRUE, data=train_data)
    pr2 <- predict(f)
    
    # 列线图
    pdf("norm_logistic.pdf", width=12, height=6)
    dd <- datadist(train_data)
    options(datadist="dd")
    ss3 = c(0.05,0.2,0.4,0.6,0.7,0.8,0.9,0.95,0.99)
    nom <- nomogram(f, fun=function(x)1/(1+exp(-x)), funlabel="Risk", fun.at=ss3, lp=TRUE, vnames="labels")
    plot(nom)
    dev.off()
    
    # C-index
    Cindex <- rcorrcens(train_data$MORTSTAT ~ predict(svy.fit))
    Cindex
    
    # 可信区间,se等于sd的一半
    se <- 0.026/2
    ul <- 0.871 + 1.96*se
    ll <- 0.871 - 1.96*se
    
    
    # 校准曲线数据准备gg2函数
    gg2 <- function(data, p, y, group=1, g, leb){
      if (missing(g)){g=10}
      matres <- function(data, p, y){
        data <- data
        y=y
        sor <- order(p)
        p <- p[sor]
        y <- y[sor]
        groep <- cut2(p, g=g)
        meanpred <- round(tapply(p, groep, mean), 3)
        meanobs <- round(tapply(y, groep, mean), 3)
        msd <- round(tapply(y, groep, sd), 3)
        se <- msd/sqrt(dim(data)[1])
        matres <- as.data.frame(cbind(meanpred, meanobs, se))
        matres
      }
      dat <- matres(data, p, y)
      if (group==1) dat
      if (group!=1){
        dat <- cbind(dat, gro=leb)
      }
      dat
    }
    
    
    p <- exp(pr1)/(1+exp(pr1))          #或p <- predict(svy.fit, type=c("response"))
    
    
    # 绘制校准曲线
    pdf("calibrate_logistic.pdf", width=12, height=6)
    plot1 <- gg2(train_data, p, train_data$MORTSTAT)
    ggplot(plot1, aes(x=meanpred, y=meanobs)) + 
      geom_errorbar(aes(ymin=meanobs - 1.96*se, ymax=meanobs + 1.96*se), width=.02) +
      annotate(geom="segment", x=0, y=0, xend=1, yend=1) +
      expand_limits(x=0, y=0) +
      scale_x_continuous(expand=c(0,0)) +
      scale_y_continuous(expand=c(0,0)) +
      geom_point(size=3, shape=21, fill="white") +
      xlab("Prediction probability") +
      ylab("Actual probability")
    dev.off()
    
    
    # ROC曲线
    library(ResourceSelection)
    library(pROC)
    
    pr.roc <- predict(svy.fit, type=c("response"))
    h2 <- hoslem.test(train_data$MORTSTAT, pr.roc, g=10)
    h2  #p>0.05较好
    cbind(h2$observed, h2$expected)
    
    roccurvel <- roc(train_data$MORTSTAT ~ pr.roc)
    auc(roccurvel)
    pdf("train_ROC.pdf")
    plot.roc(roccurvel, xlim=c(1,0), ylim=c(0,1), print.auc=TRUE, print.auc.x=0.4, print.auc.y=0.2, print.thres="best")
    dev.off()
    
    
    # 在验证集上生成p和log(p)
    bc_best.pr1 <- predict(svy.fit, newdata=test_data)
    bc_best.pr1 <- as.numeric(bc_best.pr1)
    bc_best.p <- predict(svy.fit, newdata=test_data, type=c("response"))
    # 验证集上求C-index
    Cindex <- rcorrcens(test_data$MORTSTAT ~ predict(svy.fit, newdata=test_data))
    Cindex
    # 验证集校准曲线
    pdf("calibrate_logistic_test.pdf", width=12, height=6)
    plot2 <- gg2(test_data, bc_best.p, test_data$MORTSTAT)
    ggplot(plot2, aes(x=meanpred, y=meanobs)) + 
      geom_errorbar(aes(ymin=meanobs - 1.96*se, ymax=meanobs + 1.96*se), width=.02) +
      annotate(geom="segment", x=0, y=0, xend=1, yend=1) +
      expand_limits(x=0, y=0) +
      scale_x_continuous(expand=c(0,0)) +
      scale_y_continuous(expand=c(0,0)) +
      geom_point(size=3, shape=21, fill="white") +
      xlab("Prediction probability") +
      ylab("Actual probability")
    dev.off()
    # 验证集ROC
    roccurvel2 <- roc(test_data$MORTSTAT ~ bc_best.p)
    pdf("test_ROC.pdf")
    auc(roccurvel2)
    plot(roccurvel2, xlim=c(1,0), ylim=c(0,1), print.auc=TRUE, print.auc.x=0.4, print.auc.y=0.2, print.thres="best")
    dev.off()
    # 验证集列线图绘制
    f1 <- ols(bc_best.pr1 ~ condition + RIAGENDR + DMDEDUC2 + RIDRETH1 + RIDAGEYR + INDFMPIR, sigma=1, x=TRUE, y=TRUE, data=test_data)
    pr3 <- predict(f1)
    pdf("norm_logistic_test.pdf", width=12, height=6)
    dd <- datadist(test_data)
    options(datadist="dd")
    ss3 <- c(0.05,0.2,0.4,0.6,0.7,0.8,0.9,0.95,0.99)
    nom <- nomogram(f, fun=function(x)1/(1+exp(-x)), funlabel="Risk", fun.at=ss3, lp=TRUE, vnames="labels")
    plot(nom)
    dev.off()
    

    KM分析

    library(readr)
    library(dplyr)
    library(plyr)
    library(tidyverse)
    library(gtsummary)
    library(tidyr)
    library(survey)
    library(haven)
    library(survival)
    library(survminer)
    library(jskm)
    library(rms)
    library(SvyNom)
    library(timeROC)
    
    # 生成复杂抽样NHANES_design
    bcSvy2 <- svydesign(data=out.put,
                        ids=~SDMVPSU,
                        strata=~SDMVSTRA,
                        nest=TRUE,
                        weights=~WTINT2YR,
                        survey.lonely.psu="adjust")
    summary(bcSvy2)
    
    sl <- svykm(Surv(PERMTH_INT, MORTSTAT) ~ factor(condition), design=bcSvy2, se=TRUE)
    pdf("survival.pdf")
    svyjskm(sl, pval=TRUE, table=TRUE, showpercent=TRUE)
    dev.off()
    

    cox回归分析

    ml1 <- svycoxph(Surv(PERMTH_INT, MORTSTAT) ~ factor(condition) + factor(RIAGENDR) + factor(DMDEDUC2) + factor(RIDRETH1) + RIDAGEYR + INDFMPIR, x=TRUE, design=bcSvy2)
    summary(ml1)
    
    tbl_regression(ml1, exponentiate=TRUE)
    

    基于cox回归限制性立方样条

    bcSvy2 <- svydesign(data=out.put,
                        ids=~SDMVPSU,
                        strata=~SDMVSTRA,
                        nest=TRUE,
                        weights=~WTINT2YR,
                        survey.lonely.psu="adjust")
    summary(bcSvy2)
    
    
    svy.fit <- svycoxph(Surv(PERMTH_INT, MORTSTAT) ~ rcs(RIDAGEYR,4) + factor(condition) + factor(RIAGENDR) + factor(DMDEDUC2) + factor(RIDRETH1) + INDFMPIR, x=TRUE, design=bcSvy2)
    
    
    
    data <- svy.fit$survey.design$variables
    ori.weight <- 1/(svy.fit$survey.design$prob)
    mean.weight <- mean(ori.weight)
    data$weights <- ori.weight/mean.weight
    
    dd <- rms::datadist(data)
    options(datadist="dd")
    
    data$condition <- as.factor(data$condition)
    data$RIAGENDR <- as.factor(data$RIAGENDR)
    data$DMDEDUC2 <- as.factor(data$DMDEDUC2)
    data$RIDRETH1 <- as.factor(data$RIDRETH1)
    
    fit.rcs <- rms::cph(Surv(PERMTH_INT, MORTSTAT) ~ rcs(RIDAGEYR,4) + condition + RIAGENDR + DMDEDUC2 + RIDRETH1 + INDFMPIR, data=data, weights=weights, normwt=TRUE, control=list(eps=1e-8))
    
    # AIC的值
    extractAIC(fit.rcs)
    # 非线性检验
    anova(fit.rcs)
    
    HR <- rms::Predict(fit.rcs, RIDAGEYR, type="predictions", fun=exp, ref.zero=TRUE)
    
    ggplot() +
      geom_line(data=HR, aes(x=RIDAGEYR, y=yhat), linetype="solid", size=1, alpha=0.7, color="red") +
      geom_ribbon(data=HR, aes(x=RIDAGEYR, ymin=lower, ymax=upper), alpha=0.1, fill="blue") +
      geom_hline(yintercept=1, linetype=2, color="grey") +
      scale_x_continuous("Age") +
      scale_y_continuous("HR (95% CI)") +
      geom_text(aes(x=0.5, y=5,
                    label=paste0("P-overall < 0.0001", "\nP-non-linear = 0.7178")), hjust=0) +
      theme_bw() +
      theme(axis.line=element_line(),
            panel.grid=element_blank(),
            panel.border=element_blank())
    dev.off()
    
    
    HR1 <- Predict(fit.rcs, RIDAGEYR, RIAGENDR=c(1,2),
                   fun=exp, type="predictions",
                   ref.zero=TRUE, conf.int = 0.95, digits=2)
    HR1$RIAGENDR <- as.factor(HR1$RIAGENDR)
    
    ggplot() +
      geom_line(data=HR1, aes(x=RIDAGEYR, y=yhat, group=RIAGENDR, col=RIAGENDR)) +
      geom_ribbon(data=HR1, aes(x=RIDAGEYR, ymin=lower, ymax=upper, fill=RIAGENDR), alpha=0.3) +
      geom_hline(yintercept=1, linetype=2, size=1) +
      theme_classic() +
      labs(title="RCS", x="age", y="HR (95%CI)")
    
    
    ggplot()+
      geom_line(data=HR1, aes(RIDAGEYR, yhat, color=RIAGENDR),
                linetype="solid", size=1, alpha=0.7) +
      geom_ribbon(data=HR1, 
                  aes(RIDAGEYR, ymin=lower, ymax=upper, fill=RIAGENDR),
                  alpha=0.1)+
      scale_color_manual(values = c('#0070b9','#d40e8c')) +
      scale_fill_manual(values = c('#0070b9','#d40e8c')) +
      theme_classic()+
      geom_hline(yintercept=1, linetype=2,size=1)+
      labs(title = "RCS", x="Age", y="HR (95%CI)")
    

    cox回归列线图(患者风险得分)

    out.put$condition <- as.factor(out.put$condition)
    out.put$RIAGENDR <- as.factor(out.put$RIAGENDR)
    out.put$DMDEDUC2 <- as.factor(out.put$DMDEDUC2)
    out.put$RIDRETH1 <- as.factor(out.put$RIDRETH1)
    
    
    dd <- datadist(out.put)
    options(datadist="dd")
    ss <- c(0.05,0.2,0.4,0.6,0.7,0.8,0.9,0.95,0.99)
    mynom <- svycox.nomogram(.design=bcSvy2, 
                             .model=Surv(PERMTH_INT, MORTSTAT) ~ factor(condition) + factor(RIAGENDR) + factor(DMDEDUC2) + factor(RIDRETH1) + RIDAGEYR + INDFMPIR,
                             .data=out.put,  #由于传了该文件,分类变量需要提前调整为factor
                             pred.at=12*2,
                             fun.lab="Prob of 3 Yr OS")
    pdf("norm.pdf", width=12, height=6)
    plot(mynom$nomog)
    dev.off()
    
    # 校正曲线
    pdf("calibrate.pdf", width=12, height=6)
    svycox.calibrate(mynom)
    dev.off()
    
    
    bootit=200
    library(survey)
    library(rms)
    data(noNA)
    dd=datadist(noNA)
    options(datadist="dd")
    dstr2=svydesign(id=~1, strata=~group, prob=~inv_weight, fpc=~ssize, data=noNA)
    mynom=svycox.nomogram(.design=dstr2, .model=Surv(survival,surv_cens)~ECOG+liver_only+Alb+Hb+Age+
                            Differentiation+Gt_1_m1site+lymph_only, .data=noNA, pred.at=24, fun.lab="Prob of 2 Yr OS")
    
    cases=which(noNA$group=="long")
    controls=which(noNA$group=="<24")
    boot.index=matrix(NA,nrow(noNA),bootit)
    for(i in 1:bootit){
      boot.index[,i]=c(sample(cases,replace=TRUE),sample(controls,replace=TRUE))
    }
    myval=svycox.validate(boot.index,mynom,noNA)  
    

    相关文章

      网友评论

          本文标题:02基于NHANES数据库的加权基线表及加权模型

          本文链接:https://www.haomeiwen.com/subject/csczsdtx.html