权重背景:
nhanes的权重由3部分组成:
基础权重basic weight:与常规抽样数据类似,对应样本被抽中的概率。
无应答调整 non-response adjustments
分层后调整 post-stratification adjustments
基础权重:
普通的问卷调查,为了使抽样样本具有一定的人群代表性,会赋予其一定的权重,相当于样本中的各个观测(本研究为每个研究对象)在总体中的重要程度,或样本中每个观测所能代表的总体中的个体的数目。
无应答调整:
之所以有这个权重,其原因是nhanes每个周期的调查中,participants参与项目的完整度不一致,且一些特定项目只有部分人员参加。
分层后调整:
权重还进行后分层,以匹配每个抽样子域的总体控制总数。这一额外调整使加权计数与美国非机构化平民人口的独立估计相同。
权重的计算:
1,基于合并周期数计算
2,基于最小子集结合研究目的选择合适权重
本质:不管怎样挑选样本都能够代表美国整体人群
基线表
library(haven)
library(Hmisc)
library(tableone)
library(survey)
bcSvy2 <- svydesign(ids=~SDMVPSU, #集群,抽样单元PSU
strata=~SDMVSTRA, #分层
weights=~WEIGHT,
nest=TRUE,
survey.lonely.psu="adjust", #抽样单元为1时不报错
data=out.put)
dput(names(out.put))
allVars <- c("RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDMARTL", "DMDEDUC2", "DMDFMSIZ",
"INDFMIN2","BMXBMI", "SMQ020", "ALQ", "DIQ010", "BPQ020", "PA", "LBXBCD",
"LBXBPB", "LBXBSE", "LBXBMN")
fvars <- c("RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDMARTL", "DMDEDUC2", "DMDFMSIZ",
"INDFMIN2","BMXBMI", "SMQ020", "ALQ", "DIQ010", "BPQ020", "PA")
Svytab2 <- svyCreateTableOne(vars=allVars, strata="PHQ9", data=bcSvy2, factorVars=fvars)
# 正态性检验(样本超5000)
data <- scale(out.put$LBXBCD)
ks.test(data,"pnorm") #<0.05,非正态分布
data <- scale(out.put$LBXBPB)
ks.test(data,"pnorm") #<0.05,非正态分布
data <- scale(out.put$LBXBSE)
ks.test(data,"pnorm") #<0.05,非正态分布
data <- scale(out.put$LBXBMN)
ks.test(data,"pnorm") #<0.05,非正态分布
bb <- c("LBXBCD", "LBXBPB", "LBXBSE", "LBXBMN")
tab3 <- print(Svytab2, factorVars=fvars, nonnormal=bb, quote=T, showAllLevels=T, smd=T, missing=F)
logistic回归分析
svy.fit <- svyglm(LUXSMED ~ ratio + factor(RIAGENDR) + factor(RIDRETH1) + factor(DMDFMSIZ) + factor(INDFMIN2) + factor(BMXBMI) + RIDAGEYR + factor(Smoke) + factor(ALQ111) + factor(diabetes) + LBXSATSI + LBXSASSI + LBXSGTSI + LBXSTR + LBXSCH, family="quasibinomial", design=bcSvy2)
tbl_regression(svy.fit, exponentiate=TRUE) %>%
add_global_p(anova_fun=gtsummary::tidy_wald_test) %>%
add_glance_table(include=c(nobs, AIC, BIC))
基于logistic回归限制性立方样条
分析逻辑:survey包无法进行加权限制性立方样条分析,需要借助rms包。先利用survey包进行加权logistic回归分析,并提取出标准化权重然后使用rms包进行分析
library(gtsummary)
library(survey)
library(haven)
library(rms)
svy.fit <- svyglm(PHQ9 ~ rcs(LBXBCD,4) + factor(RIAGENDR) + factor(RIDAGEYR) + factor(RIDRETH1) + factor(DMDMARTL) + factor(DMDEDUC2) + factor(DMDFMSIZ) + factor(INDFMIN2) + factor(BMXBMI) + factor(SMQ020) + factor(ALQ) + factor(DIQ010) + factor(BPQ020), family="quasibinomial", design=bcSvy2)
data <- svy.fit$survey.design$variables
ori.weight <- 1/(svy.fit$survey.design$prob)
mean.weight <- mean(ori.weight)
data$weights <- ori.weight/mean.weight
dd <- rms::datadist(data)
options(datadist="dd")
data$PHQ9 <- factor(data$PHQ9)
data$RIAGENDR <- factor(data$RIAGENDR)
data$RIDAGEYR <- factor(data$RIDAGEYR)
data$RIDRETH1 <- factor(data$RIDRETH1)
data$DMDMARTL <- factor(data$DMDMARTL)
data$DMDEDUC2 <- factor(data$DMDEDUC2)
data$DMDFMSIZ <- factor(data$DMDFMSIZ)
data$INDFMIN2 <- factor(data$INDFMIN2)
data$BMXBMI <- factor(data$BMXBMI)
data$SMQ020 <- factor(data$SMQ020)
data$ALQ <- factor(data$ALQ)
data$DIQ010 <- factor(data$DIQ010)
data$BPQ020 <- factor(data$BPQ020)
fit.rcs <- rms::Glm(PHQ9 ~ rcs(LBXBCD,4) + RIAGENDR + RIDAGEYR + RIDRETH1 + DMDMARTL + DMDEDUC2 + DMDFMSIZ + INDFMIN2 + BMXBMI + SMQ020 + ALQ + DIQ010 + BPQ020, data=data, family="quasibinomial", weights=weights, normwt=TRUE, control=list(eps=1e-8))
# weights=weights,normwt=TRUE
# AIC的值
extractAIC(fit.rcs)
# 非线性检验
anova(fit.rcs)
OR <- rms::Predict(fit.rcs, LBXBCD, type="predictions", fun=exp, ref.zero=TRUE)
pdf("Mn_rcs.pdf")
ggplot() +
geom_line(data=OR, aes(x=LBXBCD, y=yhat), linetype="solid", size=1, alpha=0.7, color="red") +
geom_ribbon(data=OR, aes(x=LBXBCD, ymin=lower, ymax=upper), alpha=0.1, fill="blue") +
geom_hline(yintercept=1, linetype=2, color="grey") +
scale_x_continuous("Cd(ug/L)") +
scale_y_continuous("OR (95% CI)") +
geom_text(aes(x=0.5, y=3,
label=paste0("P-overall = 0.0352", "\nP-non-linear = 0.0771")), hjust=0) +
theme_bw() +
theme(axis.line=element_line(),
panel.grid=element_blank(),
panel.border=element_blank())
dev.off()
logistic回归列线图(患者风险得分)
trl <- sample(nrow(out.put), 0.7*nrow(out.put))
train_data <- out.put[trl,]
test_data <- out.put[-trl,]
# 生成复杂抽样NHANES_design
bcSvy2 <- svydesign(data=train_data,
ids=~SDMVPSU,
strata=~SDMVSTRA,
nest=TRUE,
weights=~WTINT2YR,
survey.lonely.psu="adjust")
summary(bcSvy2)
svy.fit <- svyglm(MORTSTAT ~ factor(condition) + factor(RIAGENDR) + factor(DMDEDUC2) + factor(RIDRETH1) + RIDAGEYR + INDFMPIR, family="quasibinomial", design=bcSvy2)
# 生成预测变量,这里生成的是log(p),如果要p的话需要转换
library(readr)
library(dplyr)
library(plyr)
library(tidyverse)
library(gtsummary)
library(tidyr)
library(survey)
library(haven)
library(rms)
library(SvyNom)
pr1 <- predict(svy.fit)
pr1 <- as.numeric(pr1)
# 以pr1为结局变量建立一个线性回归方程
f <- ols(pr1 ~ condition + RIAGENDR + DMDEDUC2 + RIDRETH1 + RIDAGEYR + INDFMPIR, sigma=1, x=TRUE, y=TRUE, data=train_data)
pr2 <- predict(f)
# 列线图
pdf("norm_logistic.pdf", width=12, height=6)
dd <- datadist(train_data)
options(datadist="dd")
ss3 = c(0.05,0.2,0.4,0.6,0.7,0.8,0.9,0.95,0.99)
nom <- nomogram(f, fun=function(x)1/(1+exp(-x)), funlabel="Risk", fun.at=ss3, lp=TRUE, vnames="labels")
plot(nom)
dev.off()
# C-index
Cindex <- rcorrcens(train_data$MORTSTAT ~ predict(svy.fit))
Cindex
# 可信区间,se等于sd的一半
se <- 0.026/2
ul <- 0.871 + 1.96*se
ll <- 0.871 - 1.96*se
# 校准曲线数据准备gg2函数
gg2 <- function(data, p, y, group=1, g, leb){
if (missing(g)){g=10}
matres <- function(data, p, y){
data <- data
y=y
sor <- order(p)
p <- p[sor]
y <- y[sor]
groep <- cut2(p, g=g)
meanpred <- round(tapply(p, groep, mean), 3)
meanobs <- round(tapply(y, groep, mean), 3)
msd <- round(tapply(y, groep, sd), 3)
se <- msd/sqrt(dim(data)[1])
matres <- as.data.frame(cbind(meanpred, meanobs, se))
matres
}
dat <- matres(data, p, y)
if (group==1) dat
if (group!=1){
dat <- cbind(dat, gro=leb)
}
dat
}
p <- exp(pr1)/(1+exp(pr1)) #或p <- predict(svy.fit, type=c("response"))
# 绘制校准曲线
pdf("calibrate_logistic.pdf", width=12, height=6)
plot1 <- gg2(train_data, p, train_data$MORTSTAT)
ggplot(plot1, aes(x=meanpred, y=meanobs)) +
geom_errorbar(aes(ymin=meanobs - 1.96*se, ymax=meanobs + 1.96*se), width=.02) +
annotate(geom="segment", x=0, y=0, xend=1, yend=1) +
expand_limits(x=0, y=0) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
geom_point(size=3, shape=21, fill="white") +
xlab("Prediction probability") +
ylab("Actual probability")
dev.off()
# ROC曲线
library(ResourceSelection)
library(pROC)
pr.roc <- predict(svy.fit, type=c("response"))
h2 <- hoslem.test(train_data$MORTSTAT, pr.roc, g=10)
h2 #p>0.05较好
cbind(h2$observed, h2$expected)
roccurvel <- roc(train_data$MORTSTAT ~ pr.roc)
auc(roccurvel)
pdf("train_ROC.pdf")
plot.roc(roccurvel, xlim=c(1,0), ylim=c(0,1), print.auc=TRUE, print.auc.x=0.4, print.auc.y=0.2, print.thres="best")
dev.off()
# 在验证集上生成p和log(p)
bc_best.pr1 <- predict(svy.fit, newdata=test_data)
bc_best.pr1 <- as.numeric(bc_best.pr1)
bc_best.p <- predict(svy.fit, newdata=test_data, type=c("response"))
# 验证集上求C-index
Cindex <- rcorrcens(test_data$MORTSTAT ~ predict(svy.fit, newdata=test_data))
Cindex
# 验证集校准曲线
pdf("calibrate_logistic_test.pdf", width=12, height=6)
plot2 <- gg2(test_data, bc_best.p, test_data$MORTSTAT)
ggplot(plot2, aes(x=meanpred, y=meanobs)) +
geom_errorbar(aes(ymin=meanobs - 1.96*se, ymax=meanobs + 1.96*se), width=.02) +
annotate(geom="segment", x=0, y=0, xend=1, yend=1) +
expand_limits(x=0, y=0) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
geom_point(size=3, shape=21, fill="white") +
xlab("Prediction probability") +
ylab("Actual probability")
dev.off()
# 验证集ROC
roccurvel2 <- roc(test_data$MORTSTAT ~ bc_best.p)
pdf("test_ROC.pdf")
auc(roccurvel2)
plot(roccurvel2, xlim=c(1,0), ylim=c(0,1), print.auc=TRUE, print.auc.x=0.4, print.auc.y=0.2, print.thres="best")
dev.off()
# 验证集列线图绘制
f1 <- ols(bc_best.pr1 ~ condition + RIAGENDR + DMDEDUC2 + RIDRETH1 + RIDAGEYR + INDFMPIR, sigma=1, x=TRUE, y=TRUE, data=test_data)
pr3 <- predict(f1)
pdf("norm_logistic_test.pdf", width=12, height=6)
dd <- datadist(test_data)
options(datadist="dd")
ss3 <- c(0.05,0.2,0.4,0.6,0.7,0.8,0.9,0.95,0.99)
nom <- nomogram(f, fun=function(x)1/(1+exp(-x)), funlabel="Risk", fun.at=ss3, lp=TRUE, vnames="labels")
plot(nom)
dev.off()
KM分析
library(readr)
library(dplyr)
library(plyr)
library(tidyverse)
library(gtsummary)
library(tidyr)
library(survey)
library(haven)
library(survival)
library(survminer)
library(jskm)
library(rms)
library(SvyNom)
library(timeROC)
# 生成复杂抽样NHANES_design
bcSvy2 <- svydesign(data=out.put,
ids=~SDMVPSU,
strata=~SDMVSTRA,
nest=TRUE,
weights=~WTINT2YR,
survey.lonely.psu="adjust")
summary(bcSvy2)
sl <- svykm(Surv(PERMTH_INT, MORTSTAT) ~ factor(condition), design=bcSvy2, se=TRUE)
pdf("survival.pdf")
svyjskm(sl, pval=TRUE, table=TRUE, showpercent=TRUE)
dev.off()
cox回归分析
ml1 <- svycoxph(Surv(PERMTH_INT, MORTSTAT) ~ factor(condition) + factor(RIAGENDR) + factor(DMDEDUC2) + factor(RIDRETH1) + RIDAGEYR + INDFMPIR, x=TRUE, design=bcSvy2)
summary(ml1)
tbl_regression(ml1, exponentiate=TRUE)
基于cox回归限制性立方样条
bcSvy2 <- svydesign(data=out.put,
ids=~SDMVPSU,
strata=~SDMVSTRA,
nest=TRUE,
weights=~WTINT2YR,
survey.lonely.psu="adjust")
summary(bcSvy2)
svy.fit <- svycoxph(Surv(PERMTH_INT, MORTSTAT) ~ rcs(RIDAGEYR,4) + factor(condition) + factor(RIAGENDR) + factor(DMDEDUC2) + factor(RIDRETH1) + INDFMPIR, x=TRUE, design=bcSvy2)
data <- svy.fit$survey.design$variables
ori.weight <- 1/(svy.fit$survey.design$prob)
mean.weight <- mean(ori.weight)
data$weights <- ori.weight/mean.weight
dd <- rms::datadist(data)
options(datadist="dd")
data$condition <- as.factor(data$condition)
data$RIAGENDR <- as.factor(data$RIAGENDR)
data$DMDEDUC2 <- as.factor(data$DMDEDUC2)
data$RIDRETH1 <- as.factor(data$RIDRETH1)
fit.rcs <- rms::cph(Surv(PERMTH_INT, MORTSTAT) ~ rcs(RIDAGEYR,4) + condition + RIAGENDR + DMDEDUC2 + RIDRETH1 + INDFMPIR, data=data, weights=weights, normwt=TRUE, control=list(eps=1e-8))
# AIC的值
extractAIC(fit.rcs)
# 非线性检验
anova(fit.rcs)
HR <- rms::Predict(fit.rcs, RIDAGEYR, type="predictions", fun=exp, ref.zero=TRUE)
ggplot() +
geom_line(data=HR, aes(x=RIDAGEYR, y=yhat), linetype="solid", size=1, alpha=0.7, color="red") +
geom_ribbon(data=HR, aes(x=RIDAGEYR, ymin=lower, ymax=upper), alpha=0.1, fill="blue") +
geom_hline(yintercept=1, linetype=2, color="grey") +
scale_x_continuous("Age") +
scale_y_continuous("HR (95% CI)") +
geom_text(aes(x=0.5, y=5,
label=paste0("P-overall < 0.0001", "\nP-non-linear = 0.7178")), hjust=0) +
theme_bw() +
theme(axis.line=element_line(),
panel.grid=element_blank(),
panel.border=element_blank())
dev.off()
HR1 <- Predict(fit.rcs, RIDAGEYR, RIAGENDR=c(1,2),
fun=exp, type="predictions",
ref.zero=TRUE, conf.int = 0.95, digits=2)
HR1$RIAGENDR <- as.factor(HR1$RIAGENDR)
ggplot() +
geom_line(data=HR1, aes(x=RIDAGEYR, y=yhat, group=RIAGENDR, col=RIAGENDR)) +
geom_ribbon(data=HR1, aes(x=RIDAGEYR, ymin=lower, ymax=upper, fill=RIAGENDR), alpha=0.3) +
geom_hline(yintercept=1, linetype=2, size=1) +
theme_classic() +
labs(title="RCS", x="age", y="HR (95%CI)")
ggplot()+
geom_line(data=HR1, aes(RIDAGEYR, yhat, color=RIAGENDR),
linetype="solid", size=1, alpha=0.7) +
geom_ribbon(data=HR1,
aes(RIDAGEYR, ymin=lower, ymax=upper, fill=RIAGENDR),
alpha=0.1)+
scale_color_manual(values = c('#0070b9','#d40e8c')) +
scale_fill_manual(values = c('#0070b9','#d40e8c')) +
theme_classic()+
geom_hline(yintercept=1, linetype=2,size=1)+
labs(title = "RCS", x="Age", y="HR (95%CI)")
cox回归列线图(患者风险得分)
out.put$condition <- as.factor(out.put$condition)
out.put$RIAGENDR <- as.factor(out.put$RIAGENDR)
out.put$DMDEDUC2 <- as.factor(out.put$DMDEDUC2)
out.put$RIDRETH1 <- as.factor(out.put$RIDRETH1)
dd <- datadist(out.put)
options(datadist="dd")
ss <- c(0.05,0.2,0.4,0.6,0.7,0.8,0.9,0.95,0.99)
mynom <- svycox.nomogram(.design=bcSvy2,
.model=Surv(PERMTH_INT, MORTSTAT) ~ factor(condition) + factor(RIAGENDR) + factor(DMDEDUC2) + factor(RIDRETH1) + RIDAGEYR + INDFMPIR,
.data=out.put, #由于传了该文件,分类变量需要提前调整为factor
pred.at=12*2,
fun.lab="Prob of 3 Yr OS")
pdf("norm.pdf", width=12, height=6)
plot(mynom$nomog)
dev.off()
# 校正曲线
pdf("calibrate.pdf", width=12, height=6)
svycox.calibrate(mynom)
dev.off()
bootit=200
library(survey)
library(rms)
data(noNA)
dd=datadist(noNA)
options(datadist="dd")
dstr2=svydesign(id=~1, strata=~group, prob=~inv_weight, fpc=~ssize, data=noNA)
mynom=svycox.nomogram(.design=dstr2, .model=Surv(survival,surv_cens)~ECOG+liver_only+Alb+Hb+Age+
Differentiation+Gt_1_m1site+lymph_only, .data=noNA, pred.at=24, fun.lab="Prob of 2 Yr OS")
cases=which(noNA$group=="long")
controls=which(noNA$group=="<24")
boot.index=matrix(NA,nrow(noNA),bootit)
for(i in 1:bootit){
boot.index[,i]=c(sample(cases,replace=TRUE),sample(controls,replace=TRUE))
}
myval=svycox.validate(boot.index,mynom,noNA)
网友评论