美文网首页
一篇数据挖掘文章的复现-2

一篇数据挖掘文章的复现-2

作者: 小洁忘了怎么分身 | 来源:发表于2022-04-21 15:54 被阅读0次

书接上回:https://www.jianshu.com/p/c7597357a915

ER相关的基因

endoplasmic reticulum stress

gs = rio::import("GeneCards-SearchResults.csv")
k = gs$`Relevance score`>=7;table(k)
## k
## FALSE  TRUE 
##  6267   802
gs = gs[k,]

两种算法计算每个基因的p值,并取交集

4个数据需要循环,先拿其中一个数据跑通,再把代码写入循环

library(tinyarray)
load("../1_data_pre/exp_surv1.Rdata")
exp1 = exp1[rownames(exp1) %in% gs$`Gene Symbol`,]
c1 = surv_cox(exp1,surv1,continuous = T)
head(c1)
##             coef         se         z            p       HR      HRse      HRz
## UBE2J2 1.4404365 0.13446765 10.712141 0.000000e+00 4.222539 0.5677949 5.675533
## RER1   1.8780151 0.11813991 15.896534 0.000000e+00 6.540509 0.7726952 7.170369
## ICMT   1.0621919 0.11658641  9.110770 0.000000e+00 2.892705 0.3372500 5.612170
## PARK7  0.6545554 0.13911446  4.705157 2.536705e-06 1.924287 0.2676961 3.452746
## H6PD   0.8217466 0.10642461  7.721396 1.154632e-14 2.274469 0.2420595 5.265107
## FBXO6  0.9195681 0.08969749 10.251882 0.000000e+00 2.508207 0.2249799 6.703741
##                 HRp   HRCILL   HRCIUL
## UBE2J2 1.382573e-08 3.244252 5.495823
## RER1   7.479573e-13 5.188606 8.244654
## ICMT   1.998047e-08 2.301789 3.635320
## PARK7  5.549107e-04 1.465060 2.527460
## H6PD   1.401079e-07 1.846253 2.802004
## FBXO6  2.031497e-11 2.103840 2.990295
nrow(c1)
## [1] 653
k1 = surv_KM(exp1,surv1)
head(k1) #p值实在太小,逼近0
##  UBE2J2    RER1   FBXO6   DDOST   CDC42 SELENON 
##       0       0       0       0       0       0
length(k1)
## [1] 642
g1 = intersect(rownames(c1),names(k1))
head(g1)
## [1] "UBE2J2" "RER1"   "ICMT"   "PARK7"  "H6PD"   "FBXO6"
length(g1)
## [1] 615

循环

load("../1_data_pre/exp_surv2.Rdata")
load("../1_data_pre/exp_surv3.Rdata")
load("../1_data_pre/exp_surv4.Rdata")
exp = list(exp1,exp2,exp3,exp4)
surv = list(surv1,surv2,surv3,surv4)
g = list()
for(i in 1:4){
  exp[[i]] = exp[[i]][rownames(exp[[i]]) %in% gs$`Gene Symbol`,]
  c1 = surv_cox(exp[[i]],surv[[i]],continuous = T)
  k = surv_KM(exp[[i]],surv[[i]])
  g[[i]] = intersect(rownames(c1),names(k))
}
names(g) = c("TCGA","CGGA_array","CGGA","GSE16011")
sapply(g, length)
##       TCGA CGGA_array       CGGA   GSE16011 
##        615        413        551        335
v_plot = draw_venn(g,"")
ggplot2::ggsave(v_plot,filename = "venn.png")

4个数据,两种算法p<0.05的基因取交集,得到195个基因,用于lasso回归

g = intersect_all(g)
length(g)
## [1] 195

lasso回归

使用TCGA的数据构建模型

library(survival)
x = t(exp1[g,])
y = data.matrix(Surv(surv1$time,surv1$event)) 
library(glmnet)
set.seed(10210)
cvfit = cv.glmnet(x, y, family="cox") 
fit=glmnet(x, y, family = "cox") 

coef = coef(fit, s = cvfit$lambda.min) 
index = which(coef != 0) 
actCoef = coef[index] 
lassoGene = row.names(coef)[index] 
lassoGene
##  [1] "GPX7"     "MR1"      "SHISA5"   "CP"       "PPM1L"    "DNAJB11" 
##  [7] "SNCA"     "ANXA5"    "HFE"      "EPM2A"    "FKBP14"   "BET1"    
## [13] "SERPINE1" "CASP2"    "BAG1"     "SIRT1"    "DNAJB12"  "ERLIN1"  
## [19] "CYP2E1"   "CASP4"    "SLN"      "ERP27"    "DDIT3"    "ATP2A2"  
## [25] "ERP29"    "MYH7"     "CDKN3"    "MAPK3"    "MMP2"     "NOL3"    
## [31] "BRCA1"    "PRNP"     "MX1"      "RNF185"   "SREBF2"   "GLA"
par(mfrow = c(1,2))
plot(cvfit) 
plot(fit,xvar="lambda",label = F)

逐步回归法

library(My.stepwise)
vl <- lassoGene
dat = cbind(surv1,t(exp1[lassoGene,]))
# My.stepwise.coxph(Time = "time",
#                   Status = "event",
#                   variable.list = vl,
#                   data = dat)

model = coxph(formula = Surv(time, event) ~ HFE + SHISA5 + BRCA1 + EPM2A + 
    ERLIN1 + GPX7 + SLN + DNAJB11 + MMP2 + NOL3 + CP + ATP2A2 + 
    GLA + MAPK3 + SREBF2 + CASP2 + SNCA + DDIT3 + BAG1 + ANXA5, 
    data = dat)

summary(model)$concordance
##           C       se(C) 
## 0.865988496 0.009754695
genes = names(model$coefficients);length(genes)
## [1] 20
library(survminer)
ggforest(model,data = dat)
ggsave("forest.png",width = 10,height = 8)

相关文章

网友评论

      本文标题:一篇数据挖掘文章的复现-2

      本文链接:https://www.haomeiwen.com/subject/ianiertx.html