前言
此复现过程全程由kinesin老师指导,过程有点复杂,如发现问题,请及时简书联系我,我及时更改,主要复现标准化免疫细胞组成的生存分析。
图片.png
图片.png
下载数据
#加载R包
library(survival)
library(survminer)
library(tidyverse)
library(data.table)
library(IOBR)#IOBR包建议github下载,devtools::install_local("D:/BaiduNetdiskDownload/123/IOBR-master.zip"),但是更新的包count2tpmZ这个函数出现bug,后期作者应该会修改,我是用老的c2tpm包,如需要联系我
library(UCSCXenaTools)#install.packages('UCSCXenaTools')
# 查看TCGA队列
grep("TCGA", unique(UCSCXenaTools::XenaData$XenaCohorts), value=T)
# 查看LUAD队列的数据集
XenaGenerate(subset = XenaCohorts =="GDC TCGA Lung Adenocarcinoma (LUAD)")@datasets
# 提取表达数据
mydata <- XenaGenerate(subset = XenaCohorts == "GDC TCGA Lung Adenocarcinoma (LUAD)") %>%
XenaFilter(filterDatasets = "TCGA-LUAD.htseq_counts.tsv") %>% XenaQuery() %>%
XenaDownload() %>% XenaPrepare()
### count转tpm
# 剔除Ensembl ID的版本号
mydata$Ensembl_ID <- substring(mydata$Ensembl_ID, 1, 15)
mydata <- column_to_rownames(mydata, var = "Ensembl_ID")
# TCGA的数据经过了log2(x+1)转换,需要还原回去
mydata <- (2^mydata)-1
# 转换基因名称,并将count数据转为tpm数据
# In this process, *biomaRt* R package is utilized to acquire the gene length of each Ensembl ID
# and calculate the TPM of each sample. If identical gene symbols exists, these genes would be
# ordered by the mean expression levels. The gene symbol with highest mean expression level is
# selected and remove others.
#mydata <- count2tpm(countMat = mydata, idType = "Ensembl", org="hsa")
dd <- c2tpm(countMat = mydata, idType = "Ensembl", org="hsa")#这一步由于新的代码bug,想要旧的c2tpm代码联系我
saveRDS(dd, file = "TCGA.LUAD.TPM.rds")
免疫分析
#IOBR包方法
### cibersort分析
dd <- readRDS("TCGA.LUAD.TPM.rds")
cibersort <- deconvo_cibersort(dd, arrays = F)
saveRDS(cibersort, "LAUDcibersort.rds")
#这一步我开服务器都很慢,目前我已经跑了一个多小时了,已经受不了了,可以转下面的传统方法
#传统方法
source("CIBERSORT/CIBERSORT.R")
# 主要是 read.table 的参数可以随意修改,根据你自己的表达矩阵txt文件适应性调整即可
## sig_matrix <- read.table("CIBERSORT/LM22.txt",header=T,sep="\t",row.names=1,check.names=F)
write.table(cbind(rownames(dd), dd),"exprMat.txt",quote = F, sep = "\t", row.names=FALSE)
sig_matrix <- "CIBERSORT/LM22.txt"
mixture_file = 'exprMat.txt'
res_cibersort <- CIBERSORT(sig_matrix, mixture_file, perm=10, QN=TRUE)
saveRDS(res_cibersort, "LUADcibersort.rds")
整理原文基因
genesets <- read_csv("Data/genesets.csv")
genesets <- as.list(genesets)
genesets <- lapply(genesets, FUN = na.omit)
整理生存数据
dd <- as.data.frame(dd)
dd =dd[,str_sub(colnames(dd),14,16)=="01A"]
surv <- fread("Data/TCGA-LUAD.survival.tsv")
pheno <- fread("Data/TCGA-LUAD.GDC_phenotype.tsv.gz")
clin1 = merge(surv, pheno, by.x = "sample", by.y = "submitter_id.samples")#去重复测序两次的标本
class(clin1)
clin1 <- as.data.frame(clin1)
rownames(clin1) <- clin1$sample
clin=clin1[,c("sample", "age_at_initial_pathologic_diagnosis",
"age_at_diagnosis.diagnoses", "gender.demographic", "tumor_stage.diagnoses",'OS','OS.time')]
clin =clin[str_sub(rownames(clin),14,15)=="01",]
index <- intersect(rownames(clin),colnames(dd))
clin10 <- clin[index,]
#clin11<-subset(clin10,clin10$OS.time != "NA")
dd <- dd[,index]
meta <- na.omit(clin10)
ids <- meta$tumor_stage.diagnoses != 'not reported'
meta <- meta[ids,]
#去除临床信息不完全的
names <- rownames(meta)
dd <- dd[,names]
参照原文对数据标准化
# Z-Score 标准化
expr.log <- log2(dd + 1)
expr.zscore <- apply(expr.log, 1, function(x){return((x - mean(x))/sd(x))})
expr.zscore <- t(expr.zscore)
saveRDS(expr.zscore, "expr.zscore.rds")
# 按基因集求表达量均值
ids <- match(genesets[["Activated-Tregs"]], rownames(expr.zscore))
expr.Treg.a <- expr.zscore[ids,]
expr.Treg.a <- na.omit(expr.Treg.a)
expr.Treg.a <- apply(expr.Treg.a, 2, mean)
expr.Treg.a <- expr.Treg.a[meta$sample]
ids <- match(genesets[["Suppressive-Tregs"]], rownames(expr.zscore))
expr.Treg.s <- expr.zscore[ids,]
expr.Treg.s <- na.omit(expr.Treg.s)
expr.Treg.s <- apply(expr.Treg.s, 2, mean)
expr.Treg.s <- expr.Treg.s[meta$sample]
ids <- match(genesets[["CD8-C5-ZNF683"]], rownames(expr.zscore))
expr.CD8.C5 <- expr.zscore[ids,]
expr.CD8.C5 <- na.omit(expr.CD8.C5)
expr.CD8.C5 <- apply(expr.CD8.C5, 2, mean)
expr.CD8.C5 <- expr.CD8.C5[meta$sample]
ids <- match(genesets[["CD8-C6-LAYN"]], rownames(expr.zscore))
expr.CD8.C6 <- expr.zscore[ids,]
expr.CD8.C6 <- na.omit(expr.CD8.C6)
expr.CD8.C6 <- apply(expr.CD8.C6, 2, mean)
expr.CD8.C6 <- expr.CD8.C6[meta$sample]
表达均值乘以CIBERSORT预测的CD8+T or CD4+T
cibersort <- as.data.frame(LUADcibersort)
cd8 <- cibersort$`T cells CD8`
cd4 <- cibersort$`T cells CD4 naive` +
cibersort$`T cells CD4 memory resting` +
cibersort$`T cells CD4 memory activated`
fractions <- structure(cd8, names=rownames(cibersort))
CD8fractions <- fractions[meta$sample]
fractions <- structure(cd4, names=rownames(cibersort))
CD4fractions <- fractions[meta$sample]
expr.Treg.a <- expr.Treg.a*CD4fractions
expr.Treg.s <- expr.Treg.s*CD4fractions
expr.CD8.C5 <- expr.CD8.C5*CD8fractions
expr.CD8.C6 <- expr.CD8.C6*CD8fractions
meta$Treg.a <- ifelse(expr.Treg.a > median(expr.Treg.a), "high", "low")
meta$Treg.s <- ifelse(expr.Treg.s > median(expr.Treg.s), "high", "low")
meta$CD8.C5 <- ifelse(expr.CD8.C5 > median(expr.CD8.C5), "high", "low")
meta$CD8.C6 <- ifelse(expr.CD8.C6 > median(expr.CD8.C6), "high", "low")
meta$C5_C6 <- paste0(meta$CD8.C5, "-", meta$CD8.C6)
meta$Group <- recode(meta$C5_C6,
"high-high" = "Group1",
"high-low" = "Group2",
"low-high" = "Group3",
"low-low" = "Group4")
save(genesets, meta, file = "survData.rda")
Cox多变量生存分析
cox.TNFRSF9 <- coxph(Surv(OS.time, OS) ~ Treg.a + age_at_initial_pathologic_diagnosis +
gender.demographic + tumor_stage.diagnoses, data=meta)
cox.TNFRSF9
sum <- summary(cox.TNFRSF9)
sfit1=survfit(Surv(OS.time/30, OS)~Treg.a , data=meta)
S1 <- ggsurvplot(sfit1,pval =T,data = meta,risk.table = TRUE,size = 1,palette = 'nejm')
library(stringr)
ggforest(cox.TNFRSF9, data =meta)
### CD8_C5-C6
cox.C5_C6 <- coxph(Surv(OS.time, OS) ~ Group + age_at_initial_pathologic_diagnosis +
gender.demographic + tumor_stage.diagnoses, data=meta)
summary(cox.C5_C6)
sfit1=survfit(Surv(OS.time/30, OS)~Group , data=meta)
S1 <- ggsurvplot(sfit1,pval =T,data = meta,risk.table = TRUE,size = 1,palette = 'nejm')
library(stringr)
ggforest(cox.C5_C6, data =meta)
save(cox.TNFRSF9, cox.C5_C6, file = "surv_cox.rda")
图片.png
图片.png
图片.png
网友评论