美文网首页
复现一篇文章

复现一篇文章

作者: 碌碌无为的杰少 | 来源:发表于2022-02-11 12:21 被阅读0次

    前言

    此复现过程全程由kinesin老师指导,过程有点复杂,如发现问题,请及时简书联系我,我及时更改,主要复现标准化免疫细胞组成的生存分析。


    图片.png
    图片.png

    下载数据

    #加载R包
    library(survival)
    library(survminer)
    library(tidyverse)
    library(data.table)
    library(IOBR)#IOBR包建议github下载,devtools::install_local("D:/BaiduNetdiskDownload/123/IOBR-master.zip"),但是更新的包count2tpmZ这个函数出现bug,后期作者应该会修改,我是用老的c2tpm包,如需要联系我
    library(UCSCXenaTools)#install.packages('UCSCXenaTools')
    
    # 查看TCGA队列
    grep("TCGA", unique(UCSCXenaTools::XenaData$XenaCohorts), value=T)
    # 查看LUAD队列的数据集
    XenaGenerate(subset = XenaCohorts =="GDC TCGA Lung Adenocarcinoma (LUAD)")@datasets
    # 提取表达数据
    mydata <- XenaGenerate(subset = XenaCohorts == "GDC TCGA Lung Adenocarcinoma (LUAD)") %>% 
      XenaFilter(filterDatasets = "TCGA-LUAD.htseq_counts.tsv") %>% XenaQuery() %>%
      XenaDownload() %>% XenaPrepare()
    
    ### count转tpm
    # 剔除Ensembl ID的版本号
    mydata$Ensembl_ID <- substring(mydata$Ensembl_ID, 1, 15)
    mydata <- column_to_rownames(mydata, var = "Ensembl_ID")
    # TCGA的数据经过了log2(x+1)转换,需要还原回去
    mydata <- (2^mydata)-1
    
    # 转换基因名称,并将count数据转为tpm数据
    # In this process, *biomaRt* R package is utilized to acquire the gene length of each Ensembl ID 
    # and calculate the TPM of each sample. If identical gene symbols exists, these genes would be 
    # ordered by the mean expression levels. The gene symbol with highest mean expression level is 
    # selected and remove others.
    #mydata <- count2tpm(countMat = mydata, idType = "Ensembl", org="hsa")
    dd <- c2tpm(countMat = mydata, idType = "Ensembl", org="hsa")#这一步由于新的代码bug,想要旧的c2tpm代码联系我
    saveRDS(dd, file = "TCGA.LUAD.TPM.rds")
    

    免疫分析

    #IOBR包方法
    ### cibersort分析
    dd <- readRDS("TCGA.LUAD.TPM.rds")
    cibersort <- deconvo_cibersort(dd, arrays = F)
    saveRDS(cibersort, "LAUDcibersort.rds")
    #这一步我开服务器都很慢,目前我已经跑了一个多小时了,已经受不了了,可以转下面的传统方法
    #传统方法
    source("CIBERSORT/CIBERSORT.R") 
    # 主要是 read.table 的参数可以随意修改,根据你自己的表达矩阵txt文件适应性调整即可
    ## sig_matrix <- read.table("CIBERSORT/LM22.txt",header=T,sep="\t",row.names=1,check.names=F)
    write.table(cbind(rownames(dd), dd),"exprMat.txt",quote = F, sep = "\t", row.names=FALSE)
    sig_matrix <- "CIBERSORT/LM22.txt"
    mixture_file = 'exprMat.txt'
    res_cibersort <- CIBERSORT(sig_matrix, mixture_file, perm=10, QN=TRUE)
    saveRDS(res_cibersort, "LUADcibersort.rds")
    

    整理原文基因

    genesets <- read_csv("Data/genesets.csv")
    genesets <- as.list(genesets)
    genesets <- lapply(genesets, FUN = na.omit)
    

    整理生存数据

    dd <- as.data.frame(dd)
    dd =dd[,str_sub(colnames(dd),14,16)=="01A"]
    
    surv <- fread("Data/TCGA-LUAD.survival.tsv")
    
    pheno <- fread("Data/TCGA-LUAD.GDC_phenotype.tsv.gz")
    
    clin1 = merge(surv, pheno, by.x = "sample", by.y = "submitter_id.samples")#去重复测序两次的标本
    class(clin1)
    clin1 <- as.data.frame(clin1)
    rownames(clin1) <- clin1$sample
    clin=clin1[,c("sample", "age_at_initial_pathologic_diagnosis", 
                  "age_at_diagnosis.diagnoses", "gender.demographic", "tumor_stage.diagnoses",'OS','OS.time')]
    
    clin =clin[str_sub(rownames(clin),14,15)=="01",]
    
    index <- intersect(rownames(clin),colnames(dd))
    
    clin10 <- clin[index,]
    
    #clin11<-subset(clin10,clin10$OS.time != "NA")
    dd <- dd[,index]
    meta <- na.omit(clin10)
    ids <- meta$tumor_stage.diagnoses != 'not reported'
    meta <- meta[ids,]
    #去除临床信息不完全的
    names <- rownames(meta)
    dd <- dd[,names]
    

    参照原文对数据标准化

    # Z-Score 标准化
    expr.log <- log2(dd + 1)
    expr.zscore <- apply(expr.log, 1, function(x){return((x - mean(x))/sd(x))})
    expr.zscore <- t(expr.zscore)
    saveRDS(expr.zscore, "expr.zscore.rds")
    
    # 按基因集求表达量均值
    ids <- match(genesets[["Activated-Tregs"]], rownames(expr.zscore))
    expr.Treg.a <- expr.zscore[ids,]
    expr.Treg.a <- na.omit(expr.Treg.a)
    expr.Treg.a <- apply(expr.Treg.a, 2, mean)
    expr.Treg.a <- expr.Treg.a[meta$sample]
    
    ids <- match(genesets[["Suppressive-Tregs"]], rownames(expr.zscore))
    expr.Treg.s <- expr.zscore[ids,]
    expr.Treg.s <- na.omit(expr.Treg.s)
    expr.Treg.s <- apply(expr.Treg.s, 2, mean)
    expr.Treg.s <- expr.Treg.s[meta$sample]
    
    ids <- match(genesets[["CD8-C5-ZNF683"]], rownames(expr.zscore))
    expr.CD8.C5 <- expr.zscore[ids,]
    expr.CD8.C5 <- na.omit(expr.CD8.C5)
    expr.CD8.C5 <- apply(expr.CD8.C5, 2, mean)
    expr.CD8.C5 <- expr.CD8.C5[meta$sample]
    
    ids <- match(genesets[["CD8-C6-LAYN"]], rownames(expr.zscore))
    expr.CD8.C6 <- expr.zscore[ids,]
    expr.CD8.C6 <- na.omit(expr.CD8.C6)
    expr.CD8.C6 <- apply(expr.CD8.C6, 2, mean)
    expr.CD8.C6 <- expr.CD8.C6[meta$sample]
    

    表达均值乘以CIBERSORT预测的CD8+T or CD4+T

    cibersort <- as.data.frame(LUADcibersort)
    cd8 <- cibersort$`T cells CD8`                   
    cd4 <- cibersort$`T cells CD4 naive` + 
                        cibersort$`T cells CD4 memory resting` +
                        cibersort$`T cells CD4 memory activated`
    
    fractions <- structure(cd8, names=rownames(cibersort))
    CD8fractions <- fractions[meta$sample]
    
    
    fractions <- structure(cd4, names=rownames(cibersort))
    CD4fractions <- fractions[meta$sample]
    
    
    expr.Treg.a <- expr.Treg.a*CD4fractions
    expr.Treg.s <- expr.Treg.s*CD4fractions
    expr.CD8.C5 <- expr.CD8.C5*CD8fractions
    expr.CD8.C6 <- expr.CD8.C6*CD8fractions
    
    meta$Treg.a <- ifelse(expr.Treg.a > median(expr.Treg.a), "high", "low")
    meta$Treg.s <- ifelse(expr.Treg.s > median(expr.Treg.s), "high", "low")
    meta$CD8.C5 <- ifelse(expr.CD8.C5 > median(expr.CD8.C5), "high", "low")
    meta$CD8.C6 <- ifelse(expr.CD8.C6 > median(expr.CD8.C6), "high", "low")
    
    meta$C5_C6 <- paste0(meta$CD8.C5, "-", meta$CD8.C6)
    meta$Group <- recode(meta$C5_C6,
                         "high-high" = "Group1",
                         "high-low" = "Group2",
                         "low-high" = "Group3",
                         "low-low" = "Group4")
    
    save(genesets, meta, file = "survData.rda")
    

    Cox多变量生存分析

    cox.TNFRSF9 <- coxph(Surv(OS.time, OS) ~ Treg.a + age_at_initial_pathologic_diagnosis + 
                                             gender.demographic + tumor_stage.diagnoses, data=meta)
    cox.TNFRSF9
    sum <- summary(cox.TNFRSF9)
    sfit1=survfit(Surv(OS.time/30, OS)~Treg.a , data=meta)
    S1 <- ggsurvplot(sfit1,pval =T,data = meta,risk.table = TRUE,size = 1,palette = 'nejm')
    library(stringr)
    ggforest(cox.TNFRSF9, data =meta)
    
    
    ### CD8_C5-C6
    cox.C5_C6 <- coxph(Surv(OS.time, OS) ~ Group + age_at_initial_pathologic_diagnosis + 
                                           gender.demographic + tumor_stage.diagnoses, data=meta)
    summary(cox.C5_C6)
    sfit1=survfit(Surv(OS.time/30, OS)~Group , data=meta)
    S1 <- ggsurvplot(sfit1,pval =T,data = meta,risk.table = TRUE,size = 1,palette = 'nejm')
    
    library(stringr)
    ggforest(cox.C5_C6, data =meta)
    save(cox.TNFRSF9, cox.C5_C6, file = "surv_cox.rda")
    
    图片.png
    图片.png
    图片.png

    相关文章

      网友评论

          本文标题:复现一篇文章

          本文链接:https://www.haomeiwen.com/subject/jdvwkrtx.html