美文网首页GEO
【生信技能树】2020-01-14作业:生存分析(1)

【生信技能树】2020-01-14作业:生存分析(1)

作者: 猫叽先森 | 来源:发表于2020-01-16 00:31 被阅读0次

    题目来源:https://mp.weixin.qq.com/s/jzDD05M5AuhCkiavkoLiHg

    1. 使用TCGA数据库的数据对PTP4A3基因做生存分析

    1.1 从UCSC Xena下载TCGA BRCA的表达矩阵HiSeqV2.gz,临床信息BRCA_clinicalMatrix,生存信息BRCA_survival.txt.gz,读入R。

    rm(list=ls())
    options(stringsAsFactors = F)
    options(warn = -1)
    
    library(data.table)
    ##读入TCGA_BRCA表达信息
    exprSet <- fread("HiSeqV2.gz",header = T,sep = '\t')
    exprSet <- as.data.frame(exprSet)
    rownames(exprSet) <- exprSet[,1]
    exprSet <- exprSet[,-1]
    ## 从exprSet中提取PTP4A3的表达情况
    dat <- exprSet["PTP4A3",]
    
    ##读入TCGA_BRCA生存信息
    surdata <- read.table("BRCA_survival.txt.gz",header = T,sep = '\t')
    rownames(surdata) <- surdata$sample
    surdata <- surdata[,-1]
    
    ##读入TCGA_BRCA临床信息
    pheno <- read.table("BRCA_clinicalMatrix",header = T,sep = '\t')
    rownames(pheno) <- pheno$sampleID
    pheno <- pheno[,-1]
    

    1.2 数据清洗
    1)病人数据去重

    table(duplicated(surdata$X_PATIENT))
    #FALSE  TRUE 
    # 1097   139 
    choose_samples <- rownames(surdata[!duplicated(surdata$X_PATIENT),])
    choose_samples[1:3]
    length(choose_samples)
    surdata <- surdata[choose_samples,]
    pheno <- pheno[choose_samples,]
    dat <- dat[,(colnames(dat) %in% choose_samples)]
    
    choose_samples <- colnames(dat)
    surdata <- surdata[choose_samples,]
    pheno <- pheno[choose_samples,]
    dat <- dat[,(colnames(dat) %in% choose_samples)]
    

    2)选择Primary Tumor样本

    pheno[1:3,1:3]
    table(pheno$sample_type)
    #         Metastatic       Primary Tumor Solid Tissue Normal 
    #                 7                1086                   2
    choose_samples <- rownames(pheno[pheno$sample_type=='Primary Tumor',])
    surdata <- surdata[choose_samples,]
    pheno <- pheno[choose_samples,]
    dat <- dat[,(colnames(dat) %in% choose_samples)]
    

    1.3 生存分析
    参考:【生信技能树】TCGA的28篇教程- 对TCGA数据库的任意癌症中任意基因做生存分析

    dat_bak <- dat
    dat <- t(dat)
    dat <- as.data.frame(dat)
    dat$sampleID <- rownames(dat)
    surdata$sampleID <- rownames(surdata)
    surdata <- merge(surdata,dat,by='sampleID')
    surdata$PTP4A3 <- as.numeric(surdata$PTP4A3)
    surdata$g <- ifelse(surdata$PTP4A3 > median(surdata$PTP4A3),'high','low')
    table(surdata$g)
    #high  low 
    # 543  543 
    
    library(survival)
    table(surdata$OS)
    #  0   1 
    #939 147
    table(surdata$OS.time>0)
    #FALSE  TRUE 
    #  13  1072 
    my.surv <- Surv(surdata$OS.time,surdata$OS==1)
    kmfit2 <- survfit(my.surv~surdata$g,data = surdata) 
    library("survminer")
    ggsurvplot(kmfit2,conf.int =F, pval = T,risk.table =F, ncensor.plot = F)
    
    BRCA_PTP4A3_survplot.png
    p=0.91,结果不显著。按照文献里写的用三阴性乳腺癌样本分析。

    参考:TCGA数据库中三阴性乳腺癌在亚洲人群中的差异表达

    colnames_num_tnbc <- grep('receptor_status',colnames(pheno))
    colnames(pheno)[colnames_num_tnbc]
    eph <- pheno[,colnames_num_tnbc[1:3]]
    eph$tnbc_row_num <- apply(eph, 1, function(x) sum(x =='Negative')) ## 均为阴性的为三阴性乳腺癌
    tnbc_samples <- rownames(pheno)[eph$tnbc_row_num == 3]
    length(tnbc_samples)
    #[1] 115 ##TCGA中tnbc病人只有115个
    
    tnbc_surdata <- surdata[surdata$sampleID %in% tnbc_samples,]
    tnbc.surv <- Surv(tnbc_surdata$OS.time,tnbc_surdata$OS==1)
    tnbc.kmfit <- survfit(tnbc.surv~tnbc_surdata$g,data=tnbc_surdata)
    ggsurvplot(tnbc.kmfit,conf.int = F,pval = T)
    
    tnbc_PTP4A3_survplot.png

    p=0.47。还是不显著,参考公众号【生信技能树】文章《生存分析就是一个任人打扮的小姑凉》继续折腾。

    surv.cut <- surv_cutpoint(
      surdata,
      time = "OS.time",
      event = "OS",
      variables = "PTP4A3"
    )
    summary(surv.cut)
    plot(surv.cut, "PTP4A3", palette = "npg")
    
    cutpoint_PTP4A3.png
    surv.cat <- surv_categorize(surv.cut)
    
    surv.fit <- survfit(Surv(OS.time, OS) ~ PTP4A3,
                   data = surv.cat)
    ggsurvplot(
      surv.fit,                     # survfit object with calculated statistics.
      risk.table = TRUE,       # show risk table.
      pval = TRUE,             # show p-value of log-rank test.
      conf.int = TRUE,         # show confidence intervals for 
      # point estimaes of survival curves.
      #xlim = c(0,3000),        # present narrower X axis, but not affect
      # survival estimates.
      break.time.by = 1000,    # break X axis in time intervals by 500. 
      risk.table.y.text.col = T, # colour risk table text annotations.
      risk.table.y.text = FALSE # show bars instead of names in text annotations
      # in legend of risk table
    )
    
    surv_plot.png

    p=0.01!!!
    再来分析一遍三阴性乳腺癌样本。

    tnbc.surv.cut <- surv_cutpoint(
      tnbc_surdata,
      time = "OS.time",
      event = "OS",
      variables = "PTP4A3"
    )
    summary(tnbc.surv.cut)
    plot(tnbc.surv.cut, "PTP4A3", palette = "npg")
    
    cutpoint_tnbc_PTP4A3.png
    tnbc.surv.cat <- surv_categorize(tnbc.surv.cut)
    
    tnbc.surv.fit <- survfit(Surv(OS.time, OS) ~ PTP4A3,
                        data = tnbc.surv.cat)
    ggsurvplot(
      tnbc.surv.fit,                     # survfit object with calculated statistics.
      risk.table = TRUE,       # show risk table.
      pval = TRUE,             # show p-value of log-rank test.
      conf.int = TRUE,         # show confidence intervals for 
      # point estimaes of survival curves.
      # xlim = c(0,3000),        # present narrower X axis, but not affect
      # survival estimates.
      break.time.by = 1000,    # break X axis in time intervals by 500. 
      risk.table.y.text.col = T, # colour risk table text annotations.
      risk.table.y.text = FALSE # show bars instead of names in text annotations
      # in legend of risk table
    )
    
    tnbc_surv_plot.png
    p=0.078。大概数据量太少了吧(尬笑)

    1.4 网页工具分析TCGA BRCA中PTP4A3基因的生存分析
    1.4.1 oncoln(http://www.oncolnc.org)

    oncolnc.org.png
    1.4.2 kmplot(http://kmplot.com/analysis)
    kmplot.png
    1.4.3 GEPIA (http://gepia.cancer-pku.cn/detail.php?gene=PTP4A3)
    GEPIA.png
    1.4.4 UCSC Xena(https://xenabrowser.net/)
    UCSC_Xena.png

    相关文章

      网友评论

        本文标题:【生信技能树】2020-01-14作业:生存分析(1)

        本文链接:https://www.haomeiwen.com/subject/wmjzactx.html