生存分析

作者: 医只蜗牛 | 来源:发表于2021-07-23 20:53 被阅读0次

https://www.bbsmax.com/A/8Bz8GPXVdx/ 有关phenotype各列的含义

TCGA的28篇教程- 对TCGA数据库的任意癌症中任意基因做生存分析

CSDN :tcga压缩包提取合并_单基因生信分析流程（1）一文解决TCGA数据下载整理问题

以上两个最终得到的是一样的结果。后面的KM分析需要进一步看看。

【TCGAbiolinks】可以用

TCGAbiolinks下载的生存分析图片

从表格可以看出比较详细。

【UCSCXenaTools】下载的比较简单。

image.png

这个下载的是官网的处理好的表格，里面的内容比较简单。
是直接处理好的。跟网上下载的不一样。

对比以上两张图看出不同。

需要注意的是，两个包下载的东西不一样，不可混淆。一个是clinical，UCSCXenaTools是下载phenotype数据再分析，且，最好官网下载，UCSCXenaTools下载的读取可能出现错误。【Rdata直接加载结果与解压后加载的不一样】

library("UCSCXenaTools")
phenotype<-XenaGenerate(subset = XenaCohorts =="GDC TCGA Kidney Clear Cell Carcinoma (KIRC)")%>% 
  XenaFilter(filterDatasets    = "TCGA-KIRC.GDC_phenotype.tsv") %>% 
  XenaQuery() %>%
  XenaDownload() %>% 
  XenaPrepare()   #加载数据

head(phenotype)
save(phenotype,file = "TCGA-KIRC.GDC_phenotype_file.tsv")
save(phenotype,file = "TCGA-KIRC.GDC_phenotype_file.Rdata")

###前面第一行读取的是手动下载的。用包下载的读取会出现错误。所以直接跳过

kirc.phenotype <- read.csv("TCGA-KIRC.GDC_phenotype.tsv", header = T, sep = "\t")
# 提取肿瘤的表型信息
#kirc.phenotype = phenotype
tumor.sample <- sapply(as.character(kirc.phenotype$submitter_id.samples), function(x){
  number <- as.numeric(unlist(strsplit(unlist(strsplit(as.character(x), split="-"))[4], split = "[A-Z]"))[1])
  if(number<=9){
    id <- as.character(x)
    return(id)
  }
})
tumor.sample.id <- unlist(tumor.sample)
tumor.kirc.phenotype <- kirc.phenotype[match(tumor.sample.id, kirc.phenotype$submitter_id.samples), ]
rownames(tumor.kirc.phenotype) <- tumor.kirc.phenotype$submitter_id.samples
# 获取唯一的肿瘤样本以及表型矩阵
uniq.sample.id <- unique(tumor.kirc.phenotype$submitter_id)
uniq.tumor.kirc.phenotype <- tumor.kirc.phenotype[match(uniq.sample.id, tumor.kirc.phenotype$submitter_id), ]
rownames(uniq.tumor.kirc.phenotype) <- uniq.tumor.kirc.phenotype$submitter_id
# 获取OS time
os.time <- rep(0, nrow(uniq.tumor.kirc.phenotype))
dead.index <- which(uniq.tumor.kirc.phenotype$days_to_death > 0)
alive.index <- which(uniq.tumor.kirc.phenotype$days_to_last_follow_up.diagnoses > 0)
os.time[dead.index] <- uniq.tumor.kirc.phenotype$days_to_death[dead.index]
os.time[alive.index] <- uniq.tumor.kirc.phenotype$days_to_last_follow_up.diagnoses[alive.index]
# 获取终点事件，并用0，1表示
os.index <- which(os.time > 0)
os.time <- os.time[os.index]
status <- rep(0, nrow(uniq.tumor.kirc.phenotype))
dead.index <- which(uniq.tumor.kirc.phenotype$vital_status.demographic == "Dead")
alive.index <- which(uniq.tumor.kirc.phenotype$vital_status.demographic == "Alive")
status[dead.index] <- 0
status[alive.index] <- 1
status <- status[os.index]
uniq.tumor.kirc.phenotype <- uniq.tumor.kirc.phenotype[os.index, ]
# 提取感兴趣的表型信息
interesting.tumor.kirc.data <- data.frame(gender = uniq.tumor.kirc.phenotype$gender.demographic,
                                          age = uniq.tumor.kirc.phenotype$age_at_initial_pathologic_diagnosis,
                                          status = status,
                                          OS = os.time)
rownames(interesting.tumor.kirc.data) <- rownames(uniq.tumor.kirc.phenotype)

tcga压缩包提取合并_单基因生信分析流程（1）一文解决TCGA数据下载整理问题
这里面为什么有clinical.info = "patient"的原因，详见：https://www.jianshu.com/p/dc253f1713d5 引用部分

--------------------------分隔符-------------------------

几个生存分析可以一起看看，估计差不多的意思。
tcga压缩包提取合并_单基因生信分析流程（1）一文解决TCGA数据下载整理问题
 TCGA的28篇教程- 对TCGA数据库的任意癌症中任意基因做生存分析
 TCGA的28篇教程-整理GDC下载的xml格式的临床资料

-------------------------分隔符--------------------------

【clinical下载比较】

官网手动下载：加入cart，download xml文件。然后读取。

# Load the packages required to read XML files.
library("XML")
library("methods")
getwd()
dir='D:/R_code/follow_practice/xuetu_GEO_follow/week_practise/02_follow/02_02_KM_/gdc_download_20210723_163311.823442/'
  
all_fiels=list.files(path = dir ,pattern='*.xml$',recursive=T)
cl = lapply(all_fiels
            , function(x){
              #x=all_fiels[1]
              result <- xmlParse(file = file.path(dir,x)) 
              rootnode <- xmlRoot(result)  
              xmldataframe <- xmlToDataFrame( rootnode[2] ) 
              return(t(xmldataframe))
            })
cl_df <- t(do.call(cbind,cl))
save(cl_df,file = 'GDC_TCGA_KIRC_clinical_df.Rdata')

【TCGAbiolinks下载】

一顿操作后

##TCGAbiolinks下载：
library(TCGAbiolinks)
library(dplyr)
library(DT)
library(SummarizedExperiment)


#下面填入要下载的癌症种类
request_cancer=c("KIRC")
for (i in request_cancer) {
  cancer_type=paste("TCGA",i,sep="-")
  print(cancer_type)
  #下载临床数据
  clinical <- GDCquery_clinic(project = cancer_type, type = "clinical")
  write.csv(clinical,file = paste(cancer_type,"clinical.csv",sep = "-"))
}

library(TCGAbiolinks)
library(SummarizedExperiment)
library(dplyr)
library(DT)
library(reshape2) 

clinical0 <- GDCquery_clinic(project = "TCGA-KIRC", type = "clinical")

## ----echo=TRUE, message=FALSE, warning=FALSE-----------------------------
datatable(clinical, filter = 'top', 
          options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),  
          rownames = FALSE)
rm(list=ls())
query <- GDCquery(project = "TCGA-KIRC", 
                  data.category = "Clinical", 
                  file.type = "xml")
GDCdownload(query)
clinical000 <- GDCprepare_clinic(query, clinical.info = "patient")

最终结果：

image.png

比较，cl_df<clinical000＜clinical=clinical0；
且几个为包含关系。

【对比phenotype和clinical】

差不多，但是命名规则略有不同。

网友评论

生信

本文标题：生存分析

本文链接：https://www.haomeiwen.com/subject/xdvpmltx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！