step1
rm(list = ls())
library(dplyr)
library(tidyverse)
TCGA.LUAD.GDC_phenotype <- read.delim("TCGA-LUAD.GDC_phenotype.tsv")
#colnames(TCGA.LUAD.GDC_phenotype)
#head(TCGA.LUAD.GDC_phenotype)
LUAD_Pheno<- select(TCGA.LUAD.GDC_phenotype, "submitter_id.samples", "vital_status.diagnoses", "days_to_death.diagnoses", "days_to_last_follow_up.diagnoses", "pathologic_N", "pathologic_M", "days_to_new_tumor_event_after_initial_treatment")
LUAD_Pheno<- LUAD_Pheno[grep('M0',LUAD_Pheno$pathologic_M),] #####只筛选M0的
#table(LUAD_Pheno$pathologic_N)
LUAD_Pheno<- filter(LUAD_Pheno, pathologic_N == 'N1'| pathologic_N == 'N2')#####只筛选N1/2的
LUAD_Pheno[is.na(LUAD_Pheno)]<- 0
LUAD_Pheno$PFS_status<- ifelse((LUAD_Pheno$days_to_new_tumor_event_after_initial_treatment == 0 & LUAD_Pheno$days_to_death.diagnoses == 0), 0,1)
LUAD_Pheno$OS<- ifelse(LUAD_Pheno$days_to_last_follow_up.diagnoses > LUAD_Pheno$days_to_death.diagnoses, LUAD_Pheno$days_to_last_follow_up.diagnoses,LUAD_Pheno$days_to_death.diagnoses)
LUAD_Pheno$PFS<- ifelse(LUAD_Pheno$days_to_new_tumor_event_after_initial_treatment == 0, LUAD_Pheno$OS ,LUAD_Pheno$days_to_new_tumor_event_after_initial_treatment)
LUAD_Pheno<- LUAD_Pheno[grep('01A',LUAD_Pheno$submitter_id.samples),] #####只筛选01A的 01A代表肿瘤
TCGA.LUAD.htseq_counts <- read.delim("TCGA-LUAD.htseq_counts.tsv")
Ensemble_ID<- TCGA.LUAD.htseq_counts[,1]
Ensemble_ID2<- as.character(Ensemble_ID)
ID<- strsplit(Ensemble_ID2, "[.]")
str(ID)
IDlast<- sapply(ID, "[", 1)
TCGA.LUAD.htseq_counts<- cbind(IDlast,TCGA.LUAD.htseq_counts[,-1])
save(TCGA.LUAD.GDC_phenotype,TCGA.LUAD.htseq_counts,LUAD_Pheno, file = 'TCGA.Rdata')
step 2
load(file = 'TCGA.Rdata')
#################
library(clusterProfiler)
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)
# 采用bitr 命令进行ID的转换
gene_ids <- bitr(TCGA.LUAD.htseq_counts$IDlast, fromType = "ENSEMBL", toType = c("SYMBOL", "GENENAME"), OrgDb="org.Hs.eg.db")
# 查看转换的结果
# gene_ids
######################### 合并结果 merge后基因从6万个变成了2万个
library(dplyr)
TCGA.LUAD.htseq_counts<- rename(TCGA.LUAD.htseq_counts, c(IDlast="ENSEMBL"))
TCGA.LUAD.htseq_counts<- merge(gene_ids, TCGA.LUAD.htseq_counts, by = "ENSEMBL")
TCGA.LUAD.htseq_counts[1:10,1:10]
######################RNA-seq研究基因用逗号分隔###########
# gene<- c("MPO IL6 CXCR2 SNAIL SH2D1A ADGRE5 CORO1A BST2 S100A8 FCGR3B ZEB1 KREMEN1 OAS3 VCAM1 PRDM1 PTGS2 ICOS TDO2 ITGAE CDKN3 CXCR4 MKI67 GZMB")
# gene<- strsplit(gene,'\ ')
# head(gene)
# str(gene)
# gene2<- sapply(gene, "[")
# gene2<- as.character(gene2)
# gene<- c('KREMEN1') ###这个基因没找到,其ensemble ID: ENSG00000183762.11
################TCGA.LUAD.htseq_counts[grep('ENSG00000183762', TCGA.LUAD.htseq_counts$Ensembl_ID),] #####
################grep的用法
gene<- c("MPO","IL6",'SNAI1', "CXCR2","SH2D1A","ADGRE5","CORO1A", "BST2", "S100A8" , "FCGR3B" ,"ZEB1" , "OAS3" , "VCAM1" , "PRDM1" ,"PTGS2", "ICOS" , "TDO2", "ITGAE" , "CDKN3" , "CXCR4", "MKI67" , "GZMB")
## RNA_seq_data<-filter(TCGA.LUAD.htseq_counts, TCGA.LUAD.htseq_counts$SYMBOL == 'OAS3') #############查找一下有没有这个基因
#############################生存曲线
#######select only gene to analysis
library(survminer)
library(survival)
library(ggplot2)
for (x in gene) {
RNA_seq_data<-filter(TCGA.LUAD.htseq_counts, TCGA.LUAD.htseq_counts$SYMBOL == x)
RNA_seq_data<- t(RNA_seq_data)
colnames(RNA_seq_data)
RNA_seq_data<- as.data.frame(RNA_seq_data)
# str(RNA_seq_data)
# colnames(LUAD_Pheno)
RNA_seq_data$submitter_id.samples<- row.names(RNA_seq_data)
colnames(RNA_seq_data)<- c("Expressionvalue","submitter_id.samples")
LUAD_Pheno$submitter_id.samples<- as.character(LUAD_Pheno$submitter_id.samples)
LUAD_Pheno$submitter_id.samples<- sub('-', '.', LUAD_Pheno$submitter_id.samples)#############- replaced by .
LUAD_Pheno$submitter_id.samples<- sub('-', '.', LUAD_Pheno$submitter_id.samples)#############- replaced by .
LUAD_Pheno$submitter_id.samples<- sub('-', '.', LUAD_Pheno$submitter_id.samples)#############- replaced by .
LUAD_Pheno$submitter_id.samples<- sub('-', '.', LUAD_Pheno$submitter_id.samples)#############- replaced by .
finaldata<- inner_join(LUAD_Pheno,RNA_seq_data, by = "submitter_id.samples")
finaldata$PFS_status<- as.character(finaldata$PFS_status)
finaldata$PFS_status<- as.numeric(as.factor(finaldata$PFS_status))
finaldata$Expressionvalue<- as.numeric(as.character(finaldata$Expressionvalue))
finaldata$group<- ifelse(finaldata$Expressionvalue>median(finaldata$Expressionvalue),'high','low')
library(survminer)
library(survival)
fit <- survfit(Surv(finaldata$PFS,finaldata$PFS_status)~finaldata$group, data=finaldata)
summary(fit)
pp<- ggsurvplot(fit, data = finaldata, conf.int = F, pval = TRUE,
xlim = c(0,2000), # present narrower X axis, but not affect
# survival estimates.
xlab = "Time in days", # customize X axis label.
break.time.by = 200) # break X axis in time intervals by 500.
ggsave(filename = paste("plot_",x,".pdf",sep = ""))
print(x)
}
网友评论