step1

rm(list = ls()) 
library(dplyr)
library(tidyverse)

TCGA.LUAD.GDC_phenotype <- read.delim("TCGA-LUAD.GDC_phenotype.tsv")

#colnames(TCGA.LUAD.GDC_phenotype)
#head(TCGA.LUAD.GDC_phenotype)

LUAD_Pheno<- select(TCGA.LUAD.GDC_phenotype, "submitter_id.samples", "vital_status.diagnoses", "days_to_death.diagnoses", "days_to_last_follow_up.diagnoses", "pathologic_N", "pathologic_M", "days_to_new_tumor_event_after_initial_treatment")

LUAD_Pheno<- LUAD_Pheno[grep('M0',LUAD_Pheno$pathologic_M),]  #####只筛选M0的
#table(LUAD_Pheno$pathologic_N)
LUAD_Pheno<- filter(LUAD_Pheno, pathologic_N == 'N1'| pathologic_N == 'N2')#####只筛选N1/2的


LUAD_Pheno[is.na(LUAD_Pheno)]<- 0

LUAD_Pheno$PFS_status<- ifelse((LUAD_Pheno$days_to_new_tumor_event_after_initial_treatment == 0 & LUAD_Pheno$days_to_death.diagnoses == 0), 0,1)
LUAD_Pheno$OS<- ifelse(LUAD_Pheno$days_to_last_follow_up.diagnoses > LUAD_Pheno$days_to_death.diagnoses, LUAD_Pheno$days_to_last_follow_up.diagnoses,LUAD_Pheno$days_to_death.diagnoses)
LUAD_Pheno$PFS<- ifelse(LUAD_Pheno$days_to_new_tumor_event_after_initial_treatment == 0, LUAD_Pheno$OS ,LUAD_Pheno$days_to_new_tumor_event_after_initial_treatment)

LUAD_Pheno<- LUAD_Pheno[grep('01A',LUAD_Pheno$submitter_id.samples),]  #####只筛选01A的   01A代表肿瘤

TCGA.LUAD.htseq_counts <- read.delim("TCGA-LUAD.htseq_counts.tsv")
Ensemble_ID<- TCGA.LUAD.htseq_counts[,1]
Ensemble_ID2<- as.character(Ensemble_ID)
ID<- strsplit(Ensemble_ID2, "[.]")
str(ID)
IDlast<- sapply(ID, "[", 1)
TCGA.LUAD.htseq_counts<- cbind(IDlast,TCGA.LUAD.htseq_counts[,-1])
save(TCGA.LUAD.GDC_phenotype,TCGA.LUAD.htseq_counts,LUAD_Pheno, file = 'TCGA.Rdata')

step 2

load(file = 'TCGA.Rdata')

#################
library(clusterProfiler)
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)

# 采用bitr 命令进行ID的转换

gene_ids <- bitr(TCGA.LUAD.htseq_counts$IDlast, fromType = "ENSEMBL", toType = c("SYMBOL", "GENENAME"), OrgDb="org.Hs.eg.db") 

# 查看转换的结果

# gene_ids
######################### 合并结果 merge后基因从6万个变成了2万个
library(dplyr)
TCGA.LUAD.htseq_counts<- rename(TCGA.LUAD.htseq_counts, c(IDlast="ENSEMBL"))
TCGA.LUAD.htseq_counts<- merge(gene_ids, TCGA.LUAD.htseq_counts, by = "ENSEMBL")
TCGA.LUAD.htseq_counts[1:10,1:10]

######################RNA-seq研究基因用逗号分隔###########
# gene<- c("MPO IL6 CXCR2 SNAIL SH2D1A ADGRE5 CORO1A BST2 S100A8 FCGR3B ZEB1 KREMEN1 OAS3 VCAM1 PRDM1 PTGS2 ICOS TDO2 ITGAE CDKN3 CXCR4 MKI67 GZMB")
# gene<- strsplit(gene,'\ ')
# head(gene)
# str(gene)
# gene2<- sapply(gene, "[")
# gene2<- as.character(gene2)

# gene<- c('KREMEN1')  ###这个基因没找到，其ensemble ID： ENSG00000183762.11
################TCGA.LUAD.htseq_counts[grep('ENSG00000183762', TCGA.LUAD.htseq_counts$Ensembl_ID),]  #####
################grep的用法
gene<- c("MPO","IL6",'SNAI1', "CXCR2","SH2D1A","ADGRE5","CORO1A", "BST2",  "S100A8" , "FCGR3B"  ,"ZEB1" , "OAS3" ,  "VCAM1"  , "PRDM1"  ,"PTGS2",   "ICOS" ,   "TDO2",    "ITGAE"  , "CDKN3" ,  "CXCR4",   "MKI67" ,  "GZMB")
## RNA_seq_data<-filter(TCGA.LUAD.htseq_counts, TCGA.LUAD.htseq_counts$SYMBOL == 'OAS3') #############查找一下有没有这个基因
#############################生存曲线
#######select only gene to analysis
library(survminer)
library(survival)
library(ggplot2)
for (x in gene) {
  RNA_seq_data<-filter(TCGA.LUAD.htseq_counts, TCGA.LUAD.htseq_counts$SYMBOL == x)
  RNA_seq_data<- t(RNA_seq_data)
  colnames(RNA_seq_data)
  RNA_seq_data<- as.data.frame(RNA_seq_data)
  # str(RNA_seq_data)
  # colnames(LUAD_Pheno)
  RNA_seq_data$submitter_id.samples<- row.names(RNA_seq_data)
  colnames(RNA_seq_data)<- c("Expressionvalue","submitter_id.samples")
  LUAD_Pheno$submitter_id.samples<- as.character(LUAD_Pheno$submitter_id.samples)
  LUAD_Pheno$submitter_id.samples<- sub('-', '.', LUAD_Pheno$submitter_id.samples)#############- replaced by .
  LUAD_Pheno$submitter_id.samples<- sub('-', '.', LUAD_Pheno$submitter_id.samples)#############- replaced by .
  LUAD_Pheno$submitter_id.samples<- sub('-', '.', LUAD_Pheno$submitter_id.samples)#############- replaced by .
  LUAD_Pheno$submitter_id.samples<- sub('-', '.', LUAD_Pheno$submitter_id.samples)#############- replaced by .
  finaldata<- inner_join(LUAD_Pheno,RNA_seq_data, by = "submitter_id.samples")
  finaldata$PFS_status<- as.character(finaldata$PFS_status)
  finaldata$PFS_status<- as.numeric(as.factor(finaldata$PFS_status))
  finaldata$Expressionvalue<- as.numeric(as.character(finaldata$Expressionvalue))
  finaldata$group<- ifelse(finaldata$Expressionvalue>median(finaldata$Expressionvalue),'high','low')
  library(survminer)
  library(survival)
  fit <- survfit(Surv(finaldata$PFS,finaldata$PFS_status)~finaldata$group, data=finaldata) 
  summary(fit)
  pp<- ggsurvplot(fit, data = finaldata, conf.int = F, pval = TRUE,
                  xlim = c(0,2000), # present narrower X axis, but not affect
                  # survival estimates. 
                  xlab = "Time in days", # customize X axis label. 
                  break.time.by = 200) # break X axis in time intervals by 500. 
  ggsave(filename = paste("plot_",x,".pdf",sep = ""))
  print(x)
}