TCGA数据下载
suppressMessages(library(TCGAbiolinks))
suppressMessages(library(SummarizedExperiment))
suppressMessages(library(dplyr))
suppressMessages(library(DT))
suppressMessages(library(DESeq2))
rm(list=ls())
input_dir="~/ori_data/hg38/"
out_dir="~/DEGs/hg38/"
fpkm_dir<-paste0(input_dir,"TCGA-FPKM_07082019.csv")
counts_dir<-paste0(input_dir,"LIHC_count_matrix_07082019.csv")
counts<-read.csv(counts_dir,header = T,row.names = 1,check.names = F)
fpkm<-read.csv(fpkm_dir,header = T,row.names = 1,check.names = F)
fpkmToTpm <- function(fpkm)
{
exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}
tpm <- as.data.frame (apply(fpkm , 2, fpkmToTpm))
write.csv(tpm,paste0(input_dir,"TCGA-TPM_07082019.csv"))
sample_NT <- TCGAquery_SampleTypes(barcode = colnames(counts),typesample = c("NT"))
sample_TP <- TCGAquery_SampleTypes(barcode = colnames(counts),typesample = c("TP"))#提出了重复,以及二次测序的值
new_counts<-cbind(counts[,sample_NT],counts[,sample_TP])
cpm <- as.data.frame(t(t(new_counts)/colSums(new_counts) * 1000000))
row.names(cpm) <- row.names(new_counts)
#获取CPM>1半数以上样本,只保留表达量相对较高的样本
pass <- rowSums(cpm > 1)
pass <- pass[pass > (ncol(new_counts) / 2)]
write.csv(pass,paste0(input_dir,"pass_gene.scv"))
write.csv(subset(cpm, rownames(cpm) %in% names(pass)),paste0(input_dir,"TCGA-CPM_pass_07082019.csv"))
write.csv(cpm,paste0(input_dir,"TCGA-CPM_pass_07082019.csv"))
write.csv(subset(tpm, rownames(tpm) %in% names(pass)),paste0(input_dir,"TCGA-TPM_pass_07082019.csv"))
write.csv(tpm[,as.character(sample_TP)],paste0(input_dir,"TCGA-TPM_Tumor_07082019.csv"))
write.csv(cpm[,as.character(sample_TP)],paste0(input_dir,"TCGA-CPM_Tumor_07082019.csv"))
#输出包含group information的数据以便后期作图
new_tpm<-tpm[,c(sample_NT,sample_TP)]
new_tpm["Group",]<-c(rep("Normal",length(sample_NT)),rep("Tumor",length(sample_TP)))
new_cpm<-cpm[,c(sample_NT,sample_TP)]
new_cpm["Group",]<-c(rep("Normal",length(sample_NT)),rep("Tumor",length(sample_TP)))
write.csv(new_tpm,paste0(input_dir,"TCGA-TPM_Ordered_07082019.csv"))
write.csv(new_cpm,paste0(input_dir,"TCGA-CPM_Ordered_07082019.csv"))
#DEGs analysis
new_counts<-countdata <- subset(new_counts, rownames(new_counts) %in% names(pass))
group_list<-c(rep("Normal",length(sample_NT)),rep("Tumor",length(sample_TP)))
colData <- data.frame(row.names=colnames(new_counts), group_list=group_list)
dds <- DESeqDataSetFromMatrix(countData = new_counts,
colData = colData,
design = ~ group_list)
suppressMessages(dds2 <- DESeq(dds))
res <-results(dds2, contrast=c("group_list","Tumor","Normal"))
resOrdered <- res[order(res$padj),]
resOrdered=as.data.frame(resOrdered)
write.csv(resOrdered,paste0(out_dir,"LIHC_Tumor_vs_Normal_analysis_DEGs_0708201.csv"))
网友评论