setwd("C:\\Users\\Administrator.DESKTOP-4UQ3Q0K\\Desktop\\国奖")
library("TCGAbiolinks")
length(paired)
NT <- data.frame(NT1=substr(samplesNT,1,12),NT2=samplesNT)
TP <- data.frame(TP1 =substr(samplesTP,1,12),TP2=samplesTP)
TP_NT <- merge(TP,NT,by.x = "TP1",by.y = "NT1")
head(TP_NT,3)
#1.1获取配对正常组织的barcodes:
TP <- TP_NT$TP2
#1.2获取配对肿瘤组织的barcodes:
NT <- TP_NT$NT2
#2 参照前面几期进行数据下载和数据预处理:
queryDown <- GDCquery(project = "TCGA-PRAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts",
barcode = c(TP, NT))
GDCdownload(queryDown,method = "api", directory = "GDCdata",
files.per.chunk = 10)
#2.1 将数据准备成R语言可处理的形式
dataPrep1 <- GDCprepare(query = queryDown, save = TRUE, save.filename =
"PRAD_case2.rda")
#2.2数据预处理:根据样本与样本之间的spearman相关系数去掉离群值
dataPrep2 <- TCGAanalyze_Preprocessing(object= dataPrep1,
cor.cut = 0.6,
datatype = "HTSeq - Counts")
#2.3选择肿瘤纯度大于90%的肿瘤样本
purityDATA <- TCGAtumor_purity(colnames(dataPrep1), 0, 0, 0, 0, 0.9)
# # filtered 为被过滤的数据, pure_barcodes是我们要的肿瘤数据
Purity.PRAD<-purityDATA$pure_barcodes
normal.PRAD<-purityDATA$filtered
View(normal.PRAD) #52个
View(Purity.PRAD) #34个
#获取肿瘤纯度大于90%的样本+正常组织样本,共计86个样本
puried_data <-dataPrep2[,c(Purity.PRAD,normal.PRAD)]
dim(puried_data)
#有34个肿瘤纯度大于90%的样本
#2.4基因注释
library("SummarizedExperiment")
rowData(dataPrep1)
# DataFrame with 56602 rows and 3 columns
# ensembl_gene_id external_gene_name original_ensembl_gene_id
# <character> <character> <character>
# ENSG00000000003 ENSG00000000003 TSPAN6 ENSG00000000003.13
# ENSG00000000005 ENSG00000000005 TNMD ENSG00000000005.5
rownames(puried_data)<-rowData(dataPrep1)$external_gene_name
write.csv(puried_data,file="puried.PRAD.csv",quote=FALSE)
#2.5使用EDAseq进行文库大小和GC丰度标准化
dataNorm <- TCGAanalyze_Normalization(tabDF = puried_data,
geneInfo = geneInfo,
method = "gcContent")
#2.6过滤低count的基因,并将结果输出
dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
method = "quantile",
qnt.cut = 0.25)
write.csv(dataFilt,file = "paired_TCGA_PRAD_final.csv",quote = FALSE)
TCGA_PRAD_data <- read.csv(file = "TCGA_PRAD_final.csv",
header = T,
row.names = 1,
check.names = FALSE)
#samplesNT <- TCGAquery_SampleTypes(colnames(TCGA_PRAD_data), typesample = c("NT"))
#samplesTP <- TCGAquery_SampleTypes(colnames(TCGA_PRAD_data), typesample = c("TP"))
#paired <- intersect(substr(samplesNT,1,12),substr(samplesTP,1,12))
dataFilt <- read.csv(file = "paired_TCGA_PRAD_final.csv",header = T,row.names = 1,check.names = FALSE)
samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
#设置typesample=TP,获取肿瘤组织对应的barcodes
samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
TCGA_PRAD_data <- read.csv(file = "paired_TCGA_PRAD_final.csv",header = T,row.names = 1,check.names = FALSE)
mat1 <- TCGA_PRAD_data[,1:52]
mat2 <- TCGA_PRAD_data[,53:106]
DEG.PRAD.edgeR <- TCGAanalyze_DEA(mat1 = mat1,
mat2 = mat2,
pipeline = "edgeR",
batch.factors = c("TSS"), Cond1type = "tumor",
Cond2type = "normal",
voom = FALSE, ##设置了paired时,会出错(paired = TRUE),故此处未设置
method = "glmLRT",
contrast.formula = "Mycontrast =tumor -normal",
fdr.cut = 0.01, #设置过滤参数1,保留FDR<0.01的基因
logFC.cut = 1)
write.csv(DEG.PRAD.edgeR,file = "paired_DEG_by_edgeR.csv")
dataFilt.PRAD.final<-read.csv(file = "paired_TCGA_PRAD_final.csv",header = T,row.names = 1,check.names = FALSE)
dataDEGsFilt <- DEG.PRAD.edgeR[abs(DEG.PRAD.edgeR$logFC) >= 1,]
str(dataDEGsFilt)
# 'data.frame': 1336 obs. of 5 variables:
#4.1 TCGAquery_SampleTypes()用于获取特定组织对应的barcodes,如肿瘤组织(TP)、正常组织(NT)
#设置typesample =NT,获取正常组织对应的barcodes
samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt.PRAD.final), typesample = c("NT"))
#设置typesample=TP,获取肿瘤组织对应的barcodes
samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt.PRAD.final), typesample = c("TP"))
dataTP <- dataFilt.PRAD.final[,samplesTP]
dataTN <- dataFilt.PRAD.final[,samplesNT]
dataDEGsFiltLevel <- TCGAanalyze_LevelTab(FC_FDR_table_mRNA = dataDEGsFilt,
typeCond1 = "Normal",
typeCond2 = "Tumor",
TableCond1 = dataTN,
TableCond2 = dataTP)
head(dataDEGsFiltLevel,2)
pca <- TCGAvisualize_PCA(dataFilt= dataFilt,
dataDEGsFiltLevel=dataDEGsFiltLevel,
ntopgenes = 100,
group1=samplesNT,
group2=samplesTP)
从这开始不出图了
#6.1 获取差异表达基因的表达水平
datDEGs <- dataFilt.PRAD.final[match(rownames(DEG.PRAD.edgeR),rownames(dataFilt.PRAD.final)),]
str(datDEGs)
#'data.frame': 1336 obs. of 106 variables:
#6.2 根据临床信息构造patient的metadata信息;或(和)基因的相关信息(此处不添加基因的注释信息)
#获取每一个患者barcode(barcode的前12位代表的patient信息,前16位代表的是sample信息)对应的临床信息,但是其barcodes与datDEGs_test顺序不一致
query <- GDCquery(project = "TCGA-PRAD",
data.category = "Clinical",
file.type = "xml",
barcode = substr(colnames(datDEGs),1,12))
GDCdownload(query)
clinical <- GDCprepare_clinic(query,"patient") #53个样本
##根据表达矩阵中的样本barcodes对样本临床信息匹配
datDEGs_test_barcodes <- as.data.frame(substr(colnames(datDEGs),1,12), ncol = 1 )
colnames(datDEGs_test_barcodes) <-"PRAD_patient_barcode"
m <- clinical[match(datDEGs_test_barcodes[,1], clinical[ , 1 ]),]
str(m)
#'data.frame': 106 obs. of 68 variables:
table(duplicated(m))
#FALSE TRUE
#52 54
#使用table(duplicated()查看m矩阵中是否有重复数据。
#这里的重复数据来源(肿瘤组合和癌旁正常组织来源于同一患者)
library(pheatmap)
#基本做图结果
pheatmap::pheatmap(datDEGs,scale = "row",show_rownames = F,show_colnames = F)
#增加metadata信息
col.mdat <- data.frame(Sex=m$gender,
status=m$vital_status,
group=c(rep("tumor",52),rep("normal",54)))
rownames(col.mdat) <- colnames(datDEGs)
#保证列注释信息的行名与样本名(对应列)一致
#设置图例的范围
bk <- c(seq(-1,6,by=0.01))
#绘制热图
pheatmap::pheatmap(datDEGs,scale = "row",show_rownames = F,show_colnames = F,
annotation_col = col.mdat,
border_color=NA,
main = "Heatmap by pheatmap(edgeR)",
filename = "Heatmap_by_pheatmap.pdf",
color =c(colorRampPalette(colors = c("blue","white"))(length(bk)/2),
colorRampPalette(colors = c("white","red"))(length(bk)/2)),
legend_breaks=seq(-1,6,2),
breaks=bk)
DEG.PRAD.filt<-DEG.PRAD.edgeR[which(abs(DEG.PRAD.edgeR$logFC) >= 4), ]
str(DEG.PRAD.filt)
#'data.frame': 24 obs. of 5 variables:
#共有24个基因满足|logFC|≥4
TCGAVisualize_volcano(DEG.PRAD.edgeR$logFC,
DEG.PRAD.edgeR$FDR,
filename = "TumorvsNormal_FC8.edgeR.pdf",
xlab = "logFC",
names = rownames(DEG.PRAD.edgeR),
show.names = "highlighted",
x.cut = 1,
y.cut = 0.01,
highlight = rownames(DEG.PRAD.edgeR),
highlight.color = "orange",
title = "volcano plot by edgeR")
title = "volcano plot by limma"
plot(1:5,1:5)
dev.off()
dev.new()
TCGAVisualize_volcano(x = DEG.PRAD.edgeR$logFC,
y = DEG.PRAD.edgeR$FDR,
filename = "volcanoexp.png",
x.cut = 1,
y.cut = 0.01,
names = rownames(DEG.PRAD.edgeR),
color = c("gray80","#99CC00","#FF99CC"),
names.size = 2,
xlab = " Gene expression fold change (Log2)",
legend = "State",
title = "Volcano plot",
width = 10)
volcanoexp.png
网友评论