今天的文章是关于胶质母细胞瘤,标题是MINING TCGA DATABASE FOR GENES OF PROGNOSTIC VALUE IN GLIOBLASTOMA MICROENVIRONMENT,衔接我们上次的R包“ESTIMATE”。作者用分析出的”STROMALSCORE“和”IMMUNESCORE“对TCGA数据进行了重新分组,并进行了一系列后续的下游分析。
P值0.05界限为什么不好(纯R代码复现一篇数据挖掘文章)(这个是R包“ESTIMATE”的教程)
首先还是下载表达数据、临床信息以及突变矩阵,然后对数据进行简单的过滤。
if(!file.exists('./data/TCGA-GBM.Rdata')) {
gzfile <-"./raw_data/TCGA-GBM.AffyU133a_log2.tsv.gz"
download.file("https://tcga.xenahubs.net/download/TCGA.GBM.sampleMap/HT_HG-U133A.gz",
destfile = gzfile)
library(R.utils)
gunzip(gzfile, remove =F)
library(data.table)
raw_data <- fread("./raw_data/TCGA-GBM.AffyU133a_log2.tsv",
sep =' ', header =T)
raw_data <- as.data.frame( raw_data )
raw_data[1:5,1:6]
rownames( raw_data ) <- raw_data[,1]
raw_data <- raw_data[, -1]
raw_data[1:5,1:6]
raw_data <-2^raw_data -1
raw_data <- ceiling( raw_data )
raw_data[1:5,1:6]
pick_row <- apply( raw_data,1,function(x){
sum(x ==0) <10
})
raw_data <- raw_data[pick_row, ]
dim(raw_data )
save( raw_data, file ='./data/TCGA-GBM.Rdata')
}else{
load('./data/TCGA-GBM.Rdata')
}
# Step2 Grouping by special clinical information --------------------------
if(!file.exists('./data/TCGA-GBM_phenotype.Rdata')) {
gzfile <-"./raw_data/TCGA-GBM_phenotype.tsv.gz"
## download.file("https://gdc.xenahubs.net/download/TCGA-BRCA/Xena_Matrices/TCGA-BRCA.GDC_phenotype.tsv.gz",
## destfile = gzfile)
download.file("https://tcga.xenahubs.net/download/TCGA.GBM.sampleMap/GBM_clinicalMatrix.gz",
destfile = gzfile)
phenoData <- read.table( gzfile,
header =T,
sep =' ',
quote ='')
name <- phenoData[ ,1]
name <- gsub(pattern ="-", replacement =".", name)
phenoData <- phenoData[ , -1]
rownames( phenoData ) <- name
phenoData[1:5,1:3]
save( phenoData, file ='./data/TCGA-GBM_phenotype.Rdata')
}else{
load('./data/TCGA-GBM_phenotype.Rdata')
}
## Category 1.2: GBM: GeneExp_Subtype
## Pick columns that contains "GeneExp_Subtype"
typePheno <- phenoData[,'GeneExp_Subtype']
save(typePheno, file ='./data/gbm_geneExp_sample.Rdata')
## Category 3: mutation
if(!file.exists('./data/TCGA-GBM.snv.Rdata')) {
gzfile <-"./raw_data/TCGA-GBM.snv.tsv.gz"
## download.file("https://gdc.xenahubs.net/download/TCGA-BRCA/Xena_Matrices/TCGA-BRCA.mutect2_snv.tsv.gz",
## destfile = gzfile)
download.file("https://tcga.xenahubs.net/download/TCGA.GBM.sampleMap/mutation_broad.gz",
destfile = gzfile)
library(R.utils)
gunzip(gzfile, remove =F)
mutype_file <- read.table("./raw_data/TCGA-GBM.snv.tsv",
header =T,
sep =' ',
quote ='')
save( mutype_file, file ='./data/TCGA-GBM.snv.Rdata')
}else{
load('./data/TCGA-GBM.snv.Rdata')
}
这里根据临床信息中的,'GeneExp_Subtype'对数据进行了分组。用”estimate“R包得到的结果,根据其中的"StromalScore" ,"ImmuneScore"可以得到两个分类。并且文章还根据突变IDH进行了分类。一共有四个分类。后续我们只用到了StromalScore" ,"ImmuneScore"的分类。
# Step1 Install packages --------------------------------------------------
library(utils)
rforge <-"http://r-forge.r-project.org"
install.packages("estimate", repos = rforge, dependencies =TRUE)
# Step2 calculate tumor purity --------------------------------------------
library(estimate)
write.table(as.data.frame(raw_data),'./data/varianCancerExpr.txt',
quote =FALSE, sep =" ")
filterCommonGenes(input.f ="./data/varianCancerExpr.txt",
output.f ="./data/genes.gct", id ="GeneSymbol")
estimateScore("./data/genes.gct","./data/estimate_score.gct",
platform ="affymetrix")
## read result
estimate_score <- read.table("./data/estimate_score.gct",
sep =" ", skip =2,
header =T)
rownames(estimate_score) <- estimate_score[,1]
estimate_score <- as.data.frame(t(estimate_score[, -c(1,2)]))
estimate_score[1:5,1:4]
## type1-gene_Exp
estimate_score <- estimate_score[rownames(estimate_score) %in% names(typePheno),]
typePheno <- typePheno[names(typePheno) %in% rownames(estimate_score)]
estimate_score <- estimate_score[names(typePheno), ]
estimate_score$type <- as.character(typePheno)
estimate_score <- estimate_score[estimate_score[,5] !="", ]
## type2-WHO-IDH1
name <- mutype_file[,1]
name <- gsub(pattern ="-", replacement =".", name)
mutype_file[,1] <- name
mutation <- mutype_file[mutype_file[,7] =='IDH1',1]
typeIDH <- ifelse(rownames(estimate_score) %in% mutype_file[,1],
ifelse(rownames(estimate_score) %in% mutation,"IDH_mutat","IDH-wildtype"),
"NOS")
estimate_score$type <- as.character(typeIDH)
estimate_score <- estimate_score[estimate_score[,5] !="NOS", ]
## type3-ImmuneScore
estimate_score <- estimate_score[order(estimate_score[,2]),]
typeImmune <- c(rep('Low', floor(length(rownames(estimate_score))/2)),
rep('High', ceiling(length(rownames(estimate_score))/2)))
names(typeImmune) <- rownames(estimate_score)
## type4-StromalScore
estimate_score <- estimate_score[order(estimate_score[,1]),]
typeStromal <- c(rep('Low', floor(length(rownames(estimate_score))/2)),
rep('High', ceiling(length(rownames(estimate_score))/2)))
names(typeStromal) <- rownames(estimate_score)
ggboxplot(estimate_score, x ="type", y ="ImmuneScore",
color ="type", palette = c("#00AFBB","#E7B800","#FC4E07","#c00000"),
add ="jitter", shape ="type") + stat_compare_means(label.y =3500)
ggboxplot(estimate_score, x ="type", y ="StromalScore",
color ="type", palette = c("#00AFBB","#E7B800","#FC4E07","#c00000"),
add ="jitter", shape ="type") + stat_compare_means(label.y = -2000)
## OS
library(survival)
os.model <- phenoData[, c("OS.time","OS")]
os.model <- os.model[rownames(os.model) %in% names(typeStromal),]
os.model <- os.model[names(typeStromal),]
os.model$type <- typeStromal
fit <- survfit(Surv(OS.time, OS) ~ type, data = os.model)
library(survminer)
ggsurvplot(fit,
pval =TRUE, conf.int =TRUE,
linetype ="type",
surv.median.line ="hv",
ggtheme = theme_bw(),
palette = c("#E7B800","#2E9FDF")
)
os.model <- os.model[rownames(os.model) %in% names(typeImmune),]
os.model <- os.model[names(typeImmune),]
os.model$type <- typeImmune
fit <- survfit(Surv(OS.time, OS) ~ type, data = os.model)
library(survminer)
ggsurvplot(fit,
pval =TRUE, conf.int =TRUE,
linetype ="type",
surv.median.line ="hv",
ggtheme = theme_bw(),
palette = c("#E7B800","#2E9FDF")
)
下面是StromalScore分类得到的生存分析结果
下面是ImmuneScore分类得到的生存分析结果
draw_heatmap <-function(nrDEG, type){
library("pheatmap")
nrDEG_Z = nrDEG[ order( nrDEG$logFC ), ]
nrDEG_F = nrDEG[ order( -nrDEG$logFC ), ]
choose_gene = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:50] )
choose_matrix = AssayData[ choose_gene, ]
choose_matrix = t( scale( t( choose_matrix ) ) )
choose_matrix[choose_matrix >2] =2
choose_matrix[choose_matrix < -2] = -2
annotation_col = data.frame( CellType = factor( group_list ) )
rownames( annotation_col ) = colnames( AssayData )
filename <- paste('./fig/', type,'_heatmap_top100_logFC.png',
sep ="", collapse =NULL)
pheatmap( fontsize =6, choose_matrix, annotation_col = annotation_col,
show_rownames =T, show_colnames =F,
annotation_legend =T, cluster_cols =F,
filename = filename)
}
draw_volcano <-function(nrDEG, type){
library("ggplot2")
logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) +2* sd( abs( logFC ) ) )
nrDEG$change = as.factor( ifelse(
nrDEG$P.Value <0.01& abs(nrDEG$logFC) > logFC_cutoff,
ifelse( nrDEG$logFC > logFC_cutoff,'UP','DOWN'),'NOT') )
nrDEGfile <- paste('./data/', type,'_nrDEG_by_logFC.Rdata',
sep ="", collapse =NULL)
save( nrDEG, file = nrDEGfile )
this_tile <- paste0(
'Cutoff for logFC is ', round( logFC_cutoff,3),
'
The number of up gene is '
, nrow(nrDEG[ nrDEG$change =='UP', ] ),
'
The number of down gene is '
, nrow(nrDEG[ nrDEG$change =='DOWN', ] ) )
volcano = ggplot(data = nrDEG,
aes( x = logFC, y = -log10(P.Value), color = change)) +
geom_point( alpha =0.4, size =1.75) +
theme_set( theme_set( theme_bw( base_size =15) ) ) +
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) +
theme( plot.title = element_text( size =15, hjust =0.5)) +
scale_colour_manual( values = c('blue','black','red') )
print( volcano )
filename <- paste('./fig/', type,'_volcano_logFC.png',
sep ="", collapse =NULL)
ggsave( volcano, filename = filename )
dev.off()
}
AssayData <- raw_data
names(typeImmune) <- gsub(pattern ="\.", replacement ="-", names(typeImmune))
AssayData <- AssayData[, colnames(AssayData) %in% names(typeImmune)]
AssayData <- AssayData[, names(typeImmune)]
group_list <- typeImmune
AssayData <- raw_data
names(typeStromal) <- gsub(pattern ="\.", replacement ="-", names(typeStromal))
AssayData <- AssayData[, colnames(AssayData) %in% names(typeStromal)]
AssayData <- AssayData[, names(typeStromal)]
group_list <- typeStromal
# Step3 Lastly run voom from limma ----------------------------------------
library(limma)
## A list-based S4 class for storing read counts and associated information
## from digital gene expression or sequencing technologies.
DGElist <- DGEList( counts = AssayData, group = factor(group_list) )
## Counts per Million or Reads per Kilobase per Million
keep_gene <- rowSums( cpm(DGElist) >1) >=2
table(keep_gene)
DGElist <- DGElist[ keep_gene, , keep.lib.sizes =FALSE]
## Calculate Normalization Factors to Align Columns of a Count Matrix
DGElist <- calcNormFactors( DGElist )
DGElist$samples
design <- model.matrix( ~0+ factor(group_list) )
rownames(design) <- colnames(DGElist)
colnames(design) <- levels(factor(group_list))
## Transform RNA-Seq Data Ready for Linear Modelling
v <- voom(DGElist, design, plot =TRUE, normalize ="quantile")
## Fit linear model for each gene given a series of arrays
fit <- lmFit(v, design)
## Construct the contrast matrix corresponding to specified contrasts of a set
## of parameters.
cont.matrix <- makeContrasts(contrasts = c('Low-High'), levels = design)
## Given a linear model fit to microarray data, compute estimated coefficients
## and standard errors for a given set of contrasts.
fit2 <- contrasts.fit(fit, cont.matrix)
## Empirical Bayes Statistics for Differential Expression
fit2 <- eBayes(fit2)
nrDEG_limma_voom = topTable(fit2, coef ='Low-High', n =Inf)
nrDEG_limma_voom = na.omit(nrDEG_limma_voom)
head(nrDEG_limma_voom)
draw_heatmap(nrDEG = nrDEG_limma_voom, type ='limma_voom')
draw_volcano(nrDEG = nrDEG_limma_voom, type ='limma_voom')
# Step4 Compare ---------------------------------------------
library("VennDiagram")
nrDEG_Z = nrDEG_limma_voom_immune[ order( nrDEG_limma_voom_immune$logFC ), ]
nrDEG_F = nrDEG_limma_voom_immune[ order( -nrDEG_limma_voom_immune$logFC ), ]
choose_gene_A = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:350] )
nrDEG_Z = nrDEG_limma_voom_stromal[ order( nrDEG_limma_voom_stromal$logFC ), ]
nrDEG_F = nrDEG_limma_voom_stromal[ order( -nrDEG_limma_voom_stromal$logFC ), ]
choose_gene_B = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:350] )
## Venn Diagram
venn.plot <- venn.diagram(x = list(A = choose_gene_A, B = choose_gene_B),
filename ="DIFF.png", height =450, width =450,
resolution =300, imagetype ="png", col ="transparent",
fill = c("cornflowerblue","darkorchid1"),
alpha =0.50, cex =0.45, cat.cex =0.45)
先是stromal_Score的分组绘制的热图和火山图
之后是immnue_Score的分组绘制的热图和火山图
比较一下共有的基因情况
## function of KEGG pathway
kegg_plot <-function(type) {
kk.up <- enrichKEGG( gene = gene_up ,
organism ='hsa',
universe = gene_all ,
pvalueCutoff =0.8,
qvalueCutoff =0.8)
kk.down <- enrichKEGG( gene = gene_down ,
organism ='hsa',
universe = gene_all ,
pvalueCutoff =0.8,
qvalueCutoff =0.8)
library("ggplot2")
kegg_down_dt <- as.data.frame( kk.down )
kegg_up_dt <- as.data.frame( kk.up )
down_kegg <- kegg_down_dt[ kegg_down_dt$pvalue <0.05, ]
down_kegg$group <- -1
up_kegg <- kegg_up_dt[ kegg_up_dt$pvalue <0.05, ]
up_kegg$group <-1
dat = rbind( up_kegg, down_kegg )
dat$pvalue = -log10( dat$pvalue )
dat$pvalue = dat$pvalue * dat$group
dat = dat[ order( dat$pvalue, decreasing =F), ]
g_kegg <- ggplot( dat,
aes(x = reorder( Description, order( pvalue, decreasing =F)),
y = pvalue, fill = group)) +
geom_bar( stat ="identity") +
scale_fill_gradient( low ="blue", high ="red", guide =FALSE) +
scale_x_discrete( name ="Pathway names") +
scale_y_continuous( name ="log10P-value") +
coord_flip() + theme_bw() +
theme( plot.title = element_text( hjust =0.5),
axis.text.x = element_text(size =10),
axis.text.y = element_text(size =7)) +
ggtitle("Pathway Enrichment")
print( g_kegg )
filename <- paste('./fig/kegg_up_down_', type,'.png', sep ="", collapse =NULL)
ggsave( g_kegg, filename = filename )
}
## function of GO pathway
go_plot <-function(type) {
go_enrich_results <- lapply( g_list,function(gene) {
lapply( c('BP','MF','CC') ,function(ont) {
cat( paste('Now process', ont ) )
ego <- enrichGO( gene = gene,
universe = gene_all,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod ="BH",
pvalueCutoff =0.99,
qvalueCutoff =0.99,
readable =TRUE)
print( head( ego ) )
return( ego )
})
})
gofilename <- paste('./data/go_enrich_result', type,'.Rdata',
sep ="", collapse =NULL)
save( go_enrich_results, file = gofilename )
n1 = c('gene_up','gene_down','gene_diff')
n2 = c('BP','MF','CC')
for(iin1:3) {
for(jin1:3) {
fn = paste0('./fig/dotplot_', n1[i],'_', n2[j],'_', type,'.png')
cat( paste0( fn,'
'
) )
png( fn, res =150, width =1080)
print( dotplot( go_enrich_results[[i]][[j]] ) )
dev.off()
}
}
}
# Step1 annotation --------------------------------------------------------
library("clusterProfiler")
library("org.Hs.eg.db")
keytypes(org.Hs.eg.db)
library("stringr")
nrDEG <- nrDEG_limma_voom_immune
nrDEG <- nrDEG_limma_voom_stromal
logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) +2* sd( abs( logFC ) ) )
nrDEG$change = as.factor( ifelse(
nrDEG$P.Value <0.01& abs(nrDEG$logFC) > logFC_cutoff,
ifelse( nrDEG$logFC > logFC_cutoff,'UP','DOWN'),'NOT') )
nrDEG_limma_voom_immune <- nrDEG
nrDEG_limma_voom_stromal <- nrDEG
## tans1: ENSEMBL2ENTREZID
table( nrDEG$change )
rownames( nrDEG ) <- str_sub(rownames( nrDEG ), start =1, end =15)
nrDEG$SYMBOL <- rownames( nrDEG )
df <- bitr( rownames( nrDEG ), fromType ="SYMBOL", toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db )
head( df )
nrDEG = merge( nrDEG, df, by ='SYMBOL')
head( nrDEG )
gene_up = nrDEG[ nrDEG$change =='UP','ENTREZID']
gene_down = nrDEG[ nrDEG$change =='DOWN','ENTREZID']
gene_diff = c( gene_up, gene_down )
gene_all = as.character(nrDEG[ ,'ENTREZID'] )
g_list = list( gene_up = gene_up, gene_down = gene_down, gene_diff = gene_diff)
# Step2 pathway analysis ------------------------------------------
kegg_plot("limma_voom")
go_plot("limma_voom")
下面是”StromalScore“分类后差异分析的通路结果
下面是”immnue_Score“分类后差异分析的通路结果
原创: 生信技能树 生信菜鸟团
网友评论