5分的生信数据挖掘文章全部图表复现

作者: 7aabdaa41cae | 来源:发表于2019-04-10 17:15 被阅读14次

5分的生信数据挖掘文章全部图表复现
R实战 | TCGA数据挖掘复现--BRCA篇
一篇数据挖掘文章的图表复现-1
一篇单细胞数据挖掘文章的图表复现
你确定不留意一下这个接收纯生信的非OA期刊吗？
Nature作图也出错：单细胞UMAP/TSNE图的ggplot
SEER数据挖掘4.8分SCI发文思路
请自信表达，并秀出你的态度
只要肯用心，再简单的纯生信论文都有机会发表
GEO数据挖掘-第三期-口腔鳞状细胞癌（OSCC）

今天的文章是关于胶质母细胞瘤，标题是MINING TCGA DATABASE FOR GENES OF PROGNOSTIC VALUE IN GLIOBLASTOMA MICROENVIRONMENT，衔接我们上次的R包“ESTIMATE”。作者用分析出的”STROMALSCORE“和”IMMUNESCORE“对TCGA数据进行了重新分组，并进行了一系列后续的下游分析。

P值0.05界限为什么不好(纯R代码复现一篇数据挖掘文章)（这个是R包“ESTIMATE”的教程）

首先还是下载表达数据、临床信息以及突变矩阵，然后对数据进行简单的过滤。

if(!file.exists('./data/TCGA-GBM.Rdata')) {

gzfile <-"./raw_data/TCGA-GBM.AffyU133a_log2.tsv.gz"

download.file("https://tcga.xenahubs.net/download/TCGA.GBM.sampleMap/HT_HG-U133A.gz",

destfile = gzfile)

library(R.utils)

gunzip(gzfile, remove =F)

library(data.table)

raw_data <- fread("./raw_data/TCGA-GBM.AffyU133a_log2.tsv",

sep =' ', header =T)

raw_data <- as.data.frame( raw_data )

raw_data[1:5,1:6]

rownames( raw_data ) <- raw_data[,1]

raw_data <- raw_data[, -1]

raw_data[1:5,1:6]

raw_data <-2^raw_data -1

raw_data <- ceiling( raw_data )

raw_data[1:5,1:6]

pick_row <- apply( raw_data,1,function(x){

sum(x ==0) <10

})

raw_data <- raw_data[pick_row, ]

dim(raw_data )

save( raw_data, file ='./data/TCGA-GBM.Rdata')

}else{

load('./data/TCGA-GBM.Rdata')

}

# Step2 Grouping by special clinical information --------------------------

if(!file.exists('./data/TCGA-GBM_phenotype.Rdata')) {

gzfile <-"./raw_data/TCGA-GBM_phenotype.tsv.gz"

## download.file("https://gdc.xenahubs.net/download/TCGA-BRCA/Xena_Matrices/TCGA-BRCA.GDC_phenotype.tsv.gz",

## destfile = gzfile)

download.file("https://tcga.xenahubs.net/download/TCGA.GBM.sampleMap/GBM_clinicalMatrix.gz",

destfile = gzfile)

phenoData <- read.table( gzfile,

header =T,

sep =' ',

quote ='')

name <- phenoData[ ,1]

name <- gsub(pattern ="-", replacement =".", name)

phenoData <- phenoData[ , -1]

rownames( phenoData ) <- name

phenoData[1:5,1:3]

save( phenoData, file ='./data/TCGA-GBM_phenotype.Rdata')

}else{

load('./data/TCGA-GBM_phenotype.Rdata')

}

## Category 1.2: GBM: GeneExp_Subtype

## Pick columns that contains "GeneExp_Subtype"

typePheno <- phenoData[,'GeneExp_Subtype']

save(typePheno, file ='./data/gbm_geneExp_sample.Rdata')

## Category 3: mutation

if(!file.exists('./data/TCGA-GBM.snv.Rdata')) {

gzfile <-"./raw_data/TCGA-GBM.snv.tsv.gz"

## download.file("https://gdc.xenahubs.net/download/TCGA-BRCA/Xena_Matrices/TCGA-BRCA.mutect2_snv.tsv.gz",

## destfile = gzfile)

download.file("https://tcga.xenahubs.net/download/TCGA.GBM.sampleMap/mutation_broad.gz",

destfile = gzfile)

library(R.utils)

gunzip(gzfile, remove =F)

mutype_file <- read.table("./raw_data/TCGA-GBM.snv.tsv",

header =T,

sep =' ',

quote ='')

save( mutype_file, file ='./data/TCGA-GBM.snv.Rdata')

}else{

load('./data/TCGA-GBM.snv.Rdata')

}

这里根据临床信息中的，'GeneExp_Subtype'对数据进行了分组。用”estimate“R包得到的结果，根据其中的"StromalScore" ，"ImmuneScore"可以得到两个分类。并且文章还根据突变IDH进行了分类。一共有四个分类。后续我们只用到了StromalScore" ，"ImmuneScore"的分类。

# Step1 Install packages --------------------------------------------------

library(utils)

rforge <-"http://r-forge.r-project.org"

install.packages("estimate", repos = rforge, dependencies =TRUE)

# Step2 calculate tumor purity --------------------------------------------

library(estimate)

write.table(as.data.frame(raw_data),'./data/varianCancerExpr.txt',

quote =FALSE, sep =" ")

filterCommonGenes(input.f ="./data/varianCancerExpr.txt",

output.f ="./data/genes.gct", id ="GeneSymbol")

estimateScore("./data/genes.gct","./data/estimate_score.gct",

platform ="affymetrix")

## read result

estimate_score <- read.table("./data/estimate_score.gct",

sep =" ", skip =2,

header =T)

rownames(estimate_score) <- estimate_score[,1]

estimate_score <- as.data.frame(t(estimate_score[, -c(1,2)]))

estimate_score[1:5,1:4]

## type1-gene_Exp

estimate_score <- estimate_score[rownames(estimate_score) %in% names(typePheno),]

typePheno <- typePheno[names(typePheno) %in% rownames(estimate_score)]

estimate_score <- estimate_score[names(typePheno), ]

estimate_score$type <- as.character(typePheno)

estimate_score <- estimate_score[estimate_score[,5] !="", ]

## type2-WHO-IDH1

name <- mutype_file[,1]

name <- gsub(pattern ="-", replacement =".", name)

mutype_file[,1] <- name

mutation <- mutype_file[mutype_file[,7] =='IDH1',1]

typeIDH <- ifelse(rownames(estimate_score) %in% mutype_file[,1],

ifelse(rownames(estimate_score) %in% mutation,"IDH_mutat","IDH-wildtype"),

"NOS")

estimate_score$type <- as.character(typeIDH)

estimate_score <- estimate_score[estimate_score[,5] !="NOS", ]

## type3-ImmuneScore

estimate_score <- estimate_score[order(estimate_score[,2]),]

typeImmune <- c(rep('Low', floor(length(rownames(estimate_score))/2)),

rep('High', ceiling(length(rownames(estimate_score))/2)))

names(typeImmune) <- rownames(estimate_score)

## type4-StromalScore

estimate_score <- estimate_score[order(estimate_score[,1]),]

typeStromal <- c(rep('Low', floor(length(rownames(estimate_score))/2)),

rep('High', ceiling(length(rownames(estimate_score))/2)))

names(typeStromal) <- rownames(estimate_score)

ggboxplot(estimate_score, x ="type", y ="ImmuneScore",

color ="type", palette = c("#00AFBB","#E7B800","#FC4E07","#c00000"),

add ="jitter", shape ="type") + stat_compare_means(label.y =3500)

ggboxplot(estimate_score, x ="type", y ="StromalScore",

color ="type", palette = c("#00AFBB","#E7B800","#FC4E07","#c00000"),

add ="jitter", shape ="type") + stat_compare_means(label.y = -2000)

## OS

library(survival)

os.model <- phenoData[, c("OS.time","OS")]

os.model <- os.model[rownames(os.model) %in% names(typeStromal),]

os.model <- os.model[names(typeStromal),]

os.model$type <- typeStromal

fit <- survfit(Surv(OS.time, OS) ~ type, data = os.model)

library(survminer)

ggsurvplot(fit,

pval =TRUE, conf.int =TRUE,

linetype ="type",

surv.median.line ="hv",

ggtheme = theme_bw(),

palette = c("#E7B800","#2E9FDF")

)

os.model <- os.model[rownames(os.model) %in% names(typeImmune),]

os.model <- os.model[names(typeImmune),]

os.model$type <- typeImmune

fit <- survfit(Surv(OS.time, OS) ~ type, data = os.model)

library(survminer)

ggsurvplot(fit,

pval =TRUE, conf.int =TRUE,

linetype ="type",

surv.median.line ="hv",

ggtheme = theme_bw(),

palette = c("#E7B800","#2E9FDF")

)

下面是StromalScore分类得到的生存分析结果

下面是ImmuneScore分类得到的生存分析结果

draw_heatmap <-function(nrDEG, type){

library("pheatmap")

nrDEG_Z = nrDEG[ order( nrDEG$logFC ), ]

nrDEG_F = nrDEG[ order( -nrDEG$logFC ), ]

choose_gene = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:50] )

choose_matrix = AssayData[ choose_gene, ]

choose_matrix = t( scale( t( choose_matrix ) ) )

choose_matrix[choose_matrix >2] =2

choose_matrix[choose_matrix < -2] = -2

annotation_col = data.frame( CellType = factor( group_list ) )

rownames( annotation_col ) = colnames( AssayData )

filename <- paste('./fig/', type,'_heatmap_top100_logFC.png',

sep ="", collapse =NULL)

pheatmap( fontsize =6, choose_matrix, annotation_col = annotation_col,

show_rownames =T, show_colnames =F,

annotation_legend =T, cluster_cols =F,

filename = filename)

}

draw_volcano <-function(nrDEG, type){

library("ggplot2")

logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) +2* sd( abs( logFC ) ) )

nrDEG$change = as.factor( ifelse(

nrDEG$P.Value <0.01& abs(nrDEG$logFC) > logFC_cutoff,

ifelse( nrDEG$logFC > logFC_cutoff,'UP','DOWN'),'NOT') )

nrDEGfile <- paste('./data/', type,'_nrDEG_by_logFC.Rdata',

sep ="", collapse =NULL)

save( nrDEG, file = nrDEGfile )

this_tile <- paste0(

'Cutoff for logFC is ', round( logFC_cutoff,3),

The number of up gene is '

, nrow(nrDEG[ nrDEG$change =='UP', ] ),

The number of down gene is '

, nrow(nrDEG[ nrDEG$change =='DOWN', ] ) )

volcano = ggplot(data = nrDEG,

aes( x = logFC, y = -log10(P.Value), color = change)) +

geom_point( alpha =0.4, size =1.75) +

theme_set( theme_set( theme_bw( base_size =15) ) ) +

xlab("log2 fold change") + ylab("-log10 p-value") +

ggtitle( this_tile ) +

theme( plot.title = element_text( size =15, hjust =0.5)) +

scale_colour_manual( values = c('blue','black','red') )

print( volcano )

filename <- paste('./fig/', type,'_volcano_logFC.png',

sep ="", collapse =NULL)

ggsave( volcano, filename = filename )

dev.off()

}

AssayData <- raw_data

names(typeImmune) <- gsub(pattern ="\.", replacement ="-", names(typeImmune))

AssayData <- AssayData[, colnames(AssayData) %in% names(typeImmune)]

AssayData <- AssayData[, names(typeImmune)]

group_list <- typeImmune

AssayData <- raw_data

names(typeStromal) <- gsub(pattern ="\.", replacement ="-", names(typeStromal))

AssayData <- AssayData[, colnames(AssayData) %in% names(typeStromal)]

AssayData <- AssayData[, names(typeStromal)]

group_list <- typeStromal

# Step3 Lastly run voom from limma ----------------------------------------

library(limma)

## A list-based S4 class for storing read counts and associated information

## from digital gene expression or sequencing technologies.

DGElist <- DGEList( counts = AssayData, group = factor(group_list) )

## Counts per Million or Reads per Kilobase per Million

keep_gene <- rowSums( cpm(DGElist) >1) >=2

table(keep_gene)

DGElist <- DGElist[ keep_gene, , keep.lib.sizes =FALSE]

## Calculate Normalization Factors to Align Columns of a Count Matrix

DGElist <- calcNormFactors( DGElist )

DGElist$samples

design <- model.matrix( ~0+ factor(group_list) )

rownames(design) <- colnames(DGElist)

colnames(design) <- levels(factor(group_list))

## Transform RNA-Seq Data Ready for Linear Modelling

v <- voom(DGElist, design, plot =TRUE, normalize ="quantile")

## Fit linear model for each gene given a series of arrays

fit <- lmFit(v, design)

## Construct the contrast matrix corresponding to specified contrasts of a set

## of parameters.

cont.matrix <- makeContrasts(contrasts = c('Low-High'), levels = design)

## Given a linear model fit to microarray data, compute estimated coefficients

## and standard errors for a given set of contrasts.

fit2 <- contrasts.fit(fit, cont.matrix)

## Empirical Bayes Statistics for Differential Expression

fit2 <- eBayes(fit2)

nrDEG_limma_voom = topTable(fit2, coef ='Low-High', n =Inf)

nrDEG_limma_voom = na.omit(nrDEG_limma_voom)

head(nrDEG_limma_voom)

draw_heatmap(nrDEG = nrDEG_limma_voom, type ='limma_voom')

draw_volcano(nrDEG = nrDEG_limma_voom, type ='limma_voom')

# Step4 Compare ---------------------------------------------

library("VennDiagram")

nrDEG_Z = nrDEG_limma_voom_immune[ order( nrDEG_limma_voom_immune$logFC ), ]

nrDEG_F = nrDEG_limma_voom_immune[ order( -nrDEG_limma_voom_immune$logFC ), ]

choose_gene_A = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:350] )

nrDEG_Z = nrDEG_limma_voom_stromal[ order( nrDEG_limma_voom_stromal$logFC ), ]

nrDEG_F = nrDEG_limma_voom_stromal[ order( -nrDEG_limma_voom_stromal$logFC ), ]

choose_gene_B = c( rownames( nrDEG_Z )[1:50], rownames( nrDEG_F )[1:350] )

## Venn Diagram

venn.plot <- venn.diagram(x = list(A = choose_gene_A, B = choose_gene_B),

filename ="DIFF.png", height =450, width =450,

resolution =300, imagetype ="png", col ="transparent",

fill = c("cornflowerblue","darkorchid1"),

alpha =0.50, cex =0.45, cat.cex =0.45)

先是stromal_Score的分组绘制的热图和火山图

之后是immnue_Score的分组绘制的热图和火山图

比较一下共有的基因情况

## function of KEGG pathway

kegg_plot <-function(type) {

kk.up <- enrichKEGG( gene = gene_up ,

organism ='hsa',

universe = gene_all ,

pvalueCutoff =0.8,

qvalueCutoff =0.8)

kk.down <- enrichKEGG( gene = gene_down ,

organism ='hsa',

universe = gene_all ,

pvalueCutoff =0.8,

qvalueCutoff =0.8)

library("ggplot2")

kegg_down_dt <- as.data.frame( kk.down )

kegg_up_dt <- as.data.frame( kk.up )

down_kegg <- kegg_down_dt[ kegg_down_dt$pvalue <0.05, ]

down_kegg$group <- -1

up_kegg <- kegg_up_dt[ kegg_up_dt$pvalue <0.05, ]

up_kegg$group <-1

dat = rbind( up_kegg, down_kegg )

dat$pvalue = -log10( dat$pvalue )

dat$pvalue = dat$pvalue * dat$group

dat = dat[ order( dat$pvalue, decreasing =F), ]

g_kegg <- ggplot( dat,

aes(x = reorder( Description, order( pvalue, decreasing =F)),

y = pvalue, fill = group)) +

geom_bar( stat ="identity") +

scale_fill_gradient( low ="blue", high ="red", guide =FALSE) +

scale_x_discrete( name ="Pathway names") +

scale_y_continuous( name ="log10P-value") +

coord_flip() + theme_bw() +

theme( plot.title = element_text( hjust =0.5),

axis.text.x = element_text(size =10),

axis.text.y = element_text(size =7)) +

ggtitle("Pathway Enrichment")

print( g_kegg )

filename <- paste('./fig/kegg_up_down_', type,'.png', sep ="", collapse =NULL)

ggsave( g_kegg, filename = filename )

}

## function of GO pathway

go_plot <-function(type) {

go_enrich_results <- lapply( g_list,function(gene) {

lapply( c('BP','MF','CC') ,function(ont) {

cat( paste('Now process', ont ) )

ego <- enrichGO( gene = gene,

universe = gene_all,

OrgDb = org.Hs.eg.db,

ont = ont ,

pAdjustMethod ="BH",

pvalueCutoff =0.99,

qvalueCutoff =0.99,

readable =TRUE)

print( head( ego ) )

return( ego )

})

gofilename <- paste('./data/go_enrich_result', type,'.Rdata',

sep ="", collapse =NULL)

save( go_enrich_results, file = gofilename )

n1 = c('gene_up','gene_down','gene_diff')

n2 = c('BP','MF','CC')

for(iin1:3) {

for(jin1:3) {

fn = paste0('./fig/dotplot_', n1[i],'_', n2[j],'_', type,'.png')

cat( paste0( fn,'

) )

png( fn, res =150, width =1080)

print( dotplot( go_enrich_results[[i]][[j]] ) )

dev.off()

}

# Step1 annotation --------------------------------------------------------

library("clusterProfiler")

library("org.Hs.eg.db")

keytypes(org.Hs.eg.db)

library("stringr")

nrDEG <- nrDEG_limma_voom_immune

nrDEG <- nrDEG_limma_voom_stromal

logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) +2* sd( abs( logFC ) ) )

nrDEG$change = as.factor( ifelse(

nrDEG$P.Value <0.01& abs(nrDEG$logFC) > logFC_cutoff,

ifelse( nrDEG$logFC > logFC_cutoff,'UP','DOWN'),'NOT') )

nrDEG_limma_voom_immune <- nrDEG

nrDEG_limma_voom_stromal <- nrDEG

## tans1: ENSEMBL2ENTREZID

table( nrDEG$change )

rownames( nrDEG ) <- str_sub(rownames( nrDEG ), start =1, end =15)

nrDEG$SYMBOL <- rownames( nrDEG )

df <- bitr( rownames( nrDEG ), fromType ="SYMBOL", toType = c("ENTREZID"),

OrgDb = org.Hs.eg.db )

head( df )

nrDEG = merge( nrDEG, df, by ='SYMBOL')

head( nrDEG )

gene_up = nrDEG[ nrDEG$change =='UP','ENTREZID']

gene_down = nrDEG[ nrDEG$change =='DOWN','ENTREZID']

gene_diff = c( gene_up, gene_down )

gene_all = as.character(nrDEG[ ,'ENTREZID'] )

g_list = list( gene_up = gene_up, gene_down = gene_down, gene_diff = gene_diff)

# Step2 pathway analysis ------------------------------------------

kegg_plot("limma_voom")

go_plot("limma_voom")

下面是”StromalScore“分类后差异分析的通路结果

下面是”immnue_Score“分类后差异分析的通路结果

原创：生信技能树生信菜鸟团

5分的生信数据挖掘文章全部图表复现
今天的文章是关于胶质母细胞瘤，标题是MINING TCGA DATABASE FOR GENES OF PROGN...
R实战 | TCGA数据挖掘复现--BRCA篇
本文写于观看生信技能树公众号（vx: biotrainee）的七步走纯R代码通过数据挖掘复现一篇实验文章（第1到6...
一篇数据挖掘文章的图表复现-1
0.文章文章标题：Characterization of an endoplasmic reticulum st...
一篇单细胞数据挖掘文章的图表复现
前言看到一篇单细胞数据挖掘的文章，题为：Establishment of a Prognostic Model ...
你确定不留意一下这个接收纯生信的非OA期刊吗？
在很多人眼里只有OA期刊会接纯生信数据挖掘的文章，其中并不是这样的。有很多非OA期刊也是会接收纯生信数据挖掘的文章...
Nature作图也出错：单细胞UMAP/TSNE图的ggplot
复现Nature文章的图表：我们这里主要展示UMAP图的ggplot做法，原文图如下：复现效果如下：可以发现，...
SEER数据挖掘4.8分SCI发文思路
生信做腻了吗？或者不做生信数据挖掘的，可以尝试一下别的发文思路，例如挖掘SEER数据库的临床数据进行发表SCI。本...
请自信表达，并秀出你的态度
不能怂用数据和图表说话多总结，多讨论线上可验证数据可追溯出现问题可复现（尽量在本地或测试环境复现，在本...
只要肯用心，再简单的纯生信论文都有机会发表
这次分享一下简单的生信数据挖掘发文思路，文章发表在Biomed Research International上，该...
GEO数据挖掘-第三期-口腔鳞状细胞癌（OSCC）
文章来源：生信技能树原文：GEO数据挖掘-第三期-口腔鳞状细胞癌（OSCC）文章标题 A pplication o...