美文网首页试读收入即学习
TCGA芯片数据下载及差异分析

TCGA芯片数据下载及差异分析

作者: SolomonTeng | 来源:发表于2018-11-29 22:59 被阅读156次

    好久没更新了,最近有点颓,但是该做的事情还是要继续。外包实验的事情最近搞得头大,要用一丢丢钱,做很多事情,这个真的是难,尤其是和公司谈判,心累。

    今天我来做一下TCGA芯片数据下载和差异分析的一个笔记,健明老师布置的TNBC的作业还未完成,但是先慢慢来吧,最近感觉自己的R语言基础太差了,还需要恶补啊,心里其实很清楚该怎么处理,可是写不出具体的代码,这个是最伤的。不扯了,开始正题。


    1、数据下载

    数据我是从https://xenabrowser.net/datapages/网站上下载的,上面有各种数据库的芯片数据可供下载。我选择的是乳腺癌的数据:https://tcga.xenahubs.net/download/TCGA.BRCA.sampleMap/HiSeqV2_PANCAN.gz 下载下来就好了。

    2、数据准备

    rm(list = ls())
    options(stringsAsFactors = F)
    if(F){
      array_brca=read.table('BRCA.medianexp.txt.gz',header = T,sep='    ',fill=T,quote = '')
      array_brca[1:4,1:4]
      array_brca=array_brca[-1,]
      rownames(array_brca)=array_brca[,1]
      array_brca=array_brca[,-1]
      
      exprSet=array_brca
      exprSet[1:4,1:4]
      group_list=ifelse(as.numeric(substr(colnames(array_brca),14,15)) < 10,'tumor','normal')
      #根据TCGA样本的命名可以区分正常组织和肿瘤样本的测序结果,详情阅读最后的原文。
      exprSet=as.data.frame(lapply(exprSet,as.numeric))
      rownames(exprSet)=rownames(array_brca)
      exprSet=na.omit(exprSet)
      exprSet[1:4,1:4]
      dim(exprSet)
      save(exprSet,group_list,file = "tcga_brca_array_input.Rdata")
    }
    load(file = "tcga_brca_array_input.Rdata")
    

    3、差异分析

    library( "limma" )
    {
      design <- model.matrix( ~0 + factor( group_list ) )
      colnames( design ) = levels( factor( group_list ) )
      rownames( design ) = colnames( exprSet )
    }
    design
    
    contrast.matrix <- makeContrasts( "tumor-normal", levels = design )
    contrast.matrix
    
    {
      fit <- lmFit( exprSet, design )
      fit2 <- contrasts.fit( fit, contrast.matrix ) 
      fit2 <- eBayes( fit2 )
      nrDEG = topTable( fit2, coef = 1, n = Inf )
      write.table( nrDEG, file = "nrDEG_BRCA_medianexp.out")
    }
    head(nrDEG)
    

    4、绘制热图

    library( "pheatmap" )
    {
      tmp = nrDEG[nrDEG$P.Value < 0.05,]
      差异结果需要先根据p值挑选
      nrDEG_Z = tmp[ order( tmp$logFC ), ]
      nrDEG_F = tmp[ order( -tmp$logFC ), ]
      choose_gene = c( rownames( nrDEG_Z )[1:100], rownames( nrDEG_F )[1:100] )
      choose_matrix = exprSet[ choose_gene, ]
      choose_matrix = t( scale( t( choose_matrix ) ) )
      
      choose_matrix[choose_matrix > 2] = 2
      choose_matrix[choose_matrix < -2] = -2
      
      annotation_col = data.frame( CellType = factor( group_list ) )
      rownames( annotation_col ) = colnames( exprSet )
      pheatmap( fontsize = 2, choose_matrix, annotation_col = annotation_col, show_rownames = F, annotation_legend = F, filename = "heatmap_BRCA_medianexp.png")
    }
    
    image

    5、绘制火山图

    library( "ggplot2" )
    logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) + 2 * sd( abs( logFC ) ) )
    logFC_cutoff
    logFC_cutoff = 1.2
    {
      nrDEG$change = as.factor( ifelse( nrDEG$P.Value < 0.05 & abs(nrDEG$logFC) > logFC_cutoff,
                                        ifelse( nrDEG$logFC > logFC_cutoff , 'UP', 'DOWN' ), 'NOT' ) )
      
      save( nrDEG, file = "nrDEG_array_medianexp.Rdata" )
      
      this_tile <- paste0( 'Cutoff for logFC is ', round( logFC_cutoff, 3 ),
                           ' The number of up gene is ', nrow(nrDEG[ nrDEG$change =='UP', ] ),
                           ' The number of down gene is ', nrow(nrDEG[ nrDEG$change =='DOWN', ] ) )
      
      volcano = ggplot(data = nrDEG, aes( x = logFC, y = -log10(P.Value), color = change)) +
        geom_point( alpha = 0.4, size = 1.75) +
        theme_set( theme_set( theme_bw( base_size = 15 ) ) ) +
        xlab( "log2 fold change" ) + ylab( "-log10 p-value" ) +
        ggtitle( this_tile ) + theme( plot.title = element_text( size = 15, hjust = 0.5)) +
        scale_colour_manual( values = c('blue','black','red') )
      print( volcano )
      ggsave( volcano, filename = 'volcano_BRCA_medianexp.png' )
      dev.off()
    }
    

    6、KEGG注释

    library( "clusterProfiler" )
    library( "org.Hs.eg.db" )
    df <- bitr( rownames( nrDEG ), fromType = "SYMBOL", toType = c( "ENTREZID" ), OrgDb = org.Hs.eg.db )
    head( df )
    {
      nrDEG$SYMBOL = rownames( nrDEG )
      nrDEG = merge( nrDEG, df, by='SYMBOL' )
    }
    head( nrDEG )
    
    {
      gene_up = nrDEG[ nrDEG$change == 'UP', 'ENTREZID' ] 
      gene_down = nrDEG[ nrDEG$change == 'DOWN', 'ENTREZID' ]
      gene_diff = c( gene_up, gene_down )
      gene_all = as.character(nrDEG[ ,'ENTREZID'] )
    }
    
    {
      geneList = nrDEG$logFC
      names( geneList ) = nrDEG$ENTREZID
      geneList = sort( geneList, decreasing = T )
    }
    
    library( "ggplot2" )
    # kegg  enrich 
    {
      
      {
        ## KEGG pathway analysis
        kk.up <- enrichKEGG(   gene          =  gene_up    ,
                               organism      =  'hsa'      ,
                               universe      =  gene_all   ,
                               pvalueCutoff  =  0.99       ,
                               qvalueCutoff  =  0.99        )
        kk.down <- enrichKEGG( gene          =  gene_down  ,
                               organism      =  'hsa'      ,
                               universe      =  gene_all   ,
                               pvalueCutoff  =  0.99       ,
                               qvalueCutoff  =  0.99        )
      }
      
      head( kk.up )[ ,1:6 ]
      head( kk.down )[ ,1:6 ]
      kegg_down_dt <- as.data.frame( kk.down )
      kegg_up_dt <- as.data.frame( kk.up )
      down_kegg <- kegg_down_dt[ kegg_down_dt$pvalue < 0.05, ]
      down_kegg$group = -1
      up_kegg <- kegg_up_dt[ kegg_up_dt$pvalue < 0.05, ]
      up_kegg$group = 1
      
      dat = rbind( up_kegg, down_kegg )
      dat$pvalue = -log10( dat$pvalue )
      dat$pvalue = dat$pvalue * dat$group
      
      dat = dat[ order( dat$pvalue, decreasing = F ), ]
      
      g_kegg <- ggplot( dat, 
        aes(x = reorder( Description, order( pvalue, decreasing=F ) ), y = pvalue, fill = group)) + 
        geom_bar( stat = "identity" ) + 
        scale_fill_gradient( low = "blue", high = "red", guide = FALSE ) + 
        scale_x_discrete( name = "Pathway names" ) +
        scale_y_continuous( name = "log10P-value" ) +
        coord_flip() + theme_bw() + theme( plot.title = element_text( hjust = 0.5 ) ) +
        ggtitle( "Pathway Enrichment" ) 
      print( g_kegg )
      ggsave( g_kegg, filename = 'kegg_up_down.png' )
    }
    

    7、GSEA注释

    {
      ###  GSEA 
      kk_gse <- gseKEGG(geneList     = geneList,
                        organism     = 'hsa',
                        nPerm        = 1000,
                        minGSSize    = 30,
                        pvalueCutoff = 0.9,
                        verbose      = FALSE)
      head(kk_gse)[,1:6]
      gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
     down_kegg<-kk_gse[kk_gse$pvalue<0.01 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
      up_kegg<-kk_gse[kk_gse$pvalue<0.01 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
      
      dat = rbind( up_kegg, down_kegg )
      dat$pvalue = -log10( dat$pvalue )
      dat$pvalue = dat$pvalue * dat$group
      
      dat = dat[ order( dat$pvalue, decreasing = F ), ]
      
      g_kegg <- ggplot( dat, 
                        aes(x = reorder( Description, order( pvalue, decreasing=F ) ), y = pvalue, fill = group)) + 
        geom_bar( stat = "identity" ) + 
        scale_fill_gradient( low = "blue", high = "red", guide = FALSE ) + 
        scale_x_discrete( name = "Pathway names" ) +
        scale_y_continuous( name = "log10P-value" ) +
        coord_flip() + theme_bw() + theme( plot.title = element_text( hjust = 0.5 ) ) +
        ggtitle( "Pathway Enrichment" ) 
      print( g_kegg )
      ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
    }
    

    8、GO注释

    g_list = list( gene_up = gene_up, gene_down = gene_down, gene_diff = gene_diff)
    
    go_enrich_results <- lapply( g_list, function( gene ) {
      lapply( c( 'BP', 'MF', 'CC' ) , function( ont ) {
        cat( paste( 'Now process', ont ) )
        ego <- enrichGO( gene          =  gene,
                         universe      =  gene_all,
                         OrgDb         =  org.Hs.eg.db,
                         ont           =  ont ,
                         pAdjustMethod =  "BH",
                         pvalueCutoff  =  0.99,
                         qvalueCutoff  =  0.99,
                         readable      =  TRUE)
        print( head( ego ) )
        return( ego )
      })
    })
    save( go_enrich_results, file = 'go_enrich_results.Rdata' )
    
    n1 = c( 'gene_up', 'gene_down', 'gene_diff' )
    n2 = c( 'BP', 'MF', 'CC' ) 
    for ( i in 1:3 ){
      for ( j in 1:3 ){
        fn = paste0( 'dotplot_', n1[i], '_', n2[j], '.png' )
        cat( paste0( fn, ' ' ) )
        png( fn, res = 150, width = 1080 )
        print( dotplot( go_enrich_results[[i]][[j]] ) )
        dev.off()
      }
    }
    

    代码基本可以无脑复制,直接出图,完美!

    相关文章

      网友评论

        本文标题:TCGA芯片数据下载及差异分析

        本文链接:https://www.haomeiwen.com/subject/asbqcqtx.html