生信人的GEO-1

作者: 泥人吴 | 来源:发表于2019-02-04 20:33 被阅读37次

    这是我学习jimmyB站的GEO数据挖掘-记下的笔记



    了解GEO数据库


    下载表达矩阵的方法:

    第一种:下载表达矩阵,利用R提取GSE42872
    a=read.table('GSE42872_series_matrix.txt.gz',sep='\t',quote = "",fill = T,
                 comment.char = "!",header = T) # 提取表达矩阵
    rownames(a)=a[,1] # 将第一列的ID设置为rownames
    a=a[,-1] # 去除第一个列
    
    第二种:getGEO下载表达矩阵GSE42872
    if(F){
      downGSE <- function(studyID = "GSE1009", destdir = ".") {
        
        library(GEOquery)
        eSet <- getGEO(studyID, destdir = destdir, getGPL = F)
        
        exprSet = exprs(eSet[[1]])
        pdata = pData(eSet[[1]])
        
        write.csv(exprSet, paste0(studyID, "_exprSet.csv"))
        write.csv(pdata, paste0(studyID, "_metadata.csv"))
        return(eSet)
        
      }
      downGSE('GSE42872')
    }
    # 这是写的一个function,后续只需要把GSE42872改为你的目标GSE即可
    

    ID转换

    还是以上面的GSE42872为例进行学习
    • 对应的GPL6244找到其注释包为:67 GPL6244——Homo sapiens hugene10sttranscriptcluster
    load(file='GSE42872_raw_exprSet.Rdata') 
    exprSet=raw_exprSet
    
    # 加载上面找到的对应注释包
    library(hugene10sttranscriptcluster.db)
    ids=toTable(hugene10sttranscriptclusterSYMBOL)
    length(unique(ids$symbol)) #查看基因多少
    tail(sort(table(ids$symbol))) # 每个基因对应的探针个数进行
    table(sort(table(ids$symbol))) #table以下上面的,查看大致情况
    plot(table(sort(table(ids$symbol)))) #画个图看看,基因对应探针的情况。
    
    > tail(sort(table(ids$symbol)))
    
      RPL41  UBTFL1  CDK11B  UBE2D3    IGKC LRRFIP1 
          6       6       8       8      10      10 
    > table(sort(table(ids$symbol)))
    
        1     2     3     4     5     6     8    10 
    18074   600   132    16     5     6     2     2 
    > plot(table(sort(table(ids$symbol))))
    
    plot5t
    表达矩阵对应注释包中对应情况
    dim(exprSet)
    table(rownames(exprSet) %in% ids$probe_id) # 查看对应情况
    dim(exprSet)
    exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,] # 过滤其中能够对应的Gene-ID
    dim(exprSet)
    
    ids=ids[match(rownames(exprSet),ids$probe_id),]Gene-ID
    
    > dim(exprSet)
    [1] 33297     6
    > table(rownames(exprSet) %in% ids$probe_id)
    
    FALSE  TRUE 
    13466 19831 
    > dim(exprSet)
    [1] 33297     6
    > exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,]
    > dim(exprSet)
    [1] 19831     6
    
    查看两者的情况
    head(ids)
    exprSet[1:5,1:5]
    
    > head(ids)
      probe_id    symbol
    1  7896759 LINC01128
    2  7896761    SAMD11
    3  7896779    KLHL17
    4  7896798   PLEKHN1
    5  7896817     ISG15
    6  7896822      AGRN
    > exprSet[1:5,1:5]
            GSM1052615 GSM1052616 GSM1052617 GSM1052618 GSM1052619
    7896759    8.75126    8.61650    8.81149    8.32067    8.41445
    7896761    8.39069    8.52617    8.43338    9.17284    9.10216
    7896779    8.20228    8.30886    8.18518    8.13322    8.06453
    7896798    8.41004    8.37679    8.27521    8.34524    8.35557
    7896817    7.72204    7.74572    7.78022    7.72308    7.53797
    
    将exprset中的ID与symbol一一对应起来
    jimmy <- function(exprSet,ids){
      tmp = by(exprSet,
               ids$symbol,
               function(x) rownames(x)[which.max(rowMeans(x))] )
    #根据id的symbol对表达矩阵进行分类,如果一个symbol对应多个探针,那么将会得到一个新的探针列表
      probes = as.character(tmp) #将上面的新的探针列表,得到一个probe
      print(dim(exprSet))
      exprSet=exprSet[rownames(exprSet) %in% probes ,] #过滤有多个探针的基因
      print(dim(exprSet))
      rownames(exprSet)=ids[match(rownames(exprSet),ids$probe_id),2]
      return(exprSet)
    }
    new_exprSet <- jimmy(exprSet,ids)
    
     exprSet=exprSet[rownames(exprSet) %in% probes ,] #过滤有多个探针的基因
    [1] 19831     6
    [1] 18837     6
    
    矩阵
    处理一个symbol对应多个探针的情况。
    • 上面我们已经展示:
    > tail(sort(table(ids$symbol)))
    
      RPL41  UBTFL1  CDK11B  UBE2D3    IGKC LRRFIP1 
          6       6       8       8      10      10
    
    • 去ids中查找IGKC得到如下:


      IGKC
    • 那么后面用R语言找出上面的10个IGKC对应的gene-id
    ### IGKC,童谣需要对exprSet及ids进行过滤
    load(file='GSE42872_raw_exprSet.Rdata') 
    exprSet=raw_exprSet
    library(hugene10sttranscriptcluster.db)
    ids=toTable(hugene10sttranscriptclusterSYMBOL)
    length(unique(ids$symbol))
    tail(sort(table(ids$symbol)))
    table(sort(table(ids$symbol)))
    plot(table(sort(table(ids$symbol))))
    
    dim(exprSet)
    table(rownames(exprSet) %in% ids$probe_id)
    dim(exprSet)
    exprSet=exprSet[rownames(exprSet) %in% ids$probe_id,]
    dim(exprSet)
    
    ids=ids[match(rownames(exprSet),ids$probe_id),]
    head(ids)
    exprSet[1:5,1:5]
    # 查找IGKC
    table(ids[,2]=='IGKC')  #ids[,2]=='IGKC' 返回的为逻辑值TRUE、FLASE
    exprSet[ids[,2]=='IGKC',] # 返回其中exprSet中对应gene-id为TRUE的值
    
    x表达矩阵
    • 上面这个即为x=exprSet[ids[,2]=='IGKC',],即IGKC对应的10个探针,在每个sample中的情况,需要筛选出其中6个sample中表达平均自最大的探针rowMeans(x)
    x=exprSet[ids[,2]=='IGKC',] 
    rownames(x) 
    rowMeans(x) 
    which.max(rowMeans(x))
    rownames(x)
    
    jimmy <- function(exprSet,ids){
      tmp = by(exprSet,
               ids$symbol,
               function(x) rownames(x)[which.max(rowMeans(x))] ) #根据id的symbol对表达矩阵进行分类,如果一个symbol对应多个探针,那么将会得到一个新的探针列表
      probes = as.character(tmp) #将上面的新的探针列表,得到一个probe
      print(dim(exprSet))
      exprSet=exprSet[rownames(exprSet) %in% probes ,]
      
      print(dim(exprSet))
      rownames(exprSet)=ids[match(rownames(exprSet),ids$probe_id),2]
      return(exprSet)
    }
    
    new_exprSet <- jimmy(exprSet,ids) 
    # jimmy这个函数要求较严格,需要表达矩阵为matrix,ids与其一一对应,过滤
    
    某些GPL无法扎到对应的注释包
    getwd()
    library(GEOquery)
    gpl=getGEO('GPL6480',destdir = getwd())
    colnames(Table(gpl)) ##you need to check this,which column do you need 
    head(Table(gpl)[,c(1,6,7)])
    write.csv(Table(gpl)[,c(1,6,7)],"GPL6400.csv")
    
    • 上面代码运行如下:
    > getwd()
    [1] "F:/R studying/GEO-master/GEO-master/GSE42872"
    > library(GEOquery)
    Setting options('download.file.method.GEOquery'='auto')
    Setting options('GEOquery.inmemory.gpl'=FALSE)
    > gpl=getGEO('GPL6480',destdir = getwd())
    File stored at: 
    F:/R studying/GEO-master/GEO-master/GSE42872/GPL6480.soft
    Warning: 94 parsing failures.
      row          col           expected actual         file
    41001 CONTROL_TYPE 1/0/T/F/TRUE/FALSE    pos literal data
    41002 CONTROL_TYPE 1/0/T/F/TRUE/FALSE    pos literal data
    41003 CONTROL_TYPE 1/0/T/F/TRUE/FALSE    pos literal data
    41004 CONTROL_TYPE 1/0/T/F/TRUE/FALSE    pos literal data
    41005 CONTROL_TYPE 1/0/T/F/TRUE/FALSE    pos literal data
    ..... ............ .................. ...... ............
    See problems(...) for more details.
    
    > colnames(Table(gpl))
     [1] "ID"                   "SPOT_ID"              "CONTROL_TYPE"         "REFSEQ"              
     [5] "GB_ACC"               "GENE"                 "GENE_SYMBOL"          "GENE_NAME"           
     [9] "UNIGENE_ID"           "ENSEMBL_ID"           "TIGR_ID"              "ACCESSION_STRING"    
    [13] "CHROMOSOMAL_LOCATION" "CYTOBAND"             "DESCRIPTION"          "GO_ID"               
    [17] "SEQUENCE"            
    > head(Table(gpl)[,c(1,6,7)])
                ID   GENE GENE_SYMBOL
    1 A_23_P100001 400451     FAM174B
    2 A_23_P100011  10239       AP3S2
    3 A_23_P100022   9899        SV2B
    4 A_23_P100056 348093      RBPMS2
    5 A_23_P100074  57099        AVEN
    6 A_23_P100092 146050     ZSCAN29
    

    了解你的表达矩阵

    load(file='GSE42872_new_exprSet.Rdata')
    exprSet=new_exprSet
    # if(T){
    # 数据转换
      library(reshape2)
      exprSet_L=melt(exprSet)
      colnames(exprSet_L)=c('probe','sample','value')
      exprSet_L$group=rep(group_list,each=nrow(exprSet))
      head(exprSet_L)
    
    查看每个数据的基本情况
      ### ggplot2
      library(ggplot2)
      p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
      print(p) # boxplot,每个样本基本要在一条线上次才能进行比较
    
    boxplot
    • 后面这些代码画出大致意义同上
      p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_violin()
      print(p)
      p=ggplot(exprSet_L,aes(value,fill=group))+geom_histogram(bins = 200)+facet_wrap(~sample, nrow = 4)
      print(p)
      p=ggplot(exprSet_L,aes(value,col=group))+geom_density()+facet_wrap(~sample, nrow = 4)
      print(p)
      p=ggplot(exprSet_L,aes(value,col=group))+geom_density()
      print(p)
      p=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot()
      p=p+stat_summary(fun.y="mean",geom="point",shape=23,size=3,fill="red")
      p=p+theme_set(theme_set(theme_bw(base_size=20)))
      p=p+theme(text=element_text(face='bold'),axis.text.x=element_text(angle=30,hjust=1),axis.title=element_blank())
      print(p)
    
    image.png
    hclust
      ## hclust
      colnames(exprSet)=paste(group_list,1:ncol(exprSet),sep='_')
      # Define nodePar
      nodePar <- list(lab.cex = 0.6, pch = c(NA, 19),
                      cex = 0.7, col = "blue")
      hc=hclust(dist(t(exprSet)))
      plot(hc) # 查看每组之间样本是否存在outline的
      par(mar=c(5,5,5,10))
      png('hclust.png',res=120)
      dev.off()
      plot(as.dendrogram(hc), nodePar = nodePar, horiz = TRUE)
      dev.off()
    
    huclust
    PCA
      ## PCA
      library(ggfortify)
      df=as.data.frame(t(exprSet))
      df$group=group_list
      png('pca.png',res=120)
      dev.off()
      autoplot(prcomp( df[,1:(ncol(df)-1)] ), data=df,colour = 'group')+theme_bw()
      dev.off()
    # }
    
    PCA

    差异分析

    用limma对芯片数据做差异分析

    limma包的使用

    使用这个包需要三个数据:

    • 表达矩阵
    • 分组矩阵
    • 差异比较矩阵

    而且总共也只有三个步骤:

    • lmFit
    • eBayes
    • topTable

    三个数据

    1+2拿到分组矩阵design(表达矩阵已经存在)
    load(file='GSE42872_new_exprSet.Rdata')
    exprSet=new_exprSet
    dim(exprSet)
    group_list
    
    library(limma)
    # tmp=data.frame(case=c(0,0,0,1,1,1),control=c(1,1,1,0,0,0))
    #design.factor = factor(group_list, levels=c("control","case"))
    #design.matrix = model.matrix(~0+design.factor)
    #colnames(design.matrix) = levels(design.factor)
    #tmp=design.matrix
    design <- model.matrix(~0+factor(group_list))
    colnames(design)=levels(factor(group_list))
    rownames(design)=colnames(exprSet)
    design # 分组矩阵
    
    • 分组矩阵需要对应好表达矩阵


      分组矩阵
    3.比较矩阵contrast.matrix
    contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),
                                   levels = design)
    contrast.matrix<-makeContrasts("case-control",
                                   levels = design)
    
    contrast.matrix #这个矩阵声明,我们要把progres.组跟stable进行差异分析比较
    
    contrast.matrix
    • 需要确保case1,control -1.

    三个步骤

    lmFit+eBayes+topTable
    deg = function(exprSet,design,contrast.matrix){
      ##step1
      fit <- lmFit(exprSet,design)
      ##step2
      fit2 <- contrasts.fit(fit, contrast.matrix) 
      ##这一步很重要,大家可以自行看看效果
      
      fit2 <- eBayes(fit2)  ## default no trend !!!
      ##eBayes() with trend=TRUE
      ##step3
      tempOutput = topTable(fit2, coef=1, n=Inf)
      nrDEG = na.omit(tempOutput) 
      #write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
      head(nrDEG)
      return(nrDEG)
    }
    
    re = deg(exprSet,design,contrast.matrix)
    

    作图

    热图-pheatmap

    nrDEG=re
    ## heatmap
    library(pheatmap)
    choose_gene=head(rownames(nrDEG),25) ## 50 maybe better
    choose_matrix=exprSet[choose_gene,]
    choose_matrix=t(scale(t(choose_matrix)))
    pheatmap(choose_matrix,filename = 'DEG_top100_heatmap.png')
    
    pheatmao

    火山图

    library(ggplot2)
    ## volcano plot
    colnames(nrDEG)
    plot(nrDEG$logFC,-log10(nrDEG$P.Value))
    DEG=nrDEG
    logFC_cutoff <- with(DEG,mean(abs( logFC)) + 2*sd(abs( logFC)) )
    # logFC_cutoff=1
    
    DEG$change = as.factor(ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
                                  ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
    )
    this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                        '\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
                        '\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
    )
    
    g = ggplot(data=DEG, 
               aes(x=logFC, y=-log10(P.Value), 
                   color=change)) +
      geom_point(alpha=0.4, size=1.75) +
      theme_set(theme_set(theme_bw(base_size=20)))+
      xlab("log2 fold change") + ylab("-log10 p-value") +
      ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
      scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
    print(g)
    #ggsave(g,filename = 'volcano.png')
     
    #save(new_exprSet,group_list,nrDEG,DEG, file='GSE42872_DEG.Rdata')
    

    火山图

    下游分析

    学习包clusteprofile的过程,你不必一一运行

    rm(list=ls())
    load(file='GSE42872_DEG.Rdata')
    gene=head(rownames(nrDEG),1000) #此处我们应该取到其上调的486个基因
    gene
    
    ## KEGG pathway analysis,来自Y叔的clustrprofile
    library(clusterProfiler)
    ## 包接受的gene为entrezid,那么需要将gene的symbol改为ENTREZID
    ### get the universal genes and sDEG 
    
    gene.df <- bitr(gene, fromType = "SYMBOL",
                    toType = c("ENSEMBL", "ENTREZID"),
                    OrgDb = org.Hs.eg.db)
    head(gene.df)
    # 此处包中接受的gene为ENTREZID,那我们将gene.df中的ENTREZID赋值给包中的gene
    kk <- enrichKEGG(gene         = gene.df$ENTREZID,
                     organism     = 'hsa',
                     pvalueCutoff = 0.05)
    head(kk)[,1:6]
    
    • 运行如下:
    > load(file='GSE42872_DEG.Rdata')
    > gene=head(rownames(nrDEG),1000) #此处我们应该取到其上调的486个基因
    > gene [1] "CD36"         "DUSP6"        "DCT"          "SPRY2"        "MOXD1"       
    [6] "ETV4"         "ETV5"         "ST6GALNAC2"   "DTL"          "NUPR1"  
    [11] "LDLR"         "SPRY4"        "FST"          "TXNIP"        "CCND1"  
    [16] "IER3"         "SERPINF1"     "FAM111B"      "CD24"         "RNF150"    
    ...
    
    [986] "SASS6"        "NUP58"        "TSPAN13"      "MDGA2"        "FJX1"        
     [991] "MMD"          "CKS1B"        "PSAP"         "RMI1"         "TRIM51"      
     [996] "EMG1"         "CENPP"        "LINC00346"    "MAK16"        "TMEM232"     
    > ## KEGG pathway analysis,来自Y叔的clustrprofile
    > library(clusterProfiler)
    > gene.df <- bitr(gene, fromType = "SYMBOL",
    +                 toType = c("ENSEMBL", "ENTREZID"),
    +                 OrgDb = org.Hs.eg.db)
    'select()' returned 1:many mapping between keys and columns
    Warning message:
    In bitr(gene, fromType = "SYMBOL", toType = c("ENSEMBL", "ENTREZID"),  :
      1.3% of input gene IDs are fail to map...
    > head(gene.df)
      SYMBOL         ENSEMBL ENTREZID
    1   CD36 ENSG00000135218      948
    2  DUSP6 ENSG00000139318     1848
    3    DCT ENSG00000080166     1638
    4  SPRY2 ENSG00000136158    10253
    5  MOXD1 ENSG00000079931    26002
    6   ETV4 ENSG00000175832     2118
    > # 此处包中接受的gene为ENTREZID,那我们将gene.df中的ENTREZID赋值给包中的gene
    > kk <- enrichKEGG(gene         = gene.df$ENTREZID,
    +                  organism     = 'hsa',
    +                  pvalueCutoff = 0.05)
    > head(kk)[,1:6]
                   ID                  Description GeneRatio  BgRatio       pvalue
    hsa03030 hsa03030              DNA replication    18/427  36/7493 9.927795e-14
    hsa04110 hsa04110                   Cell cycle    30/427 124/7493 6.292081e-12
    hsa05322 hsa05322 Systemic lupus erythematosus    27/427 133/7493 4.890465e-09
    hsa05034 hsa05034                   Alcoholism    28/427 180/7493 9.561579e-07
    hsa05203 hsa05203         Viral carcinogenesis    29/427 201/7493 2.958811e-06
    hsa03460 hsa03460       Fanconi anemia pathway    13/427  54/7493 7.211884e-06
                 p.adjust
    hsa03030 2.779783e-11
    hsa04110 8.808914e-10
    hsa05322 4.564434e-07
    hsa05034 6.693106e-05
    hsa05203 1.656934e-04
    hsa03460 3.365546e-04
    

    数据处理

    rm(list=ls())
    ### ---------------
    ###
    ### Create: Jianming Zeng
    ### Date: 2018-07-09 20:11:07
    ### Email: jmzeng1314@163.com
    ### Blog: http://www.bio-info-trainee.com/
    ### Forum:  http://www.biotrainee.com/thread-1376-1-1.html
    ### CAFS/SUSTC/Eli Lilly/University of Macau
    ### Update Log: 2018-07-09  First version
    ###
    ### ---------------
    
    load(file='GSE42872_DEG.Rdata')
    source('functions.R')
    library(ggplot2)
    library(clusterProfiler)
    library(org.Hs.eg.db)
    df <- bitr(rownames(DEG), fromType = "SYMBOL",
               toType = c( "ENTREZID"),
               OrgDb = org.Hs.eg.db) #SYMBOL转为ENTREZID
    head(df)
    head(DEG)
    DEG$SYMBOL = rownames(DEG)
    DEG=merge(DEG,df,by='SYMBOL')
    head(DEG)
    
    gene_up= DEG[DEG$change == 'UP','ENTREZID'] 
    gene_down=DEG[DEG$change == 'DOWN','ENTREZID'] 
    gene_diff=c(gene_up,gene_down)
    gene_all=as.character(DEG[ ,'ENTREZID'] )
    
    data(geneList, package="DOSE") #加载genelist这个包
    head(geneList)
    boxplot(geneList) # 是一个0往上下走的数值
    boxplot(DEG$logFC) #那我此处DEG$logFC同genelist,就是我们所需要的数值
    
    geneList=DEG$logFC #所以将DEG$logFC赋值给包中的genelist
    names(geneList)=DEG$ENTREZID #命名?
    geneList=sort(geneList,decreasing = T)
    

    KEGG 分析

    ## KEGG pathway analysis
    if(T){
      ###   over-representation test,
      kk.up <- enrichKEGG(gene         = gene_up,
                          organism     = 'hsa',
                          universe     = gene_all,
                          pvalueCutoff = 0.9,
                          qvalueCutoff =0.9) #当中的pvalueCutoff、qvalueCutoff值是否需要更改?
      head(kk.up)[,1:6]
      kk.down <- enrichKEGG(gene         =  gene_down,
                            organism     = 'hsa',
                            universe     = gene_all,
                            pvalueCutoff = 0.05)
      head(kk.down)[,1:6]
      kk.diff <- enrichKEGG(gene         = gene_diff,
                            organism     = 'hsa',
                            pvalueCutoff = 0.05)
      head(kk.diff)[,1:6]
      
      kegg_diff_dt <- as.data.frame(kk.diff)
      kegg_down_dt <- as.data.frame(kk.down)
      kegg_up_dt <- as.data.frame(kk.up)
      down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,];down_kegg$group=-1
      up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,];up_kegg$group=1
    
      g_kegg=kegg_plot(up_kegg,down_kegg)
      print(g_kegg)
      #ggsave(g_kegg,filename = 'kegg_up_down.png')
    
    KEGG

    GSEA

    ###  GSEA 
    kk_gse <- gseKEGG(geneList     = geneList,
                      organism     = 'hsa',
                      nPerm        = 1000,
                      minGSSize    = 120,
                      pvalueCutoff = 0.9, #后期使用根据自己的实际情况修改这个值
                      verbose      = FALSE)
    head(kk_gse)[,1:6]
    dev.off()
    gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
    
    down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
    up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
    
    g_kegg=kegg_plot(up_kegg,down_kegg)
    print(g_kegg)
    #ggsave(g_kegg,filename = 'kegg_up_down_gsea.png')
    }
    
    gseaplot

    GO

    ### GO database analysis 
    g_list=list(gene_up=gene_up,
                gene_down=gene_down,
                gene_diff=gene_diff)
    
    if(F){
      go_enrich_results <- lapply( g_list , function(gene) {
        lapply( c('BP','MF','CC') , function(ont) {
          cat(paste('Now process ',ont ))
          ego <- enrichGO(gene          = gene,
                          universe      = gene_all,
                          OrgDb         = org.Hs.eg.db,
                          ont           = ont ,
                          pAdjustMethod = "BH",
                          pvalueCutoff  = 0.99,
                          qvalueCutoff  = 0.99,
                          readable      = TRUE)
          print( head(ego) )
          return(ego)
        })
      })
      #save(go_enrich_results,file = 'go_enrich_results.Rdata')
    }
    load(file = 'go_enrich_results.Rdata')
    
    n1= c('gene_up','gene_down','gene_diff')
    n2= c('BP','MF','CC') 
    for (i in 1:3){
      for (j in 1:3){
        fn=paste0('dotplot_',n1[i],'_',n2[j],'.png')
        cat(paste0(fn,'\n'))
        png(fn,res=150,width = 1080)
        print( dotplot(go_enrich_results[[i]][[j]] ))
        dev.off()
      }
    }
    # TODO:
    # ~~~~~~
    
    • 最后会出现如下这些图:
    dotplot_gene_up_BP.png
    dotplot_gene_up_MF.png
    dotplot_gene_up_CC.png
    dotplot_gene_down_BP.png
    dotplot_gene_down_MF.png
    dotplot_gene_down_CC.png
    dotplot_gene_diff_BP.png
    dotplot_gene_diff_MF.png
    dotplot_gene_diff_CC.png
    

    生存分析

    我就直接放代码了,自己运行一遍吧。
    rm(list=ls())
    ### ---------------
    ###
    ### Create: Jianming Zeng
    ### Date: 2018-07-09 20:11:07
    ### Email: jmzeng1314@163.com
    ### Blog: http://www.bio-info-trainee.com/
    ### Forum:  http://www.biotrainee.com/thread-1376-1-1.html
    ### CAFS/SUSTC/Eli Lilly/University of Macau
    ### Update Log: 2018-07-09  First version
    ###
    ### ---------------
    
    # R里面实现生存分析非常简单!
    
    # 用my.surv <- surv(OS_MONTHS,OS_STATUS=='DECEASED')构建生存曲线。
    # 用kmfit2 <- survfit(my.surv~TUMOR_STAGE_2009)来做某一个因子的KM生存曲线。
    # 用 survdiff(my.surv~type, data=dat)来看看这个因子的不同水平是否有显著差异,其中默认用是的logrank test 方法。
    # 用coxph(Surv(time, status) ~ ph.ecog + tt(age), data=lung) 来检测自己感兴趣的因子是否受其它因子(age,gender等等)的影响。
    
    load(file='GSE11121_new_exprSet.Rdata')
    exprSet=new_exprSet
    dim(exprSet)
    colnames(phe)
    colnames(phe)=c('event','grade','node','size','time')
    colnames(phe)
    phe = as.data.frame(apply(phe,2,as.numeric)) #2是随便写的一个
    dev.off()
    boxplot(phe$size) #用boxplot看看大size的大小
    phe$size=ifelse(phe$size>median(phe$size),'big','small')#根据median(size)
    
    library(survival)
    library(survminer)
    # 利用ggsurvplot快速绘制漂亮的生存曲线图
    sfit <- survfit(Surv(time, event)~size, data=phe) #time、event与size的关系
    sfit
    summary(sfit)
    ggsurvplot(sfit, conf.int=F, pval=TRUE)
    ggsurvplot(sfit,palette = c("#E7B800", "#2E9FDF"),
               risk.table =TRUE,pval =TRUE,
               conf.int =TRUE,xlab ="Time in months", 
               ggtheme =theme_light(), 
               ncensor.plot = TRUE)
    ## 多个 ggsurvplots作图生存曲线代码合并代码公布
    sfit1=survfit(Surv(time, event)~size, data=phe)
    sfit2=survfit(Surv(time, event)~grade, data=phe)
    splots <- list()
    splots[[1]] <- ggsurvplot(sfit1,pval =TRUE, data = phe, risk.table = TRUE)
    splots[[2]] <- ggsurvplot(sfit2,pval =TRUE, data = phe, risk.table = TRUE)
    # Arrange multiple ggsurvplots and print the output
    arrange_ggsurvplots(splots, print = TRUE,  ncol = 2, nrow = 1, risk.table.height = 0.4)
    ## 挑选感兴趣的基因做差异分析
    phe$CBX4=ifelse(exprSet['CBX4',]>median(exprSet['CBX4',]),'high','low')
    table(phe$CBX4)
    ggsurvplot(survfit(Surv(time, event)~CBX4, data=phe), conf.int=F, pval=TRUE)
    phe$CBX6=ifelse(exprSet['CBX6',]>median(exprSet['CBX6',]),'high','low')
    table(phe$CBX6)
    ggsurvplot(survfit(Surv(time, event)~CBX6, data=phe), conf.int=F, pval=TRUE)
    ## 批量生存分析 使用  logrank test 方法
    mySurv=with(phe,Surv(time, event))
    log_rank_p <- apply(exprSet , 1 , function(gene){
      #gene=exprSet[1,]
      phe$group=ifelse(gene>median(gene),'high','low')  
      data.survdiff=survdiff(mySurv~group,data=phe)
      p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
      return(p.val)
    })
    log_rank_p=sort(log_rank_p)
    head(log_rank_p)
    boxplot(log_rank_p) 
    phe$H6PD=ifelse(exprSet['H6PD',]>median(exprSet['H6PD',]),'high','low')
    table(phe$H6PD)
    ggsurvplot(survfit(Surv(time, event)~H6PD, data=phe), conf.int=F, pval=TRUE)
    
    ## 批量生存分析 使用 coxph 回归方法
    colnames(phe)
    mySurv=with(phe,Surv(time, event))
    cox_results <-apply(exprSet , 1 , function(gene){
      group=ifelse(gene>median(gene),'high','low')
      survival_dat <- data.frame(group=group,grade=phe$grade,size=phe$size,stringsAsFactors = F)
      m=coxph(mySurv ~ grade + size + group, data =  survival_dat)
      
      beta <- coef(m)
      se <- sqrt(diag(vcov(m)))
      HR <- exp(beta)
      HRse <- HR * se
      
      #summary(m)
      tmp <- round(cbind(coef = beta, se = se, z = beta/se, p = 1 - pchisq((beta/se)^2, 1),
                         HR = HR, HRse = HRse,
                         HRz = (HR - 1) / HRse, HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
                         HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
                         HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
      return(tmp['grouplow',])
      
    })
    cox_results=t(cox_results)
    table(cox_results[,4]<0.05)
    cox_results[cox_results[,4]<0.05,]
    
    length(setdiff(rownames(cox_results[cox_results[,4]<0.05,]),
                   names(log_rank_p[log_rank_p<0.05])
    ))
    length(setdiff( names(log_rank_p[log_rank_p<0.05]),
                    rownames(cox_results[cox_results[,4]<0.05,])
    ))
    length(unique( names(log_rank_p[log_rank_p<0.05]),
                    rownames(cox_results[cox_results[,4]<0.05,])
    ))
    save(log_rank_p,cox_results,exprSet,phe,file = 'surviva.Rdata')
    

    写在最后吧:

    • 上面所有代码都来自jimmy大大的;
    • 我这个只是跟着视频把GEO数据在现过程,这个教程只是帮组你参考;
    • 不需要跟着我的教程走,跟着jimmy大大的视频走一遍就更完美了。

    相关文章

      网友评论

        本文标题:生信人的GEO-1

        本文链接:https://www.haomeiwen.com/subject/ckkssqtx.html