美文网首页GEO
【生信技能树】2020-01-14作业:生存分析(2)

【生信技能树】2020-01-14作业:生存分析(2)

作者: 猫叽先森 | 来源:发表于2020-01-16 00:31 被阅读0次

    题目来源:https://mp.weixin.qq.com/s/jzDD05M5AuhCkiavkoLiHg

    1. 对GSE20685的所有基因做生存分析(表达量中位值分组),获取统计学显著差异的基因列表。

    2.1 批量生存分析,获取统计学显著差异的基因列表。

    rm(list=ls())
    options(stringsAsFactors = F)
    options(warn = -1)
    
    library(AnnoProbe)
    gset<-geoChina("GSE20685")
    eSet <- exprs(gset[[1]])
    pheno <- pData(gset[[1]])
    dim(eSet)
    #[1] 54627   327
    dim(pheno)
    #[1] 327  63
    eSet[1:4,1:4]
    #          GSM519117 GSM519118 GSM519119 GSM519120
    #1007_s_at 11.995510 12.042242 11.903921 11.905713
    #1053_at    9.440330  9.329414  8.893978  9.577525
    #117_at     7.907728  7.991689  8.163261  9.160542
    #121_at     9.007045  8.976325  8.728571  8.665711
    ##表达矩阵行名是探针名,需要转换为基因名。
    
    checkGPL(gset[[1]]@annotation)
    #[1] TRUE
    e2g <- idmap(gset[[1]]@annotation)
    eSet <- filterEM(eSet,e2g)
    eSet[1:4,1:4]
    #       GSM519117 GSM519118 GSM519119 GSM519120
    #ZZZ3   10.562788  10.68748 10.209871  10.48307
    #ZZEF1   9.856933  10.46540  9.873843  10.07197
    #ZYX    10.672518  10.48420 11.005133  10.59319
    #ZYG11B 11.059366  11.29188 11.039351  11.49405
    

    查看临床信息

    image.png
    characteristics_ch1.3 是存活情况
    characteristics_ch1.4 是存活时间(存疑)
    dat <- pheno[,c("characteristics_ch1.3","characteristics_ch1.4")]
    library(stringr)
    colnames(dat) <- c("status","OS.time")
    dat$status <- as.numeric(unlist(lapply(str_split(dat$status,":"),function(x) {return(x[2])})))
    dat$OS.time <- as.numeric(unlist(lapply(str_split(dat$OS.time,":"),function(x) {return(x[2])})))
    table(dat$status)
    #  0   1 
    #244  83 
    boxplot(dat$OS.time~dat$status)
    
    boxplot.png

    死亡病例对应的持续回访时间更低。

    library(survival)
    
    cg <- array(dim = nrow(eSet))
    
    survdata <- dat
    my.surv <- Surv(survdata$OS.time,survdata$status==0)
    #library("survminer")
    dim(eSet)
    
    for (i in 1:nrow(eSet)) {
    #  i <- 1
    #  print(i)
      gene_exprs <- eSet[i,]
      gene_exprs <- as.data.frame(t(gene_exprs))
      gene_exprs[,1] <- as.numeric(gene_exprs[,1])
      gene_exprs$g <- 
      ifelse(gene_exprs[,1]>=median(gene_exprs[,1]),'high','low')
    #  print(table(gene_exprs$g))
      gene_exprs$sample_name <- rownames(gene_exprs)
      gene_surv <- survdata
      gene_surv$sample_name <- rownames(gene_surv)
      gene_surv <- merge(gene_surv,gene_exprs,by='sample_name')
    #  kmfit <- survfit(my.surv~gene_surv$g,data = gene_surv)
      kmdif <- survdiff(my.surv~gene_surv$g,data = gene_surv)
    #  ggsurvplot(kmfit,conf.int = F,pval = T)
      p.value <- 1 - pchisq(kmdif$chisq, length(kmdif$n) -1)
      cg[i] <- (p.value < 0.05)
    }
    table(cg)
    #cg
    #FALSE  TRUE 
    #18386  1802
    ##有1802个基因统计学差异显著。
    surv_diff_genes <- rownames(eSet)[cg]
    save(surv_diff_genes,file = "surv_diff_genes.rdata")
    

    2.2 对生存分析显著的基因列表做富集分析
    1)获取列表基因的ENTREZID

    rm(list=ls())
    load("surv_diff_genes.rdata")
    surv.diff.genes <- as.data.frame(surv_diff_genes)
    colnames(surv.diff.genes) <- c("SYMBOL")
    library(clusterProfiler)
    library(org.Hs.eg.db)
    df <- bitr(surv.diff.genes$SYMBOL,
               fromType = "SYMBOL",
               toType = c("ENTREZID"),
               OrgDb = org.Hs.eg.db)
    genelist <- df$ENTREZID
    length(genelist)
    #[1] 1762
    

    1)GO分析

    go_enrich_results <- lapply( c('BP','MF','CC') , function(ont) {
        cat(paste('Now process ',ont ))
        ego <- enrichGO(gene          = genelist,
                        OrgDb         = org.Hs.eg.db,
                        ont           = ont ,
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.99,
                        qvalueCutoff  = 0.99,
                        readable      = TRUE)
        
        print( head(ego) )
        dotplot(ego,title=paste0('dotplot_',ont)) %>% print()
        return(ego)
    })
    save(go_enrich_results,file = 'go_enrich_results.Rdata')
    
    dotplot_BP.png
    dotplot_MF.png
    dotplot_CC.png

    2)KEGG分析

    library(clusterProfiler)
    kk <- enrichKEGG(gene = genelist,
                 organism = 'hsa',
             pvalueCutoff = 0.9,
             qvalueCutoff = 0.9)
    head(kk)[,1:6]
    #               ID                           Description GeneRatio  BgRatio       pvalue  p.adjust
    #hsa04145 hsa04145                             Phagosome    25/658 152/7978 0.0006218799 0.1940265
    #hsa04974 hsa04974      Protein digestion and absorption    16/658  95/7978 0.0044579625 0.6954421
    #hsa04657 hsa04657               IL-17 signaling pathway    15/658  94/7978 0.0095703403 0.8056927
    #hsa00650 hsa00650                  Butanoate metabolism     6/658  28/7978 0.0241405319 0.8056927
    #hsa05130 hsa05130 Pathogenic Escherichia coli infection    25/658 202/7978 0.0259336441 0.8056927
    #hsa03010 hsa03010                              Ribosome    20/658 153/7978 0.0260288000 0.8056927
    enrichKK=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
    

    可视化

    dotplot(enrichKK,color = "pvalue")
    barplot(enrichKK,showCategory=20,color = "pvalue")
    cnetplot(enrichKK, categorySize="pvalue", colorEdge = TRUE)
    emapplot(enrichKK,color = "pvalue")
    heatplot(enrichKK,showCategory = 20)
    
    KEGG_dotplot.png
    KEGG_barplot.png
    KEGG_cnetplot.png
    KEGG_emapplot.png
    KEGG_heatplot.png

    相关文章

      网友评论

        本文标题:【生信技能树】2020-01-14作业:生存分析(2)

        本文链接:https://www.haomeiwen.com/subject/vaixzctx.html