找基因所在的通路

作者: 天涯清水 | 来源:发表于2020-02-04 18:09 被阅读0次

    复盘一下,找基因所在的通路下

    新的任务,找到这 276 genes encompassed nine major DDR pathways:
    base excision repair (BER),
    nucleotide excision repair (NER),
    mismatch repair (MMR),
    the Fanconi anemia (FA) pathway,
    homology-dependent recombination (HR),
    non-homologous DNA end joining (NHEJ),
    direct damage reversal repair (DR),
    translesion DNA synthesis (TLS),
    nucleotide pool maintenance (NP)
    然后搞清楚每个基因出现在多少个通路,跟上次的任务比较像,来自于Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlashttps://www.cell.com/cell-reports/pdf/S2211-1247(18)30437-6.pdf

    文章在线网址https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5961503/

    根据Jimmy老师提示KEGGREST包含KEGG通路里的所有基因,那么反过来理论上也是可行的。所以查找KEGGREST包的说明,了解这个包的使用。

    了解一下数据


    image.png

    读入数据

    rm (list=ls())
    Sys.setenv(LANGUAGE = "en") #显示英文报错信息
    options(stringsAsFactors = FALSE) #禁止chr转成factor
    
    { #install.packages
      options(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")
      options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
      if(!require("xlsx")) install.packages("xlsx",update = F,ask = F)
      if(!require("ggplot2")) install.packages("ggplot2",update = F,ask = F)
      if(!require("tidyverse")) install.packages("tidyverse",update = F,ask = F)
      if(!require("BiocManager")) install.packages("BiocManager",update = F,ask = F)
      if(!require("KEGGREST")) BiocManager::install("KEGGREST",update = F,ask = F)
      if(!require("clusterProfiler")) BiocManager::install("clusterProfiler",update = F,ask = F)
      if(!require("UpSetR")) BiocManager::install("UpSetR",update = F,ask = F)
    }
    
    # 载入数据,补充材料表1
    library("xlsx")
    res<- read.xlsx("1-s2.0-S2211124718304376-mmc2.xlsx", 1, header=F, colClasses=NA)[-(1:2),1:2]
    
    head(res)
    # colnames(res) <- res[1,]
    # res <- res[-1,]
    
    colnames(res) <- c("Entrez_Gene_ID","Gene_Symbol")
    head(res)
    

    这里可以查看差异基因在哪些KEGG通路

    library(KEGGREST)
    listDatabases()
    
     [1] "pathway"  "brite"    "module"   "ko"       "genome"   "vg"       "ag"       "compound"
     [9] "glycan"   "reaction" "rclass"   "enzyme"   "disease"  "drug"     "dgroup"   "environ" 
    [17] "genes"    "ligand"   "kegg"    
    
    #
    res1 <- paste("hsa:",res[-1, 1]) #根据包说明书对变量重命名
    res2 <- gsub(" ", "", res1) #正则表达式去掉空格
    res3 <- keggLink("pathway", res2)#查找通路
    res3
    

    分析流程:

    res1 <- paste("hsa:",res[-1, 1]) #根据包说明书对变量重命名
    
    res2 <- gsub(" ", "", res1) #正则表达式去掉空格
    
    res3 <- keggLink("pathway", res2)#查找通路
    res3
    
    dim(as.data.frame(res3))
    
    
    
    #开始进行分析
    DDR.list <- list("hsa03410", "hsa03420", "hsa03430", "hsa03440", "hsa03450", "hsa03460")
    
    # 批量提取几个pathway的基因
    DDRgene <- lapply(DDR.list, function(i){
      #i="hsa03410"
      d1 <- KEGGREST::keggGet(i)[[1]]$GENE
      gene <- sapply(seq(2, length(d1), 2), function(x){
        d2 <-  unlist(strsplit(d1[x], ";"))[1]
      })
    })
    #有三个通路在KEGG数据库中找不到,在其他数据库
    
    #https://reactome.org/PathwayBrowser/找到的,但是与原文有些不一致
    
    #该数据库的使用方法参考陈同老师的科学网博客科学网—没钱买KEGG怎么办?REACTOME开源通路更强大 - 陈同的博文 http://blog.sciencenet.cn/blog-118204-1142737.html
    
    
    #direct damage reversal/repair (DR) 
    #PS:下边虽然找到了该通路,由于该数据库通路名字的原因可能并没有找全
    
    #molecule选项里面的
    '
    Proteins    P16455  UniProt:P16455 MGMT 
    Proteins    Q6NS38  UniProt:Q6NS38 ALKBH2   
    Proteins    Q6P6C2  UniProt:Q6P6C2 ALKBH5   
    Proteins    Q9C0B1  UniProt:Q9C0B1 FTO  
    Proteins    Q9H1I8  UniProt:Q9H1I8 ASCC2    
    Proteins    Q8N3C0  UniProt:Q8N3C0 ASCC3    
    Proteins    Q8N9N2  UniProt:Q8N9N2 ASCC1    
    Proteins    Q96Q83  UniProt:Q96Q83 ALKBH3'
    # genenames <- 'Gene.Name'
    # ddr <- c("MGMT","ALKBH2","ALKBH5","FTO","ASCC2","ASCC3","ASCC1","ALKBH3")
    
    Gene.Name <-  c("MGMT","ALKBH2","ALKBH5","FTO","ASCC2","ASCC3","ASCC1","ALKBH3")
    DDR.list2 <- list(Gene.Name=Gene.Name)
    
    #同样的方法分别构建translesion DNA synthesis (TLS) 和 and nucleotide pool maintenance (NP)的通路基因
    
    #translesion DNA synthesis (TLS) 
    
    #由于TLS这个通路里面有32个基因,所以可以从文本里面导入。
    #与参考博文不一致;参考博文是85个,他应该是把该条通路下面的所有基因汇总了。
    #我觉得没有必要,上边的都没有汇总
    
    TLS <- read.xlsx("TLS.xlsx", 1, header=F, colClasses=NA)[,-(1:2)]
    
    TLS <- data.frame(TLS)
    colnames(TLS) <- c("Gene.Name")
    
    library(stringr)
    
    TLS <- str_split(as.character(TLS$Gene.Name)," ",simplify = T)[,2]
    
    TLS <- list(Gene.Name =TLS)
    DDR.list3 <- TLS
    
    
    #nucleotide pool maintenance (NP)
    #这个没有找到非常匹配的
    #按照提示,选取包括基因最多的通路DNA Repair
    NP <- read.xlsx("NP.xlsx", 1, header=F, colClasses=NA)[,-(1:2)]
    
    NP <- data.frame(NP)
    colnames(NP) <- c("Gene.Name")
    
    NP <- str_split(as.character(NP$Gene.Name)," ",simplify = T)[,2]
    
    NP <- list(Gene.Name =NP)
    
    
    DDR.list4 <- NP
    
    class(NP)
    
    merged.list <- c( DDRgene , DDR.list2, DDR.list3, DDR.list4)
    
    
    #names(merged.list) <- c(1:9)
    
    # 文章中276个基因和5条通路的交集
    # overlap <- lapply(1:5, function(x){
    #   res <- intersect(gene$Gene_Symbol, DDRgene[[x]])
    # })
    
    intersect(res[-1, 2], DDRgene[[1]])
    
    overlap <- lapply(1:9, function(x){
      res <- intersect(res[-1, 2], merged.list[[x]])
    })
    
    Reduce(intersect,overlap) # 也没有overlap
    
    
    library(ggplot2)
    library(tidyverse)
    count <- unlist(overlap) %>% table()
    count <- as.data.frame(count) %>% base::subset(Freq > 1)
    names(count)[1] <- "gene"
    
    ggplot(count, aes(gene, Freq),size =1)+
      geom_bar(stat="identity")+
      coord_flip()+
      theme_bw()
      
      
    
    ggplot(count, aes(gene, Freq, colour=Freq, size =Freq))+
      geom_point(stat="identity")+
      coord_flip()+
      theme_bw()
    
    
    
    

    韦恩图

    #没有生成图片
    if(F){
    library(UpSetR)
    listinput <- list(dfgene = res[-1, 2],
                         BER = merged.list[[1]],
                         NER = merged.list[[2]],
                         MMR = merged.list[[3]],
                          HR = merged.list[[4]],
                        NHEJ = merged.list[[5]],
                          FA = merged.list[[6]],
                          NP = merged.list[[7]],
                         TLS = merged.list[[8]],
                          DR = merged.list[[9]])
    
    pdf(file='upset.pdf',height = 8,width = 8)
    p <- upset(fromList(listinput),nsets = 9, order.by = "freq")
    dev.off()
    }
    
    

    单个基因/单个通路

    keggLink("pathway", "hsa:7157" )#单个基因,比如TP53
    png <- keggGet("path:hsa01522", "image")# 看下通路 
    t <- tempfile()
    library(png)
    writePNG(png, t)
    if (interactive()) browseURL(t)
    

    注意:这个任务中最关键的是如何确定九条信号通路中有哪些基因,但是各个数据库并不一致,这是造成与作者原图有出路的主要原因,作者在补充材料表格3中有276个基因的分类,可以参考。

    参考文献
    Genomic and Molecular Landscape of DNA Damage Repair Deficiency across The Cancer Genome Atlas
    KEGG学习笔记
    简介Bioconductor中的几个注释信息数据库

    相关文章

      网友评论

        本文标题:找基因所在的通路

        本文链接:https://www.haomeiwen.com/subject/tlvxxhtx.html