美文网首页rr公共数据挖掘task参考
R语言获取 GO/KEGG 中某一通路的基因集

R语言获取 GO/KEGG 中某一通路的基因集

作者: 生信摆渡 | 来源:发表于2021-06-09 13:34 被阅读0次

    有时候我们想看某一特定通路的富集情况,这时就需要获取该通路的基因集。

    一、从 GO 中获取

    下载脚本 getGoTerm.R,获取所有的 GO geneSet,然后保存为RData,因为 get_GO_data 这一步非常耗时。

    source("getGoTerm.R")
    GO_DATA <- get_GO_data("org.Hs.eg.db", "ALL", "SYMBOL")
    save(GO_DATA, file = "GO_DATA.RData")
    

    然后,我写了两个小函数 findGOgetGO 分别用于寻找 GO ID 和获取 GO geneSet。

    findGO <- function(pattern, method = "key"){
    
        if(!exists("GO_DATA"))
            load("GO_DATA.RData")
        if(method == "key"){
            pathways = cbind(GO_DATA$PATHID2NAME[grep(pattern, GO_DATA$PATHID2NAME)])
        } else if(method == "gene"){
            pathways = cbind(GO_DATA$PATHID2NAME[GO_DATA$EXTID2PATHID[[pattern]]])
        }
    
        colnames(pathways) = "pathway"
    
        if(length(pathways) == 0){
            cat("No results!\n")
        } else{
            return(pathways)
        }
    }
    
    getGO <- function(ID){
    
        if(!exists("GO_DATA"))
            load("GO_DATA.RData")
        allNAME = names(GO_DATA$PATHID2EXTID)
        if(ID %in% allNAME){
            geneSet = GO_DATA$PATHID2EXTID[ID]
            names(geneSet) = GO_DATA$PATHID2NAME[ID]
            return(geneSet)     
        } else{
            cat("No results!\n")
        }
    }
    

    用法示例

    load("GO_DATA.RData") # 载入数据 GO_DATA
    findGO("insulin") # 寻找 含有指定关键字的 pathway name 的 pathway
    findGO("INS", method = "gene") # 寻找含有指定基因名的 pathway
    
    getGO("GO:1901142") # 获取指定 GO ID 的 gene set
    $`insulin metabolic process`
    [1] "CEACAM1" "CPE"     "ERN1"    "IDE"     "PCSK2"   "ERO1B"
    

    二、从KEGG 中获取

    1. 获取目标通路的ID号

    以人类糖尿病相关通路为例,访问 https://www.kegg.jp/kegg/pathway.html ,在 Select prefix 中输入人类的缩写 hsaEnter keywords 中输入关键词 diabetes,检索到了8个相关通路,点进第一个 Type II diabetes mellitus,可以查看详细信息,ID号为 hsa04930

    2. 开始获取
    # 方法一
    if(!requireNamespace("BiocManager", quietly = TRUE))      
       install.packages("BiocManager") 
    BiocManager::install("KEGGREST", version = "3.10")
    
    library("KEGGREST")
    gsInfo = keggGet('hsa04930')[[1]]
    names(gsInfo)
    
    geneSetRaw = sapply(strsplit(gsInfo$GENE, ";"), function(x) x[1])
    geneSet = list(geneSetRaw[seq(2, length(geneSetRaw), 2)])
    names(geneSet) = gsInfo$NAME
    > geneSet
    $`Type II diabetes mellitus - Homo sapiens (human)`
     [1] "INS"     "INSR"    "IRS1"    "IRS2"    "IRS4"    "PIK3R1"  "PIK3R2"
     [8] "PIK3R3"  "PIK3CA"  "PIK3CD"  "PIK3CB"  "SLC2A4"  "ADIPOQ"  "MAPK1"
    [15] "MAPK3"   "MTOR"    "PRKCZ"   "SOCS1"   "SOCS2"   "SOCS3"   "SOCS4"
    [22] "IKBKB"   "MAPK8"   "MAPK10"  "MAPK9"   "TNF"     "PRKCD"   "PRKCE"
    [29] "PDX1"    "MAFA"    "SLC2A2"  "HK3"     "HK1"     "HK2"     "HKDC1"
    [36] "GCK"     "PKM"     "PKLR"    "KCNJ11"  "ABCC8"   "CACNA1C" "CACNA1D"
    [43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
    
    # 方法二
    library("rjson")
    download.file("http://togows.dbcls.jp/entry/pathway/hsa04930/genes.json", "hsa04930.json")
    json = fromJSON(file ="hsa04930.json")
    geneSet = list(as.character(sapply(json[[1]], function(x) sapply(strsplit(x[1], ";"), function(x) x[1]))))
    > geneSet
    [[1]]
     [1] "INS"     "INSR"    "IRS1"    "IRS2"    "IRS4"    "PIK3R1"  "PIK3R2"
     [8] "PIK3R3"  "PIK3CA"  "PIK3CD"  "PIK3CB"  "SLC2A4"  "ADIPOQ"  "MAPK1"
    [15] "MAPK3"   "MTOR"    "PRKCZ"   "SOCS1"   "SOCS2"   "SOCS3"   "SOCS4"
    [22] "IKBKB"   "MAPK8"   "MAPK10"  "MAPK9"   "TNF"     "PRKCD"   "PRKCE"
    [29] "PDX1"    "MAFA"    "SLC2A2"  "HK3"     "HK1"     "HK2"     "HKDC1"
    [36] "GCK"     "PKM"     "PKLR"    "KCNJ11"  "ABCC8"   "CACNA1C" "CACNA1D"
    [43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
    

    不过第二种方法不能获取geneSet的 NAME,因此更推荐第一种方法。

    同样将以上步骤包装为函数:

    getKEGG <- function(ID){
    
        library("KEGGREST")
    
        gsList = list()
        for(xID in ID){
    
            gsInfo = keggGet(xID)[[1]]
            if(!is.null(gsInfo$GENE)){
                geneSetRaw = sapply(strsplit(gsInfo$GENE, ";"), function(x) x[1])   
                xgeneSet = list(geneSetRaw[seq(2, length(geneSetRaw), 2)])          
                NAME = sapply(strsplit(gsInfo$NAME, " - "), function(x) x[1])
                names(xgeneSet) = NAME
                gsList[NAME] = xgeneSet 
            } else{
                cat(" ", xID, "No corresponding gene set in specific database.\n")
            }
        }
        return(gsList)
    }
    
    > getKEGG('hsa04930')
    $`Type II diabetes mellitus - Homo sapiens (human)`
     [1] "INS"     "INSR"    "IRS1"    "IRS2"    "IRS4"    "PIK3R1"  "PIK3R2"
     [8] "PIK3R3"  "PIK3CA"  "PIK3CD"  "PIK3CB"  "SLC2A4"  "ADIPOQ"  "MAPK1"
    [15] "MAPK3"   "MTOR"    "PRKCZ"   "SOCS1"   "SOCS2"   "SOCS3"   "SOCS4"
    [22] "IKBKB"   "MAPK8"   "MAPK10"  "MAPK9"   "TNF"     "PRKCD"   "PRKCE"
    [29] "PDX1"    "MAFA"    "SLC2A2"  "HK3"     "HK1"     "HK2"     "HKDC1"
    [36] "GCK"     "PKM"     "PKLR"    "KCNJ11"  "ABCC8"   "CACNA1C" "CACNA1D"
    [43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
    
    > getKEGG(c("hsa04930", "hsa04940", "hsa04950"))
    $`Type II diabetes mellitus`
     [1] "INS"     "INSR"    "IRS1"    "IRS2"    "IRS4"    "PIK3R1"  "PIK3R2"
     [8] "PIK3R3"  "PIK3CA"  "PIK3CD"  "PIK3CB"  "SLC2A4"  "ADIPOQ"  "MAPK1"
    [15] "MAPK3"   "MTOR"    "PRKCZ"   "SOCS1"   "SOCS2"   "SOCS3"   "SOCS4"
    [22] "IKBKB"   "MAPK8"   "MAPK10"  "MAPK9"   "TNF"     "PRKCD"   "PRKCE"
    [29] "PDX1"    "MAFA"    "SLC2A2"  "HK3"     "HK1"     "HK2"     "HKDC1"
    [36] "GCK"     "PKM"     "PKLR"    "KCNJ11"  "ABCC8"   "CACNA1C" "CACNA1D"
    [43] "CACNA1A" "CACNA1B" "CACNA1E" "CACNA1G"
    
    $`Type I diabetes mellitus`
     [1] "INS"      "GAD1"     "GAD2"     "PTPRN"    "PTPRN2"   "CPE"
     [7] "HSPD1"    "ICA1"     "HLA-DMA"  "HLA-DMB"  "HLA-DOA"  "HLA-DOB"
    [13] "HLA-DPA1" "HLA-DPB1" "HLA-DQA1" "HLA-DQA2" "HLA-DQB1" "HLA-DRA"
    [19] "HLA-DRB1" "HLA-DRB3" "HLA-DRB4" "HLA-DRB5" "CD80"     "CD86"
    [25] "CD28"     "IL12A"    "IL12B"    "IL2"      "IFNG"     "HLA-A"
    [31] "HLA-B"    "HLA-C"    "HLA-F"    "HLA-G"    "HLA-E"    "FASLG"
    [37] "FAS"      "PRF1"     "GZMB"     "LTA"      "TNF"      "IL1A"
    [43] "IL1B"
    
    $`Maturity onset diabetes of the young`
     [1] "HHEX"    "MNX1"    "ONECUT1" "PDX1"    "NR5A2"   "NEUROG3" "NKX2-2"
     [8] "NKX6-1"  "PAX6"    "PAX4"    "NEUROD1" "RFX6"    "HES1"    "HNF1B"
    [15] "FOXA2"   "MAFA"    "HNF4A"   "HNF1A"   "HNF4G"   "FOXA3"   "PKLR"
    [22] "SLC2A2"  "INS"     "IAPP"    "GCK"     "BHLHA15"
    
    

    Ref

    用clusterProfiler做GSEA(一)

    从KEGG的pathway中提取gene symbol

    R-下载某一条通路的所有基因名字(KEGG)

    相关文章

      网友评论

        本文标题:R语言获取 GO/KEGG 中某一通路的基因集

        本文链接:https://www.haomeiwen.com/subject/rcjjeltx.html