美文网首页
KEGG数据库数据下载

KEGG数据库数据下载

作者: wo_monic | 来源:发表于2023-11-23 10:54 被阅读0次

    KEGG 数据库提供 REST API 供用户批量访问,接口文档见 https://www.kegg.jp/kegg/rest/keggapi.html
    使用 R 语言的 KEGGREST 包可以方便地通过该 API 下载数据。

    # Install the Bioconductor packages used by this script (only needed once).
    BiocManager::install("KEGGREST")
    BiocManager::install("fmcsR")
    #BiocManager::install("RbioRXN")
    #devtools::install_git("https://github.com/cran/RbioRXN.git")
    ## Load the package
    library(KEGGREST)
    #library(RbioRXN)
    ## Show which databases KEGG exposes
    listDatabases()

    # Replace the installed KEGGREST with the patch-1 branch of kozo2/KEGGREST
    # on GitHub; the package has to be unloaded before it can be reinstalled.
    detach(package:KEGGREST, unload = TRUE)
    library(devtools)
    devtools::install_github("https://github.com/kozo2/KEGGREST/tree/patch-1")
    # Re-attach KEGGREST: detach() above removed it from the search path, so
    # listDatabases() / keggList() below would otherwise not be found.
    library(KEGGREST)
    library(httr)
    # Route httr requests through a local proxy. The url is this machine's
    # address and the port is the local port of your proxy client. Skip this
    # line if you can reach the KEGG servers directly.
    set_config(use_proxy(url="127.0.0.1",port=10809))
    # Show all databases available through KEGG
    listDatabases()
    ## Fetch the full listing of one database
    pathway <- keggList("pathway")
    compound <- keggList("compound")
    reaction <- keggList("reaction")
    
    library(plyr)
    # Adapted from the RbioRXN package source, copied here so that RbioRXN
    # itself does not have to be installed.
    #
    # Download KEGG entries by ID and flatten them into a single data frame.
    #
    # keggId: character vector of KEGG entry IDs.
    #
    # Returns a data frame with one row per entry returned by keggGet().
    # Multi-valued fields are collapsed into one string separated by '///',
    # each DBLINKS item becomes its own column named after the database, and
    # fields missing from an entry are filled with ''. An empty `keggId`
    # yields an empty data frame.
    #
    # Relies on keggGet() (KEGGREST) and rbind.fill() (plyr) being attached.
    get.kegg.byId <-
      function(keggId) {
        if (length(keggId) == 0) {
          return(data.frame())
        }

        # Turn one parsed KEGG entry (a named list) into a one-row data frame.
        flattenEntry <- function(keggRow) {
          for (j in names(keggRow)) {
            if (j == 'DBLINKS') {
              # "Database: id" strings become separate fields named by database
              for (k in seq_along(keggRow[['DBLINKS']])) {
                parts <- unlist(strsplit(keggRow[['DBLINKS']][k], ': '))
                keggRow[[parts[1]]] <- parts[2]
              }
            } else if (j == 'PATHWAY') {
              pathways <- keggRow[['PATHWAY']]
              keggRow[['PATHWAY']] <- paste(
                paste(names(pathways), pathways, sep = ': '),
                collapse = '///'
              )
            } else if (j == 'REFERENCE') {
              # Only the REFERENCE field of the first reference is kept
              keggRow[['REFERENCE']] <- paste(
                keggRow[['REFERENCE']][[1]][['REFERENCE']],
                collapse = '///'
              )
            } else if (length(keggRow[[j]]) > 1) {
              keggRow[[j]] <- paste(keggRow[[j]], collapse = '///')
            }
          }
          # The raw DBLINKS vector has been expanded into columns above
          keggRow[['DBLINKS']] <- NULL
          as.data.frame(keggRow, stringsAsFactors = FALSE)
        }

        batchSize <- 10L
        batchStarts <- seq(1L, length(keggId), by = batchSize)
        batches <- vector('list', length(batchStarts))

        for (b in seq_along(batchStarts)) {
          first <- batchStarts[b]
          # Clamp the last batch so no NA IDs are sent past the end of keggId
          last <- min(first + batchSize - 1L, length(keggId))

          cat('processing', keggId[first], '\n')
          query <- keggGet(keggId[first:last])

          batches[[b]] <- lapply(query, flattenEntry)
        }

        # Bind all rows once instead of growing the data frame row by row
        rows <- unlist(batches, recursive = FALSE)
        if (length(rows) == 0) {
          return(data.frame())
        }
        kegg <- rbind.fill(rows)
        kegg[is.na(kegg)] <- ''
        kegg
      }
    
    

    批量获取 KEGG 中所有的反应和化合物数据。
    以下代码取自 RbioRXN 包中 get.kegg.all 函数的源码:

        # Download every KEGG compound and reaction entry and collect both
        # tables in `keggAll` (a list with elements 'reaction' and 'compound').
        cmp <- keggList("compound")
        reactionEntry <- keggList("reaction")

        # Strip the 'cpd:' / 'rn:' prefix from the listed IDs, if present
        cmpId <- sub('cpd:', '', names(cmp))
        reactionEntry <- sub('rn:', '', names(reactionEntry))

        keggReaction <- get.kegg.byId(reactionEntry)
        keggReaction[is.na(keggReaction)] <- ""

        keggCompound <- get.kegg.byId(cmpId)
        keggCompound[is.na(keggCompound)] <- ""

        # reference
        # Among the reactions that carry a REFERENCE, keep only the first row
        # for each ENTRY. duplicated() compares IDs exactly, unlike a regex
        # substring match, and avoids rescanning the ID vector for every ID.
        referIndex <- grep('.+', keggReaction$REFERENCE)
        referId <- keggReaction[referIndex, 'ENTRY']
        redundantIndex <- referIndex[duplicated(referId)]

        # Negative indexing with an empty vector would drop every row, so
        # only subset when there is something to remove.
        if (length(redundantIndex) > 0) {
          keggReaction_unique <- keggReaction[-redundantIndex, ]
        } else {
          keggReaction_unique <- keggReaction
        }

        result <- list()
        result[['reaction']] <- keggReaction_unique
        result[['compound']] <- keggCompound
        cat('# of reactions:', nrow(keggReaction_unique), '\n')
        cat('# of compounds:', nrow(keggCompound), '\n')
        keggAll <- result
    ## Save all downloaded reaction and compound data
    save(keggAll, file = "keggAll.Rdata")
    ### Export each table as CSV
    # The file names previously began with a stray space (" reaction.csv").
    reaction <- keggAll$reaction
    write.csv(reaction, "reaction.csv")

    compound <- keggAll$compound
    write.csv(compound, "compound.csv")
    

    如果下载时遇到 403 错误,通常是网络或 IP 访问受限所致,可以更换 IP 地址后再重试。
    本文修改自https://cloud.tencent.com/developer/article/1800280的源码,解决了包安装不上和网络错误的问题。

    相关文章

      网友评论

          本文标题:KEGG数据库数据下载

          本文链接:https://www.haomeiwen.com/subject/rbzwwdtx.html