KEGG 数据库提供 REST API 供用户批量访问,文档见:https://www.kegg.jp/kegg/rest/keggapi.html
使用 R 语言的 KEGGREST 包可以帮助我们下载数据。
# Install the required packages from Bioconductor.
BiocManager::install("KEGGREST")
BiocManager::install("fmcsR")
# BiocManager::install("RbioRXN")
# devtools::install_git("https://github.com/cran/RbioRXN.git")

# Load packages.
library(KEGGREST)
# library(RbioRXN)

# List the databases that KEGG exposes.
listDatabases()

# Replace the Bioconductor build of KEGGREST with the patched build from
# GitHub. The package has to be unloaded before it can be reinstalled.
detach(package:KEGGREST, unload = TRUE)
library(devtools)
devtools::install_github("https://github.com/kozo2/KEGGREST/tree/patch-1")
# Re-attach KEGGREST: detach() removed it from the search path and
# install_github() does not attach what it installs, so without this the
# listDatabases() / keggList() calls below cannot be resolved.
library(KEGGREST)

library(httr)
# NOTE: `url` is the address of your own machine and `port` is the local port
# of your proxy client. If you can reach the KEGG servers directly, skip this
# proxy configuration.
set_config(use_proxy(url = "127.0.0.1", port = 10809))

# Show every database/list available from KEGG.
listDatabases()

# Fetch the complete entry list of a given database.
pathway <- keggList("pathway")
compound <- keggList("compound")
reaction <- keggList("reaction")

# plyr provides rbind.fill(), used by get.kegg.byId().
library(plyr)
# Adapted from the RbioRXN package: the package could not be installed, so the
# function source is reproduced here.
#
# Fetch KEGG entries by ID and flatten them into a single data frame.
#
# Args:
#   keggId: character vector of KEGG entry IDs, passed to keggGet() in
#           batches of at most 10.
#
# Returns:
#   A data.frame with one row per entry returned by keggGet(). Multi-valued
#   fields are collapsed into one string joined by '///'; each DBLINKS item
#   ("<db>: <id>") becomes its own column named after the database and the
#   DBLINKS field itself is dropped; PATHWAY items are rendered as
#   "<name>: <value>"; only the first REFERENCE record is kept. Missing values
#   are replaced by ''. An empty data.frame is returned when nothing was
#   fetched.
get.kegg.byId <- function(keggId) {
  # KEGGREST documents that keggGet() returns at most 10 entries per call.
  batch_size <- 10L
  n_ids <- length(keggId)
  # seq() with a positive `by` fails on an empty range, so guard it.
  starts <- if (n_ids > 0) seq(1L, n_ids, by = batch_size) else integer(0)

  # Collect one single-row data frame per entry and bind them once at the
  # end instead of growing the result with rbind.fill() inside the loop.
  rows <- list()

  for (start in starts) {
    cat('processing', keggId[start], '\n')
    # Clamp the upper bound so the last batch is not padded with NA ids.
    last <- min(start + batch_size - 1L, n_ids)
    query <- keggGet(keggId[start:last])

    # Iterating the list directly also copes with an empty result.
    for (keggRow in query) {
      for (field in names(keggRow)) {
        if (field == 'DBLINKS') {
          # "<db>: <id>" -> a new field named <db> holding <id>.
          for (link in keggRow$DBLINKS) {
            parts <- unlist(strsplit(link, ': '))
            keggRow[[parts[1]]] <- parts[2]
          }
        } else if (field == 'PATHWAY') {
          labelled <- paste(names(keggRow$PATHWAY), keggRow$PATHWAY, sep = ': ')
          keggRow$PATHWAY <- paste(labelled, collapse = '///')
        } else if (field == 'REFERENCE') {
          # Only the REFERENCE element of the first record is kept.
          keggRow$REFERENCE <- paste(keggRow$REFERENCE[[1]]$REFERENCE,
                                     collapse = '///')
        } else if (length(keggRow[[field]]) > 1) {
          keggRow[[field]] <- paste(keggRow[[field]], collapse = '///')
        }
      }
      # The individual links now live in their own fields.
      keggRow[['DBLINKS']] <- NULL
      rows[[length(rows) + 1L]] <- as.data.frame(keggRow,
                                                 stringsAsFactors = FALSE)
    }
  }

  if (length(rows) == 0) {
    return(data.frame())
  }

  # rbind.fill() accepts a list of data frames as its first argument and
  # fills columns missing from a row with NA.
  kegg <- rbind.fill(rows)
  kegg[is.na(kegg)] <- ''
  kegg
}
批量获取 KEGG 的所有反应和化合物数据。
以下是 RbioRXN 包中 get.kegg.all 函数的源码:
# Download every KEGG compound and reaction entry (body of RbioRXN's
# get.kegg.all, run as a script).
cmp <- keggList("compound")
reactionEntry <- keggList("reaction")

# Strip the 'cpd:' / 'rn:' prefix from the list names to get bare entry IDs.
cmpId <- sub('cpd:', '', names(cmp))
reactionEntry <- sub('rn:', '', names(reactionEntry))

keggReaction <- get.kegg.byId(reactionEntry)
keggReaction[is.na(keggReaction)] <- ""
keggCompound <- get.kegg.byId(cmpId)
keggCompound[is.na(keggCompound)] <- ""

# reference
# Among the reaction rows that carry a non-empty REFERENCE, keep only the
# first row of each ENTRY and drop the later repeats.
referIndex <- grep('.+', keggReaction$REFERENCE)
referId <- keggReaction[referIndex, 'ENTRY']
referIdUnique <- unique(referId)
# duplicated() flags every occurrence after the first by exact match; this
# replaces a per-ID regex grep() that grew the index vector inside a loop.
redundantIndex <- referIndex[duplicated(referId)]

# Negative indexing with an empty index would drop every row, hence the guard.
if (length(redundantIndex) > 0) {
  keggReaction_unique <- keggReaction[-redundantIndex, ]
} else {
  keggReaction_unique <- keggReaction
}

result <- list()
result[['reaction']] <- keggReaction_unique
result[['compound']] <- keggCompound
cat('# of reactions:', nrow(keggReaction_unique), '\n')
cat('# of compounds:', nrow(keggCompound), '\n')
keggAll <- result
# Persist all downloaded reaction and compound data.
save(keggAll, file = "keggAll.Rdata")

# Extract the two tables and export them as CSV. The file names previously
# began with an accidental space (" reaction.csv"), which produced files that
# are awkward to open from a shell or from other scripts.
reaction <- keggAll$reaction
write.csv(reaction, "reaction.csv")
compound <- keggAll$compound
write.csv(compound, "compound.csv")
如果下载时遇到 403 错误,通常是网络问题,可以更换 IP 地址后再试。
本文修改自 https://cloud.tencent.com/developer/article/1800280 的源码,解决了包安装不上和网络错误的问题。
网友评论