egg_f <- "annuum.eggnog.annotations"
egg <- read.csv(egg_f, sep = "\t")
egg[egg==""]<-NA #这个代码来自花花的指导(将空行变成NA,方便下面的去除)
gene_info <- egg %>%
dplyr::select(GID = query_name, GENENAME = `eggNOG.free.text.desc.`) %>% na.omit()
gterms <- egg %>%
dplyr::select(query_name, GOs) %>% na.omit()
gene2go <- data.frame(GID = character(),
GO = character(),
EVIDENCE = character())
# 然后向其中填充:注意到有的query_name对应多个GO,因此我们以GO号为标准,每一行只能有一个GO号,但query_name和Evidence可以重复
for (row in 1:nrow(gterms)) {
gene_id <- gterms[row, "query_name"][[1]]
gene_terms <- str_split(gterms[row,"GOs"], ",", simplify = FALSE)[[1]]
df_temp <- data_frame(GID = rep(gene_id, length(gene_terms)),
GO = gene_terms,
EVIDENCE = rep("IEA", length(gene_terms)))
gene2go <- rbind(gene2go, df_temp)
}
gene2go = gene2go[!duplicated(gene2go$GID), ]
gene2ko <- egg %>%
dplyr::select(GID = query_name, Ko = KEGG_ko) %>%
na.omit()
options(stringsAsFactors = F)
if(F){
# 需要下载 json文件(这是是经常更新的)
# https://www.genome.jp/kegg-bin/get_htext?ko00001
# 代码来自:http://www.genek.tv/course/225/task/4861/show
library(jsonlite)
library(purrr)
library(RCurl)
update_kegg <- function(json = "ko00001.json") {
pathway2name <- tibble(Pathway = character(), Name = character())
ko2pathway <- tibble(Ko = character(), Pathway = character())
kegg <- fromJSON(json)
for (a in seq_along(kegg[["children"]][["children"]])) {
A <- kegg[["children"]][["name"]][[a]]
for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
B <- kegg[["children"]][["children"]][[a]][["name"]][[b]]
for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
kos <- str_match(kos_info, "K[0-9]*")[,1]
ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))
}
}
}
save(pathway2name, ko2pathway, file = "kegg_info.RData")
}
update_kegg(json = "ko00001.json")
}
load(file = "kegg_info.RData")
gene2pathway <- gene2ko %>% left_join(ko2pathway, by = "Ko") %>%
dplyr::select(GID, Pathway) %>%
na.omit()
tax_id = "4072"
genus = "Capsicum"
species = "annuum"
author= "yuantao"
makeOrgPackage(gene_info=gene_info,
go=gene2go,
ko=gene2ko,
pathway=gene2pathway,
# gene2pathway=gene2pathway,
version="0.0.2",
maintainer=author,
author=author,
outputDir = ".",
tax_id=tax_id,
genus=genus,
species=species,
goTable="go")
网友评论