构建自己的注释包

作者: 多啦A梦的时光机_648d | 来源:发表于2020-11-06 17:01 被阅读0次
    egg_f <- "annuum.eggnog.annotations"
    egg <- read.csv(egg_f, sep = "\t")
    egg[egg==""]<-NA #这个代码来自花花的指导(将空行变成NA,方便下面的去除)
    gene_info <- egg %>%
      dplyr::select(GID = query_name, GENENAME = `eggNOG.free.text.desc.`) %>% na.omit()
    
    gterms <- egg %>%
      dplyr::select(query_name, GOs) %>% na.omit()
    
    gene2go <- data.frame(GID = character(),
                          GO = character(),
                          EVIDENCE = character())
    
    # 然后向其中填充:注意到有的query_name对应多个GO,因此我们以GO号为标准,每一行只能有一个GO号,但query_name和Evidence可以重复
    
    for (row in 1:nrow(gterms)) {
      gene_id <- gterms[row, "query_name"][[1]]
      gene_terms <- str_split(gterms[row,"GOs"], ",", simplify = FALSE)[[1]]
      df_temp <- data_frame(GID = rep(gene_id, length(gene_terms)),
                            GO = gene_terms,
                            EVIDENCE = rep("IEA", length(gene_terms)))
      gene2go <- rbind(gene2go, df_temp)
    }
    
    gene2go = gene2go[!duplicated(gene2go$GID), ]
    
    gene2ko <- egg %>%
      dplyr::select(GID = query_name, Ko = KEGG_ko) %>%
      na.omit()
    
    options(stringsAsFactors = F)
    if(F){
      # 需要下载 json文件(这是是经常更新的)
      # https://www.genome.jp/kegg-bin/get_htext?ko00001
      # 代码来自:http://www.genek.tv/course/225/task/4861/show
      library(jsonlite)
      library(purrr)
      library(RCurl)
      
      update_kegg <- function(json = "ko00001.json") {
        pathway2name <- tibble(Pathway = character(), Name = character())
        ko2pathway <- tibble(Ko = character(), Pathway = character())
        
        kegg <- fromJSON(json)
        
        for (a in seq_along(kegg[["children"]][["children"]])) {
          A <- kegg[["children"]][["name"]][[a]]
          
          for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
            B <- kegg[["children"]][["children"]][[a]][["name"]][[b]] 
            
            for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
              pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
              
              pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
              pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
              pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
              
              kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
              
              kos <- str_match(kos_info, "K[0-9]*")[,1]
              
              ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))
            }
          }
        }
        
        save(pathway2name, ko2pathway, file = "kegg_info.RData")
      }
      
      update_kegg(json = "ko00001.json")
      
    }
    
    load(file = "kegg_info.RData")
    gene2pathway <- gene2ko %>% left_join(ko2pathway, by = "Ko") %>% 
      dplyr::select(GID, Pathway) %>%
      na.omit()
    
    
    tax_id = "4072"
    genus = "Capsicum" 
    species = "annuum"
    author= "yuantao"
    makeOrgPackage(gene_info=gene_info,
                   go=gene2go,
                   ko=gene2ko,
                   pathway=gene2pathway,
                   # gene2pathway=gene2pathway,
                   version="0.0.2",
                   maintainer=author,
                   author=author,
                   outputDir = ".",
                   tax_id=tax_id,
                   genus=genus,
                   species=species,
                   goTable="go")
    
    

    相关文章

      网友评论

        本文标题:构建自己的注释包

        本文链接:https://www.haomeiwen.com/subject/bcotbktx.html