美文网首页rice related analysis生信相关分析方法
「Bioconductor」让我们愉快的为自己做一个物种包吧

「Bioconductor」让我们愉快的为自己做一个物种包吧

作者: xuzhougeng | 来源:发表于2019-01-09 18:51 被阅读63次

    做植物是一件比较艰苦的事情,不但资源少,而且有限的资源还未必能用得好。就拿Bioconductor上的注释包来说吧,我在《「Bioconductor」不要轻易相信AnnotationHub的物种注释包》里就提到,拟南芥物种包用的注释其实一直都没有更新。究其原因,是因为拟南芥物种包里的注释一直是从TAIR的FTP下载的,而我在另一篇文章《TAIR周期性更新的注释原来不在FTP服务器上》里也说了,最新的拟南芥注释信息要到另外的地方下载。所以,我写了《「Bioconductor」再次提醒,研究植物的不要轻易相信你用的注释包》,让大家尝试用enricher解决问题。

    但是生活不能苟且,我好歹在生信圈搬了几年砖,遇到困难不能退缩,于是我决定自己构建一个拟南芥的物种包。代码如下:

    # Packages needed to build the OrgDb package. RSQLite is attached
    # explicitly alongside AnnotationForge, as in the original script.
    library(RSQLite)
    library(AnnotationForge)
    # Keep text columns as character vectors. Spell out FALSE: `F` is an
    # ordinary variable that can be reassigned.
    options(stringsAsFactors = FALSE)
    
    # GENE-GO annotation table.
    # The input is read without a header, so columns arrive as V1..V4.
    go_df <- read.table(
      "F:/Project/org.At.tair.db/ATH_GO_TERM.txt",
      sep = "\t",
      header = FALSE,
      as.is = TRUE
    )
    # Translate the one-letter aspect code in V3 to the two-letter ontology
    # name. Unknown codes become "", missing values stay NA.
    aspect_code <- c("C", "P", "F")
    aspect_name <- c("CC", "BP", "MF")
    ontology <- aspect_name[match(go_df$V3, aspect_code)]
    ontology[is.na(ontology) & !is.na(go_df$V3)] <- ""
    go_df$V3 <- ontology
    colnames(go_df) <- c("GID", "GO", "ONTOLOGY", "EVIDENCE")
    
    
    # GENE-PUB table (locus -> publication).
    pub_df <- read.table(
      "F:/Project/org.At.tair.db/Locus_Published_20171231.txt",
      sep = "\t",
      header = TRUE
    )

    ## Keep only identifiers that start with "AT" followed by a digit.
    pub_df <- pub_df[grepl(pattern = "^AT\\d", pub_df$name), ]
    ## GID is the identifier with everything from the first "." removed.
    ## sub() is vectorised and also works when no rows are left, whereas
    ## do.call(rbind, strsplit(...))[, 1] fails on an empty input.
    pub_df <- cbind(GID = sub("\\..*$", "", pub_df$name), pub_df)

    ## Rename BEFORE filtering: the PMID column only exists under that name
    ## after this assignment. Filtering on pub_df$PMID earlier yields NULL
    ## and silently drops every row.
    colnames(pub_df) <- c("GID", "GENEID", "REFID",
                          "PMID", "PUBYEAR")

    ## The PubMed id must not be missing.
    pub_df <- pub_df[!is.na(pub_df$PMID), ]
    
    # GENE-SYMBOL table (locus -> symbol and full name).
    # quote = "" and comment.char = "" make read.table split on tabs only.
    # With the defaults, an apostrophe or "#" inside a free-text field
    # starts a quoted string or a comment and corrupts the rows.
    symbol_df <- read.table(
      "F:/Project/org.At.tair.db/gene_aliases_20171231.txt",
      sep = "\t",
      header = TRUE,
      quote = "",
      comment.char = ""
    )
    # Keep only identifiers that start with "AT" followed by a digit.
    symbol_df <- symbol_df[grepl(pattern = "^AT\\d", symbol_df$name), ]
    colnames(symbol_df) <- c("GID", "SYMBOL", "FULL_NAME")
    
    
    # GENE-FUNCTION table (functional descriptions).
    # quote = "" and comment.char = "" make read.table split on tabs only.
    # The description columns are free text, where an apostrophe or "#"
    # would otherwise start a quoted string or a comment.
    func_df <- read.table(
      "F:/Project/org.At.tair.db/Araport11_functional_descriptions_20171231.txt",
      sep = "\t",
      header = TRUE,
      quote = "",
      comment.char = ""
    )
    # Keep only identifiers that start with "AT" followed by a digit.
    func_df <- func_df[grepl(pattern = "^AT\\d", func_df$name), ]
    # GID is the identifier with everything from the first "." removed.
    # sub() also works on an empty table, unlike
    # do.call(rbind, strsplit(...))[, 1].
    func_df <- cbind(GID = sub("\\..*$", "", func_df$name), func_df)
    colnames(func_df) <- c("GID", "TXID", "GENE_MODEL_TYPE",
                           "SHORT_DESCRIPTION",
                           "CURATOR_SUMMARY",
                           "COMPUTATIONAL_DESCRIPTION")
    ## Remove duplicated rows from every table.
    ## The GO table is cut down to GID/GO/EVIDENCE first and de-duplicated
    ## afterwards, so rows that differ only in the dropped ONTOLOGY column
    ## cannot survive as duplicates. Columns are selected by name rather
    ## than by position.
    go_df <- unique(go_df[, c("GID", "GO", "EVIDENCE")])
    pub_df <- unique(pub_df)
    symbol_df <- unique(symbol_df)
    func_df <- unique(func_df)
    
    # Build the OrgDb source package from the four annotation tables.
    # `goTable` names the argument that holds the GID/GO/EVIDENCE table.
    # With genus = "At" and species = "tair10" the generated package is
    # called org.Atair10.eg.db and is written to `outputDir`.
    makeOrgPackage(
      go = go_df,
      pub_info = pub_df,
      symbol_info = symbol_df,
      function_info = func_df,
      version = "0.1",
      maintainer = "xuzhougeng <xuzhougeng@163.com>",
      # Author name corrected to match the maintainer entry.
      author = "xuzhougeng <xuzhougeng@163.com>",
      outputDir = "F:/Project/org.At.tair.db",
      tax_id = "3702",
      genus = "At",
      species = "tair10",
      goTable = "go"
    )
    

    最后会在指定目录(outputDir)下生成"org.Atair10.eg.db",然后就可以用下面的命令安装:

    # Install the generated source package straight from the outputDir of
    # the build script, so the call does not depend on the current working
    # directory.
    install.packages(
      file.path("F:/Project/org.At.tair.db", "org.Atair10.eg.db"),
      repos = NULL,
      type = "source"
    )
    

    而且我测试了,能和Y叔的clusterProfiler完美结合

    # enrichGO() comes from clusterProfiler, so it has to be attached too.
    library(clusterProfiler)
    library(org.Atair10.eg.db)
    org <- org.Atair10.eg.db
    # DEG_GENES is supplied by the user: a character vector of gene ids
    # matching the GID key of the package.
    ego_down <- enrichGO(
      gene = DEG_GENES,
      OrgDb = org,
      keyType = "GID",
      ont = "BP"
    )
    

    目前我是自己用为主,如果你们有需要,可以按照如下代码进行安装

    # Install the dependency package.
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager")
    }
    # No `version` argument: BiocManager then uses the Bioconductor release
    # that matches the running R, instead of failing on a pinned old one.
    BiocManager::install("org.At.tair.db")
    # Install the custom annotation package.
    install.packages(
      "https://raw.githubusercontent.com/xuzhougeng/org.At.tair.db/master/org.Atair10.eg.db.tgz",
      repos = NULL,
      type = "source"
    )
    

    出现问题,欢迎在我的GitHub仓库 https://github.com/xuzhougeng/org.At.tair.db 上提出issue

    参考资料

    相关文章

      网友评论

        本文标题:「Bioconductor」让我们愉快的为自己做一个物种包吧

        本文链接:https://www.haomeiwen.com/subject/ntogrqtx.html