美文网首页R生信NGS
用SeqinR包在NCBI获取基因组序列并分析

用SeqinR包在NCBI获取基因组序列并分析

作者: Y大宽 | 来源:发表于2019-02-12 11:07 被阅读64次

    这里是网页版获取DNA序列,下载保存后可以用read.fasta打开
    ##########################
    用SeqinR包获取序列并进行统计
    ##########################
    比如,在NCBI获取NC_001477登革病毒的基因组序列,
    安装加载seqinr

    install.packages("seqinr")
    library(seqinr)
    choosebank()#list all the sub-database in ACNUC
    
    > choosebank()
     [1] "genbank"         "embl"            "emblwgs"         "swissprot"      
     [5] "ensembl"         "hogenom7"        "hogenom"         "hogenomdna"     
     [9] "hovergendna"     "hovergen"        "hogenom5"        "hogenom5dna"    
    [13] "hogenom4"        "hogenom4dna"     "homolens"        "homolensdna"    
    [17] "hobacnucl"       "hobacprot"       "phever2"         "phever2dna"     
    [21] "refseq"          "refseq16s"       "greviews"        "bacterial"      
    [25] "archaeal"        "protozoan"       "ensprotists"     "ensfungi"       
    [29] "ensmetazoa"      "ensplants"       "ensemblbacteria" "mito"           
    [33] "polymorphix"     "emglib"          "refseqViruses"   "ribodb"         
    [37] "taxodb"
    

    "genebank"包含DNA和RNA序列数据库
    “refseq”包含"Refseq”中DNA和RNA
    "refseqViruses”包含Refseq中病毒的DNA,RNA和蛋白序列
    更详细的见http://doua.prabi.fr/databases/acnuc

    比如要获取DEN-1登革病毒基因组序列,accesion number NC_001477

    1 构造一个函数,由Accession number直接下载所需要的序列

    getncbiseq <- function(accession)
    {
      require("seqinr") # this function requires the SeqinR R package
      # first find which ACNUC database the accession is stored in:
      dbs <- c("genbank","refseq","refseqViruses","bacterial")
      numdbs <- length(dbs)
      for (i in 1:numdbs)
      {
        db <- dbs[i]
        choosebank(db)
        # check if the sequence is in ACNUC database 'db':
        resquery <- try(query(".tmpquery", paste("AC=", accession)), silent = TRUE)
        if (!(inherits(resquery, "try-error")))
        {
          queryname <- "query2"
          thequery <- paste("AC=",accession,sep="")
          query2 <- query(queryname, thequery)
          # see if a sequence was retrieved:
          seq <- getSequence(query2$req[[1]])
          closebank()
          return(seq)
        }
        closebank()
      }
      print(paste("ERROR: accession",accession,"was not found"))
    }
    

    2.根据accession number下载序列

    dengueseq <- getncbiseq("NC_001477")
    
    > dengueseq
       [1] "a" "g" "t" "t" "g" "t" "t" "a" "g" "t" "c" "t" "a" "c" "g" "t" "g" "g" "a"
      [20] "c" "c" "g" "a" "c" "a" "a" "g" "a" "a" "c" "a" "g" "t" "t" "t" "c" "g" "a"
      [39] "a" "t" "c" "g" "g" "a" "a" "g" "c" "t" "t" "g" "c" "t" "t" "a" "a" "c" "g"
      [58] "t" "a" "g" "t" "t" "c" "t" "a" "a" "c" "a" "g" "t" "t" "t" "t" "t" "t" "a"
      [77] "t" "t" "a" "g" "a" "g" "a" "g" "c" "a" "g" "a" "t" "c" "t" "c" "t" "g" "a"
      [96] "t" "g" "a" "a" "c" "a" "a" "c" "c" "a" "a" "c" "g" "g" "a" "a" "a" "a" "a"
    

    3 输出fasta格式文件

    write.fasta(names="DEN-1", sequences=dengueseq, file.out="den1.fasta")
    

    4读入,如果通过网页直接下载序列,可以直接从这一步开始进行导入

    dengue <- read.fasta(file = "den1.fasta")
    dengueseq <- dengue[[1]]
    

    5.1DNA length

    > length(dengueseq)
    [1] 10735
    

    5.2Base composition

    > table(dengueseq)
    dengueseq
       a    c    g    t 
    3426 2240 2770 2299
    

    等同于

    count(dengueseq,1)
    
       a    c    g    t 
    3426 2240 2770 2299
    
    

    5.3GC Content

    > GC(dengueseq)
    [1] 0.4666977
    

    5.4 DNA words

    > count(dengueseq,2)
    
      aa   ac   ag   at   ca   cc   cg   ct   ga   gc   gg   gt   ta   tc   tg   tt 
    1108  720  890  708  901  523  261  555  976  500  787  507  440  497  832  529 
    

    5.5写一个函数计算AT含量

    AT <- function(inputseq)
    {
      mytable <- count(inputseq,1)#make a table with the count of As,Cs,Gs,Ts
      mylength <- length(inputseq)#find the length of the whole sequence
      myAs <- mytable[[1]]#number of As in the sequence
      myTs <- mytable[[4]]#nmmber of Ts in the sequence
      myAT <- (myAs+myTs)/mylength
      return(myAT)
    }
    

    计算denggueseq的AT含量

    > AT(dengueseq)
    [1] 0.5333023
    

    5.6统计三联体密码一共多少个

    > count(dengueseq,3) %>% sum
    [1] 10733
    

    参考文章:A Little Book of R For Bioinformatics, Release 0.1

    相关文章

      网友评论

        本文标题:用SeqinR包在NCBI获取基因组序列并分析

        本文链接:https://www.haomeiwen.com/subject/qmqheqtx.html