美文网首页
IRscope代码拆解一

IRscope代码拆解一

作者: 小明的数据分析笔记本 | 来源:发表于2020-05-30 16:05 被阅读0次

    IRScope 是用来可视化叶绿体基因组边界扩张收缩的一个shiny应用。正好自己最近在学习R语言里shiny相关的知识,就准备看看他的代码是怎么写的。目前看懂了一小部分,记录在这里。

    他开头定义了很多函数

    1、读入genbank文件的函数

    read.gb <- function(file) {
      # Load a GenBank flat file as a character vector, one element per line.
      gb_lines <- readLines(file)
      gb_lines
    }
    

    readLines()函数读入文本文件,结果好像是一个向量,文件中的每行是向量中的一个元素。

    2、提取读入的genbank文件的fasta序列

    这个代码稍微有点长,逻辑还有点没看懂

    FasExtract<- function(gb){
      # Extract the nucleotide sequence of a GenBank record.
      #
      # gb: character vector holding the lines of a GenBank flat file
      #     (as returned by read.gb).
      # Returns a character vector with one lower-case residue per element.
      #
      # The sequence is taken from the lines between the first ORIGIN line and
      # the first "//" terminator that follows it (or the end of the input when
      # there is no terminator). Position numbers and blanks are stripped; every
      # other character is kept, so blocks made only of ambiguity codes ("n",
      # "-", ...) are preserved and do not shift the coordinates of what follows.
      # No fixed number of blocks per line is required, because the last
      # sequence line of a record is normally shorter than the others.
      origin <- grep("^\\s*ORIGIN", gb)
      if (length(origin) == 0) {
        stop("Check the gb file; no ORIGIN line was found", call. = FALSE)
      }
      first <- origin[1] + 1
      terminators <- which(trimws(gb) == "//")
      terminators <- terminators[terminators > origin[1]]
      last <- if (length(terminators) > 0) terminators[1] - 1 else length(gb)
      if (last < first) {
        # ORIGIN is immediately followed by the terminator / end of file.
        return(character(0))
      }
      residues <- gsub("[[:space:][:digit:]]", "", gb[first:last])
      strsplit(tolower(paste(residues, collapse = "")), "")[[1]]
    }
    

    3、计算fasta序列GC含量

    GC.content<- function(fas){
      # Fraction of the residues in `fas` (one lower-case residue per element)
      # that are "g" or "c".
      is_gc <- fas == "g" | fas == "c"
      sum(is_gc) / length(fas)
    }
    

    4、提取genbank文件中的物种拉丁名

    sp.name<- function(gb){
      # Build the species name from the second line of the GenBank file:
      # the line is split on single spaces and the 3rd and 4th pieces are
      # joined with one space.
      pieces <- strsplit(gb[2], " ")[[1]]
      genus <- pieces[3]
      epithet <- pieces[4]
      paste(genus, epithet, sep=" ")
    }
    
    这四个函数连在一起用就是
    > gbfile<-read.gb("sequence.gb")
    > fas<-FasExtract(gbfile)
    > fas
      [1] "a" "t" "g" "a" "t" "t" "g" "a" "a" "g" "t" "t" "t" "t" "t" "c" "t" "a"
     [19] "t" "t" "t" "g" "g" "a" "a" "t" "t" "g" "t" "c" "t" "t" "a" "g" "g" "t"
     [37] "c" "t" "a" "a" "t" "t" "c" "c" "t" "a" "t" "t" "a" "c" "t" "t" "t" "a"
     [55] "g" "c" "t" "g" "g" "a" "t" "t" "a" "t" "t" "t" "g" "t" "a" "a" "c" "t"
     [73] "g" "c" "a" "t" "a" "t" "t" "t" "a" "c" "a" "a" "t" "a" "c" "a" "g" "a"
     [91] "c" "g" "t" "g" "g" "t" "g" "a" "t" "c" "a" "g" "t" "t" "g" "g" "a" "c"
    [109] "t" "t" "t" "t" "g" "a"
    > GC.content(fas)
    [1] 0.3157895
    > sp.name(gbfile)
    [1] "Punica granatum"
    

    5、替换fasta序列中的非ATCG的字符

    rdnFixer<- function(gb){
      # Extract the sequence of a GenBank record and replace every ambiguity
      # code (and "-") by a randomly drawn concrete base.
      #
      # gb: lines of a GenBank file. Returns a vector of single residues.
      # The codes are processed in a fixed order so that the sequence of
      # sample() calls is always the same.
      substitutes <- list(
        "u" = c("t"),
        "r" = c("a", "g"),
        "y" = c("c", "t"),
        "s" = c("c", "g"),
        "w" = c("a", "t"),
        "k" = c("g", "t"),
        "m" = c("c", "a"),
        "b" = c("c", "g", "t"),
        "d" = c("a", "g", "t"),
        "h" = c("c", "a", "t"),
        "v" = c("c", "a", "g"),
        "n" = c("c", "g", "t", "a"),
        "-" = c("c", "g", "t", "a")
      )
      bases <- FasExtract(gb)
      for (code in names(substitutes)) {
        hits <- which(bases == code)
        bases[hits] <- sample(substitutes[[code]], length(hits), TRUE)
      }
      return(bases)
    }
    

    5、鉴定叶绿体基因组的四个区域边界信息
    这个代码就更长了,代码看起来就更吃力了。如何鉴定四个区域的边界位置逻辑有点看不懂呀!

    IRinfo<- function(genome, parallel=TRUE){
      
      #' IR information
      #' 
      #' Detects the start positions and the length of the IR regions on the chloroplast genome.
      #' 
      #' @param genome The plastid genome of a species as a simple vector of nucleotides 
      #' @param parallel If TRUE (default) the phase difference is obtained from p.d(), otherwise from phase.detector()
      #' @return a vector of four elements: the start of the first IR region, the start of the second IR region, the IR length and the total length of the genome, in that order
      #' @export 
      
      ### Preliminary functions (helpers defined inside IRinfo; some are published with <<-)
      
      cirtick<<- function(tick, vector){
        # Circular rotation: move the first `tick` elements of `vector` to its
        # end. A `tick` outside 1..(length(vector) - 1) leaves the vector as is.
        n <- length(vector)
        out_of_range <- tick > n-1 || tick < 1
        if (out_of_range){
          return(vector)
        }
        head_part <- vector[1:tick]
        tail_part <- vector[(tick+1):n]
        return(c(tail_part, head_part))
      }
      
      genome.comp.rev<<- function(genome){
        # Reverse complement of a vector of lower-case nucleotides.
        # Characters other than a/t/g/c are kept (lower-cased) in place.
        reversed <- genome[length(genome):1]
        # Translate to upper-case complements first so that an already
        # translated base can never be translated a second time.
        complemented <- chartr("atgc", "TACG", reversed)
        return(tolower(complemented))
      }
      
      #Checked
      phase.detector<- function(genome){
        # Detect the phase difference between the genome and its reverse
        # complement: the genome is pre-rotated by `shifter`, then rotated one
        # position at a time, and the number of positions that agree with the
        # reverse complement is recorded for each rotation.
        # Side effect: the (unwrapped) best rotation is stored in `a` with <<-.
        shifter <- 84000 # faster mode; the original note suggests 80000 "to be sure"
        rev_comp <- genome.comp.rev(genome)
        rotated <- cirtick(shifter, genome)
        n <- length(rotated)
        # The extra last slot holds the baseline score, a quarter of the length.
        track <- numeric(n+1)
        track[n+1] <- round(n/4)
        for (i in 1:n){
          track[i] <- sum(cirtick(i, rotated)==rev_comp)
          gain <- (track[i] - track[n+1])/n
          # Threshold was 0.07 in the stable version; raised to 0.1 for
          # Guizotia abyssinica (same value as in the parallel p.d function).
          if (gain > 0.1) {
            break
          }
        }
        a <<- which(track==max(track))+shifter
        # Wrap around the end of the circular genome.
        if (a > n) a - n else a
      }
      # Parallel version of phase.detector. nCPU defaults to 2.
      # The function publishes genome, fun, cir.genome, a, ini.forw, ini.back and
      # mm with <<- and then evaluates fun for several shifter values at once
      # through the sf* cluster calls (snowfall, according to the surrounding
      # article). It returns the unique non-NA results of the last round.
      p.d<<- function(genome, nCPU=2){#tested in the GlobEnv, it might fail when embedded in the bigger function
        genome<<-genome
        # Worker: same scoring loop as phase.detector for one shifter, but it
        # gives up and returns NA once the elapsed time exceeds 11
        # (NOTE(review): difftime units are chosen automatically, presumably seconds).
        fun<<- function(shifter){#the function to be passed to the slaves for the parallel computing, the out put is with max 10 second either NA or the phase.detector
          gcr<- genome.comp.rev(genome)
          cir.genome<<-cirtick(shifter, genome)
          l<-length(genome)
          # track[l+1] holds the baseline score (a quarter of the length).
          track<- numeric(l+1)
          track[l+1]<- round(l/4)
          s.time<- Sys.time()
          no.value<- FALSE
          for (i in 1:l){
            track[i]<- sum(cirtick(i, cir.genome)==gcr)
            i.time<- Sys.time()
            # Stop as soon as a rotation scores clearly above the baseline.
            if ((track[i] - track[l+1])/l > 0.1) {
              break
            }
            # Time budget exhausted: report "no value" for this shifter.
            if (i.time - s.time > 11){
              no.value<- TRUE
              break
            }
          }
          if (no.value) {
            a<<- NA
            return(a)
          }
          else {
            # Best rotation, wrapped around the end of the genome if needed.
            a<<- which(track==max(track))+shifter
            if ( a > l){
              return(a - l)
            }
            else {
              return(a)
            } 
          }
        }
        # Starting shifter; each round probes nCPU/2 values upwards from ini.forw
        # and nCPU/2 values downwards from ini.back, in steps of 1000.
        ini.forw<<- 84000
        ini.back<<- ini.forw
        mm<<- rep(NA, nCPU)
        # Stop any running cluster before starting a new one with nCPU workers.
        sfStop()
        sfInit(parallel=TRUE, cpus=nCPU)
        # Repeat until at least one worker returns a non-NA phase. The first
        # test reads the mm assigned with <<- above; afterwards the local mm
        # created inside the loop is used.
        while(sum(is.na(mm))==length(mm)){
          sfExport("genome", "cirtick", "genome.comp.rev", "fun", "mm", "ini.forw", "ini.back")
          mm<-unlist(sfLapply(c(seq(ini.forw, ini.forw+nCPU/2*1000-1, 1000), seq(ini.back, ini.back- nCPU/2*1000, -1000)[-1]),  fun))
          # Move both search windows outwards for the next round.
          ini.forw<<- ini.forw+nCPU/2*1000
          ini.back<<-ini.back-nCPU/2*1000
        }
        sfStop()
        # Distinct non-NA phases found in the last round.
        return(unique(mm)[which((is.na(unique(mm))==FALSE))])
      }
      
      #Checked
      True.sequence.finder<- function(genome, phase.difference){
        # Find the first position from which the genome, rotated by
        # `phase.difference`, agrees with its reverse complement over a full
        # window of round(length(genome)/100) positions. Returns that position.
        matches <- cirtick(phase.difference, genome)==genome.comp.rev(genome)
        window <- round(length(genome)/100)
        for (start in 1:length(genome)){
          hits <- sum(matches[start:(window+start-1)])
          if (hits==window) {
            return(start)
          }
        }
      }
      
      #Checked
      IR1start<-function(phase.difference,True.sequence.finder){
        # Start of the first IR: the phase difference plus the offset found by
        # True.sequence.finder (both passed in as numbers).
        first_ir_start <- phase.difference + True.sequence.finder
        first_ir_start
      }
      
      #Checked
      IR.length<- function(IR1start, True.sequence.finder, genome){
        # Measure the length of the inverted repeat.
        #
        # IR1start:             start of the first IR on the genome.
        # True.sequence.finder: matching offset on the reverse complement.
        # genome:               vector of single nucleotides.
        # Returns the position `count` at which the alignment between the
        # genome (rotated by IR1start) and its reverse complement (rotated by
        # True.sequence.finder) finally stops, after tolerating short
        # mismatches and indels of up to 50 positions.
        #
        # The step sequence is the one of the original hand-unrolled code, with
        # these corrections:
        #  * in the reverse-complement indel jumps with offsets 4 and 5 the
        #    while loop now walks the same track that the preceding test
        #    inspected (it used the forward-shifted track by mistake);
        #  * the last extension step requires all 166 positions of its window
        #    to match (the sum was compared with 66);
        #  * windows or positions beyond the end of the vector count as "no
        #    match" instead of aborting with an NA condition;
        #  * the alignment vector is no longer called `T`, which masked TRUE.
        s <- IR1start
        t_start <- True.sequence.finder
        r <- genome.comp.rev(genome)
        ref <- cirtick(t_start, r)
        fwd <- cirtick(s, genome)
        aligned <- fwd == ref

        # Alternative alignments: tracks 1..50 shift the genome by a further i
        # positions, tracks 51..100 shift the reverse complement by i positions.
        max_shift <- 50
        shifted <- vector("list", 2 * max_shift)
        for (i in seq_len(max_shift)) {
          shifted[[i]] <- cirtick(s + i, genome) == ref
          shifted[[i + max_shift]] <- fwd == cirtick(t_start + i, r)
        }

        # TRUE when the `width` flags starting at `from` are all TRUE.
        all_true <- function(flags, from, width) {
          isTRUE(sum(flags[from:(from + width - 1)]) == width)
        }

        # First position at or after `pos` that is not a match.
        run_end <- function(flags, pos) {
          while (isTRUE(flags[pos])) {
            pos <- pos + 1
          }
          pos
        }

        # On one shifted track: for every offset, if the 10 positions from
        # count + offset all match, advance count in strides of offset + step
        # for as long as the position at count + offset matches.
        jump_track <- function(flags, count, step, offsets) {
          for (k in offsets) {
            if (all_true(flags, count + k, 10)) {
              while (isTRUE(flags[count + k])) {
                count <- count + k + step
              }
            }
          }
          count
        }

        # Try to jump over an indel: first all genome-shifted tracks, then all
        # reverse-complement-shifted tracks.
        jump_indels <- function(count, offsets) {
          for (i in seq_len(max_shift)) {
            count <- jump_track(shifted[[i]], count, i, offsets)
          }
          for (i in seq_len(max_shift)) {
            count <- jump_track(shifted[[i + max_shift]], count, i, offsets)
          }
          count
        }

        # Skip a gap of up to 10 positions on the main alignment when it is
        # followed by `width` consecutive matches, then run to the next mismatch.
        extend <- function(count, width) {
          if (all_true(aligned, count + 10, width)) {
            count <- run_end(aligned, count + 10)
          }
          count
        }

        # Initial uninterrupted run of matches.
        count <- run_end(aligned, 1)

        # Two rounds of indel jumps probing offsets 0..5.
        for (round in 1:2) {
          count <- jump_indels(count, 0:5)
          count <- extend(count, 16)
          count <- extend(count, 16)
        }
        # Two rounds of indel jumps probing offset 0 only.
        for (round in 1:2) {
          count <- jump_indels(count, 0)
          count <- extend(count, 16)
          count <- extend(count, 16)
        }
        count <- jump_indels(count, 0)
        for (round in 1:10) {
          count <- extend(count, 16)
        }
        # Stricter extensions that need 66 consecutive matches.
        for (round in 1:3) {
          count <- extend(count, 66)
          count <- jump_indels(count, 0)
        }
        # Final, strictest extension: 166 consecutive matches.
        count <- extend(count, 166)
        count
      }
      
      IR2start<- function(True.sequence.finder, IR.length, genome){
        # Start of the second IR: the genome length minus
        # (True.sequence.finder + IR.length - 2).
        genome_len <- length(genome)
        back_offset <- True.sequence.finder + IR.length - 2
        genome_len - back_offset
      }
      
      # Main body of IRinfo: run the helpers in order.
      # 1. phase difference, through the cluster-based p.d() or the serial
      #    phase.detector(), depending on `parallel`
      if (parallel){
        phase.difference<- p.d(genome)
      }
      else{
        phase.difference<- phase.detector(genome)
      }
      # 2. offset of the first fully matching window, 3. start of the first IR,
      # 4. IR length, 5. start of the second IR
      Trsf<- True.sequence.finder(genome, phase.difference)
      IR1s<- IR1start(phase.difference, Trsf)
      IR.l<- IR.length(IR1s, Trsf, genome)
      IR2s<- IR2start(Trsf, IR.l, genome)
      
      # Result
      return(c(IR1s, IR2s, IR.l, length(genome)))# the start of the first and of the second IR, followed by the IR length and the length of the genome
      # i.e. a vector of four elements: start of IRb, start of IRa, their length and the length of the genome sequence
    }
    
    

    如果要单独用这个函数需要加载 snowfall 这个包,这里用到了并行计算

    输入是 fasta
    输出是 第一个反向重复区的起始位置、第二个反向重复区的起始位置、反向重复区的长度和基因组总长

    > gbfile<-read.gb("Taishanhong_CP_genome.gb")
    > fas<-FasExtract(gbfile)
    > IRinfo(fas)
    snowfall 1.84-6.1 initialized (using snow 0.4-3): parallel execution on 2 CPUs.
    
    
    Stopping cluster
    
    [1]  89022 133173  25466 158638
    

    6、获取genbank文件中所有的基因名

    gene.name<-function(gb, type){
      # Collect the names of all features of one kind from a GenBank file.
      #
      # gb:   character vector with the lines of the GenBank file.
      # type: one of "gene", "tRNA", "rRNA" or "mRNA".
      # Returns a character vector with one name per properly named feature.
      #
      # For every line matching the feature key, the first of the next four
      # lines that contains "=" is used (the fourth one when none does). Its
      # first non-blank token is split at "=", the part after the "=" is taken
      # and double quotes are removed. Features for which this yields no value
      # are reported with a warning and left out of the result.
      #
      # The same set of feature lines is used for the first and for the
      # fallback lines, so the names can no longer be read from the lines of a
      # different feature, and a file without any feature of the requested
      # type gives character(0) instead of an error.
      feature.patterns <- c(gene="  gene  ", tRNA=" tRNA  ", rRNA="rRNA  ", mRNA="mRNA  ")
      if (!is.character(type) || length(type) != 1 || is.na(type) ||
          !(type %in% names(feature.patterns))) {
        stop("The type should be defined as one of gene, tRNA, rRNA or mRNA")
      }
      hits <- grep(feature.patterns[[type]], gb, fixed=TRUE)

      # Name of the feature whose key is on line `hit`, or NA when not found.
      qualifier.value <- function(hit) {
        line <- NA_character_
        for (offset in 1:4) {
          line <- gb[hit + offset]
          if (!is.na(line) && grepl("=", line, fixed=TRUE)) {
            break
          }
        }
        if (is.na(line)) {
          # ran past the end of the file
          return(NA_character_)
        }
        tokens <- strsplit(line, " ")[[1]]
        tokens <- tokens[tokens != ""]
        if (length(tokens) == 0) {
          return(NA_character_)
        }
        gsub("\"", "", strsplit(tokens[1], "=")[[1]][2])
      }

      found <- vapply(hits, qualifier.value, character(1))

      for (bad in which(is.na(found))) {
        warning(sprintf(
          "Gene No. %d is not properly named and is deleted from the list. Check the gb file on line %d",
          bad, hits[bad]
        ))
      }
      found[!is.na(found)]
    }
    

    可以获得所有的基因名称,tRNA或者rRNA的名称,但是不能够获得蛋白编码基因的名称,但是应该可以改,把代码里的mRNA统一换成CDS应该就可以了

    > gbfile<-read.gb("Taishanhong_CP_genome.gb")
    > gene.name(gbfile,'rRNA')
    [1] "rrn16"  "rrn23"  "rrn4.5" "rrn5"   "rrn5"   "rrn4.5" "rrn23"  "rrn16" 
    

    还有好多,今天就到这里了。
    重点是画图函数,但是他的画图函数好长好长啊!

    欢迎大家关注我的公众号
    小明的数据分析笔记本

    公众号二维码.jpg

    相关文章

      网友评论

          本文标题:IRscope代码拆解一

          本文链接:https://www.haomeiwen.com/subject/cgxhzhtx.html