筛选最长转录本问题

作者: SnorkelingFan凡潜 | 来源:发表于2021-06-15 23:10 被阅读0次
    • 在此介绍一个可直接筛选的脚本select_longest_transcripts.pl
    1 #!/usr/bin/perl -w
          2 use strict;
          3 die "perl $0 <genomic.gff|genomic.gff.gz> <all.cds|all.cds.gz>\n" unless @ARGV == 2;
          4
          5 ##### example inputs: GCA_002891405.2_Csec_1.0_genomic.gff.gz, GCA_002891405.2_Csec_1.0_cds_from_genomic.fna.gz #####
          6
          7 my $in = shift;
          8 my $pool = shift;
          9
         10 ##### get the longest transcript of each mRNA #####
         11
         12 if( $in =~ /.*\.gz$/ ){
         13         open IN, "gzip -dc $in | " || die "$!\n";
         14 }else{
         15         open IN, $in || die "$!\n";
         16 }
         17
         18 my ($ref, $gene, $trans);
         19 while( <IN> ){
         20         chomp;
         21         next if /^$/;
         22         next if /#/;
         23         my @tmp = split(/\t/, $_);
         24         if( $tmp[2] eq "gene" ){
         25                 $gene = $1 if $tmp[8] =~ /ID=(.*);Name.*/;
         26         }
         27         if( $tmp[2] eq "mRNA" ){
         28                 $trans = $1 if $tmp[8] =~ /ID=(.*);Parent.*/;
         29         }
         30         if( $tmp[2] eq "CDS" ){
         31                 my ($cds, $parent) = ($1, $2) if $tmp[8] =~ /ID=cds\.evm\.model\.(.*);Parent=(.*)/;
         32                 $ref->{$gene}{$cds} += $tmp[4] - $tmp[3] if $parent eq $trans;
         33         }
         34 }
         35 close IN;
         36
         37 open O1, ">$in.all.mRNA.len.info.xls" || die "$!\n";
         38 open O2, ">$in.all.longst_mRNA.info.xls" || die "$!\n";
         39 for my $key ( sort keys %$ref ){
         40         my $gene = $ref->{$key};
         41         print O1 $key;
         42         print O2 $key;
         43         for my $mrna ( sort { $gene->{$a} <=> $gene->{$b} } keys %$gene ){
         44                 print O1 "\t$mrna\t$gene->{$mrna}\n";
         45         }
         46         my @sort = sort { $gene->{$a} <=> $gene->{$b} } keys %$gene;
         47         print O2 "\t$sort[-1]\t$gene->{$sort[-1]}\n";
         48 }
         49 close O1;
         50 close O2;
         51
         52 ##### get longest mRNAs from the all mRNAs pool #####
         53
         54 my (%Seq, $id);
         55 if( $pool =~ /.*\.*gz$/ ){
         56         open CDS, "gzip -dc $pool | " || die "$!\n";
         57 }else{
         58         open CDS, $pool || die "$!\n";
         59 }
         60
         61 while( <CDS> ){
         62         chomp;
         63         #if( />.*_cds_(.*)_\d+\s+\[\S+.*/ ){
         64         if( />evm\.model\.(.*)\s+.*/ ){
         65                 $id = $1;
         66         }else{
         67                 $Seq{$id} .= $_;
         68         }
         69 }
         70 close CDS;
         71
         72 open IN, "$in.all.longst_mRNA.info.xls" || die "$!\n";
         73 open OUT, ">$in.longest_transcript.cds.fa" || die "$!\n";
         74 while( <IN> ){
         75         chomp;
         76         my @tmp = split(/\s+/, $_);
         77         print OUT ">$tmp[1]\n";
         78         for(my $i = 0; $i <= length($Seq{$tmp[1]}); $i += 80){
         79                 print OUT substr($Seq{$tmp[1]}, $i, 80), "\n";
         80         }
         81 }
         82 close IN;
         83 close OUT;
    

    重点改写第25, 28, 31, 64正则匹配.gff文件和.cds文件里的信息

    • 筛选之前先检查所下载的gff文件中是否存在多转录本的情况
    awk '$3=="gene"' *gff | wc -l 
    awk '$3=="mRNA"' *gff | wc -l
    wc -l *xls
    

    • 问题报错排查
    Use of uninitialized value $tmp[2] in string eq at select_longest_transcripts_v2_Tachypleus_tridentatus.pl line 24, <IN> line 200027.
    Use of uninitialized value $tmp[2] in string eq at select_longest_transcripts_v2_Tachypleus_tridentatus.pl line 27, <IN> line 200027.
    Use of uninitialized value $tmp[2] in string eq at select_longest_transcripts_v2_Tachypleus_tridentatus.pl line 30, <IN> line 200027.
    
    image.png
    • 添加print参数进行排查
    ##### get the longest transcript of each mRNA #####
    
    if( $in =~ /.*\.gz$/ ){
            open IN, "gzip -dc $in | " || die "$!\n";
    }else{
            open IN, $in || die "$!\n";
    }
    
    my ($ref, $gene, $trans);
    while( <IN> ){
            chomp;
            next if /^$/;
            next if /#/;
            my @tmp = split(/\t/, $_);
            if( $tmp[2] eq "gene" ){
                    $gene = $1 if $tmp[8] =~ /ID=(.*);.*/;
            print "gene\t$gene\n";
            }
            if( $tmp[2] eq "mRNA" ){
                    $trans = $1 if $tmp[8] =~ /ID=(.*);Parent.*/;
            print "trans\t$trans\n";
            }
            if( $tmp[2] eq "CDS" ){
                    my ($cds, $parent) = ($1, $2) if $tmp[8] =~ /ID=cds\.evm\.model\.(.*);Parent=(.*)/;
            print "cds\t$cds\n"; print "parent\t$parent\n";
                    $ref->{$gene}{$cds} += $tmp[4] - $tmp[3] if $parent eq $trans;
            }
    }
    close IN;
    
    open O1, ">$in.all.mRNA.len.info.xls" || die "$!\n";
    open O2, ">$in.all.longst_mRNA.info.xls" || die "$!\n";
    
    image.png

    这个案例中的报错排查的结果是下载的gff文件中有空格行的原因,在脚本中添加了next if /^$/;后得到解决


    可供选择的第二种方法,用OrthoFinder自带的脚本进行挑选

    The files from Ensembl will contain many transcripts per gene. If we ran OrthoFinder on these raw files it would take 10x longer than necessary and could lower the accuracy. We’ll use a script provided with OrthoFinder to extract just the longest transcript variant per gene and run OrthoFinder on these files:

    for f in *fa ; do python ~/orthofinder_tutorial/OrthoFinder/tools/primary_transcript.py $f ; done

    import os
    import sys
    from collections import Counter, defaultdict
    
    # Use the 'all' version rather than ab initio
    
    def ScanTags(fn):
        """
        For ensembl genomes, look for tag:id and count repeated ids
        :param fn:
        :return:
        """
        tags = set()
        tokens = []
        with open(fn, 'r') as infile:
            for line in infile:
                if not line.startswith(">"): continue
                tokens.append([t.split(":", 1) for t in line.rstrip().split() if ":" in t])
                tags.update([t[0] for t in tokens[-1]])
        for this_tag in tags:
            print(this_tag)
            # print(tokens[-1])
            c = Counter([idd for acc in tokens for t, idd in acc if t == this_tag])
            print(c.most_common(5))
            print("")
    
    def ScanTags_NCBI(fn):
        genes = []
        with open(fn, 'r') as infile:
            for line in infile:
                if not line.startswith(">"): continue
                genes.append(line[1:].split(".", 1)[0])
        print("%d sequences, %d genes" % (len(genes), len(set(genes))))
    
    def ScanTags_with_fn(fn, gene_name_fn):
        genes = []
        with open(fn, 'r') as infile:
            for line in infile:
                if not line.startswith(">"): continue
                genes.append(gene_name_fn(line))
        print("%d sequences, %d genes" % (len(genes), len(set(genes))))
        # print(genes[0])
        # print(sorted(genes)[:10])
    
    def GetGeneName(acc_line):
        tokens = [(t.split("=") if "=" in t else t.split(":"))[1] for t in acc_line.rstrip().split() if ("gene:" in t or "gene=" in t)]
        if len(tokens) != 1: return None
        return tokens[0]
    
    def CreatePrimaryTranscriptsFile(fn, dout, gene_name_fn=GetGeneName):
        # Get genes and lengths
        max_gene_lens = defaultdict(int)
        with open(fn, 'r') as infile:
            lines = [l.rstrip() for l in infile]
        N = len(lines) - 1
        nAcc = 0
        nGeneUnidentified = 0
        acc_to_use = defaultdict(str)
        iLine = -1
        while iLine < N:
            iLine += 1
            line = lines[iLine]
            if not line.startswith(">"): continue
            nAcc += 1
            iLineAcc = iLine
            gene = gene_name_fn(line)
            if gene == None:
                nGeneUnidentified += 1
                continue
            # get length
            l = 0
            while iLine < N:
                iLine += 1
                line = lines[iLine]
                if line.startswith(">"):
                    iLine -= 1
                    break
                l += len(line.rstrip())
            if l > max_gene_lens[gene]:
                max_gene_lens[gene] = l
                acc_to_use[gene] = iLineAcc
        print("Found %d accessions, %d genes, %d unidentified transcripts" % (nAcc, len(max_gene_lens), nGeneUnidentified))
        # print(gene)
        # print(sorted(max_gene_lens.keys())[:10])
     # print(len(set(max_gene_lens.keys())))
    
        # Get longest version for each gene
        # Parse file second time and only write out sequences that are longest variant
        nGenesWriten = 0
        outfn = dout + os.path.basename(fn)
        with open(outfn, 'w') as outfile:
            iLine = -1
            while iLine < N:
                iLine += 1
                line = lines[iLine]
                if not line.startswith(">"): continue
                gene = gene_name_fn(line)
                # transcripts not identifying the gene should be written
                if gene != None and iLine != acc_to_use[gene]: continue
                acc_line_out = line + "\n" if gene == None else ">%s\n" % gene
                nGenesWriten += 1
                outfile.write(acc_line_out)
                while iLine < N:
                    iLine += 1
                    line = lines[iLine]
                    if line.startswith(">"):
                        iLine -= 1
                        break
                    outfile.write(line + "\n")
        print("Wrote %d genes" % nGenesWriten)
        if nGenesWriten != len(max_gene_lens) + nGeneUnidentified:
            print("ERROR")
            raise Exception
        print(outfn)
    
    
    def last_dot(text):
        return text[1:].rstrip().rsplit(".", 1)[0]
    
    
    def space(text):
        return text[1:].rstrip().split(None, 1)[0]
    
    
    function_dict = {"last_dot":last_dot, "space":space}
    def main(args=None):
        if args is None:
            args = sys.argv[1:]
        fn = args[0]
        dout = os.path.dirname(os.path.abspath(fn)) + "/primary_transcripts/"
        if not os.path.exists(dout):
            os.mkdir(dout)
        if len(sys.argv) == 3:
            gene_name_function_name = function_dict[sys.argv[2]]
            ScanTags_with_fn(fn, gene_name_function_name)
            CreatePrimaryTranscriptsFile(fn, dout, gene_name_function_name)
        else:
            # ScanTags(fn)
            # ScanTags_NCBI(fn)
            # ScanTags_second_dot(fn)
    
            CreatePrimaryTranscriptsFile(fn, dout)
    
    if __name__ == "__main__":
        args = sys.argv[1:]
        main(args)
    

    可供选择的第三种方法,用BITACORA自带的脚本Scripts/Tools/exclude_isoforms_fromfasta.pl 进行挑选

    #!/usr/bin/perl
    use strict;
    use warnings;
    
    # Script to cluster genes (longest gene as representative) in overlapping positions (being putative isoforms or bad-annotations)
    
    # usage: perl exclude_isoforms_fromfasta.pl file.gff3 file.fasta
    die "Script to cluster genes (keeping the longest sequence as representative) in overlapping positions, i.e. clustering isoforms \n\nperl exclude_isoforms_fromfasta.pl file.gff3 file.fasta \n\n" unless @ARGV == 2;
    
    
    my ($line, $name, $nameout);
    my %overlap; my %mrna; my %fasta;
    
    my $outname="";
    if ($ARGV[1] =~ /(\S+)\.fa/){
        $outname = $1;
    } else {
        $outname = $ARGV[1];
    }
    
    open (Fasta , "<", $ARGV[1]); 
    while (<Fasta>) {
        chomp;
        my $line = $_;
        next if ($line !~ /\S+/);
        if ($line =~ />(\S+)/){
            $name = $1;
        } else {
            $fasta{$name} .= $line;
        }
    
    }
    close Fasta;
    
    
    open (Results, ">", "$ARGV[0]\_overlapping_genes.txt");
    
    open (GFFfile , "<", $ARGV[0]); 
    while (<GFFfile>) {
        chomp;
        $line = $_;
        next if ($line !~ /\S+/);
        next if ($line =~ /^#/);
        my @subline = split (/\t/, $line);
    
        if ($subline[2] =~ /CDS/){
    
    ####### No needed in this script
    next;
    #######     
            my $genename = "";
            if ($subline[8] =~ /Parent=([^;]+)/){
            #if ($subline[8] =~ /transcript_id \"(\S+)\"/){
                $genename = $1;
            }
            else {die "ERROR in get_overlapping_genes_fromgff.pl: It fails detecting Parent ID in $line\n";}
    
    
            if ($genename =~ /(\S+)\_\d+dom/){
                $genename = $1;
            } 
    
    
        }
        elsif ($subline[2] =~ /mRNA/){
    
            next if ($line =~ /^END/);
    
            my $genename = "";
            if ($subline[8] =~ /ID=([^;]+)/){
            #if ($subline[8] =~ /transcript_id \"(\S+)\"/){
                $genename = $1;
            }
            else {print "ERROR in get_overlapping_genes_fromgff.pl: It fails detecting ID in $line\n";}
    
    #       if ($genename =~ /(\S+)\_\d+dom/){
    #           $genename = $1;
    #       } 
    
    
            #Overlap checking
            my $start = "";
            my $end = "";
            my $frame = "$subline[6]";
            $frame =~ s/\+/\\\+/g;
            $frame =~ s/\-/\\\-/g;
            if ($subline[4] > $subline[3]){
                $start = $subline[3];
                $end = $subline[4];
            } else {
                $start = $subline[4];
                $end = $subline[3];         
            }
    
            foreach my $key (keys %{$mrna{$subline[0]}}){           
                my @subl2 = split (/\s/, $mrna{$subline[0]}{$key});
                next if ($subl2[2] !~ /$frame/); # First check chain
                my $start2 = $subl2[0];
                my $end2 = $subl2[1];
                my $over = 0;
                if ($start <= $start2 && $end >= $end2){
                    $over++
                } elsif ($start2 <= $start && $end2 >= $end){
                    $over++
                } elsif ($start <= $start2 && $end >= $start2){
                    $over++
                } elsif ($start2 <= $start && $end2 >= $start){
                    $over++
                } 
    
                if ($over > 0){
                    #if (exists $overlap{$genename}){
                    #   $overlap{$genename} .= "$key ";
                    #} elsif (exists $overlap{$key}){
                    #   $overlap{$key} .= "$genename ";
                    #} else {
                        $overlap{$subline[0]}{$genename} .= "$key ";
                        $overlap{$subline[0]}{$key} .= "$genename ";
                    #}
                }
    
            }
    
            $mrna{$subline[0]}{$genename} = "$start $end $frame";
    
    
        }
    
        elsif ($subline[2] =~ /gene/){
    
    ####### No needed in this script
    next;
    #######
            next if ($line =~ /^END/);
    
            my $genename = "";
            if ($subline[8] =~ /ID=([^;]+)/){
            #if ($subline[8] =~ /transcript_id \"(\S+)\"/){
                $genename = $1;
            }
            else {print "ERROR in get_overlapping_genes_fromgff.pl: It fails detecting ID in $line\n";}
    
            if ($genename =~ /(\S+)\_\d+dom/){
                $genename = $1;
            } 
    
    
        }
    }
    close GFFfile;
    
    my %overlapgood;
    foreach my $scaf (sort keys %overlap){
        my $exclude;
        foreach my $key (sort keys %{$overlap{$scaf}}){
    
            #print "$key $overlap{$scaf}{$key}\n"; #### Debug
    
            my @subl = split (/\s/, $overlap{$scaf}{$key});
    
            if (exists $overlapgood{$scaf}{$key}){
                foreach my $sgen (@subl){
                    my $list = $overlapgood{$scaf}{$key};
                    #$list =~ s/\./\\\./g;
                    $list =~ s/\|/\\\|/g;
                    if ($list =~ /$sgen /){
                        next;
                    } else {
                        $overlapgood{$scaf}{$key} .= "$sgen ";
                    }
                }
            } else {
                my $cont = 0;
                foreach my $sgen (@subl){
                    if (exists $overlapgood{$scaf}{$sgen}){
                        $cont++;
                        my $list = "$sgen $overlapgood{$scaf}{$sgen}";
                        #$list =~ s/\./\\\./g;
                        $list =~ s/\|/\\\|/g;
    
                        if ($list !~ /$key /){
                            $overlapgood{$scaf}{$sgen} .= "$key ";
                        }
    
                        foreach my $ssgen (@subl){
    
                            if ($list =~ /$ssgen /){
                                next;
                            } else {
                                $overlapgood{$scaf}{$sgen} .= "$ssgen ";
                            }
                        }                   
    
                    }
    
                }
    
                if ($cont == 0){
                    $overlapgood{$scaf}{$key} .= "$overlap{$scaf}{$key}";
                } elsif ($cont > 1){
                    #print "Warning, two different fields have been saved for the same gene in $key $overlap{$scaf}{$key}\n";
                }
            }
    
        }
    }
    
    my %overlapgood2;
    foreach my $scaf (sort keys %overlapgood){
        my $exclude;
        foreach my $key (sort keys %{$overlapgood{$scaf}}){
    
            #print "$key $overlapgood{$scaf}{$key}\n"; #### Debug
    
            my @subl = split (/\s/, $overlapgood{$scaf}{$key});
    
            if (exists $overlapgood2{$key}){
                foreach my $sgen (@subl){
                    my $list = $overlapgood2{$key};
                    #$list =~ s/\./\\\./g;
                    $list =~ s/\|/\\\|/g;
                    if ($list =~ /$sgen /){
                        next;
                    } else {
                        $overlapgood2{$key} .= "$sgen ";
                    }
                }
            } else {
                my $cont = 0;
                foreach my $sgen (@subl){
                    if (exists $overlapgood2{$sgen}){
                        $cont++;
                        my $list = "$sgen $overlapgood2{$sgen}";
                        #$list =~ s/\./\\\./g;
                        $list =~ s/\|/\\\|/g;
    
                        if ($list !~ /$key /){
                            $overlapgood2{$sgen} .= "$key ";
                        }
    
                        foreach my $ssgen (@subl){
    
                            if ($list =~ /$ssgen /){
                                next;
                            } else {
                                $overlapgood2{$sgen} .= "$ssgen ";
                            }
                        }                   
    
                    }
    
                }
    
                if ($cont == 0){
                    $overlapgood2{$key} .= "$overlapgood{$scaf}{$key}";
                } elsif ($cont > 1){
                    #print "Warning, two different fields have been saved for the same gene in $key $overlapgood{$scaf}{$key}\n";
                }
            }
    
        }
    }
    
    my $excluded = "0";
    open (Resultsfa, ">", "$outname\_noiso.fasta");
    open (Resultsiso, ">", "$outname\_isoforms_excluded.txt");
    open (Resultsisotab, ">", "$outname\_isoforms_table.txt");
    print Resultsisotab "Representative sequence\tIsoforms\n";
    
    foreach my $key (sort keys %overlapgood2){
        print Results "$key $overlapgood2{$key}\n";
    
        $overlapgood2{$key} .= "$key ";
    
        my $bseq = "firstseq";
        my $blen = "0";
        my $excludedseqs = "";
        my @split = split(/\s/, $overlapgood2{$key});
        foreach my $seq (@split){
            unless (exists ($fasta{$seq})) {
                next;
                die "Can't find $seq in fasta file\n";
            }
            my $length = length($fasta{$seq});
    
            if ($length > $blen){
                $blen = $length;
                unless ($bseq =~ /firstseq/){
                    print Resultsiso "$bseq\n";
                    $excludedseqs .= "$bseq ";
                    delete $fasta{$bseq};
                    $excluded++;
                }
                $bseq = $seq;
            } else {
                print Resultsiso "$seq\n";
                $excludedseqs .= "$seq ";
                delete $fasta{$seq};
                $excluded++;
            }
        }
        next if ($excludedseqs !~ /\S+/);
        print Resultsisotab "$bseq\t$excludedseqs\n";
    }
    
    foreach my $key (sort keys %fasta) {
        print Resultsfa ">$key\n$fasta{$key}\n";
    }
    
    close Results;
    close Resultsfa;
    close Resultsiso;
    
    print "\nDone, $excluded sequences have been excluded. Output files have been saved in $outname\_noiso.fasta\n\n";
    
    
    

    相关文章

      网友评论

        本文标题:筛选最长转录本问题

        本文链接:https://www.haomeiwen.com/subject/pmvtyltx.html