美文网首页生物信息学习
一个 fasta 序列格式化小脚本

一个 fasta 序列格式化小脚本

作者: 正踪大米饭儿 | 来源:发表于2017-12-04 16:10 被阅读17次

    一个 fasta 小工具,主要用来格式化fasta序列的,也可以实现 fasta 核酸序列转为 pep 蛋白序列。

    #!/usr/bin/perl -w
    use strict;
    use Getopt::Long;
    
    my ($infile, $outfile);
    my ($alignLeng, $num, $cds2pep, $remove, $caseType);
    my ($version, $help);
    
    GetOptions(
        "infile|i=s"  => \$infile,
        "outfile|o:s" => \$outfile,
        "caseType|c:s"=> \$caseType,
        "alignLeng" => \$alignLeng,
        "num|n"     => \$num,
        "cds2pep"   => \$cds2pep,
        "version"   => \$version,
        "help|?|h"  => \$help
    );
    
    &usage if (!defined $infile);
    $num  ||= 60;
    $caseType ||= "";
    
    open FA, "<$infile" || die "Erro! Can't open file $infile\n";
    $/ = ">";<FA>;
    while(<FA>){
        chomp;
        my ($info, $seq) = split (/\n/, $_, 2);
        my $id = &keepPart($info,1,' ');
        my $out;
    
        if (defined $alignLeng){
           $seq =~ s/\n//g;
           $out = &out_fasta($seq, $num);
           $out .= "\n";
        }else{
            $out = $seq;
        }
        
        if ($caseType =~ /Uper/i){
            $out = uc($out);
            print ">$id\n$out";
        }elsif( $caseType =~ /low/i){
            $out = lc($out);
            print ">$id\n$out";
        }else{
            print ">$id\n$out";
        }
    }
    close FA;
    
    
    sub keepPart{
        my ($info, $col, $spp) = @_;
        my @tmp = split /$spp/, $info;
        my $out = $tmp[$col-1];
        return $out;
    }
    
    
    sub out_fasta{
        my ($seq,$num) = @_;
        my $len = length $seq;
        $seq =~ s/([A-Za-z]{$num})/$1\n/g;
        chop($seq) unless $len % $num;
        return $seq;
    }
    
    sub cds2pep{
        my $cds_fasta = shift @_;
        my $p = &code();
        my $pep_fasta;
    
        for(my $i = 0; $i < length $cds_fasta; $i += 3){
            my $codon = uc(substr($cds_fasta,$i,3));
            last if (length $codon < 3);
            $pep_fasta .= exists $p->{$codon} ? $p->{$codon} : "X";
        }
        return $pep_fasta;
    }
    
    sub code{
        my $codon = {
            'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',                               # Alanine
            'TGC' => 'C', 'TGT' => 'C',                                                           # Cysteine
            'GAC' => 'D', 'GAT' => 'D',                                                           # Aspartic Aci
            'GAA' => 'E', 'GAG' => 'E',                                                           # Glutamic Aci
            'TTC' => 'F', 'TTT' => 'F',                                                           # Phenylalanin
            'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',                               # Glycine
            'CAC' => 'H', 'CAT' => 'H',                                                           # Histidine
            'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I',                                             # Isoleucine
            'AAA' => 'K', 'AAG' => 'K',                                                           # Lysine
            'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L', 'TTA' => 'L', 'TTG' => 'L',   # Leucine
            'ATG' => 'M',                                                                         # Methionine
            'AAC' => 'N', 'AAT' => 'N',                                                           # Asparagine
            'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',                               # Proline
            'CAA' => 'Q', 'CAG' => 'Q',                                                           # Glutamine
            'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R', 'AGA' => 'R', 'AGG' => 'R',   # Arginine
            'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S', 'AGC' => 'S', 'AGT' => 'S',   # Serine
            'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T',                               # Threonine
            'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V',                               # Valine
            'TGG' => 'W',                                                                         # Tryptophan
            'TAC' => 'Y', 'TAT' => 'Y',                                                           # Tyrosine
            'TAA' => 'U', 'TAG' => 'U', 'TGA' => 'U'                                              # Stop
        };
        return $codon;
    }
    
    sub usage{
        print STDERR<<EOF;
    ===========================================================
    Usage:
        perl $0 [OPTIONS]
    Options:
        --infile,    -i   input fasta
        --outfile,   -o   output [defualt]
        --alignLeng, -l   out sequence length
        --caseType,  -c   caseType (Uper, lower, NULL)
        --cds2pep         convert fasta sequence to pep
        --version         print version
        --help, -h, ?     print this help information
    ===========================================================
    EOF
        exit 1;
    }
    
    sub info{
        print STDOUT<<EOF;
    Author  :  Liu P
    Data    :  2017-12-04
    Version :  0.1
    Email   :  sxliulian2009@126.com
    EOF
        exit 0;  
    }
    __END__
    

    还可以根据需要加点小功能。

    相关文章

      网友评论

        本文标题:一个 fasta 序列格式化小脚本

        本文链接:https://www.haomeiwen.com/subject/jgexixtx.html