一个 fasta 小工具,主要用于格式化 fasta 序列,也可以将 fasta 核酸序列(CDS)翻译为 pep 蛋白序列。
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
# ---------------------------------------------------------------------------
# Main program.
#
# Parses the command line, then streams the input FASTA one record at a time
# (records are delimited by '>'), optionally translating each sequence to
# peptide, re-wrapping it to a fixed line width and folding its case, and
# writes the result to --outfile (or STDOUT when no output file is given).
# ---------------------------------------------------------------------------
my ($infile, $outfile);
my ($alignLeng, $num, $cds2pep, $caseType);
my ($version, $help);

# ":s" / ":i" mean "optional value": when the switch is given without a value
# the variable becomes defined-but-false, so a bare "--alignLeng" keeps
# working as a plain on/off flag while "--alignLeng 80" / "-n 80" can carry
# the desired line width.
my $opts_ok = GetOptions(
    "infile|i=s"    => \$infile,
    "outfile|o:s"   => \$outfile,
    "caseType|c:s"  => \$caseType,
    "alignLeng|l:i" => \$alignLeng,
    "num|n:i"       => \$num,
    "cds2pep"       => \$cds2pep,
    "version"       => \$version,
    "help|?|h"      => \$help
);

info()  if $version;
usage() if $help || !$opts_ok || !defined $infile;

$num ||= 60;
$caseType = "" unless defined $caseType;

# Wrap width: an explicit --alignLeng value wins, otherwise --num (default 60).
my $width = $alignLeng || $num;
die "Error! Line width must be a positive integer\n" if $width < 1;

# Low-precedence "or" so that die() really fires when open() fails.
open(my $in_fh, '<', $infile)
    or die "Error! Can't open file $infile: $!\n";

my $out_fh;
my $to_file = defined $outfile && length $outfile;
if ($to_file) {
    open($out_fh, '>', $outfile)
        or die "Error! Can't write file $outfile: $!\n";
}
else {
    $out_fh = \*STDOUT;
}

{
    # Read one FASTA record per readline call; keep the change local so other
    # code reading line-wise is not affected.
    local $/ = ">";

    # Throw away everything up to and including the first '>'.
    my $leading = <$in_fh>;

    while (my $record = <$in_fh>) {
        chomp $record;                  # removes the trailing '>' separator
        next unless length $record;     # nothing between two '>' characters

        my ($info, $seq) = split(/\n/, $record, 2);
        $info = '' unless defined $info;
        $seq  = '' unless defined $seq; # header-only record
        $info =~ s/\r\z//;              # tolerate CRLF line endings

        # The record id is the first space-separated word of the header line.
        my $id = keepPart($info, 1, ' ');
        $id = '' unless defined $id;

        my $out;
        if ($cds2pep || defined $alignLeng) {
            # Work on the sequence as one continuous string.
            (my $flat = $seq) =~ s/\s+//g;

            if ($cds2pep) {
                $flat = cds2pep($flat);
                $flat = '' unless defined $flat;
            }

            $out = defined $alignLeng ? out_fasta($flat, $width) : $flat;
            $out .= "\n";
        }
        else {
            # Keep the original line layout, but make sure the next header
            # starts on a fresh line.
            $out = $seq;
            $out .= "\n" if length $out && $out !~ /\n\z/;
        }

        # "up+er" accepts both the historical spelling "Uper" and "upper".
        if ($caseType =~ /up+er/i) {
            $out = uc($out);
        }
        elsif ($caseType =~ /low/i) {
            $out = lc($out);
        }

        print {$out_fh} ">$id\n$out";
    }
}

close $in_fh;
if ($to_file) {
    # Buffered write errors only surface when the handle is closed.
    close($out_fh) or die "Error! Can't finish writing $outfile: $!\n";
}
# keepPart($info, $col, $spp)
#
# Splits $info on the separator string $spp and returns field number $col
# (1-based). Returns undef when the requested field does not exist.
#
# The separator is matched literally (\Q...\E), so characters such as '|' or
# '.' that are regex metacharacters can be used as separators safely.
sub keepPart {
    my ($info, $col, $spp) = @_;
    $info = '' unless defined $info;    # an absent header has no fields
    my @fields = split /\Q$spp\E/, $info;
    return $fields[$col - 1];
}
# out_fasta($seq, $num)
#
# Breaks $seq into lines of at most $num characters and returns them joined
# with "\n". The result has no trailing newline; the caller adds one.
#
# Every character counts towards the line width, so sequences containing
# symbols such as '*' or '-' are wrapped exactly like plain letters and no
# character is ever dropped.
#
# Returns '' for an undefined or empty sequence, and the sequence unchanged
# when $num is not a positive integer.
sub out_fasta {
    my ($seq, $num) = @_;
    return '' unless defined $seq && length $seq;
    return $seq unless defined $num && $num =~ /\A\d+\z/ && $num > 0;

    my @lines;
    my $len = length $seq;
    for (my $pos = 0; $pos < $len; $pos += $num) {
        push @lines, substr($seq, $pos, $num);
    }
    return join("\n", @lines);
}
# cds2pep($cds_fasta)
#
# Translates a nucleotide coding sequence into a peptide string, reading
# codons from the first base onward.
#
#   * Whitespace (including line breaks) inside the sequence is ignored so
#     that it cannot shift the reading frame.
#   * Translation is case-insensitive.
#   * A codon that is not in the table returned by code() (for example one
#     containing N) is written as 'X'.
#   * Stop codons are written with whatever letter code() assigns to them;
#     translation does not stop there.
#   * One or two leftover bases at the end are ignored.
#
# Always returns a defined string ('' when there is not a single full codon).
sub cds2pep {
    my $cds_fasta = shift @_;
    return '' unless defined $cds_fasta;

    $cds_fasta =~ s/\s+//g;
    $cds_fasta = uc $cds_fasta;

    my $p = code();
    my $pep_fasta = '';

    # Only walk over complete codons.
    my $usable = length($cds_fasta) - length($cds_fasta) % 3;
    for (my $i = 0; $i < $usable; $i += 3) {
        my $codon = substr($cds_fasta, $i, 3);
        $pep_fasta .= exists $p->{$codon} ? $p->{$codon} : "X";
    }
    return $pep_fasta;
}
# code()
#
# Returns a reference to a freshly built hash that maps each of the 64
# upper-case DNA codons to a one-letter residue code.
#
# NOTE(review): the three stop codons map to 'U' in this table; many tools
# write stops as '*' instead - confirm with the author before changing it.
sub code {
    # Residue letter followed by every codon that encodes it.
    my @codons_by_residue = (
        A => [qw(GCA GCC GCG GCT)],            # Alanine
        C => [qw(TGC TGT)],                    # Cysteine
        D => [qw(GAC GAT)],                    # Aspartic acid
        E => [qw(GAA GAG)],                    # Glutamic acid
        F => [qw(TTC TTT)],                    # Phenylalanine
        G => [qw(GGA GGC GGG GGT)],            # Glycine
        H => [qw(CAC CAT)],                    # Histidine
        I => [qw(ATA ATC ATT)],                # Isoleucine
        K => [qw(AAA AAG)],                    # Lysine
        L => [qw(CTA CTC CTG CTT TTA TTG)],    # Leucine
        M => [qw(ATG)],                        # Methionine
        N => [qw(AAC AAT)],                    # Asparagine
        P => [qw(CCA CCC CCG CCT)],            # Proline
        Q => [qw(CAA CAG)],                    # Glutamine
        R => [qw(CGA CGC CGG CGT AGA AGG)],    # Arginine
        S => [qw(TCA TCC TCG TCT AGC AGT)],    # Serine
        T => [qw(ACA ACC ACG ACT)],            # Threonine
        V => [qw(GTA GTC GTG GTT)],            # Valine
        W => [qw(TGG)],                        # Tryptophan
        Y => [qw(TAC TAT)],                    # Tyrosine
        U => [qw(TAA TAG TGA)],                # Stop
    );

    # Invert the list into a codon => residue lookup.
    my %residue_for;
    while (my ($residue, $codons) = splice(@codons_by_residue, 0, 2)) {
        $residue_for{$_} = $residue for @{$codons};
    }
    return \%residue_for;
}
# usage()
#
# Prints the command-line help text to STDERR and terminates the program with
# exit status 1. $0 (the script name) is interpolated into the text.
sub usage {
    print STDERR <<"EOF";
===========================================================
Usage:
perl $0 [OPTIONS]
Options:
--infile, -i input fasta
--outfile, -o output [default]
--alignLeng, -l out sequence length
--caseType, -c caseType (Uper, lower, NULL)
--cds2pep convert fasta sequence to pep
--version print version
--help, -h, -? print this help information
===========================================================
EOF
    exit 1;
}
# info()
#
# Prints author / version information to STDOUT and terminates the program
# with exit status 0.
#
# The heredoc is single-quoted on purpose: in an interpolating heredoc Perl
# would treat "@126" in the e-mail address as an array and print the address
# without it.
sub info {
    print STDOUT <<'EOF';
Author : Liu P
Date : 2017-12-04
Version : 0.1
Email : sxliulian2009@126.com
EOF
    exit 0;
}
__END__
还可以根据需要加点小功能。
网友评论