美文网首页生物信息学习SNP
生物信息必备技能 -- 查找 SNP 突变后的氨基酸

生物信息必备技能 -- 查找 SNP 突变后的氨基酸

作者: 正踪大米饭儿 | 来源:发表于2017-02-28 11:55 被阅读361次

    生信虐我系列之--几个必备小程序:
    这个程序可以根据提供的 SNP位点信息,以及参考注释信息对 SNP 位点突变前后的 氨基酸 进行比较。

    ** 修改了部分代码,重新设计了 SNP 突变位点落在反向转录本上的情况**
    ** 重新设计了结果的输出格式**
    ** 欢迎积极提出设计上的不足**

    输入文件格式:
    1. 参考注释的 GTF 文件;
    2. 参考注释的基因组 fasta 文件;
    3. 输入的 SNP 列表文件:
      1       49527   A       G       Exon:Zm00001d027230;
      1       51053   A       T       Exon:Zm00001d027231;
      1       53013   A       G       Exon:Zm00001d027231;
      1       53109   C       T       Exon:Zm00001d027231;
    第一列: 染色体
    第二列:突变位置
    第三列:SNP 参考的碱基
    第四列:SNP 位点突变后的碱基
    第五列:[可选],SNP 的信息
    
    

    使用方法:

    perl script.pl zma.gtf zma.fa in.txt >result.txt
    

    程序源码如下:(遵循 GPL v3 Licenses

    #!/usr/bin/perl -w
    use strict;
    die "Usage: perl $0 <zma.GTF> <zma.FA> <in.TXT> >Result.txt \n" if @ARGV < 3;
    
    &timing("start")
    my (%tr, %cds, %seq);
    
    open IN, $ARGV[0] or die $!;
    while (<IN>){
        chomp;
        next if /^#/;
        my @a = split /\t/,$_;
        if ($a[2] eq "transcript"){
    #        (my $gene) = $a[8] =~ /gene_id\s"(\S+)";/;
            (my $id) = $a[8] =~ /transcript_id\s"(\S+)";/;
    
    #        $tr{$a[0]}{$gene}{$id} = [@a[3,4,6]];
            $tr{$a[0]}{$id} = [@a[3,4,6]];
        }
        if ($a[2] eq "CDS"){
    #        (my $gene) = $a[8] =~ /gene_id\s"(\S+)";/;
            (my $id) = $a[8] =~ /transcript_id\s"(\S+)";/;
            next unless exists $tr{$a[0]}{$id};
            push @{$cds{$a[0]}{$id}},[@a[3,4,7]];
        }
    }
    close IN;
    
    # make fasta hash
    
    open FA,$ARGV[1] or die $!;
    $/ = ">"; <FA>;
    while(<FA>){
        chomp;
        my ($info,$fa) = split /\n/,$_,2;
        my ($chr,$tmp) = split /\s/,$info,2;
        $fa =~ s/\n+//g;
        $seq{$chr} = $fa;
    }
    $/ = "\n";
    close FA;
    
    # deal with input 
    open IN,$ARGV[2] or die $!;
    print "# Chrom:Loci:Ref->Alt\tTranscript\tStrand\tCDS(Ref->Alt)\tRef_Codon\tAlt_Codon\tRef_aa\tAlt_aa\tAlt_Pos\tCDS_Pos\n";
    while (<IN>){
        chomp;
        my ($chr,$loci,$ref,$alt,$tmp) = split /\t/,$_;
        my ($strand,$j,$tr_id);
        my @arr;
        my $flag = 0;
    
        if ( exists $tr{$chr} ){
            foreach my $k (sort keys $tr{$chr}){
                if ($loci >= $tr{$chr}{$k}->[0] && $loci <= $tr{$chr}{$k}->[1]){
                    $flag = 1;
                    $strand = $tr{$chr}{$k}->[2];
                    @arr = sort{$a->[0] <=> $b->[0] || $a->[1] <=> $b->[1]} @{$cds{$chr}{$k}};
                    for (my $i=0;$i<@arr;$i++){
                        if ($loci >= $arr[$i]->[0] && $loci <= $arr[$i]->[1]){
                            $flag = 2;
                            $j = $i;
                            $tr_id = $k;
                            last;
                        }
                    }
                }
            }
        }
    if ($flag == 2 ){
            my ($pos, $seq, $site, $ref_codon, $alt_codon, $ref_aa, $alt_aa);
            my ($ref_nucl, $alt_nucl, $alt_pos);
            my $c = &code();
    
            if ($strand eq "+"){
                foreach (0..($j-1)){
                    $pos += $arr[$_]->[1] - $arr[$_]->[0] + 1;
                }
                $pos += $loci - $arr[$j]->[0] + 1;
                foreach my $v (@arr){
                    my ($x,$y) = @$v[0,1];
                    $seq .= uc(substr($seq{$chr},$x-1,$y-$x+1));
                }
                $ref_nucl = substr($seq,$pos-1,1);
                $alt_nucl = $alt;
    
                $site = $pos % 3 ;
                if($site == 1){
                    $ref_codon = uc(substr($seq,$pos - 1,3));
                    $alt_codon = &ref2alt($ref_codon,1,$alt_nucl);
                    $alt_pos = 1;
                }elsif($site == 2){
                    $ref_codon = uc(substr($seq,$pos - 2,3));
                    $alt_codon = &ref2alt($ref_codon,2,$alt_nucl);
                    $alt_pos = 2;
                }elsif($site == 0){
                    $ref_codon = uc(substr($seq,$pos - 3,3));
                    $alt_codon = &ref2alt($ref_codon,3,$alt_nucl);
                    $alt_pos = 3;
                }
            }
            if ($strand eq "-"){
                my $pos1;
                foreach (0..($j-1)){
                    $pos1 += $arr[$_]->[1] - $arr[$_]->[0] + 1;
                }
                $pos1 += $loci - $arr[$j]->[0] + 1;
                foreach my $v (@arr){
                    my ($x,$y) = @$v[0,1];
                    $seq .= uc(substr($seq{$chr},$x-1,$y-$x+1));
                }
                my $len = length $seq;
                $pos = $len - $pos1 + 1;
                $seq = reverse $seq;
                $seq =~ tr/AaGgCcTt/TtCcGgAa/;
     
                $ref_nucl = substr($seq,$pos-1,1);
                $alt_nucl = $alt;
                $alt_nucl =~ tr/AaGgCcTt/TtCcGgAa/;
                
                $site = $pos % 3;
                if($site == 1){
                    $ref_codon = uc(substr($seq,$pos - 1,3));
                    $alt_codon = &ref2alt($ref_codon,1,$alt_nucl);
                    $alt_pos = 1;
                }elsif($site == 2){
                    $ref_codon = uc(substr($seq,$pos - 2,3));
                    $alt_codon = &ref2alt($ref_codon,2,$alt_nucl);
                    $alt_pos = 2;
                }elsif($site == 0){
                    $ref_codon = uc(substr($seq,$pos - 3,3));
                    $alt_codon = &ref2alt($ref_codon,3,$alt_nucl);
                    $alt_pos = 3;
                }
            }
            $ref_aa = exists $c->{"standard"}{$ref_codon} ? $c->{"standard"}{$ref_codon} : "-";
            $alt_aa = exists $c->{"standard"}{$alt_codon} ? $c->{"standard"}{$alt_codon} : "-";
            print "$chr:$loci:$ref->$alt\t$tr_id\t$strand\t$ref_nucl->$alt_nucl\t$ref_codon\t$alt_codon\t$ref_aa\t$alt_aa\t$alt_pos\tcds_$pos\n"; 
        }
    }
    close IN;
    
    &timing("end");
    
    # time
    sub timing{
        my $state=shift;
        my $time=strftime("%Y-%m-%d.%H:%M:%S",localtime);
        print STDOUT "## ==== >> Start at $time\n" if $state eq "start";
        print STDOUT "## ==== >> End at $time\n" if $state eq "end";
    }
    # ref2alt
    sub ref2alt{
        my ($codon, $pos, $alt) = @_;
        my ($c1, $c2, $c3) = split //,$codon;
        my $out;
        
        if ($pos == 1){
            $out = join ("",$alt,$c2,$c3);
        }elsif ($pos == 2){
            $out = join ("",$c1,$alt,$c3);
        }else{
            $out = join ("",$c1,$c2,$alt);
        }
        return $out;
    }
    
    # code
    sub code{
        my $p = {
            "standard" =>
            {       
                'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A',                               # Alanine
                'TGC' => 'C', 'TGT' => 'C',                                                           # Cysteine
                'GAC' => 'D', 'GAT' => 'D',                                                           # Aspartic Aci
                'GAA' => 'E', 'GAG' => 'E',                                                           # Glutamic Aci
                'TTC' => 'F', 'TTT' => 'F',                                                           # Phenylalanin
                'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',                               # Glycine
                'CAC' => 'H', 'CAT' => 'H',                                                           # Histidine
                'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I',                                             # Isoleucine
                'AAA' => 'K', 'AAG' => 'K',                                                           # Lysine
                'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L', 'TTA' => 'L', 'TTG' => 'L',   # Leucine
                'ATG' => 'M',                                                                         # Methionine
                'AAC' => 'N', 'AAT' => 'N',                                                           # Asparagine
                'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',                               # Proline
                'CAA' => 'Q', 'CAG' => 'Q',                                                           # Glutamine
                'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R', 'AGA' => 'R', 'AGG' => 'R',   # Arginine
                'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S', 'AGC' => 'S', 'AGT' => 'S',   # Serine
                'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T',                               # Threonine
                'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V',                               # Valine
                'TGG' => 'W',                                                                         # Tryptophan
                'TAC' => 'Y', 'TAT' => 'Y',                                                           # Tyrosine
                'TAA' => 'U', 'TAG' => 'U', 'TGA' => 'U'                                              # Stop
            }
        }
    }
    
    
    __END__
    
    1       49527   A       G       Exon:Zm00001d027230;
    1       51053   A       T       Exon:Zm00001d027231;
    1       53013   A       G       Exon:Zm00001d027231;
    1       53109   C       T       Exon:Zm00001d027231;
    1       53826   T       C       Exon:Zm00001d027231;
    1       53885   T       C       Exon:Zm00001d027231;
    
    

    本文原创,转载请注明出处!
    欢迎留言讨论!

    相关文章

      网友评论

        本文标题:生物信息必备技能 -- 查找 SNP 突变后的氨基酸

        本文链接:https://www.haomeiwen.com/subject/pxkxgttx.html