美文网首页
提取selective sweep区间以及基因信息

提取selective sweep区间以及基因信息

作者: pumpkinC | 来源:发表于2022-04-30 09:07 被阅读0次

介绍:使用脚本select_candidate_genes.pl提取selective sweep 区间,区间中的基因,以及对应拟南芥基因组中的共线性基因和注释。

输入文件(Turniprape_Oilseed.fst.txt.zscore):

Chr Pos zFst    Fst Pop
A01 100000  -0.806170053084979  0.047348880271623   TurniprapevsOilseed
A01 105000  -0.778921829159115  0.0488180565924054  TurniprapevsOilseed
A01 110000  -0.75681745703367   0.0500098854690626  TurniprapevsOilseed

输出结果(Genes.in.sweep.regions.Lst.xls)

SweepRegion CFid    Chr:start-end   Atid    Atinfo
A01:5070000-5395000 BraA01g009860.3.1C  A01:5076463-5078857 AT4G18470   negative regulator of systemic acquired resistance (SNI1)
A01:5070000-5395000 BraA01g009870.3.1C  A01:5079478-5081637 NA  NA
A01:5070000-5395000 BraA01g009880.3.1C  A01:5087486-5088665 AT4G18460   D-Tyr-tRNA(Tyr) deacylase family protein

select_candidate_genes.pl 如下:

#!/usr/bin/perl -w
use strict;

my $in0 = $ARGV[0]; ##- Turniprape_Oilseed.fst.txt.zscore
my $in1 = $ARGV[1]; ##- Chiifu.chrs.sizes
my $in2 = $ARGV[2]; ##- Chiifu.coords
my $in3 = $ARGV[3]; ##- Chiifu_TAIR10.SynOrths.gz
my $in4 = $ARGV[4]; ##- TAIR10_functional_descriptions.txt.sim

my $winSizes = 200000;

my %chrid2Len = ();
   &readChrLen($in1, \%chrid2Len);

my $mergedBed = "selection_sweep.bed.xls";
   &readOurliers($in0, \%chrid2Len, $mergedBed);

my $out = "Genes.in.sweep.regions.Lst.xls";
   &readCFgene($mergedBed, $in2, $in3, $in4, $out);
 

sub readCFgene {

    my ($bedFile, $geneCoords, $SynOrth, $Atinfo, $output) = @_;


    my %coord2Gene = ();
       &readCoords($geneCoords, \%coord2Gene);

    my %cfid2At = ();
       &CF2At($in3, \%cfid2At);
   
    my %atindex = ();
       &readAtinfo($Atinfo, \%atindex);

    open IN7, $bedFile;
    open OUTX, ">$output";  
    print OUTX join("\t", "SweepRegion", "CFid", "Chr:start-end", "Atid", "Atinfo"), "\n";
    while(<IN7>) {
      chomp;
      my @temp = split(/\t/, $_);
      for my $gS(sort {$a<=>$b}  keys %{$coord2Gene{$temp[0]}}){
          for my $gE(sort {$a<=>$b} keys %{$coord2Gene{$temp[0]}{$gS}}){
              if($gS >= $temp[1] && $gE <= $temp[2]){
                 my $cfid = $coord2Gene{$temp[0]}{$gS}{$gE};
                 my $Atid = 'NA';
                 my $Atinfo = 'NA';
                    $Atid = $cfid2At{$cfid}, if(exists $cfid2At{$cfid});
                    $Atinfo = $atindex{$Atid}, if(exists $atindex{$Atid});
                    print OUTX join("\t", "$temp[0]:$temp[1]-$temp[2]", $cfid, "$temp[0]:$gS-$gE", $Atid, $Atinfo), "\n";
              }  
          }
      }
    }
    close OUTX;
    close IN7;

    my %cfgeneIndex = ();
    my %atgeneIndex = ();

    open IN8, $output;
    <IN8>;
    while(<IN8>){
      chomp; 
      my @temp = split(/\t/, $_);
      if($temp[1] ne "NA"){
         $cfgeneIndex{$temp[1]} = "Y";
      }
      if($temp[3] ne "NA"){
         $atgeneIndex{$temp[3]} = "Y";
      }
    }
    close IN8;
    
    print "Number of genes (Chiifu): ", scalar(keys %cfgeneIndex), "\n"; 
    print "Number of orthologs in A. thaliana: ",  scalar(keys %atgeneIndex), "\n";


}

sub readAtinfo {

    my ($tair, $atindex) = @_;

    open IN6, $tair;
    while(<IN6>){
      chomp;
      my @temp = split(/\t/, $_);
      $atindex ->{$temp[0]} = $temp[1];
    }
    close IN6;

}

sub CF2At {

    my ($synOrth, $cfid2at) = @_;

    open IN5, "gzip -dc $synOrth | ";
    while(<IN5>){
      chomp;
      my @temp = split(/\t/, $_);
      $cfid2at ->{$temp[0]} = $temp[4] ; 
    }
    close IN5;

}


sub readCoords {

    my ($geneCoords, $coordindex) = @_;

    open IN4, $geneCoords;
    while(<IN4>){
      chomp;
      my @temp = split(/\t/, $_);
      $coordindex ->{$temp[1]} ->{$temp[2]} ->{$temp[3]} = $temp[0];
    }
    close IN4;

}

sub readOurliers {

    my ($Fst, $chr2Len, $mergedBed) = @_;

    open IN0, $Fst;
    open OUT0, ">$mergedBed.tmp";
    <IN0>;
    while(<IN0>){
      chomp;
      my @temp = split(/\t/, $_);
      if($temp[2] >= 2.33){
         my $start = $temp[1] - $winSizes/2;
         my $end   = $temp[1] + $winSizes/2;
           
         if($start < 0){
            $start = 0;
         }
         if($end > ($chr2Len ->{$temp[0]})){
            $end = $chr2Len ->{$temp[0]};
         }
         print OUT0 join("\t", $temp[0], $start, $end), "\n";
      }

    }
    close IN0;
    close OUT0;

    system("sort -k1,1 -k2,2n -k3,3n $mergedBed.tmp > $mergedBed.tmp.sorted");
    system("bedtools merge -i  $mergedBed.tmp.sorted  > $mergedBed"); 
    system("rm $mergedBed.tmp  $mergedBed.tmp.sorted");

    ##- stat
    my $totalLen = 0;
    open IN2, $mergedBed;
    while(<IN2>){
      chomp;
      my @temp = split(/\t/, $_);
      $totalLen += ($temp[2]-$temp[1]+1);
    }
    close IN2;  
    print "The total length of selective sweep regions: ", $totalLen, " bp", "\n"; 

}

sub readChrLen {

    my ($chrfile, $chrLen) = @_;

    open INX, $chrfile;
    while(<INX>){
      chomp;
      my @temp = split(/\t/, $_);
      $chrLen ->{$temp[0]} = $temp[1];
    }
    close INX;

}

相关文章

网友评论

      本文标题:提取selective sweep区间以及基因信息

      本文链接:https://www.haomeiwen.com/subject/czpyyrtx.html