美文网首页
提取selective sweep区间以及基因信息

提取selective sweep区间以及基因信息

作者: pumpkinC | 来源:发表于2022-04-30 09:07 被阅读0次

    介绍:使用脚本select_candidate_genes.pl提取selective sweep 区间,区间中的基因,以及对应拟南芥基因组中的共线性基因和注释。

    输入文件(Turniprape_Oilseed.fst.txt.zscore):

    Chr Pos zFst    Fst Pop
    A01 100000  -0.806170053084979  0.047348880271623   TurniprapevsOilseed
    A01 105000  -0.778921829159115  0.0488180565924054  TurniprapevsOilseed
    A01 110000  -0.75681745703367   0.0500098854690626  TurniprapevsOilseed
    

    输出结果(Genes.in.sweep.regions.Lst.xls)

    SweepRegion CFid    Chr:start-end   Atid    Atinfo
    A01:5070000-5395000 BraA01g009860.3.1C  A01:5076463-5078857 AT4G18470   negative regulator of systemic acquired resistance (SNI1)
    A01:5070000-5395000 BraA01g009870.3.1C  A01:5079478-5081637 NA  NA
    A01:5070000-5395000 BraA01g009880.3.1C  A01:5087486-5088665 AT4G18460   D-Tyr-tRNA(Tyr) deacylase family protein
    

    select_candidate_genes.pl 如下:

    #!/usr/bin/perl -w
    use strict;
    
    my $in0 = $ARGV[0]; ##- Turniprape_Oilseed.fst.txt.zscore
    my $in1 = $ARGV[1]; ##- Chiifu.chrs.sizes
    my $in2 = $ARGV[2]; ##- Chiifu.coords
    my $in3 = $ARGV[3]; ##- Chiifu_TAIR10.SynOrths.gz
    my $in4 = $ARGV[4]; ##- TAIR10_functional_descriptions.txt.sim
    
    my $winSizes = 200000;
    
    my %chrid2Len = ();
       &readChrLen($in1, \%chrid2Len);
    
    my $mergedBed = "selection_sweep.bed.xls";
       &readOurliers($in0, \%chrid2Len, $mergedBed);
    
    my $out = "Genes.in.sweep.regions.Lst.xls";
       &readCFgene($mergedBed, $in2, $in3, $in4, $out);
     
    
    sub readCFgene {
    
        my ($bedFile, $geneCoords, $SynOrth, $Atinfo, $output) = @_;
    
    
        my %coord2Gene = ();
           &readCoords($geneCoords, \%coord2Gene);
    
        my %cfid2At = ();
           &CF2At($in3, \%cfid2At);
       
        my %atindex = ();
           &readAtinfo($Atinfo, \%atindex);
    
        open IN7, $bedFile;
        open OUTX, ">$output";  
        print OUTX join("\t", "SweepRegion", "CFid", "Chr:start-end", "Atid", "Atinfo"), "\n";
        while(<IN7>) {
          chomp;
          my @temp = split(/\t/, $_);
          for my $gS(sort {$a<=>$b}  keys %{$coord2Gene{$temp[0]}}){
              for my $gE(sort {$a<=>$b} keys %{$coord2Gene{$temp[0]}{$gS}}){
                  if($gS >= $temp[1] && $gE <= $temp[2]){
                     my $cfid = $coord2Gene{$temp[0]}{$gS}{$gE};
                     my $Atid = 'NA';
                     my $Atinfo = 'NA';
                        $Atid = $cfid2At{$cfid}, if(exists $cfid2At{$cfid});
                        $Atinfo = $atindex{$Atid}, if(exists $atindex{$Atid});
                        print OUTX join("\t", "$temp[0]:$temp[1]-$temp[2]", $cfid, "$temp[0]:$gS-$gE", $Atid, $Atinfo), "\n";
                  }  
              }
          }
        }
        close OUTX;
        close IN7;
    
        my %cfgeneIndex = ();
        my %atgeneIndex = ();
    
        open IN8, $output;
        <IN8>;
        while(<IN8>){
          chomp; 
          my @temp = split(/\t/, $_);
          if($temp[1] ne "NA"){
             $cfgeneIndex{$temp[1]} = "Y";
          }
          if($temp[3] ne "NA"){
             $atgeneIndex{$temp[3]} = "Y";
          }
        }
        close IN8;
        
        print "Number of genes (Chiifu): ", scalar(keys %cfgeneIndex), "\n"; 
        print "Number of orthologs in A. thaliana: ",  scalar(keys %atgeneIndex), "\n";
    
    
    }
    
    sub readAtinfo {
    
        my ($tair, $atindex) = @_;
    
        open IN6, $tair;
        while(<IN6>){
          chomp;
          my @temp = split(/\t/, $_);
          $atindex ->{$temp[0]} = $temp[1];
        }
        close IN6;
    
    }
    
    sub CF2At {
    
        my ($synOrth, $cfid2at) = @_;
    
        open IN5, "gzip -dc $synOrth | ";
        while(<IN5>){
          chomp;
          my @temp = split(/\t/, $_);
          $cfid2at ->{$temp[0]} = $temp[4] ; 
        }
        close IN5;
    
    }
    
    
    sub readCoords {
    
        my ($geneCoords, $coordindex) = @_;
    
        open IN4, $geneCoords;
        while(<IN4>){
          chomp;
          my @temp = split(/\t/, $_);
          $coordindex ->{$temp[1]} ->{$temp[2]} ->{$temp[3]} = $temp[0];
        }
        close IN4;
    
    }
    
    sub readOurliers {
    
        my ($Fst, $chr2Len, $mergedBed) = @_;
    
        open IN0, $Fst;
        open OUT0, ">$mergedBed.tmp";
        <IN0>;
        while(<IN0>){
          chomp;
          my @temp = split(/\t/, $_);
          if($temp[2] >= 2.33){
             my $start = $temp[1] - $winSizes/2;
             my $end   = $temp[1] + $winSizes/2;
               
             if($start < 0){
                $start = 0;
             }
             if($end > ($chr2Len ->{$temp[0]})){
                $end = $chr2Len ->{$temp[0]};
             }
             print OUT0 join("\t", $temp[0], $start, $end), "\n";
          }
    
        }
        close IN0;
        close OUT0;
    
        system("sort -k1,1 -k2,2n -k3,3n $mergedBed.tmp > $mergedBed.tmp.sorted");
        system("bedtools merge -i  $mergedBed.tmp.sorted  > $mergedBed"); 
        system("rm $mergedBed.tmp  $mergedBed.tmp.sorted");
    
        ##- stat
        my $totalLen = 0;
        open IN2, $mergedBed;
        while(<IN2>){
          chomp;
          my @temp = split(/\t/, $_);
          $totalLen += ($temp[2]-$temp[1]+1);
        }
        close IN2;  
        print "The total length of selective sweep regions: ", $totalLen, " bp", "\n"; 
    
    }
    
    sub readChrLen {
    
        my ($chrfile, $chrLen) = @_;
    
        open INX, $chrfile;
        while(<INX>){
          chomp;
          my @temp = split(/\t/, $_);
          $chrLen ->{$temp[0]} = $temp[1];
        }
        close INX;
    
    }
    

    相关文章

      网友评论

          本文标题:提取selective sweep区间以及基因信息

          本文链接:https://www.haomeiwen.com/subject/czpyyrtx.html