介绍:使用脚本select_candidate_genes.pl提取selective sweep 区间,区间中的基因,以及对应拟南芥基因组中的共线性基因和注释。
输入文件(Turniprape_Oilseed.fst.txt.zscore):
Chr Pos zFst Fst Pop
A01 100000 -0.806170053084979 0.047348880271623 TurniprapevsOilseed
A01 105000 -0.778921829159115 0.0488180565924054 TurniprapevsOilseed
A01 110000 -0.75681745703367 0.0500098854690626 TurniprapevsOilseed
输出结果(Genes.in.sweep.regions.Lst.xls)
SweepRegion CFid Chr:start-end Atid Atinfo
A01:5070000-5395000 BraA01g009860.3.1C A01:5076463-5078857 AT4G18470 negative regulator of systemic acquired resistance (SNI1)
A01:5070000-5395000 BraA01g009870.3.1C A01:5079478-5081637 NA NA
A01:5070000-5395000 BraA01g009880.3.1C A01:5087486-5088665 AT4G18460 D-Tyr-tRNA(Tyr) deacylase family protein
select_candidate_genes.pl 如下:
#!/usr/bin/perl -w
use strict;
my $in0 = $ARGV[0]; ##- Turniprape_Oilseed.fst.txt.zscore
my $in1 = $ARGV[1]; ##- Chiifu.chrs.sizes
my $in2 = $ARGV[2]; ##- Chiifu.coords
my $in3 = $ARGV[3]; ##- Chiifu_TAIR10.SynOrths.gz
my $in4 = $ARGV[4]; ##- TAIR10_functional_descriptions.txt.sim
my $winSizes = 200000;
my %chrid2Len = ();
&readChrLen($in1, \%chrid2Len);
my $mergedBed = "selection_sweep.bed.xls";
&readOurliers($in0, \%chrid2Len, $mergedBed);
my $out = "Genes.in.sweep.regions.Lst.xls";
&readCFgene($mergedBed, $in2, $in3, $in4, $out);
sub readCFgene {
my ($bedFile, $geneCoords, $SynOrth, $Atinfo, $output) = @_;
my %coord2Gene = ();
&readCoords($geneCoords, \%coord2Gene);
my %cfid2At = ();
&CF2At($in3, \%cfid2At);
my %atindex = ();
&readAtinfo($Atinfo, \%atindex);
open IN7, $bedFile;
open OUTX, ">$output";
print OUTX join("\t", "SweepRegion", "CFid", "Chr:start-end", "Atid", "Atinfo"), "\n";
while(<IN7>) {
chomp;
my @temp = split(/\t/, $_);
for my $gS(sort {$a<=>$b} keys %{$coord2Gene{$temp[0]}}){
for my $gE(sort {$a<=>$b} keys %{$coord2Gene{$temp[0]}{$gS}}){
if($gS >= $temp[1] && $gE <= $temp[2]){
my $cfid = $coord2Gene{$temp[0]}{$gS}{$gE};
my $Atid = 'NA';
my $Atinfo = 'NA';
$Atid = $cfid2At{$cfid}, if(exists $cfid2At{$cfid});
$Atinfo = $atindex{$Atid}, if(exists $atindex{$Atid});
print OUTX join("\t", "$temp[0]:$temp[1]-$temp[2]", $cfid, "$temp[0]:$gS-$gE", $Atid, $Atinfo), "\n";
}
}
}
}
close OUTX;
close IN7;
my %cfgeneIndex = ();
my %atgeneIndex = ();
open IN8, $output;
<IN8>;
while(<IN8>){
chomp;
my @temp = split(/\t/, $_);
if($temp[1] ne "NA"){
$cfgeneIndex{$temp[1]} = "Y";
}
if($temp[3] ne "NA"){
$atgeneIndex{$temp[3]} = "Y";
}
}
close IN8;
print "Number of genes (Chiifu): ", scalar(keys %cfgeneIndex), "\n";
print "Number of orthologs in A. thaliana: ", scalar(keys %atgeneIndex), "\n";
}
sub readAtinfo {
my ($tair, $atindex) = @_;
open IN6, $tair;
while(<IN6>){
chomp;
my @temp = split(/\t/, $_);
$atindex ->{$temp[0]} = $temp[1];
}
close IN6;
}
sub CF2At {
my ($synOrth, $cfid2at) = @_;
open IN5, "gzip -dc $synOrth | ";
while(<IN5>){
chomp;
my @temp = split(/\t/, $_);
$cfid2at ->{$temp[0]} = $temp[4] ;
}
close IN5;
}
sub readCoords {
my ($geneCoords, $coordindex) = @_;
open IN4, $geneCoords;
while(<IN4>){
chomp;
my @temp = split(/\t/, $_);
$coordindex ->{$temp[1]} ->{$temp[2]} ->{$temp[3]} = $temp[0];
}
close IN4;
}
sub readOurliers {
my ($Fst, $chr2Len, $mergedBed) = @_;
open IN0, $Fst;
open OUT0, ">$mergedBed.tmp";
<IN0>;
while(<IN0>){
chomp;
my @temp = split(/\t/, $_);
if($temp[2] >= 2.33){
my $start = $temp[1] - $winSizes/2;
my $end = $temp[1] + $winSizes/2;
if($start < 0){
$start = 0;
}
if($end > ($chr2Len ->{$temp[0]})){
$end = $chr2Len ->{$temp[0]};
}
print OUT0 join("\t", $temp[0], $start, $end), "\n";
}
}
close IN0;
close OUT0;
system("sort -k1,1 -k2,2n -k3,3n $mergedBed.tmp > $mergedBed.tmp.sorted");
system("bedtools merge -i $mergedBed.tmp.sorted > $mergedBed");
system("rm $mergedBed.tmp $mergedBed.tmp.sorted");
##- stat
my $totalLen = 0;
open IN2, $mergedBed;
while(<IN2>){
chomp;
my @temp = split(/\t/, $_);
$totalLen += ($temp[2]-$temp[1]+1);
}
close IN2;
print "The total length of selective sweep regions: ", $totalLen, " bp", "\n";
}
sub readChrLen {
my ($chrfile, $chrLen) = @_;
open INX, $chrfile;
while(<INX>){
chomp;
my @temp = split(/\t/, $_);
$chrLen ->{$temp[0]} = $temp[1];
}
close INX;
}
网友评论