目的:从测序数据(clean reads) 到样本的GVCF文件。利用bwa,samtools,和GATK 获得每个样本的GVCF文件。
#!/usr/bin/perl -w
use strict;
my $in0 = $ARGV[0]; ##- sam id
my $in1 = $ARGV[1]; ##- reference fasta
my $bwa = "/10t/caix/bin/bwa";
my $samtools = "/120t/caix/src/samtools-1.9/samtools"; ##- samtools 1.9
my $sambamba = "/120t/caix/src/bin/sambamba";
my $gatk = "/120t/caix/src/gatk-4.1.3.0/gatk";
my $threads = 10;
my $readDir = "/120t/caix/work/Bra_resequencing/Mapping_to_TUE_genes/reads"; ##- it depends__
##-make reference index
my $cmdString = "NA";
$cmdString = "$bwa index $in1";
if(not -e $in1.".bwt"){
system("$cmdString") == 0 or die "failed to execute: $cmdString\n";
}
##-map short-reads onto reference
my $left = "$readDir/$in0"."_1.fq.ft.gz";
my $right = "$readDir/$in0"."_2.fq.ft.gz";
die "Error: cannot find: $left", if(not -e $left);
die "Error: cannot find: $right", if(not -e $right);
$cmdString = "$bwa mem -M -t $threads -R '\@RG\\tID:$in0\\tLB:$in0\\tPL:ILLUMINA\\tSM:$in0' $in1 $left $right | $samtools view -@ $threads -Sb -| $samtools sort -@ $threads -o $in0.sorted.bam";
system("$cmdString") == 0 or die "failed to execute: $cmdString\n";
##-remove duplication
$cmdString = "$sambamba markdup -r -t 10 --overflow-list-size 600000 --tmpdir ./$in0.tmp $in0.sorted.bam $in0.sorted.rmdup.bam";
system("$cmdString") == 0 or die "failed to execute: $cmdString\n";
##-bam index
$cmdString = "$samtools index $in0.sorted.rmdup.bam";
system("$cmdString") == 0 or die "failed to execute: $cmdString\n";
##-make gatk4 index
if(not -e $in1.".dict"){
$cmdString = "$gatk CreateSequenceDictionary -R $in1";
system("$cmdString");
}
if(not -e $in1.".fai"){
$cmdString = "$samtools faidx $in1";
system("$cmdString");
}
##-run gatk call GVCF
$cmdString = "$gatk HaplotypeCaller --emit-ref-confidence GVCF -R $in1 -I $in0.sorted.rmdup.bam -O $in0.gvcf.gz 2>/dev/null";
#system("$cmdString");
##-clean
system("rm -rf $in0.sorted.bam ./$in0.tmp ");
网友评论