给小鼠的GATK4流程
下载必备文件
一般来说,可以选择最新版小鼠参考基因组(mm10)了,如果你实在有其它需求,也可以自行选择其它版本。
遗憾的是GATK官网不提供小鼠相关资源:ftp://ftp.broadinstitute.org/bundle
所以只能去illumina官网下载咯: https://support.illumina.com/sequencing/sequencing_software/igenome.html
mkdir -p ~/biosoft/GATK/resources/bundle/mm10
cd ~/biosoft/GATK/resources/bundle/mm10
wget ftp://igenome:G3nom3s4u@ussd-ftp.illumina.com/Mus_musculus/UCSC/mm10/Mus_musculus_UCSC_mm10.tar.gz
nohup tar zxvf Mus_musculus_UCSC_mm10.tar.gz &
下载全部成功后,应该如下:
如果有要做 RealignerTargetCreator 需要下载dbsnp文件,参考: https://www.biostars.org/p/182917/
质控
source activate wes
bin_trim_galore="trim_galore"
mkdir -p clean_fastq
cat $config_file |while read id
do
arr=($id)
fq1=${arr[1]}
fq2=${arr[2]}
sample=${arr[0]}
file=$(basename $fq1 )
fq1_base=${file%%.*}
file=$(basename $fq2 )
fq2_base=${file%%.*}
if((i%$number1==$number2))
then
ls $analysis_dir/clean_fastq/${fq2_base}_val_2.fq.gz
if [ ! -f $analysis_dir/clean_fastq/${fq2_base}_val_2.fq.gz ]; then
echo "start trim_galore for $sample" `date`
$bin_trim_galore -q 25 --phred33 --length 50 -e 0.1 --stringency 3 --paired -o $analysis_dir/clean_fastq $fq1 $fq2
echo "end trim_galore for $sample" `date`
fi
fi
i=$((i+1))
done
source deactivate
比对
module load java/1.8.0_91
GENOME=/home/jianmingzeng/biosoft/GATK/resources/bundle/mm10/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa
INDEX=/home/jianmingzeng/biosoft/GATK/resources/bundle/mm10/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/genome.fa
GATK=/home/jianmingzeng/biosoft/GATK/gatk-4.0.2.1/gatk
#####################################################
################ Step 1 : Alignment #################
#####################################################
start=$(date +%s.%N)
echo bwa `date`
bwa mem -t 5 -M -R "@RG\tID:$sample\tSM:$sample\tLB:WES\tPL:Illumina" $INDEX $fq1 $fq2 1>$sample.sam
echo bwa `date`
dur=$(echo "$(date +%s.%N) - $start" | bc)
printf "Execution time for BWA : %.6f seconds" $dur
echo
#####################################################
################ Step 2: Sort and Index #############
#####################################################
start=$(date +%s.%N)
echo SortSam `date`
$GATK --java-options "-Xmx25G -Djava.io.tmpdir=./" SortSam \
-SO coordinate -I $sample.sam -O $sample.bam #1>log.sort 2>&1
samtools index $sample.bam
echo SortSam `date`
dur=$(echo "$(date +%s.%N) - $start" | bc)
printf "Execution time for SortSam : %.6f seconds" $dur
echo
rm $sample.sam
#####################################################
################ Step 3: Basic Statistics ###########
#####################################################
start=$(date +%s.%N)
echo stats `date`
samtools flagstat $sample.bam > ${sample}.alignment.flagstat
samtools stats $sample.bam > ${sample}.alignment.stat
echo plot-bamstats -p ${sample}_QC ${sample}.alignment.stat
echo stats `date`
dur=$(echo "$(date +%s.%N) - $start" | bc)
printf "Execution time for Basic Statistics : %.6f seconds" $dur
echo
#####################################################
####### Step 4: multiple filtering for bam files ####
#####################################################
###MarkDuplicates###
start=$(date +%s.%N)
echo MarkDuplicates `date`
$GATK --java-options "-Xmx25G -Djava.io.tmpdir=./" MarkDuplicates \
-I $sample.bam -O ${sample}_marked.bam -M $sample.metrics --REMOVE_DUPLICATES true
echo MarkDuplicates `date`
dur=$(echo "$(date +%s.%N) - $start" | bc)
printf "Execution time for MarkDuplicates : %.6f seconds" $dur
echo
rm $sample.bam
###FixMateInfo###
start=$(date +%s.%N)
echo FixMateInfo `date`
$GATK --java-options "-Xmx25G -Djava.io.tmpdir=./" FixMateInformation \
-I ${sample}_marked.bam -O ${sample}_marked_fixed.bam -SO coordinate
samtools index ${sample}_marked_fixed.bam
echo FixMateInfo `date`
dur=$(echo "$(date +%s.%N) - $start" | bc)
printf "Execution time for FixMateInfo : %.6f seconds" $dur
echo
rm ${sample}_marked.bam
网友评论