查看下后台运行的程序情况:
(base) root@1100150:~# ll ~/project/2.gatk/|grep '_bqsr.bam'|wc
231 2079 12613
(base) root@1100150:~# ll ~/project/0.bwa/|grep 'ok'|wc
250 2250 17146
已经成功转换的bam文件有250个,其中231个已经完成碱基质量值矫正,
下一步则是HaplotypeCaller,参考之前的练习https://www.jianshu.com/p/6ae5af0794a7
还是先写脚本calling.sh
:
## start the gatk analysis
GATK=~/biosoft/gatk-4.1.7.0/gatk
#references
ref=~/reference/genome/Homo_sapiens_assembly38.fasta
cat config | while read id
do
if [ ! -f ~/project/2.gatk/gvcf/${id}.HC.g.vcf.gz]
then
time $samtools index ~/project/2.gatk/${id}_bqsr.bam
echo "** ${id}.sorted.marked.fixed.bqsr.bam index done **"
time $GATK --java-options "-Xmx20G -Djava.io.tmpdir=./tmp" HaplotypeCaller \
--emit-ref-confidence GVCF \
-R $ref \
-I ~/project/2.gatk/${id}_bqsr.bam \
-O ~/project/2.gatk/gvcf/${id}.HC.g.vcf.gz \
1>~/project/2.gatk/${id}_log.HC 2>&1
echo "** GVCF ${id}.HC.g.vcf.gz done **"
fi
done
同样nohup bash calling.sh &
提交到后台。
下一步则是分步call SNP,Indel:脚本预计内容如下
# VQSR
# first SNP mode 分别评估SNP和INDEL突变位点的质量
# SNP mode
samtools=samtools
GATK=~/biosoft/gatk-4.1.7.0/gatk
#references
ref=~/reference/genome/Homo_sapiens_assembly38.fasta
gatk_ref=~/reference/genome/Homo_sapiens_assembly38.fasta
gatk_bundle=~/annotation/variation/GATK
dbsnp=$gatk_bundle/dbsnp_146.hg38.vcf.gz
indel=$gatk_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
G1000=$gatk_bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz
hapmap=$gatk_bundle/hapmap_3.3.hg38.vcf.gz
omini=$gatk_bundle/1000G_omni2.5.hg38.vcf.gz
mills=$gatk_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
cat config | while read id
do
if [ ! -f ~/project/2.gatk/gvcf/${id}.HC.snps.recal]
then
time $GATK VariantRecalibrator \
-R $ref \
-V ~/project/2.gatk/gvcf/${id}.HC.g.vcf.gz \
--max-gaussians 4 \
-resource:hapmap,known=false,training=true,truth=true,prior=15.0 $hapmap \
-resource:omini,known=false,training=true,truth=false,prior=12.0 $omini \
-resource:1000G,known=false,training=true,truth=false,prior=10.0 $G1000 \
-resource:snp,known=true,training=false,truth=false,prior=10.0 $dbsnp \
-an DP -an QD -an SOR -an ReadPosRankSum -an MQRankSum \
-mode SNP \
--rscript-file ~/project/2.gatk/gvcf/${id}.HC.snps.plots.R \
--tranches-file ~/project/2.gatk/gvcf/${id}.HC.snps.tranches \
-O ~/project/2.gatk/gvcf/${id}.HC.snps.recal
time $GATK ApplyVQSR \
-R $ref \
-V ~/project/2.gatk/gvcf/${id}.HC.g.vcf.gz \
--truth-sensitivity-filter-level 99.0 \
--tranches-file ~/project/2.gatk/gvcf/${id}.HC.snps.tranches \
--recal-file ~/project/2.gatk/gvcf/${id}.HC.snps.recal \
-mode SNP \
-O ~/project/2.gatk/gvcf/${id}.HC.snps.VQSR.vcf.gz && echo "** SNPs VQSR done **"
## Indel mode
time $GATK VariantRecalibrator \
-R $ref \
-V ~/project/2.gatk/gvcf/${id}.HC.g.vcf.gz \
--max-gaussians 6 \
-resource:mills,known=false,training=true,truth=true,prior=15.0 $mills \
-an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \
-mode INDEL \
--rscript-file ~/project/2.gatk/gvcf/${id}.HC.snps.indels.plots.R \
--tranches-file ~/project/2.gatk/gvcf/${id}.HC.snps.indels.tranches \
-O ~/project/2.gatk/gvcf/${id}.HC.snps.indels.recal
time $GATK ApplyVQSR \
-R $ref \
-V ~/project/2.gatk/gvcf/${id}.HC.snps.VQSR.vcf.gz \
--truth-sensitivity-filter-level 99.0 \
--tranches-file ~/project/2.gatk/gvcf/${id}.HC.snps.indels.tranches \
--recal-file ~/project/2.gatk/gvcf/${id}.HC.snps.indels.recal \
-mode INDEL \
-O ~/project/2.gatk/gvcf/${id}.HC.snps.indels.VQSR.vcf.gz && echo "** SNPs and Indels VQSR $sample done **"
fi
done
不知道需要运行多久,先不提交,先看前一步的结果如何。
先记录下,这个时间2020-07-17 21:19分时的结果:
(base) root@1100150:~# ll ~/project/0.bwa/|grep 'ok'|wc
252 2268 17284
(base) root@1100150:~# ll ~/project/2.gatk/|grep '_bqsr.bam'|wc
231 2079 12613
(base) root@1100150:~# ll ~/project/2.gatk/gvcf/
total 44
drwxr-xr-x 2 root root 2 Jul 10 00:43 ./
drwxr-xr-x 3 root root 1725 Jul 17 12:22 ../
2020-07-19更新查看样本情况
(wes) root@1100150:~/project# l ~/project/2.gatk/gvcf/
(wes) root@1100150:~/project# ll ~/project/2.gatk/|grep '_bqsr.bam'|wc
255 2295 13931
(wes) root@1100150:~/project# ll ~/project/0.bwa/|grep 'ok'|wc
461 4149 31681
(wes) root@1100150:~/project# date
Sun Jul 19 11:23:23 UTC 2020
网友评论