美文网首页
gatk 分析流程 2020-06-02

gatk 分析流程 2020-06-02

作者: 爬山小虎 | 来源:发表于2020-06-08 10:38 被阅读0次
    #!/bin/sh
    
    #PBS -N DL_FQ
    #PBS -l nodes=1:ppn=16
    #PBS -o /home/omics2017/hx_du/test1.log
    #PBS -e /home/omics2017/hx_du/test1.err
    #PBS -q batch
    #PBS -l walltime=120:00:00
    
    wkdir=/bios-store2/hx_du/fq_td/HBB300K_200501/200501/PE
    
    src=/home/omics2017/hx_du/program
    ref=/bios-store2/hx_du/NIPT/reference
    bedfile=/bios-store2/hx_du/HBB300K_200327/test20200331/all_target_segments_covered_by_probes_HBB_v1_hg19_191108105530.bed
    
    cd ${wkdir}
    
    
    samples=$(for i in $(ls *_R1*.fastq.gz);do echo "${i%%_*}";done)
    
    for ID in $samples
    do
        echo $ID
    
    
        if [ -f ${ID}*R2*.fastq.gz ];then
            echo ${ID} "R2 exist"
            java -jar $src/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads 16 $wkdir/${ID}*R1*.fastq.gz $wkdir/${ID}*R2*.fastq.gz ${ID}_1_paired.fq.gz ${ID}_1_unpaired.fq.gz ${ID}_2_paired.fq.gz ${ID}_2_unpaired.fq.gz LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
    
    
            # PE ===========================
            $src/bwa-0.7.17/bwa mem -t 16 -M $ref/genome.fa ${ID}_1_paired.fq.gz ${ID}_2_paired.fq.gz > ${ID}_pe.sam
            $src/bwa-0.7.17/bwa mem -t 16 -M $ref/genome.fa ${ID}_1_unpaired.fq.gz > ${ID}_s1.sam
            $src/bwa-0.7.17/bwa mem -t 16 -M $ref/genome.fa ${ID}_2_unpaired.fq.gz > ${ID}_s2.sam
    
            picard MergeSamFiles \
                TMP_DIR=${wkdir}/tmp \
                INPUT=${ID}_pe.sam \
                INPUT=${ID}_s1.sam \
                INPUT=${ID}_s2.sam \
                OUTPUT=${ID}.bam
            rm ${ID}*.sam
            # #===========================
    
        else
            echo ${ID} "R2 noexist"
    
            java -jar $src/Trimmomatic-0.38/trimmomatic-0.38.jar SE -threads 16 $wkdir/${ID}*R1*.fastq.gz ${ID}.trimmed.fq.gz LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
            $src/bwa-0.7.17/bwa mem -t 16 -M $ref/genome.fa ${ID}.trimmed.fq.gz > ${ID}_s1.sam
            picard MergeSamFiles \
                INPUT=${ID}_s1.sam \
                OUTPUT=${ID}.bam
            rm ${ID}*.sam
        fi
        
        picard AddOrReplaceReadGroups \
            TMP_DIR=${wkdir}/tmp \
            INPUT=${ID}.bam \
            OUTPUT=${ID}.add.bam \
            RGID=group1 RGLB=lib1 RGPL=illumina RGPU=unit1 RGSM=${ID}
    
        picard SortSam \
            TMP_DIR=${wkdir}/tmp \
            INPUT=${ID}.add.bam \
            OUTPUT=${ID}.add.sort.bam \
            SORT_ORDER=coordinate
    
        picard MarkDuplicates \
            INPUT=${ID}.add.sort.bam \
            OUTPUT=${ID}.add.dedup.bam \
            METRICS_FILE=metrics.txt \
            REMOVE_DUPLICATES=true
    
        picard BuildBamIndex \
            TMP_DIR=${wkdir}/tmp \
            INPUT=${ID}.add.dedup.bam
    
        # # Not essential ===========================
        gatk BaseRecalibrator \
            -R $ref/genome.fa \
            -I ${ID}.add.dedup.bam \
            --known-sites $ref/hg19/dbsnp_138.hg19.vcf \
            --known-sites $ref/hg19/1000G_phase1.snps.high_confidence.hg19.sites.vcf \
            -O ${ID}.recal.grp \
            -L $bedfile
    
        gatk ApplyBQSR \
            -R $ref/genome.fa \
            -I ${ID}.add.dedup.bam \
            -bqsr ${ID}.recal.grp \
            -O ${ID}.recal.bam \
            -L $bedfile
        # # ===========================
    
        # # Singel Sample ===========================
        gatk HaplotypeCaller \
            -R $ref/genome.fa \
            -I ${ID}.recal.bam \
            -O ${ID}.raw.g.vcf \
            -ERC GVCF \
            -L $bedfile
        # # =========================================   
    
        # # Multiple Sample ===========================
        gatk GenotypeGVCFs \
            -R $ref/genome.fa \
            -V ${ID}.raw.g.vcf \
            --include-non-variant-sites true \
            -O ${ID}_genotype.vcf \
            -L $bedfile
    
    
    
        gatk SelectVariants \
            -R $ref/genome.fa \
            -V ${ID}_genotype.vcf \
            --select-type-to-include SNP \
            --select-type-to-include NO_VARIATION \
            -O ${ID}_raw_snps.vcf \
            -L $bedfile
        #===========================
    
        gatk VariantFiltration \
            -V ${ID}_raw_snps.vcf \
            --filter-expression "QD < 2.0 || FS > 200.0 || SOR > 10.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 || DP < 10" \
            --filter-name "Filter" \
            -O ${ID}.filter.vcf
    
        cat ${ID}.filter.vcf | grep -v '\./\.' > ${ID}.filter.gt.vcf
    
    
    done
    
    

    相关文章

      网友评论

          本文标题:gatk 分析流程 2020-06-02

          本文链接:https://www.haomeiwen.com/subject/gowyzhtx.html