WES 分析(从下载数据到IGV)

作者: dandanwu90 | 来源:发表于2019-01-27 20:57 被阅读62次

    1. 数据下载

    SRP077971 选取其中三个wes下载

    cat >list.txt
    SRR5943131
    SRR5943132
    SRR5943133
    

    下载sra数据,每个耗时~25min

    cat >download.sh
    cat list.txt |while read id;do (prefetch ${id});done
    nohup bash download.sh &
    
    figure1 sra

    sra转为fastq.gz(每个耗时~30min)

    #--gzip,输出文件为gzip的压缩格式
    #--split-3,将一个样本初始的3文件以配对的形式拆分为_1.fastq,_2.fastq;若只有一个fastq,则忽略拆分,保留一个fastq
    #-O 输出文件
    #1>${id}.sra.log 日志信息保存
    #2>&1 报错信息保存在日志1中
    mkdir sra && cd sra
    mv /home/ncbi/project/sra/SRR* ./
    ls *|while read id; do (fastq-dump --gzip --split-3 -O ${id}) 1>${id}.sra.log 2>&1;done
    
    figure2 fastq

    3. 质控

    3.1 序列质量
    # xargs 将ls的内容用管道符传递给fastqc,
    #-t 为线程数,运行速度更快
    cd ../
    mkdir qc && cd qc
    ls ../sra/*gz|xargs fastqc -t 3 -o ./
    multiqc ./
    
    figure3 qc.png figure4 multiqc
    figure4.1序列中有N figure4.2有overlap,有adaptor
    3.2 Quality control (qc) 每个样本耗时~30min

    出错:输出路径-o ../clean 以及../clean/ ,log日志 1>${sample}.log 2>&1的路径

    #ls *_1.fastq.gz >1将fq1样本名称放在一个文本里
    #ls *_2.fastq.gz >2 将fq2样本名称放在一个文本里
    #paste将同一个样本的fq1,fq2 放在同一行,进行处理
    #-q 25 去除序列中的接头及低质量的序列,默认值为
    #--phred33 质控去除接头时(Sanger/Illumina 1.9+ encoding)以Phred 为ASCII+33质量值进行
    #  --stringency 3 3‘端adaptor重叠为3bp时,对序列进行过滤             
    mkdir raw && cd raw
    mv ../*fastq.gz ./
    ls *_1.fastq.gz >1
    ls *_2.fastq.gz >2
    paste 1 2 >config
    cat >qc.sh
    vim qc.sh
    cat config|while read sample
    do
     arr=(${sample})
     fq1=${arr[0]}
     fq2=${arr[1]}
    trim_galore -q 25 --phred33 --length 36 -e 0.1 --stringency 3 --paired -o ../clean $fq1 $fq2
    done
    nohup nash qc.sh &
    
    figure4 clean

    4. mapping (每个样品1h10min)

    bwa从sam到排序好的bam:将打断测序的reads重新mapping到参考基因组上得到sam文件,然后用samtools对文件进行排序

    #文件路径home/vip11/project/wes/test/clean/SRR5943131_1_trimmed.fq.gz
    #不用传参数或是文件,cut -d"/"指定分割符为"/"; -f 8 1 取第8列的第一个元素
    #cut -d"_" -f 1 指定分隔符为"_" 取第一列,
    #得到只含样本名称,超级超级好用!!!
    
    ls *1.fq.gz >1
    ls *2.fq.gz >2
    cut -d"/" -f 8 1 |cut -d"_" -f 1  > 0
    paste 0 1 2 > config 
    cat >mapping.sh
    INDEX=“/home/qmcui/database/reference/index/bwa/hg38”
    cat config |while read sample
    do
     arr=($sample)
     fq1=${arr[1]}
     fq2=${arr[2]}
     sample=${arr[0]}
    echo $sample $fq1 $fq2 
    bwa mem -t 2 -R "@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" $INDEX $fq1 $fq2  | samtools sort -@ 2 -o $sample.bam -
    done   
    #bwa mem 算法,bwa mem [options] <idxbase> <in1.fq> [in2.fq]
    #-t 2 两个线程
    #—R 读区时显示的头文件信息;如:'@RG\tID:foo\tSM:bar' [null]
    #"@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" 出现在bam文件前文件信息:包括样本ID,样本名称,样本测序类型WGS以及测序平台信息。
    
    # bam统计
    ls *.bam | while read id ;do (samtools flagstat $id > $(basename $id ".bam").stat);done
    
    figure5 stat

    5. GATK4 流程

    5.1 markduplicate (去除建库过程中起始和终止完全相同的序列)
    #gatk MarkDuplicates 必须参数-I:inputfile, -O:file, -M:file to write duplication metrics
    mkdir gatk_markdup && cd gatk_markdup
    cut -d"/" -f 8 1|cut -d"_" -f 1 >sample.txt
    cat sample.txt|while read sample;
    do
    gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" MarkDuplicates -I ../${sample}.sort.bam -O ${sample}_marked.bam -M ${sample}.metrics 1>${sample}_log.mark 2>&1
    done
    
    #5.2 fixmateinformation
    mkdir fixbam && cd fixbam
    cp ../gatk_markdup/sample.txt ./
    cat >fixbam.sh
    vim fixbam.sh
    cat sample.txt|while read sample
    do
    gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" FixMateInformation
    -I ../gatk_markdup/${sample}_markdu.bam
    -O ${sample}_mkfixed.bam -SO coordinate 1>${sample}_log.fix 2>&1
    done
    
    #samtools index  子命令建索引
    cat >idex.sh
    vim idex.sh
    cat sample.txt|while read sample
    do
    samtools index ${sample}_mkfixed.bam
    done
    
    #5.3 每30min一个样本
    cat >recal.sh
    vim recal.sh
    ref="/home/qmcui/database/reference/index/hisat/hg38/hg38.fa"
    indel="/home/qmcui/tmp/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
    snp="/home/qmcui/dbsnp_146.hg38.vcf.gz"
    cat sample.txt|while read sample
    do
    gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" BaseRecalibrator -R $ref -I ${sample}_mkfixed.bam --known-sites $snp --known-sites $indel -O ${sample}_recal.table
    done
    
    #5.4 每10min一个样本
    cat >bqsr.sh
    vim bqsr.sh
    ref="/home/qmcui/database/reference/index/hisat/hg38/hg38.fa"
    cat sample.txt|while read sample
    do
    gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" ApplyBQSR -R $ref -I ${sample}_mkfixed.bam -bqsr ${sample}_recal.table -O ${sample}_bqsr.bam 1>${sample}_log.ApplyBQSR 2>&1
    done
    
    #5.5 每1h一个样本
    cat >rawvcf.sh
    vim rawvcf.sh
    ref="/home/qmcui/database/reference/index/hisat/hg38/hg38.fa"
    snp="/home/qmcui/dbsnp_146.hg38.vcf.gz"
    cat sample.txt|while read sample
    do
    gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" HaplotypeCaller -R $ref -I ${sample}_bqsr.bam --dbsnp $snp -O ${sample}_raw.vcf 1>${sample}_log.HC 2>&1
    done
    

    6 按照区域call variation

    6.1
    #每个样本2h40min
    cat >gvcf.sh
    vim gvcf.sh
    ref="/home/qmcui/database/reference/index/hisat/hg38/hg38.fa"
    snp="/home/qmcui/dbsnp_146.hg38.vcf.gz"
    cat sample.txt|while read sample
    do
    gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" HaplotypeCaller -ERC GVCF -R $ref -I ${sample}.bqsr.bam --dbsnp $snp -O ${sample}_raw.gvcf 1>${sample}.gvcf.log 2>&1
    done
    
    6.2 按区域找变异(依据不同染色体,3min/chr)
    seq 22|while read id;do echo chr*{id};done >bed.txt
    cat >> bed.txt
    chrX
    chrY
    chrM
    
    cat >final_vcf.sh
    vim final_vcf.sh
    ref="/home/qmcui/database/reference/index/hisat/hg38/hg38.fa"
    snp="/home/qmcui/dbsnp_146.hg38.vcf.gz"
    cat bed.txt|while read bed;do gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" GenomicsDBImport -L $bed -R $ref -V SRR5943131_raw.gvcf -V SRR5943132_raw.gvcf -V SRR5943133_raw.gvcf --genomicsdb-workspace-path gvcfs_${bed}.db
    gatk --java-options "-Xmx8G -Djava.io.tmpdir=./" GenotypeGVCFs -R $ref -V gendb://gvcfs_${bed}.db --dbsnp $snp -O final_${bed}.vcf
    done
    nohup bash gvcf.sh & 
    
    6.3 将区域变异合并(1-2min)
    cat bed.txt|while read bed;do (gatk MergeVcfs -I final_${bed}.vcf -O raw.combine.vcf.gz);done
    
    image.png

    7 raw vcf filter,按照snp和indel分别过滤

    **容易错**raw vcf 过滤 可以新建文件夹,运动代码时,输入文件夹上绝对路径,或者要将输入文件移动到新的文件夹时,一定要移动所有相关的文件!!!

    mkdir vcf_filter && cd vcf_filter
    mv ../rgvcf/raw.combine.vcf.gz ./
    mv ../rgvcf/raw.combine.vcf.gz.tbi ./
    
    7.1 snp filter
    gatk SelectVariants -select-type SNP -V raw.combine.vcf.gz -O raw.snp.vcf.gz
    gatk VariantFiltration -V raw.snp.vcf.gz --filter-expression "QUAL < 30.0 || QD < 2.0 || MQ < 40.0 || FS > 60.0 || SOR > 3.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" --filter-name "Filter" -O filter.snp.vcf.gz
    gatk SelectVariants --exclude-filtered true -V filter.snp.vcf.gz -O filtered.snp.vcf.gz
    
    7.2 indel filter
    gatk SelectVariants -select-type INDEL -V raw.combine.vcf.gz -O raw.indel.vcf.gz
    gatk VariantFiltration -V raw.indel.vcf.gz --filter-expression "QUAL < 30.0 | QD < 2.0 || FS > 200.0 || SOR > 10.0 || Inbreeding Coeff < -0.8 || ReadPosRankSum < -20.0" --filter-name "Filter" -O filter.indel.vcf.gz
    gatk SelectVariants --exclude-filtered true -V filter.indel.vcf.gz -O filtered.indel.vcf.gz
    

    相关文章

      网友评论

        本文标题:WES 分析(从下载数据到IGV)

        本文链接:https://www.haomeiwen.com/subject/ezrnjqtx.html