美文网首页生信分析流程
Day 9 下载ESCC wes数据进行分析

Day 9 下载ESCC wes数据进行分析

作者: 陈宇乔 | 来源:发表于2018-12-17 19:31 被阅读23次

    参考文章: Genomic comparison of esophageal squamous cell carcinoma and its precursor lesions by multi-region whole-exome sequencing

    step 0 配置aspera必不可少
    step 1 下载数据
    cat SRR_list | while read id; do (prefetch ${id});done
    

    下载默认的文件目录~/ncbi/public/sra

    step 2 sra格式转化成fastq
    ##用循环语句
    ls ./1.SRA/*sra | while read id; do (fastq-dump --gzip --split-3 -O ./project ./${id}); done
    挂后台
    ls ./*.sra | while read id; do (nohup fastq-dump --gzip --split-3 -O ./ ${id} &); done
    
    step 3 批量跑fastqc,然后multiqc
    批量跑fastqc,
    nohup \
    fastqc -t 3 ~/ncbi/public/sra/*.fastq.gz \
    -o ~/ncbi/public/sra/fastqc \
    &
    ###
    multiqc ./
    
    step 4 批量 trim_galore

    首先: 配置config文件

    ls ./*1.fastq.gz > ./fq1
    ls ./*2.fastq.gz > ./fq2
    ls *fastq.gz|awk -F '_' '{print $1}'| paste - -|cut -f 1 > sample_name
    paste fq1 fq2 sample_name> config
    

    然后 编辑trim_galore.sh 脚本,bash trim_galore.sh config

    vim trim_galore.sh 
    ####要有一个shell脚本
    ####脚本内容如下
    cat $1 |while read id
    do
            arr=(${id})
            fq1=${arr[0]}####这里的arr别写成id
            fq2=${arr[1]}
            echo $fq1 $fq2
    trim_galore --phred33 -q 25 -e 0.1 --length 36 --stringency 3 --paired  -o ./  $fq1 $fq2
    echo 'OK'
    done
    ######
    nohup bash trim_galore.sh config & ####后台运行
    #######
    bash语句中最后的config 就可以传给循环语句中的$1
    
    step 5 Bwa比对

    首先配置三列的config包括(fq1 fq2 sample_name),同step4,如果step4已经配置可以省略。

    ls ./*1.fastq.gz > ./fq1
    ls ./*2.fastq.gz > ./fq2
    ls *fastq.gz|awk -F '_' '{print $1}'| paste - -|cut -f 1 > sample_name
    paste fq1 fq2 sample_name> config
    

    然后构建索引,编辑bwa.sh 脚本,最后bash bwa.sh config

    bwa index ./hg38.fa 构建索引文件,需要2-3个小时,要提前准备
    
    vim bwa.sh ####要有一个shell脚本,脚本内容如下:
    
    ref='/public/reference/index/bwa/hg38'   
    #######比对的参考基因组索引文件hg38既不是路径也不是文件参见https://www.jianshu.com/p/589e8894459d
           cat $1 |while read id
    do
            arr=(${id})
            fq1=${arr[0]}
            fq2=${arr[1]}
           sample=${arr[2]}
            echo $fq1 $fq2
            echo $sample
            bwa mem -t 10 -R "@RG\tID:$sample\tSM:${sample}\tLB:WGS\tPL:Illumina" \
    $ref $fq1 $fq2 2>${sample}.log \
    |samtools sort -@ 5 -o $sample.sort.bam - 1>${sample}_bam.log
            echo "OKOK"
    done
    ######
    nohup bash bwa.sh config &
    #######
    
    step 6 MarkDuplicates+GATK Call(一个脚本就够)vs(我还是一步一步来吧)
    vim gatk_call.sh#################下面都可以一步完成,但我还是一步一步慢慢来吧,所以此处略去
    
    ###########################gatk_call.sh 内容
    ###########################赋值变量
    source activate wes
    ###GATK=/home/jmzeng/biosoft/gatk4/gatk-4.0.6.0/gatk
    ref='/public/biosoft/GATK/resources/bundle/hg38/Homo_sapiens_assembly38.fasta'
    snp="/public/biosoft/GATK/resources/bundle/hg38/dbsnp_146.hg38.vcf.gz"
    indel="/public/biosoft/GATK/resources/bundle/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
    ########################################################
    cat ./sample_name |while read id
    do
    gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" MarkDuplicates     \
    -I ./${id}.sort.bam    \
    -O ${id}_marked.bam     \
    -M ${id}.metrics   \
    1>${id}_log.mark 2>&1
    done
    ####################################
    cat ./sample_name |while read id
    do
    gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" FixMateInformation \
    -I ./${id}_marked.bam     \
    -O ${id}_marked_fixed.bam     \
    -SO coordinate   \
    1>${id}_log.fix 2>&1 
    done
    ###################################
    cat ./sample_name |while read id
    do
    gatk --java-options "-Xmx20G -Djava.io.tmpdir=./"  BaseRecalibrator    \
    -R $ref      \
    -I  ./${id}_marked_fixed.bam 
    --known-sites $snp     \
    --known-sites $indel    \
    -O ${id}_recal.table 1>${id}_log.fix 2>&1
    done
    #################################
    cat ./sample_name |while read id
    do
    gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" ApplyBQSR \
    -R $ref \
    -I ./${id}_marked_fixed.bam -bqsr ./${id}_recal.table \
    -O ./${id}_bqsr.bam \
    1>${id}_log.ApplyBQSR 2>&1
    done
    #################################bqsr.bam按照区域 call variation
    ####################ERC 指定GVCF
    cat ./sample_name |while read id
    do
    gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" HaplotypeCaller \
    -ERC GVCF\
    -R $ref \
    -I ${id}.bqsr.bam \
    --dbsnp $snp \
    -O ${id}_raw.all.gvcf \
    1>${id}.gvcf.log 2>&1
    done
    

    #####################step 8 gatk call all

    ##########################################################
    samtools index ${id}.bqsr.bam # 核查bam是否建索引,必不可少
    ##################################################################
    raw.all.gvcf=$(ls *fastq.gz| awk '{print "-V"" "$0}'|tr '\n' ' ')#######这个赋值很重要
    ##################
    for bed in chr{1..22} chrX chrY chrM
    do
    gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" GenomicsDBImport \
    -L $bed \
    -R $ref \
    $raw.all.gvcf
    --genomicsdb-workspace-path gvcfs_${bed}.db
    gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" GenotypeGVCFs \
    -R $ref -V gendb://gvcfs_${bed}.db \
    --dbsnp $snp \
    -O final_${bed}.vcf
    done
    #############################合并vcf
    Merge_chr=$(ls *final*.vcf| awk '{print "-I"" "$0}'|tr '\n' ' ')#######这个赋值很重要
    ############################
    gatk MergeVcfs 
    $Merge_chr \
    -O raw.conbine.vcf.gz
    

    step 9 filter

    ############################################################
    raw_vcf="./raw.conbine.vcf.gz"
    gatk --java-options "-Xmx15G -Djava.io.tmpdir=./" SelectVariants \
    -select-type SNP \
    -V $raw_vcf \
    -O raw.snp.vcf.gz
    ########
    gatk --java-options "-Xmx15G -Djava.io.tmpdir=./"  VariantFiltration \
    -V raw.snp.vcf.gz \
    --filter-expression "QUAL < 30.0 || QD < 2.0 || MQ < 40.0 || FS > 60.0 || SOR > 3.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" \
    --filter-name "Filter" \
    -O filter.snp.vcf.gz
    ##########################################################
    gatk --java-options "-Xmx15G -Djava.io.tmpdir=./" SelectVariants \
    -select-type INDEL \
    -V $raw_vcf \
    -O raw.indel.vcf.gz
    ################
    gatk --java-options "-Xmx15G -Djava.io.tmpdir=./" VariantFiltration \
    -V raw.indel.vcf.gz \
    --filter-expression "QUAL < 30.0 | QD < 2.0 || FS > 200.0 || SOR > 10.0 || InbreedingCoeff < -0.8 || ReadPosRankSum < -20.0" \
    --filter-name "Filter" \
    -O filter.indel.vcf.gz
    ########################合并,也可不合并,看最后用途
    gatk MergeVcfs -I filter.snp.vcf.gz -I filter.snp.vcf.gz \
    -O conbine.filter.vcf.gz
    
    step 10 注释#############软件要自己装
    cat sample_name| while read id 
    do
    ~qmcui/software/annovar/annovar/convert2annovar.pl \
    -format vcf4old \ # -filter # -allsample
    ${id}_filtered.vcf > ${id}.annovar
    ##############################################hg38是annovar自带的
    ~qmcui/software/annovar/annovar/annotate_variation.pl \
    -buildver hg38 \
    --outfile ${id}.anno \
    ${id}.annovar \
    /public/biosoft/ANNOVAR/annovar/humandb
    done
    

    相关文章

      网友评论

        本文标题:Day 9 下载ESCC wes数据进行分析

        本文链接:https://www.haomeiwen.com/subject/xpvckqtx.html