参考文章: Genomic comparison of esophageal squamous cell carcinoma and its precursor lesions by multi-region whole-exome sequencing
step 0 配置aspera必不可少
step 1 下载数据
cat SRR_list | while read id; do (prefetch ${id});done
下载默认的文件目录~/ncbi/public/sra
step 2 sra格式转化成fastq
##用循环语句
ls ./1.SRA/*sra | while read id; do (fastq-dump --gzip --split-3 -O ./project ./${id}); done
挂后台
ls ./*.sra | while read id; do (nohup fastq-dump --gzip --split-3 -O ./ ${id} &); done
step 3 批量跑fastqc,然后multiqc
批量跑fastqc,
nohup \
fastqc -t 3 ~/ncbi/public/sra/*.fastq.gz \
-o ~/ncbi/public/sra/fastqc \
&
###
multiqc ./
step 4 批量 trim_galore
首先: 配置config文件
ls ./*1.fastq.gz > ./fq1
ls ./*2.fastq.gz > ./fq2
ls *fastq.gz|awk -F '_' '{print $1}'| paste - -|cut -f 1 > sample_name
paste fq1 fq2 sample_name> config
然后 编辑trim_galore.sh 脚本,bash trim_galore.sh config
vim trim_galore.sh
####要有一个shell脚本
####脚本内容如下
cat $1 |while read id
do
arr=(${id})
fq1=${arr[0]}####这里的arr别写成id
fq2=${arr[1]}
echo $fq1 $fq2
trim_galore --phred33 -q 25 -e 0.1 --length 36 --stringency 3 --paired -o ./ $fq1 $fq2
echo 'OK'
done
######
nohup bash trim_galore.sh config & ####后台运行
#######
bash语句中最后的config 就可以传给循环语句中的$1
step 5 Bwa比对
首先配置三列的config包括(fq1 fq2 sample_name),同step4,如果step4已经配置可以省略。
ls ./*1.fastq.gz > ./fq1
ls ./*2.fastq.gz > ./fq2
ls *fastq.gz|awk -F '_' '{print $1}'| paste - -|cut -f 1 > sample_name
paste fq1 fq2 sample_name> config
然后构建索引,编辑bwa.sh 脚本,最后bash bwa.sh config
bwa index ./hg38.fa 构建索引文件,需要2-3个小时,要提前准备
vim bwa.sh ####要有一个shell脚本,脚本内容如下:
ref='/public/reference/index/bwa/hg38'
#######比对的参考基因组索引文件hg38既不是路径也不是文件参见https://www.jianshu.com/p/589e8894459d
cat $1 |while read id
do
arr=(${id})
fq1=${arr[0]}
fq2=${arr[1]}
sample=${arr[2]}
echo $fq1 $fq2
echo $sample
bwa mem -t 10 -R "@RG\tID:$sample\tSM:${sample}\tLB:WGS\tPL:Illumina" \
$ref $fq1 $fq2 2>${sample}.log \
|samtools sort -@ 5 -o $sample.sort.bam - 1>${sample}_bam.log
echo "OKOK"
done
######
nohup bash bwa.sh config &
#######
step 6 MarkDuplicates+GATK Call(一个脚本就够)vs(我还是一步一步来吧)
vim gatk_call.sh#################下面都可以一步完成,但我还是一步一步慢慢来吧,所以此处略去
###########################gatk_call.sh 内容
###########################赋值变量
source activate wes
###GATK=/home/jmzeng/biosoft/gatk4/gatk-4.0.6.0/gatk
ref='/public/biosoft/GATK/resources/bundle/hg38/Homo_sapiens_assembly38.fasta'
snp="/public/biosoft/GATK/resources/bundle/hg38/dbsnp_146.hg38.vcf.gz"
indel="/public/biosoft/GATK/resources/bundle/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
########################################################
cat ./sample_name |while read id
do
gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" MarkDuplicates \
-I ./${id}.sort.bam \
-O ${id}_marked.bam \
-M ${id}.metrics \
1>${id}_log.mark 2>&1
done
####################################
cat ./sample_name |while read id
do
gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" FixMateInformation \
-I ./${id}_marked.bam \
-O ${id}_marked_fixed.bam \
-SO coordinate \
1>${id}_log.fix 2>&1
done
###################################
cat ./sample_name |while read id
do
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" BaseRecalibrator \
-R $ref \
-I ./${id}_marked_fixed.bam
--known-sites $snp \
--known-sites $indel \
-O ${id}_recal.table 1>${id}_log.fix 2>&1
done
#################################
cat ./sample_name |while read id
do
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" ApplyBQSR \
-R $ref \
-I ./${id}_marked_fixed.bam -bqsr ./${id}_recal.table \
-O ./${id}_bqsr.bam \
1>${id}_log.ApplyBQSR 2>&1
done
#################################bqsr.bam按照区域 call variation
####################ERC 指定GVCF
cat ./sample_name |while read id
do
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" HaplotypeCaller \
-ERC GVCF\
-R $ref \
-I ${id}.bqsr.bam \
--dbsnp $snp \
-O ${id}_raw.all.gvcf \
1>${id}.gvcf.log 2>&1
done
#####################step 8 gatk call all
##########################################################
samtools index ${id}.bqsr.bam # 核查bam是否建索引,必不可少
##################################################################
raw.all.gvcf=$(ls *fastq.gz| awk '{print "-V"" "$0}'|tr '\n' ' ')#######这个赋值很重要
##################
for bed in chr{1..22} chrX chrY chrM
do
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" GenomicsDBImport \
-L $bed \
-R $ref \
$raw.all.gvcf
--genomicsdb-workspace-path gvcfs_${bed}.db
gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" GenotypeGVCFs \
-R $ref -V gendb://gvcfs_${bed}.db \
--dbsnp $snp \
-O final_${bed}.vcf
done
#############################合并vcf
Merge_chr=$(ls *final*.vcf| awk '{print "-I"" "$0}'|tr '\n' ' ')#######这个赋值很重要
############################
gatk MergeVcfs
$Merge_chr \
-O raw.conbine.vcf.gz
step 9 filter
############################################################
raw_vcf="./raw.conbine.vcf.gz"
gatk --java-options "-Xmx15G -Djava.io.tmpdir=./" SelectVariants \
-select-type SNP \
-V $raw_vcf \
-O raw.snp.vcf.gz
########
gatk --java-options "-Xmx15G -Djava.io.tmpdir=./" VariantFiltration \
-V raw.snp.vcf.gz \
--filter-expression "QUAL < 30.0 || QD < 2.0 || MQ < 40.0 || FS > 60.0 || SOR > 3.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" \
--filter-name "Filter" \
-O filter.snp.vcf.gz
##########################################################
gatk --java-options "-Xmx15G -Djava.io.tmpdir=./" SelectVariants \
-select-type INDEL \
-V $raw_vcf \
-O raw.indel.vcf.gz
################
gatk --java-options "-Xmx15G -Djava.io.tmpdir=./" VariantFiltration \
-V raw.indel.vcf.gz \
--filter-expression "QUAL < 30.0 | QD < 2.0 || FS > 200.0 || SOR > 10.0 || InbreedingCoeff < -0.8 || ReadPosRankSum < -20.0" \
--filter-name "Filter" \
-O filter.indel.vcf.gz
########################合并,也可不合并,看最后用途
gatk MergeVcfs -I filter.snp.vcf.gz -I filter.snp.vcf.gz \
-O conbine.filter.vcf.gz
step 10 注释#############软件要自己装
cat sample_name| while read id
do
~qmcui/software/annovar/annovar/convert2annovar.pl \
-format vcf4old \ # -filter # -allsample
${id}_filtered.vcf > ${id}.annovar
##############################################hg38是annovar自带的
~qmcui/software/annovar/annovar/annotate_variation.pl \
-buildver hg38 \
--outfile ${id}.anno \
${id}.annovar \
/public/biosoft/ANNOVAR/annovar/humandb
done
网友评论