重要知识点一:理解循环
vim trim_galore.sh ####要有一个shell脚本
####脚本内容如下
cat $1 |while read id
do
arr=(${id})
fq1=${arr[0]}####这里的arr别写成id
fq2=${arr[1]}
echo $fq1 $fq2
trim_galore --phred33 -q 25 -e 0.1 --length 36 --stringency 3 --paired -o ./ $fq1 $fq2
echo 'OK'
done
######
nohup bash trim_galore.sh config & ####后台运行
#######
bash语句中最后的config 就可以传给循环语句中的$1
重要知识点二:学会构建每个项目的config
####方法一:
ls /home/qmcui/test_boy/1.fastq_qc/*.fastq.gz| paste - - |> config
####方法二:
ls /home/qmcui/test_boy/1.fastq_qc/*1.fastq.gz > ./fq1
ls /home/qmcui/test_boy/1.fastq_qc/*2.fastq.gz > ./fq2
paste ./fq1 ./fq2| > ./config
####方法三(获取sample name,bwa时):
ls /home/qmcui/test_boy/1.fastq_qc/*1.fastq.gz > ./fq1
ls /home/qmcui/test_boy/1.fastq_qc/*2.fastq.gz > ./fq2
ls /home/qmcui/test_boy/1.fastq_qc/*.fastq.gz|while read id;do basename $id;done| >./sample_name
paste ./fq1 ./fq2 ./sample_name| > ./config
####方法四(自己的笨办法,但自己也最能理解)
ls /home/qmcui/test_boy/1.fastq_qc/*1.fastq.gz| > ./fq1
ls /home/qmcui/test_boy/1.fastq_qc/*2.fastq.gz| > ./fq2
ls /home/qmcui/test_boy/1.fastq_qc/*.fastq.gz| awk -F '/' '{print$6}'| awk -F '.' '{print$1}'|paste - -|cut -f 1 > sample###输出后面不要加管道符,重复的使用paste - -|cut -f 1去重,这个语句不会改变顺序此时最好不要使用sort -u,怕改变顺序。
step0: 安装软件
要注意conda install 装的samtools是有问题的,此时需要使用我自己装在biosoft里的samtools,(卸载conda remove samtools后系统就自动使用我自己装的samtools了)
conda activate wes
conda install -y fastqc trim-galore star bwa hisat2 bowtie2 subread tophat htseq bedtools deeptools salmon cutadapt multiqc
step 1 跑fastqc
fastqc -t 3 /home/qmcui/test_boy/1.fastq_qc/*.fastq.gz -o ./project/wes/
multiqc ./ ####
####在路径下对所有的fastq.gz文件进行fastqc
step2 filter data需要选择质量较差的进行过滤(此处可以写循环 bash trim_galore.sh config)
trim_galore --phred33 -q 25 -e 0.1 --length 36 --stringency 3 --paired -o ./ \
/home/qmcui/test_boy/1.fastq_qc/7E5241.L1_1.fastq.gz \
/home/qmcui/test_boy/1.fastq_qc/7E5241.L1_2.fastq.gz
step 3 构建索引
构建完索引后会多5个文件每一款比对软件的索引都不一样:bwa软件比对时hg38既不是路径也不是文件
image.png hg38既不是文件也不是路径
step 4 使用循环进行比对
####要输入三列的config包括(sample fq1 fq2)
ls /home/qmcui/test_boy/1.fastq_qc/*.fastq.gz| awk -F '/' '{print$6}'| awk -F '.' '{print$1}'|paste - -|cut -f 1 > sample ###>前一般不要加入管道符
ls /home/qmcui/test_boy/1.fastq_qc/*1.fastq.gz > ./fq1
ls /home/qmcui/test_boy/1.fastq_qc/*2.fastq.gz > ./fq2
paste ./sample ./fq1 ./fq2 | > config
vim bwa.sh ####要有一个shell脚本,脚本内容如下
cat $1 |while read id
do
arr=(${id})
sample=${arr[0]}
fq1=${arr[1]}
fq2=${arr[2]}
echo $fq1 $fq2
echo $sample
bwa mem -t 5 -R "@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" /public/reference/index/bwa/hg38 $fq1 $fq2 2>$sample.log|samtools sort -@ 5 -o $sample.sort.bam - 1>$sample.log
echo "OKOK"
done
######
nohup bash bwa.sh ~project/wes/config &
#######
此处要特别注意:如果循环报错,可以echo$变量来检查错误
step 5 MarkDuplicates
#####使用basename创建sample_name的文件用于循环
ls ./*.sort.bam|while read id;do basename $id >> sample ;done
cat sample| awk -F '.' '{print$1}'> sample_name
nohup cat /home/vip25/project/wes/2.mapping/sample_name |while read sample; do gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" MarkDuplicates -I /home/vip25/project/wes/2.mapping/$sample.sort.bam -O ${sample}_marked.bam -M ${sample}.metrics 1>${sample}_log.mark 2>&1 ; echo 'OK'; done &
image.pngflag 数字大小1187的意思:有duplicate
image.png
step 6 GATK CALL :1.FixMateInformation
nohup cat /home/vip25/project/wes/2.mapping/sample_name |while read sample; do gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" FixMateInformation -I /home/vip25/project/wes/${sample}_marked.bam -O ${sample}_marked_fixed.bam -SO coordinate 1>${sample}_log.fix 2>&1 ; echo 'OK'; done &
######${sample},需要用大括号,避免系统识别错误
step 6 GATK CALL :2.BaseRecalibrator
ref='/public/biosoft/GATK/resources/bundle/hg38/Homo_sapiens_assembly38.fasta'
snp="/public/biosoft/GATK/resources/bundle/hg38/dbsnp_146.hg38.vcf.gz"
indel="/public/biosoft/GATK/resources/bundle/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
这一步赋值很重要,后面的Call 很多部分都需要用到
nohup
cat /home/vip25/project/wes/2.mapping/sample_name |while read sample; do gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" BaseRecalibrator -R $ref -I ./${sample}_marked_fixed.bam --known-sites $snp --known-sites $indel -O ${sample}_recal.table 1>${sample}_log.fix 2>&1 ; done
&
image.png########GATK call()
特别注意此时GATK报错,重新下载gatk4 虽然报错但是能继续运行:错误:Feature file "/public/biosoft/GATK/resources/bundle/hg38/dbsnp_146.hg38.vcf.gz" appears to contain no sequence dictionary. Attempting to retrieve a sequence dictionary from the associated index file运行完之后文件很小:148K 每一个case二十分钟
step 6 GATK CALL :3. 子命令ApplyBQSR
ref='/public/biosoft/GATK/resources/bundle/hg38/Homo_sapiens_assembly38.fasta'
nohup
cat /home/vip25/project/wes/2.mapping/sample_name |while read sample; do gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" ApplyBQSR -R $ref -I ./${sample}_marked_fixed.bam -bqsr ./${sample}_recal.table -O ./${sample}_bqsr.bam 1>${sample}_log.ApplyBQSR 2>&1; done
&
step 6 GATK CALL :4. HaplotypeCaller ### calling variation得到vcf
mkdir gatk_single_vcf
cd gatk_single_vcf
nohup
cat /home/vip25/project/wes/2.mapping/sample_name |while read sample; do gatk --java-options "-Xmx20G -Djava.io.tmpdir=./" HaplotypeCaller -R $ref -I ./${sample}_bqsr.bam --dbsnp $snp -O ./${sample}_raw.vcf 1>${sample}_log.HC 2>&1; done
&
image.png
step 7注释
annovar 说明书
http://annovar.openbioinformatics.org/en/latest/user-guide/gene/
需要安装annovar软件;
/home/qmcui/software/annovar/annovar/convert2annovar.pl \
-format vcf4old \
7E5240_L1_A001_raw.vcf > 7E5240.annovar
/home/qmcui/software/annovar/annovar/annotate_variation.pl \
-buildver hg38 \
--outfile 7E5240.anno \
7E5240.annovar \
/public/biosoft/ANNOVAR/annovar/humandb
less -S 7E5240_L1_A001_raw.vcf|grep -v '##' |cut -f 1|sort|uniq -c
less -S 7E5240.annovar |cut -f 1|sort|uniq -c
####查看文件信息
less -S 7E5240_last_annovar.exonic_variant_function |grep -v -E -w 'synonymous SNV|unknown'|awk '$4=="chr1"&&$5>0 &&$6< 122212222{print $0}'
###查看突变类型信息,grep -E通配符的时候要用;-w 是精确匹配的意思这样不会过滤nonsynonymous SNV
网友评论