1. md5 检测数据是否下载完整
#!/bin/bash
cat /dev/null > md5_out # 把 med5_out 文件夹清空
for file in `ls /tempdisk/001/output/*gz`
do
echo $file
md5sum $file >> md5_out
done
# 硬板携带md5.txt
nohup md5sum /tempdisk/001/output/md5.txt >out.log 2>&1 &
2. fastqc 质控
# 所有的fastqc 文件放在output 文件夹下
#!/usr/bin/env bash
fastqc -o ./fastqc/ -t 16 /tempdisk/001/output/*.fastq.gz
# fastq_screen 质控
fastq_screen --aligner BOWTIE2 --outdir fastq_screen_list --threads 64 \
/tempdisk/001/output/*fastq.gz
3. Trimmomatic 过滤低质量序列
# 我的数据拿到公司已经帮忙去接头了。
java -jar <path to trimmomatic.jar> PE [-threads <threads] [-phred33 | -phred64] [-trimlog
<logFile>] >] [-basein <inputBase> | <input 1> <input 2>] [-baseout <outputBase> |
<paired output 1> <unpaired output 1> <paired output 2> <unpaired output 2> <step 1> <step 2> ...
4. 构建参考基因组index
- 下面三个三选一
# hg38下载地址:
wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
nohup time bowtie2-build ../hg38.fa.gz index &
#bwa 构建
nohup bwa index ../hg38.fa.gz &
# hisat2
ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/hg38.tar.gz
tar zvxf hg38.tar.gz
5. 把fastq 序列和参考基因组比对
5.1 hisat2 比对,生成sam 文件
nohup hisat2 -t -x /share/public/reference/Homo_sapiens/hg38/hg38_hisat2_index/hg38/genome -1 /tempdisk/001/output/CHB000013-CHB000013D-1303MR00003WJ1-TCCGGAGA-AGGCTATA_S2_R1_001.fastq.gz -2 /tempdisk/001/output/CHB000013-CHB000013D-1303MR00003WJ1-TCCGGAGA-AGGCTATA_S2_R2_001.fastq.gz -S /home/chengkaizhu/zhuchengkai/nmr/hisat2/samfiles/CHB000013.sam &
- 用samtools 把sam 文件转为bam 文件
list=(12 13 14 15 16 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 42 44 45 46)
for i in ${list[@]};
do
nohup samtools view -S CHB0000${i}.sam -b > CHB0000${i}.bam &
nohup samtools sort CHB0000${i}.bam -o CHB0000${i}_sorted.bam &
nohup samtools index CHB0000${i}_sorted.bam &
done
5.2 bwa 比对
bwa mem -t 16 /share/public/reference/Homo_sapiens/hg38/hg38_bwa_index/hg38.fa.gz \
/tempdisk/001/output/CHB000012-CHB000012D-1303MR00002WJ1-ATTACTCG- \
AGGCTATA_S1_R1_001.fastq.gz /tempdisk/001/output/CHB000012-CHB000012D- \
1303MR00002WJ1-ATTACTCG-AGGCTATA_S1_R2_001.fastq.gz | samtools view -bhS -o \
/home/chengkaizhu/zhuchengkai/nmr/bwa/bamfiles/CHB000012.bam
# sort & index
nohup samtools sort CHB0000${i}.bam -o CHB0000${i}_sorted.bam &
nohup samtools index CHB0000${i}_sorted.bam &
6.RSeQC 比对后数据质控
# 下载参考基因组bed 文件
wget https://sourceforge.net/projects/rseqc/files/BED/Human_Homo_sapiens/hg38_RefSeq.bed.gz
# 质控
python2.7 /root/anaconda3/pkgs/rseqc-2.6.4-py27_0/lib/python2.7/site-packages/RSeQC-2.6.4-py2.7.egg-info/scripts/inner_distance.py -r /root/anaconda3/pkgs/rseqc-2.6.4-py27_0/hg38_RefSeq.bed.gz -i /share/public/reference/Homo_sapiens/hg38/hg38_bwa_index/CHB000012_sorted.bam -o /home/chengkaizhu/zhuchengkai/nmr/rseqc/
7. picard 标记重复
picard MarkDuplicates I=CHB000012_sorted.bam O=CHB000012_sorted.markdup.bam M=CHB000012_sorted.markdup_metrics.txt
# samtools 创建索引文件
nohup samtools index CHB0000${i}_sorted.bam &
# gatk 集成了picard,也可以跑
time /opt/gatk-4.0.6.0/gatk MarkDuplicates -I /share/public/reference/Homo_sapiens/hg38/hg38_bwa_index/CHB000012_sorted.bam -O /share/public/reference/Homo_sapiens/hg38/hg38_bwa_index/CHB000012_sorted_markdup.bam -M /share/public/reference/Homo_sapiens/hg38/hg38_bwa_index/CHB000012.sorted.markdup_metrics.txt && echo "** markdup done **"
# 创建比对文件索引
time samtools index /share/public/reference/Homo_sapiens/hg38/hg38_bwa_index/CHB000012_sorted_markdup.bam && echo "** index done **"
8 GATK 寻找变异
8.1 RealignerTargetCreator & IndelRealigner(可选)
- RealignerTargetCreator目的是定位出所有需要进行序列重比对的目标区域
- IndelRealigner对所有在第一步中找到的目标区域运用算法进行序列重比对,最后得到捋顺了的新结果。
- 这部我没做,因为后面使用GATK的HaplotypeCaller模块,仅当这个时候才可以减少这个Indel局部重比对的步骤。原因是GATK的HaplotypeCaller中,会对潜在的变异区域进行相同的局部重比对!但是其它的变异检测工具或者GATK的其它模块就没有这么干了!所以切记!
java -jar /path/to/GenomeAnalysisTK.jar \
-T RealignerTargetCreator \
-R /path/to/human.fasta \
-I sample_name.sorted.markdup.bam \
-known /path/to/gatk/bundle/1000G_phase1.indels.b37.vcf \
-known /path/to/gatk/bundle/Mills_and_1000G_gold_standard.indels.b37.vcf \
-o sample_name.IndelRealigner.intervals
java -jar /path/to/GenomeAnalysisTK.jar \
-T IndelRealigner \
-R /path/to/human.fasta \
-I sample_name.sorted.markdup.bam \
-known /path/to/gatk/bundle/1000G_phase1.indels.b37.vcf \
-known /path/to/gatk/bundle/Mills_and_1000G_gold_standard.indels.b37.vcf \
-o sample_name.sorted.markdup.realign.bam \
--targetIntervals sample_name.IndelRealigner.intervals
8.2 重新校正碱基质量值(BQSR)
8.3.1 BaseRecalibrator
- 这里计算出了所有需要进行重校正的read和特征值,然后把这些信息输出为一份校准表文件(sample_name.recal_data.table)
8.3.2 ApplyBQSR
- 这一步利用第一步得到的校准表文件(sample_name.recal_data.table)重新调整原来BAM文件中的碱基质量值,并使用这个新的质量值重新输出一份新的BAM文件。
/opt/gatk-4.0.6.0/gatk BaseRecalibrator \
-I /share/public/reference/Homo_sapiens/hg38/hg38_bwa_index/CHB000012_sorted_rg.markdup.bam \
-R /share/public/reference/Homo_sapiens/hg38/hg38_bwa_index/hg38.fa \
--known-sites /share/public/reference/Homo_sapiens/hg38/reference/1000G_phase1.snps.high_confidence.hg38.vcf.gz \
--known-sites /share/public/reference/Homo_sapiens/hg38/reference/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
--known-sites /share/public/reference/Homo_sapiens/hg38/reference/1000G_omni2.5.hg38.vcf.gz \
--known-sites /share/public/reference/Homo_sapiens/hg38/reference/hapmap_3.3.hg38.vcf.gz \
--known-sites /share/public/reference/Homo_sapiens/hg38/reference/dbsnp_146.hg38.vcf.gz \
-O CHB000012.recal_data.table
/opt/gatk-4.0.6.0/gatk ApplyBQSR \
-R /share/public/reference/Homo_sapiens/hg38/hg38_bwa_index/hg38.fa \
-I /share/public/reference/Homo_sapiens/hg38/hg38_bwa_index/CHB000012_sorted.markdup.bam \
--bqsr-recal-file CHB000012.recal_data.table \
-O CHB000012_sorted_rg.markdup.BQSR.bam
缺少fai 文件报错
samtool faidx hg38.fa
缺少dict 文件报错
/opt/gatk-4.0.6.0/gatk CreateSequenceDictionary -R hg38.fa -O hg38.dict && echo "** dict done **"
read group =0 报错
picard AddOrReplaceReadGroups \
I=CHB000012_sorted.markdup.bam \
O=CHB000012_sorted_rg.markdup.bam \
RGID=CHB000012 \
RGLB=dna \
RGPL=illumina \
RGPU=hiseq \
RGSM=CHB000012
8.4 变异检测
网友评论