第一次做出来的结果不好,所以打算重新做一次,加上在听一个讲座时看到的ppt,更打了鸡血重新做了
image.png
目录准备
如何一连串创建目录:
mkdir的-p选项允许一次性创建多层次的目录,而不是一次只创建单独的目录。例如,我们要在当前目录创建目录Projects/a/src,使用命令
mkdir -p Project/a/src
此外,如果我们想创建多层次、多维度的目录树,mkcd也显得比较苍白了。例如我们想要建立目录Project,其中含有4个文件夹a, b, c, d,且这4个文件都含有一个src文件夹。或许,我们可以逐个创建,但我更倾向于使用单个命令来搞定,而mkdir 的-p选项和shell的参数扩展允许我这么做,例如下面的一个命令就可以完成任务。
mkdir -p Project/{a,b,c,d}/src
数据下载
wget -b https://zenodo.org/record/3243160/files/father_R1.fq.gz
wget -b https://zenodo.org/record/3243160/files/father_R2.fq.gz
wget -b https://zenodo.org/record/3243160/files/mother_R1.fq.gz
wget -b https://zenodo.org/record/3243160/files/mother_R2.fq.gz
wget -b https://zenodo.org/record/3243160/files/proband_R1.fq.gz
wget -b https://zenodo.org/record/3243160/files/proband_R2.fq.gz
1.后台下载
使用wget -b +链接进行下载
后台任务启动后,会返回两段话,第一段返回一个pid,代表这个后台任务的进程,并且我们可以kill掉这个id来终止此次下载,第二段返回了一句话,意思是会将输出(持续)写入到wget-log这个文件。
2:查看wget后台进度
tail -f wget-log
数据质控
# 激活conda环境
conda activate wes
# 使用FastQC软件对单个fastq文件进行质量评估,结果输出到qc/文件夹下
qcdir=~/project/boy/data/rawdata/qc
fqdir=~/project/boy/data/rawdata/fastq
fastqc -t 3 -o $qcdir $fqdir/*.fastq.gz
# 多个数据质控
fastqc -t 2 -o $qcdir $fqdir/*.fq.gz
###### -o 为设置输出目录,此文件夹一定要存在,否则无法生成结果。若不设置此参数,默认将结果输出到输入文件所在的文件夹中
# 使用MultiQc整合FastQC结果
multiqc *.zip
数据比对
-
建立参考基因组序列的索引文件(index)
time bwa index -a bwtsw -p gatk_hg38 ~/project/boy/data/Homo_sapiens_assembly38.fasta
- 比对
INDEX=~/project/boy/data/gatk_hg38
cat config |while read id
do
arr=($id)
fq1=${arr[1]}
fq2=${arr[2]}
sample=${arr[0]}
bwa mem -t 5 -R "@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" $INDEX $fq1 $fq2 | samtools sort -@ 5 -o $sample.bam -
done
INDEX=/public/biosoft/GATK/resources/bundle/hg38/bwa_index/gatk_hg38
cat config |while read id
do
arr=($id)
fq1=${arr[1]}
fq2=${arr[2]}
sample=${arr[0]}
echo $sample $fq1 $fq2
bwa mem -t 5 -R "@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" $INDEX $fq1 $fq2 | samtools sort -@ 5 -o $sample.bam -
done
## bwa.sh
INDEX=~/project/boy/data/gatk_hg38
cat ~/project/boy/data/rawdata/qc/sampleId.txt| while read id
do
echo "start bwa for ${id}" `date`
fq1=./2.clean_fq/${id}_1_val_1.fq.gz
fq2=./2.clean_fq/${id}_2_val_2.fq.gz
bwa mem -M -t 16 -R "@RG\tID:${id}\tSM:${id}\tLB:WXS\tPL:Illumina" ${INDEX} ${fq1} ${fq2} | samtools sort -@ 10 -m 1G -o ./4.align/${id}.bam -
echo "end bwa for ${id}" `date`
done
多样本时需要走循环
ls ~/project/boy/data/rawdata/fastq/*1.fq.gz>1
ls ~/project/boy/data/rawdata/fastq/*2.fq.gz>2
cut -d"/" -f 7 1 |cut -d"_" -f 1 > 0
paste 0 1 2 > config
source activate wes
INDEX=~/project/boy/data/gatk_hg38
cat config |while read id
do
arr=($id)
fq1=${arr[1]}
fq2=${arr[2]}
sample=${arr[0]}
echo $sample $fq1 $fq2
bwa mem -t 5 -R "@RG\tID:$sample\tSM:$sample\tLB:WGS\tPL:Illumina" $INDEX $fq1 $fq2 | samtools sort -@ 5 -o $sample.bam -
done
找变异
gatk
conda activate ws
GATK=~/biosoft/gatk4/gatk-4.1.6.0/gatk
ref=~/project/boy/data/gatk/gatk-bundle/Homo_sapiens_assembly38.fasta
snp=~/project/boy/data/gatk/gatk-bundle/dbsnp_146.hg38.vcf.gz
indel=~/project/boy/data/gatk/gatk-bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
for sample in {proband_R1.fq.gz,mother_R1.fq.gz,father_ R1.fq.gz}
do
echo $sample
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" MarkDuplicates \
-I $sample.bam \
-O ${sample}_marked.bam \
-M $sample.metrics \
1>${sample}_log.mark 2>&1
for sample in {proband_R1.fq.gz,mother_R1.fq.gz,father_ R1.fq.gz}
do
echo $sample
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" FixMateInformation \
-I ${sample}_marked.bam \
-O ${sample}_marked_fixed.bam \
-SO coordinate \
1>${sample}_log.fix 2>&1
samtools index ${sample}_marked_fixed.bam
for sample in {proband_R1.fq.gz,mother_R1.fq.gz,father_ R1.fq.gz}
do
echo $sample
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" BaseRecalibrator \
-R $ref \
-I ${sample}_marked_fixed.bam \
--known-sites $snp \
--known-sites $indel \
-O ${sample}_recal.table \
1>${sample}_log.recal 2>&1
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" ApplyBQSR \
-R $ref \
-I ${sample}_marked_fixed.bam \
-bqsr ${sample}_recal.table \
-O ${sample}_bqsr.bam \
1>${sample}_log.ApplyBQSR 2>&1 &
$GATK --java-options "-Xmx20G -Djava.io.tmpdir=./" HaplotypeCaller \
-R $ref \
-I ${sample}_bqsr.bam \
--dbsnp $snp \
-O ${sample}_raw.vcf \
1>${sample}_log.HC 2>&1 &
比对结果质控
source activate wes
wkd=~/project/boy
cd $wkd/clean
ls *.gz |xargs fastqc -t 10 -o ./
cd $wkd/align
rm *_marked*.bam
ls *.bam |xargs -i samtools index {}
ls *.bam | while read id ;do (samtools flagstat $id > $(basename $id ".bam").stat);done
conda install bedtools
cat /home/data/refdir/refdir/annotation/CCDS/human/CCDS.20160908.txt |perl -alne '{/\[(.*?)\]/;next unless $1;$gene=$F[2];$exons=$1;$exons=~s/\s//g;$exons=~s/-/\t/g;print "$F[0]\t$_\t$gene" foreach split/,/,$exons;}'|sort -u |bedtools sort -i |awk '{print "chr"$0"\t0\t+"}' > $wkd/align/hg38.exon.bed
exon_bed=hg38.exon.bed
ls *_bqsr.bam | while read id;
do
sample=${id%%.*}
echo $sample
qualimap bamqc --java-mem-size=20G -gff $exon_bed -bam $id &
done
### featureCounts
gtf=/public/reference/gtf/gencode/gencode.v25.annotation.gtf.gz
featureCounts -T 5 -p -t exon -g gene_id \
-a $gtf *_bqsr.bam -o all.id.txt 1>counts.id.log 2>&1 &
网友评论