#!/bin/bash
# 正式开始之前,先准备一系列所需文件
# create_Drop-seq_reference_metadata.sh
# 该脚本生成DropSeq Alignment所需文件
# fasta 参考基因组序列
# dict picard软件所需文件
# gtf 基因组注释文件
# refFlat picard所钟爱的类似gtf的文件格式
# gene.intervals 用于方便查看bam文件中比对至基因的读段
# exon.intervals 用于方便查看bam文件中比对至外显子的读段
# rRNA.intervals 用于方便查看bam文件中比对至rRNA读段的比例
# reduced.gtf gtf文件的子集 可读性更好
# create_Drop-seq_reference_metadata.sh \
# -n hg19 \
# -r hg19.fa.gz \
# -g gencode.v34lift37.annotation.gtf.gz \
# -s Human \
# -d /root/Drop-seq_tools-2.3.0/ \
# ls *.gz | awk 'BEGIN {FS = "."}{print $1}' | uniq | while read samplename; do
for samplename in CHP2N_2 CHP2T_2; do
mkdir $samplename
cd $samplename
mkdir tmp
echo "============FastqToSam============"
# https://gatkforums.broadinstitute.org/gatk/discussion/6472/read-groups
# More information about the definition of "read group".
# Generally, reads sequenced from a single lane are considered as a read group.
# We can use command:
# ls *.gz | awk 'BEGIN {FS = "_"}{print $1"_"$2}'
# to get sample infomation,
# and:
# ls *.gz | awk 'BEGIN {FS = "_"}{print $3}'
# to get lane infomation.
picard -Xmx16g -Djava.io.tmpdir=`pwd`/tmp FastqToSam \
F1=../$samplename\.raw_1.fq.gz \
F2=../$samplename\.raw_2.fq.gz \
O=unaligned_data.bam \
SM=$samplename \
RG=$samplename
echo "============TagBamWithReadSequenceExtended============"
# 这一步发现部分reads的第3位碱基为N,碱基质量Phred33值为#,对应0.63的错误率,会导致cell barcode和molecular barcode的移位
# BASE_QUALITY=10这一参数会过滤掉这些reads
# 1.Tag cell barcodes
TagBamWithReadSequenceExtended \
INPUT=unaligned_data.bam \
OUTPUT=unaligned_tagged_Cell.bam \
SUMMARY=unaligned_tagged_Cellular.bam_summary.txt \
BASE_RANGE=1-12 \
BASE_QUALITY=10 \
BARCODED_READ=1 \
DISCARD_READ=False \
TAG_NAME=XC \
NUM_BASES_BELOW_QUALITY=1
echo "rm -f unaligned_data.bam"
rm -f unaligned_data.bam
# 2. Tag molecular barcodes
TagBamWithReadSequenceExtended \
INPUT=unaligned_tagged_Cell.bam \
OUTPUT=unaligned_tagged_CellMolecular.bam \
SUMMARY=unaligned_tagged_Molecular.bam_summary.txt \
BASE_RANGE=13-20 \
BASE_QUALITY=10 \
BARCODED_READ=1 \
DISCARD_READ=True \
TAG_NAME=XM \
NUM_BASES_BELOW_QUALITY=1
echo "rm -f unaligned_tagged_Cell.bam"
rm -f unaligned_tagged_Cell.bam
echo "============FilterBam============"
FilterBam \
TAG_REJECT=XQ \
INPUT=unaligned_tagged_CellMolecular.bam \
OUTPUT=unaligned_tagged_filtered.bam
echo "rm -f unaligned_tagged_CellMolecular.bam"
rm -f unaligned_tagged_CellMolecular.bam
# 查看保留的reads数
# samtools view unaligned_tagged_filtered.bam | wc -l
echo "============TrimStartingSequence============"
# 3. Trim 5’ primer sequence
# https://support.illumina.com/bulletins/2016/04/adapter-trimming-why-are-adapter-sequences-trimmed-from-only-the--ends-of-reads.html
# 5' adaptor sequence won't be found in reads using illumina seqtech
# TrimStartingSequence \
# INPUT=unaligned_tagged_filtered.bam \
# OUTPUT=unaligned_tagged_trimmed_smart.bam \
# OUTPUT_SUMMARY=adapter_trimming_report.txt \
# SEQUENCE=AAGCAGTGGTATCAACGCAGAGTGAATGGG \
# MISMATCHES=0 \
# NUM_BASES=5
echo "============PolyATrimmer============"
# 4. Trim 3’ polyA sequence
PolyATrimmer \
INPUT=unaligned_tagged_filtered.bam \
OUTPUT=unaligned_mc_tagged_polyA_filtered.bam \
OUTPUT_SUMMARY=polyA_trimming_report.txt \
# MISMATCHES=0 \
# NUM_BASES=6 \
# USE_NEW_TRIMMER=true
echo "rm -f unaligned_tagged_filtered.bam"
rm -f unaligned_tagged_filtered.bam
# unaligned_mc_tagged_polyA_filtered.bam在retrieve tag infomation时仍需用到
echo "============SamToFastq============"
# 5. SamToFastq
picard -Xmx16g -Djava.io.tmpdir=`pwd`/tmp SamToFastq \
INPUT=unaligned_mc_tagged_polyA_filtered.bam \
FASTQ=unaligned_mc_tagged_polyA_filtered.fastq
echo "============STAR★alignment============"
# 6. STAR★alignment
STAR \
--genomeDir /data/wangzw/dropseqMetadata/STAR \
--readFilesIn unaligned_mc_tagged_polyA_filtered.fastq \
--outFileNamePrefix star
echo "rm -f unaligned_mc_tagged_polyA_filtered.bam"
rm -f unaligned_mc_tagged_polyA_filtered.fastq
echo "============SortSam============"
# 7.Sort STAR alignment in queryname order
picard -Xmx16g -Djava.io.tmpdir=`pwd`/tmp SortSam \
I=starAligned.out.sam \
O=aligned.sorted.bam \
SO=queryname
echo "rm -f starAligned.out.sam"
rm -f starAligned.out.sam
echo "============MergeBamAlignment============"
# 8. Merge STAR alignment tagged SAM to recover cell/molecular barcodes
picard -Xmx16g -Djava.io.tmpdir=`pwd`/tmp MergeBamAlignment \
REFERENCE_SEQUENCE=/data/wangzw/dropseqMetadata/hg19.fasta.gz \
UNMAPPED_BAM=unaligned_mc_tagged_polyA_filtered.bam \
ALIGNED_BAM=aligned.sorted.bam \
OUTPUT=merged.bam \
INCLUDE_SECONDARY_ALIGNMENTS=false \
PAIRED_RUN=false
echo "rm -f unaligned_mc_tagged_polyA_filtered.bam aligned.sorted.bam"
rm -f unaligned_mc_tagged_polyA_filtered.bam aligned.sorted.bam
echo "============TagReadWithGeneExon============"
# 9.Add gene/exon and other annotation tags
TagReadWithGeneFunction \
I=merged.bam \
O=star_gene_exon_tagged.bam \
ANNOTATIONS_FILE=/data/wangzw/dropseqMetadata/hg19.refFlat
echo "rm -f merged.bam"
rm -f merged.bam
# echo "============DetectBeadSubstitutionErrors============"
# 10.Barcode Repair
# this step is based on multiple experimental observations, while the underlying theory is unclear.
## 10.1 Repair substitution errors (DetectBeadSubstitutionErrors)
# DetectBeadSubstitutionErrors \
# I=my.bam \
# O=my_clean_subtitution.bam \
# OUTPUT_REPORT=my_clean.substitution_report.txt
# echo "============DetectBeadSynthesisErrors============"
# 10.Barcode Repair
## 10.2 Repair indel errors (DetectBeadSynthesisErrors)
### SYNTHESIS_MISSING_BASE
### SINGLE_UMI_ERROR
### PRIMER_MATCH
### OTHER
# DetectBeadSynthesisErrors \
# I=star_gene_exon_tagged.bam \
# O=clean.bam \
# REPORT=clean.indel_report.txt \
# OUTPUT_STATS=synthesis_stats.txt \
# SUMMARY=synthesis_stats.summary.txt
# DigitalExpression函数的Filter Option只支持单个
DigitalExpression \
I=star_gene_exon_tagged.bam \
O=$samplename\_out_gene_exon_tagged.dge.txt.gz \
SUMMARY=$samplename\_out_gene_exon_tagged.dge.summary.txt \
MIN_NUM_TRANSCRIPTS_PER_CELL=500
cd ..
# linux命令替换
currentTime=`date`
echo "$samplename has finished. $currentTime."
done
参考内容:http://mccarrolllab.org/dropseq/
网友评论