使用IsoSeq3利用pacbio测序数据进行全长转录本序列分析,为了得到更准确的基因注释作准备,但价格昂贵,且无法进行表达量分析
mkdir -p /home/train/08.RNA-seq_analysis_by_trinity
cd /home/train/08.RNA-seq_analysis_by_trinity
# 对PacBio转录组测序数据进行全长转录本序列分析
# https://isoseq.how/clustering/schematic-workflow.html
mkdir -p /home/train/08.RNA-seq_analysis_by_trinity/IsoSeq
cd /home/train/08.RNA-seq_analysis_by_trinity/IsoSeq
#将subreads转换成ccs数据
PATH=/opt/biosoft/miniconda3_for_pbbioconda/bin/:$PATH
samtools view -h ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/m54086_170204_081430.subreads.bam | head -n 10000 | samtools view -b > m54086_170204_081430.subreads.bam
pbindex m54086_170204_081430.subreads.bam #对pacbio测序结果的bam建立索引
# ccs的正常运行需要提供chemistry.xml测序试剂信息;在运行ccs数据之前配置文件chemistry.xml
# wget https://raw.githubusercontent.com/PacificBiosciences/pbcore/develop/pbcore/chemistry/resources/mapping.xml -O chemistry.xml
cp ~/00.incipient_data/data_for_genome_assembling/chemistry.xml .
perl -p -i -e 's/<SoftwareVersion>5.0/<SoftwareVersion>4.0/' chemistry.xml
export SMRT_CHEMISTRY_BUNDLE_DIR=$PWD#将当前目录写到环境变量
# 运行ccs命令将Subreads转换为CCS数据。默认参数--min-passes为3表示至少有3条全长的subreads;--min-snr为2.5表示信噪比,即SSubreads的准确率至少为2.5/3.5=71.4%。
ccs --min-rq 0.9 m54086_170204_081430.subreads.bam m54086_170204_081430.ccs.bam
#使用samtools对结果进行查看
[train@MiWiFi-R3P-srv IsoSeq]$ samtools view m54086_170204_081430.ccs.bam | les
# 去除引物和barcode碱基信息。
echo '>NEB_5p
GCAATGAAGTCGCAGGGTTGGG
>Clontech_5p
AAGCAGTGGTATCAACGCAGAGTACATGGGG
>NEB_Clontech_3p
GTACTCTGCGTTGATACCACTGCTT' > barcoded_primers.fasta
lima m54086_170204_081430.ccs.bam barcoded_primers.fasta m54086_170204_081430.lima.bam --isoseq --peek-guess
统计筛选后的条数
[train@MiWiFi-R3P-srv IsoSeq]$ samtools view m54086_170204_081430.lima.Clontech_5p--NEB_Clontech_3p.bam | wc -l
656
# 去除PolyA和合体序列,得到FLNC序列
isoseq refine m54086_170204_081430*lima*.bam barcoded_primers.fasta m54086_170204_081430.flnc.bam --require-polya
# 对FLNC reads进行聚类,得到全长转录本序列信息: hq.fasta.gz with predicted accuracy ≥ 0.99; lq.fasta.gz with predicted accuracy < 0.99。
isoseq cluster m54086_170204_081430.flnc.bam clustered.bam --verbose --use-qvs
[train@MiWiFi-R3P-srv IsoSeq]$ samtools view m54086_170204_081430.flnc.bam | wc -l
646
[train@MiWiFi-R3P-srv IsoSeq]$ samtools view clustered.bam | wc -l
41
[train@MiWiFi-R3P-srv IsoSeq]$ ll -t
total 37784
-rw-r--r-- 1 train train 8588 Jul 21 14:36 clustered.cluster
-rw-r--r-- 1 train train 15327 Jul 21 14:36 clustered.hq.fasta.gz
-rw-r--r-- 1 train train 2752 Jul 21 14:36 clustered.lq.fasta.gz
-rw-r--r-- 1 train train 404 Jul 21 14:36 clustered.bam.pbi
-rw-r--r-- 1 train train 21560 Jul 21 14:36 clustered.bam
-rw-r--r-- 1 train train 381 Jul 21 14:36 clustered.hq.bam.pbi
-rw-r--r-- 1 train train 18707 Jul 21 14:36 clustered.hq.bam
-rw-r--r-- 1 train train 109 Jul 21 14:36 clustered.lq.bam.pbi
-rw-r--r-- 1 train train 3352 Jul 21 14:36 clustered.lq.bam
-rw-r--r-- 1 train train 1568 Jul 21 14:36 clustered.transcriptset.xml
-rw-r--r-- 1 train train 6505 Jul 21 14:36 clustered.cluster_report.csv
-rw-r--r-- 1 train train 8195 Jul 21 14:33 m54086_170204_081430.flnc.bam.pbi
-rw-r--r-- 1 train train 1509378 Jul 21 14:33 m54086_170204_081430.flnc.bam
-rw-r--r-- 1 train train 1611 Jul 21 14:33 m54086_170204_081430.flnc.consensusreadset.xml
-rw-r--r-- 1 train train 821 Jul 21 14:33 m54086_170204_081430.flnc.filter_summary.report.json
-rw-r--r-- 1 train train 50339 Jul 21 14:33 m54086_170204_081430.flnc.report.csv
-rw-r--r-- 1 train train 1990 Jul 21 14:29 m54086_170204_081430.lima.Clontech_5p--NEB_Clontech_3p.consensusreadset.xml
-rw-r--r-- 1 train train 3826 Jul 21 14:29 m54086_170204_081430.lima.consensusreadset.xml
-rw-r--r-- 1 train train 639 Jul 21 14:29 m54086_170204_081430.lima.json
-rw-r--r-- 1 train train 108 Jul 21 14:29 m54086_170204_081430.lima.lima.counts
-rw-r--r-- 1 train train 845 Jul 21 14:29 m54086_170204_081430.lima.lima.summary
-rw-r--r-- 1 train train 8342 Jul 21 14:29 m54086_170204_081430.lima.Clontech_5p--NEB_Clontech_3p.bam.pbi
-rw-r--r-- 1 train train 1564566 Jul 21 14:29 m54086_170204_081430.lima.Clontech_5p--NEB_Clontech_3p.bam
-rw-r--r-- 1 train train 101927 Jul 21 14:29 m54086_170204_081430.lima.lima.clips
-rw-r--r-- 1 train train 183452 Jul 21 14:29 m54086_170204_081430.lima.lima.report
-rw-r--r-- 1 train train 118 Jul 21 14:29 m54086_170204_081430.lima.lima.guess
-rw-r--r-- 1 train train 119 Jul 21 14:29 barcoded_primers.fasta
-rw-r--r-- 1 train train 862 Jul 21 14:17 m54086_170204_081430.ccs.ccs_report.txt
-rw-r--r-- 1 train train 61814 Jul 21 14:17 m54086_170204_081430.ccs.zmw_metrics.json.gz
-rw-r--r-- 1 train train 8030 Jul 21 14:17 m54086_170204_081430.ccs.bam.pbi
-rw-r--r-- 1 train train 1694358 Jul 21 14:17 m54086_170204_081430.ccs.bam
-rw-r--r-- 1 train train 8366 Jul 21 14:15 chemistry.xml
-rw-r--r-- 1 train train 89364 Jul 21 14:10 m54086_170204_081430.subreads.bam.pbi
-rw-r--r-- 1 train train 33228366 Jul 21 14:09 m54086_170204_081430.subreads.bam
网友评论