Homolog (近缘种同源蛋白比对)
Transcript (转录本预测)
Ab initio / HMM Training (从头预测)
软件安装
#同源基因预测软件安装
# Installing genewise (https://www.ebi.ac.uk/Tools/psa/genewise/ | https://www.ebi.ac.uk/~birney/wise2/)
#wget https://www.ebi.ac.uk/~birney/wise2/wise2.4.1.tar.gz -P ~/software/
tar zxf ~/software/wise2.4.1.tar.gz -C /opt/biosoft/
cd /opt/biosoft/wise2.4.1/src/
find ./ -name makefile | xargs sed -i 's/glib-config/pkg-config --libs glib-2.0/'
perl -p -i -e 's/getline/get_line/g' ./HMMer2/sqio.c
perl -p -i -e 's/isnumber/isdigit/' models/phasemodel.c
perl -p -i -e 's/csh welcome.csh/sh welcome.csh/' makefile
make all -j 4
export WISECONFIGDIR=/opt/biosoft/wise2.4.1/wisecfg/
make test
echo 'PATH=$PATH:/opt/biosoft/wise2.4.1/src/bin/' >> ~/.bashrc
echo 'export WISECONFIGDIR=/opt/biosoft/wise2.4.1/wisecfg/' >> ~/.bashrc
source ~/.bashrc
# 直接使用genewise无法进行全基因组水平的基因预测,需要自行编写程序来调用genewise进行基因预测。https://github.com/chenlianfu/geta
tar zxf ~/software/geta-2.6.1.tar.gz -C /opt/biosoft/
echo 'PATH=$PATH:/opt/biosoft/geta-2.6.1/bin/' >> ~/.bashrc
source ~/.bashrc
# genewise的预测基因模型结果需要进行过滤处理,此处使用PFAM数据库进行过滤。以下安装Pfam数据库和Hmmer软件
# Installing hmmer (http://www.hmmer.org/)
#wget http://eddylab.org/software/hmmer/hmmer-3.3.2.tar.gz -P ~/software/
tar zxf ~/software/hmmer-3.3.2.tar.gz
cd hmmer-3.3.2/
./configure --prefix=/opt/biosoft/hmmer-3.3.2 && make -j 4 && make install
cd .. && rm -rf hmmer-3.3.2
echo 'PATH=/opt/biosoft/hmmer-3.3.2/bin/:$PATH' >> ~/.bashrc
source ~/.bashrc
cd /opt/biosoft/hmmer-3.3.2
# Installing Pfam v27 (http://pfam.xfam.org/ | ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam27.0/)
#wegt ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam27.0/Pfam-A.hmm.gz -O ~/software/Pfam-A_V27.hmm.gz
#wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam27.0/Pfam-B.hmm.gz -O ~/software/Pfam-B_V27.hmm.gz
mkdir -p /opt/biosoft/bioinfomatics_databases/Pfam
gzip -dc ~/software/Pfam-A_V27.hmm.gz > /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-AB.hmm
gzip -dc ~/software/Pfam-B_V27.hmm.gz >> /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-AB.hmm
hmmpress /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-AB.hmm
rm /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-AB.hmm
# Installing GenomeThreader (http://genomethreader.org/)
#wget http://genomethreader.org/distributions/gth-1.7.3-Linux_x86_64-64bit.tar.gz -P ~/software/
tar zxf ~/software/gth-1.7.3-Linux_x86_64-64bit.tar.gz -C /opt/biosoft/
echo 'PATH=$PATH:/opt/biosoft/gth-1.7.3-Linux_x86_64-64bit/bin/' >> ~/.bashrc
source ~/.bashrc
# install ProtHint (https://github.com/gatech-genemark/ProtHint)
#wget https://github.com/gatech-genemark/ProtHint/releases/download/v2.6.0/ProtHint-2.6.0.tar.gz -P ~/software/
tar zxf ~/software/ProtHint-2.6.0.tar.gz -C /opt/biosoft/
ln -s /opt/biosoft/ProtHint-2.6.0/bin/* /opt/biosoft/gth-1.7.3-Linux_x86_64-64bit/bin/
# Installing exonerate (https://www.ebi.ac.uk/about/vertebrate-genomics/software/exonerate)
#wget http://ftp.ebi.ac.uk/pub/software/vertebrategenomics/exonerate/exonerate-2.2.0-x86_64.tar.gz -P ~/software/
tar zxf ~/software/exonerate-2.2.0-x86_64.tar.gz -C /opt/biosoft/
echo 'PATH=$PATH:/opt/biosoft/exonerate-2.2.0-x86_64/bin/' >> ~/.bashrc
source ~/.bashrc
# Installing DIAMOND (https://github.com/bbuchfink/diamond)#类似ncbi blast+,且比其快
#wget https://github.com/bbuchfink/diamond/archive/refs/tags/v2.1.8.tar.gz -O ~/software/diamond-2.1.8.tar.gz
#wget https://github.com/bbuchfink/diamond/releases/download/v2.1.8/diamond-linux64.tar.gz -P ~/software/
tar zxf ~/software/diamond-2.1.8.tar.gz && cd diamond-2.1.8
cmake ./ -DCMAKE_INSTALL_PREFIX=/opt/biosoft/diamond-2.1.8 && make -j 4 && make install
cd .. && rm -rf diamond-2.1.8
echo 'PATH=$PATH:/opt/biosoft/diamond-2.1.8/bin' >> ~/.bashrc
source ~/.bashrc
#从头预测软件安装
tar zxf ~/software/bamtools-2.5.2.tar.gz
cd bamtools-2.5.2/
mkdir build
cd build/
cmake ../ -DCMAKE_INSTALL_PREFIX=/opt/biosoft/bamtools-2.5.2
make -j 4
make install
sudo cp -a /opt/biosoft/bamtools-2.5.2/include/bamtools/ /usr/include/
sudo cp -a /opt/biosoft/bamtools-2.5.2/lib64/* /usr/lib64/
cd ../../ && rm bamtools-2.5.2/ -rf
# Installing boost C++ Libraries (https://www.boost.org/)
#wget https://boostorg.jfrog.io/artifactory/main/release/1.82.0/source/boost_1_82_0.tar.gz -P ~/software
tar zxf ~/software/boost_1_82_0.tar.gz
cd boost_1_82_0/
./bootstrap.sh --prefix=/opt/biosoft/boost_1_82_0
./b2 -j 4 install
cd .. && rm boost_1_82_0/ -rf
sudo ln -s /opt/biosoft/boost_1_82_0/include/boost /usr/include/
echo 'export LD_LIBRARY_PATH=/opt/biosoft/boost_1_82_0/lib:$LD_LIBRARY_PATH
export C_INCLUDE_PATH=/opt/biosoft/boost_1_82_0/include:$C_INCLUDE_PATH' >> ~/.bashrc
source ~/.bashrc
# Installing lpsolve (https://sourceforge.net/projects/lpsolve/)
#wget https://sourceforge.net/projects/lpsolve/files/lpsolve/5.5.2.11/lp_solve_5.5.2.11_source.tar.gz -P ~/software
tar zxf ~/software/lp_solve_5.5.2.11_source.tar.gz -C /opt/sysoft/
cd /opt/sysoft/lp_solve_5.5/lpsolve55
sh ccc
echo 'export LD_LIBRARY_PATH=/opt/sysoft/lp_solve_5.5/lpsolve55/bin/ux64:$LD_LIBRARY_PATH' >> ~/.bashrc
source ~/.bashrc
# Installing Mysql++ (https://tangentsoft.com/mysqlpp)
#wget https://tangentsoft.com/mysqlpp/releases/mysql++-3.3.0.tar.gz -P ~/software/
tar zxf ~/software/mysql++-3.3.0.tar.gz && cd mysql++-3.3.0
./configure --prefi=/opt/sysoft/mysql++-3.3.0 && make -j 4 && make install
cd .. && rm -rf mysql++-3.3.0
echo 'export LD_LIBRARY_PATH=/opt/sysoft/mysql++-3.3.0/lib:$LD_LIBRARY_PATH' >> ~/.bashrc
source ~/.bashrc
#wget https://github.com/Gaius-Augustus/Augustus/archive/refs/tags/v3.5.0.tar.gz -O ~/software/Augustus-3.5.0.tar.gz
sudo dnf --disablerepo=* --enablerepo=media-* -y install gsl gsl-devel sqlite sqlite-devel suitesparse suitesparse-devel mariadb mariadb-server
tar zxf ~/software/Augustus-3.5.0.tar.gz -C /opt/biosoft/
cd /opt/biosoft/Augustus-3.5.0/
perl -p -i -e 's#/usr/include/lpsolve#/opt/sysoft/lp_solve_5.5#' src/Makefile
perl -p -i -e 's#/usr/include/mysql#/opt/sysoft/mysql++-3.3.0/include -I/usr/include/mysql/#' src/Makefile
export LIBRARY_PATH=/opt/biosoft/boost_1_82_0/lib:/opt/sysoft/lp_solve_5.5/lpsolve55/bin/ux64:$LIBRARY_PATH
ln -s /usr/lib64/libcolamd.so.2 /usr/lib64/libcolamd.so
make augustus -j 4
#BUSCO软件安装
# Installing BUSCO (https://busco.ezlab.org/)
# wget https://gitlab.com/ezlab/busco/-/archive/5.4.7/busco-5.4.7.tar.gz -P ~/software/
tar zxf ~/software/busco-5.4.7.tar.gz -C /opt/biosoft
# installing Python modules: biopython pandas
pip3 install biopython pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
# installing BBTools (https://jgi.doe.gov/data-and-tools/software-tools/bbtools/)
#wget https://sourceforge.net/projects/bbmap/files/BBMap_39.01.tar.gz -P ~/software/
tar zxf ~/software/BBMap_39.01.tar.gz -C /opt/biosoft
echo 'PATH=$PATH:/opt/biosoft/bbmap/' >> ~/.bashrc
source ~/.bashrc
# installing NCBI-Blast+, Augustus and R (Already satisfied)
# installing meaeuk (https://github.com/soedinglab/metaeuk)
#wget https://github.com/soedinglab/metaeuk/releases/download/6-a5d39d9/metaeuk-linux-avx2.tar.gz -P ~/software/
tar zxf ~/software/metaeuk-linux-avx2.tar.gz -C /opt/biosoft/
echo 'PATH=$PATH:/opt/biosoft/metaeuk/bin/' >> ~/.bashrc
source ~/.bashrc
# installing Prodigal (https://github.com/hyattpd/Prodigal)
#wget https://github.com/hyattpd/Prodigal/releases/download/v2.6.3/prodigal.linux -P ~/software/
#wget https://github.com/hyattpd/Prodigal/archive/refs/tags/v2.6.3.tar.gz -O ~/software/Prodigal-2.6.3.tar.gz
#tar zxf ~/software/Prodigal-2.6.3.tar.gz -C /opt/biosoft
#cd /opt/biosoft/Prodigal-2.6.3 && make -j 4
mkdir /opt/biosoft/Prodigal-2.6.3/
cp /home/train/software/prodigal.linux /opt/biosoft/Prodigal-2.6.3/prodigal
chmod 755 /opt/biosoft/Prodigal-2.6.3/prodigal
echo 'PATH=$PATH:/opt/biosoft/Prodigal-2.6.3/' >> ~/.bashrc
source ~/.bashrc
# configuring BUSCO
cd /opt/biosoft/busco-5.4.7
python3 setup.py install
perl -i -e '$/ = "\n\n"; while (<>) { if (m/command\s*=\s*(\S+)/) { my $path = `which $1`; $path =~ s/\s*$//; if ($path) { $path =~ s/(.*\/).*/$1/; s/path\s*=.*/path = $path/; } } print; }' config/config.ini
echo 'PATH=/opt/biosoft/busco-5.4.7/bin:$PATH
export BUSCO_CONFIG_FILE=/opt/biosoft/busco-5.4.7/config/config.ini' >> ~/.bashrc
source ~/.bashrc
# installing BUSCO Version 5 Databases #下载所有的数据库
mkdir -p /opt/biosoft/bioinfomatics_databases/BUSCO_V5
cd /opt/biosoft/bioinfomatics_databases/BUSCO_V5
#wget https://busco-data.ezlab.org/v5/data/information/lineages_list.2021-12-14.txt.tar.gz -P ~/software/
tar zxf ~/software/lineages_list.2021-12-14.txt.tar.gz &&
perl -e 'open IN, $ARGV[0]; while (<IN>) { last if m/^\s*eukaryota_odb10/; } $info{"eukaryota_odb10"} = 1; while (<IN>) { $info{$1} = 1 if m/-\s*(\S+)/; } my $curl = `curl https://busco-data.ezlab.org/v5/data/lineages/`; foreach (keys %info) { print "wget -c https://busco-data.ezlab.org/v5/data/lineages/$1\n" if $curl =~ m/href=\"($_.*?.tar.gz)/;; }' lineages_list.2021-12-14.txt > command.wget.list
ParaFly -c command.wget.list -CPU 5
for i in `ls *.tar.gz`
do
tar zxf $i
done
# 下载几个本次培训需要用到的数据库
#wget https://busco-data.ezlab.org/v5/data/lineages/fungi_odb10.2021-06-28.tar.gz -P ~/software/BUSCO_databases
#wget https://busco-data.ezlab.org/v5/data/lineages/basidiomycota_odb10.2020-09-10.tar.gz -P ~/software/BUSCO_databases
cd /opt/biosoft/bioinfomatics_databases/BUSCO_V5
tar zxf ~/software/BUSCO_databases/fungi_odb10.2021-06-28.tar.gz
tar zxf ~/software/BUSCO_databases/basidiomycota_odb10.2020-09-10.tar.gz
## 1. 使用 PASA 利用转录本序列进行基因预测
略过
## 2. 使用 genewise 利用同源蛋白进行基因预测
mkdir -p /home/train/10.gene_prediction/homolog
cd /home/train/10.gene_prediction/homolog
ln -s ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/genome.softmask.fasta genome.fasta
gzip -dc /home/train/00.incipient_data/data_for_gene_prediction_and_RNA-seq/GCF_000328475.2_Umaydis521_2.0_protein.faa.gz > homolog.fasta#近缘物种越多越好,建议10个
[train@MiWiFi-R3P-srv homolog]$ grep ">" homolog.fasta |head
>XP_011385989.1 hypothetical protein UMAG_00001 [Ustilago maydis 521]
>XP_011385990.1 hypothetical protein UMAG_00002 [Ustilago maydis 521]
>XP_011385991.1 hypothetical protein UMAG_00003 [Ustilago maydis 521]
>XP_011385992.1 putative Benzoate 4-monooxygenase cytochrome P450 [Ustilago maydis 521]
>XP_011385993.1 hypothetical protein UMAG_00006 [Ustilago maydis 521]
>XP_011385994.1 hypothetical protein UMAG_00007 [Ustilago maydis 521]
>XP_011385995.1 hypothetical protein UMAG_00008 [Ustilago maydis 521]
>XP_011385996.1 hypothetical protein UMAG_00009 [Ustilago maydis 521]
>XP_011385997.1 putative amidase [Ustilago maydis 521]
>XP_011385998.1 hypothetical protein UMAG_00012 [Ustilago maydis 521]
#由于序列名不标准,通过下面命令进行改名
perl -p -i -e 'if (m/^>/) { s/\s+.*//; s/\./_/g; }' homolog.fasta
[train@MiWiFi-R3P-srv homolog]$ grep ">" homolog.fasta |head
>XP_011385989_1
>XP_011385990_1
>XP_011385991_1
>XP_011385992_1
>XP_011385993_1
>XP_011385994_1
>XP_011385995_1
>XP_011385996_1
>XP_011385997_1
>XP_011385998_1
#perl -pe 's/\*$//' ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/homolog_proteins_from_6_fungi_species.fasta >> homolog.fasta
homolog_prediction --cpu 8 --identity 0.2 --evalue 1e-9 --subject_coverage 0.3 --method all --genetic_code 1 --segmentSize 100000 --overlapSize 10000 --tmp_dir homolog_prediction.tmp homolog.fasta genome.fasta > homolog.gff3
cd ..
## 3. 使用 AUGUSTUS 进行基因预测(目前从头预测最好用的软件)
# 3.1 AUGUSTUS Training
mkdir -p /home/train/10.gene_prediction/augustus/training
cd /home/train/10.gene_prediction/augustus/training
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
# 提取准确完好的基因模型
perl -e '$/ = "\n\n"; while (<>) { print if (m/Type=excellent/ or m/Type=good/); }' ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/GFF3_files/evidence_gene_models.gff3 > geneModels.gff3
geneModels2AugusutsTrainingInput --cpu 8 --min_evalue 1e-9 --min_identity 0.7 --min_coverage_ratio 0.7 --min_cds_num 1 --min_cds_length 450 --min_cds_exon_ratio 0.4 --keep_ratio_for_excluding_too_long_gene 0.99 --out_prefix ati geneModels.gff3 genome.fasta#--min_cds_exon_ratio 0.4 若低于40%可能是lncrna
# real 0m44.092s
# user 3m35.421s
# sys 0m30.430s
# ati.filter1.gff3是去冗余后的基因模型;ati.filter2.gff3是完整的基因模型。这些基因模型可以用于其它 ab initio 基因预测软件的 HMM trainning 。
# 利用已有的准确的基因模型进行HMM Training,使用BGM2AT程序自动化运行
#BGM2AT --flanking_length 100 --CPU 8 --optimize_augustus_rounds 5 --onlytrain_GFF3 ati.filter1.gff3 ati.filter2.gff3 genome.fasta malassezia_sympodialis
# real 60m24.233s
# user 408m17.384s
# sys 5m23.935s
# 手动分步运行
# 将 GFF3 文件转换为 GeneBank 格式
gff2gbSmallDNA.pl ati.filter2.gff3 genome.fasta 100 genes.raw.gb
# 去除错误的基因
new_species.pl --species=for_bad_genes_removing #获得初始的规律文件,并写入/opt/biosoft/Augustus-3.5.0/config/species/中
etraining --species=for_bad_genes_removing --stopCodonExcludedFromCDS=false genes.raw.gb 2> train.err #找到有问题的基因模型
cat train.err | perl -ne 'print "$1\n" if m/in sequence (\S+):/' | sort | uniq > badgenes.lst #抓取有问题的基因模型
filterGenes.pl badgenes.lst genes.raw.gb > genes.gb
[train@MiWiFi-R3P-srv training]$ grep LOCUS genes.gb | wc -l
2076
经过对有问题的基因模型进行去除之后,开始进行正常的training
#cp ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/genes.gb ./
randomSplit.pl genes.gb 300 #对基因模型进行分割,随机生成两个文件分别包含300,1776
new_species.pl --species=malassezia_sympodialis
# 第一次training
etraining --species=malassezia_sympodialis genes.gb.train > train.out
perl -e 'open IN, "train.out"; while (<IN>) { $tag = $1 if m/tag:.*\((.*)\)/; $taa = $1 if m/taa:.*\((.*)\)/; $tga = $1 if m/tga:.*\((.*)\)/; } while (<>) { s#/Constant/amberprob.*#/Constant/amberprob $tag#; s#/Constant/ochreprob.*#/Constant/ochreprob $taa#; s#/Constant/opalprob.*#/Constant/opalprob $tga#; print }' /opt/biosoft/Augustus-3.5.0/config/species/malassezia_sympodialis/malassezia_sympodialis_parameters.cfg > 11
mv 11 /opt/biosoft/Augustus-3.5.0/config/species/malassezia_sympodialis/malassezia_sympodialis_parameters.cfg
augustus --species=malassezia_sympodialis genes.gb.test | tee firsttest.out
# 使用optimize_augustus.pl进行循环training找最优参数
# genes.gb.train中包含1777个基因模型。对其再次进行分割,取其中 400 个基因模型用于对HMM模型参数进行优化时的准确性检测,剩下1377个仅加入到training的过程中。本次优化过程如下:对某一个参数进行优化的时候,将400个基因模型随机分成8份,每份的基因模型数目为50个;取其中7份的基因模型和1377个基因模型用于etraining,再使用剩下的1份基因模型进行准确性检测;总共有8份数据,这8份数据并行化运行,得到8个准确性检测值,取其均值用于参数的优化。
# 因此,为了能更准确快速的进行HMM参数文件优化,则需要注意两点:(1)根据计算资源确定并行数。当然,并行数越大,则准确性检测的值更准确。(2)用于准确性检测的基因模型数目。该数目除以并行数的值不要大于100,否则,每次准确性检测耗时会很长。当然,用于准确性检测的基因模型数目除以并行数的值越大,则准确性检测的值更准确。这个需要在准确性和耗时中权衡该值,推荐50-100。
# 若用于training的基因模型数目很多,比如有4500个;同时计算资源达80线程。则可以设置:先将4500个基因模型分成200和4300个,其中200个用于参数文件的准确性检测;然后再将4300个分成300和4000个,在优化参数过程中,设置并行化数80,这4000个基因模型用于training和准确性检测(并行化的每个线程中随机从4000个基因中取50个用于准确性检测),而这300个仅用于training。
randomSplit.pl genes.gb.train 1377
ln -s genes.gb.train.test training.gb.onlytrain
optimize_augustus.pl --species=malassezia_sympodialis --rounds=5 --cpus=8 --kfold=8 --onlytrain=training.gb.onlytrain genes.gb.train.train > optimize.out 2> /dev/null
# real 38m26.834s
# user 2.5.34.136s
# sys 2m43.755s
# accuracy value = (3*nucleotide_sensitivity + 2*nucleotide_specificity + 4*exon_sensitivity + 3*exon_specificity + 2*gene_sensitivity + 1*gene_specificity)/15
# Commonly observed values at this position range from 40 to 60 percent. If you obtain a very low value, this gives a strong indication that the obtained parameter set is not very useful for predicting genes accurately.
# 第二次training
etraining --species=malassezia_sympodialis genes.gb.train
augustus --species=malassezia_sympodialis genes.gb.test | tee secondtest.out
cd ..
# 使用 RNA-seq 数据制作 hints 文件
mkdir -p /home/train/10.gene_prediction/augustus/making_hints
cd /home/train/10.gene_prediction/augustus/making_hints
samtools merge -@ 8 rnaseq.bam ~/06.reads_aligment/hisat2/*.sam
samtools sort -@ 8 -O bam -o rnaseq.sort.bam rnaseq.bam
bam2hints --intronsonly --in=rnaseq.sort.bam --out=hints.gff #通过bam文件找到intron信息
# 推荐使用masked重复序列后的基因组序列进行转录组测序数据的比对,得到hints信息。
# 运行 Augustus 进行基因预测
cd /home/train/10.gene_prediction/augustus
ln -s ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/genome.softmask.fasta genome.fasta
tar zxf ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/augustus_trainingOut_malassezia_sympodialis.tar.gz -C /opt/biosoft/Augustus-3.5.0/config/species
#augustus --species=malassezia_sympodialis --extrinsicCfgFile=/opt/biosoft/augustus/config/extrinsic/extrinsic.M.RM.E.W.cfg --alternatives-from-evidence=true --allow_hinted_splicesites=gcag,atac --min_intron_len=20 --softmasking=1 --hintsfile=making_hints/hints.gff --gff3=on genome.fasta > aug.gff3 #该命令是标准的做法,但是只消耗一个线程,速度较慢,可以选择下面命令加快速度
paraAugusutusWithHints --cpu 8 --species malassezia_sympodialis --min_intron_len 20 genome.fasta making_hints/hints.gff > augustus.gff3 #该文件没有exon信息,转录组表达量分析不会识别,需要进一步操作
[train@MiWiFi-R3P-srv augustus]$ paraAugusutusWithHints
Usage:
perl /opt/biosoft/geta-2.6.1/bin/paraAugusutusWithHints [options] genome.fasta hints.gff
程序将基因组序列和hints信息分成多份,然后调用augustus进行并行化计算,最后合并并行化结果,得到最终的GFF3结果文件。注意:输入的额hints.gff3文件必须按染色体名和位置进行排序。
--gene_prefix <string> default: augustus
设置基因ID前缀
--species <string> default: None
设置用于进行Augustus基因预测的HMM模型数据。
--cpu <int> default: 1
设置并行运行augustus命令的数目。
--segmentSize <int> default: 5000000
--overlapSize <int> default: 100000
程序将基因组序列分割成单条进行基因预测;若单条序列长度超过5Mb,则将单条序列进行切割,分割成5Mb的序列后再进行基因预测;此时,两条相邻的序列间重叠的长度为100kb。
--tmp_dir <string> default: aug_para_with_hints.tmp
设置临时文件夹
--alternatives_from_evidence <bool> default: True
是否进行可变剪接分析。
--min_intron_len <int> default: 30
设置augustus预测时最小的intron长度。
GFF3Clear --gene_prefix aug --genome genome.fasta --GFF3_source AUGUSTUS augustus.gff3 > out; mv out augustus.gff3
cd ..
## 使用 GeneMark-ES/ET 进行基因预测 #对真菌物种比较准确
mkdir -p /home/train/10.gene_prediction/genemark_es_et/es
cd /home/train/10.gene_prediction/genemark_es_et/es
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
/opt/biosoft/gmes_linux_64_4/gmes_petap.pl --sequence genome.fasta --ES --fungus --cores 8
# real 2m40.876s
# user 11m22.678s
# sys 0m4.806s
gtfToGff3.pl genemark.gtf > genemark.gff3
GFF3Clear --genome genome.fasta --no_attr_add --GFF3_source GeneMarkES --gene_prefix gmes genemark.gff3 > ../genemarkES.gff3
mkdir -p /home/train/10.gene_prediction/genemark_es_et/et
cd /home/train/10.gene_prediction/genemark_es_et/et
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
#bet_to_gff.pl --bed ~/06.reads_aligment/tophat/A1/junctions.bed --gff introns.gff --label tophat2 --seq genome.fasta
hints2genemarkETintron.pl genome.fasta ../../augustus/making_hints/hints.gff > genemartET.intron.gff
/opt/biosoft/gmes_linux_64_4/gmes_petap.pl --sequence genome.fasta --ET genemartET.intron.gff --fungus --et_score 10 --cores 8
# real 2m13.495s
# user 10m30.754s
# sys 0m4.878s
gtfToGff3.pl genemark.gtf > genemark.gff3
GFF3Clear --genome genome.fasta --no_attr_add --GFF3_source GeneMarkET --gene_prefix gmet genemark.gff3 > ../genemarkET.gff3
cd ..
## 使用SNAP进行基因预测
mkdir -p /home/train/10.gene_prediction/snap
cd /home/train/10.gene_prediction/snap
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
export PERL5LIB=$PERL5LIB:/opt/biosoft/EVM_r2012-06-25/PerlLib/
/opt/biosoft/EVM_r2012-06-25/OtherGeneFinderTrainingGuide/SNAP/gff3_to_SNAP_train.pl ../augustus/training/ati.filter2.gff3 genome.fasta
fathom genome.ann genome.dna -gene-stats &> gene-stats.log
fathom genome.ann genome.dna -validate &> validate.log
perl -ne 'print "$1\n" if /.*:\s+(\S+)\s+OK/' validate.log > zff2keep.txt
perl -e 'open IN, "zff2keep.txt"; while (<IN>) { chomp; $keep{$_} = 1; } while (<>) { if (m/>/) { print; } else { chomp; @_ = split /\t/; print "$_\n" if exists $keep{$_[-1]}; } }' genome.ann > out;
mv out genome.ann
fathom genome.ann genome.dna -categorize 200
rm alt.* err.* olp.* wrn.*
fathom genome.ann genome.dna -export 200 -plus
mkdir params; cd params
forge ../export.ann ../export.dna
cd ..
hmm-assembler.pl species params > species.hmm
snap species.hmm genome.fasta > snap_out.zff
# real 0m32.237s
# user 0m31.768s
# sys 0m0.461s
/opt/biosoft/EVM_r2012-06-25/OtherGeneFinderTrainingGuide/SNAP/SNAP_output_to_gff3.pl snap_out.zff genome.fasta > snap_out.gff3
GFF3Clear --genome genome.fasta --no_attr_add --GFF3_source SNAP --gene_prefix snap snap_out.gff3 > snap.gff3
cd ..
## 使用 MAKER 进行基因预测
mkdir -p /home/train/10.gene_prediction/maker
cd /home/train/10.gene_prediction/maker
# 准备输入文件
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
ln -s ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/Trinity.fasta ./
gzip -dc /home/train/00.incipient_data/data_for_gene_prediction_and_RNA-seq/GCF_000328475.2_Umaydis521_2.0_protein.faa.gz > homolog.fasta
perl -p -i -e 'if (m/^>/) { s/\s+.*//; s/\./_/g; }' homolog.fasta
ln -s ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/species-families.fa ./
ln -s /home/train/10.gene_prediction/genemark_es_et/es/output/gmhmm.mod ./
ln -s /home/train/10.gene_prediction/snap/species.hmm ./
# 准备配置文件
PATH=/opt/biosoft/tRNAscan-SE-1.3.1/bin/:/opt/biosoft/maker-3.01.04/bin:$PATH
export PERL5LIB=$PERL5LIB:/opt/biosoft/tRNAscan-SE-1.3.1/bin/
/opt/biosoft/maker-3.01.04/bin/maker -CTL
perl -p -i -e 's/^genome=.*/genome=genome.fasta/; s/^est=.*/est=Trinity.fasta/; s/^protein=.*/protein=homolog.fasta/; s/^model_org=.*/model_org=Fungi/; s/^rmlib=.*/rmlib=species-families.fa/; s/^augustus_species=.*/augustus_species=malassezia_sympodialis/; s/^snaphmm=.*/snaphmm=species.hmm/; s/^gmhmm=.*/gmhmm=gmhmm.mod/; s/^est2genome=.*/est2genome=1/; s/^protein2genome=.*/protein2genome=1/; s/^trna=.*/trna=1/; s/^correct_est_fusion=.*/correct_est_fusion=1/; s/^keep_preds=.*/keep_preds=1/;' maker_opts.ctl
# 并行运行maker
#sudo dnf install mpich mpich-devel
export PKG_CONFIG_PATH=/usr/lib64/mpich/lib/pkgconfig:$PKG_CONFIG_PATH
export LD_LIBRARY_PATH=/usr/lib64/mpich/lib/:$LD_LIBRARY_PATH
export C_INCLUDE_PATH=/usr/include/mpich-x86_64/:$C_INCLUDE_PATH
export PATH=/usr/lib64/mpich/bin/:$PATH
ulimit -n 102400; ulimit -u 102400; ulimit -s 102400
mpiexec -n 8 /opt/biosoft/maker-3.01.04/bin/maker -fix_nucleotides &> maker.log
# 若程序运行失败,单独运行 /opt/biosoft/maker-3.01.04/bin/maker 命令,检查失败原因。可能是 Repeatmakser 软件需要重新配置。推荐手动修改其配置文件:
####################
#cat <<EOF > aa.pl
#my \$info;
#while (<>) { \$info .= \$_; }
#\$info =~ s/(DEFAULT_SEARCH_ENGINE.*?)'value' => ''/\$1'value' => 'rmblast'/s;
#print \$info;
#EOF
#perl aa.pl /opt/biosoft/RepeatMasker-4.1.5/RepeatMaskerConfig.pm > aa.txt
#mv aa.txt /opt/biosoft/RepeatMasker-4.1.5/RepeatMaskerConfig.pm; rm aa.pl
####################
cd genome.maker.output
gff3_merge -d genome_master_datastore_index.log
grep -P "\tmaker\t" genome.all.gff > genome.maker.gff3
GFF3Clear --genome ../genome.fasta --no_attr_add --GFF3_source MAKER --gene_prefix maker genome.maker.gff3 > ../maker.gff3
cd ../../
# 使用 BRAKER 进行基因预测
mkdir /home/train/10.gene_prediction/braker
cd /home/train/10.gene_prediction/braker
ln -s ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/genome.softmask.fasta genome.fasta
gzip -dc /home/train/00.incipient_data/data_for_gene_prediction_and_RNA-seq/GCF_000328475.2_Umaydis521_2.0_protein.faa.gz > homolog.fasta
perl -p -i -e 'if (m/^>/) { s/\s+.*//; s/\./_/g; }' homolog.fasta
ln -s ../augustus/making_hints/rnaseq.sort.bam ./
export AUGUSTUS_CONFIG_PATH=/opt/biosoft/Augustus-3.5.0/config/
export AUGUSTUS_BIN_PATH=/opt/biosoft/Augustus-3.5.0/bin/
export AUGUSTUS_SCRIPTS_PATH=/opt/biosoft/Augustus-3.5.0/scripts/
export GENEMARK_PATH=/opt/biosoft/GeneMark-ETP/bin/
export BAMTOOLS_PATH=/opt/biosoft/bamtools-2.5.2/bin/
export SAMTOOLS_PATH=/opt/biosoft/samtools-1.17/bin/
export ALIGNMENT_TOOL_PATH=/opt/biosoft/gth-1.7.3-Linux_x86_64-64bit/bin/
export BLAST_PATH=/opt/biosoft/rmblast-2.14.0/bin/
export PYTHON3_PATH=/opt/sysoft/Python-3.11.4/bin
export DIAMOND_PATH=/opt/biosoft/diamond-2.1.8/bin/
export PROTHINT_PATH=/opt/biosoft/ProtHint-2.6.0/bin/
export CDBTOOLS_PATH=/opt/biosoft/PASApipeline-v2.5.2/bin/
export TSEBRA_PATH=/opt/biosoft/TSEBRA-1.1.0/bin
braker.pl --species=malassezia_sympodialis_braker --genome=genome.fasta --bam=rnaseq.sort.bam --prot_seq=homolog.fasta --threads=8
# real 10m21.224s
# user 41m42.403s
# sys 0m54.929s
gtfToGff3.pl braker/braker.gtf > braker3.gff3
GFF3Clear --genome genome.fasta --no_attr_add --GFF3_source BRAKER --gene_prefix braker braker3.gff3 > out; mv out braker3.gff3
## 使用GETA进行基因预测
mkdir -p /home/train/10.gene_prediction/GETA
cd /home/train/10.gene_prediction/GETA
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
# wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/328/475/GCF_000328475.2_Umaydis521_2.0/GCF_000328475.2_Umaydis521_2.0_protein.faa.gz -P ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/
gzip -dc /home/train/00.incipient_data/data_for_gene_prediction_and_RNA-seq/GCF_000328475.2_Umaydis521_2.0_protein.faa.gz > homolog.fasta
perl -p -i -e 'if (m/^>/) { s/\s+.*//; s/\./_/g; }' homolog.fasta
ln -s ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/species-families.fa species-families.fa
cat ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/*.1.fastq > reads.1.fastq
cat ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/*.2.fastq > reads.2.fastq
geta.pl --cpu 8 --RM_species fungi --RM_lib species-families.fa --pe1 reads.1.fastq --pe2 reads.2.fastq --protein homolog.fasta --augustus_species malassezia_sympodialis --HMM_db /opt/biosoft/bioinfomatics_databases/Pfam/Pfam-AB.hmm --gene_prefix MS01Gene --out_prefix Malassezia_sympodialis_V01 genome.fasta &> geta.log
cd ..
## 使用BUSCO进行基因组完整性分析,对各种预测软件结果进行评估
mkdir -p /home/train/10.gene_prediction/BUSCO
cd /home/train/10.gene_prediction/BUSCO
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
# 准备GFF3文件
cp -a ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/GFF3_files ./
[train@MiWiFi-R3P-srv GFF3_files]$ ls
augustus.gff3 braker3.gff3 evidence_gene_models.gff3 genemarkES.gff3 genemarkET.gff3 geta.gff3 homolog.gff3 maker.gff3 NGSReads_prediction.gff3 pasa.gff3 snap.gff3
# 提取蛋白序列
mkdir protein_files
for i in `ls GFF3_files/*.gff3`
do
x=${i/*\//}
x=${x/.gff3/}
echo "gff3_to_sequences.pl --out_prefix protein_files/$x --only_gene_sequences --only_coding_gene_sequences --only_first_isoform genome.fasta $i" #先将gff3文件转化为蛋白序列
done > command.gff3_to_protein.list
ParaFly -c command.gff3_to_protein.list -CPU 4
rm command.gff3_to_protein.list*
# 获取NCBI注释的蛋白序列
#wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/349/305/GCF_000349305.1_ASM34930v2/GCF_000349305.1_ASM34930v2_protein.faa.gz -P ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/
gzip -dc /home/train/00.incipient_data/data_for_gene_prediction_and_RNA-seq/GCF_000349305.1_ASM34930v2_protein.faa.gz > protein_files/ncbi.protein.fasta
# BUSCO分析
mkdir busco_out
for i in `ls protein_files/*.protein.fasta`
do
x=${i/*\//}
x=${x/.protein.fasta/}
echo "cd busco_out; busco -i ../$i -c 8 -o $x -m proteins -l /opt/biosoft/bioinfomatics_databases/BUSCO_V5/basidiomycota_odb10 --offline"
done > command.BUSCO.list #比对到担子菌数据库里面
[train@MiWiFi-R3P-srv augustus]$ ls /opt/biosoft/bioinfomatics_databases/BUSCO_V5/basidiomycota_odb10/hmms/ | wc -l
1764 #在该类中收录了1764个模型
ParaFly -c command.BUSCO.list -CPU 1
rm command.BUSCO.list*
grep C: */*/short_summary*.txt | perl -pe 's/.*odb10.(.+?).txt/$1/; $str = " " x (20 - length($1)); s/^/$str/;' > BUSCO.summary.txt
[train@MiWiFi-R3P-srv BUSCO]$ cat BUSCO.summary.txt
augustus: C:86.5%[S:86.2%,D:0.3%],F:2.4%,M:11.1%,n:1764
braker3: C:56.1%[S:56.0%,D:0.1%],F:1.5%,M:42.4%,n:1764
evidence_gene_models: C:82.0%[S:81.8%,D:0.2%],F:3.5%,M:14.5%,n:1764
genemarkES: C:67.2%[S:66.9%,D:0.3%],F:3.9%,M:28.9%,n:1764
genemarkET: C:82.9%[S:82.6%,D:0.3%],F:3.7%,M:13.4%,n:1764
geta: C:87.9%[S:87.5%,D:0.4%],F:1.8%,M:10.3%,n:1764
homolog: C:59.2%[S:59.0%,D:0.2%],F:1.6%,M:39.2%,n:1764
maker: C:86.0%[S:85.7%,D:0.3%],F:2.5%,M:11.5%,n:1764
ncbi: C:71.1%[S:71.0%,D:0.1%],F:5.0%,M:23.9%,n:1764
NGSReads_prediction: C:76.8%[S:76.6%,D:0.2%],F:3.5%,M:19.7%,n:1764
pasa: C:52.4%[S:52.1%,D:0.3%],F:13.4%,M:34.2%,n:1764
snap: C:78.4%[S:78.1%,D:0.3%],F:4.1%,M:17.5%,n:1764
cd busco_out
ln -s */short_summary* .
/opt/biosoft/busco-5.4.7/scripts/generate_plot.py -wd ./ -rt specific
perl -p -i -e 's/my_family, colour/my_family, face="italic", colour/ if m/axis.text.y/; s/size=1/size=0.4/; s/\%BUSCO/\% BUSCO/;' busco_figure.R
cat busco_figure.R | R --vanilla --slave
cd ../../
## 使用SpliceGrapher进行可变剪接分析
mkdir -p /home/train/10.gene_prediction/spliceGrapher
cd /home/train/10.gene_prediction/spliceGrapher
# 准备基因组序列和注释文件
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
perl -pe 's/^\s*$//' ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/Malassezia_sympodialis_V01.bestGeneModels.gff3 > genome.gff3
gff3ToGtf.pl genome.fasta ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/Malassezia_sympodialis_V01.bestGeneModels.gff3 | perl -pe 's/^\s*$//' > genome.gtf
# 输入给SpliceGrapher的sam结果文件不能有softclip信息
cd /home/train/06.reads_aligment/hisat2
hisat2 -x genome -p 4 --min-intronlen 20 --max-intronlen 4000 --rna-strandness RF -1 A.1.fastq -2 A.2.fastq -S /home/train/10.gene_prediction/spliceGrapher/hisat2.sam --no-softclip
cd -
samtools sort -@ 8 -O sam -o 11.sam hisat2.sam; mv 11.sam hisat2.sam
ln -s /home/train/06.reads_aligment/hisat2/A.?.fastq ./
export SG_FASTA_REF=$PWD/genome.fasta
export SG_GENE_MODEL=$PWD/genome.gff3
mkdir 1.create_classifiers
cd 1.create_classifiers
build_classifiers.py -d gt,gc -a ag -l create_classifiers.log
# real 7m41.985s
# user 7m40.815s
# sys 0m2.435s
grep roc *.cfg
mkdir ../2.filter_alignments
cd ../2.filter_alignments
ln -s ../1.create_classifiers/classifiers.zip ./
sam_filter.py ../hisat2.sam classifiers.zip -o filtered.sam -v
# real 1m7.641s
# user 1m6.824s
# sys 0m1.216s
mkdir ../3.predict_graphs
cd ../3.predict_graphs
predict_graphs.py ../2.filter_alignments/filtered.sam -v
# real 0m57.385s
# user 0m56.808s
# sys 0m0.578s
mkdir ../4.realignment_pipeline
cd ../4.realignment_pipeline
realignment_pipeline.py ../3.predict_graphs/ -1 ../A.1.fastq -2 ../A.2.fastq
# real 0m39.302s
# user 0m49.648s
# sys 0m2.183s
mkdir ../5.calculate_counts
cd ../5.calculate_counts
# 得到包含可变剪接信息的gtf文件
ls $PWD/../4.realignment_pipeline/*/*.gff > predict_graphs.lis
generate_putative_sequences.py predict_graphs.lis -A -M splicegraph.gtf
# 注意生成的splicegraph.gtf文件中第一列,其序列ID的首字母可能会由原来的小写变为大写字母,其它字母会由大写变成小写,若发生该情况,则修改还原
perl -p -i -e 's/Ms01contig/MS01Contig/' splicegraph.gtf
# 若某个基因的可变剪接转录本过多,则该基因的信息不会包含在gtf文件中,需要从genome.gtf文件中添加该基因信息。
add_geneModels_to_spliceGrapher_GTF.pl splicegraph.gtf ../genome.gtf > all.gtf
samtools sort -@ 8 -O BAM -o filtered.bam ../2.filter_alignments/filtered.sam
# 使用cufflins进行转录本的表达量计算
cuffquant -o cufflinks -b ../genome.fasta -u -p 4 all.gtf filtered.bam &> cuffquant.log
cuffnorm -o cufflinks -L A,A -p 4 --library-norm-method classic-fpkm all.gtf cufflinks/abundances.cxb cufflinks/abundances.cxb
pick_effctive_isoforms.pl all.gtf cufflinks/isoforms.count_table > splicegraph.filtered.gtf 2> pick_effctive_isoforms.log
mkdir ../6.alternative_splicing_statisctis
cd ../6.alternative_splicing_statisctis
perl -p -e 's/(.*)(exon_number.*?;)(.*)( gene_name.*?;)(.*)/$1$3$5;$4 $2/' ../5.calculate_counts/splicegraph.filtered.gtf > splicegraph.filtered.gtf
para_alternative_splicing_statisctis.pl splicegraph.filtered.gtf 8 > alternative_splicing_statisctis.txt
alternative_splicing_count.pl alternative_splicing_statisctis.txt > alternative_splicing_statisctis.stats
mkdir ../7.create_graphs
cd ../7.create_graphs
samtools index ../5.calculate_counts/filtered.bam
spliceGrapher_create_graphs.pl ../genome.gff3 ../5.calculate_counts/splicegraph.filtered.gtf ../5.calculate_counts/filtered.bam ./ 8
## 原核生物基因预测
#软件安装
## 9. Installing genemarkS (http://exon.gatech.edu/GeneMark/)
#wget http://topaz.gatech.edu/GeneMark/tmp/GMtool_Gtc4c/gms2_linux_64.tar.gz -P ~/software/
tar zxf ~/software/gms2_linux_64.tar.gz -C /opt/biosoft/
gzip -dc /home/train/software/gm_key_64.gms2.gz > ~/.gmhmmp2_key
echo 'PATH=$PATH:/opt/biosoft/gms2_linux_64//' >> ~/.bashrc
source ~/.bashrc
mkdir -p /home/train/10.gene_prediction/genemark_s
cd /home/train/10.gene_prediction/genemark_s
#wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz -P ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/
gzip -dc ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/GCF_000005845.2_ASM584v2_genomic.fna.gz > genome.ecoli.fasta
perl -p -i -e 's/^(>\S+).*/$1/' genome.ecoli.fasta
/opt/biosoft/gms2_linux_64/gms2.pl --seq genome.ecoli.fasta --genome-type bacteria --gcode 11 --format gff --output gms2.gff --fnn genes.fasta --faa proteins.fasta
# --format 参数的值可以有 lst, gff, gtf, gff3
# real 1m19.722s
# user 1m18.800s
# sys 0m0.933s
geneMarkS_gff2gff3.pl gms2.gff Ecoli > gms2.gff3
网友评论