BASTA - BAsic Sequence Taxonomy Annotation
Github: https://github.com/timkahlke/BASTA
安装
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
#conda create -n basta_py3 python=3
conda activate basta_py3
conda install -c bioconda -c conda-forge leveldb plyvel krona python-wget
下载数据库
参数
usage: basta download [-h] [-d DIRECTORY] [-f FTP]
{wgs,prot,est,gss,gb,pdb,uni}
basta download: error: the following arguments are required: type
## help
basta download -h
usage: basta download [-h] [-d DIRECTORY] [-f FTP] {wgs,prot,est,gss,gb,pdb,uni}
Download NCBI taxonomy file(s)
positional arguments:
{wgs,prot,est,gss,gb,pdb,uni}
Type of mapping file to be downloaded (prot, est, wgs, gss or gb)
options:
-h, --help show this help message and exit
-d DIRECTORY, --directory DIRECTORY
Directory of mapping files (default: $HOME/.basta/taxonomy)
-f FTP, --ftp FTP URL to NCBI ftp for accession mapping (default: ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/)
下载
basta download -d taxonomy/ -f ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/ gb
地址:https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/
建库test
basta create_db -d /hwfssz1/ST_DIVERSITY/PUB/USER/xiangxueyan/database/BASTA/nucl_gb.accession2taxid gb_mapping.db 0 2
basta create_db -d nucl_gb.accession2taxid gb_mapping.db 0 2
1 diamond 比对geneset fasta 到 nr库
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate run_dbcan
db="/public/home/zzumgg03/huty/databases/nr_huty/nr.20210705.dmnd"
diamond blastx --db $db --query $infile -e 1e-5 --outfmt 6 --more-sensitive --max-target-seqs 1 --threads 30 --quiet --out $outfile
2 basta sequence 注释nr的蛋白id
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate basta_py3
basta sequence \
uniq_all_gene_fna_diamond/$i \
uniq_all_gene_fna_basta/$i \
prot -e 0.01 -l 10 -m 3 -i 10 -b T -p 60 \
-d /public/home/zzumgg03/huty/databases/basta/
参数
-e EVALUE, --evalue EVALUE
maximum evalue of good hit (default=0.00001)
-l ALEN, --alen ALEN minimum length for good blast alignment (default=100)
-m MINIMUM, --minimum MINIMUM
Minimum number of hits a sequence must have to be assigned an LCA. Used for all and majority LCA estimation (default=3)
-i IDENTITY, --identity IDENTITY
-d DIRECTORY, --directory DIRECTORY
directory of database files (default: $HOME/.basta/taxonomy)
-p Percentage of hits that are used for LCA estimation. Must be between 51-100 (default: 100)
-b BEST_HIT, --best_hit BEST_HIT
If set the final taxonomy will contain an additional column containing the taxonomy of the best (first) hit with defined taxonomy
3 提取细菌,古菌,真菌,病毒id
cd uniq_all_gene_fna_basta
mkdir 02_out_micro
echo -e "sample\tnum_nr_gene\tnum_micro\tbacteria\tarchaea\tviruses" >> 00_sum_micro
for i in `ls 01_basta`; do
cat 01_basta/$i | grep -E "Bacteria;|Archaea;|Viruses;|Chytridiomycota;|Microsporidia;|Blastocladiomycota;|Cryptomycota;|Mucoromycota;|Zoopagomycota;|Olpidiomycota;|Ascomycota;|Basidiomycota;" | awk '{print $1}' > 02_out_micro/$i
num1=`cat 01_basta/$i | wc -l`
num2=`cat 02_out_micro/$i | wc -l`
num3=`cat 01_basta/$i | grep "Bacteria;" | wc -l`
num4=`cat 01_basta/$i | grep "Archaea;" | wc -l`
num5=`cat 01_basta/$i | grep "Viruses;" | wc -l`
echo -e "${i}\t${num1}\t${num2}\t${num3}\t${num4}\t${num5}" >> 00_sum_micro
done
bash sc_micro.sh
4 提取细菌,古菌,真菌,病毒id 和注释信息
mkdir 03_out_micro_anno
for i in `ls 01_basta`; do
cat 01_basta/$i | grep -E "Bacteria;|Archaea;|Viruses;|Chytridiomycota;|Microsporidia;|Blastocladiomycota;|Cryptomycota;|Mucoromycota;|Zoopagomycota;|Olpidiomycota;|Ascomycota;|Basidiomycota;" | awk '{print $0}' > 03_out_micro_anno/$i
echo -e "$i done..."
done
mkdir 04_anno
cd 04_anno
cat ../03_out_micro_anno/* > 03_out_micro_anno.txt
nohup cat 03_out_micro_anno.txt | grep -E "Bacteria;" | sed 's/Unknown/Bacteria/' > out_Bacteria.txt &
nohup cat 03_out_micro_anno.txt | grep -E "Archaea;" | sed 's/Unknown/Archaea/' > out_Archaea.txt &
nohup cat 03_out_micro_anno.txt | grep -E "Viruses;" | sed 's/Unknown/Viruses/' > out_Viruses.txt &
nohup cat 03_out_micro_anno.txt | grep -E "Chytridiomycota;|Microsporidia;|Blastocladiomycota;|Cryptomycota;|Mucoromycota;|Zoopagomycota;|Olpidiomycota;|Ascomycota;|Basidiomycota;" | sed 's/Unknown/Fungi/' > out_Fungi.txt &
5 从基因集中提取微生物基因
for i in `ls 02_out_micro/`; do
echo -e "python sc_clean_gene.py 02_out_micro/$i ../uniq_all_gene_fna/$i ../uniq_all_gene_fna_micro/$i"
done
sbatch sc_clean_gene.sh
sc_clean_gene.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re, sys, os
# 构建序列名-序列字典
ms, db, infile, outfile = sys.argv
with open(infile) as f:
Dict = {}
keys = []
for line in f:
if line[0] == ">":
key = line.strip()
keys.append(key)
Dict[key] = []
else:
Dict[key].append(line.strip())
# 遍历目标序列名,提取目标序列
with open(outfile, 'w') as o:
with open(db) as target:
for tar in target:
tar = tar.strip()
tar = ">{}".format(tar)
o.write("{}\n{}\n".format(tar, ''.join(Dict[tar])))
网友评论