B.A.S.T.A

作者: 胡童远 | 来源:发表于2023-07-13 10:07 被阅读0次

BASTA - BAsic Sequence Taxonomy Annotation

Github: https://github.com/timkahlke/BASTA

安装

source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
#conda create -n basta_py3 python=3
conda activate basta_py3
conda install -c bioconda -c conda-forge leveldb plyvel krona python-wget

下载数据库
参数

usage: basta download [-h] [-d DIRECTORY] [-f FTP]
                      {wgs,prot,est,gss,gb,pdb,uni}
basta download: error: the following arguments are required: type
## help
basta download -h
usage: basta download [-h] [-d DIRECTORY] [-f FTP] {wgs,prot,est,gss,gb,pdb,uni}

Download NCBI taxonomy file(s)

positional arguments:
  {wgs,prot,est,gss,gb,pdb,uni}
                        Type of mapping file to be downloaded (prot, est, wgs, gss or gb)

options:
  -h, --help            show this help message and exit
  -d DIRECTORY, --directory DIRECTORY
                        Directory of mapping files (default: $HOME/.basta/taxonomy)
  -f FTP, --ftp FTP     URL to NCBI ftp for accession mapping (default: ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/)

下载

basta download -d taxonomy/ -f ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/ gb

地址：https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/

建库test

basta create_db -d /hwfssz1/ST_DIVERSITY/PUB/USER/xiangxueyan/database/BASTA/nucl_gb.accession2taxid gb_mapping.db 0 2
basta create_db -d nucl_gb.accession2taxid gb_mapping.db 0 2

1 diamond 比对geneset fasta 到 nr库

source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate run_dbcan

db="/public/home/zzumgg03/huty/databases/nr_huty/nr.20210705.dmnd"

diamond blastx --db $db --query $infile -e 1e-5 --outfmt 6 --more-sensitive --max-target-seqs 1 --threads 30 --quiet --out $outfile

2 basta sequence 注释nr的蛋白id

source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate basta_py3

basta sequence \
uniq_all_gene_fna_diamond/$i \
uniq_all_gene_fna_basta/$i \
prot -e 0.01 -l 10 -m 3 -i 10 -b T -p 60 \
-d /public/home/zzumgg03/huty/databases/basta/

参数

-e EVALUE, --evalue EVALUE
                        maximum evalue of good hit (default=0.00001)
-l ALEN, --alen ALEN  minimum length for good blast alignment (default=100)
-m MINIMUM, --minimum MINIMUM
                        Minimum number of hits a sequence must have to be assigned an LCA. Used for all and majority LCA estimation (default=3)
-i IDENTITY, --identity IDENTITY
-d DIRECTORY, --directory DIRECTORY
                        directory of database files (default: $HOME/.basta/taxonomy)
-p  Percentage of hits that are used for LCA estimation. Must be between 51-100 (default: 100)
-b BEST_HIT, --best_hit BEST_HIT
                        If set the final taxonomy will contain an additional column containing the taxonomy of the best (first) hit with defined taxonomy

3 提取细菌，古菌，真菌，病毒id

cd uniq_all_gene_fna_basta
mkdir 02_out_micro
echo -e "sample\tnum_nr_gene\tnum_micro\tbacteria\tarchaea\tviruses" >> 00_sum_micro
for i in `ls 01_basta`; do
    cat 01_basta/$i | grep -E "Bacteria;|Archaea;|Viruses;|Chytridiomycota;|Microsporidia;|Blastocladiomycota;|Cryptomycota;|Mucoromycota;|Zoopagomycota;|Olpidiomycota;|Ascomycota;|Basidiomycota;" | awk '{print $1}' > 02_out_micro/$i
    num1=`cat 01_basta/$i | wc -l`
    num2=`cat 02_out_micro/$i | wc -l`
    num3=`cat 01_basta/$i | grep "Bacteria;" | wc -l`
    num4=`cat 01_basta/$i | grep "Archaea;" | wc -l`
    num5=`cat 01_basta/$i | grep "Viruses;" | wc -l`
    echo -e "${i}\t${num1}\t${num2}\t${num3}\t${num4}\t${num5}" >> 00_sum_micro
done

bash sc_micro.sh

4 提取细菌，古菌，真菌，病毒id 和注释信息

mkdir 03_out_micro_anno
for i in `ls 01_basta`; do
    cat 01_basta/$i | grep -E "Bacteria;|Archaea;|Viruses;|Chytridiomycota;|Microsporidia;|Blastocladiomycota;|Cryptomycota;|Mucoromycota;|Zoopagomycota;|Olpidiomycota;|Ascomycota;|Basidiomycota;" | awk '{print $0}' > 03_out_micro_anno/$i
    echo -e "$i done..."
done
mkdir 04_anno
cd 04_anno
cat ../03_out_micro_anno/* > 03_out_micro_anno.txt

nohup cat 03_out_micro_anno.txt | grep -E "Bacteria;" | sed 's/Unknown/Bacteria/' > out_Bacteria.txt &
nohup cat 03_out_micro_anno.txt | grep -E "Archaea;" | sed 's/Unknown/Archaea/' > out_Archaea.txt &
nohup cat 03_out_micro_anno.txt | grep -E "Viruses;" | sed 's/Unknown/Viruses/' > out_Viruses.txt &
nohup cat 03_out_micro_anno.txt | grep -E "Chytridiomycota;|Microsporidia;|Blastocladiomycota;|Cryptomycota;|Mucoromycota;|Zoopagomycota;|Olpidiomycota;|Ascomycota;|Basidiomycota;" | sed 's/Unknown/Fungi/' > out_Fungi.txt &

5 从基因集中提取微生物基因

for i in `ls 02_out_micro/`; do
    echo -e "python sc_clean_gene.py 02_out_micro/$i ../uniq_all_gene_fna/$i ../uniq_all_gene_fna_micro/$i"
done

sbatch sc_clean_gene.sh

sc_clean_gene.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
import re, sys, os
# 构建序列名-序列字典
ms, db, infile, outfile = sys.argv

with open(infile) as f:
    Dict = {}
    keys = []
    for line in f:
        if line[0] == ">":
            key = line.strip()
            keys.append(key)
            Dict[key] = []
        else:
            Dict[key].append(line.strip())
# 遍历目标序列名，提取目标序列            
with open(outfile, 'w') as o:
    with open(db) as target:
        for tar in target:
            tar = tar.strip()
            tar = ">{}".format(tar)
            o.write("{}\n{}\n".format(tar, ''.join(Dict[tar])))

网友评论

本文标题：B.A.S.T.A

本文链接：https://www.haomeiwen.com/subject/nsqeartx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

B.A.S.T.A

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读