B.A.S.T.A

作者: 胡童远 | 来源:发表于2023-07-13 10:07 被阅读0次

    BASTA - BAsic Sequence Taxonomy Annotation

    Github: https://github.com/timkahlke/BASTA

    安装

    source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
    #conda create -n basta_py3 python=3
    conda activate basta_py3
    conda install -c bioconda -c conda-forge leveldb plyvel krona python-wget
    

    下载数据库
    参数

    usage: basta download [-h] [-d DIRECTORY] [-f FTP]
                          {wgs,prot,est,gss,gb,pdb,uni}
    basta download: error: the following arguments are required: type
    ## help
    basta download -h
    usage: basta download [-h] [-d DIRECTORY] [-f FTP] {wgs,prot,est,gss,gb,pdb,uni}
    
    Download NCBI taxonomy file(s)
    
    positional arguments:
      {wgs,prot,est,gss,gb,pdb,uni}
                            Type of mapping file to be downloaded (prot, est, wgs, gss or gb)
    
    options:
      -h, --help            show this help message and exit
      -d DIRECTORY, --directory DIRECTORY
                            Directory of mapping files (default: $HOME/.basta/taxonomy)
      -f FTP, --ftp FTP     URL to NCBI ftp for accession mapping (default: ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/)
    

    下载

    basta download -d taxonomy/ -f ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/ gb
    

    地址:https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/

    建库test

    basta create_db -d /hwfssz1/ST_DIVERSITY/PUB/USER/xiangxueyan/database/BASTA/nucl_gb.accession2taxid gb_mapping.db 0 2
    basta create_db -d nucl_gb.accession2taxid gb_mapping.db 0 2
    

    1 diamond 比对geneset fasta 到 nr库

    source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
    conda activate run_dbcan
    
    db="/public/home/zzumgg03/huty/databases/nr_huty/nr.20210705.dmnd"
    
    diamond blastx --db $db --query $infile -e 1e-5 --outfmt 6 --more-sensitive --max-target-seqs 1 --threads 30 --quiet --out $outfile
    

    2 basta sequence 注释nr的蛋白id

    source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
    conda activate basta_py3
    
    basta sequence \
    uniq_all_gene_fna_diamond/$i \
    uniq_all_gene_fna_basta/$i \
    prot -e 0.01 -l 10 -m 3 -i 10 -b T -p 60 \
    -d /public/home/zzumgg03/huty/databases/basta/
    

    参数

    -e EVALUE, --evalue EVALUE
                            maximum evalue of good hit (default=0.00001)
    -l ALEN, --alen ALEN  minimum length for good blast alignment (default=100)
    -m MINIMUM, --minimum MINIMUM
                            Minimum number of hits a sequence must have to be assigned an LCA. Used for all and majority LCA estimation (default=3)
    -i IDENTITY, --identity IDENTITY
    -d DIRECTORY, --directory DIRECTORY
                            directory of database files (default: $HOME/.basta/taxonomy)
    -p  Percentage of hits that are used for LCA estimation. Must be between 51-100 (default: 100)
    -b BEST_HIT, --best_hit BEST_HIT
                            If set the final taxonomy will contain an additional column containing the taxonomy of the best (first) hit with defined taxonomy
    

    3 提取细菌,古菌,真菌,病毒id

    cd uniq_all_gene_fna_basta
    mkdir 02_out_micro
    echo -e "sample\tnum_nr_gene\tnum_micro\tbacteria\tarchaea\tviruses" >> 00_sum_micro
    for i in `ls 01_basta`; do
        cat 01_basta/$i | grep -E "Bacteria;|Archaea;|Viruses;|Chytridiomycota;|Microsporidia;|Blastocladiomycota;|Cryptomycota;|Mucoromycota;|Zoopagomycota;|Olpidiomycota;|Ascomycota;|Basidiomycota;" | awk '{print $1}' > 02_out_micro/$i
        num1=`cat 01_basta/$i | wc -l`
        num2=`cat 02_out_micro/$i | wc -l`
        num3=`cat 01_basta/$i | grep "Bacteria;" | wc -l`
        num4=`cat 01_basta/$i | grep "Archaea;" | wc -l`
        num5=`cat 01_basta/$i | grep "Viruses;" | wc -l`
        echo -e "${i}\t${num1}\t${num2}\t${num3}\t${num4}\t${num5}" >> 00_sum_micro
    done
    
    bash sc_micro.sh
    

    4 提取细菌,古菌,真菌,病毒id 和注释信息

    mkdir 03_out_micro_anno
    for i in `ls 01_basta`; do
        cat 01_basta/$i | grep -E "Bacteria;|Archaea;|Viruses;|Chytridiomycota;|Microsporidia;|Blastocladiomycota;|Cryptomycota;|Mucoromycota;|Zoopagomycota;|Olpidiomycota;|Ascomycota;|Basidiomycota;" | awk '{print $0}' > 03_out_micro_anno/$i
        echo -e "$i done..."
    done
    mkdir 04_anno
    cd 04_anno
    cat ../03_out_micro_anno/* > 03_out_micro_anno.txt
    
    nohup cat 03_out_micro_anno.txt | grep -E "Bacteria;" | sed 's/Unknown/Bacteria/' > out_Bacteria.txt &
    nohup cat 03_out_micro_anno.txt | grep -E "Archaea;" | sed 's/Unknown/Archaea/' > out_Archaea.txt &
    nohup cat 03_out_micro_anno.txt | grep -E "Viruses;" | sed 's/Unknown/Viruses/' > out_Viruses.txt &
    nohup cat 03_out_micro_anno.txt | grep -E "Chytridiomycota;|Microsporidia;|Blastocladiomycota;|Cryptomycota;|Mucoromycota;|Zoopagomycota;|Olpidiomycota;|Ascomycota;|Basidiomycota;" | sed 's/Unknown/Fungi/' > out_Fungi.txt &
    

    5 从基因集中提取微生物基因

    for i in `ls 02_out_micro/`; do
        echo -e "python sc_clean_gene.py 02_out_micro/$i ../uniq_all_gene_fna/$i ../uniq_all_gene_fna_micro/$i"
    done
    
    sbatch sc_clean_gene.sh
    

    sc_clean_gene.py

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import re, sys, os
    # 构建序列名-序列字典
    ms, db, infile, outfile = sys.argv
    
    with open(infile) as f:
        Dict = {}
        keys = []
        for line in f:
            if line[0] == ">":
                key = line.strip()
                keys.append(key)
                Dict[key] = []
            else:
                Dict[key].append(line.strip())
    # 遍历目标序列名,提取目标序列            
    with open(outfile, 'w') as o:
        with open(db) as target:
            for tar in target:
                tar = tar.strip()
                tar = ">{}".format(tar)
                o.write("{}\n{}\n".format(tar, ''.join(Dict[tar])))
    

    相关文章

      网友评论

          本文标题:B.A.S.T.A

          本文链接:https://www.haomeiwen.com/subject/nsqeartx.html