Pan CARD

作者: 胡童远 | 来源:发表于2021-06-01 15:52 被阅读0次

    CARD | 抗生素耐受(基因,产物,表型)数据库

    文献:https://card.mcmaster.ca/about

    官网https://card.mcmaster.ca/

    数据获取https://card.mcmaster.ca/download

    注释软件:RGI
    RGI 5.2.0 版
    1 添加了自动发布 docker 镜像的工作流;
    2 添加了下载卡数据时证书验证失败的解决方法;
    3 更新和清理自述文件;
    4 添加了一个新功能来自动将 CARD 数据库加载到 RGI,称为 rgi auto_load;
    5 添加了 kma 对齐器作为 rgi bwt 的默认值,因为它比 bowtie2 和 bwa 显示出更好的对齐方式;
    6 更新了等位基因级报告(output_prefix.allele_mapping_data.txt)的RGI bwt读取映射结果,包括深度、SNP、共识序列DNA、共识序列蛋白;
    7 为 rgi main 添加了对 gz 和 bz2 输入 FASTA 的支持;
    8 rgi bwt 现在使用同源、变异、rRNA 基因变异、过表达和敲除模型;

    conda下载,安装

    # searches rgi package and show available versions
    conda search --channel bioconda rgi
    # install rgi package
    conda install --channel bioconda rgi
    # install rgi specific version
    conda install --channel bioconda rgi=3.1.1 
    

    运行

    rgi main -i test.fna \
    -o test.out \
    -t contig \
    --clean
    

    github地址:https://github.com/arpcard/rgi

    结果输出信息:


    结果整理

    1 将所有基因组的ARG注释文件存到一个新路径

    for i in `cat total_genome.list`;
    do
        cp /route/ARG-genome/${i}.card5.out.txt ./genome_arg/${i}.txt
        echo -e "$i done..."
    done
    

    2 抽提 rgo gene drug 信息,构建注释表

    ## 抽提 rgo gene drug 信息,构建注释表
    mkdir genome_arg_gene
    for i in `ls genome_arg/`;
    do
        cat genome_arg/$i | sed '1d' | awk -F"\t" 'BEGIN{OFS="\t"}{print $9, $11, $15}' >> total_arg_aro_drug.txt
        # 为了rgo drug 对应分类信息
        cat genome_arg/$i | sed '1d' | awk -F"\t" 'BEGIN{OFS="\t"}{print $11}' > genome_arg_gene/$i
        # 只要rgo,简化数据
        echo -e "$i done..."
    done
    
    cat total_arg_aro_drug.txt | sort | uniq > total_arg_aro_drug_uniq.txt
    # 去重得到最简注释分类表
    cat genome_arg_gene/* | sort | uniq > total_arg_gene_uniq.txt
    # 去重得到pan rgo
    
    head total_arg_aro_drug_uniq.txt
    AAC(3)-IId      3004623 aminoglycoside antibiotic
    AAC(6')-Ib-cr6  3005116 fluoroquinolone antibiotic; aminoglycoside antibiotic
    AAC(6')-Ie-APH(2'')-Ia  3002597 aminoglycoside antibiotic
    AAC(6')-Ii      3002556 aminoglycoside antibiotic
    aad(6)  3002628 aminoglycoside antibiotic
    
    head total_arg_gene_uniq.txt
    3000027
    3000074
    3000165
    3000166
    3000179
    

    3 genome x aro二维表,填充PAV信息

    conda activate assembly
    # python3 预装 pandas numpy
    
    #!/usr/bin/env python3
    import re,sys,os
    import pandas as pd
    import numpy as np
    # 读取文件,去除换行符给新列表
    with open("total_arg_gene_uniq.txt", 'r') as list_genes:
        list_genes = list_genes.readlines()
        list_genes_enter = []
        for each in list_genes:
            list_genes_enter.append(each.strip())
    
    with open("total_genome.list", 'r') as list_genomes:
        list_genomes = list_genomes.readlines()
        list_genomes_enter = []
        for each in list_genomes:
            list_genomes_enter.append(each.strip())
    
    # 重复命名不规范
    # TypeError: expected str, bytes or os.PathLike object, not list
    
    # 构造数据框
    num_row = len(list_genes_enter)
    num_col = len(list_genomes_enter)
    num_total = num_row * num_col
    df = pd.DataFrame(np.zeros(num_total).reshape((num_row, num_col)),
                      columns = list_genomes_enter,
                      index = list_genes_enter)
    
    # 遍历所有基因集,遍历所有行名(基因)是否存在于各基因集(CGR2),重新给表格赋值
    for each_genome in list_genomes_enter:
        target_file = "genome_arg_gene/{}.txt".format(each_genome)
        # 读取基因集
        with open(target_file, 'r') as target:
            target_db = target.readlines()
            for each_gene in list_genes_enter:
                # 判断行名基因是否在基因集,并给表格元素赋值
                # loc: 字符定位表格元素
                # iloc: 数字定位
                if "{}\n".format(each_gene) in target_db:
                    df.loc[each_gene, each_genome] = "yes"
                else:
                    df.loc[each_gene, each_genome] = "no"
                #print("\033[32m _____ {} DONE!\033[0m".format(each_gene))
        print("\033[32m {} DONE!\033[0m".format(each_genome))
    
    # 表格保存
    df.to_csv('total_arg_pav.tsv', sep='\t', index = True)
    

    4 python melt one to many的情况


    # /home/hutongyuan/recursion
    import sys, re, os
    infile = "input.txt"
    outfile = "out_2.txt"
    
    # recursion
    out = [] # 存储结果
    def recurse_print(n, li):
        if n == 2: 
        # 终止条件
            out.append("{}\t{}\n".format(li[0], li[1]))
            # id + 1st drug -> out list
        else:
            out.append("{}\t{}\n".format(li[0], li[n-1]))
            # id + last drug -> out list
            li.pop(n-1) # remove last drug
            recurse_print(n-1, li)  # recursion
    
    with open(infile, 'r') as i:
        for line in i:
            line = line.strip()
            line = re.split(r'\t|; | and ', line)
            length = len(line)
            recurse_print(length, line)
    
    with open(outfile, 'w') as o:
        for line in out:
            o.write(line)
    

    5 两个分类表

    6 R数据处理pan genome, core genome, 可视化

    setwd("piglet/pan_arg_total")
    
    ## read table
    df = read.table("data.txt", sep="\t", header=T, row.names=1)
    g_gene = read.table("data_gene.txt", sep="\t", header=T)
    g_genome = read.table("data_genome.txt", sep="\t", header=T)
    
    ## 重构df
    df$gene = rownames(df)
    df = merge(g_gene, df, by='gene', all.x=T)
    rownames(df) = df$gene
    
    # gene genome list
    uni_gene = unique(g_gene$g_gene)
    uni_genome = unique(g_genome$g_genome)
    
    # gene genome 构造新表
    row = length(uni_gene)
    col = length(uni_genome)
    fill = rep(0, row*col)
    tb = data.frame(matrix(fill, row, col))
    rownames(tb) = uni_gene
    colnames(tb) = uni_genome
    
    # pan genome 分类判定
    # core mark 2: 每个样方列和 “全大于1”
    # absence mark 0: 每个样方列和 “全等于0”
    # dispensable mark 1: 其他
    
    for(i in uni_gene)
    {
        for(j in uni_genome)
        {
            # 获取样方坐标
            tmp_genome = g_genome[g_genome$g_genome==j, ]
            tmp_genome = tmp_genome$genome
            # 获取样方
            tmp_df = df[df$g_gene==i, c(tmp_genome)]
            # 统计列和,dim(X)的值必需是正数,是表格才行
            sum = apply(tmp_df, 2, FUN=sum)
            if(all(sum > 0))
            {tb[i, j] = 2}
            else if(all(sum == 0))
            {tb[i, j] = 0}
            else
            {tb[i, j] = 1}
        }
    }
    
    library("pheatmap")
    input = t(tb)
    
    # write.table(input, file="out_binary.tsv", sep="\t", quote=F)
    
    pheatmap(input, 
     cluster_col = T,
     color = colorRampPalette(colors = c("white", "deepskyblue1", "indianred1"))(3),
     #legend = F,
     fontsize_col = 11,  
     fontsize_row = 13,  
     cellwidth = 16,  
     cellheight = 16, angle_col = 45,
     filename = "pan_arg_many.pdf")
    

    相关文章

      网友评论

        本文标题:Pan CARD

        本文链接:https://www.haomeiwen.com/subject/uombsltx.html