CARD | 抗生素耐受(基因,产物,表型)数据库
文献:https://card.mcmaster.ca/about
数据获取:https://card.mcmaster.ca/download
注释软件:RGI
RGI 5.2.0 版
1 添加了自动发布 docker 镜像的工作流;
2 添加了下载卡数据时证书验证失败的解决方法;
3 更新和清理自述文件;
4 添加了一个新功能来自动将 CARD 数据库加载到 RGI,称为 rgi auto_load;
5 添加了 kma 对齐器作为 rgi bwt 的默认值,因为它比 bowtie2 和 bwa 显示出更好的对齐方式;
6 更新了等位基因级报告(output_prefix.allele_mapping_data.txt)的RGI bwt读取映射结果,包括深度、SNP、共识序列DNA、共识序列蛋白;
7 为 rgi main 添加了对 gz 和 bz2 输入 FASTA 的支持;
8 rgi bwt 现在使用同源、变异、rRNA 基因变异、过表达和敲除模型;
conda下载,安装
# searches rgi package and show available versions
conda search --channel bioconda rgi
# install rgi package
conda install --channel bioconda rgi
# install rgi specific version
conda install --channel bioconda rgi=3.1.1
运行
rgi main -i test.fna \
-o test.out \
-t contig \
--clean
github地址:https://github.com/arpcard/rgi
结果输出信息:
结果整理
1 将所有基因组的ARG注释文件存到一个新路径
for i in `cat total_genome.list`;
do
cp /route/ARG-genome/${i}.card5.out.txt ./genome_arg/${i}.txt
echo -e "$i done..."
done
2 抽提 rgo gene drug 信息,构建注释表
## 抽提 rgo gene drug 信息,构建注释表
mkdir genome_arg_gene
for i in `ls genome_arg/`;
do
cat genome_arg/$i | sed '1d' | awk -F"\t" 'BEGIN{OFS="\t"}{print $9, $11, $15}' >> total_arg_aro_drug.txt
# 为了rgo drug 对应分类信息
cat genome_arg/$i | sed '1d' | awk -F"\t" 'BEGIN{OFS="\t"}{print $11}' > genome_arg_gene/$i
# 只要rgo,简化数据
echo -e "$i done..."
done
cat total_arg_aro_drug.txt | sort | uniq > total_arg_aro_drug_uniq.txt
# 去重得到最简注释分类表
cat genome_arg_gene/* | sort | uniq > total_arg_gene_uniq.txt
# 去重得到pan rgo
head total_arg_aro_drug_uniq.txt
AAC(3)-IId 3004623 aminoglycoside antibiotic
AAC(6')-Ib-cr6 3005116 fluoroquinolone antibiotic; aminoglycoside antibiotic
AAC(6')-Ie-APH(2'')-Ia 3002597 aminoglycoside antibiotic
AAC(6')-Ii 3002556 aminoglycoside antibiotic
aad(6) 3002628 aminoglycoside antibiotic
head total_arg_gene_uniq.txt
3000027
3000074
3000165
3000166
3000179
3 genome x aro二维表,填充PAV信息
conda activate assembly
# python3 预装 pandas numpy
#!/usr/bin/env python3
import re,sys,os
import pandas as pd
import numpy as np
# 读取文件,去除换行符给新列表
with open("total_arg_gene_uniq.txt", 'r') as list_genes:
list_genes = list_genes.readlines()
list_genes_enter = []
for each in list_genes:
list_genes_enter.append(each.strip())
with open("total_genome.list", 'r') as list_genomes:
list_genomes = list_genomes.readlines()
list_genomes_enter = []
for each in list_genomes:
list_genomes_enter.append(each.strip())
# 重复命名不规范
# TypeError: expected str, bytes or os.PathLike object, not list
# 构造数据框
num_row = len(list_genes_enter)
num_col = len(list_genomes_enter)
num_total = num_row * num_col
df = pd.DataFrame(np.zeros(num_total).reshape((num_row, num_col)),
columns = list_genomes_enter,
index = list_genes_enter)
# 遍历所有基因集,遍历所有行名(基因)是否存在于各基因集(CGR2),重新给表格赋值
for each_genome in list_genomes_enter:
target_file = "genome_arg_gene/{}.txt".format(each_genome)
# 读取基因集
with open(target_file, 'r') as target:
target_db = target.readlines()
for each_gene in list_genes_enter:
# 判断行名基因是否在基因集,并给表格元素赋值
# loc: 字符定位表格元素
# iloc: 数字定位
if "{}\n".format(each_gene) in target_db:
df.loc[each_gene, each_genome] = "yes"
else:
df.loc[each_gene, each_genome] = "no"
#print("\033[32m _____ {} DONE!\033[0m".format(each_gene))
print("\033[32m {} DONE!\033[0m".format(each_genome))
# 表格保存
df.to_csv('total_arg_pav.tsv', sep='\t', index = True)
4 python melt one to many的情况
# /home/hutongyuan/recursion
import sys, re, os
infile = "input.txt"
outfile = "out_2.txt"
# recursion
out = [] # 存储结果
def recurse_print(n, li):
if n == 2:
# 终止条件
out.append("{}\t{}\n".format(li[0], li[1]))
# id + 1st drug -> out list
else:
out.append("{}\t{}\n".format(li[0], li[n-1]))
# id + last drug -> out list
li.pop(n-1) # remove last drug
recurse_print(n-1, li) # recursion
with open(infile, 'r') as i:
for line in i:
line = line.strip()
line = re.split(r'\t|; | and ', line)
length = len(line)
recurse_print(length, line)
with open(outfile, 'w') as o:
for line in out:
o.write(line)
5 两个分类表
6 R数据处理pan genome, core genome, 可视化
setwd("piglet/pan_arg_total")
## read table
df = read.table("data.txt", sep="\t", header=T, row.names=1)
g_gene = read.table("data_gene.txt", sep="\t", header=T)
g_genome = read.table("data_genome.txt", sep="\t", header=T)
## 重构df
df$gene = rownames(df)
df = merge(g_gene, df, by='gene', all.x=T)
rownames(df) = df$gene
# gene genome list
uni_gene = unique(g_gene$g_gene)
uni_genome = unique(g_genome$g_genome)
# gene genome 构造新表
row = length(uni_gene)
col = length(uni_genome)
fill = rep(0, row*col)
tb = data.frame(matrix(fill, row, col))
rownames(tb) = uni_gene
colnames(tb) = uni_genome
# pan genome 分类判定
# core mark 2: 每个样方列和 “全大于1”
# absence mark 0: 每个样方列和 “全等于0”
# dispensable mark 1: 其他
for(i in uni_gene)
{
for(j in uni_genome)
{
# 获取样方坐标
tmp_genome = g_genome[g_genome$g_genome==j, ]
tmp_genome = tmp_genome$genome
# 获取样方
tmp_df = df[df$g_gene==i, c(tmp_genome)]
# 统计列和,dim(X)的值必需是正数,是表格才行
sum = apply(tmp_df, 2, FUN=sum)
if(all(sum > 0))
{tb[i, j] = 2}
else if(all(sum == 0))
{tb[i, j] = 0}
else
{tb[i, j] = 1}
}
}
library("pheatmap")
input = t(tb)
# write.table(input, file="out_binary.tsv", sep="\t", quote=F)
pheatmap(input,
cluster_col = T,
color = colorRampPalette(colors = c("white", "deepskyblue1", "indianred1"))(3),
#legend = F,
fontsize_col = 11,
fontsize_row = 13,
cellwidth = 16,
cellheight = 16, angle_col = 45,
filename = "pan_arg_many.pdf")
网友评论