Pan CARD

作者: 胡童远 | 来源:发表于2021-06-01 15:52 被阅读0次

Pan CARD
React-Native 07/24 ~ 08/13
Android使用Card View和Recycler View
Pan!Pan!
deepin显卡设置
视频撕裂及掉帧的原因
RGI注释CARD
卡片封装想法
记录下将查询的结果存入表中的sql
Pan

CARD | 抗生素耐受(基因，产物，表型)数据库

文献：https://card.mcmaster.ca/about

官网：https://card.mcmaster.ca/

数据获取：https://card.mcmaster.ca/download

注释软件：RGI
RGI 5.2.0 版
1 添加了自动发布 docker 镜像的工作流；
2 添加了下载卡数据时证书验证失败的解决方法；
3 更新和清理自述文件；
4 添加了一个新功能来自动将 CARD 数据库加载到 RGI，称为 rgi auto_load；
5 添加了 kma 对齐器作为 rgi bwt 的默认值，因为它比 bowtie2 和 bwa 显示出更好的对齐方式；
6 更新了等位基因级报告（output_prefix.allele_mapping_data.txt）的RGI bwt读取映射结果，包括深度、SNP、共识序列DNA、共识序列蛋白；
7 为 rgi main 添加了对 gz 和 bz2 输入 FASTA 的支持；
8 rgi bwt 现在使用同源、变异、rRNA 基因变异、过表达和敲除模型；

conda下载，安装

# searches rgi package and show available versions
conda search --channel bioconda rgi
# install rgi package
conda install --channel bioconda rgi
# install rgi specific version
conda install --channel bioconda rgi=3.1.1

运行

rgi main -i test.fna \
-o test.out \
-t contig \
--clean

github地址：https://github.com/arpcard/rgi

结果输出信息：

结果整理

1 将所有基因组的ARG注释文件存到一个新路径

for i in `cat total_genome.list`;
do
    cp /route/ARG-genome/${i}.card5.out.txt ./genome_arg/${i}.txt
    echo -e "$i done..."
done

2 抽提 rgo gene drug 信息，构建注释表

## 抽提 rgo gene drug 信息，构建注释表
mkdir genome_arg_gene
for i in `ls genome_arg/`;
do
    cat genome_arg/$i | sed '1d' | awk -F"\t" 'BEGIN{OFS="\t"}{print $9, $11, $15}' >> total_arg_aro_drug.txt
    # 为了rgo drug 对应分类信息
    cat genome_arg/$i | sed '1d' | awk -F"\t" 'BEGIN{OFS="\t"}{print $11}' > genome_arg_gene/$i
    # 只要rgo，简化数据
    echo -e "$i done..."
done

cat total_arg_aro_drug.txt | sort | uniq > total_arg_aro_drug_uniq.txt
# 去重得到最简注释分类表
cat genome_arg_gene/* | sort | uniq > total_arg_gene_uniq.txt
# 去重得到pan rgo

head total_arg_aro_drug_uniq.txt
AAC(3)-IId      3004623 aminoglycoside antibiotic
AAC(6')-Ib-cr6  3005116 fluoroquinolone antibiotic; aminoglycoside antibiotic
AAC(6')-Ie-APH(2'')-Ia  3002597 aminoglycoside antibiotic
AAC(6')-Ii      3002556 aminoglycoside antibiotic
aad(6)  3002628 aminoglycoside antibiotic

head total_arg_gene_uniq.txt
3000027
3000074
3000165
3000166
3000179

3 genome x aro二维表，填充PAV信息

conda activate assembly
# python3 预装 pandas numpy

#!/usr/bin/env python3
import re,sys,os
import pandas as pd
import numpy as np
# 读取文件，去除换行符给新列表
with open("total_arg_gene_uniq.txt", 'r') as list_genes:
    list_genes = list_genes.readlines()
    list_genes_enter = []
    for each in list_genes:
        list_genes_enter.append(each.strip())

with open("total_genome.list", 'r') as list_genomes:
    list_genomes = list_genomes.readlines()
    list_genomes_enter = []
    for each in list_genomes:
        list_genomes_enter.append(each.strip())

# 重复命名不规范
# TypeError: expected str, bytes or os.PathLike object, not list

# 构造数据框
num_row = len(list_genes_enter)
num_col = len(list_genomes_enter)
num_total = num_row * num_col
df = pd.DataFrame(np.zeros(num_total).reshape((num_row, num_col)),
                  columns = list_genomes_enter,
                  index = list_genes_enter)

# 遍历所有基因集，遍历所有行名(基因)是否存在于各基因集(CGR2)，重新给表格赋值
for each_genome in list_genomes_enter:
    target_file = "genome_arg_gene/{}.txt".format(each_genome)
    # 读取基因集
    with open(target_file, 'r') as target:
        target_db = target.readlines()
        for each_gene in list_genes_enter:
            # 判断行名基因是否在基因集，并给表格元素赋值
            # loc: 字符定位表格元素
            # iloc: 数字定位
            if "{}\n".format(each_gene) in target_db:
                df.loc[each_gene, each_genome] = "yes"
            else:
                df.loc[each_gene, each_genome] = "no"
            #print("\033[32m _____ {} DONE!\033[0m".format(each_gene))
    print("\033[32m {} DONE!\033[0m".format(each_genome))

# 表格保存
df.to_csv('total_arg_pav.tsv', sep='\t', index = True)

4 python melt one to many的情况

# /home/hutongyuan/recursion
import sys, re, os
infile = "input.txt"
outfile = "out_2.txt"

# recursion
out = [] # 存储结果
def recurse_print(n, li):
    if n == 2: 
    # 终止条件
        out.append("{}\t{}\n".format(li[0], li[1]))
        # id + 1st drug -> out list
    else:
        out.append("{}\t{}\n".format(li[0], li[n-1]))
        # id + last drug -> out list
        li.pop(n-1) # remove last drug
        recurse_print(n-1, li)  # recursion

with open(infile, 'r') as i:
    for line in i:
        line = line.strip()
        line = re.split(r'\t|; | and ', line)
        length = len(line)
        recurse_print(length, line)

with open(outfile, 'w') as o:
    for line in out:
        o.write(line)

5 两个分类表

6 R数据处理pan genome, core genome, 可视化

setwd("piglet/pan_arg_total")

## read table
df = read.table("data.txt", sep="\t", header=T, row.names=1)
g_gene = read.table("data_gene.txt", sep="\t", header=T)
g_genome = read.table("data_genome.txt", sep="\t", header=T)

## 重构df
df$gene = rownames(df)
df = merge(g_gene, df, by='gene', all.x=T)
rownames(df) = df$gene

# gene genome list
uni_gene = unique(g_gene$g_gene)
uni_genome = unique(g_genome$g_genome)

# gene genome 构造新表
row = length(uni_gene)
col = length(uni_genome)
fill = rep(0, row*col)
tb = data.frame(matrix(fill, row, col))
rownames(tb) = uni_gene
colnames(tb) = uni_genome

# pan genome 分类判定
# core mark 2: 每个样方列和 “全大于1”
# absence mark 0: 每个样方列和 “全等于0”
# dispensable mark 1: 其他

for(i in uni_gene)
{
    for(j in uni_genome)
    {
        # 获取样方坐标
        tmp_genome = g_genome[g_genome$g_genome==j, ]
        tmp_genome = tmp_genome$genome
        # 获取样方
        tmp_df = df[df$g_gene==i, c(tmp_genome)]
        # 统计列和，dim(X)的值必需是正数，是表格才行
        sum = apply(tmp_df, 2, FUN=sum)
        if(all(sum > 0))
        {tb[i, j] = 2}
        else if(all(sum == 0))
        {tb[i, j] = 0}
        else
        {tb[i, j] = 1}
    }
}

library("pheatmap")
input = t(tb)

# write.table(input, file="out_binary.tsv", sep="\t", quote=F)

pheatmap(input, 
 cluster_col = T,
 color = colorRampPalette(colors = c("white", "deepskyblue1", "indianred1"))(3),
 #legend = F,
 fontsize_col = 11,  
 fontsize_row = 13,  
 cellwidth = 16,  
 cellheight = 16, angle_col = 45,
 filename = "pan_arg_many.pdf")