BPGA结果是这个样子的
打开Sequence是这样的
core_seq.txt是这样的
解析该格式,发现聚类对应信息:
>泛基因组类型/基因cluster编号/物种编号/物种编号_基因编号
序列序列序列序列序列序列序列序列序列序列序列序列序列序列
基因cluster总数是和该物种cluster core gene number一致的,可推断
由此设计基因聚类表基本格式:
rowname/index -> 基因cluster编号
colname -> 物种编号
cell -> 物种编号_基因编号
python3实现
#!/usr/bin/env python3
import re, os, sys
import pandas as pd # pandas 编辑表格??
import numpy as np # numpy 设计初始表
ms, core_file, out = sys.argv
##########
## core ##
##########
#infile = "core_seq.txt"
#out = "cluster_info_core.txt"
org = []
cluster = []
# 读取,拆分fasta标题行,存储物种,存储基因cluster
with open(core_file, 'r') as i:
for line in i:
if ">" in line:
line = line.strip()
line = re.split(r'>|/', line)
#>core/72/3/Org3_Gene164
#['', 'core', '72', '3', 'Org3_Gene164']
org.append(line[3])
cluster.append(line[2])
org = sorted(list(set(org)), key=int)
cluster = sorted(list(set(cluster)), key=int)
# numpy 构造数据框,用 org cluster 行列数
num_row = len(cluster)
num_col = len(org)
num_total = num_row * num_col
df = pd.DataFrame(
np.arange(num_total).reshape((num_row, num_col)),
columns = org,
index = cluster)
# 给table 的每个cell赋上org_gene
with open(core_file, 'r') as i:
for line in i:
if ">" in line:
line = line.strip()
line = re.split(r'>|/', line)
df.loc[line[2], line[3]] = line[4]
# 保存csv
df.to_csv(out, sep='\t', index = True)
运行:
conda activate python3.7
python3 script/cluster_info_core.py core_seq.txt cluster_info_core.txt
结果:
core accessory
accessory/uniq文件也用类似的方法处理
python3
稍有不同的是,uniq gene cluster文件中的物种可能不全(造成表格空缺),因此是最全的core文件物种代谢。
#!/usr/bin/env python3
import re, os, sys
import pandas as pd
import numpy as np
ms, core_file, infile, out = sys.argv
###################
## accessory or uniq ##
###################
#infile = "accessory_seq.txt"
#out = "cluster_info_accessory.txt"
org = []
cluster = []
# get all core org only
with open(core_file, 'r') as i:
for line in i:
if ">" in line:
line = line.strip()
line = re.split(r'>|/', line)
#>core/72/3/Org3_Gene164
#['', 'core', '72', '3', 'Org3_Gene164']
org.append(line[3])
# get all accessory cluster only
with open(infile, 'r') as i:
for line in i:
if ">" in line:
line = line.strip()
line = re.split(r'>|/', line)
#>core/72/3/Org3_Gene164
#['', 'core', '72', '3', 'Org3_Gene164']
cluster.append(line[2])
org = sorted(list(set(org)), key=int)
cluster = sorted(list(set(cluster)), key=int)
# 构造数据框
num_row = len(cluster)
num_col = len(org)
num_total = num_row * num_col
df = pd.DataFrame(
np.arange(num_total).reshape((num_row, num_col)),
columns = org,
index = cluster)
# table 重新赋值
with open(infile, 'r') as i:
for line in i:
if ">" in line:
line = line.strip()
line = re.split(r'>|/', line)
df.loc[line[2], line[3]] = line[4]
df.to_csv(out, sep='\t', index = True)
运行:
python3 script/cluster_info_acc_or_uniq.py core_seq.txt accessory_seq.txt cluster_info_accessory.txt
python3 script/cluster_info_acc_or_uniq.py core_seq.txt unique_seq.txt cluster_info_unique.txt
结果:
数字空白转NA
numpy构造的初始表使用数字填充的,accessory uniq部分会出现未被重新赋值的数字部分其实是NA
#!/usr/bin/env python3
import pandas as pd
import re, sys, os
ms, infile, out = sys.argv
df = pd.read_csv(infile, index_col = 0, header = 0, sep = "\t")
for i in range(len(df.index)):
for j in range(len(df.columns)):
if "Gene" not in str(df.iloc[i, j]):
df.iloc[i, j] = "NA"
df.to_csv(out, sep='\t', index = True)
运行:
mkdir merge
python3 script/cluster_num2na.py \
cluster_info_unique.txt merge/cluster_info_unique_na.txt
python3 script/cluster_num2na.py \
cluster_info_accessory.txt merge/cluster_info_accessory_na.txt
accessory
uniq
网友评论