BPGA结果是这个样子的
data:image/s3,"s3://crabby-images/7ca86/7ca86242bf0ccbf86e1b14cbb0d87fcff73a3567" alt=""
打开Sequence是这样的
data:image/s3,"s3://crabby-images/6492d/6492d5ad3095d946cf861c4f71fb518b10e391bb" alt=""
core_seq.txt是这样的
data:image/s3,"s3://crabby-images/72ddb/72ddbe1ac77114911933e1356474739046856247" alt=""
解析该格式,发现聚类对应信息:
>泛基因组类型/基因cluster编号/物种编号/物种编号_基因编号
序列序列序列序列序列序列序列序列序列序列序列序列序列序列
基因cluster总数是和该物种cluster core gene number一致的,可推断
由此设计基因聚类表基本格式:
rowname/index -> 基因cluster编号
colname -> 物种编号
cell -> 物种编号_基因编号
python3实现
#!/usr/bin/env python3
import re, os, sys
import pandas as pd # pandas 编辑表格??
import numpy as np # numpy 设计初始表
ms, core_file, out = sys.argv
##########
## core ##
##########
#infile = "core_seq.txt"
#out = "cluster_info_core.txt"
org = []
cluster = []
# 读取,拆分fasta标题行,存储物种,存储基因cluster
with open(core_file, 'r') as i:
for line in i:
if ">" in line:
line = line.strip()
line = re.split(r'>|/', line)
#>core/72/3/Org3_Gene164
#['', 'core', '72', '3', 'Org3_Gene164']
org.append(line[3])
cluster.append(line[2])
org = sorted(list(set(org)), key=int)
cluster = sorted(list(set(cluster)), key=int)
# numpy 构造数据框,用 org cluster 行列数
num_row = len(cluster)
num_col = len(org)
num_total = num_row * num_col
df = pd.DataFrame(
np.arange(num_total).reshape((num_row, num_col)),
columns = org,
index = cluster)
# 给table 的每个cell赋上org_gene
with open(core_file, 'r') as i:
for line in i:
if ">" in line:
line = line.strip()
line = re.split(r'>|/', line)
df.loc[line[2], line[3]] = line[4]
# 保存csv
df.to_csv(out, sep='\t', index = True)
运行:
conda activate python3.7
python3 script/cluster_info_core.py core_seq.txt cluster_info_core.txt
结果:
data:image/s3,"s3://crabby-images/17403/17403fd91bd7e5e951df029d6fc7f1c2aedca150" alt=""
data:image/s3,"s3://crabby-images/491f8/491f8b0186c04e291c1603b98eff147a49f3d8a6" alt=""
accessory/uniq文件也用类似的方法处理
python3
稍有不同的是,uniq gene cluster文件中的物种可能不全(造成表格空缺),因此是最全的core文件物种代谢。
#!/usr/bin/env python3
import re, os, sys
import pandas as pd
import numpy as np
ms, core_file, infile, out = sys.argv
###################
## accessory or uniq ##
###################
#infile = "accessory_seq.txt"
#out = "cluster_info_accessory.txt"
org = []
cluster = []
# get all core org only
with open(core_file, 'r') as i:
for line in i:
if ">" in line:
line = line.strip()
line = re.split(r'>|/', line)
#>core/72/3/Org3_Gene164
#['', 'core', '72', '3', 'Org3_Gene164']
org.append(line[3])
# get all accessory cluster only
with open(infile, 'r') as i:
for line in i:
if ">" in line:
line = line.strip()
line = re.split(r'>|/', line)
#>core/72/3/Org3_Gene164
#['', 'core', '72', '3', 'Org3_Gene164']
cluster.append(line[2])
org = sorted(list(set(org)), key=int)
cluster = sorted(list(set(cluster)), key=int)
# 构造数据框
num_row = len(cluster)
num_col = len(org)
num_total = num_row * num_col
df = pd.DataFrame(
np.arange(num_total).reshape((num_row, num_col)),
columns = org,
index = cluster)
# table 重新赋值
with open(infile, 'r') as i:
for line in i:
if ">" in line:
line = line.strip()
line = re.split(r'>|/', line)
df.loc[line[2], line[3]] = line[4]
df.to_csv(out, sep='\t', index = True)
运行:
python3 script/cluster_info_acc_or_uniq.py core_seq.txt accessory_seq.txt cluster_info_accessory.txt
python3 script/cluster_info_acc_or_uniq.py core_seq.txt unique_seq.txt cluster_info_unique.txt
结果:
数字空白转NA
numpy构造的初始表使用数字填充的,accessory uniq部分会出现未被重新赋值的数字部分其实是NA
#!/usr/bin/env python3
import pandas as pd
import re, sys, os
ms, infile, out = sys.argv
df = pd.read_csv(infile, index_col = 0, header = 0, sep = "\t")
for i in range(len(df.index)):
for j in range(len(df.columns)):
if "Gene" not in str(df.iloc[i, j]):
df.iloc[i, j] = "NA"
df.to_csv(out, sep='\t', index = True)
运行:
mkdir merge
python3 script/cluster_num2na.py \
cluster_info_unique.txt merge/cluster_info_unique_na.txt
python3 script/cluster_num2na.py \
cluster_info_accessory.txt merge/cluster_info_accessory_na.txt
data:image/s3,"s3://crabby-images/8ae2e/8ae2ee031d4c8c835778615538bf2812c14ace9e" alt=""
data:image/s3,"s3://crabby-images/ea374/ea374cc2abb4482b6432b0815d55f783fb58307c" alt=""
网友评论