美文网首页python编程
python:获取BPGA基因聚类表

python:获取BPGA基因聚类表

作者: 胡童远 | 来源:发表于2021-07-09 09:52 被阅读0次

    BPGA结果是这个样子的

    打开Sequence是这样的

    core_seq.txt是这样的


    解析该格式,发现聚类对应信息:
    >泛基因组类型/基因cluster编号/物种编号/物种编号_基因编号
    序列序列序列序列序列序列序列序列序列序列序列序列序列序列

    基因cluster总数是和该物种cluster core gene number一致的,可推断

    由此设计基因聚类表基本格式:
    rowname/index -> 基因cluster编号
    colname -> 物种编号
    cell -> 物种编号_基因编号

    python3实现

    #!/usr/bin/env python3
    import re, os, sys
    import pandas as pd  # pandas 编辑表格??
    import numpy as np  # numpy 设计初始表
    
    ms, core_file, out = sys.argv
    
    ##########
    ## core ##
    ##########
    #infile = "core_seq.txt"
    #out = "cluster_info_core.txt"
    
    org = []
    cluster = []
    
    # 读取,拆分fasta标题行,存储物种,存储基因cluster
    with open(core_file, 'r') as i:
        for line in i:
            if ">" in line:
                line = line.strip()
                line = re.split(r'>|/', line)
                #>core/72/3/Org3_Gene164
                #['', 'core', '72', '3', 'Org3_Gene164']
                org.append(line[3])
                cluster.append(line[2])
    
    org = sorted(list(set(org)), key=int)
    cluster = sorted(list(set(cluster)), key=int)
    
    # numpy 构造数据框,用 org cluster 行列数
    num_row = len(cluster)
    num_col = len(org)
    num_total = num_row * num_col
    df = pd.DataFrame(
        np.arange(num_total).reshape((num_row, num_col)),
        columns = org,
        index = cluster)
    
    # 给table 的每个cell赋上org_gene
    with open(core_file, 'r') as i:
        for line in i:
            if ">" in line:
                line = line.strip()
                line = re.split(r'>|/', line)
                df.loc[line[2], line[3]] = line[4]
    
    # 保存csv
    df.to_csv(out, sep='\t', index = True)
    

    运行:

    conda activate python3.7
    python3 script/cluster_info_core.py core_seq.txt cluster_info_core.txt
    

    结果:


    core accessory

    accessory/uniq文件也用类似的方法处理
    python3
    稍有不同的是,uniq gene cluster文件中的物种可能不全(造成表格空缺),因此是最全的core文件物种代谢。

    #!/usr/bin/env python3
    import re, os, sys
    import pandas as pd
    import numpy as np
    
    ms, core_file, infile, out = sys.argv
    
    ###################
    ## accessory or uniq ##
    ###################
    #infile = "accessory_seq.txt"
    #out = "cluster_info_accessory.txt"
    
    org = []
    cluster = []
    
    # get all core org only
    with open(core_file, 'r') as i:
        for line in i:
            if ">" in line:
                line = line.strip()
                line = re.split(r'>|/', line)
                #>core/72/3/Org3_Gene164
                #['', 'core', '72', '3', 'Org3_Gene164']
                org.append(line[3])
    
    # get all accessory cluster only
    with open(infile, 'r') as i:
        for line in i:
            if ">" in line:
                line = line.strip()
                line = re.split(r'>|/', line)
                #>core/72/3/Org3_Gene164
                #['', 'core', '72', '3', 'Org3_Gene164']
                cluster.append(line[2])
    
    org = sorted(list(set(org)), key=int)
    cluster = sorted(list(set(cluster)), key=int)
    
    # 构造数据框
    num_row = len(cluster)
    num_col = len(org)
    num_total = num_row * num_col
    df = pd.DataFrame(
        np.arange(num_total).reshape((num_row, num_col)),
        columns = org,
        index = cluster)
    
    # table 重新赋值
    with open(infile, 'r') as i:
        for line in i:
            if ">" in line:
                line = line.strip()
                line = re.split(r'>|/', line)
                df.loc[line[2], line[3]] = line[4]
    
    df.to_csv(out, sep='\t', index = True)
    

    运行:

    python3 script/cluster_info_acc_or_uniq.py core_seq.txt accessory_seq.txt cluster_info_accessory.txt
    
    python3 script/cluster_info_acc_or_uniq.py core_seq.txt unique_seq.txt cluster_info_unique.txt
    

    结果:

    数字空白转NA

    numpy构造的初始表使用数字填充的,accessory uniq部分会出现未被重新赋值的数字部分其实是NA

    #!/usr/bin/env python3
    import pandas as pd
    import re, sys, os
    ms, infile, out = sys.argv
    
    df = pd.read_csv(infile, index_col = 0, header = 0, sep = "\t")
    for i in range(len(df.index)):
        for j in range(len(df.columns)):
            if "Gene" not in str(df.iloc[i, j]):
                df.iloc[i, j] = "NA"
    
    df.to_csv(out, sep='\t', index = True)
    

    运行:

    mkdir merge
    python3 script/cluster_num2na.py \
    cluster_info_unique.txt merge/cluster_info_unique_na.txt
    python3 script/cluster_num2na.py \
    cluster_info_accessory.txt merge/cluster_info_accessory_na.txt
    
    accessory uniq

    相关文章

      网友评论

        本文标题:python:获取BPGA基因聚类表

        本文链接:https://www.haomeiwen.com/subject/eeaxpltx.html