从orthomcl结果中挑选出单拷贝基因，并建立一个单拷贝文件

python pick_single_gene.py groups.txt

pick_single_gene.py

# 2020.0425
# @zlk
# 从orthomcl中挑选出单拷贝基因

import sys

file1=open(sys.argv[1],'r')
file2=open(sys.argv[1]+'_single','w')
species_num=int(sys.argv[2])

for line in file1:
    line_list=line.strip().split(' ')[1:]

    if len(line_list)==species_num:
        single_dict={}
        flag=0
        for i in line_list:
            i_list=i.split('|')
            if i_list[0] in single_dict.keys():
                flag=1
                break
            else:
                single_dict[i_list[0]] = i_list[1]
        if flag==1:
            continue
        else:
            for key in sorted(single_dict):
                file2.write(single_dict[key]+'\t')
            file2.write('\n')

每一列为一个物种（groups.txt_single）

1.png

三个参数分别为单拷贝文件、所有物种的蛋白文件和物种名文件（每一行一个物种的名）

python bulit_ML_tree.py groups.txt_single all_pep.fa ./species.txt

species.txt

2.png

bulit_ML_tree.py

# -*- coding=utf-8 -*-
# 2020.0425
# @zlk
# 构建距离树
import sys
import os

#需要安装三个软件，将路径放进脚本
# conda 都可以安装
mafft = '/public/zhanglk/anaconda3/envs/orthofinder2/bin/mafft'
raxml = '/public/zhanglk/anaconda3/envs/orthofinder2/bin/raxmlHPC-PTHREADS'
trimal = '/public/zhanglk/anaconda3/envs/orthofinder2/bin/trimal'


# 每行是一组需要建树的的基因ID，各物种用制表符隔开

def get_pep_dict(file2):
    pep_dict = {}
    for line in file2:
        if line.startswith('>'):
            name = line.strip()[1:]
            pep_dict[name] = ''
        else:
            pep_dict[name] += line
    return (pep_dict)


def get_orth_gene(file1,pep_dict):
    total_gene_list = []
    for line in file1:
        line_list = line.strip().split('\t')
        total_gene_list.append(line_list)
    os.mkdir("SingleGene")
    os.chdir("SingleGene")
    index = 0
    for gene_list in total_gene_list:
        index += 1
        write_file = open('zlk' + str(index) + '.fa', 'w')
        for i in gene_list:
            write_file.write('>' + i + '\n' + pep_dict[i])
        write_file.close()
        
def cal_msa(fold):
    file_list = os.listdir(fold)
    for fa_file in file_list:
        seq_aln_file = fa_file + "_aln.fa"
        seq_aln_trimed_file = fa_file + "_trimed.fa"  
        cmd1 = mafft + ' ' + fa_file + ' >' + seq_aln_file
        os.system(cmd1)
        cmd2 = trimal + ' -in ' + seq_aln_file + ' -out ' + seq_aln_trimed_file + ' -automated1'
        os.system(cmd2)


def combine_seq(fold,species_list):
    file_list = os.listdir(fold)
    for trim_file in file_list:
        if trim_file.endswith('_trimed.fa'):
            opfile=open(trim_file,'r')
            index=-1
            for line1 in opfile:
                if line1.startswith('>'):
                    index+=1
                else:
                    species_list[index][1]+=line1.strip()
            opfile.close()
    out_file=open('combine.fa','w')
    for fa in species_list:
        out_file.write('>%s\n%s\n'%(fa[0],fa[1]))
        
def bulit_tree(thread,nb,model):
    cmd = raxml + ' -T ' + thread + ' -f a -N ' + nb + ' -m ' + model + ' -x 123456 -p 123456 -s combine.fa -n concatenation_out.nwk'
    os.system(cmd)

# 单拷贝基因文件
file1 = open(sys.argv[1], 'r')
#所有蛋白文件
file2 = open(sys.argv[2], 'r')
# 物种名文件
file3 = open(sys.argv[3], 'r')
species_list=[]
for line in file3:
    species_list.append([line.strip(),''])
use_dict=get_pep_dict(file2)
get_orth_gene(file1,use_dict)
cal_msa('./')
# os.chdir("SingleGene")
combine_seq('./',species_list)
# 40线程, bootstrap 100 ，model PROTGAMMAJTT
bulit_tree('40','100',"PROTGAMMAJTT")