从orthomcl结果中挑选出单拷贝基因,并建立一个单拷贝文件
python pick_single_gene.py groups.txt
pick_single_gene.py
# 2020.0425
# @zlk
# 从orthomcl中挑选出单拷贝基因
import sys
file1=open(sys.argv[1],'r')
file2=open(sys.argv[1]+'_single','w')
species_num=int(sys.argv[2])
for line in file1:
line_list=line.strip().split(' ')[1:]
if len(line_list)==species_num:
single_dict={}
flag=0
for i in line_list:
i_list=i.split('|')
if i_list[0] in single_dict.keys():
flag=1
break
else:
single_dict[i_list[0]] = i_list[1]
if flag==1:
continue
else:
for key in sorted(single_dict):
file2.write(single_dict[key]+'\t')
file2.write('\n')
每一列为一个物种(groups.txt_single)
1.png
三个参数分别为单拷贝文件、所有物种的蛋白文件和物种名文件(每一行一个物种的名)
python bulit_ML_tree.py groups.txt_single all_pep.fa ./species.txt
species.txt
2.png
bulit_ML_tree.py
# -*- coding=utf-8 -*-
# 2020.0425
# @zlk
# 构建距离树
import sys
import os
#需要安装三个软件,将路径放进脚本
# conda 都可以安装
mafft = '/public/zhanglk/anaconda3/envs/orthofinder2/bin/mafft'
raxml = '/public/zhanglk/anaconda3/envs/orthofinder2/bin/raxmlHPC-PTHREADS'
trimal = '/public/zhanglk/anaconda3/envs/orthofinder2/bin/trimal'
# 每行是一组需要建树的的基因ID,各物种用制表符隔开
def get_pep_dict(file2):
pep_dict = {}
for line in file2:
if line.startswith('>'):
name = line.strip()[1:]
pep_dict[name] = ''
else:
pep_dict[name] += line
return (pep_dict)
def get_orth_gene(file1,pep_dict):
total_gene_list = []
for line in file1:
line_list = line.strip().split('\t')
total_gene_list.append(line_list)
os.mkdir("SingleGene")
os.chdir("SingleGene")
index = 0
for gene_list in total_gene_list:
index += 1
write_file = open('zlk' + str(index) + '.fa', 'w')
for i in gene_list:
write_file.write('>' + i + '\n' + pep_dict[i])
write_file.close()
def cal_msa(fold):
file_list = os.listdir(fold)
for fa_file in file_list:
seq_aln_file = fa_file + "_aln.fa"
seq_aln_trimed_file = fa_file + "_trimed.fa"
cmd1 = mafft + ' ' + fa_file + ' >' + seq_aln_file
os.system(cmd1)
cmd2 = trimal + ' -in ' + seq_aln_file + ' -out ' + seq_aln_trimed_file + ' -automated1'
os.system(cmd2)
def combine_seq(fold,species_list):
file_list = os.listdir(fold)
for trim_file in file_list:
if trim_file.endswith('_trimed.fa'):
opfile=open(trim_file,'r')
index=-1
for line1 in opfile:
if line1.startswith('>'):
index+=1
else:
species_list[index][1]+=line1.strip()
opfile.close()
out_file=open('combine.fa','w')
for fa in species_list:
out_file.write('>%s\n%s\n'%(fa[0],fa[1]))
def bulit_tree(thread,nb,model):
cmd = raxml + ' -T ' + thread + ' -f a -N ' + nb + ' -m ' + model + ' -x 123456 -p 123456 -s combine.fa -n concatenation_out.nwk'
os.system(cmd)
# 单拷贝基因文件
file1 = open(sys.argv[1], 'r')
#所有蛋白文件
file2 = open(sys.argv[2], 'r')
# 物种名文件
file3 = open(sys.argv[3], 'r')
species_list=[]
for line in file3:
species_list.append([line.strip(),''])
use_dict=get_pep_dict(file2)
get_orth_gene(file1,use_dict)
cal_msa('./')
# os.chdir("SingleGene")
combine_seq('./',species_list)
# 40线程, bootstrap 100 ,model PROTGAMMAJTT
bulit_tree('40','100',"PROTGAMMAJTT")
RAxML_bipartitionsBranchLabels.concatenation_out.nwk为最终树文件
FigTree可视化结果
3.png
网友评论