马兜铃基因组文章 Aristolochia fimbriata genome 采用了很多策略进行建树,比如只提取第 1 和第 2 个密码子,还给出了提取脚本。
参考:https://github.com/yihenghu/Aristolochia_fimbriata_genome_analysis
因为每个人对中间文件的命名方式不同,所以对脚本做了一点点改变,可以在命令行输入前后缀。
$ ls
OG00001.fa OG00002.fa OG00003.fa
$ python3 get_concatenated_codon.py OG fa
$ ls
OG00001.fa OG00001.fa.12 OG00001.fa.3 OG00002.fa OG00002.fa.12 OG00002.fa.3 OG00003.fa OG00003.fa.12 OG00003.fa.3
$ cat OG00001.fa*
>gene1
ATGATGATGATGATGATGATGATGATGATG
>gene2
ATCATCATCATCATCATCATCATCATCATC
>gene3
ATAATAATAATAATAATAATAATAATAATA
>gene1
ATATATATATATATATATAT
>gene2
ATATATATATATATATATAT
>gene3
ATATATATATATATATATAT
>gene1
GGGGGGGGGG
>gene2
CCCCCCCCCC
>gene3
AAAAAAAAAA
import sys
import glob
import re
from Bio import SeqIO
fa_file = '*.' + str(sys.argv[2])
fa_file1 = str(sys.argv[1]) + '(\d+).' + str(sys.argv[2])
fa_file2 = str(sys.argv[1]) + '%s.' + str(sys.argv[2])
out_file12 = fa_file2 + '.12'
out_file3 = fa_file2 + '.3'
def rm_3(seq):
new = ''
for index, i in enumerate(str(seq)):
if (index + 1) % 3 != 0:
new += i
return new
def save_3(seq):
new = ''
for index, i in enumerate(str(seq)):
if (index + 1) % 3 == 0:
new += i
return new
fileID_list = []
for file in glob.glob(fa_file):
fileID = re.search(
fa_file1,
file).group(1)
fileID_list.append(fileID)
# Coalescent-based tree estimated from concatenated alignments of single gene first codon and second codon positions
for fileID in sorted(fileID_list):
ID2seq = {}
seqRs = SeqIO.parse(fa_file2 % fileID, 'fasta')
for seqR in seqRs:
ID2seq[seqR.id] = rm_3(seqR.seq)
with open(out_file12 % fileID, 'w') as fw:
for ID, seq in ID2seq.items():
fw.write('>' + ID + '\n' + seq + '\n')
# Coalescent-based tree estimated from concatenated alignments of single gene third codon positions
for fileID in sorted(fileID_list):
ID2seq = {}
seqRs = SeqIO.parse(fa_file2 % fileID, 'fasta')
for seqR in seqRs:
ID2seq[seqR.id] = save_3(seqR.seq)
with open(out_file3 % fileID, 'w') as fw:
for ID, seq in ID2seq.items():
fw.write('>' + ID + '\n' + seq + '\n')
网友评论