老早以前写的:
师弟有一批测序返回序列,需要从起始密码子开始翻译成蛋白质序列后逐条输出。
分析:
(1)将多个序列.txt文件整合成一个.fasta文件;
(2)保留每条序列中从第一个“ATG”(起始密码子)开始的序列;
(3)将每条序列翻译成对应的氨基酸序列;
(4)将翻译后的氨基酸序列切割成多个文件保存。
代码实现:
# (1) Merge every sequence file found in ./yuliao into one FASTA file.
# -*- coding: utf-8 -*-
import os

# Source folder: the "yuliao" sub-directory of the current working directory.
# os.path.join replaces the original '\yuliao' literal, which relied on the
# invalid "\y" escape sequence and only produced a usable path on Windows.
filedir = os.path.join(os.getcwd(), 'yuliao')

# os.listdir returns entries in arbitrary order; sorting keeps the record
# order of the merged file identical from run to run.
filenames = sorted(os.listdir(filedir))

# Create (or overwrite) the merged file in the current working directory.
with open('fasta合并.fasta', 'w') as f:
    for filename in filenames:
        filepath = os.path.join(filedir, filename)
        if not os.path.isfile(filepath):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        with open(filepath) as source:
            for line in source:
                # Strip surrounding whitespace and normalise the line ending,
                # so a file without a trailing newline cannot run into the
                # first line of the next file.
                f.write(line.strip() + "\n")
# (2) For every sequence keep only the part that starts at its first "ATG".
#
# Reads "fasta合并.fasta" and writes "fasta对齐.fasta", both in the current
# working directory. Every non-header line is treated as one complete
# sequence and is written as its own record under the most recent header.
import re
import sys

pattern = r"ATG"
# Compiled once outside the loop; the match is case-sensitive.
start_codon = re.compile(pattern)

with open(r"fasta对齐.fasta", "w") as output_file:
    with open(r"fasta合并.fasta", "r") as input_file:
        # The first line of the file is taken as the first header, as-is.
        header = input_file.readline().strip()
        for line in input_file:
            line = line.strip()
            if not line:
                # Blank lines carry no data; indexing line[0] on them would
                # raise IndexError.
                continue
            if line.startswith(">"):
                header = line
                continue

            seq = line
            match = start_codon.search(seq)
            if match is None:
                # A read without a start codon cannot be aligned. Report it
                # and move on instead of aborting the whole batch with an
                # AttributeError on None.
                print("no ATG found, record skipped: " + header,
                      file=sys.stderr)
                continue

            # Slice from the first "ATG" (inclusive) to the end of the read.
            new_seq = seq[match.start():]
            output_file.write(header + "\n" + new_seq + "\n")
# (4) Split the translated protein FASTA into one "<header>.txt" file per
# record, written to the current working directory.
def _write_record(header, seq_parts):
    """Write one FASTA record to "<header>.txt".

    Nothing is written when both the header and the sequence are empty, so
    an empty input file no longer produces a stray ".txt" file.
    """
    if not header and not seq_parts:
        return
    with open(header + ".txt", 'w') as file:
        file.write(">" + header + "\n" + "".join(seq_parts) + "\n")


# NOTE: the input path is a hard-coded absolute Windows path; adjust it to
# wherever the translated sequences were saved.
with open(r"C:\Users\admin\Desktop\all_proteins.fasta", "r") as input_file:
    # The first line is taken as the first header; [1:] drops its leading ">".
    header = input_file.readline().strip()[1:]
    seq_parts = []
    for line in input_file:
        line = line.strip()
        if not line:
            # Blank lines carry no data; indexing line[0] on them would raise
            # IndexError.
            continue
        if line.startswith(">"):
            # A new header closes the record collected so far.
            _write_record(header, seq_parts)
            header = line[1:]
            seq_parts = []
        else:
            # Sequence lines of one record are collected and joined on write.
            seq_parts.append(line)

# The last record has no following header to trigger its write.
_write_record(header, seq_parts)
网友评论