一、准备文件:
1、YNC.xlsx(运行脚本前需另存为制表符分隔的文本文件 YNC.txt)
“Modifications in Master Proteins” 一列包含以下几种情况:
需提取蛋白对应的位点(位点必须以数字结尾)
- Q8N6H7 1xPhospho [S](位点不以数字结尾,去除不要)
- P12270 1xPhospho [T2116]
- Q9HCD6 2xPhospho [S1824; S1827]
- P49792 1xPhospho [S1894]; Q99666 1xPhospho [S918]
- P49792 2xPhospho [T2450; S2454]; Q99666 2xPhospho [T1474; S1478]
- Q96FQ6 1xAcetyl [N-Term]; 1xPhospho [S2]
二、结果
输出文件:YNC_phos_out.txt
在原始文件 YNC.xlsx 的各列后面追加两列:pro、res
pro | res |
---|---|
P12270 | T2116 |
Q14676 | S453 |
Q6PD62 | S941 |
三、脚本
#! /usr/bin/env python
# _*_ coding: utf-8 _*_
# Format to extract for Modifications in Master Proteins
# eg:python3 pho_split.py YNC.txt
# output:YNC_phos_out.txt
__email__ = ".com.cn"
import re
import sys
# Header text of the column that holds the modification strings.
MODIFICATIONS_HEADER = "Modifications in Master Proteins"
# Column index used when the header does not contain MODIFICATIONS_HEADER.
DEFAULT_MODIFICATIONS_COLUMN = 4
# The result is always written to this file in the current directory.
OUTPUT_PATH = "YNC_phos_out.txt"

# One modification group: the text in front of "[" (an optional accession
# followed by a token such as "2xPhospho") and the bracketed site list.
GROUP_RE = re.compile(r"([^\[\]]*)\[([^\]]*)\]")
# A modification token ("1xPhospho", "2xAcetyl", ...) starts with "<count>x".
MOD_TOKEN_RE = re.compile(r"[0-9]+x")
# Only sites that end in a digit (e.g. "S1894") are kept; "S" and "N-Term"
# are dropped.
SITE_RE = re.compile(r"[0-9]\Z")


def extract_sites(modifications):
    """Return the (accession, site) pairs found in one modification string.

    The string is a "; "-separated list of groups shaped like
    ``ACCESSION NxName [site; site]``. A group without an accession
    (``1xPhospho [S2]`` following ``Q96FQ6 1xAcetyl [N-Term]``) belongs to
    the most recent accession seen in the same string. Sites that do not end
    in a digit are skipped in every group.

    Args:
        modifications: Content of one "Modifications in Master Proteins" cell.

    Returns:
        A list of ``(accession, site)`` tuples in order of appearance; empty
        when the cell holds no site ending in a digit.
    """
    pairs = []
    accession = None
    for group in GROUP_RE.finditer(modifications):
        prefix, bracket = group.groups()
        # The prefix of a follow-up group starts with the "; " separator.
        tokens = prefix.replace(";", " ").split()
        if tokens and not MOD_TOKEN_RE.match(tokens[0]):
            accession = tokens[0]
        if accession is None:
            # No protein to attribute the sites to.
            continue
        for site in bracket.split(";"):
            site = site.strip()
            if SITE_RE.search(site):
                pairs.append((accession, site))
    return pairs


def find_modifications_column(header):
    """Return the index of the modifications column in a tab-separated header.

    Falls back to DEFAULT_MODIFICATIONS_COLUMN when no column is named
    MODIFICATIONS_HEADER.
    """
    names = [name.strip() for name in header.split("\t")]
    try:
        return names.index(MODIFICATIONS_HEADER)
    except ValueError:
        return DEFAULT_MODIFICATIONS_COLUMN


def main(argv=None):
    """Append "pro" and "res" columns to a tab-separated table.

    Reads the file named by the single command-line argument and writes
    OUTPUT_PATH. Each input row is repeated once per extracted site; rows
    without any site ending in a digit produce no output line.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 2:
        print("usage: python pho_split.py YNC.txt")
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    # "with" closes both files even if a row fails to parse.
    with open(args[1], 'r') as src, open(OUTPUT_PATH, 'w') as out:
        first = next(src, None)
        if first is None:
            # Empty input: nothing to copy, leave an empty output file.
            return
        # Strip only the line ending: removing tabs would shift the columns
        # of rows whose first or last cells are empty.
        header = first.rstrip("\r\n")
        out.write("{0}\t{1}\t{2}\n".format(header, "pro", "res"))
        column = find_modifications_column(header)

        for raw in src:
            row = raw.rstrip("\r\n")
            cells = row.split("\t")
            if len(cells) <= column:
                # Blank or truncated row: no modifications cell to parse.
                continue
            for pro, res in extract_sites(cells[column]):
                out.write("{0}\t{1}\t{2}\n".format(row, pro, res))


if __name__ == "__main__":
    main()
网友评论