1. Crawling the PDB_ID list & data preprocessing
import re
import os
import json
import time
import socket
import threading
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from multiprocessing import Pool
from urllib.request import urlopen
socket.setdefaulttimeout(60)
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
url_base = 'http://huanglab.phys.hust.edu.cn/pepbdb/browse.php?pagenum='
url_detail = 'http://huanglab.phys.hust.edu.cn'
pattern = re.compile(r'href=\".*\" ')
PDB_ID = []
# for page in tqdm(range(1, 267)):
#     html = urlopen(url_base + str(page), timeout=30).read().decode('utf-8')
#     soup = BeautifulSoup(html, features='lxml')          # parse with the lxml backend
#     target = soup.find_all('td', {"class": "tablink"})   # cells holding the PDB ID links
#     for sample in target:
#         PDB_ID.append(pattern.findall(str(sample))[0][6:-2])
# PDB_ID = PDB_ID[:-1]
# with open(r'data/PDB_url.txt', 'w') as PDB:
#     for i in PDB_ID:
#         PDB.write(i + '\n')
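As a quick sanity check, the slice `[6:-2]` strips the leading `href="` and the trailing `" ` from each regex match. A minimal sketch on an illustrative table cell (the path below is hypothetical, not taken from the site):
sample = '<td class="tablink"><a href="/pepbdb/db/1a0m_A/" >1a0m_A</a></td>'  # hypothetical cell
print(pattern.findall(sample)[0][6:-2])  # -> /pepbdb/db/1a0m_A/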
with open('data/PDB_url.txt', 'r') as PDB:
    for line in PDB.readlines():  # read the cached ID list back in
        PDB_ID.append(line.strip())
Interacting_peptide_residues, Interacting_receptor_residues, Peptide_sequence, Receptor_sequence, Name = [], [], [], [], []
lock = threading.Lock()  # keep the five result lists row-aligned across threads
def craw_scipy(x, y):
    for pdb in tqdm(PDB_ID[x:y]):
        url_final = url_detail + pdb
        html = urlopen(url_final, timeout=10).read().decode('utf-8')
        soup = BeautifulSoup(html, features='lxml')  # parse with the lxml backend
        tds = soup.find_all('td')                    # parse the detail table once
        with lock:  # append all five fields together so rows stay aligned
            Name.append(re.compile(r'...._.').findall(str(tds[1]))[0])
            Interacting_peptide_residues.append(tds[6].text)
            Interacting_receptor_residues.append(tds[7].text)
            Peptide_sequence.append(tds[8].text)
            Receptor_sequence.append(tds[9].text)
threads = []
for i in range(14):  # 14 slices of 1000 cover all 13299 IDs (the original started at 1000 and skipped 0-999)
    start, end = i * 1000, min((i + 1) * 1000, len(PDB_ID))
    threads.append(threading.Thread(name='t' + str(i), target=craw_scipy, args=(start, end)))
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for every slice to finish before building the DataFrame
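Note that `multiprocessing.Pool` is imported above but never used. A variant that returns one row per page, instead of mutating shared lists, sidesteps the alignment problem entirely; a sketch using `concurrent.futures` (not part of the original code):
from concurrent.futures import ThreadPoolExecutor

def craw_one(pdb):  # fetch one detail page and return its fields as a row
    html = urlopen(url_detail + pdb, timeout=10).read().decode('utf-8')
    tds = BeautifulSoup(html, features='lxml').find_all('td')
    name = re.compile(r'...._.').findall(str(tds[1]))[0]
    return name, tds[6].text, tds[7].text, tds[8].text, tds[9].text

# with ThreadPoolExecutor(max_workers=13) as ex:
#     rows = list(ex.map(craw_one, PDB_ID))  # row order matches PDB_ID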
2. A total of 13,299 PDB_IDs were crawled
BDB_dict = {
    'Name': Name,
    'Interacting_peptide_residues': Interacting_peptide_residues,
    'Interacting_receptor_residues': Interacting_receptor_residues,
    'Peptide_sequence': Peptide_sequence,
    'Receptor_sequence': Receptor_sequence
}
df = pd.DataFrame(BDB_dict)
df = df[~df['Peptide_sequence'].str.contains('X')]   # drop peptides containing the unknown residue 'X'
df = df[~df['Receptor_sequence'].str.contains('X')]  # drop receptors containing the unknown residue 'X'
df = df[df['Receptor_sequence'].str.count('>') <= 1] # keep receptors with a single sequence (at most one '>' header)
df['Interacting_peptide_residues'] = df['Interacting_peptide_residues'].str[2:-2]  # strip leading/trailing characters left by scraping
df['Interacting_receptor_residues'] = df['Interacting_receptor_residues'].str[2:-2]
df['Peptide_sequence'] = df['Peptide_sequence'].str[2:]
df['Receptor_sequence'] = df['Receptor_sequence'].str[2:]
df = df[df.Name != '6mk1_Z']  # manually exclude the entry 6mk1_Z
df = df.reset_index(drop = True)
df.to_csv('data/data.csv')
After removing non-standard amino acids and dropping multi-sequence receptors, 6,444 entries remain.
vocab_list = []
for i in df.Receptor_sequence:
    vocab_list += [j for j in i]
vocab = {}
vocab['pad'] = 0
tokens = list(set(vocab_list))  # build the token set once (assumed to also cover all peptide residues)
for i in range(len(tokens)):
    vocab[tokens[i]] = i + 1
vocab['DOCK'] = 22
with open('data/vocab.json', 'w') as f:
    json.dump(vocab, f)
print('vocab: ' + str(vocab))
vocab: {'pad': 0, 'L': 1, 'Q': 2, 'N': 3, 'V': 4, 'H': 5, 'A': 6, 'I': 7, 'M': 8, 'D': 9, 'C': 10, 'W': 11, 'E': 12, 'Y': 13, 'T': 14, 'P': 15, 'K': 16, 'G': 17, 'F': 18, 'U': 19, 'S': 20, 'R': 21, 'DOCK': 22}
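With the vocabulary saved, any sequence can be encoded to integer ids and right-padded with `vocab['pad']`; a minimal sketch (the default length 49 matches the peptide maximum found below):
def encode(seq, vocab, max_len=49):  # map residues to ids and right-pad with 0
    ids = [vocab[token] for token in seq]
    return ids + [vocab['pad']] * (max_len - len(ids))

print(encode('GAV', vocab)[:5])  # -> [17, 6, 4, 0, 0] under the vocab printed above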
data_count = []
for Peptide_sequence in df.Peptide_sequence:
    tmp = [0] * 21  # per-sequence residue counts over the 21 residue tokens ('pad'/'DOCK' excluded)
    for token in Peptide_sequence:
        tmp[vocab[token] - 1] += 1
    data_count.append(tmp)
for Receptor_sequence in df.Receptor_sequence:
    tmp = [0] * 21
    for token in Receptor_sequence:
        tmp[vocab[token] - 1] += 1
    data_count.append(tmp)
heat_matrix = [[0 for i in range(21)] for j in range(21)]  # residue co-occurrence counts ("heat_martix" typo fixed)
for data in tqdm(data_count):
    for source in range(len(data)):
        for target in range(len(data)):
            if data[source] > 0 and data[target] > 0:
                heat_matrix[source][target] += 1
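The triple loop above counts, for every residue pair, the number of sequences in which both residues appear at least once. It is equivalent to a single matrix product over the binary presence matrix; a vectorized sketch:
presence = (np.array(data_count) > 0).astype(int)  # (n_sequences, 21) presence indicators
heat_matrix_np = presence.T @ presence             # pairwise co-occurrence counts
assert (heat_matrix_np == np.array(heat_matrix)).all()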
3. Plotting the feature heat map
mask = np.zeros_like(heat_matrix)
mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
heatmap = pd.DataFrame(heat_matrix)
heatmap.index = pd.Series(list(vocab.keys())[1:-1])    # the 21 residue labels ('pad' and 'DOCK' excluded)
heatmap.columns = pd.Series(list(vocab.keys())[1:-1])
plt.figure(dpi=100, figsize=(6, 5))
with sns.axes_style("white"):
    ax = sns.heatmap(heatmap, linewidths=.6, mask=mask, square=True, cmap="YlGnBu")
plt.show()

Maximum Peptide_sequence length: 49
Maximum Receptor_sequence length: 1569
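For reference, the two maxima quoted above can be reproduced directly from the DataFrame:
print('Maximum Peptide_sequence length:', max(len(s) for s in df.Peptide_sequence))
print('Maximum Receptor_sequence length:', max(len(s) for s in df.Receptor_sequence))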
len_Receptor_sequence = [len(i) for i in df.Receptor_sequence]
plt.figure(figsize=(15,5))
plt.hist(len_Receptor_sequence, bins=40, density=False, facecolor="tab:blue", edgecolor="tab:orange", alpha=0.7)
plt.show()

The plan is to ignore receptors longer than 600 residues (<3% of the data).
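The cutoff itself is not applied in the code shown; a minimal sketch of that filter would be:
df = df[df.Receptor_sequence.str.len() <= 600]  # hypothetical cutoff, matching the 600-residue plan above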
df = df.reset_index(drop = True)
file_list = []
for filename in os.listdir(r'C:\Users\DELL\Desktop\BDB_seq_lab\data\pepbdb'):
    file_list.append(filename)
for i in df.Name:
    if i not in file_list:
        print('Receptor {} is not in the download list'.format(i))
miss_count = []
for i in range(len(df)):
    fname = './data/pepbdb/{}/peptide.pdb'.format(str(df.Name[i]))
    with open(fname, 'r') as f:  # open the peptide PDB file
        first_line = f.readline()
    need_minus = 0
    try:
        head = first_line.split()[5]  # residue sequence number of the first atom
        if int(head) != 1:
            need_minus = int(head) - 1  # offset so residue numbering starts at 1
        residues = [int(k) - need_minus for k in df.Interacting_peptide_residues[i].replace(' ', '').split(',')]
        df.loc[i, 'Interacting_peptide_residues'] = ' '.join(str(r) for r in residues)
    except (IndexError, ValueError):
        miss_count.append(i)
df.drop(miss_count, inplace=True)
df = df.reset_index(drop = True)
miss_count = []
for i in range(len(df)):
    fname = './data/pepbdb/{}/receptor.pdb'.format(str(df.Name[i]))
    with open(fname, 'r') as f:  # open the receptor PDB file
        first_line = f.readline()
    need_minus = 0
    try:
        head = first_line.split()[5]  # residue sequence number of the first atom
        if int(head) != 1:
            need_minus = int(head) - 1  # offset so residue numbering starts at 1
        residues = [int(k) - need_minus for k in df.Interacting_receptor_residues[i].replace(' ', '').split(',')]
        df.loc[i, 'Interacting_receptor_residues'] = ' '.join(str(r) for r in residues)
    except (IndexError, ValueError):
        miss_count.append(i)
df.drop(miss_count, inplace=True)
df = df.reset_index(drop = True)
df.to_csv('data/process_data.csv')
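To illustrate the renumbering above: if a PDB file's first atom sits on residue 3, an interaction list '5, 7' becomes '3 5'. A self-contained check on a hypothetical ATOM line:
first_line = 'ATOM      1  N   GLY A   3      11.104   6.134  -6.504  1.00  0.00'  # hypothetical line
head = first_line.split()[5]       # residue sequence number of the first atom -> '3'
need_minus = int(head) - 1         # shift so numbering starts at 1
print([k - need_minus for k in (5, 7)])  # -> [3, 5]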
The protein docking sequence prediction code will be released tomorrow.