1. Crawling the PDB_ID list & data preprocessing
import re
import os
import json
import time
import socket
import threading
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from multiprocessing import Pool
from urllib.request import urlopen
socket.setdefaulttimeout(60)
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
url_base = 'http://huanglab.phys.hust.edu.cn/pepbdb/browse.php?pagenum='
url_detail = 'http://huanglab.phys.hust.edu.cn'
pattern = re.compile(r'href=\".*\" ')
PDB_ID = []
# for page in tqdm(range(1, 267)):
#     html = urlopen(url_base + str(page), timeout=30).read().decode('utf-8')
#     soup = BeautifulSoup(html, features='lxml')          # parse with the lxml backend
#     target = soup.find_all('td', {"class": "tablink"})   # cells holding the PDB ID links
#     for sample in target:
#         PDB_ID.append(pattern.findall(str(sample))[0][6:-2])
# PDB_ID = PDB_ID[:-1]
# with open(r'data/PDB_url.txt', 'w') as PDB:
#     for i in PDB_ID:
#         PDB.write(i + '\n')
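As a quick sanity check, the slice `[6:-2]` strips the leading `href="` and the trailing `" ` from each regex match. A minimal sketch on an illustrative table cell (the path below is hypothetical, not taken from the site):
sample = '<td class="tablink"><a href="/pepbdb/db/1a0m_A/" >1a0m_A</a></td>'  # hypothetical cell
print(pattern.findall(sample)[0][6:-2])  # -> /pepbdb/db/1a0m_A/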
with open('data/PDB_url.txt', 'r') as PDB:
    for line in PDB.readlines():  # read the cached ID list back in
        PDB_ID.append(line.strip())
Interacting_peptide_residues, Interacting_receptor_residues, Peptide_sequence, Receptor_sequence, Name = [], [], [], [], []
lock = threading.Lock()  # keep the five result lists row-aligned across threads
def craw_scipy(x, y):
    for pdb in tqdm(PDB_ID[x:y]):
        url_final = url_detail + pdb
        html = urlopen(url_final, timeout=10).read().decode('utf-8')
        soup = BeautifulSoup(html, features='lxml')  # parse with the lxml backend
        tds = soup.find_all('td')                    # parse the detail table once
        with lock:  # append all five fields together so rows stay aligned
            Name.append(re.compile(r'...._.').findall(str(tds[1]))[0])
            Interacting_peptide_residues.append(tds[6].text)
            Interacting_receptor_residues.append(tds[7].text)
            Peptide_sequence.append(tds[8].text)
            Receptor_sequence.append(tds[9].text)
threads = []
for i in range(14):  # 14 slices of 1000 cover all 13299 IDs (the original started at 1000 and skipped 0-999)
    start, end = i * 1000, min((i + 1) * 1000, len(PDB_ID))
    threads.append(threading.Thread(name='t' + str(i), target=craw_scipy, args=(start, end)))
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for every slice to finish before building the DataFrame
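Note that `multiprocessing.Pool` is imported above but never used. A variant that returns one row per page, instead of mutating shared lists, sidesteps the alignment problem entirely; a sketch using `concurrent.futures` (not part of the original code):
from concurrent.futures import ThreadPoolExecutor

def craw_one(pdb):  # fetch one detail page and return its fields as a row
    html = urlopen(url_detail + pdb, timeout=10).read().decode('utf-8')
    tds = BeautifulSoup(html, features='lxml').find_all('td')
    name = re.compile(r'...._.').findall(str(tds[1]))[0]
    return name, tds[6].text, tds[7].text, tds[8].text, tds[9].text

# with ThreadPoolExecutor(max_workers=13) as ex:
#     rows = list(ex.map(craw_one, PDB_ID))  # row order matches PDB_ID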
2. A total of 13,299 PDB_IDs were crawled
BDB_dict = {
    'Name': Name,
    'Interacting_peptide_residues': Interacting_peptide_residues,
    'Interacting_receptor_residues': Interacting_receptor_residues,
    'Peptide_sequence': Peptide_sequence,
    'Receptor_sequence': Receptor_sequence
}
df = pd.DataFrame(BDB_dict)
df = df[~df['Peptide_sequence'].str.contains('X')]   # drop peptides containing the unknown residue 'X'
df = df[~df['Receptor_sequence'].str.contains('X')]  # drop receptors containing the unknown residue 'X'
df = df[df['Receptor_sequence'].str.count('>') <= 1] # keep receptors with a single sequence (at most one '>' header)
df['Interacting_peptide_residues'] = df['Interacting_peptide_residues'].str[2:-2]  # strip leading/trailing characters left by scraping
df['Interacting_receptor_residues'] = df['Interacting_receptor_residues'].str[2:-2]
df['Peptide_sequence'] = df['Peptide_sequence'].str[2:]
df['Receptor_sequence'] = df['Receptor_sequence'].str[2:]
df = df[df.Name != '6mk1_Z']  # manually exclude the entry 6mk1_Z
df = df.reset_index(drop = True)
df.to_csv('data/data.csv')
After removing non-standard amino acids and dropping multi-sequence receptors, 6,444 entries remain.
vocab_list = []
for i in df.Receptor_sequence:
    vocab_list += [j for j in i]
vocab = {}
vocab['pad'] = 0
tokens = list(set(vocab_list))  # build the token set once (assumed to also cover all peptide residues)
for i in range(len(tokens)):
    vocab[tokens[i]] = i + 1
vocab['DOCK'] = 22
with open('data/vocab.json', 'w') as f:
    json.dump(vocab, f)
print('vocab: ' + str(vocab))
vocab: {'pad': 0, 'L': 1, 'Q': 2, 'N': 3, 'V': 4, 'H': 5, 'A': 6, 'I': 7, 'M': 8, 'D': 9, 'C': 10, 'W': 11, 'E': 12, 'Y': 13, 'T': 14, 'P': 15, 'K': 16, 'G': 17, 'F': 18, 'U': 19, 'S': 20, 'R': 21, 'DOCK': 22}
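With the vocabulary saved, any sequence can be encoded to integer ids and right-padded with `vocab['pad']`; a minimal sketch (the default length 49 matches the peptide maximum found below):
def encode(seq, vocab, max_len=49):  # map residues to ids and right-pad with 0
    ids = [vocab[token] for token in seq]
    return ids + [vocab['pad']] * (max_len - len(ids))

print(encode('GAV', vocab)[:5])  # -> [17, 6, 4, 0, 0] under the vocab printed above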
data_count = []
for Peptide_sequence in df.Peptide_sequence:
    tmp = [0] * 21  # per-sequence residue counts over the 21 residue tokens ('pad'/'DOCK' excluded)
    for token in Peptide_sequence:
        tmp[vocab[token] - 1] += 1
    data_count.append(tmp)
for Receptor_sequence in df.Receptor_sequence:
    tmp = [0] * 21
    for token in Receptor_sequence:
        tmp[vocab[token] - 1] += 1
    data_count.append(tmp)
heat_matrix = [[0 for i in range(21)] for j in range(21)]  # residue co-occurrence counts ("heat_martix" typo fixed)
for data in tqdm(data_count):
    for source in range(len(data)):
        for target in range(len(data)):
            if data[source] > 0 and data[target] > 0:
                heat_matrix[source][target] += 1
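The triple loop above counts, for every residue pair, the number of sequences in which both residues appear at least once. It is equivalent to a single matrix product over the binary presence matrix; a vectorized sketch:
presence = (np.array(data_count) > 0).astype(int)  # (n_sequences, 21) presence indicators
heat_matrix_np = presence.T @ presence             # pairwise co-occurrence counts
assert (heat_matrix_np == np.array(heat_matrix)).all()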
3. Plotting the feature heat map
mask = np.zeros_like(heat_matrix)
mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
heatmap = pd.DataFrame(heat_matrix)
heatmap.index = pd.Series(list(vocab.keys())[1:-1])    # the 21 residue labels ('pad' and 'DOCK' excluded)
heatmap.columns = pd.Series(list(vocab.keys())[1:-1])
plt.figure(dpi=100, figsize=(6, 5))
with sns.axes_style("white"):
    ax = sns.heatmap(heatmap, linewidths=.6, mask=mask, square=True, cmap="YlGnBu")
plt.show()

Maximum Peptide_sequence length: 49
Maximum Receptor_sequence length: 1569
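For reference, the two maxima quoted above can be reproduced directly from the DataFrame:
print('Maximum Peptide_sequence length:', max(len(s) for s in df.Peptide_sequence))
print('Maximum Receptor_sequence length:', max(len(s) for s in df.Receptor_sequence))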
len_Receptor_sequence = [len(i) for i in df.Receptor_sequence]
plt.figure(figsize=(15,5))
plt.hist(len_Receptor_sequence, bins=40, density=False, facecolor="tab:blue", edgecolor="tab:orange", alpha=0.7)
plt.show()

The plan is to ignore receptors longer than 600 residues (<3% of the data).
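The cutoff itself is not applied in the code shown; a minimal sketch of that filter would be:
df = df[df.Receptor_sequence.str.len() <= 600]  # hypothetical cutoff, matching the 600-residue plan above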
df = df.reset_index(drop = True)
file_list = []
for filename in os.listdir(r'C:\Users\DELL\Desktop\BDB_seq_lab\data\pepbdb'):
    file_list.append(filename)
for i in df.Name:
    if i not in file_list:
        print('Receptor {} is not in the download list'.format(i))
miss_count = []
for i in range(len(df)):
    fname = './data/pepbdb/{}/peptide.pdb'.format(str(df.Name[i]))
    with open(fname, 'r') as f:  # open the peptide PDB file
        first_line = f.readline()
    need_minus = 0
    try:
        head = first_line.split()[5]  # residue sequence number of the first atom
        if int(head) != 1:
            need_minus = int(head) - 1  # offset so residue numbering starts at 1
        residues = [int(k) - need_minus for k in df.Interacting_peptide_residues[i].replace(' ', '').split(',')]
        df.loc[i, 'Interacting_peptide_residues'] = ' '.join(str(r) for r in residues)
    except (IndexError, ValueError):
        miss_count.append(i)
df.drop(miss_count, inplace=True)
df = df.reset_index(drop = True)
miss_count = []
for i in range(len(df)):
    fname = './data/pepbdb/{}/receptor.pdb'.format(str(df.Name[i]))
    with open(fname, 'r') as f:  # open the receptor PDB file
        first_line = f.readline()
    need_minus = 0
    try:
        head = first_line.split()[5]  # residue sequence number of the first atom
        if int(head) != 1:
            need_minus = int(head) - 1  # offset so residue numbering starts at 1
        residues = [int(k) - need_minus for k in df.Interacting_receptor_residues[i].replace(' ', '').split(',')]
        df.loc[i, 'Interacting_receptor_residues'] = ' '.join(str(r) for r in residues)
    except (IndexError, ValueError):
        miss_count.append(i)
df.drop(miss_count, inplace=True)
df = df.reset_index(drop = True)
df.to_csv('data/process_data.csv')
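To illustrate the renumbering above: if a PDB file's first atom sits on residue 3, an interaction list '5, 7' becomes '3 5'. A self-contained check on a hypothetical ATOM line:
first_line = 'ATOM      1  N   GLY A   3      11.104   6.134  -6.504  1.00  0.00'  # hypothetical line
head = first_line.split()[5]       # residue sequence number of the first atom -> '3'
need_minus = int(head) - 1         # shift so numbering starts at 1
print([k - need_minus for k in (5, 7)])  # -> [3, 5]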
The protein docking sequence prediction code will be released tomorrow.