Original book download: 「OReilly.PyTorch.Pocket.R...odels.149209000X.pdf」, shared via Aliyun Drive: https://www.aliyundrive.com/s/NZvnGbTYr6C
Chapter 4: Building Neural Network Applications from Existing Network Designs
This chapter uses three examples to show how convenient and efficient PyTorch is for developing neural network applications:
- Image classification with transfer learning
- Sentiment analysis in natural language processing
- Text-to-image generation with a GAN
Sentiment Analysis with Torchtext
Example source: https://github.com/bentrevett/pytorch-sentiment-analysis
Example: performing sentiment analysis on movie reviews
- Data processing
The IMDb (Internet Movie Database) dataset contains 25,000 movie reviews, each labeled as positive or negative.
PyTorch's Torchtext package provides convenient utilities for working with text data; the example in this section uses torchtext to prepare the review text.
import random
import torch

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
random.seed(SEED)

def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

generate_bigrams(['This', 'movie', 'is', 'awesome'])
# out:
# ['This', 'movie', 'is', 'awesome', 'This movie',
# 'movie is', 'is awesome']
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split

train_iter, test_iter = IMDB(
    split=('train', 'test'))  # <1>
train_dataset = list(train_iter)  # <2>
test_data = list(test_iter)

num_train = int(len(train_dataset) * 0.70)
train_data, valid_data = \
    random_split(train_dataset,
                 [num_train,
                  len(train_dataset) - num_train])  # <3>

print(len(train_data), len(valid_data), len(test_data))
# out: 17500 7500 25000
data_index = 21
print(train_data[data_index][0])
# out: (your results may vary)
# pos
print(train_data[data_index][1])
# out: (your results may vary)
# ['This', 'film', 'moved', 'me', 'beyond', 'comprehension', ...
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = get_tokenizer('spacy')  # <1>
counter = Counter()
for (label, line) in train_data:
    counter.update(generate_bigrams(
        tokenizer(line)))  # <2>
vocab = Vocab(counter,
              max_size=25000,
              vectors="glove.6B.100d",
              unk_init=torch.Tensor.normal_)  # <3>

print(len(counter))
print(len(vocab))
print(vocab['<pad>'])

tokens = generate_bigrams(tokenizer('this is the greatest movie ever!'))
for token in tokens:
    print(token, vocab[token])
"""
this 16
is 9
the 2
greatest 1457
movie 22
ever 170
! 40
this is 224
is the 181
greatest movie 0
movie ever 4749
the greatest 2349
ever ! 20411
"""
text_pipeline = lambda x: [vocab[token]
                           for token in generate_bigrams(tokenizer(x))]
label_pipeline = lambda x: 1 if x == 'pos' else 0

print(text_pipeline('the movie was horrible'))
# out: [2, 22, 19, 942, 157, 14859, 538]
print(label_pipeline('neg'))
# out: 0
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

device = torch.device("cuda" if
                      torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text))
        text_list.append(processed_text)
    # float32 labels match the model's float32 logits,
    # as BCEWithLogitsLoss expects
    return (torch.tensor(label_list, dtype=torch.float32).to(device),
            pad_sequence(text_list,
                         padding_value=1.0).to(device))
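To see what `pad_sequence` does here (default `batch_first=False`), a tiny standalone check with made-up values:
# pad_sequence stacks variable-length 1-D tensors into a (max_len, batch) tensor,
# filling shorter sequences with padding_value (1 is the index of '<pad>' above)
a = torch.tensor([4, 7, 9])
b = torch.tensor([5, 2])
print(pad_sequence([a, b], padding_value=1.0))
# tensor([[4, 5],
#         [7, 2],
#         [9, 1]])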
batch_size = 64

def batch_sampler():
    indices = [(i, len(tokenizer(s[1])))
               for i, s in enumerate(train_dataset)]
    random.shuffle(indices)
    pooled_indices = []
    # create pools of indices with similar lengths
    for i in range(0, len(indices), batch_size * 100):
        pooled_indices.extend(sorted(indices[i:i + batch_size * 100],
                                     key=lambda x: x[1]))
    pooled_indices = [x[0] for x in pooled_indices]
    # yield the indices for the current batch
    for i in range(0, len(pooled_indices), batch_size):
        yield pooled_indices[i:i + batch_size]
BATCH_SIZE = 64

train_dataloader = DataLoader(train_data,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_data,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              collate_fn=collate_batch)
test_dataloader = DataLoader(test_data,
                             batch_size=BATCH_SIZE,
                             shuffle=True,
                             collate_fn=collate_batch)
label, text = next(iter(train_dataloader))
print(label.size(), text.size())
print(label, text)
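As an alternative to plain shuffled batches, the `batch_sampler()` generator defined above can drive the DataLoader directly, grouping reviews of similar length so that less padding is wasted. A sketch of that wiring; note that a generator is exhausted after one epoch, so it has to be recreated for each pass over the data:
# length-bucketed batches: batch_size and shuffle must be omitted, because
# the sampler itself defines which indices form each batch
bucketed_dataloader = DataLoader(train_dataset,
                                 batch_sampler=batch_sampler(),
                                 collate_fn=collate_batch)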
- Model design
import torch.nn as nn
import torch.nn.functional as F

class FastText(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 output_dim,
                 pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(
            vocab_size,
            embedding_dim,
            padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim,
                            output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        embedded = embedded.permute(1, 0, 2)
        pooled = F.avg_pool2d(
            embedded,
            (embedded.shape[1], 1)).squeeze(1)
        return self.fc(pooled)
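A quick shape walkthrough makes the pooling step concrete: the input is a (seq_len, batch) tensor of token indices; after embedding and permuting, `avg_pool2d` averages across the whole sequence, so each review collapses to a single embedding vector before the linear layer. Illustrative sizes only:
# dummy (seq_len=7, batch=2) indices through a small FastText instance
tiny = FastText(vocab_size=100, embedding_dim=8, output_dim=1, pad_idx=1)
dummy = torch.randint(0, 100, (7, 2))
print(tiny(dummy).shape)
# out: torch.Size([2, 1]) -- one logit per example in the batch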
model = FastText(
    vocab_size=len(vocab),
    embedding_dim=100,
    output_dim=1,
    pad_idx=vocab['<pad>'])

pretrained_embeddings = vocab.vectors  # <1>
model.embedding.weight.data.copy_(
    pretrained_embeddings)  # <2>

EMBEDDING_DIM = 100
unk_idx = vocab['<unk>']  # <3>
pad_idx = vocab['<pad>']
model.embedding.weight.data[unk_idx] = \
    torch.zeros(EMBEDDING_DIM)  # <4>
model.embedding.weight.data[pad_idx] = \
    torch.zeros(EMBEDDING_DIM)
- Training and validation
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

for epoch in range(5):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for label, text in train_dataloader:
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)
        rounded_preds = torch.round(
            torch.sigmoid(predictions))
        correct = \
            (rounded_preds == label).float()
        acc = correct.sum() / len(correct)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    print("Epoch %d Train: Loss: %.4f Acc: %.4f" %
          (epoch,
           epoch_loss / len(train_dataloader),
           epoch_acc / len(train_dataloader)))

    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for label, text in valid_dataloader:
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, label)
            rounded_preds = torch.round(
                torch.sigmoid(predictions))
            correct = \
                (rounded_preds == label).float()
            acc = correct.sum() / len(correct)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    print("Epoch %d Valid: Loss: %.4f Acc: %.4f" %
          (epoch,
           epoch_loss / len(valid_dataloader),
           epoch_acc / len(valid_dataloader)))
# out: (your results may vary)
# Epoch 0 Train: Loss: 0.6523 Acc: 0.7165
# Epoch 0 Valid: Loss: 0.5259 Acc: 0.7474
# Epoch 1 Train: Loss: 0.5935 Acc: 0.7765
# Epoch 1 Valid: Loss: 0.4571 Acc: 0.7933
# Epoch 2 Train: Loss: 0.5230 Acc: 0.8257
# Epoch 2 Valid: Loss: 0.4103 Acc: 0.8245
# Epoch 3 Train: Loss: 0.4559 Acc: 0.8598
# Epoch 3 Valid: Loss: 0.3828 Acc: 0.8549
# Epoch 4 Train: Loss: 0.4004 Acc: 0.8813
# Epoch 4 Valid: Loss: 0.3781 Acc: 0.8675
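A common refinement, used in the source repository this example is adapted from but omitted here, is to checkpoint the weights only when the validation loss improves. A sketch of the extra bookkeeping around the validation step:
# before the epoch loop:
best_valid_loss = float('inf')

# inside the epoch loop, after the validation pass:
valid_loss = epoch_loss / len(valid_dataloader)
if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'fasttext-model.pt')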
- Testing and deployment
test_loss = 0
test_acc = 0
model.eval()  # <1>
with torch.no_grad():  # <1>
    for label, text in test_dataloader:
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)
        rounded_preds = torch.round(
            torch.sigmoid(predictions))
        correct = \
            (rounded_preds == label).float()
        acc = correct.sum() / len(correct)
        test_loss += loss.item()
        test_acc += acc.item()

print("Test: Loss: %.4f Acc: %.4f" %
      (test_loss / len(test_dataloader),
       test_acc / len(test_dataloader)))
# out: (your results will vary)
# Test: Loss: 0.3821 Acc: 0.8599
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    model.eval()
    text = torch.tensor(text_pipeline(sentence)).unsqueeze(1).to(device)
    prediction = torch.sigmoid(model(text))
    return prediction.item()

sentiment = predict_sentiment(model,
                              "Don't waste your time")
print(sentiment)
# out: 4.763594888613835e-34
sentiment = predict_sentiment(model,
                              "You gotta see this movie!")
print(sentiment)
# out: 0.941755473613739
- Saving the model
torch.save(model.state_dict(), 'fasttext-model.pt')
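To reuse the saved weights later, rebuild the model with the same hyperparameters, load the state dict, and switch to eval mode; a minimal sketch:
# reload the checkpoint for inference (requires the same FastText class and vocab)
model = FastText(vocab_size=len(vocab),
                 embedding_dim=100,
                 output_dim=1,
                 pad_idx=vocab['<pad>'])
model.load_state_dict(torch.load('fasttext-model.pt'))
model = model.to(device)
model.eval()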
Errors encountered while running the program
- [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.
Solution: install the model directly from GitHub:
pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
- spaCy installation fails with "Creating a shortcut link for 'en' didn't work", and `from spacy.en import English` raises an error.
Solution: https://www.jianshu.com/p/60a18659ba71
On Windows, open the Anaconda Prompt with administrator privileges, then run:
python -m spacy download en