torchtext Usage - 简书 (jianshu.com)
Drawback of one-hot encoding: the dimensionality is very large (one dimension per vocabulary word).
Hash encoding: relationships between texts cannot be represented; and since words are mapped to fixed-length hash values, hash collisions happen easily (see the sketch below).
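A minimal sketch of the hashing trick to make the collision problem concrete (this example is not from the original post; the bucket count is hypothetical and deliberately tiny):

import hashlib

NUM_BUCKETS = 8  # hypothetical, deliberately small so collisions show up quickly

def hash_bucket(word):
    # stable hash (md5), truncated to a bucket index
    return int(hashlib.md5(word.encode()).hexdigest(), 16) % NUM_BUCKETS

for w in ['i', 'love', 'you', 'me', 'movie', 'film']:
    print(w, hash_bucket(w))
# With only 8 buckets, different words frequently land in the same bucket,
# and the bucket index carries no information about how similar two words are.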
Tokenization methods
import string
string.punctuation  # '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'  the punctuation characters
s = "I love You"
for c in string.punctuation:
    s = s.replace(c, ' ').lower()  # strip punctuation and lowercase
# Tokenization 1: split into individual characters
# list(s)   # ['i', ' ', 'l', 'o', 'v', 'e', ' ', 'y', 'o', 'u']
# Tokenization 2: split on whitespace, one token per word
# s.split() # ['i', 'love', 'you']
# Tokenization 3: n-grams (see the bigram sketch below)
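A short sketch of tokenization method 3 (not in the original code): building 2-grams (bigrams) from the word-level tokens above.

tokens = s.split()                       # ['i', 'love', 'you']
bigrams = list(zip(tokens, tokens[1:]))  # [('i', 'love'), ('love', 'you')]
print(bigrams)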
Vectorization
import string
import numpy as np  # needed for np.unique and np.zeros below
import torch
import torch.nn as nn
# Vectorization: one-hot encoding
s = "I love You,you love me"
for c in string.punctuation:  # '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'  the punctuation characters
    s = s.replace(c, ' ').lower()
s.split()  # ['i', 'love', 'you', 'you', 'love', 'me']
vocab = dict((word, index) for index, word in enumerate(np.unique(s.split())))  # word -> index vocabulary
# np.unique(s.split()) alone carries no indices, so it is wrapped in enumerate
# vocab  # {'i': 0, 'love': 1, 'me': 2, 'you': 3}
s = [vocab.get(w) for w in s.split()]
# s  # [0, 1, 3, 3, 1, 2]
b = np.zeros((len(s), len(vocab)))
for index, i in enumerate(s):
    b[index, i] = 1  # in row `index`, set column i to 1
# b
# array([[1., 0., 0., 0.],
#        [0., 1., 0., 0.],
#        [0., 0., 0., 1.],
#        [0., 0., 0., 1.],
#        [0., 1., 0., 0.],
#        [0., 0., 1., 0.]])
# Word-embedding representation
# Embeddings are not only for text classification: any categorical feature can be represented
# this way, by mapping each category to a tensor.
em = torch.nn.Embedding(len(vocab), 20)  # map each word to a tensor of length 20
print(len(s))  # 6 -- each of the 6 tokens is mapped to a length-20 tensor
s_em = em(torch.LongTensor(s))
# print(s_em.shape)  # torch.Size([6, 20])
TorchText text-classification dataset
Preprocessing steps for text classification:
1. Tokenize
2. Build the vocabulary
3. Build the word-embedding representation
In a deep-learning model each word is treated like a class: just as every class gets an index starting from 0, every word in the corpus gets its own index. Building the vocabulary means creating this index encoding for each word.
Once the vocabulary exists, the words can be embedded: each word index is mapped to a tensor, and the tensors reflect the relationships between different words. This is done with a torch.nn.Embedding layer.
All word embeddings of a text are then aggregated, usually by averaging. Drawback: the order of the words (the context) is ignored. The approach is fast, but its accuracy is limited.
Use the TorchText library to preprocess the text. Load a dataset from torchtext.datasets, e.g. IMDB; tokenize with a TorchText tokenizer; build the vocabulary (word look-up table) with TorchText's vocabulary tools, which turns each text into a sequence of indices (index1 ... indexn); feed the indices into an EmbeddingBag layer, which looks up an embedding for every index and aggregates (pools) the embeddings; the pooled result goes to a linear layer, and the linear layer produces the classification output.
In deep learning, training is done on mini-batches (batches) of data.
For the movie-review example the reviews are not of equal length, which is a problem when building a batch: normally every sample in a batch must have the same length. How is this handled for text? Here the reviews do not need to be padded to equal length, because the EmbeddingBag layer is used: all texts in a batch are concatenated into one long index sequence. How does the EmbeddingBag layer know which tokens belong to which review? Along with the sequence it receives the offsets, e.g. positions 1-3 are review 1, positions 4-8 are review 2, and so on. The offsets tell the EmbeddingBag layer where each review starts, so the whole batch can be processed very efficiently as a single sequence (see the minimal offsets sketch below).
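A minimal offsets sketch (made-up indices, not from the original post), showing how an EmbeddingBag pools a batch that has been concatenated into one index sequence:

import torch
import torch.nn as nn

bag = nn.EmbeddingBag(num_embeddings=10, embedding_dim=4, mode='mean')  # pool by averaging
# two "reviews" concatenated into one index sequence: [2, 5, 1] and [7, 3]
text = torch.LongTensor([2, 5, 1, 7, 3])
offsets = torch.LongTensor([0, 3])  # review 1 starts at position 0, review 2 at position 3
out = bag(text, offsets)
print(out.shape)  # torch.Size([2, 4]) -- one pooled embedding per review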
Define the preprocessing functions.
import torch
import torchtext  # may require: pip install portalocker / pip install torchtext
from torchtext.data.utils import get_tokenizer         # tokenizer utility
from torchtext.vocab import build_vocab_from_iterator  # vocabulary-building utility
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data

train_iter, test_iter = torchtext.datasets.IMDB()
train_data, test_data = list(train_iter), list(test_iter)
# all_classes = set([label for (label, text) in train_data])
tokenizer = get_tokenizer('basic_english')
tokenizer('this is a aaaa PYtorch lesson')  # ['this', 'is', 'a', 'aaaa', 'pytorch', 'lesson']
# Text-processing steps:
# 1. tokenize
# 2. build the vocabulary:  he -> 30, her -> 31
# 3. embed:                 30 -> (0.2, 0.4, 0.2, 0.9, 2.1)   (alternatives: one-hot, tf-idf, hashing)
def yield_token(data):
    for (_, text) in data:
        yield tokenizer(text)
vocab = build_vocab_from_iterator(yield_token(train_data),
                                  specials=['<pad>', '<unk>'],  # <pad> is mapped to 0, <unk> to 1
                                  min_freq=3)                   # only words seen at least 3 times enter the vocabulary
vocab.set_default_index(vocab['<unk>'])  # unknown words are mapped to the index of <unk>, i.e. 1
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x == 'pos')  # map the label to 0/1
# note: in some torchtext versions the IMDB labels are the integers 1/2 rather than 'neg'/'pos';
# adjust label_pipeline accordingly if that is the case
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    text_list = torch.cat(text_list)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)  # start position of each text in the concatenated sequence
    return label_list.to(device), text_list.to(device), offsets.to(device)
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
train_dataloader = DataLoader(train_dataset, batch_size=64,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=64,
                             shuffle=True, collate_fn=collate_batch)
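A quick sanity check, not in the original post: pull one batch from train_dataloader to see what collate_batch returns.

label, text, offsets = next(iter(train_dataloader))
print(label.shape)    # e.g. torch.Size([64]): one label per review
print(text.shape)     # 1-D tensor: all 64 reviews concatenated into a single token sequence
print(offsets.shape)  # e.g. torch.Size([64]): start position of each review inside `text`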
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        # EmbeddingBag defaults to mode='mean': it pools the embeddings of each text by averaging
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)
num_class = 2
vocab_size = len(vocab)
emsize = 100
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)
loss_fn = nn.CrossEntropyLoss()
from torch.optim import lr_scheduler
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
def train(dataloader):
    total_acc, total_count, total_loss = 0, 0, 0
    model.train()
    for label, text, offsets in dataloader:
        predicted_label = model(text, offsets)
        loss = loss_fn(predicted_label, label)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item() * label.size(0)
    return total_loss / total_count, total_acc / total_count
def test(dataloader):
    model.eval()
    total_acc, total_count, total_loss = 0, 0, 0
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = loss_fn(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item() * label.size(0)
    return total_loss / total_count, total_acc / total_count
def fit(epochs, train_dl, test_dl):
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []
    for epoch in range(epochs):
        epoch_loss, epoch_acc = train(train_dl)
        epoch_test_loss, epoch_test_acc = test(test_dl)
        train_loss.append(epoch_loss)
        train_acc.append(epoch_acc)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_acc)
        exp_lr_scheduler.step()
        template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ,"
                    "test_loss: {:.5f}, test_acc: {:.1f}%")
        print(template.format(
            epoch, epoch_loss, epoch_acc * 100, epoch_test_loss, epoch_test_acc * 100))
    print("Done!")
    return train_loss, test_loss, train_acc, test_acc

EPOCHS = 30
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS,
                                                 train_dataloader,
                                                 test_dataloader)
epoch: 0, train_loss: 0.05308, train_acc: 99.9% ,test_loss: 0.01097, test_acc: 100.0%
epoch: 1, train_loss: 0.00760, train_acc: 100.0% ,test_loss: 0.00540, test_acc: 100.0%
epoch: 2, train_loss: 0.00440, train_acc: 100.0% ,test_loss: 0.00358, test_acc: 100.0%
epoch: 3, train_loss: 0.00311, train_acc: 100.0% ,test_loss: 0.00268, test_acc: 100.0%
epoch: 4, train_loss: 0.00241, train_acc: 100.0% ,test_loss: 0.00214, test_acc: 100.0%
epoch: 5, train_loss: 0.00197, train_acc: 100.0% ,test_loss: 0.00179, test_acc: 100.0%
epoch: 6, train_loss: 0.00167, train_acc: 100.0% ,test_loss: 0.00153, test_acc: 100.0%
epoch: 7, train_loss: 0.00145, train_acc: 100.0% ,test_loss: 0.00134, test_acc: 100.0%
epoch: 8, train_loss: 0.00128, train_acc: 100.0% ,test_loss: 0.00120, test_acc: 100.0%
epoch: 9, train_loss: 0.00115, train_acc: 100.0% ,test_loss: 0.00108, test_acc: 100.0%
epoch:10, train_loss: 0.00108, train_acc: 100.0% ,test_loss: 0.00107, test_acc: 100.0%
epoch:11, train_loss: 0.00107, train_acc: 100.0% ,test_loss: 0.00106, test_acc: 100.0%
epoch:12, train_loss: 0.00106, train_acc: 100.0% ,test_loss: 0.00105, test_acc: 100.0%
epoch:13, train_loss: 0.00105, train_acc: 100.0% ,test_loss: 0.00104, test_acc: 100.0%
epoch:14, train_loss: 0.00104, train_acc: 100.0% ,test_loss: 0.00103, test_acc: 100.0%
epoch:15, train_loss: 0.00103, train_acc: 100.0% ,test_loss: 0.00102, test_acc: 100.0%
epoch:16, train_loss: 0.00102, train_acc: 100.0% ,test_loss: 0.00101, test_acc: 100.0%
epoch:17, train_loss: 0.00102, train_acc: 100.0% ,test_loss: 0.00100, test_acc: 100.0%
epoch:18, train_loss: 0.00101, train_acc: 100.0% ,test_loss: 0.00099, test_acc: 100.0%
epoch:19, train_loss: 0.00100, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
epoch:20, train_loss: 0.00099, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
epoch:21, train_loss: 0.00099, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
epoch:22, train_loss: 0.00099, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
epoch:23, train_loss: 0.00099, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
epoch:24, train_loss: 0.00099, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
epoch:25, train_loss: 0.00099, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
epoch:26, train_loss: 0.00099, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
epoch:27, train_loss: 0.00099, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
epoch:28, train_loss: 0.00099, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
epoch:29, train_loss: 0.00099, train_acc: 100.0% ,test_loss: 0.00098, test_acc: 100.0%
Done!
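A short inference sketch (not part of the original notes), reusing the trained model, vocab, text_pipeline and device from above to score one hypothetical review string:

def predict_sentiment(review_text):
    # tokenize -> look up vocabulary indices -> a single-sample "batch" with offset 0
    with torch.no_grad():
        text = torch.tensor(text_pipeline(review_text), dtype=torch.int64).to(device)
        offsets = torch.tensor([0]).to(device)
        logits = model(text, offsets)
        return logits.argmax(1).item()  # 1 -> positive, 0 -> negative (per label_pipeline above)

predict_sentiment('this movie was surprisingly good')  # hypothetical example review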
# Word-embedding representation
em = torch.nn.Embedding(len(vocab), 20)  # map each word index to a tensor of length 20
s_em = em(torch.LongTensor(s))
s_em
# tensor([[ 0.6799, 0.0031, 1.8743, 0.5423, 0.4416, -0.5952, -0.2199, -0.5157,
# -0.3273, -1.3469, 0.0934, -0.7734, -1.1675, -1.7603, -0.6279, 0.0658,
# -0.0439, -0.7101, -2.8237, 0.5240],
# [ 1.2440, -0.6924, -1.5589, 1.0726, 0.5467, 0.4919, -0.5138, 0.6218,
# -0.5417, 1.8314, 0.7325, 0.3218, 0.1628, 0.0626, 0.3276, 0.2167,
# -0.3692, -0.7034, 0.0705, 0.3935],
# [-1.5574, 1.2716, -0.2661, 1.1207, -2.0768, -1.8648, -0.0386, -0.4261,
# -0.5502, 1.6994, -0.0997, 0.6697, -0.2014, 0.7134, 0.4169, -0.5963,
# 1.0591, 1.2019, 0.5507, 0.1586],
# [-0.2499, 2.1329, 0.0696, -0.7143, 1.9350, 0.3822, -0.6562, 0.5425,
# 0.1180, 2.6066, 2.2274, 1.8885, -0.2532, -1.0749, 2.3597, 0.1946,
# 0.5644, -2.3629, -0.5948, 1.7755],
# [-1.0459, 1.1564, 0.0283, -1.6528, 0.5663, 0.3941, 0.4620, -1.8766,
# 0.0099, 0.9699, 0.1013, -0.4488, 0.5943, -0.7578, -0.5864, 0.5749,
# -0.5252, -0.1428, -0.5080, -0.1126],
# [ 0.7462, -0.7669, -0.6719, -1.4782, 0.6049, 0.0729, 1.1420, -1.2980,
# 0.7976, 0.5524, 0.0607, 1.5890, 0.9569, -1.7878, 0.1119, -0.0818,
# -0.2928, -1.7373, -0.0768, 1.2419],
# [-0.1877, -1.6885, 0.2267, 0.0714, 0.0725, -0.0426, -0.4804, 1.3219,
# -0.9896, 0.6099, -0.1541, -0.1877, -1.5077, -0.6768, 0.5033, 0.3638,
# -0.0308, -2.2888, 1.4341, 1.1249],
# [ 1.9103, 0.4591, 1.5937, -0.6241, -0.8967, -1.3835, 0.6170, 1.4025,
# 0.4851, 1.4262, 0.0310, -0.7734, -1.2927, -0.2866, 0.8493, 0.1133,
# 0.2878, -1.1133, 1.7173, 0.0744],
# [ 0.2675, -0.5814, 0.0361, -1.1336, 0.0780, -1.1488, 0.1985, 1.6851,
# -0.1978, -1.8281, -1.3562, -0.8949, -1.1598, -1.5098, -0.6373, -1.2807,
# -1.7206, 1.2317, -0.3598, 0.0665]], grad_fn=<EmbeddingBackward0>)
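To connect this back to the EmbeddingBag pooling used in the classifier, a small illustration (not from the original post): copying the same weight matrix into an nn.EmbeddingBag with mode='mean' reproduces the average of the rows shown above.

bag = torch.nn.EmbeddingBag(len(vocab), 20, mode='mean')
bag.weight.data.copy_(em.weight.data)  # reuse the weights of `em` so the two layers are comparable
pooled = bag(torch.LongTensor(s), torch.LongTensor([0]))  # treat `s` as one text starting at offset 0
print(torch.allclose(pooled[0], s_em.mean(dim=0), atol=1e-6))  # True: mean pooling of the per-token embeddings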