pip install pandas jieba scikit-learn -i https://pypi.doubanio.com/simple
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import jieba
from torch.utils.data import DataLoader
data = pd.read_csv(r'D:/data/waimai_10k.csv')
# data.head()
# label review
# 0 1 很快,好吃,味道足,量大
# 1 1 没有送水没有送水没有送水
# 2 1 非常快,态度好。
# 3 1 方便,快捷,味道可口,快递给力
# 4 1 菜味道很棒!送餐很及时!
# data.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 11987 entries, 0 to 11986
# Data columns (total 2 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 label 11987 non-null int64
# 1 review 11987 non-null object
# dtypes: int64(1), object(1)
# memory usage: 187.4+ KB
# data.label.value_counts() # check class balance: 7987 vs 4000, so the two classes are noticeably imbalanced
# 0 7987
# 1 4000
# Name: label, dtype: int64
# Strip the most common punctuation marks, then tokenize
def pre_text(text):
    text = text.replace('!', '').replace(',', '').replace('。', '')
    return jieba.lcut(text)  # jieba.lcut splits the text directly into a list of words
data['review'] = data.review.apply(pre_text)
# Tokenization is done and the original row order is preserved
# data.review
# 0 [很快, 好吃, 味道, 足量, 大]
# 1 [没有, 送水, 没有, 送水, 没有, 送水]
# 2 [非常, 快, 态度, 好]
# 3 [方便快捷, 味道, 可口, 快, 递给, 力]
# 4 [菜, 味道, 很棒, 送餐, 很, 及时]
# ...
# 11982 [以前, 几乎, 天天, 吃, 现在, 调料, 什么, 都, 不放]
# 11983 [昨天, 订, 凉皮, 两份, 什么, 调料, 都, 没有, 放, 就, 放, 了, 点, ...
# 11984 [凉皮, 太辣, ,, 吃不下, 都]
# 11985 [本来, 迟到, 了, 还, 自己, 点]
# 11986 [肉夹馍, 不错, 羊肉, 泡馍, 酱肉, 包, 很, 一般, 凉面, 没, 想象, 中, ...
# Name: review, Length: 11987, dtype: object
from torchtext.vocab import build_vocab_from_iterator  # utility for building the vocabulary
def yield_tokens(data):
    for text in data:
        yield text
vocab = build_vocab_from_iterator(yield_tokens(data.review), specials=["<pad>", "<unk>"], min_freq=2)
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)
# data.review[0]
# ['很快', '好吃', '味道', '足量', '大']
# vocab(data.review[0])
# [55, 14, 13, 5228, 114]
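# A few quick checks on the vocabulary (a sketch; the ids follow from the output above and from
# the order in which the specials were passed):
# vocab["<pad>"], vocab["<unk>"]     # specials come first -> (0, 1)
# vocab.lookup_tokens([55, 14, 13])  # map ids back to tokens -> ['很快', '好吃', '味道']
# vocab(['从未见过的词'])            # an unseen token falls back to the default index -> [1]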
i = int(len(data)*0.8)
train_data = data.sample(i)  # randomly sample 80% of the rows as the training set
# data.index
# RangeIndex(start=0, stop=11987, step=1)
# train_data.index
# Int64Index([ 8068, 11711, 6382, 11725, 8037, 9812, 8538, 3873, 674,
# 2533,
# ...
# 5346, 2959, 7224, 1695, 5973, 7100, 7643, 10514, 228,
# 4515],
# dtype='int64', length=9589)
test_data = data.iloc[data.index[~data.index.isin(train_data.index)]]  # the remaining rows form the test set
# test_data.values
# array([[1,
# list(['超级', '快', '就', '送到', '了', '这么', '冷', '的', '天气', '骑士', '们', '辛苦', '了', '谢谢你们', '麻辣', '香锅', '依然', '很', '好吃'])],
# [1,
# list(['最后', '五分钟', '订', '的', '卖家', '特别', '好', '接单', '了', '谢谢'])],
# [1, list(['量', '大', '好吃', '每次', '点', '的', '都', '够吃', '两次'])],
# ...,
# [0, list(['不吃', '辣', '都', '给', '的', '我们', '辣'])],
# [0, list(['鸡蛋', '都', '坏', '了', '凉菜', '也', '洒', '了'])],
# [0, list(['凉皮', '太辣', ',', '吃不下', '都'])]], dtype=object)
# Build the DataLoaders with a custom collate function
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(vocab(_text), dtype=torch.int64)  # tokens -> ids
        text_list.append(processed_text)
    label_list = torch.tensor(label_list)
    # pad to the longest sequence in the batch; the default padding_value=0 matches the "<pad>" index,
    # and the default batch_first=False gives a (seq_len, batch) tensor
    text_list = torch.nn.utils.rnn.pad_sequence(text_list)
    return label_list.to(device), text_list.to(device)
train_dataloader = DataLoader(train_data.values, batch_size=64, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_data.values, batch_size=64, shuffle=False, collate_fn=collate_batch)
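# Quick look at one collated batch (a sketch): because pad_sequence uses its default batch_first=False,
# text has shape (seq_len, batch_size) while label has shape (batch_size,).
label_batch, text_batch = next(iter(train_dataloader))
print(label_batch.shape, text_batch.shape)  # e.g. torch.Size([64]) torch.Size([longest_in_batch, 64])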
# Embedding: map each token to a dense vector
embeding_dim = 100
hidden_size = 200
# This reference implementation uses a bidirectional LSTM; note that hidden_size is
# multiplied by 2 in the definition of self.fc1.
class BIRNN_Net(nn.Module):
    def __init__(self, vocab_size, embeding_dim, hidden_size):
        super(BIRNN_Net, self).__init__()
        self.em = nn.Embedding(vocab_size, embeding_dim)
        self.rnn = nn.LSTM(embeding_dim, hidden_size, bidirectional=True)
        self.fc1 = nn.Linear(hidden_size*2, 64)
        self.fc2 = nn.Linear(64, 2)
    def forward(self, inputs):
        x = self.em(inputs)
        x = F.dropout(x, training=self.training)  # functional dropout must be told whether we are training
        x, _ = self.rnn(x)
        x = F.dropout(F.relu(self.fc1(x[-1])), training=self.training)  # x[-1]: output of the last time step
        x = self.fc2(x)
        return x
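# Why hidden_size*2: the bidirectional LSTM concatenates the forward and backward hidden states,
# so every time step carries 2*hidden_size features, and x[-1] keeps only the last time step.
# A minimal shape check with a throwaway instance and random token ids (a sketch, not part of training):
_tmp_model = BIRNN_Net(vocab_size, embeding_dim, hidden_size)
_dummy_ids = torch.randint(0, vocab_size, (20, 4))  # (seq_len=20, batch=4)
print(_tmp_model(_dummy_ids).shape)                 # torch.Size([4, 2]): one pair of logits per sample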
model = BIRNN_Net(vocab_size, embeding_dim, hidden_size).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), betas=(0.5, 0.5), lr=0.005)
def train(dataloader):
    total_acc, total_count, total_loss = 0, 0, 0
    model.train()
    for label, text in dataloader:
        predicted_label = model(text)
        loss = loss_fn(predicted_label, label)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count
def test(dataloader):
    model.eval()
    total_acc, total_count, total_loss = 0, 0, 0
    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            loss = loss_fn(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count
def fit(epochs, train_dl, test_dl):
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []
    for epoch in range(epochs):
        epoch_loss, epoch_acc = train(train_dl)
        epoch_test_loss, epoch_test_acc = test(test_dl)
        train_loss.append(epoch_loss)
        train_acc.append(epoch_acc)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_acc)
        template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ,"
                    "test_loss: {:.5f}, test_acc: {:.1f}%")
        print(template.format(
            epoch, epoch_loss, epoch_acc*100, epoch_test_loss, epoch_test_acc*100))
    print("Done!")
    return train_loss, test_loss, train_acc, test_acc
EPOCHS = 25
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, train_dataloader, test_dataloader)
epoch: 0, train_loss: 0.66007, train_acc: 67.8% ,test_loss: 0.55988, test_acc: 74.8%
epoch: 1, train_loss: 0.43784, train_acc: 81.6% ,test_loss: 0.43157, test_acc: 84.5%
epoch: 2, train_loss: 0.35181, train_acc: 86.7% ,test_loss: 0.36999, test_acc: 86.6%
epoch: 3, train_loss: 0.31692, train_acc: 88.0% ,test_loss: 0.37560, test_acc: 86.9%
epoch: 4, train_loss: 0.29593, train_acc: 88.8% ,test_loss: 0.39419, test_acc: 86.8%
epoch: 5, train_loss: 0.28346, train_acc: 89.7% ,test_loss: 0.38223, test_acc: 88.2%
epoch: 6, train_loss: 0.27074, train_acc: 90.0% ,test_loss: 0.39987, test_acc: 87.7%
epoch: 7, train_loss: 0.26633, train_acc: 90.4% ,test_loss: 0.36834, test_acc: 88.6%
epoch: 8, train_loss: 0.25801, train_acc: 91.2% ,test_loss: 0.41890, test_acc: 86.9%
epoch: 9, train_loss: 0.25385, train_acc: 91.3% ,test_loss: 0.36202, test_acc: 87.7%
epoch:10, train_loss: 0.25404, train_acc: 91.2% ,test_loss: 0.37089, test_acc: 88.2%
epoch:11, train_loss: 0.24154, train_acc: 91.3% ,test_loss: 0.41041, test_acc: 87.6%
epoch:12, train_loss: 0.23717, train_acc: 91.6% ,test_loss: 0.38667, test_acc: 88.0%
epoch:13, train_loss: 0.23823, train_acc: 91.8% ,test_loss: 0.41536, test_acc: 87.8%
epoch:14, train_loss: 0.23486, train_acc: 91.9% ,test_loss: 0.39620, test_acc: 87.5%
epoch:15, train_loss: 0.23189, train_acc: 92.2% ,test_loss: 0.37189, test_acc: 88.2%
epoch:16, train_loss: 0.22775, train_acc: 92.4% ,test_loss: 0.36210, test_acc: 89.0%
epoch:17, train_loss: 0.23258, train_acc: 92.2% ,test_loss: 0.50890, test_acc: 87.6%
epoch:18, train_loss: 0.22637, train_acc: 92.7% ,test_loss: 0.44622, test_acc: 87.7%
epoch:19, train_loss: 0.22351, train_acc: 92.6% ,test_loss: 0.44134, test_acc: 88.3%
epoch:20, train_loss: 0.22929, train_acc: 92.7% ,test_loss: 0.43854, test_acc: 87.6%
epoch:21, train_loss: 0.22166, train_acc: 92.8% ,test_loss: 0.58517, test_acc: 87.9%
epoch:22, train_loss: 0.22993, train_acc: 92.4% ,test_loss: 0.40604, test_acc: 88.8%
epoch:23, train_loss: 0.22971, train_acc: 92.6% ,test_loss: 0.41986, test_acc: 88.5%
epoch:24, train_loss: 0.23276, train_acc: 92.5% ,test_loss: 0.44144, test_acc: 88.3%
Done!
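# The histories returned by fit can be plotted to visualize the overfitting trend that is already
# visible in the log above (a minimal sketch, assuming matplotlib is installed):
import matplotlib.pyplot as plt
plt.plot(range(EPOCHS), train_loss, label='train_loss')
plt.plot(range(EPOCHS), test_loss, label='test_loss')
plt.legend()
plt.show()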
# data.review.values gives the token list of every review;
# np.concatenate combines all of those tokens into a single array,
# so that pd.value_counts can count how often each word occurs.
np.concatenate(data.review.values)
# array(['很快', '好吃', '味道', ..., '倒', '是', '很快'], dtype='<U14')
pd.value_counts(np.concatenate(data.review.values))
# 了 9397
# 的 7836
# , 4212
# 很 2257
# 都 2192
# ...
# 饿昏 1
# 气人 1
# 味似 1
# 长长 1
# 绝望 1
# Length: 11330, dtype: int64
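# The same counts can be cross-checked with the standard library (a minimal sketch using collections.Counter):
from collections import Counter
word_counter = Counter(w for tokens in data.review for w in tokens)
word_counter.most_common(3)
# e.g. [('了', 9397), ('的', 7836), ...]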
# Keep the words that occur more than twice; drop words that appear only once or twice
word_count = pd.value_counts(np.concatenate(data.review.values))
word_count[word_count > 2]
# 了 9397
# 的 7836
# , 4212
# 很 2257
# 都 2192
# ...
# 边 3
# 乌龟 3
# 死难 3
# 排骨面 3
# 够呛 3
# Length: 3870, dtype: int64
# Put the words (the index of the counts) into a list
# Note: word_count was not reassigned above, so word_list still contains every word, including rare ones.
word_list = list(word_count.index)
word_list
#['了',
# '的',
# ',',
# '很',
# '都',
# '是',
# '我',
# Get the numeric representation of each word
word_list.index('好吃')
# 12
# Map each word to its index in the dictionary; index 0 is reserved for unknown words
word_index = dict((word,word_list.index(word) + 1) for word in word_list)
word_index
# {'了': 1,
# '的': 2,
# ',': 3,
# '很': 4,
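# word_list.index(word) rescans the whole list for every word, which is quadratic overall;
# the same mapping can be built in a single pass with enumerate (a minimal equivalent sketch):
word_index = {word: i + 1 for i, word in enumerate(word_list)}  # index 0 stays reserved for unknown words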
# Apply the index to encode each review as a list of integers
text = data.review.apply(lambda x: [word_index.get(word,0) for word in x])
text
# 0 [54, 13, 12, 4839, 112]
# 1 [21, 3389, 21, 3389, 21, 3389]
# 2 [44, 35, 64, 11]
# 3 [2738, 12, 1388, 35, 2533, 518]
# 4 [40, 12, 395, 14, 4, 290]
# ...
# Normalize the length of the texts
max(len(x) for x in text)  # the maximum length is 279
# Most reviews are much shorter, so instead of padding to the maximum length,
# pad every sequence to 20 tokens and truncate anything longer.
text_len=20
pad_text = [L + (text_len - len(L))*[0] if len(L)<=text_len else L[:text_len] for L in text]
pad_text
#[[54, 13, 12, 4839, 112, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# [21, 3389, 21, 3389, 21, 3389, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# [44, 35, 64, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# [2738, 12, 1388, 35, 2533, 518, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
pad_text = np.array(pad_text)
pad_text.shape
# (11987, 20)  every sequence has been normalized to length 20
labels = data.label.values
labels.shape
# (11987,)
# Split into training and test sets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(pad_text,labels)
x_train.shape,x_test.shape,y_train.shape
# ((8990, 20), (2997, 20), (8990,))
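# These arrays can be wrapped into a TensorDataset to drive a training loop like the one above
# (a minimal sketch; the batch size of 64 is just a placeholder):
from torch.utils.data import TensorDataset
train_ds = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
test_ds = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)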