PyTorch Deep Learning in Practice 28 - Takeout Review Sentiment Prediction (Chinese)

Author: 薛东弗斯 | Published 2023-05-09 21:09
    pip install pandas jieba scikit-learn -i https://pypi.doubanio.com/simple
    
    import torch
    import torchtext
    import torch.nn as nn
    import torch.nn.functional as F
    import numpy as np
    import pandas as pd
    import jieba
    from torch.utils.data import DataLoader
    
    data = pd.read_csv(r'D:/data/waimai_10k.csv')
    
    # data.head()
    #       label   review
    #   0   1   很快,好吃,味道足,量大
    #   1   1   没有送水没有送水没有送水
    #   2   1   非常快,态度好。
    #   3   1   方便,快捷,味道可口,快递给力
    #   4   1   菜味道很棒!送餐很及时!
    
    # data.info()
    #   <class 'pandas.core.frame.DataFrame'>
    #   RangeIndex: 11987 entries, 0 to 11986
    #   Data columns (total 2 columns):
    #    #   Column  Non-Null Count  Dtype 
    #   ---  ------  --------------  ----- 
    #    0   label   11987 non-null  int64 
    #    1   review  11987 non-null  object
    #   dtypes: int64(1), object(1)
    #   memory usage: 187.4+ KB
    
    # data.label.value_counts()   # check whether the classes are balanced (0 = negative, 1 = positive)
    #   0    7987
    #   1    4000
    #   Name: label, dtype: int64
    
    # Strip a few common punctuation marks (not all punctuation), then tokenize
    def pre_text(text):
        text = text.replace('!', '').replace(',', '').replace('。', '')
        return jieba.lcut(text)    # jieba.lcut tokenizes the text and returns a list of words
        
    data['review'] = data.review.apply(pre_text)   
    
    # Tokenization is complete and the row order is unchanged
    # data.review
    #   0                                      [很快, 好吃, 味道, 足量, 大]
    #   1                                 [没有, 送水, 没有, 送水, 没有, 送水]
    #   2                                           [非常, 快, 态度, 好]
    #   3                                 [方便快捷, 味道, 可口, 快, 递给, 力]
    #   4                                   [菜, 味道, 很棒, 送餐, 很, 及时]
    #                                  ...                        
    #   11982                   [以前, 几乎, 天天, 吃, 现在, 调料, 什么, 都, 不放]
    #   11983    [昨天, 订, 凉皮, 两份, 什么, 调料, 都, 没有, 放, 就, 放, 了, 点, ...
    #   11984                                  [凉皮, 太辣, ,, 吃不下, 都]
    #   11985                                [本来, 迟到, 了, 还, 自己, 点]
    #   11986    [肉夹馍, 不错, 羊肉, 泡馍, 酱肉, 包, 很, 一般, 凉面, 没, 想象, 中, ...
    #   Name: review, Length: 11987, dtype: object
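
    # pre_text above removes only three specific punctuation marks (a "," even
    # survives in row 11984). A minimal alternative sketch, not from the original:
    # strip a wider range of punctuation with a regular expression.
    import re

    def pre_text_re(text):
        text = re.sub(r'[!?,。、;:“”‘’!?,.;:]', '', text)   # common CJK and ASCII punctuation
        return jieba.lcut(text)

    # usage (instead of pre_text): data['review'] = data.review.apply(pre_text_re)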
    
    
    
    from torchtext.vocab import build_vocab_from_iterator     # utility for building a vocabulary
    
    def yield_tokens(data):
        for text in data:
            yield text
            
    # min_freq=2 drops tokens seen only once; "<pad>" takes index 0 and "<unk>" index 1
    vocab = build_vocab_from_iterator(yield_tokens(data.review), specials=["<pad>", "<unk>"], min_freq=2)
    vocab.set_default_index(vocab["<unk>"])    # out-of-vocabulary tokens map to "<unk>"
    vocab_size = len(vocab)
    # data.review[0]
    # ['很快', '好吃', '味道', '足量', '大']
    
    # vocab(data.review[0])
    # [55, 14, 13, 5228, 114]
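
    # Because of set_default_index, any token missing from the vocabulary maps to
    # "<unk>". A quick sanity check (the unseen token is illustrative, not from the original):
    # vocab(['好吃'])          # known token -> its own index
    # vocab(['不存在的词'])     # unseen token -> vocab['<unk>'], i.e. 1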
    
    i = int(len(data)*0.8)
    train_data = data.sample(i)
    # data.index
    # RangeIndex(start=0, stop=11987, step=1)
    
    # train_data.index
    # Int64Index([ 8068, 11711,  6382, 11725,  8037,  9812,  8538,  3873,   674,
    #             2533,
    #            ...
    #             5346,  2959,  7224,  1695,  5973,  7100,  7643, 10514,   228,
    #             4515],
    #           dtype='int64', length=9589)
    
    test_data = data.iloc[data.index[~data.index.isin(train_data.index)]]
    # test_data.values
    # array([[1,
    #        list(['超级', '快', '就', '送到', '了', '这么', '冷', '的', '天气', '骑士', '们', '辛苦', '了', '谢谢你们', '麻辣', '香锅', '依然', '很', '好吃'])],
    #       [1,
    #        list(['最后', '五分钟', '订', '的', '卖家', '特别', '好', '接单', '了', '谢谢'])],
    #       [1, list(['量', '大', '好吃', '每次', '点', '的', '都', '够吃', '两次'])],
    #       ...,
    #       [0, list(['不吃', '辣', '都', '给', '的', '我们', '辣'])],
    #       [0, list(['鸡蛋', '都', '坏', '了', '凉菜', '也', '洒', '了'])],
    #       [0, list(['凉皮', '太辣', ',', '吃不下', '都'])]], dtype=object)
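
    # data.sample draws a different random subset on every run. A reproducibility
    # sketch (an assumption, not in the original): fix the seed so the split is stable.
    # train_data = data.sample(i, random_state=42)
    # test_data = data[~data.index.isin(train_data.index)]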
    
    # Build the DataLoaders
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    def collate_batch(batch):
        label_list, text_list = [], []
        for (_label, _text) in batch:
            label_list.append(_label)
            processed_text = torch.tensor(vocab(_text), dtype=torch.int64)
            text_list.append(processed_text)
        label_list = torch.tensor(label_list)
        # pad every sequence to the longest one in the batch; the padding value 0
        # is the index of "<pad>", and the result has shape (seq_len, batch_size)
        text_list = torch.nn.utils.rnn.pad_sequence(text_list)
        return label_list.to(device), text_list.to(device)
        
    train_dataloader = DataLoader(train_data.values, batch_size=64, shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(test_data.values, batch_size=64, shuffle=False, collate_fn=collate_batch)
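
    # pad_sequence defaults to batch_first=False, so each text batch comes out with
    # shape (seq_len, batch_size), which is the layout nn.LSTM expects by default.
    # A quick shape check on one batch (variable names are illustrative):
    batch_labels, batch_texts = next(iter(train_dataloader))
    # batch_labels.shape -> torch.Size([64])
    # batch_texts.shape  -> torch.Size([seq_len, 64]); seq_len varies per batch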
    
    # Embedding: map each token to a dense vector
    embedding_dim = 100
    hidden_size = 200
    
    # This reference code uses a bidirectional LSTM; note that hidden_size is multiplied by 2 in the definition of self.fc1.
    class BIRNN_Net(nn.Module):
        def __init__(self, vocab_size, embedding_dim, hidden_size):
            super(BIRNN_Net, self).__init__()
            self.em = nn.Embedding(vocab_size, embedding_dim)
            self.rnn = nn.LSTM(embedding_dim, hidden_size, bidirectional=True)
            self.fc1 = nn.Linear(hidden_size*2, 64)
            self.fc2 = nn.Linear(64, 2)

        def forward(self, inputs):
            x = self.em(inputs)
            # training=self.training is needed here: F.dropout defaults to
            # training=True, which would keep dropout active even in eval mode
            x = F.dropout(x, training=self.training)
            x, _ = self.rnn(x)
            # x[-1] is the output at the last time step
            x = F.dropout(F.relu(self.fc1(x[-1])), training=self.training)
            x = self.fc2(x)
            return x
            
    model = BIRNN_Net(vocab_size, embedding_dim, hidden_size).to(device)
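
    # Sanity-check the forward pass with a dummy batch (a sketch, not from the
    # original): 4 random sequences of length 20, laid out as (seq_len, batch_size).
    dummy = torch.randint(0, vocab_size, (20, 4)).to(device)
    model(dummy).shape
    # torch.Size([4, 2])   one pair of logits per sequence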
    
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), betas=(0.5, 0.5), lr=0.005)
    
    def train(dataloader):
        total_acc, total_count, total_loss = 0, 0, 0
        model.train()
        for label, text in dataloader:
            predicted_label = model(text)
            loss = loss_fn(predicted_label, label)
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            with torch.no_grad():
                total_acc += (predicted_label.argmax(1) == label).sum().item()
                total_count += label.size(0)
                total_loss += loss.item()*label.size(0)
        return total_loss/total_count, total_acc/total_count
        
    def test(dataloader):
        model.eval()
        total_acc, total_count, total_loss = 0, 0, 0
    
        with torch.no_grad():
            for label, text in dataloader:
                predicted_label = model(text)
                loss = loss_fn(predicted_label, label)
                total_acc += (predicted_label.argmax(1) == label).sum().item()
                total_count += label.size(0)
                total_loss += loss.item()*label.size(0)
        return total_loss/total_count, total_acc/total_count
        
    def fit(epochs, train_dl, test_dl):
        train_loss = []
        train_acc = []
        test_loss = []
        test_acc = []
    
        for epoch in range(epochs):
            epoch_loss, epoch_acc = train(train_dl)
            epoch_test_loss, epoch_test_acc = test(test_dl)
            train_loss.append(epoch_loss)
            train_acc.append(epoch_acc)
            test_loss.append(epoch_test_loss)
            test_acc.append(epoch_test_acc)
            template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ," 
                        "test_loss: {:.5f}, test_acc: {:.1f}%")
            print(template.format(
                  epoch, epoch_loss, epoch_acc*100, epoch_test_loss, epoch_test_acc*100))
        print("Done!")
        
        return train_loss, test_loss, train_acc, test_acc
        
    EPOCHS = 25
    
    train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, train_dataloader, test_dataloader)
    
    
    epoch: 0, train_loss: 0.66007, train_acc: 67.8% ,test_loss: 0.55988, test_acc: 74.8%
    epoch: 1, train_loss: 0.43784, train_acc: 81.6% ,test_loss: 0.43157, test_acc: 84.5%
    epoch: 2, train_loss: 0.35181, train_acc: 86.7% ,test_loss: 0.36999, test_acc: 86.6%
    epoch: 3, train_loss: 0.31692, train_acc: 88.0% ,test_loss: 0.37560, test_acc: 86.9%
    epoch: 4, train_loss: 0.29593, train_acc: 88.8% ,test_loss: 0.39419, test_acc: 86.8%
    epoch: 5, train_loss: 0.28346, train_acc: 89.7% ,test_loss: 0.38223, test_acc: 88.2%
    epoch: 6, train_loss: 0.27074, train_acc: 90.0% ,test_loss: 0.39987, test_acc: 87.7%
    epoch: 7, train_loss: 0.26633, train_acc: 90.4% ,test_loss: 0.36834, test_acc: 88.6%
    epoch: 8, train_loss: 0.25801, train_acc: 91.2% ,test_loss: 0.41890, test_acc: 86.9%
    epoch: 9, train_loss: 0.25385, train_acc: 91.3% ,test_loss: 0.36202, test_acc: 87.7%
    epoch:10, train_loss: 0.25404, train_acc: 91.2% ,test_loss: 0.37089, test_acc: 88.2%
    epoch:11, train_loss: 0.24154, train_acc: 91.3% ,test_loss: 0.41041, test_acc: 87.6%
    epoch:12, train_loss: 0.23717, train_acc: 91.6% ,test_loss: 0.38667, test_acc: 88.0%
    epoch:13, train_loss: 0.23823, train_acc: 91.8% ,test_loss: 0.41536, test_acc: 87.8%
    epoch:14, train_loss: 0.23486, train_acc: 91.9% ,test_loss: 0.39620, test_acc: 87.5%
    epoch:15, train_loss: 0.23189, train_acc: 92.2% ,test_loss: 0.37189, test_acc: 88.2%
    epoch:16, train_loss: 0.22775, train_acc: 92.4% ,test_loss: 0.36210, test_acc: 89.0%
    epoch:17, train_loss: 0.23258, train_acc: 92.2% ,test_loss: 0.50890, test_acc: 87.6%
    epoch:18, train_loss: 0.22637, train_acc: 92.7% ,test_loss: 0.44622, test_acc: 87.7%
    epoch:19, train_loss: 0.22351, train_acc: 92.6% ,test_loss: 0.44134, test_acc: 88.3%
    epoch:20, train_loss: 0.22929, train_acc: 92.7% ,test_loss: 0.43854, test_acc: 87.6%
    epoch:21, train_loss: 0.22166, train_acc: 92.8% ,test_loss: 0.58517, test_acc: 87.9%
    epoch:22, train_loss: 0.22993, train_acc: 92.4% ,test_loss: 0.40604, test_acc: 88.8%
    epoch:23, train_loss: 0.22971, train_acc: 92.6% ,test_loss: 0.41986, test_acc: 88.5%
    epoch:24, train_loss: 0.23276, train_acc: 92.5% ,test_loss: 0.44144, test_acc: 88.3%
    Done!
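
    # A minimal inference sketch (an assumption, not part of the original post):
    # score a new review with the trained model, reusing pre_text and vocab.
    def predict(review):
        model.eval()
        tokens = pre_text(review)                              # jieba tokenization
        ids = torch.tensor(vocab(tokens), dtype=torch.int64)   # tokens -> indices
        ids = ids.unsqueeze(1).to(device)                      # shape (seq_len, 1)
        with torch.no_grad():
            logits = model(ids)
        return logits.argmax(1).item()                         # 1 = positive, 0 = negative

    # predict('味道不错,送餐很快')   # should return 1 for a clearly positive review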
    
    # data.review.values holds every tokenized review;
    # np.concatenate flattens all tokens (repeats included) into one array,
    # so that pd.value_counts can count how often each word occurs
    np.concatenate(data.review.values)  
    # array(['很快', '好吃', '味道', ..., '倒', '是', '很快'], dtype='<U14')
    
    pd.value_counts(np.concatenate(data.review.values))
    # 了     9397
    # 的     7836
    # ,     4212
    # 很     2257
    # 都     2192
    #       ... 
    # 饿昏       1
    # 气人       1
    # 味似       1
    # 长长       1
    # 绝望       1
    # Length: 11330, dtype: int64
    
    # Keep words that occur more than twice, dropping words seen only once or twice.
    # Note: the filtered result must be assigned back to word_count, otherwise
    # word_list below would still contain all 11330 words.
    word_count = pd.value_counts(np.concatenate(data.review.values))
    word_count = word_count[word_count > 2]
    word_count
    # 了      9397
    # 的      7836
    # ,      4212
    # 很      2257
    # 都      2192
    #        ... 
    # 边         3
    # 乌龟        3
    # 死难        3
    # 排骨面       3
    # 够呛        3
    # Length: 3870, dtype: int64
    
    # Put the vocabulary words (the Series index) into a list
    word_list = list(word_count.index)
    word_list
    #['了',
    # '的',
    # ',',
    # '很',
    # '都',
    # '是',
    # '我',
    
    # A word's position in the list serves as its numeric id
    word_list.index('好吃')
    # 12
    
    # Map each word to its index, offset by 1 so that 0 is reserved for padding and unknown words
    word_index = {word: idx + 1 for idx, word in enumerate(word_list)}    # enumerate avoids the O(n^2) cost of list.index
    word_index
    # {'了': 1,
    #  '的': 2,
    #  ',': 3,
    #  '很': 4,
    
    # Apply the index to encode each review as a sequence of ids
    text = data.review.apply(lambda x: [word_index.get(word,0) for word in x])
    text
    # 0                                  [54, 13, 12, 4839, 112]
    # 1                           [21, 3389, 21, 3389, 21, 3389]
    # 2                                         [44, 35, 64, 11]
    # 3                          [2738, 12, 1388, 35, 2533, 518]
    # 4                                [40, 12, 395, 14, 4, 290]
    #                                ...                        
    
    # Normalize the sequence lengths
    max(len(x) for x in text)     # the maximum length is 279
    # Most reviews are short, so instead of padding to the maximum length,
    # pad every sequence to 20 and truncate anything longer
    text_len=20
    pad_text = [L + (text_len - len(L))*[0] if len(L)<=text_len else L[:text_len] for L in text]
    pad_text
    #[[54, 13, 12, 4839, 112, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    # [21, 3389, 21, 3389, 21, 3389, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    # [44, 35, 64, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    # [2738, 12, 1388, 35, 2533, 518, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    
    pad_text = np.array(pad_text)
    pad_text.shape
    # (11987, 20)  every sequence has been normalized to length 20
    
    labels = data.label.values
    labels.shape
    # (11987,)
    
    # Split into training and test sets
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test = train_test_split(pad_text,labels)
    x_train.shape,x_test.shape,y_train.shape
    # ((8990, 20), (2997, 20), (8990,))
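
    # The manual pipeline ends with plain numpy arrays. A sketch (an assumption,
    # not in the original) of feeding them to PyTorch via TensorDataset/DataLoader:
    from torch.utils.data import TensorDataset

    train_ds = TensorDataset(torch.from_numpy(x_train).long(), torch.from_numpy(y_train).long())
    test_ds = TensorDataset(torch.from_numpy(x_test).long(), torch.from_numpy(y_test).long())
    train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
    test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)
    # Note: these batches come out as (batch_size, seq_len), so a model consuming
    # them would need an LSTM built with batch_first=True, unlike BIRNN_Net above.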
    
    
    
