PyTorch Deep Learning in Practice 36 - One-Dimensional Convolutional Neural Networks

Author: 薛东弗斯 | Published 2023-09-16 11:06
    Dataset: user reviews collected from a food-delivery platform, 4,000 positive and roughly 8,000 negative.
    
    ## Field description
    
    (Field table image omitted.) The dataset has two columns: `label` (1 = positive review, 0 = negative) and `review` (the raw review text).
    Install jieba and pandas:
    
     pip install jieba pandas -i https://pypi.doubanio.com/simple
    
    import torch
    # import torchtext 
    import torch.nn as nn
    import torch.nn.functional as F
    import numpy as np
    import pandas as pd
    import jieba
    from torch.utils.data import DataLoader
    
    data = pd.read_csv('waimai_10k.csv')
    data.head()    # label: 1 = positive review, 0 = negative
    #   label   review
    # 0 1   很快,好吃,味道足,量大
    # 1 1   没有送水没有送水没有送水
    # 2 1   非常快,态度好。
    # 3 1   方便,快捷,味道可口,快递给力
    # 4 1   菜味道很棒!送餐很及时!
    
    data.info()   
    # <class 'pandas.core.frame.DataFrame'>
    # RangeIndex: 11987 entries, 0 to 11986
    # Data columns (total 2 columns):
    #  #   Column  Non-Null Count  Dtype 
    # ---  ------  --------------  ----- 
    #  0   label   11987 non-null  int64 
    #  1   review  11987 non-null  object
    # dtypes: int64(1), object(1)
    # memory usage: 187.4+ KB
    
    data.label.value_counts()    # count positive and negative reviews
    # 0    7987
    # 1    4000
    # Name: label, dtype: int64
    # The classes are imbalanced: the positive and negative counts differ. One option is to undersample the negative reviews down to 4,000 so both classes match, or to oversample the positives; see the sketch below.
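    A minimal undersampling sketch (an illustration, not part of the original code; the `random_state` values are arbitrary):
    
    # draw 4,000 negatives so both classes match the 4,000 positives
    neg = data[data.label == 0].sample(n=4000, random_state=42)
    pos = data[data.label == 1]
    balanced = pd.concat([pos, neg]).sample(frac=1, random_state=42)   # shuffle the rows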
    
    jieba.lcut('这是日月光华在网易云课堂的课程')    # Chinese word segmentation with jieba
    # ['这是', '日月', '光华', '在', '网易', '云', '课堂', '的', '课程']
    
    def pre_text(text):
        text = text.replace(',', '').replace('!', '')   # strip full-width commas and exclamation marks
        return jieba.lcut(text)   # return the list of tokens
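    A quick check of the helper (the exact tokens depend on the jieba version and its dictionary):
    
    pre_text('很快,好吃!')
    # e.g. ['很快', '好吃']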
    
    data['review'] = data.review.apply(pre_text)   # apply pre_text to every review
    
    data['review']     
    # 0                                      [很快, 好吃, 味道, 足量, 大]
    # 1                                 [没有, 送水, 没有, 送水, 没有, 送水]
    # 2                                        [非常, 快, 态度, 好, 。]
    # 3                                 [方便快捷, 味道, 可口, 快, 递给, 力]
    # 4                                   [菜, 味道, 很棒, 送餐, 很, 及时]
    #                                ...                        
    # 11982                   [以前, 几乎, 天天, 吃, 现在, 调料, 什么, 都, 不放]
    # 11983    [昨天, 订, 凉皮, 两份, 什么, 调料, 都, 没有, 放, 就, 放, 了, 点, ...
    # 11984                                  [凉皮, 太辣, ,, 吃不下, 都]
    # 11985                                [本来, 迟到, 了, 还, 自己, 点]
    # 11986    [肉夹馍, 不错, 羊肉, 泡馍, 酱肉, 包, 很, 一般, 。, 凉面, 没, 想象, ...
    # Name: review, Length: 11987, dtype: object
    
    Text processing takes two steps:
        1. tokenize and build a vocabulary
        2. embedding
    
    # import the vocabulary builder (maps tokens to indices, e.g. 日月: 1, 光华: 2, 吃饭: 3, 调料: 4)
    from torchtext.vocab import build_vocab_from_iterator
    
    def yield_tokens(data):
        for text in data:
            yield text     # yield each token list (a generator)
    
    # Build the vocabulary, encoding each token as an index (this is a text-classification task).
    vocab = build_vocab_from_iterator(yield_tokens(data['review']),
                                      specials=['<pad>', '<unk>'],    # special tokens: <pad> is encoded as 0, <unk> (unknown) as 1
                                      min_freq=2)   # drop tokens that appear fewer than 2 times; too rare to be informative
    
    vocab.set_default_index(vocab['<unk>'])   # make <unk> the default index for out-of-vocabulary tokens
    
    vocab['调料']
    # 965
    
    vocab(['很快', '好吃', '味道', '足量', '大'])   # the input must be a list of strings
    # [56, 15, 14, 5229, 114]
    
    vocab['<unk>']   # the default index
    # 1
    
    vocab['山峰']   # rare words that never occur in the reviews map to <unk>, i.e. 1
    # 1
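    Indices can also be mapped back to tokens; a small sketch using torchtext's lookup_tokens (available in recent torchtext versions; the indices are the ones returned above):
    
    vocab.lookup_tokens([56, 15, 14])   # the inverse of vocab([...])
    # ['很快', '好吃', '味道']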
    
    
    i = int(len(data)*0.8)      # use 80% of the rows for training
    train_data = data.sample(i)  # sample() draws rows at random without replacement
    train_data.head()
    #           label   review
    # 5365  0   [两个, 小时, 才, 送到, 慢]
    # 11666 0   [第一次, 见, 服务态度, 这么, 差, 的, 点, 了, 一份, 套餐, 和, 一根, ...
    # 766   1   [很, 是, 不错, 的, 送餐, 体验, 。]
    # 3570  1   [不错, 不错, 胃, 不, 舒服, 才, 点, 的, 粥, 清爽, 料足, 。, 外卖, ...
    # 11969 0   [谢谢, 速度, 很快, 辛苦, 了]
    
    len(train_data)
    # 9589
    
    # select the remaining rows (by position, via iloc) as the test set
    test_data = data.iloc[data.index[~data.index.isin(train_data.index)]]
    test_data.head()
    #   label   review
    # 0 1   [很快, 好吃, 味道, 足量, 大]
    # 7 1   [超级, 快, 就, 送到, 了, 这么, 冷, 的, 天气, 骑士, 们, 辛苦, 了, ...
    # 17    1   [好吃, 速度, 包装, 也, 有, 品质, 不, 出, 家门, 就, 能, 吃, 到, 餐...
    # 18    1   [味道, 好极, 啦, 送餐, 很快, 师傅, 辛苦, 啦]
    # 21    1   [送货, 速度, 很快, 一直, 定, 这家, 赞]
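    A stratified alternative to the random 80/20 split (a sketch assuming scikit-learn is installed; it keeps the positive/negative ratio identical in both splits, which the sample()-based split above does not guarantee):
    
    from sklearn.model_selection import train_test_split
    
    train_df, test_df = train_test_split(data, test_size=0.2,
                                         stratify=data.label, random_state=42)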
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    train_data.values
    # array([[0, list(['两个', '小时', '才', '送到', '慢'])],
    #        [0,
    #         list(['第一次', '见', '服务态度', '这么', '差', '的', '点', '了', '一份', '套餐', '和', '一根', '烤肠', '烤肠', '居然', '直接', '就', '没有', '送过来', '送到', '的', '时候', '饭菜', '都', '凉', '了', '也', '是', '我', '吃', '过', '的', '最', '难吃', '的', '梅菜', '扣肉', '再也', '不订', '他家', '的', '了'])],
    #        [1, list(['很', '是', '不错', '的', '送餐', '体验', '。'])],
    #        ...,
    #        [0, list(['送餐', '服务', '有待', '提高', '味道', '不用说', '晚', '了', '半小时'])],
    #        [0,
    #         list(['岂', '一个', '慢字', '了', '得', '而且', '明明', '是', '自己', '送', '的', '晚', '还', '赖', '人家', '送', '外卖', '的', '小哥', '速度慢', '真是', '醉', '死', '了'])],
    #        [1,
    #        list(['一盒', '撒', '了', ',', '小哥', '又', '去', '排队', '拿', '了', '一盒', ',', '棒棒', '哒'])]],
    #      dtype=object)
    
    train_data.values[0]
    # array([0, list(['两个', '小时', '才', '送到', '慢'])], dtype=object)
    
    # Collate function for text batches; `batch` is one batch of (label, token-list) pairs
    def collate_batch(batch):
        label_list, text_list = [], []       # collect the labels and the encoded reviews
        for (_label, _text) in batch:   # iterate over the samples in the batch
            label_list.append(_label)
            processed_text = torch.tensor(vocab(_text), dtype=torch.int64)   # map tokens to indices and wrap as a tensor
            text_list.append(processed_text)
        label_list = torch.tensor(label_list)
        text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)   # pad every sequence to the longest one in the batch
        return label_list.to(device), text_list.to(device)
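    A minimal check of the collate function on a handmade two-sample batch (token lists adapted from the data shown above):
    
    labels, texts = collate_batch([
        (1, ['很快', '好吃', '味道', '足量', '大']),
        (0, ['两个', '小时', '才', '送到', '慢', '好', '慢']),
    ])
    texts.shape   # torch.Size([2, 7]); the shorter review is padded with 0 (<pad>)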
    
    # A DataLoader dataset only needs __getitem__ and __len__; the NumPy array train_data.values qualifies
    train_dl = DataLoader(train_data.values, batch_size=64,
                          collate_fn=collate_batch,
                          shuffle=True)
    
    test_dl = DataLoader(test_data.values, batch_size=64,
                          collate_fn=collate_batch)
    
    label_batch, text_batch = next(iter(train_dl))   # next() returns one batch
    label_batch
    # tensor([1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
    #         1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
    #         0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0])
    
    text_batch         # the texts have been padded to the same length
    # tensor([[1811, 1049,   96,  ...,    0,    0,    0],
    #         [  63,  273,  209,  ...,    0,    0,    0],
    #         [ 789, 3769,   31,  ...,    0,    0,    0],
    #         ...,
    #         [ 768,  100,    3,  ...,    0,    0,    0],
    #         [ 223,  384,    3,  ...,    0,    0,    0],
    #         [ 106,    5,   11,  ...,    0,    0,    0]])
    
    # The 1D-convolution model (a shape trace follows below):
       1. embedding                  # embed token indices into dense vectors
       2. conv1d + maxpool           # 1D convolution + max pooling
       3. conv1d
       4. nn.AdaptiveAvgPool1d       # adaptive average pooling squeezes variable-length sequences to a fixed length
                                     # (nn.AdaptiveAvgPool2d is the counterpart for 2D images)
       5. view()                     # flatten to 2D
       6. Linear()                   # output layer
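    To see why fc1 takes 128*5 inputs, here is the shape trace for one batch (a sketch; `length` is the padded sequence length, assumed to be at least 20 so both convolutions fit):
    
    # (batch, length)                  input token indices
    # (batch, length, 100)             after nn.Embedding
    # (batch, 100, length)             after permute: Conv1d wants channels first
    # (batch, 64, length-6)            after conv1 (kernel_size=7, no padding)
    # (batch, 64, (length-6)//2)       after MaxPool1d(kernel_size=2)
    # (batch, 128, (length-6)//2 - 6)  after conv2 (kernel_size=7)
    # (batch, 128, 5)                  after AdaptiveAvgPool1d(5): fixed length
    # (batch, 640)                     after view: flatten 128*5 for the Linear layers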
    
    vocab_size = len(vocab)
    embedding_dim = 100
    
    # input text shape: (batch, length)
    # after embedding:  (batch, length, features=100)
    
    class CONV1D_Net(nn.Module):
        def __init__(self, vocab_size, embedding_dim):
            super(CONV1D_Net, self).__init__()
            self.em = nn.Embedding(vocab_size, embedding_dim)
            self.conv1 = nn.Conv1d(embedding_dim, 64, kernel_size=7)
            self.pool = nn.MaxPool1d(kernel_size=2)
            self.conv2 = nn.Conv1d(64, 128, kernel_size=7)
            self.avgpool = nn.AdaptiveAvgPool1d(output_size=5)  # output: batch*128*5
            self.fc1 = nn.Linear(128*5, 64)
            self.fc2 = nn.Linear(64, 2)
        def forward(self, x):
            x = self.em(x)
            x = x.permute(0, 2, 1)                   # Conv1d expects (batch, channels, length)
            x = F.relu(self.conv1(x))
            x = self.pool(x)
            x = F.relu(self.conv2(x))
            x = self.avgpool(x)
            x = x.view(-1, x.size(1)*x.size(2))      # flatten to (batch, 128*5)
            # pass training=self.training so dropout is disabled in eval mode
            x = F.dropout(F.relu(self.fc1(x)), training=self.training)
            x = self.fc2(x)
            return x
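    A quick smoke test of the untrained network (a sketch; the dummy batch is random token indices):
    
    dummy = torch.randint(0, vocab_size, (4, 30))         # 4 fake reviews, 30 tokens each
    CONV1D_Net(vocab_size, embedding_dim)(dummy).shape    # torch.Size([4, 2])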
    
    model = CONV1D_Net(vocab_size, embedding_dim).to(device)
    
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
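    Since the classes are imbalanced (7,987 vs 4,000), an alternative to resampling is a class-weighted loss; a sketch using CrossEntropyLoss's weight argument (inverse-frequency weights are one common choice, not what the author used):
    
    counts = torch.tensor([7987., 4000.])             # class counts from value_counts()
    class_weights = (counts.sum() / counts).to(device)
    weighted_loss_fn = nn.CrossEntropyLoss(weight=class_weights)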
    
    def train(dataloader):
        total_acc, total_count, total_loss = 0, 0, 0
        model.train()
        for label, text in dataloader:
            predicted_label = model(text)
            loss = loss_fn(predicted_label, label)
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            with torch.no_grad():
                total_acc += (predicted_label.argmax(1) == label).sum().item()
                total_count += label.size(0)
                total_loss += loss.item()*label.size(0)
        return total_loss/total_count, total_acc/total_count
    
    def test(dataloader):
        model.eval()
        total_acc, total_count, total_loss = 0, 0, 0
    
        with torch.no_grad():
            for label, text in dataloader:
                predicted_label = model(text)
                loss = loss_fn(predicted_label, label)
                total_acc += (predicted_label.argmax(1) == label).sum().item()
                total_count += label.size(0)
                total_loss += loss.item()*label.size(0)
        return total_loss/total_count, total_acc/total_count
    
    def fit(epochs, train_dl, test_dl):
        train_loss = []
        train_acc = []
        test_loss = []
        test_acc = []
    
        for epoch in range(epochs):
            epoch_loss, epoch_acc = train(train_dl)
            epoch_test_loss, epoch_test_acc = test(test_dl)
            train_loss.append(epoch_loss)
            train_acc.append(epoch_acc)
            test_loss.append(epoch_test_loss)
            test_acc.append(epoch_test_acc)
            template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ," 
                        "test_loss: {:.5f}, test_acc: {:.1f}%")
            print(template.format(
                  epoch, epoch_loss, epoch_acc*100, epoch_test_loss, epoch_test_acc*100))
        print("Done!")
        
        return train_loss, test_loss, train_acc, test_acc
    
    EPOCHS = 10
    
    train_loss, test_loss, train_acc, test_acc = fit(EPOCHS, 
                                                     train_dl, 
                                                     test_dl)
    # epoch: 0, train_loss: 0.55354, train_acc: 71.5% ,test_loss: 0.49842, test_acc: 79.2%
    # epoch: 1, train_loss: 0.40823, train_acc: 83.9% ,test_loss: 0.36572, test_acc: 85.1%
    # epoch: 2, train_loss: 0.31483, train_acc: 88.1% ,test_loss: 0.40902, test_acc: 85.1%
    # epoch: 3, train_loss: 0.25132, train_acc: 90.8% ,test_loss: 0.39407, test_acc: 87.2%
    # epoch: 4, train_loss: 0.19346, train_acc: 93.4% ,test_loss: 0.59312, test_acc: 85.9%
    # epoch: 5, train_loss: 0.15583, train_acc: 94.8% ,test_loss: 0.41282, test_acc: 86.2%
    # epoch: 6, train_loss: 0.11183, train_acc: 96.4% ,test_loss: 0.59546, test_acc: 86.1%
    # epoch: 7, train_loss: 0.09305, train_acc: 97.3% ,test_loss: 0.73912, test_acc: 85.4%
    # epoch: 8, train_loss: 0.06812, train_acc: 98.0% ,test_loss: 0.85146, test_acc: 84.7%
    # epoch: 9, train_loss: 0.05540, train_acc: 98.3% ,test_loss: 0.82380, test_acc: 85.4%
    # Done!
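    Test loss bottoms out within the first few epochs and then climbs while training accuracy keeps rising: the model overfits, so early stopping or stronger regularization would help. To classify a new review with the trained model, the same preprocessing pipeline applies; a sketch with a hypothetical predict helper (short reviews are padded because the two conv layers together need a sequence length of at least 20):
    
    def predict(text, min_len=20):
        model.eval()
        ids = vocab(pre_text(text))
        ids = ids + [vocab['<pad>']] * max(0, min_len - len(ids))   # pad short reviews
        with torch.no_grad():
            logits = model(torch.tensor(ids, dtype=torch.int64).unsqueeze(0).to(device))
        return logits.argmax(1).item()   # 1 = positive, 0 = negative
    
    predict('味道不错,送餐很快')   # e.g. 1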
    
    
