
[Deep Learning with TensorFlow (11)] Text Embedding Models and Tokenizer Parameters

Author: Geekero | Published 2021-02-27 23:45

    Notes from the TensorFlow course on China University MOOC.

    1. The Effect of Hyperparameter Settings

    import json
    import tensorflow as tf
    import numpy as np
    
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    

    Hyperparameter combination 1

    vocab_size = 10000
    embedding_dim = 16
    max_length = 100
    trunc_type='post'
    padding_type='post'
    oov_tok = "<OOV>"
    training_size = 20000
    
    # !wget --no-check-certificate \
    #     https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    #     -O /tmp/sarcasm.json
    
    
    with open("./sarcasm.json", 'r') as f:  # adjust the path if you used the wget above (-O /tmp/sarcasm.json)
        datastore = json.load(f)
    
    sentences = []
    labels = []
    
    for item in datastore:
        sentences.append(item['headline'])
        labels.append(item['is_sarcastic'])
    
    training_sentences = sentences[0:training_size]
    testing_sentences = sentences[training_size:]
    training_labels = labels[0:training_size]
    testing_labels = labels[training_size:]
    
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(training_sentences)
    
    word_index = tokenizer.word_index
    
    training_sequences = tokenizer.texts_to_sequences(training_sentences)
    training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    
    testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
    testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
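
    As a quick sanity check (a hypothetical example, not part of the original notebook), here is how the fitted tokenizer treats unseen words and what 'post' padding does:

    # Hypothetical demo sentence: words missing from the fitted vocabulary map
    # to the <OOV> index (1), and 'post' padding appends zeros up to max_length.
    demo_seq = tokenizer.texts_to_sequences(["granny swallowed a zeppelin"])
    demo_pad = pad_sequences(demo_seq, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)
    print(demo_seq)        # rare words show up as 1 (<OOV>)
    print(demo_pad.shape)  # (1, 100)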
    

    Building the model

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
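
    GlobalAveragePooling1D collapses the sequence dimension by averaging the embedding vectors across all time steps, which is why the (None, 100, 16) embedding output becomes a flat 16-dimensional vector. A minimal check (illustrative, not from the course):

    # Average over the time axis: (batch, steps, features) -> (batch, features).
    x = tf.random.uniform((2, 100, 16))
    pooled = tf.keras.layers.GlobalAveragePooling1D()(x)
    print(pooled.shape)  # (2, 16)
    # equivalent to tf.reduce_mean(x, axis=1)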
    

    View the model structure:

    model.summary()
    
        Model: "sequential"
        _________________________________________________________________
        Layer (type)                 Output Shape              Param #   
        =================================================================
        embedding (Embedding)        (None, 100, 16)           160000    
        _________________________________________________________________
        global_average_pooling1d (Gl (None, 16)                0         
        _________________________________________________________________
        dense (Dense)                (None, 24)                408       
        _________________________________________________________________
        dense_1 (Dense)              (None, 1)                 25        
        =================================================================
        Total params: 160,433
        Trainable params: 160,433
        Non-trainable params: 0
        _________________________________________________________________
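
    The parameter counts follow directly from the layer shapes (a quick check, not from the original notebook):

    print(10000 * 16)         # embedding: vocab_size * embedding_dim       = 160000
    print(16 * 24 + 24)       # dense:    16 inputs * 24 units + 24 biases  = 408
    print(24 * 1 + 1)         # dense_1:  24 inputs * 1 unit + 1 bias       = 25
    print(160000 + 408 + 25)  # total                                       = 160433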
    

    Training the model

    num_epochs = 30
    training_padded = np.array(training_padded)
    training_labels = np.array(training_labels)
    testing_padded = np.array(testing_padded)
    testing_labels = np.array(testing_labels)
    history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=1)
    
        Epoch 1/30
        625/625 [==============================] - 7s 9ms/step - loss: 0.6824 - accuracy: 0.5645 - val_loss: 0.5900 - val_accuracy: 0.8076
        ...
        Epoch 30/30
        625/625 [==============================] - 4s 7ms/step - loss: 0.0225 - accuracy: 0.9944 - val_loss: 1.0454 - val_accuracy: 0.8116
    

    Visualizing training accuracy and loss

    import matplotlib.pyplot as plt
    
    
    def plot_graphs(history, string):
      plt.plot(history.history[string])
      plt.plot(history.history['val_'+string])
      plt.xlabel("Epochs")
      plt.ylabel(string)
      plt.legend([string, 'val_'+string])
      plt.show()
      
    plot_graphs(history, "accuracy")
    plot_graphs(history, "loss")
    
    [Figures: training vs. validation accuracy and loss curves]

    Hyperparameter combination 2

    The curves above show clear overfitting: training accuracy approaches 1.0 while validation loss keeps rising.
    To counter this, shrink the vocabulary and shorten the maximum sentence length:

    vocab_size = 1000 #(was 10000)
    embedding_dim = 16 
    max_length = 16 #(was 100)
    trunc_type='post'
    padding_type='post'
    oov_tok = "<OOV>"
    training_size = 20000
    
    # NOTE: calling fit again here continues training the model built above with
    # the old settings; see the re-run sketch after the log below.
    history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=1)
    
    
        Epoch 1/30
        625/625 [==============================] - 4s 7ms/step - loss: 0.0229 - accuracy: 0.9934 - val_loss: 1.0839 - val_accuracy: 0.8068
        ...
        Epoch 30/30
        625/625 [==============================] - 4s 7ms/step - loss: 0.0050 - accuracy: 0.9983 - val_loss: 2.1899 - val_accuracy: 0.7998
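
    Note that reassigning the variables above does nothing by itself: the log shows epoch 1 starting near the previous run's final loss, because fit keeps training the model built with the old settings. For the new values to take effect, the tokenization, padding, and model cells must be re-executed, roughly as follows (a sketch using the same names as above):

    # Re-run preprocessing and rebuild the model so the new hyperparameters apply.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(training_sentences)

    training_padded = np.array(pad_sequences(
        tokenizer.texts_to_sequences(training_sentences),
        maxlen=max_length, padding=padding_type, truncating=trunc_type))
    testing_padded = np.array(pad_sequences(
        tokenizer.texts_to_sequences(testing_sentences),
        maxlen=max_length, padding=padding_type, truncating=trunc_type))

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])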
    

    Visualizing training accuracy and loss

    plot_graphs(history, "accuracy")
    plot_graphs(history, "loss")
    
    [Figures: training vs. validation accuracy and loss curves]

    Validation accuracy is still not high, so next increase the embedding dimension
    (as before, the preprocessing and model cells need to be re-run for this to take effect):

    vocab_size = 1000 #(was 10000)
    embedding_dim = 32 #(was 16)
    max_length = 16 #(unchanged; was 100 originally)
    trunc_type='post'
    padding_type='post'
    oov_tok = "<OOV>"
    training_size = 20000
    
    # again, this continues training the existing model unless the cells above are re-run
    history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=1)
    
        Epoch 1/30
        625/625 [==============================] - 5s 7ms/step - loss: 0.0036 - accuracy: 0.9988 - val_loss: 2.1994 - val_accuracy: 0.7980
        Epoch 8/30
        625/625 [==============================] - 5s 7ms/step - loss: 0.0042 - accuracy: 
    
    plot_graphs(history, "accuracy")
    plot_graphs(history, "loss")
    
    [Figures: training vs. validation accuracy and loss curves]

    Inspecting the tokenizer's output

    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    
    def decode_sentence(text):
        return ' '.join([reverse_word_index.get(i, '?') for i in text])
    
    print(decode_sentence(training_padded[0]))  # decoded padded headline 0
    print(training_sentences[2])                # note: raw headline 2, a different record
    print(labels[2])
    
    
        former <OOV> store clerk sues over secret 'black <OOV> for minority shoppers ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
        mom starting to fear son's web series closest thing she will have to grandchild
        1
    
    e = model.layers[0]
    weights = e.get_weights()[0]
    # still (10000, 16): the model was never rebuilt after the hyperparameter
    # changes, so it keeps the original vocab_size and embedding_dim
    print(weights.shape)  # (vocab_size, embedding_dim)
    
        (10000, 16)
    
    import io
    
    # write vectors and words to TSV files, which can be loaded into the
    # TensorFlow Embedding Projector (https://projector.tensorflow.org)
    out_v = io.open('vecs2.tsv', 'w', encoding='utf-8')
    out_m = io.open('meta2.tsv', 'w', encoding='utf-8')
    for word_num in range(1, vocab_size):
      word = reverse_word_index[word_num]
      embeddings = weights[word_num]
      out_m.write(word + "\n")
      out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
    out_v.close()
    out_m.close()
    
    # try:
    #   from google.colab import files
    # except ImportError:
    #   pass
    # else:
    #   files.download('vecs2.tsv')
    #   files.download('meta2.tsv')
    
    sentence = ["granny starting to fear spiders in the garden might be real", "game of thrones season finale showing this sunday night"]
    sequences = tokenizer.texts_to_sequences(sentence)
    padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    print(model.predict(padded))
    
        WARNING:tensorflow:Model was constructed with shape (None, 100) for input KerasTensor(type_spec=TensorSpec(shape=(None, 100), dtype=tf.float32, name='embedding_input'), name='embedding_input', description="created by layer 'embedding_input'"), but it was called on an input with incompatible shape (None, 16).
        [[1.0000000e+00]
         [3.2138368e-37]]
    

    These predictions are not reliable either. The warning explains why: the model was
    built for length-100 inputs, but max_length was changed to 16 without rebuilding it,
    so the padded input has the wrong shape. A shape-consistent prediction is sketched below.
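
    A minimal sketch (assuming the combination-1 model built for max_length=100 is still in memory):

    # Pad to the length the model was actually built with to avoid the warning.
    padded100 = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')
    print(model.predict(padded100))  # sigmoid probabilities, one per sentence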
    Releasing resources

    import os, signal

    # interrupt this process so the notebook kernel stops and its memory is released
    os.kill(os.getpid(), signal.SIGINT)
    

    2. The Effect of Subwords on the Classifier


    # NOTE: PLEASE MAKE SURE YOU ARE RUNNING THIS IN A PYTHON3 ENVIRONMENT
    
    import tensorflow as tf
    print(tf.__version__)
        2.4.0
    
    # Uncomment and run this if you don't have TensorFlow 2.0x [Check for latest 2.0 instructions at https://www.tensorflow.org/versions/r2.0/api_docs/python/tf]
    #!pip install tensorflow==2.0.0-beta0
    
    # Double check TF 2.0x is installed. If you ran the above block, there was a 
    # 'reset all runtimes' button at the bottom that you needed to press
    #import tensorflow as tf
    #print(tf.__version__)
    
    # If the import fails, run this
    # !pip install -q tensorflow-datasets 
    

    Loading and preprocessing the data

    import tensorflow_datasets as tfds
    imdb, info = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)
    
    train_data, test_data = imdb['train'], imdb['test']
    
    train_data  # a PrefetchDataset of (text, label) tensors
    
        <PrefetchDataset shapes: ((None,), ()), types: (tf.int64, tf.int64)>
    

    View the structure of info.features:

    info.features
    
        FeaturesDict({
            'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
            'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
        })
    
    # get the subword tokenizer from the dataset's metadata
    tokenizer = info.features['text'].encoder
    
    sample_string = 'TensorFlow, from basics to mastery'
    
    tokenized_string = tokenizer.encode(sample_string)
    print ('Tokenized string is {}'.format(tokenized_string))
    
    original_string = tokenizer.decode(tokenized_string)
    print ('The original string: {}'.format(original_string))
    
    # expected output:
    # Tokenized string is [6307, 2327, 4043, 2120, 2, 48, 4249, 4429, 7, 2652, 8050]
    # The original string: TensorFlow, from basics to mastery
    
        Tokenized string is [6307, 2327, 4043, 2120, 2, 48, 4249, 4429, 7, 2652, 8050]
        The original string: TensorFlow, from basics to mastery
    

    View the id-to-subword mapping

    for ts in tokenized_string:
      print ('{} ----> {}'.format(ts, tokenizer.decode([ts])))
    
        6307 ----> Ten
        2327 ----> sor
        4043 ----> Fl
        2120 ----> ow
        2 ----> , 
        48 ----> from 
        4249 ----> basi
        4429 ----> cs 
        7 ----> to 
        2652 ----> master
        8050 ----> y
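
    Unlike the word-level Tokenizer in part one, the subword encoder needs no <OOV> token: any string can be decomposed into known subwords (falling back to single characters). A quick check, with an arbitrary word not from the course:

    # An unseen word still encodes cleanly as a sequence of known subword ids.
    print(tokenizer.encode('antidisestablishmentarianism'))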
    

    Setting hyperparameters and batching the data

    BUFFER_SIZE = 25000
    BATCH_SIZE = 1

    train_data = train_data.shuffle(BUFFER_SIZE)
    # padded_batch pads each batch to the length of its longest sequence
    train_data = train_data.padded_batch(BATCH_SIZE)
    test_data = test_data.padded_batch(BATCH_SIZE)
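
    padded_batch pads each batch only to the length of its own longest sequence, so different batches can have different widths. A toy illustration (not from the notebook):

    # Ragged sequences of lengths 2, 4, 1, batched in pairs:
    ds = tf.data.Dataset.from_generator(
        lambda: iter([[1, 2], [3, 4, 5, 6], [7]]),
        output_signature=tf.TensorSpec(shape=(None,), dtype=tf.int64))
    for batch in ds.padded_batch(2):
        print(batch.shape)  # (2, 4) then (1, 1)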
    

    Building the embedding model

    embedding_dim = 64  # embedding dimension
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(6, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    model.summary()
    
        Model: "sequential"
        _________________________________________________________________
        Layer (type)                 Output Shape              Param #   
        =================================================================
        embedding (Embedding)        (None, None, 64)          523840    
        _________________________________________________________________
        global_average_pooling1d (Gl (None, 64)                0         
        _________________________________________________________________
        dense (Dense)                (None, 6)                 390       
        _________________________________________________________________
        dense_1 (Dense)              (None, 1)                 7         
        =================================================================
        Total params: 524,237
        Trainable params: 524,237
        Non-trainable params: 0
        _________________________________________________________________
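
    Because no input_length is fixed here, the same model accepts batches of any sequence length, which is exactly what the variable-width padded batches require. A quick illustrative check:

    # The model is length-agnostic: both calls produce one prediction each.
    import numpy as np
    print(model.predict(np.zeros((1, 5), dtype=np.int64)).shape)   # (1, 1)
    print(model.predict(np.zeros((1, 50), dtype=np.int64)).shape)  # (1, 1)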
    

    Training the model

    num_epochs = 10
    
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    history = model.fit(train_data, epochs=num_epochs, validation_data=test_data)
    
        Epoch 1/10
        25000/25000 [==============================] - 325s 13ms/step - loss: 0.4260 - accuracy: 0.7989 - val_loss: 0.2964 - val_accuracy: 0.8769
    
        ...
        Epoch 9/10
        25000/25000 [==============================] - 345s 14ms/step - loss: 0.0479 - accuracy: 0.9842 - val_loss: 0.7027 - val_accuracy: 0.8476
        Epoch 10/10
        25000/25000 [==============================] - 334s 13ms/step - loss: 0.0345 - accuracy: 0.9890 - val_loss: 0.8299 - val_accuracy: 0.8482
    

    Visualizing training accuracy and loss

    import matplotlib.pyplot as plt
    
    def plot_graphs(history, string):
      plt.plot(history.history[string])
      plt.plot(history.history['val_'+string])
      plt.xlabel("Epochs")
      plt.ylabel(string)
      plt.legend([string, 'val_'+string])
      plt.show()
      
    plot_graphs(history, "accuracy")
    plot_graphs(history, "loss")
    
    [Figures: training vs. validation accuracy and loss curves]

    Inspecting the embedding weights

    e = model.layers[0]
    weights = e.get_weights()[0]
    print(weights.shape) # shape: (vocab_size, embedding_dim)
    
        (8185, 64)
    

    Exporting the embedding matrix

    import io
    
    out_v = io.open('vecs3.tsv', 'w', encoding='utf-8')
    out_m = io.open('meta3.tsv', 'w', encoding='utf-8')
    for word_num in range(1, tokenizer.vocab_size):
      word = tokenizer.decode([word_num])
      embeddings = weights[word_num]
      out_m.write(word + "\n")
      out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
    out_v.close()
    out_m.close()
    
    
    # try:
    #   from google.colab import files
    # except ImportError:
    #   pass
    # else:
    #   files.download('vecs3.tsv')
    #   files.download('meta3.tsv')
    
    
    tokenizer.vocab_size
        8185
    

    Summary

    Individual subwords are often meaningless fragments, and because this model averages their embeddings without regard to order, it struggles to learn the correct meaning and sentiment of whole words. This motivates the recurrent neural networks covered next.

    Releasing resources

    import os, signal
    
    os.kill(os.getpid(), signal.SIGINT)
    
