美文网首页
NLP in TensorFlow: BBC新闻(多分类问题)

NLP in TensorFlow: BBC新闻(多分类问题)

作者: poteman | 来源:发表于2019-08-07 19:46 被阅读0次
    • 导入所需的包
    import csv
    import tensorflow as tf
    import numpy as np
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    
    • 下载数据
    # IPython/Colab shell escape (not plain Python): download the BBC text CSV
    # and save it as /tmp/bbc-text.csv, the path read back later in this file.
    # NOTE(review): --no-check-certificate tells wget to skip TLS certificate validation.
    !wget --no-check-certificate \
        https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv \
        -O /tmp/bbc-text.csv
    
    • 定义参数
    # Text tokenizer settings: vocabulary cap and the placeholder for unseen words.
    vocab_size = 20000
    oov_tok = "<OOV>"

    # Embedding width and the fixed sequence length used when padding.
    embedding_dim = 16
    max_length = 120
    # Both truncation and padding act on the front of each sequence.
    trunc_type = "pre"
    padding_type = "pre"

    # Fraction of the rows used for training; the remainder is validation.
    training_portion = 0.8

    # Lower-case English stopwords stripped from every article before tokenizing.
    stopwords = [
        "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
        "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
        "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
        "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
        "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him",
        "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
        "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
        "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
        "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
        "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the",
        "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
        "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until",
        "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
        "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
        "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
        "yours", "yourself", "yourselves",
    ]
    print(len(stopwords))
    # Expected output:
    # 153
    
    • 获得文本和标签
    # Load the dataset: column 0 of each row is the label, column 1 the article
    # text. Stopwords are stripped from the text before it is stored.
    sentences = []
    labels = []
    # newline='' is what the csv module asks for on file objects it reads, so
    # that line endings inside quoted fields are interpreted by the reader.
    with open("/tmp/bbc-text.csv", 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        # Skip the first row (presumably the header); the default keeps an
        # empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            labels.append(row[0])
            sentence = row[1]
            for word in stopwords:
                # Only whole words surrounded by spaces are removed, so a
                # stopword at the very start or end of the text is kept.
                token = " " + word + " "
                sentence = sentence.replace(token, " ")
                sentence = sentence.replace("  ", " ")
            sentences.append(sentence)
    
    • 拆分数据集
    # Split by position (no shuffling): the leading `training_portion` of the
    # rows is the training set, everything after it is the validation set.
    train_size = int(training_portion * len(labels))

    train_sentences, validation_sentences = sentences[:train_size], sentences[train_size:]
    train_labels, validation_labels = labels[:train_size], labels[train_size:]
    
    • 对文本进行tokenize和padding
    # Build the vocabulary from the training split only. Fitting on the full
    # corpus would leak validation vocabulary into the tokenizer and hide how
    # many validation words are genuinely out-of-vocabulary.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(train_sentences)
    word_index = tokenizer.word_index

    # Convert each split to integer sequences, then pad/truncate to max_length.
    train_sequences = tokenizer.texts_to_sequences(train_sentences)
    train_padded = pad_sequences(train_sequences, maxlen=max_length,
                                 padding=padding_type, truncating=trunc_type)

    # Validation words unseen during fitting are mapped to the OOV token.
    validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
    validation_padded = pad_sequences(validation_sequences, maxlen=max_length,
                                      padding=padding_type, truncating=trunc_type)
    
    • 对标签文本进行tokenize
    # The labels get their own tokenizer, fitted on every label in the dataset,
    # so each label string is turned into an integer id.
    label_tokenizer = Tokenizer()
    label_tokenizer.fit_on_texts(labels)

    # Encode both splits the same way and wrap the results in NumPy arrays.
    training_label_seq, validation_label_seq = (
        np.array(label_tokenizer.texts_to_sequences(split))
        for split in (train_labels, validation_labels)
    )
    
    • 定义模型
    from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
    
    # Keras Tokenizer ids start at 1, so N distinct labels produce targets in
    # 1..N and the softmax needs N + 1 units (unit 0 is never a target).
    # For the five BBC categories this evaluates to the original value of 6.
    num_classes = len(label_tokenizer.word_index) + 1

    # Embedding -> average over the sequence axis -> small dense classifier.
    model = tf.keras.Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        GlobalAveragePooling1D(),
        Dense(24, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    # Sparse loss: the targets are integer class ids, not one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    • 训练模型
    # Train for a fixed number of epochs, evaluating on the validation split
    # after each one; the returned History object is plotted afterwards.
    num_epochs = 30
    history = model.fit(
        train_padded,
        training_label_seq,
        epochs=num_epochs,
        validation_data=(validation_padded, validation_label_seq),
    )
    
    • 作图查看训练曲线
    import matplotlib.pyplot as plt
    
    
    def plot_graphs(history, string):
        """Plot a training metric and its validation counterpart per epoch.

        `history.history` is indexed by `string` and by 'val_' + `string`;
        both curves are drawn on one figure, which is then shown.
        """
        curve_names = [string, 'val_' + string]
        for curve in curve_names:
            plt.plot(history.history[curve])
        plt.xlabel("Epochs")
        plt.ylabel(string)
        plt.legend(curve_names)
        plt.show()
      
    # The accuracy metric is recorded as "accuracy" by newer TensorFlow/Keras
    # releases and as "acc" by older ones; use whichever key this run produced
    # so the plot does not fail with a KeyError.
    acc_key = "accuracy" if "accuracy" in history.history else "acc"
    plot_graphs(history, acc_key)
    plot_graphs(history, "loss")
    
    • 获得index2word的字典
    # Invert word -> index into index -> word for decoding and export.
    reverse_word_index = {index: word for word, index in word_index.items()}
    
    def decode_sentence(text):
        """Turn an iterable of token ids back into a space-separated string.

        Ids that are missing from `reverse_word_index` are rendered as '?'.
        """
        words = (reverse_word_index.get(token_id, '?') for token_id in text)
        return ' '.join(words)
    
    • 获得embedding参数
    # The Embedding layer is the first layer of the Sequential model; the
    # first array it returns from get_weights() is the embedding matrix.
    e = model.layers[0]
    weights = e.get_weights()[0]
    print(weights.shape) # shape: (vocab_size, embedding_dim)
    
    • 保存embedding参数
    import io
    
    # Write the embedding vectors (vecs.tsv) and their words (meta.tsv), one
    # row per word, so that line k of both files describes the same word.
    # The context managers close both files even if the loop raises.
    with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
            io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
        # Start at 1: the loop never exported row 0 of the embedding matrix.
        for word_num in range(1, vocab_size):
            # The fitted vocabulary may hold fewer than vocab_size words;
            # skip ids with no word instead of failing with a KeyError.
            if word_num not in reverse_word_index:
                continue
            word = reverse_word_index[word_num]
            embeddings = weights[word_num]
            out_m.write(word + "\n")
            out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
    
    • 下载embedding数据
    # Offer the two exported files for download when running inside Colab;
    # anywhere else the import fails and nothing is done.
    try:
        from google.colab import files
    except ImportError:
        pass
    else:
        for exported in ('vecs.tsv', 'meta.tsv'):
            files.download(exported)
    

    相关文章

      网友评论

          本文标题:NLP in TensorFlow: BBC新闻(多分类问题)

          本文链接:https://www.haomeiwen.com/subject/kcdadctx.html