LSTM

Author: 凌霄文强 | Published 2019-05-25 22:42
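
The script below trains a binary sentiment classifier with Keras: it reads a labeled train set and an unlabeled test set, maps the Negative/Positive labels to 0/1, cleans the text with regexes, tokenizes and pads the sequences, runs them through an Embedding + LSTM model, and writes the predicted probabilities to a submission file.
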
import re
import numpy as np
import pandas as pd
from keras import Sequential
from keras.layers import Embedding, LSTM, Flatten, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Read the data; map the string labels to 0/1 integers.
train_data = pd.read_csv('./data/train.csv', lineterminator='\n')
test_data = pd.read_csv('./data/test.csv', lineterminator='\n')
train_data['label'] = train_data['label'].map({'Negative': 0, 'Positive': 1})
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent.
train_data = train_data.values
test_data = test_data.values


# Two common ways to clean the text data:
def cleaner(word):
    # Strip punctuation, digits, escape sequences, and underscores, then lowercase.
    word = re.sub(r'\#\.', '', word)
    word = re.sub(r'\n', '', word)
    word = re.sub(r',', '', word)
    word = re.sub(r'\-', ' ', word)
    word = re.sub(r'\.', '', word)
    word = re.sub(r'\\', ' ', word)
    word = re.sub(r'\\x\.+', '', word)
    word = re.sub(r'\d', '', word)
    word = re.sub(r'^_.', '', word)
    word = re.sub(r'_', ' ', word)
    word = re.sub(r'^ ', '', word)
    word = re.sub(r' $', '', word)
    word = re.sub(r'\?', '', word)

    return word.lower()

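# A quick check of what cleaner() does to a single token, traced through the
# regexes above:
#   cleaner('Well-Made,')  ->  'well made'   (hyphen -> space, comma stripped, lowercased)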

def hashing(word):
    # Crude phonetic normalization: collapse similar-sounding or repeated
    # letters so variant spellings map to one token.
    word = re.sub(r'ain$', r'ein', word)
    word = re.sub(r'ai', r'ae', word)
    word = re.sub(r'ay$', r'e', word)
    word = re.sub(r'ey$', r'e', word)
    word = re.sub(r'ie$', r'y', word)
    word = re.sub(r'^es', r'is', word)
    word = re.sub(r'a+', r'a', word)
    word = re.sub(r'j+', r'j', word)
    word = re.sub(r'd+', r'd', word)
    word = re.sub(r'u', r'o', word)
    word = re.sub(r'o+', r'o', word)
    word = re.sub(r'ee+', r'i', word)
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)
    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)
    if re.search(r'[rst]y', word) and word[-1] != 'y':
        word = re.sub(r'y', r'i', word)
    if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
        word = re.sub(r'i$', r'y', word)
    if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
        word = re.sub(r'h', '', word)
    word = re.sub(r'k', r'q', word)
    return word

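# hashing() is never called in the pipeline below; it sketches a second,
# phonetic-style cleaning strategy. Traced through its rules:
#   hashing('cooool')  ->  'col'   (runs of 'o' collapse to one)
#   hashing('cool')    ->  'col'   (so both spellings share a single token)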

def array_cleaner(array):
    # Apply cleaner() to every word of every sentence.
    X = []
    for sentence in array:
        X.append(' '.join(cleaner(word) for word in sentence.split(' ')))
    return X


# Column 1 holds the review text; column 2 of the train set holds the 0/1 label.
X_test = test_data[:, 1]
X_train = train_data[:, 1]
X_train = array_cleaner(X_train)
X_test = array_cleaner(X_test)
y_train = np.array(train_data[:, 2], dtype=int)

# Fit one tokenizer on train + test so both share a single vocabulary
# (`nb_words` was renamed `num_words` in Keras 2), then pad every sequence
# to the length of the longest one.
X_all = X_train + X_test
tokenizer = Tokenizer(num_words=2000, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
tokenizer.fit_on_texts(X_all)
X_all = tokenizer.texts_to_sequences(X_all)
X_all = pad_sequences(X_all)
# Split the padded matrix back into train and test parts.
X_train = X_all[:len(y_train)]
X_test = X_all[len(y_train):]
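# Optional sanity check: train and test now share one padded sequence length,
# e.g. (n_train, maxlen) and (n_test, maxlen).
print(X_train.shape, X_test.shape)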

embed_dim = 128
lstm_out = 256
batch_size = 32

# Embedding -> LSTM (full sequence) -> Flatten -> sigmoid for binary output.
# Embedding's `dropout` argument was removed in Keras 2; dropout is applied
# inside the LSTM layer instead.
model = Sequential()
model.add(Embedding(2000, embed_dim, input_length=X_train.shape[1]))
model.add(LSTM(lstm_out, dropout=0.2, return_sequences=True))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(X_train, y_train, batch_size=batch_size, epochs=10)
# Predict probabilities for the test set and write the submission file.
y_pred = model.predict(X_test)
print(y_pred[:10])
result = pd.DataFrame.from_dict({
    'ID': range(1, len(y_pred) + 1),
    'Pred': y_pred.reshape(-1)
})
result.to_csv('./result/submission.csv', index=False)
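
If the submission format expects hard 0/1 labels rather than probabilities (an assumption; the script above writes raw sigmoid outputs), a minimal sketch that thresholds at 0.5:

labels = (y_pred.reshape(-1) > 0.5).astype(int)
pd.DataFrame({'ID': range(1, len(labels) + 1), 'Pred': labels}).to_csv(
    './result/submission_labels.csv', index=False)  # hypothetical output path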
