参考:
https://bbs.huaweicloud.com/blogs/127160
https://www.jianshu.com/p/3596d55dfaa5
https://www.jianshu.com/p/b440a62f3c3d
#!/usr/bin/python
# coding=utf-8
import numpy as np
import pandas as pd
from keras.datasets import imdb
from matplotlib import pyplot as plt
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, Flatten,Dropout
from keras.models import Sequential
# Console display configuration: print full DataFrames / arrays without truncation.
pd.set_option('display.max_columns', None)  # show every column
pd.set_option('display.max_rows', None)     # show every row
# Allow very wide output so rows are not wrapped.
# NOTE(review): the original comment said "value display length, default 50",
# which describes display.max_colwidth, not display.width — confirm intent.
pd.set_option('display.width', 10000)
# Render CJK (full-width) characters with correct column alignment.
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# Keep long NumPy arrays on one line when printed.
np.set_printoptions(linewidth=1000)
# Load the raw (integer-encoded) IMDB reviews from a local copy of the dataset.
# Fix: use a raw string for the Windows path — the original mixed '\\' with bare
# '\i' / '\d', which are invalid escape sequences (DeprecationWarning in 3.6+).
(x_train, y_train), (x_validation, y_validation) = imdb.load_data(
    path=r'G:\rnn\rnn\imdb_movieReview_emotion_analysis\dataset\imdb.npz')
print(type(x_train), type(x_train[0]))
print(x_train[0:3])
# Merge the train and validation splits to compute corpus-wide statistics.
x = np.concatenate((x_train, x_validation), axis=0)
y = np.concatenate((y_train, y_validation), axis=0)
print('x shape is %s, y shape is %s' % (x.shape, y.shape))
print(len(x[0:1][0]), len(x[1:2][0]))
print(x[0])
print(y[0:3])
print('Classes: %s' % np.unique(y))
print('Total words: %s' % len(np.unique(np.hstack(x))))
# Distribution of review lengths: mean/std plus a box plot and a histogram
# (used below to justify the 500-word truncation length).
result = [len(word) for word in x]
print('Mean: %.2f words (STD: %.2f)' % (np.mean(result), np.std(result)))
plt.subplot(221)
plt.boxplot(result)
plt.subplot(222)
plt.hist(result)
plt.show()  # blocks until the figure window is closed
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
# Build the id -> word mapping used to turn an encoded review back into text.
# word_index maps word -> integer index; loaded from a local copy of the file.
# Fix: raw string for the Windows path (original had invalid '\i' / '\d' escapes).
word_index = imdb.get_word_index(
    path=r'G:\rnn\rnn\imdb_movieReview_emotion_analysis\dataset\imdb_word_index.json')
# The first indices are reserved: shift every word id by 3 so that
# ids 0..3 can carry the special tokens below (Keras IMDB convention).
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3
# Invert the mapping: integer id -> word (dict comprehension instead of dict([...])).
reverse_word_index = {value: key for key, value in word_index.items()}
print(reverse_word_index[0])
def decode_review(text, index=None):
    """Decode a sequence of integer word ids back into a space-joined string.

    Parameters
    ----------
    text : iterable of int
        Integer-encoded review (word ids).
    index : dict, optional
        id -> word mapping to use. Defaults to the module-level
        ``reverse_word_index`` (backward compatible with the original
        single-argument call).

    Returns
    -------
    str
        The decoded words joined by single spaces; unknown ids become '?'.
    """
    lookup = reverse_word_index if index is None else index
    return ' '.join(lookup.get(i, '?') for i in text)
# Demo: an id far outside the vocabulary decodes to the '?' placeholder.
oov_ids = [9999999999999]
movie_review_zh = decode_review(oov_ids)
print(movie_review_zh)
# -----------------------------------------
# -----------------------------------------
# Reproducibility seed and model hyper-parameters.
seed = 7
top_words = 5000     # vocabulary cap: keep only the 5000 most frequent words
max_words = 500      # pad / truncate every review to exactly 500 tokens
out_dimension = 32   # embedding vector size per word
batch_size = 512
epochs = 10
def create_model():
    """Build and compile the CNN sentiment classifier.

    Architecture: Embedding -> Conv1D -> MaxPooling1D -> Flatten ->
    Dense(250, relu) -> Dense(1, sigmoid), compiled with binary
    cross-entropy and the Adam optimizer. Relies on the module-level
    hyper-parameters ``top_words``, ``out_dimension`` and ``max_words``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    net = Sequential()
    # Embedding layer: map word ids to dense vectors.
    net.add(Embedding(top_words, out_dimension, input_length=max_words))
    # 1-D convolution over the token sequence.
    net.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Flatten())
    net.add(Dense(250, activation='relu'))
    # Single sigmoid unit: binary positive/negative sentiment.
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam',
                metrics=['accuracy'])
    net.summary()
    return net
if __name__ == '__main__':
    np.random.seed(seed=seed)
    # Load the data, keeping only the `top_words` most frequent words.
    # Fix: raw string for the Windows path (original had invalid '\i' / '\d'
    # escape sequences in a non-raw string).
    (x_train, y_train), (x_validation, y_validation) = imdb.load_data(
        num_words=top_words,
        path=r'G:\rnn\rnn\imdb_movieReview_emotion_analysis\dataset\imdb.npz')
    # Pad / truncate every review to exactly `max_words` tokens.
    x_train = sequence.pad_sequences(x_train, maxlen=max_words)
    x_validation = sequence.pad_sequences(x_validation, maxlen=max_words)
    # Build and train the model.
    model = create_model()
    # Fix: removed the dead `optimizer = None` assignment — it was never used;
    # the optimizer is already set inside create_model()'s compile() call.
    history = model.fit(x_train, y_train,
                        validation_data=(x_validation, y_validation),
                        batch_size=batch_size, epochs=epochs, verbose=2)
    # `history.history` holds per-epoch accuracy and loss for the train and
    # validation sets (available here for plotting).
数据集 (dataset) 下载:
链接:https://pan.baidu.com/s/1pYQzAiRmq96S-mlCez322A
提取码:y00q
链接:https://pan.baidu.com/s/1ms7TS3RlxC1FAVW6kcJ1hw
提取码:5s9m
网友评论