# Load the required packages
import jieba
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
# read_table fails under some environment configurations, so read_csv is used instead.
# sep='aaa' is a separator that never occurs in the text, so each line is read whole;
# a multi-character separator requires the python parsing engine.
raw = pd.read_csv(r"C:\Users\Administrator\Desktop\机器学习英语内容.txt", names=['txt'], sep='aaa', encoding="GBK", engine='python')
# print(len(raw))
# print(raw)
# dict1 = r"C:\Users\Administrator\Desktop\词库.txt"
# jieba.load_userdict(dict1)  # dict1 is the path to a custom user dictionary
# Using whole sentences or paragraphs as the basic unit of analysis is clearly more appropriate
corpus = [jieba.lcut(item) for item in raw.txt]
# print(corpus[:3])
# An external corpus could equally be used here for more comprehensive training.
# Note: the parameters below follow the gensim 3.x API; gensim >= 4.0 renamed size to vector_size.
w2v_model = Word2Vec(corpus, size=100, window=5, min_count=5, workers=4)
# print(w2v_model.wv['郭啸天'])
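# Optional sanity check (commented out): inspect the nearest neighbours of a token to
# verify the embeddings trained reasonably. 'learning' is only a placeholder here;
# substitute any word that occurs in your corpus at least min_count times.
# print(w2v_model.wv.most_similar('learning', topn=5))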
# Flatten the corpus back into one long list of tokens
raw_input = [item for sublist in corpus for item in sublist]
# print(len(raw_input))
# print(raw_input[:10])
# List the vocabulary retained by the model (gensim 3.x API;
# gensim >= 4.0 uses w2v_model.wv.key_to_index instead of .vocab)
vocab = w2v_model.wv.vocab
# print(vocab)
# min_count=5 filters out low-frequency words, so the same words must also be
# removed from the text stream
text_stream = []
for word in raw_input:
    if word in vocab:
        text_stream.append(word)
# print(len(text_stream))
# print(text_stream[:10])
# Construct the training data
seq_length = 8  # use the preceding 8 words to predict the next one
x = []; y = []
for i in range(0, len(text_stream) - seq_length):
    given = text_stream[i : i + seq_length]
    predict = text_stream[i + seq_length]
    x.append(np.array([w2v_model.wv[word] for word in given]))
    y.append(w2v_model.wv[predict])
# len(x)
# print(x[0][0])
# print(y[0])
# Then convert the word2vec-format values into the shape the LSTM expects: [samples, time steps, features]
x = np.reshape(x, (-1, seq_length, 100))  # each token corresponds to one word2vec vector
y = np.reshape(y, (-1, 100))
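# Sanity check (commented out): x should now be (samples, 8, 100) and y (samples, 100).
# print(x.shape, y.shape)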
# Build the LSTM model
model = Sequential()
model.add(LSTM(128, input_shape=(seq_length, 100)))
model.add(Dropout(0.2))
model.add(Dense(100, activation='sigmoid'))
model.compile(loss='mse', optimizer='adam')
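# Design note: the network regresses the 100-dimensional word2vec embedding of the
# next word (hence the mse loss), and the prediction is decoded by nearest-neighbour
# lookup in y_to_word below. word2vec components are not confined to [0, 1], so a
# linear output activation would arguably match the target range better than
# sigmoid; the original choice is kept here.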
# model.fit(x, y, epochs=500, batch_size=64)
# model.summary()
# model.save_weights(r"C:\Users\Administrator\Desktop\LSTM.hdf5")  # HDF5 file format
model.load_weights(r"C:\Users\Administrator\Desktop\LSTM_best.hdf5")
# model.fit(x, y, epochs=3000)  # continue training with the given data and parameters
# from keras.callbacks import ModelCheckpoint
#
# checkpointer = ModelCheckpoint(filepath=r"C:\Users\Administrator\Desktop\LSTM_best.hdf5", monitor='val_loss', save_best_only=True, verbose=1)
# model.compile(loss='mse', optimizer='adam')
# model.fit(x, y, epochs=50, validation_data=(x, y), callbacks=[checkpointer])  # note: this validates on the training data itself, so val_loss simply tracks the training loss
def predict_next(input_array):
    x = np.reshape(input_array, (-1, seq_length, 100))
    y = model.predict(x)
    return y
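# Example (commented out): predict the embedding that follows the first training
# window; the output shape is (1, 100).
# print(predict_next(x[:1]).shape)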
def string_to_index(raw_input):
    # Despite the name, this returns the word2vec vectors of the last
    # seq_length in-vocabulary tokens of the input string
    input_stream = []
    for word in jieba.lcut(raw_input):
        if word in vocab:
            input_stream.append(word)
    res = []
    for word in input_stream[(len(input_stream) - seq_length):]:
        res.append(w2v_model.wv[word])
    return res
def y_to_word(y):
    word = w2v_model.wv.most_similar(positive=y, topn=1)
    return word
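# Example (commented out): decode a training target back into the nearest word.
# y_to_word expects a 2-D array of vectors, hence the reshape.
# print(y_to_word(np.reshape(y[0], (1, 100))))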
def generate_article(init, rounds=50):
    in_string = init.lower()
    for i in range(rounds):
        n = y_to_word(predict_next(string_to_index(in_string)))
        in_string += n[0][0]
    return in_string
while True:
    init = input("Enter text: ")
    init = init.replace('\'', '')
    article = generate_article(init)
    print(article)