![](https://img.haomeiwen.com/i14340919/70f483948845ec12.jpg)
说明:本文依据《中文自然语言处理入门实战》完成。目前网上有不少转载的课程,我是从GitChat上购买。
这一课开始讲深度学习部分的RNN(LSTM和GRU),之前也在教程中学过,但是仅仅是实现了一个LSTM,然后使用RNN构建了一个词向量模型用来做词嵌入预测。
第十一课 使用LSTM生成古诗
这次课是运用LSTM实现一个诗词生成模型,我看了一下源代码,貌似比较粗糙,这里会先按照源代码跑一遍流程,然后从github上找了另一个star2000+的项目来实现。
之前因为我的疏忽,没有看到教程中的Github地址,在这里列出来。
语料库使用教程提供的爬取处理后的语料,四万首古诗,题材、长度不限,大小约 8MB。
1.模型参数配置
预先定义模型参数和加载语料以及模型保存名称,通过类 Config 实现。
import os
import random
from collections import Counter

import keras
import numpy as np
from keras import *
from keras.callbacks import LambdaCallback
from keras.engine.saving import load_model
from keras.layers import *
from keras.optimizers import Adam
class Config(object):
    """Hyper-parameters and file locations for the poetry generation model.

    NOTE(review): the '...' prefix in both paths looks like a placeholder
    carried over from the tutorial — point these at the real dataset directory.
    """
    # BUG FIX: the originals were raw strings containing '\\'; with the r-prefix
    # that leaves a literal double backslash in the path, so a single '\' suffices.
    poetry_file = r"...datasets\poetry.txt"
    weight_file = r"...datasets\poetry_model.h5"
    # Predict the 7th character from the preceding six.
    max_len = 6
    batch_size = 512
    learning_rate = 0.001
2.文本预处理
教程中仅对预测目标(下一个字)使用传统的 One-Hot 编码;输入侧的字则通过 Embedding 层映射为词向量。
def preprocess_file(Config):
    """Load the poetry corpus and build char<->id mappings.

    Args:
        Config: object with a ``poetry_file`` path attribute (the parameter
            name shadows the module-level ``Config`` class — kept for
            backward compatibility).

    Returns:
        (word2numF, num2word, words, files_content) where
        ``word2numF(char)`` maps a char to its id (0 for unknown),
        ``num2word`` maps ids back to chars, ``words`` is the vocabulary
        sorted by descending frequency, and ``files_content`` is the whole
        corpus with a ']' appended to every poem as an end marker.
    """
    # Mirrors the module-level `puncs` list so this function is self-contained.
    puncs = [']', '[', '(', ')', '{', '}', ':', '《', '》']
    # Build the corpus with "".join instead of quadratic `+=` concatenation.
    pieces = []
    with open(Config.poetry_file, 'r', encoding='utf-8') as f:
        for line in f:
            for char in puncs:
                line = line.replace(char, "")
            # A ']' at the end of each line marks the end of one poem.
            pieces.append(line.strip() + "]")
    files_content = ''.join(pieces)

    # Count characters. Counting over the *sorted* characters reproduces the
    # original insertion order, so frequency ties sort identically below.
    chars = sorted(files_content)
    chars.remove(']')  # drop a single ']' (kept from the original; ']' is deleted below anyway)
    counted_words = Counter(chars)

    # Drop rare characters (frequency <= 2), then the end-of-poem marker.
    low_freq = [key for key, count in counted_words.items() if count <= 2]
    for key in low_freq:
        del counted_words[key]
    del counted_words[']']

    # Vocabulary ordered by descending frequency (stable sort keeps tie order).
    wordPairs = sorted(counted_words.items(), key=lambda x: -x[1])
    words, _ = zip(*wordPairs)
    # char -> id, ids start at 1 so that 0 can stand for "unknown".
    word2num = dict((c, i + 1) for i, c in enumerate(words))
    # NOTE(review): num2word keys start at 0 while word2num values start at 1,
    # so the two maps are off by one; downstream code depends on this exact
    # scheme, so it is preserved here — confirm before "fixing".
    num2word = dict((i, c) for i, c in enumerate(words))
    word2numF = lambda x: word2num.get(x, 0)
    return word2numF, num2word, words, files_content
3.构建模型,通过 PoetryModel 类实现
# Characters stripped from every corpus line before training; ']' is also
# the end-of-poem marker appended in preprocess_file.
puncs = [']', '[', '(', ')', '{', '}', ':', '《', '》']
class PoetryModel(object):
    """Character-level poetry generator (Embedding -> BiGRU -> softmax).

    Loads trained weights from ``config.weight_file`` when present,
    otherwise trains from scratch on ``config.poetry_file``.
    """

    def __init__(self, config):
        self.model = None
        self.do_train = True
        self.loaded_model = False
        # BUG FIX: the original assigned the module-level class `Config` here,
        # silently ignoring the constructor argument.
        self.config = config
        self.word2numF, self.num2word, self.words, self.files_content = preprocess_file(self.config)
        if os.path.exists(self.config.weight_file):
            self.model = load_model(self.config.weight_file)
            self.model.summary()
        else:
            self.train()
        self.do_train = False
        self.loaded_model = True

    def build_model(self):
        """Build and compile: ids (max_len,) -> Embedding(300) -> BiGRU(128) -> Dropout -> Dense softmax."""
        input_tensor = Input(shape=(self.config.max_len,))
        embedd = Embedding(len(self.num2word) + 1, 300, input_length=self.config.max_len)(input_tensor)
        lstm = Bidirectional(GRU(128, return_sequences=True))(embedd)
        dropout = Dropout(0.6)(lstm)
        # BUG FIX: the original flattened `lstm` directly, leaving the Dropout
        # layer dead code; the evident intent was to flatten the dropped-out output.
        flatten = Flatten()(dropout)
        # NOTE(review): word2numF can return up to len(self.words) (ids start
        # at 1) while this output layer is only len(self.words) wide — confirm
        # the id scheme; preserved as-is because training data uses it.
        dense = Dense(len(self.words), activation='softmax')(flatten)
        self.model = Model(inputs=input_tensor, outputs=dense)
        optimizer = Adam(lr=self.config.learning_rate)
        self.model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    def sample(self, preds, temperature=1.0):
        """Draw one class index from `preds` rescaled by `temperature`.

        temperature < 1 sharpens the distribution (conservative picks);
        temperature > 1 flattens it (more adventurous picks).
        (BUG FIX: the original comments had these two reversed —
        log(p)/T with small T amplifies the largest probability.)
        """
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        probas = np.random.multinomial(1, preds, 1)
        return np.argmax(probas)

    def generate_sample_result(self, epoch, logs):
        """Epoch-end callback: print 20-char continuations at three temperatures."""
        print("\n==================Epoch {}=====================".format(epoch))
        for diversity in [0.5, 1.0, 1.5]:
            # Seed with a random max_len-character window from the corpus.
            start_index = random.randint(0, len(self.files_content) - self.config.max_len - 1)
            generated = ''
            sentence = self.files_content[start_index: start_index + self.config.max_len]
            generated += sentence
            for i in range(20):
                x_pred = np.zeros((1, self.config.max_len))
                # CONSISTENCY FIX: the original hard-coded -6; use config.max_len
                # so the window always matches the model's input length.
                for t, char in enumerate(sentence[-self.config.max_len:]):
                    x_pred[0, t] = self.word2numF(char)
                preds = self.model.predict(x_pred, verbose=0)[0]
                next_index = self.sample(preds, diversity)
                next_char = self.num2word[next_index]
                generated += next_char
                sentence = sentence + next_char
            print(sentence)

    def predict(self, text):
        """Generate verse from a (nominally 4-char) keyword; returns None until a model is loaded."""
        if not self.loaded_model:
            return
        with open(self.config.poetry_file, 'r', encoding='utf-8') as f:
            file_list = f.readlines()
        random_line = random.choice(file_list)
        # Pad `text` to 4 characters with random non-punctuation characters.
        if not text or len(text) != 4:
            for _ in range(4 - len(text)):
                # BUG FIX: the original fell back to num2word.get(index + 1),
                # which returns None at the last index and crashed the `+=`
                # below; redraw until a usable character is found instead.
                while True:
                    candidate = self.num2word.get(random.randrange(0, len(self.words)))
                    if candidate is not None and candidate not in [',', '。', ',']:
                        break
                text += candidate
        seed = random_line[-(self.config.max_len):-1]
        res = ''
        # Prepend a dummy char so the sliding window below stays max_len wide.
        seed = 'c' + seed
        for c in text:
            seed = seed[1:] + c
            # Extend each keyword character by 5 predicted characters.
            for j in range(5):
                x_pred = np.zeros((1, self.config.max_len))
                for t, char in enumerate(seed):
                    x_pred[0, t] = self.word2numF(char)
                preds = self.model.predict(x_pred, verbose=0)[0]
                next_index = self.sample(preds, 1.0)
                next_char = self.num2word[next_index]
                seed = seed[1:] + next_char
            res += seed
        return res

    def data_generator(self):
        """Endlessly yield (x, y) training pairs of batch size 1.

        NOTE(review): train() sets steps_per_epoch=config.batch_size, so one
        "epoch" covers only batch_size characters, not the whole corpus.
        """
        i = 0
        while 1:
            x = self.files_content[i:i + self.config.max_len]
            y = self.files_content[i + self.config.max_len]
            # Skip windows that straddle a poem boundary or contain punctuation.
            puncs = [']', '[', '(', ')', '{', '}', ':', '《', '》', ':']
            if any(p in x for p in puncs) or any(p in y for p in puncs):
                i += 1
                continue
            # One-hot target. BUG FIX: `bool` replaces np.bool, an alias
            # removed in numpy 1.24.
            y_vec = np.zeros(
                shape=(1, len(self.words)),
                dtype=bool
            )
            y_vec[0, self.word2numF(y)] = 1.0
            x_vec = np.zeros(
                shape=(1, self.config.max_len),
                dtype=np.int32
            )
            for t, char in enumerate(x):
                x_vec[0, t] = self.word2numF(char)
            yield x_vec, y_vec
            i += 1

    def train(self):
        """Build the model if necessary and fit it, checkpointing every epoch."""
        number_of_epoch = 10
        if not self.model:
            self.build_model()
        self.model.summary()
        self.model.fit_generator(
            generator=self.data_generator(),
            verbose=True,
            steps_per_epoch=self.config.batch_size,
            epochs=number_of_epoch,
            callbacks=[
                keras.callbacks.ModelCheckpoint(self.config.weight_file, save_weights_only=False),
                LambdaCallback(on_epoch_end=self.generate_sample_result)
            ]
        )
4.训练模型
if __name__ == '__main__':
    # Build (or load) the poetry model, then turn a user-supplied keyword
    # into generated verse and display it.
    poetry_model = PoetryModel(Config)
    keyword = input("text:")
    print(poetry_model.predict(keyword))
5.训练结果
一共进行了10次迭代,对比可以发现生成的越来越像人话了……
==================Epoch 0=====================
西将,犹能射无不不宁烽叫锥聘王瘢襦猿锋顾洪竹摽俘泮谈
。]嘉兴郭里。箓迤于尺濑。蚤缨魄无乡。梡巅损场坌。餔
遂安舒。清流琊渡样苟趢邦蓄结鞘奸轺教鲐舲喉僧泄狄元溱
==================Epoch 5=====================
痕露,微思月不无不。归未灯知花。耕散别惊忧。每兽片何
静言念终始,木杳志钱山不乎_那蹈惜罗边霾雨忆惊清解璇
。]槠楠无冬。朝妖掇忘散。报子捃况乡桎渚郇岑介不脍囊
==================Epoch 9=====================
君俦。白发虽遥视。汉不梨母石。兮叶右无不。暂晚乡奇律
寿。全胜汉武轻。诗渴肩刈油。铿银断怡著。怡处升脉池。
拜舞归,轻纨杼饵依不鹢挚玖德茅赖泉舷弩郡趢艰陌涴齾臣
我们可以随意输入一个关键字来生成诗句
text:雨
雨零伤来臾。箭牧归劳肝。昨当痕门忘。八龙鉴良清。
网友评论