TensorFlow Exercise 1: Generating Classical Chinese Poetry with an RNN

Author: Panverson | Published 2019-07-14 20:58

    Introduction

    An RNN (Recurrent Neural Network) is a type of neural network commonly used in natural language processing, largely because its inputs and outputs (usually time series) can be of variable length. For a more detailed introduction, see: https://blog.csdn.net/heyongluoyao8/article/details/48636251
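
    As a small illustration of handling variable-length sequences (a minimal sketch, not from this article; the shapes and names below are made up), TF 1.x lets you pass the true lengths of padded sequences to tf.nn.dynamic_rnn, the same API the model later in this post relies on:

    import tensorflow as tf

    # A batch of two padded sequences with true lengths 3 and 5, each step a 4-dim vector
    inputs = tf.placeholder(tf.float32, [2, 5, 4])
    lengths = tf.constant([3, 5])

    cell = tf.nn.rnn_cell.BasicLSTMCell(8)
    # dynamic_rnn only runs each row up to its real length; padded steps
    # produce zero outputs and do not change the returned final state
    outputs, state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=lengths, dtype=tf.float32)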

    Preparation

    Dataset
    Complete Tang Poems (43,030 poems). Link: https://pan.baidu.com/s/10rcjAVmrPJwEWF0blglldQ
    Extraction code: 666g
    Reference code
    Automatic English poetry generation: https://github.com/karpathy/char-rnn
    Blog post: http://blog.topspeedsnail.com/archives/10542

    Code

    Data Preprocessing

    import collections
    
    ORIGIN_DATA = 'data/poetry.txt'  # path to the source data
    OUTPUT_DATA = 'data/o_poetry.txt'  # path for the output id vectors
    VOCAB_DATA = 'data/poetry.vocab'  # path for the vocabulary file
    
    
    def word_to_id(word, id_dict):
        """Map a character to its id, falling back to the <unknow> token."""
        if word in id_dict:
            return id_dict[word]
        else:
            return id_dict['<unknow>']
    
    
    poetrys = []  # list holding the Tang poems
    
    # Read the poems from the source file
    with open(ORIGIN_DATA, 'r', encoding='utf-8') as f:
        f_lines = f.readlines()
        print('Total number of Tang poems: {}'.format(len(f_lines)))
        # Process line by line
        for line in f_lines:
            # Strip surrounding whitespace
            strip_line = line.strip()
            try:
                # Split each poem into title and content
                title, content = strip_line.split(':')
            except ValueError:
                # Lines that do not split into exactly a title and a content (e.g. extra ':') are discarded
                continue
            # Remove spaces inside the content
            content = content.strip().replace(' ', '')
            # Discard poems containing irregular characters
            if '(' in content or '(' in content or '<' in content or '《' in content or '_' in content or '[' in content:
                continue
            # Discard poems that are too short or too long
            length = len(content)
            if length < 20 or length > 100:
                continue
            # Wrap the content with start ('s') and end ('e') markers and add it to the list
            poetrys.append('s' + content + 'e')
    
    print('Number of poems used for training: {}'.format(len(poetrys)))
    

    Result of the split:

    ['[寒随穷律变,春逐鸟声开。初风飘带柳,晚雪间花梅。碧林青旧竹,绿沼翠新苔。芝田初雁去,绮树巧莺来。]', '[晚霞聊自怡,初晴弥可喜。日晃百花色,风动千林翠。池鱼跃不同,园鸟声还异。寄言博通者,知予物外志。]', '[一朝春夏改,隔夜鸟花迁。阴阳深浅叶,晓夕重轻烟。哢莺犹响殿,横丝正网天。珮高兰影接,绶细草纹连。碧鳞惊棹侧,玄燕舞檐前。何必汾阳处,始复有山泉。]']
    
    # Sort the poems by length
    poetry_list = sorted(poetrys, key=lambda x: len(x))
    
    words_list = []
    # Collect every character appearing in the poems
    for poetry in poetry_list:
        words_list.extend([word for word in poetry])
    # Count how often each character occurs
    counter = collections.Counter(words_list)
    # Sort by frequency
    sorted_words = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    # Character list in descending order of frequency, with the unknown token first
    words_list = ['<unknow>'] + [x[0] for x in sorted_words]
    # Here you could keep only the most frequent characters; there are fewer than
    # seven thousand in total, so all of them are kept
    words_list = words_list[:len(words_list)]
    
    print('Vocabulary size: {}'.format(len(words_list)))
    
    # Write the vocabulary to file, one character per line
    with open(VOCAB_DATA, 'w', encoding='utf-8') as f:
        for word in words_list:
            f.write(word + '\n')
    
    # Build the character-to-id mapping
    word_id_dict = dict(zip(words_list, range(len(words_list))))
    # Convert poetry_list into id vectors
    id_list = []
    for poetry in poetry_list:
        id_list.append([str(word_to_id(word, word_id_dict)) for word in poetry])
    
    # Write the id vectors to file
    with open(OUTPUT_DATA, 'w', encoding='utf-8') as f:
        for id_l in id_list:
            f.write(' '.join(id_l) + '\n')
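
    The generation script at the end of this post imports a utils module that is not listed in the article. A minimal sketch of what its two helpers might look like, assuming they simply rebuild the mappings from the vocabulary file written above (note that the later scripts read 'vocab/poetry.vocab', while the preprocessing step writes 'data/poetry.vocab'):

    # utils.py -- hypothetical helpers, reconstructed from how they are used later
    VOCAB_PATH = 'vocab/poetry.vocab'
    
    
    def read_word_to_id_dict():
        """Rebuild the character-to-id mapping from the vocabulary file."""
        with open(VOCAB_PATH, 'r', encoding='utf-8') as f:
            words = [line.rstrip('\n') for line in f]
        return dict(zip(words, range(len(words))))
    
    
    def read_id_to_word_dict():
        """Rebuild the id-to-character mapping from the vocabulary file."""
        with open(VOCAB_PATH, 'r', encoding='utf-8') as f:
            words = [line.rstrip('\n') for line in f]
        return dict(zip(range(len(words)), words))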
    

    RNN

    import tensorflow as tf
    import functools
    
    VOCAB_SIZE = 6272  # vocabulary size
    
    SHARE_EMD_WITH_SOFTMAX = True  # share parameters between the embedding layer and the softmax layer
    MAX_GRAD = 5.0  # maximum gradient norm, to prevent exploding gradients
    LEARN_RATE = 0.0005  # initial learning rate
    LR_DECAY = 0.92  # learning-rate decay factor
    LR_DECAY_STEP = 600  # decay step
    BATCH_SIZE = 64  # batch size
    CKPT_PATH = 'ckpt/model_ckpt'  # model checkpoint path
    VOCAB_PATH = 'vocab/poetry.vocab'  # vocabulary path
    EMB_KEEP = 0.5  # dropout keep probability for the embedding layer
    RNN_KEEP = 0.5  # dropout keep probability for the LSTM layers
    HIDDEN_SIZE = 128  # number of hidden units per LSTM layer
    NUM_LAYERS = 2  # number of stacked LSTM layers
    
    
    def doublewrap(function):
        """Allow a decorator to be used both with and without arguments."""
        @functools.wraps(function)
        def decorator(*args, **kwargs):
            if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
                return function(args[0])
            else:
                return lambda wrapee: function(wrapee, *args, **kwargs)
    
        return decorator
    
    
    @doublewrap
    def define_scope(function, scope=None, *args, **kwargs):
        """Turn a method into a lazily evaluated, cached property that builds its
        part of the graph inside a variable scope the first time it is accessed."""
        attribute = '_cache_' + function.__name__
        name = scope or function.__name__
    
        @property
        @functools.wraps(function)
        def decorator(self):
            if not hasattr(self, attribute):
                with tf.variable_scope(name, *args, **kwargs):
                    setattr(self, attribute, function(self))
            return getattr(self, attribute)
    
        return decorator
    
    
    class TrainModel(object):
        """
        Training model.
        """
    
        def __init__(self, data, labels, emb_keep, rnn_keep):
            self.data = data  # input data
            self.labels = labels  # labels
            self.emb_keep = emb_keep  # dropout keep probability for the embedding layer
            self.rnn_keep = rnn_keep  # dropout keep probability for the LSTM layers
            # Touch each property once so the whole graph is built on construction
            self.global_step
            self.cell
            self.predict
            self.loss
            self.optimize
    
        @define_scope
        def cell(self):
            """
            Build the multi-layer LSTM cell.
            :return:
            """
            lstm_cell = [
                tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE), output_keep_prob=self.rnn_keep) for
                _ in range(NUM_LAYERS)]
            cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cell)
            return cell
    
        @define_scope
        def predict(self):
            """
            Forward pass.
            :return:
            """
            # Embedding matrix
            embedding = tf.get_variable('embedding', shape=[VOCAB_SIZE, HIDDEN_SIZE])
            # Softmax layer parameters
            if SHARE_EMD_WITH_SOFTMAX:
                # Tie the softmax weights to the transposed embedding matrix
                softmax_weights = tf.transpose(embedding)
            else:
                softmax_weights = tf.get_variable('softmaweights', shape=[HIDDEN_SIZE, VOCAB_SIZE])
            softmax_bais = tf.get_variable('softmax_bais', shape=[VOCAB_SIZE])
            # Look up the embeddings
            emb = tf.nn.embedding_lookup(embedding, self.data)
            # dropout
            emb_dropout = tf.nn.dropout(emb, self.emb_keep)
            # Run the recurrent network
            self.init_state = self.cell.zero_state(BATCH_SIZE, dtype=tf.float32)
            outputs, last_state = tf.nn.dynamic_rnn(self.cell, emb_dropout, scope='d_rnn', dtype=tf.float32,
                                                    initial_state=self.init_state)
            outputs = tf.reshape(outputs, [-1, HIDDEN_SIZE])
            # Compute the logits
            logits = tf.matmul(outputs, softmax_weights) + softmax_bais
            return logits
    
        @define_scope
        def loss(self):
            """
            Loss function.
            :return:
            """
            # Cross entropy between the predicted logits and the target ids
            outputs_target = tf.reshape(self.labels, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.predict, labels=outputs_target)
            # Average over all positions
            cost = tf.reduce_mean(loss)
            return cost
    
        @define_scope
        def global_step(self):
            """
            Global step counter.
            :return:
            """
            global_step = tf.Variable(0, trainable=False)
            return global_step
    
        @define_scope
        def optimize(self):
            """
            Backward pass (training op).
            :return:
            """
            # Learning-rate decay
            learn_rate = tf.train.exponential_decay(LEARN_RATE, self.global_step, LR_DECAY_STEP,
                                                    LR_DECAY)
            # Compute the gradients and clip them to prevent explosion
            trainable_variables = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, trainable_variables), MAX_GRAD)
            # Create the optimizer and apply the gradients
            optimizer = tf.train.AdamOptimizer(learn_rate)
            train_op = optimizer.apply_gradients(zip(grads, trainable_variables), self.global_step)
            return train_op
    
    
    class EvalModel(object):
        """
        Generation (evaluation) model.
        """
    
        def __init__(self, data, emb_keep, rnn_keep):
            self.data = data  # input data
            self.emb_keep = emb_keep  # dropout keep probability for the embedding layer
            self.rnn_keep = rnn_keep  # dropout keep probability for the LSTM layers
            # Touch each property once so the whole graph is built on construction
            self.cell
            self.predict
            self.prob
    
        @define_scope
        def cell(self):
            """
            Build the multi-layer LSTM cell.
            :return:
            """
            lstm_cell = [
                tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE), output_keep_prob=self.rnn_keep) for
                _ in range(NUM_LAYERS)]
            cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cell)
            return cell
    
        @define_scope
        def predict(self):
            """
            Forward pass.
            :return:
            """
            embedding = tf.get_variable('embedding', shape=[VOCAB_SIZE, HIDDEN_SIZE])
    
            if SHARE_EMD_WITH_SOFTMAX:
                softmax_weights = tf.transpose(embedding)
            else:
                softmax_weights = tf.get_variable('softmaweights', shape=[HIDDEN_SIZE, VOCAB_SIZE])
            softmax_bais = tf.get_variable('softmax_bais', shape=[VOCAB_SIZE])
    
            emb = tf.nn.embedding_lookup(embedding, self.data)
            emb_dropout = tf.nn.dropout(emb, self.emb_keep)
            # Unlike the training model, only one poem is generated at a time, so batch_size = 1
            self.init_state = self.cell.zero_state(1, dtype=tf.float32)
            outputs, last_state = tf.nn.dynamic_rnn(self.cell, emb_dropout, scope='d_rnn', dtype=tf.float32,
                                                    initial_state=self.init_state)
            outputs = tf.reshape(outputs, [-1, HIDDEN_SIZE])
            logits = tf.matmul(outputs, softmax_weights) + softmax_bais
            # Unlike the training model, the final RNN state is kept so that characters
            # can be generated one at a time until the poem is complete
            self.last_state = last_state
            return logits
    
        @define_scope
        def prob(self):
            """
            Softmax over the logits to obtain a probability distribution.
            :return:
            """
            probs = tf.nn.softmax(self.predict)
            return probs
    
    

    Training

    Using the LSTM model above, training is done in a single run of 50,000 iterations, which took roughly two hours to complete.

    import tensorflow as tf
    from rnn_model import TrainModel
    import org
    
    SHARE_EMD_WITH_SOFTMAX = True  # share parameters between the embedding layer and the softmax layer
    MAX_GRAD = 5.0  # maximum gradient norm, to prevent exploding gradients
    LEARN_RATE = 0.0005  # initial learning rate
    LR_DECAY = 0.92  # learning-rate decay factor
    LR_DECAY_STEP = 600  # decay step
    BATCH_SIZE = 64  # batch size
    CKPT_PATH = 'ckpt/model_ckpt'  # model checkpoint path
    VOCAB_PATH = 'vocab/poetry.vocab'  # vocabulary path
    EMB_KEEP = 0.5  # dropout keep probability for the embedding layer
    RNN_KEEP = 0.5  # dropout keep probability for the LSTM layers
    HIDDEN_SIZE = 128  # number of hidden units per LSTM layer
    NUM_LAYERS = 2  # number of stacked LSTM layers
    TRAIN_TIMES = 30000  # total number of training iterations (epochs are not tracked)
    SHOW_STEP = 1  # how often to print the loss
    SAVE_STEP = 100  # how often to save a checkpoint
    
    x_data = tf.placeholder(tf.int32, [BATCH_SIZE, None])  # input data
    y_data = tf.placeholder(tf.int32, [BATCH_SIZE, None])  # labels
    emb_keep = tf.placeholder(tf.float32)  # dropout keep probability for the embedding layer
    rnn_keep = tf.placeholder(tf.float32)  # dropout keep probability for the LSTM layers
    
    data = org.Dataset(BATCH_SIZE)  # create the dataset
    
    model = TrainModel(x_data, y_data, emb_keep, rnn_keep)  # create the training model
    
    saver = tf.train.Saver()
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())  # initialize variables
        for step in range(TRAIN_TIMES):
            # Fetch a training batch
            x, y = data.next_batch()
            # Run one training step and fetch the loss
            loss, _ = sess.run([model.loss, model.optimize],
                               {model.data: x, model.labels: y, model.emb_keep: EMB_KEEP,
                                model.rnn_keep: RNN_KEEP})
            if step % SHOW_STEP == 0:
                print('step {}, loss is {}'.format(step, loss))
            # Save a checkpoint
            if step % SAVE_STEP == 0:
                saver.save(sess, CKPT_PATH, global_step=model.global_step)
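
    The org module imported above is also not listed in the article; judging from how it is used, Dataset.next_batch() has to return an input batch and a label batch in which each label row is the input row shifted one character to the left, so the network predicts the next character at every position. A minimal sketch of such a class, assuming it reads the id file written by the preprocessing step and pads every sequence in a batch with 0, the '<unknow>' id:

    import numpy as np
    
    
    class Dataset(object):
        """Hypothetical batching helper, reconstructed from how it is used above."""
    
        def __init__(self, batch_size, data_path='data/o_poetry.txt'):
            self.batch_size = batch_size
            self.pointer = 0
            with open(data_path, 'r', encoding='utf-8') as f:
                self.poems = [[int(i) for i in line.split()] for line in f if line.strip()]
    
        def next_batch(self):
            """Return (x, y), where y is x shifted left by one position."""
            batch = self.poems[self.pointer:self.pointer + self.batch_size]
            self.pointer = (self.pointer + self.batch_size) % len(self.poems)
            if len(batch) < self.batch_size:
                # Wrap around at the end of the data
                batch += self.poems[:self.batch_size - len(batch)]
            max_len = max(len(p) for p in batch)
            # Pad with 0 (the '<unknow>' id) so every row has the same length
            data = np.zeros((self.batch_size, max_len), dtype=np.int32)
            for row, poem in enumerate(batch):
                data[row, :len(poem)] = poem
            x = data[:, :-1]  # inputs: everything except the last character
            y = data[:, 1:]   # labels: everything except the first character
            return x, y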
    
    

    After 50,000 iterations the final loss value was roughly around 4 to 5 (I forgot to take a screenshot of it).

    Testing

    import sys
    import tensorflow as tf
    import numpy as np
    from rnn_model import EvalModel
    import utils
    import os
    
    # Hide CUDA devices so generation runs on the CPU while the GPU keeps training
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    
    x_data = tf.placeholder(tf.int32, [1, None])
    
    emb_keep = tf.placeholder(tf.float32)
    
    rnn_keep = tf.placeholder(tf.float32)
    
    # Model used for generation
    model = EvalModel(x_data, emb_keep, rnn_keep)
    
    saver = tf.train.Saver()
    # Character-to-id mapping
    word2id_dict = utils.read_word_to_id_dict()
    # Id-to-character mapping
    id2word_dict = utils.read_id_to_word_dict()
    
    
    def generate_word(prob):
        """
        Keep the 100 most probable characters and pick one by roulette-wheel sampling.
        :param prob: probability vector
        :return: the sampled character
        """
        prob = np.squeeze(prob)
        # Indices of the 100 most probable characters
        top_idx = np.argsort(prob)[::-1][:100]
        top_prob = prob[top_idx]
        # Roulette-wheel selection among the top 100
        index = np.searchsorted(np.cumsum(top_prob), np.random.rand() * np.sum(top_prob))
        return id2word_dict[int(top_idx[index])]
    
    
    # def generate_word(prob):
    #  """
    #  Roulette-wheel sampling over the full vocabulary.
    #  :param prob: probability vector
    #  :return: the sampled character
    #  """
    #  index = int(np.searchsorted(np.cumsum(prob), np.random.rand(1) * np.sum(prob)))
    #  return id2word_dict[index]
    
    
    def generate_poem():
        """
        Generate one poem at random.
        :return:
        """
        with tf.Session() as sess:
            # Restore the latest checkpoint
            ckpt = tf.train.get_checkpoint_state('ckpt')
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Predict the first character, starting from the 's' start marker
            rnn_state = sess.run(model.cell.zero_state(1, tf.float32))
            x = np.array([[word2id_dict['s']]], np.int32)
            prob, rnn_state = sess.run([model.prob, model.last_state],
                                       {model.data: x, model.init_state: rnn_state, model.emb_keep: 1.0,
                                        model.rnn_keep: 1.0})
            word = generate_word(prob)
            poem = ''
            # Keep feeding the previous character back in until the end marker 'e' is produced
            while word != 'e':
                poem += word
                x = np.array([[word2id_dict[word]]])
                prob, rnn_state = sess.run([model.prob, model.last_state],
                                           {model.data: x, model.init_state: rnn_state, model.emb_keep: 1.0,
                                            model.rnn_keep: 1.0})
                word = generate_word(prob)
            # Print the generated poem
            print(poem)
    
    
    if __name__ == '__main__':
        generate_poem()
    
    

    Results:

    江川重舌助清悬,风起别苏临夜新。
    江月吴笼罢白客,空夜山山许可悠。
    -----------------------------
    伤能题家节,相态不今多。
    斟军笑不与,莫应伴朝情。
    -----------------------------
    劳是孤商欲醉含,人相能处转坐由。
    瀑莺共君全赏处,袁轮行上爱何心。
    

    As you can see, at least the format is correct. Grammatically there are still some problems; applying NLP techniques (word segmentation, grammar analysis, etc.) during data preprocessing could help improve this, as sketched below.
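
    For instance, one possible direction (not part of the original code; it assumes the third-party jieba library) is to segment the content into words rather than single characters before building the vocabulary, so the model learns word-level rather than character-level statistics:

    import jieba
    
    content = '寒随穷律变,春逐鸟声开。'
    # Word-level tokens instead of single characters; the exact split depends on jieba's dictionary
    tokens = list(jieba.cut(content))
    print(tokens)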
