[tf]中一个整体训练的流程

作者: VanJordan | 来源:发表于2018-12-17 15:12 被阅读15次

[tf]中一个整体训练的流程
训练和评估(03)
TF2.0：解决网络只训练完一个epoch就停止的问题
柅兵闲聊素描头像训练
TF2.0学习笔记
TensorFlow安装
可训练变量获取
tf.slim模块学习
整体流程
tf.trainable_variables和tf.all_va

def run_epoch(session, cost_op, train_op, saver, step):
    # 训练一个epoch。
    # 重复训练步骤直至遍历完Dataset中所有数据。
    while True:
        try:
            # 运行train_op并计算损失值。训练数据在main()函数中以Dataset方式提供。
            cost, _ = session.run([cost_op, train_op])
            if step % 10 == 0:
                print("After %d steps, per token cost is %.3f" % (step, cost))
            # 每200步保存一个checkpoint。
            if step % 200 == 0:
                saver.save(session, CHECKPOINT_PATH, global_step=step)
            step += 1
        except tf.errors.OutOfRangeError:
            break
    return step

def main():
    # 定义初始化函数。
    initializer = tf.random_uniform_initializer(-0.05, 0.05)

    # 定义训练用的循环神经网络模型。
    with tf.variable_scope("nmt_model", reuse=None, 
                           initializer=initializer):
        train_model = NMTModel()
  
    # 定义输入数据。
    data = MakeSrcTrgDataset(SRC_TRAIN_DATA, TRG_TRAIN_DATA, BATCH_SIZE)
    iterator = data.make_initializable_iterator()
    (src, src_size), (trg_input, trg_label, trg_size) = iterator.get_next()

    # 定义前向计算图。输入数据以张量形式提供给forward函数。
    cost_op, train_op = train_model.forward(src, src_size, trg_input,
                                            trg_label, trg_size)
    # 训练模型。
    saver = tf.train.Saver()
    step = 0
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        for i in range(NUM_EPOCH):
            print("In iteration: %d" % (i + 1))
            sess.run(iterator.initializer)
            step = run_epoch(sess, cost_op, train_op, saver, step)

定义什么loss，使用什么optimizer，如何做Clip都在model的forward里面编写


# 定义输入数据。
data = MakeSrcTrgDataset(SRC_TRAIN_DATA, TRG_TRAIN_DATA, BATCH_SIZE)
iterator = data.make_initializable_iterator()
(src, src_size), (trg_input, trg_label, trg_size) = iterator.get_next()

# 定义前向计算图。输入数据以张量形式提供给forward函数。
cost_op, train_op = train_model.forward(src, src_size, trg_input,
                                        trg_label, trg_size)

# 计算解码器每一步的log perplexity。这一步与语言模型代码相同。
output = tf.reshape(dec_outputs, [-1, HIDDEN_SIZE])
logits = tf.matmul(output, self.softmax_weight) + self.softmax_bias
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=tf.reshape(trg_label, [-1]), logits=logits)

# 在计算平均损失时，需要将填充位置的权重设置为0，以避免无效位置的预测干扰
# 模型的训练。
label_weights = tf.sequence_mask(
    trg_size, maxlen=tf.shape(trg_label)[1], dtype=tf.float32)
label_weights = tf.reshape(label_weights, [-1])
cost = tf.reduce_sum(loss * label_weights)
cost_per_token = cost / tf.reduce_sum(label_weights)

# 定义反向传播操作。反向操作的实现与语言模型代码相同。
trainable_variables = tf.trainable_variables()

# 控制梯度大小，定义优化方法和训练步骤。
grads = tf.gradients(cost / tf.to_float(batch_size),
                     trainable_variables)
grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
train_op = optimizer.apply_gradients(
    zip(grads, trainable_variables))
return cost_per_token, train_op