Author: sea_monster | Published 2019-03-31 09:23

    《机器学习实战》 -- Reinforcement Learning (code not verified)
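
The listing below (the DQN example from the book's reinforcement-learning chapter) omits its imports and the environment setup it relies on. The prologue below is a minimal sketch of that missing part: the imports are the ones the code actually uses, while the gym environment name (MsPacman-v0) and the exact frame preprocessing are assumptions based on the book's Ms. Pac-Man example.

    import os
    import numpy as np
    import numpy.random as rnd
    import gym
    import tensorflow as tf

    # Assumed setup (not in the original post): the Ms. Pac-Man Atari environment
    # and a helper that crops and downsamples each frame to 88x80 grayscale.
    env = gym.make("MsPacman-v0")

    mspacman_color = np.array([210, 164, 74]).mean()

    def preprocess_observation(obs):
        img = obs[1:176:2, ::2]          # crop and downsample to 88x80
        img = img.mean(axis=2)           # convert to grayscale
        img[img == mspacman_color] = 0   # improve contrast
        img = (img - 128) / 128          # scale pixel values to roughly [-1, 1]
        return img.reshape(88, 80, 1)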

    from tensorflow.contrib.layers import convolution2d, fully_connected
    input_height = 88
    input_width = 80
    input_channels = 1
    conv_n_maps = [32,64,64]
    conv_kernel_sizes = [(8,8),(4,4),(3,3)]
    conv_strides = [4,2,1]
    conv_paddings = ['SAME']*3
    conv_activation = [tf.nn.relu]*3
    n_hidden_in = 64*11*10      # conv3 has 64 feature maps of 11x10
    n_hidden = 512
    hidden_activation = tf.nn.relu
    n_outputs = env.action_space.n  # 9 discrete actions in Ms. Pac-Man
    
    initializer = tf.contrib.layers.variance_scaling_initializer()
    # three convolutional layers followed by two fully connected layers
    def q_network(X_state, scope):
        prev_layer = X_state
        conv_layers = []
        with tf.variable_scope(scope) as scope:
            for n_maps, kernel_size, stride, padding, activation in zip(
                    conv_n_maps, conv_kernel_sizes, conv_strides, conv_paddings, conv_activation):
                prev_layer = convolution2d(prev_layer, num_outputs=n_maps, kernel_size=kernel_size,
                                           stride=stride, padding=padding, activation_fn=activation,
                                           weights_initializer=initializer)
                conv_layers.append(prev_layer)
            last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1,n_hidden_in])
            hidden = fully_connected(last_conv_layer_flat, n_hidden, activation_fn=hidden_activation,
                                     weights_initializer=initializer)
            outputs = fully_connected(hidden, n_outputs, activation_fn=None,
                                      weights_initializer=initializer)
        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=scope.name)
        trainable_vars_by_name = {var.name[len(scope.name):]: var for var in trainable_vars}
        return outputs, trainable_vars_by_name
    
    X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width, input_channels])
    actor_q_values, actor_vars = q_network(X_state, scope='q_networks/actor')
    critic_q_values, critic_vars = q_network(X_state, scope='q_networks/critic')
    copy_ops = [actor_var.assign(critic_vars[var_name]) for var_name,actor_var in actor_vars.items()]
    copy_critic_to_actor = tf.group(*copy_ops)
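    # Note: two copies of the same network are created above. The "actor" picks
    # the actions while playing, and the "critic" is the one trained below on the
    # replayed transitions; every copy_steps training steps the critic's weights
    # are copied into the actor via copy_critic_to_actor.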
    
    X_action = tf.placeholder(tf.int32, shape=[None])
    q_value = tf.reduce_sum(critic_q_values * tf.one_hot(X_action, n_outputs), axis=1, keep_dims=True)
    y = tf.placeholder(tf.float32, shape=[None,1])
    cost = tf.reduce_mean(tf.square(y-q_value))
    global_step = tf.Variable(0, trainable=False, name='global_step')
    learning_rate = 0.001  # learning rate not given in the original post; 0.001 is a common choice
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(cost, global_step=global_step)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
    from collections import deque
    
    replay_memory_size = 10000
    replay_memory = deque([], maxlen=replay_memory_size)
    
    def sample_memories(batch_size):
        indices = rnd.permutation(len(replay_memory))[:batch_size]
        cols = [[], [], [], [], []]  # state, action, reward, next_state, continue
        for idx in indices:
            memory = replay_memory[idx]
            for col,value in zip(cols,memory):
                col.append(value)
        cols = [np.array(col) for col in cols]
        return (cols[0], cols[1], cols[2].reshape(-1, 1), cols[3], cols[4].reshape(-1, 1))
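    # sample_memories returns five NumPy arrays: states, actions, rewards
    # (reshaped to [batch_size, 1]), next states, and "continues" (1.0 while the
    # game is still running, 0.0 once it ended), in the order the training loop
    # below unpacks them.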
    
    eps_min = 0.05
    eps_max = 1.0
    eps_decay_steps = 50000
    def epsilon_greedy(q_values, step):
        epsilon = max(eps_min,eps_max - (eps_max-eps_min)*step/eps_decay_steps)
        if rnd.rand() < epsilon:
            return rnd.randint(n_outputs)  # random action
        else:
            return np.argmax(q_values)  # greedy (best) action
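    # Sanity check on the schedule: epsilon decays linearly from 1.0 to 0.05 over
    # the first 50,000 steps and then stays at 0.05, e.g.
    #   step 0       -> epsilon = 1.0    (fully random)
    #   step 25,000  -> epsilon = 0.525
    #   step 50,000+ -> epsilon = 0.05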
    
    n_steps = 100000        # total number of training steps
    training_start = 1000   # start training after 1000 game iterations
    training_interval = 3   # run a training step every 3 iterations
    save_steps = 50
    copy_steps = 25
    discount_rate = 0.95
    skip_start = 90     # skip the start of every game (it is just waiting time)
    batch_size = 50
    iteration = 0       # game iteration counter
    checkpoint_path = './my_dqn.ckpt'
    done = True         # the env needs to be reset
    
    with tf.Session() as sess:
        if os.path.isfile(checkpoint_path + '.index'):  # the saver writes .index/.data files
            saver.restore(sess, checkpoint_path)
        else:
            init.run()
        while True:
            step = global_step.eval()
            if step >= n_steps:
                break
            iteration += 1
            if done:
                obs = env.reset()
                for skip in range(skip_start):
                    obs, reward, done, info = env.step(0)
                state = preprocess_observation(obs)
            
            q_values = actor_q_values.eval(feed_dict={X_state:[state]})
            action = epsilon_greedy(q_values, step)
            obs, reward, done, info = env.step(action)
            next_state = preprocess_observation(obs)
            
            replay_memory.append((state, action, reward, next_state, 1.0-done))
            state = next_state
            
            if iteration < training_start or iteration % training_interval != 0:
                continue
            X_state_val, X_action_val, rewards, X_next_state_val, continues = sample_memories(batch_size)
            next_q_values = actor_q_values.eval(feed_dict={X_state: X_next_state_val})
            
            max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
            y_val = rewards + continues*discount_rate*max_next_q_values
            training_op.run(feed_dict={X_state:X_state_val,X_action:X_action_val,
                                y:y_val})
            if step % copy_steps == 0:
                copy_critic_to_actor.run()
            
            if step % save_steps == 0:
                saver.save(sess, checkpoint_path)
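
The original post stops once training finishes. To watch the learned policy, one could restore the checkpoint and run the actor greedily; the loop below is a sketch under the same assumptions as the prologue above (the MsPacman-v0 environment and the preprocess_observation helper), not part of the original code.

    # Roll out the trained actor greedily for one episode (sketch).
    with tf.Session() as sess:
        saver.restore(sess, checkpoint_path)
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            state = preprocess_observation(obs)
            q_values = actor_q_values.eval(feed_dict={X_state: [state]})
            action = np.argmax(q_values)   # always take the highest-valued action
            obs, reward, done, info = env.step(action)
            total_reward += reward
        print('episode reward:', total_reward)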
    
