RL: q_learning_mountaincar.py

Author: 魏鹏飞 | Published 2020-04-16 12:08

    Keywords:

    on_policy (SARSA), off_policy (Q-Learning), iter_max, t_max, gamma, eps, epsilon-greedy

    q_learning_mountaincar.py
    """
    Model-free Prediction and Control
    Example of SARSA learning (on-policy) and Q-Learning (off-policy) using the OpenAI Gym MountainCar environment (https://gym.openai.com/envs/MountainCar-v0/)
    Bolei Zhou for IERG6130, with parts of code adapted from Moustafa Alzantot (malzantot@ucla.edu)
    """
    import numpy as np
    
    import gym
    
    off_policy = True  # if True, use the off-policy Q-Learning update; if False, use the on-policy SARSA update
    
    n_states = 40     # number of discretization bins per observation dimension
    iter_max = 5000   # number of training episodes

    initial_lr = 1.0  # initial learning rate
    min_lr = 0.003    # learning-rate floor
    gamma = 1.0       # discount factor
    t_max = 10000     # maximum steps per episode
    eps = 0.1         # exploration probability for epsilon-greedy
    
    def run_episode(env, policy=None, render=False):
        obs = env.reset()
        total_reward = 0
        step_idx = 0
        for _ in range(t_max):
            if render:
                env.render()
            if policy is None:
                action = env.action_space.sample()
            else:
                a, b = obs_to_state(env, obs)
                action = policy[a][b]
            obs, reward, done, _ = env.step(action)
            total_reward += gamma ** step_idx * reward
            step_idx += 1
            if done:
                break
        return total_reward
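
    # Note: with gamma = 1.0 (as configured above), the discounted sum in run_episode
    # reduces to the raw episode return; MountainCar-v0 yields -1 reward per step, so
    # an episode that never reaches the goal within the default 200-step time limit
    # scores exactly -200.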
    
    def obs_to_state(env, obs):
        """ Maps an observation to state """
        # we quantify the continous state space into discrete space
        env_low = env.observation_space.low
        env_high = env.observation_space.high
        env_dx = (env_high - env_low) / n_states
        a = int((obs[0] - env_low[0])/env_dx[0])
        b = int((obs[1] - env_low[1])/env_dx[1])
        return a, b
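
    # Worked example (values follow from MountainCar-v0's documented bounds):
    # position spans [-1.2, 0.6] and velocity spans [-0.07, 0.07], so with
    # n_states = 40 the bin widths are env_dx = (0.045, 0.0035), and an
    # observation obs = (-0.5, 0.0) maps to the cell (a, b) = (15, 20).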
    
    if __name__ == '__main__':
        env_name = 'MountainCar-v0'
        env = gym.make(env_name)
        env.seed(0)
        np.random.seed(0)
        if off_policy:
            print('----- using Q Learning -----')
        else:
            print('----- using SARSA Learning -----')
    
        q_table = np.zeros((n_states, n_states, 3))
        for i in range(iter_max):
            obs = env.reset()
            total_reward = 0
            # eta: learning rate, decayed by a factor of 0.85 every 100 episodes (floored at min_lr)
            eta = max(min_lr, initial_lr * (0.85 ** (i // 100)))
            for j in range(t_max):
                a, b = obs_to_state(env, obs)
                # epsilon-greedy action selection
                if np.random.uniform(0, 1) < eps:
                    action = np.random.choice(env.action_space.n)
                else:
                    action = np.argmax(q_table[a][b])
                obs, reward, done, _ = env.step(action)
                total_reward += reward
                # update q table
                a_, b_ = obs_to_state(env, obs)
                if off_policy:
                    # Q-Learning update (off-policy): bootstrap from the greedy value of the next state
                    q_table[a][b][action] = q_table[a][b][action] + eta * (reward + gamma * np.max(q_table[a_][b_]) - q_table[a][b][action])
                else:
                    # SARSA update (on-policy): bootstrap from the next action
                    # chosen by the same epsilon-greedy policy
                    if np.random.uniform(0, 1) < eps:
                        action_ = np.random.choice(env.action_space.n)
                    else:
                        action_ = np.argmax(q_table[a_][b_])
                    q_table[a][b][action] = q_table[a][b][action] + eta * (reward + gamma * q_table[a_][b_][action_] - q_table[a][b][action])
                if done:
                    break
            if i % 200 == 0:
                print('Iteration #%d -- Total reward = %d.' % (i + 1, total_reward))
        solution_policy = np.argmax(q_table, axis=2)
        solution_policy_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
        print("Average score of solution = ", np.mean(solution_policy_scores))
        # Animate it
        for _ in range(2):
            run_episode(env, solution_policy, True)
        env.close()
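
    For reference, the two update branches in the loop implement the standard temporal-difference rules and differ only in the bootstrap target: Q-Learning bootstraps from the greedy value of the next state, while SARSA bootstraps from the action its own epsilon-greedy policy actually takes next. With learning rate $\eta$ and discount $\gamma$:

    $$Q(s,a) \leftarrow Q(s,a) + \eta \left( r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right) \qquad \text{(Q-Learning)}$$

    $$Q(s,a) \leftarrow Q(s,a) + \eta \left( r + \gamma Q(s',a') - Q(s,a) \right), \quad a' \sim \epsilon\text{-greedy}(Q(s',\cdot)) \qquad \text{(SARSA)}$$

    The learning rate itself follows the decay schedule $\eta_i = \max(\text{min\_lr},\ \text{initial\_lr} \cdot 0.85^{\lfloor i/100 \rfloor})$: it shrinks by 15% every 100 episodes and never drops below 0.003.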
    
    
    # Results:
    $ python q_learning_mountaincar.py
    
    ----- using Q Learning -----
    Iteration #1 -- Total reward = -200.
    Iteration #201 -- Total reward = -200.
    Iteration #401 -- Total reward = -200.
    Iteration #601 -- Total reward = -200.
    Iteration #801 -- Total reward = -200.
    Iteration #1001 -- Total reward = -200.
    Iteration #1201 -- Total reward = -200.
    Iteration #1401 -- Total reward = -200.
    Iteration #1601 -- Total reward = -200.
    Iteration #1801 -- Total reward = -200.
    Iteration #2001 -- Total reward = -161.
    Iteration #2201 -- Total reward = -200.
    Iteration #2401 -- Total reward = -200.
    Iteration #2601 -- Total reward = -200.
    Iteration #2801 -- Total reward = -200.
    Iteration #3001 -- Total reward = -153.
    Iteration #3201 -- Total reward = -159.
    Iteration #3401 -- Total reward = -200.
    Iteration #3601 -- Total reward = -200.
    Iteration #3801 -- Total reward = -200.
    Iteration #4001 -- Total reward = -200.
    Iteration #4201 -- Total reward = -200.
    Iteration #4401 -- Total reward = -200.
    Iteration #4601 -- Total reward = -200.
    Iteration #4801 -- Total reward = -200.
    Average score of solution =  -149.42
    
    
    ----- using SARSA Learning -----
    Iteration #1 -- Total reward = -200.
    Iteration #201 -- Total reward = -200.
    Iteration #401 -- Total reward = -200.
    Iteration #601 -- Total reward = -200.
    Iteration #801 -- Total reward = -200.
    Iteration #1001 -- Total reward = -200.
    Iteration #1201 -- Total reward = -158.
    Iteration #1401 -- Total reward = -160.
    Iteration #1601 -- Total reward = -166.
    Iteration #1801 -- Total reward = -200.
    Iteration #2001 -- Total reward = -200.
    Iteration #2201 -- Total reward = -191.
    Iteration #2401 -- Total reward = -200.
    Iteration #2601 -- Total reward = -154.
    Iteration #2801 -- Total reward = -157.
    Iteration #3001 -- Total reward = -200.
    Iteration #3201 -- Total reward = -155.
    Iteration #3401 -- Total reward = -159.
    Iteration #3601 -- Total reward = -158.
    Iteration #3801 -- Total reward = -194.
    Iteration #4001 -- Total reward = -200.
    Iteration #4201 -- Total reward = -200.
    Iteration #4401 -- Total reward = -200.
    Iteration #4601 -- Total reward = -158.
    Iteration #4801 -- Total reward = -200.
    Average score of solution =  -181.78
    

    At a glance (the original post shows reward plots captioned "Q Learning" and "SARSA" here): over 100 greedy evaluation episodes, Q-Learning averaged -149.42 while SARSA averaged -181.78. One plausible reading: Q-Learning is off-policy and learns values for the greedy target policy directly, whereas SARSA's values fold in the cost of its epsilon-greedy exploration, so the greedy policy extracted from its Q-table can score lower under this training schedule.
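
    Both runs come from the same script; the only difference is the off_policy flag at the top of the file. To reproduce the comparison (a minimal sketch; the flag is hard-coded rather than exposed on the command line):

    # off_policy = True in the source -> Q-Learning banner and results
    $ python q_learning_mountaincar.py

    # edit the source so that off_policy = False -> SARSA banner and results
    $ python q_learning_mountaincar.py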
