Implementation
import numpy as np

GAMMA = 0.8         # discount factor
ALPHA = 0.01        # learning rate
num_steps = 100000  # number of full sweeps over the Q table
SIZE = 6

# Reward matrix: -1 marks a forbidden transition, 100 is the goal reward.
R = np.asarray([[-1, -1, -1, -1,  0,  -1],
                [-1, -1, -1,  0, -1, 100],
                [-1, -1, -1,  0, -1,  -1],
                [-1,  0,  0, -1,  0,  -1],
                [ 0, -1, -1,  0, -1, 100],
                [-1,  0, -1, -1,  0, 100]])
Q = np.zeros([SIZE, SIZE], np.float32)

def getMaxQ(statex, statey):
    # Largest Q value among the entries grid-adjacent to (statex, statey).
    state = []
    if statex > 0:
        state.append(Q[statex - 1, statey])
    if statey > 0:
        state.append(Q[statex, statey - 1])
    if statex < SIZE - 1:
        state.append(Q[statex + 1, statey])
    if statey < SIZE - 1:
        state.append(Q[statex, statey + 1])
    return max(state)

def QLearning():
    # One full in-place sweep over the Q table.
    for statex in range(SIZE):
        for statey in range(SIZE):
            Q[statex, statey] = (1 - ALPHA) * Q[statex, statey] + ALPHA * (R[statex, statey] + GAMMA * getMaxQ(statex, statey))
            # Q[statex, statey] = R[statex, statey] + GAMMA * getMaxQ(statex, statey)
            # The two formulas are essentially equivalent: at the fixed point,
            # Q = (1-ALPHA)*Q + ALPHA*(R + GAMMA*maxQ) reduces to Q = R + GAMMA*maxQ,
            # so the converged Q is the same.

count = 0
while count < num_steps:
    QLearning()
    count += 1
print(Q)
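A quick way to back up the comment above: the sketch below runs the same sweep with both update formulas and checks that the converged tables agree. The helper names and sweep counts are my own illustrative choices, not part of the original post.

import numpy as np

GAMMA, ALPHA, SIZE = 0.8, 0.01, 6
R = np.asarray([[-1, -1, -1, -1,  0,  -1],
                [-1, -1, -1,  0, -1, 100],
                [-1, -1, -1,  0, -1,  -1],
                [-1,  0,  0, -1,  0,  -1],
                [ 0, -1, -1,  0, -1, 100],
                [-1,  0, -1, -1,  0, 100]])

def max_neighbor(Q, x, y):
    # Same neighborhood rule as getMaxQ above, but reading an explicit Q.
    vals = []
    if x > 0:
        vals.append(Q[x - 1, y])
    if y > 0:
        vals.append(Q[x, y - 1])
    if x < SIZE - 1:
        vals.append(Q[x + 1, y])
    if y < SIZE - 1:
        vals.append(Q[x, y + 1])
    return max(vals)

def converge(use_alpha, sweeps):
    Q = np.zeros((SIZE, SIZE))
    for _ in range(sweeps):
        for x in range(SIZE):
            for y in range(SIZE):
                target = R[x, y] + GAMMA * max_neighbor(Q, x, y)
                Q[x, y] = (1 - ALPHA) * Q[x, y] + ALPHA * target if use_alpha else target
    return Q

# The smoothed update needs far more sweeps to converge because ALPHA is small.
print(np.allclose(converge(True, 30000), converge(False, 200), atol=1e-3))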
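For reference, this R matrix is the one from the widely circulated 6-room Q-learning tutorial, where the update is usually written over (state, action) pairs rather than over grid neighbors: taking action a in room s lands in room a, the -1 entries mark impossible moves, and the max runs over the next room's row of Q. A minimal sketch of that formulation (the variable names and sweep count are my own):

import numpy as np

GAMMA = 0.8
SIZE = 6
R = np.asarray([[-1, -1, -1, -1,  0,  -1],
                [-1, -1, -1,  0, -1, 100],
                [-1, -1, -1,  0, -1,  -1],
                [-1,  0,  0, -1,  0,  -1],
                [ 0, -1, -1,  0, -1, 100],
                [-1,  0, -1, -1,  0, 100]])
Q = np.zeros((SIZE, SIZE))

for _ in range(500):           # full sweeps; plenty for GAMMA = 0.8
    for s in range(SIZE):      # current room
        for a in range(SIZE):  # action = room to move to
            if R[s, a] == -1:  # skip moves that are not allowed
                continue
            # Moving to room a makes it the next state, so the max is over Q[a, :].
            Q[s, a] = R[s, a] + GAMMA * Q[a].max()

print(np.round(Q / Q.max() * 100))  # normalize the table to 100 for readability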
Experimental results
Running the script prints the converged Q table.