美文网首页
Q-learning 实现代码

Q-learning 实现代码

作者: 翩翩公子银圈圈 | 来源:发表于2018-09-13 17:26 被阅读0次

实现版本1:


import numpy as np
# Hyper-parameters of the update rule applied in QLearning().
GAMMA = 0.8          # weight given to the best neighbouring Q value
ALPHA = 0.01         # blending factor between the old Q value and the new target
num_steps = 100000   # number of full sweeps over the table
SIZE = 6             # the R and Q tables are SIZE x SIZE

# Reward table, indexed with the same (statex, statey) pair as Q.
# NOTE(review): -1 / 0 / 100 look like "blocked / allowed / goal" rewards of
# the usual rooms example -- confirm against the intended problem statement.
R = np.asarray(
    [
        [-1, -1, -1, -1, 0, -1],
        [-1, -1, -1, 0, -1, 100],
        [-1, -1, -1, 0, -1, -1],
        [-1, 0, 0, -1, 0, -1],
        [0, -1, -1, 0, -1, 100],
        [-1, 0, -1, -1, 0, 100],
    ]
)

# Value table being learned; starts at zero and is updated in place.
Q = np.zeros((SIZE, SIZE), dtype=np.float32)


def getMaxQ(statex, statey):
    """Return the largest Q value among the cells adjacent to (statex, statey).

    The four candidates are the cells one step away along either axis of the
    module-level ``Q`` table. A candidate is skipped when the step would go
    below index 0 or past index ``SIZE - 1``.

    NOTE(review): this treats the table as a grid of cells rather than taking
    the maximum over the actions of a successor state -- confirm this is the
    intended formulation.

    Raises:
        ValueError: if no candidate passes the bounds checks (e.g. SIZE == 1).
    """
    last = SIZE - 1
    # (is this neighbour allowed?, row, column) -- order kept as up, left,
    # down, right.
    candidates = (
        (statex > 0, statex - 1, statey),
        (statey > 0, statex, statey - 1),
        (statex < last, statex + 1, statey),
        (statey < last, statex, statey + 1),
    )
    return max([Q[row, col] for allowed, row, col in candidates if allowed])


def QLearning():
    """Perform one in-place sweep of the update rule over every cell of Q.

    Each cell is moved towards the target ``R + GAMMA * getMaxQ(...)`` with
    weight ALPHA, keeping ``1 - ALPHA`` of its previous value. Cells are
    visited in row-major order and written immediately, so later cells in the
    same sweep already see the updated values of earlier ones.

    Assigning the target directly (``Q = R + GAMMA * getMaxQ(...)``) is
    essentially equivalent: both rules converge to the same Q.
    """
    retain = 1 - ALPHA
    for statex, statey in np.ndindex(SIZE, SIZE):
        target = R[statex, statey] + GAMMA * getMaxQ(statex, statey)
        Q[statex, statey] = retain * Q[statex, statey] + ALPHA * target

# Run num_steps sweeps of QLearning(), then show the learned table.
# `count` stays bound at module level and ends up equal to the number of
# sweeps performed.
count = 0
for count in range(1, num_steps + 1):
    QLearning()

print(Q)

实现版本2:

import numpy as np
# Settings read by QLearning() and the driver loop below.
GAMMA = 0.8  # multiplier applied to the best neighbouring Q value
ALPHA = 0.01  # share of the new target mixed into Q on each update
num_steps = 100000  # how many times the whole table is swept
SIZE = 6  # side length of the square R and Q tables

# Per-cell rewards, addressed as R[statex, statey].
# NOTE(review): the meaning of the -1 / 0 / 100 entries is not stated in this
# file -- confirm against the problem being modelled.
R = np.array([
    [-1, -1, -1, -1, 0, -1],
    [-1, -1, -1, 0, -1, 100],
    [-1, -1, -1, 0, -1, -1],
    [-1, 0, 0, -1, 0, -1],
    [0, -1, -1, 0, -1, 100],
    [-1, 0, -1, -1, 0, 100],
])

# Learned values, same shape as R, initialised to zero (float32).
Q = np.zeros(shape=(SIZE, SIZE), dtype=np.float32)


def getMaxQ(statex, statey):
    """Return the maximum Q value over the neighbours of (statex, statey).

    Neighbours are the cells reached by moving one index along a single axis
    of the module-level ``Q`` table. A move is ignored when it would leave
    the ``0 .. SIZE - 1`` index range.

    NOTE(review): the lookup is over adjacent table cells, not over the
    actions available in a next state -- verify this matches the intended
    algorithm.

    Raises:
        ValueError: if every move is ignored, leaving nothing to maximise.
    """
    # Parallel tuples: whether each move is permitted, and where it lands.
    permitted = (
        statex > 0,
        statey > 0,
        statex < SIZE - 1,
        statey < SIZE - 1,
    )
    landing = (
        (statex - 1, statey),
        (statex, statey - 1),
        (statex + 1, statey),
        (statex, statey + 1),
    )
    return max([Q[pos] for ok, pos in zip(permitted, landing) if ok])


def QLearning():
    """Sweep once over Q, nudging every cell towards its current target.

    For each cell the target is ``R + GAMMA * getMaxQ(...)``; the stored
    value becomes ``(1 - ALPHA) * old + ALPHA * target``. Updates are written
    in place, row by row, so a cell's target may already reflect neighbours
    updated earlier in the same sweep.

    The un-smoothed alternative would store the target directly:
    ``Q[statex, statey] = R[statex, statey] + GAMMA * getMaxQ(statex, statey)``.
    """
    for statex in range(SIZE):
        for statey in range(SIZE):
            previous = Q[statex, statey]
            target = R[statex, statey] + GAMMA * getMaxQ(statex, statey)
            Q[statex, statey] = (1 - ALPHA) * previous + ALPHA * target

# Driver: repeat the sweep until `count` reaches num_steps, then print Q.
count = 0
while True:
    if count >= num_steps:
        break
    QLearning()
    count += 1

print(Q)
实验结果

相关文章

网友评论

      本文标题:Q-learning 实现代码

      本文链接:https://www.haomeiwen.com/subject/gqvmgftx.html