美文网首页
Q-learning 实现代码

Q-learning 实现代码

作者: 翩翩公子银圈圈 | 来源:发表于2018-09-13 17:26 被阅读0次

    实现版本1:

    
    import numpy as np

    # Hyper-parameters for the tabular Q-learning demo.
    GAMMA = 0.8          # discount factor
    ALPHA = 0.01         # learning rate (step size of the soft update)
    num_steps = 100000   # number of full sweeps over the Q table
    SIZE = 6             # the tables are SIZE x SIZE

    # Reward table: -1 marks a forbidden move, 0 a neutral one and
    # 100 the moves that reach the goal.
    R = np.array([[ -1, -1, -1, -1,  0,  -1],
                  [ -1, -1, -1,  0, -1, 100],
                  [ -1, -1, -1,  0, -1,  -1],
                  [ -1,  0,  0, -1,  0,  -1],
                  [  0, -1, -1,  0, -1, 100],
                  [ -1,  0, -1, -1,  0, 100]])

    # Q starts at zero everywhere and is refined in place by QLearning().
    Q = np.zeros((SIZE, SIZE), dtype=np.float32)
    
    
    def getMaxQ(statex, statey, q=None, size=None):
        """Return the largest Q-value among the 4-neighbours of a cell.

        Looks at the cells directly above, below, left and right of
        (statex, statey) inside a size x size table (out-of-bounds
        neighbours are skipped) and returns the maximum Q-value found.

        Args:
            statex: row index of the cell, 0 <= statex < size.
            statey: column index of the cell, 0 <= statey < size.
            q: Q table to read; defaults to the module-level Q.
            size: side length of the table; defaults to the module-level SIZE.

        Returns:
            The maximum Q-value among the in-bounds neighbours.

        NOTE(review): treating states as cells of a 2-D grid does not
        match the classic interpretation of this R matrix (a 6-state
        transition graph where Q[s, a] is indexed by state and action);
        confirm which semantics is intended.
        """
        q = Q if q is None else q
        size = SIZE if size is None else size
        neighbours = []
        if statex > 0:
            neighbours.append(q[statex - 1, statey])
        if statey > 0:
            neighbours.append(q[statex, statey - 1])
        if statex < size - 1:
            neighbours.append(q[statex + 1, statey])
        if statey < size - 1:
            neighbours.append(q[statex, statey + 1])
        # max() consumes the list directly; no defensive copy needed.
        return max(neighbours)
    
    
    def QLearning():
        """Perform one full sweep of the soft value update over Q, in place.

        Each cell is nudged a small step (ALPHA) towards the target
        R + GAMMA * max(neighbouring Q).  With enough sweeps this
        converges to the same fixed point as the hard update
        Q = R + GAMMA * max(neighbouring Q).
        """
        for row in range(SIZE):
            for col in range(SIZE):
                old_value = Q[row, col]
                target = R[row, col] + GAMMA * getMaxQ(row, col)
                Q[row, col] = (1 - ALPHA) * old_value + ALPHA * target
    
    # Repeat the sweep num_steps times so the soft update can converge,
    # then show the learned table.
    for _sweep in range(num_steps):
        QLearning()

    print(Q)
    

    实现版本2:

    import numpy as np

    # Hyper-parameters for the tabular Q-learning demo.
    GAMMA = 0.8          # discount factor
    ALPHA = 0.01         # learning rate (step size of the soft update)
    num_steps = 100000   # number of full sweeps over the Q table
    SIZE = 6             # the tables are SIZE x SIZE

    # Reward table: -1 marks a forbidden move, 0 a neutral one and
    # 100 the moves that reach the goal.
    R = np.array([[ -1, -1, -1, -1,  0,  -1],
                  [ -1, -1, -1,  0, -1, 100],
                  [ -1, -1, -1,  0, -1,  -1],
                  [ -1,  0,  0, -1,  0,  -1],
                  [  0, -1, -1,  0, -1, 100],
                  [ -1,  0, -1, -1,  0, 100]])

    # Q starts at zero everywhere and is refined in place by QLearning().
    Q = np.zeros((SIZE, SIZE), dtype=np.float32)
    
    
    def getMaxQ(statex, statey, q=None, size=None):
        """Return the largest Q-value among the 4-neighbours of a cell.

        Looks at the cells directly above, below, left and right of
        (statex, statey) inside a size x size table (out-of-bounds
        neighbours are skipped) and returns the maximum Q-value found.

        Args:
            statex: row index of the cell, 0 <= statex < size.
            statey: column index of the cell, 0 <= statey < size.
            q: Q table to read; defaults to the module-level Q.
            size: side length of the table; defaults to the module-level SIZE.

        Returns:
            The maximum Q-value among the in-bounds neighbours.

        NOTE(review): treating states as cells of a 2-D grid does not
        match the classic interpretation of this R matrix (a 6-state
        transition graph where Q[s, a] is indexed by state and action);
        confirm which semantics is intended.
        """
        q = Q if q is None else q
        size = SIZE if size is None else size
        neighbours = []
        if statex > 0:
            neighbours.append(q[statex - 1, statey])
        if statey > 0:
            neighbours.append(q[statex, statey - 1])
        if statex < size - 1:
            neighbours.append(q[statex + 1, statey])
        if statey < size - 1:
            neighbours.append(q[statex, statey + 1])
        # max() consumes the list directly; no defensive copy needed.
        return max(neighbours)
    
    
    def QLearning():
        """Perform one full sweep of the soft value update over Q, in place.

        Each cell is nudged a small step (ALPHA) towards the target
        R + GAMMA * max(neighbouring Q).  With enough sweeps this
        converges to the same fixed point as the hard update
        Q = R + GAMMA * max(neighbouring Q).
        """
        for row in range(SIZE):
            for col in range(SIZE):
                old_value = Q[row, col]
                target = R[row, col] + GAMMA * getMaxQ(row, col)
                Q[row, col] = (1 - ALPHA) * old_value + ALPHA * target
    
    # Repeat the sweep num_steps times so the soft update can converge,
    # then show the learned table.
    for _sweep in range(num_steps):
        QLearning()

    print(Q)
    
    实验结果

    相关文章

      网友评论

          本文标题:Q-learning 实现代码

          本文链接:https://www.haomeiwen.com/subject/gqvmgftx.html