Implementation
import numpy as np

GAMMA = 0.8         # discount factor
ALPHA = 0.01        # learning rate
num_steps = 100000  # number of full sweeps over the Q table
SIZE = 6

# Reward matrix: -1 marks a forbidden transition, 100 is the goal reward.
R = np.asarray([[-1, -1, -1, -1,  0,  -1],
                [-1, -1, -1,  0, -1, 100],
                [-1, -1, -1,  0, -1,  -1],
                [-1,  0,  0, -1,  0,  -1],
                [ 0, -1, -1,  0, -1, 100],
                [-1,  0, -1, -1,  0, 100]])
Q = np.zeros([SIZE, SIZE], np.float32)

def getMaxQ(statex, statey):
    # Largest Q value among the entries grid-adjacent to (statex, statey).
    state = []
    if statex > 0:
        state.append(Q[statex - 1, statey])
    if statey > 0:
        state.append(Q[statex, statey - 1])
    if statex < SIZE - 1:
        state.append(Q[statex + 1, statey])
    if statey < SIZE - 1:
        state.append(Q[statex, statey + 1])
    return max(state)

def QLearning():
    # One full in-place sweep over the Q table.
    for statex in range(SIZE):
        for statey in range(SIZE):
            Q[statex, statey] = (1 - ALPHA) * Q[statex, statey] + ALPHA * (R[statex, statey] + GAMMA * getMaxQ(statex, statey))
            # Q[statex, statey] = R[statex, statey] + GAMMA * getMaxQ(statex, statey)
            # The two formulas are essentially equivalent: at the fixed point,
            # Q = (1-ALPHA)*Q + ALPHA*(R + GAMMA*maxQ) reduces to Q = R + GAMMA*maxQ,
            # so the converged Q is the same.

count = 0
while count < num_steps:
    QLearning()
    count += 1
print(Q)
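A quick way to back up the comment above: the sketch below runs the same sweep with both update formulas and checks that the converged tables agree. The helper names and sweep counts are my own illustrative choices, not part of the original post.

import numpy as np

GAMMA, ALPHA, SIZE = 0.8, 0.01, 6
R = np.asarray([[-1, -1, -1, -1,  0,  -1],
                [-1, -1, -1,  0, -1, 100],
                [-1, -1, -1,  0, -1,  -1],
                [-1,  0,  0, -1,  0,  -1],
                [ 0, -1, -1,  0, -1, 100],
                [-1,  0, -1, -1,  0, 100]])

def max_neighbor(Q, x, y):
    # Same neighborhood rule as getMaxQ above, but reading an explicit Q.
    vals = []
    if x > 0:
        vals.append(Q[x - 1, y])
    if y > 0:
        vals.append(Q[x, y - 1])
    if x < SIZE - 1:
        vals.append(Q[x + 1, y])
    if y < SIZE - 1:
        vals.append(Q[x, y + 1])
    return max(vals)

def converge(use_alpha, sweeps):
    Q = np.zeros((SIZE, SIZE))
    for _ in range(sweeps):
        for x in range(SIZE):
            for y in range(SIZE):
                target = R[x, y] + GAMMA * max_neighbor(Q, x, y)
                Q[x, y] = (1 - ALPHA) * Q[x, y] + ALPHA * target if use_alpha else target
    return Q

# The smoothed update needs far more sweeps to converge because ALPHA is small.
print(np.allclose(converge(True, 30000), converge(False, 200), atol=1e-3))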
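For reference, this R matrix is the one from the widely circulated 6-room Q-learning tutorial, where the update is usually written over (state, action) pairs rather than over grid neighbors: taking action a in room s lands in room a, the -1 entries mark impossible moves, and the max runs over the next room's row of Q. A minimal sketch of that formulation (the variable names and sweep count are my own):

import numpy as np

GAMMA = 0.8
SIZE = 6
R = np.asarray([[-1, -1, -1, -1,  0,  -1],
                [-1, -1, -1,  0, -1, 100],
                [-1, -1, -1,  0, -1,  -1],
                [-1,  0,  0, -1,  0,  -1],
                [ 0, -1, -1,  0, -1, 100],
                [-1,  0, -1, -1,  0, 100]])
Q = np.zeros((SIZE, SIZE))

for _ in range(500):           # full sweeps; plenty for GAMMA = 0.8
    for s in range(SIZE):      # current room
        for a in range(SIZE):  # action = room to move to
            if R[s, a] == -1:  # skip moves that are not allowed
                continue
            # Moving to room a makes it the next state, so the max is over Q[a, :].
            Q[s, a] = R[s, a] + GAMMA * Q[a].max()

print(np.round(Q / Q.max() * 100))  # normalize the table to 100 for readability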
Experimental results
Running the script prints the converged Q table.