Code Experiment
The test environment is the cliff-walking grid world, shown in the figure below:
The start is the bottom-left cell (3,0) and the goal is the bottom-right cell (3,11). The yellow square marks the current position and the dark purple cells are the cliff; stepping into the cliff ends the episode and the agent has to start over.
The full code is on my GitHub; clone it and run the Jupyter notebook directly:
https://github.com/Qxxxx/ReinforcementLearning.git
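For readers who want to poke at the environment outside the notebook, Gymnasium ships a comparable CliffWalking-v0 toy-text environment on the same 4x12 grid. The snippet below only illustrates that environment's API; it is not the repo's code, and its reward/termination conventions may differ slightly from the description above.

# Illustrative only: Gymnasium's built-in CliffWalking-v0 (4x12 grid,
# start at (3,0), goal at (3,11), cliff along the bottom row).
# The repo's notebook uses its own environment implementation.
import gymnasium as gym

env = gym.make("CliffWalking-v0")
state, info = env.reset()
for t in range(100):                # a few random steps, just to show the API
    action = env.action_space.sample()
    state, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        break
env.close()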
Q-Learning
import numpy as np  # the Agent base class is defined earlier in the notebook

class QLearningAgent(Agent):
    def __init__(self, actions, epsilon=0.01, alpha=0.5, gamma=1):
        super(QLearningAgent, self).__init__(actions)
        # Q-table as a dictionary keyed by "state action" strings,
        # plus exploration rate, learning rate and discount factor
        self.Q = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma

    def stateToString(self, state):
        # Turn a scalar or iterable state into a string key
        mystring = ""
        if np.isscalar(state):
            mystring = str(state)
        else:
            for digit in state:
                mystring += str(digit)
        return mystring

    def act(self, state):
        stateStr = self.stateToString(state)
        # Collect (and lazily initialize) the Q-values of every action in this state
        Q = self.num_actions * [0]
        for a in range(self.num_actions):
            if stateStr + ' %i' % a not in self.Q:
                self.Q[stateStr + ' %i' % a] = 0
            Q[a] = self.Q[stateStr + ' %i' % a]
        # Epsilon-greedy: explore with probability epsilon, otherwise pick an
        # action with the highest Q-value, breaking ties at random
        if self.epsilon == 0:
            choice = 0
        elif self.epsilon == 1:
            choice = 1
        else:
            choice = np.random.binomial(1, self.epsilon)
        if choice == 1:
            return np.random.randint(0, self.num_actions)
        m = max(Q)
        best_Q = [i for i, j in enumerate(Q) if j == m]
        return np.random.choice(best_Q)

    def learn(self, state1, action1, reward, state2, done):
        """
        Q-learning update:
        Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
        """
        state1Str = self.stateToString(state1)
        state2Str = self.stateToString(state2)
        # Q-values of the successor state (lazily initialized)
        Q = self.num_actions * [0]
        for a in range(self.num_actions):
            if state2Str + ' %i' % a not in self.Q:
                self.Q[state2Str + ' %i' % a] = 0
            Q[a] = self.Q[state2Str + ' %i' % a]
        # Off-policy target: bootstrap from the greedy value max(Q) of the next
        # state; `done` is not needed here because a terminal state's Q-values
        # stay at their initial value of 0
        self.Q[state1Str + ' %i' % action1] += \
            self.alpha * (reward + self.gamma * max(Q) -
                          self.Q[state1Str + ' %i' % action1])
SARSA
class SarsaAgent(Agent):
    def __init__(self, actions, epsilon=0.01, alpha=0.5, gamma=1):
        super(SarsaAgent, self).__init__(actions)
        # Same dictionary Q-table and hyperparameters as the Q-learning agent
        self.Q = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma

    def stateToString(self, state):
        # Turn a scalar or iterable state into a string key
        mystring = ""
        if np.isscalar(state):
            mystring = str(state)
        else:
            for digit in state:
                mystring += str(digit)
        return mystring

    def act(self, state):
        stateStr = self.stateToString(state)
        # Collect (and lazily initialize) the Q-values of every action in this state
        Q = self.num_actions * [0]
        for a in range(self.num_actions):
            if stateStr + ' %i' % a not in self.Q:
                self.Q[stateStr + ' %i' % a] = 0
            Q[a] = self.Q[stateStr + ' %i' % a]
        # Epsilon-greedy policy, identical to the Q-learning agent
        if self.epsilon == 0:
            choice = 0
        elif self.epsilon == 1:
            choice = 1
        else:
            choice = np.random.binomial(1, self.epsilon)
        if choice == 1:
            return np.random.randint(0, self.num_actions)
        m = max(Q)
        best_Q = [i for i, j in enumerate(Q) if j == m]
        return np.random.choice(best_Q)

    def learn(self, state1, action1, reward, state2, action2):
        """
        SARSA update:
        Q(s,a) <- Q(s,a) + alpha * (reward + gamma * Q(s',a') - Q(s,a))
        """
        state1Str = self.stateToString(state1)
        state2Str = self.stateToString(state2)
        # On-policy target: bootstrap from the action a' actually chosen in s';
        # Q(s',a') already exists because act(state2) initialized it
        self.Q[state1Str + ' %i' % action1] += \
            self.alpha * (reward + self.gamma * self.Q[state2Str + ' %i' % action2] -
                          self.Q[state1Str + ' %i' % action1])
Everything else is essentially identical between the two agents; the only difference is the learn function.
Below are the results of the two algorithms:
[Figure: training curve for SARSA]
[Figure: training curve for Q-Learning]
A few observations:
- Q-Learning converges slightly faster than SARSA, at least in this test environment.
- Q-Learning converges to reaching the goal in 13 steps (the optimal path), while SARSA converges to 17 steps.
- After convergence, Q-Learning falls into the cliff noticeably more often than SARSA. I have not fully worked out why (discussion welcome), but decaying epsilon over time mitigates it; see the sketch after this list.
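One simple way to decay epsilon is to shrink it after every episode down to a small floor, so the converged greedy policy is disturbed less and less often. A minimal sketch, assuming the training loops above:

# Minimal epsilon-decay sketch (assumed schedule, not from the repo):
# multiply epsilon by a decay factor after each episode, keeping a small floor
# so some exploration always remains.
decay, min_epsilon = 0.995, 0.001
for episode in range(500):
    # ... run one episode exactly as in the loops above ...
    agent.epsilon = max(min_epsilon, agent.epsilon * decay)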