with common.RewardTracker(writer, np.inf, group_rewards=100) as reward_tracker:
while True:
step_idx += 1
buffer.populate(1)
selector.epsilon = max(EPSILON_STOP, EPSILON_START - step_idx / EPSILON_STEPS)
new_rewards = exp_source.pop_rewards_steps()
if new_rewards:
reward_tracker.reward(new_rewards[0], step_idx, selector.epsilon)
- 只需要有
__enter__
方法,__exit__
方法就行了,
class RewardTracker:
def __init__(self, writer, stop_reward, group_rewards=1):
self.writer = writer
self.stop_reward = stop_reward
self.reward_buf = []
self.steps_buf = []
self.group_rewards = group_rewards
def __enter__(self):
self.ts = time.time()
self.ts_frame = 0
self.total_rewards = []
self.total_steps = []
return self
def __exit__(self, *args):
self.writer.close()
def reward(self, reward_steps, frame, epsilon=None):
reward, steps = reward_steps
self.reward_buf.append(reward)
self.steps_buf.append(steps)
if len(self.reward_buf) < self.group_rewards:
return False
reward = np.mean(self.reward_buf)
steps = np.mean(self.steps_buf)
self.reward_buf.clear()
self.steps_buf.clear()
self.total_rewards.append(reward)
self.total_steps.append(steps)
speed = (frame - self.ts_frame) / (time.time() - self.ts)
self.ts_frame = frame
self.ts = time.time()
mean_reward = np.mean(self.total_rewards[-100:])
mean_steps = np.mean(self.total_steps[-100:])
epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon
print("%d: done %d games, mean reward %.3f, mean steps %.2f, speed %.2f f/s%s" % (
frame, len(self.total_rewards)*self.group_rewards, mean_reward, mean_steps, speed, epsilon_str
))
sys.stdout.flush()
if epsilon is not None:
self.writer.add_scalar("epsilon", epsilon, frame)
self.writer.add_scalar("speed", speed, frame)
self.writer.add_scalar("reward_100", mean_reward, frame)
self.writer.add_scalar("reward", reward, frame)
self.writer.add_scalar("steps_100", mean_steps, frame)
self.writer.add_scalar("steps", steps, frame)
if mean_reward > self.stop_reward:
print("Solved in %d frames!" % frame)
return True
return False
网友评论