import sys
import logging
import itertools
import numpy as np
np.random.seed(0)
import gym
import gym.spaces as spaces
import gym.utils.seeding as seeding
from gym.envs.registration import register
# Route log records of level INFO and above to stdout, prefixing each one
# with an HH:MM:SS timestamp and the level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
)
class Observation:
    """Integer codes for the observations handed to the agent."""

    LEFT = 0
    RIGHT = 1
    START = 2  # placeholder observation, carries no LEFT/RIGHT information
class Action:
    """Integer codes for the actions an agent may choose."""

    LEFT = 0
    RIGHT = 1
    LISTEN = 2
class TigerEnv(gym.Env):
    """Partially observable two-state environment with a noisy LISTEN action.

    A hidden state (0 or 1) is drawn on every reset. LISTEN costs 1 and
    reports the hidden state correctly with probability 0.85. Either other
    action earns +10 when it equals the hidden state and -100 otherwise.

    Args:
        episodic: if True, a non-LISTEN action terminates the episode. If
            False, the hidden state is redrawn and play continues.
    """

    def __init__(self, episodic=True):
        self.action_space = spaces.Discrete(3)
        # Three distinct observation codes are emitted: LEFT (0), RIGHT (1)
        # and START (2), so the space needs three elements.
        self.observation_space = spaces.Discrete(3)
        self.episodic = episodic

    def reset(self, *, seed=None, options=None):
        """Redraw the hidden state and return ``(Observation.START, {})``.

        NOTE: the draw uses the global ``np.random`` state rather than
        ``self.np_random``, so ``seed`` is forwarded to ``gym.Env.reset`` but
        does not drive the sampling done here.
        """
        super().reset(seed=seed)
        self.state = np.random.choice(2)
        return Observation.START, {}  # placebo observation

    def step(self, action):
        """Apply ``action``.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``truncated`` is always False and ``info`` is always empty.
        """
        if action == Action.LISTEN:
            # The report is flipped with probability 0.15.
            if np.random.rand() > 0.85:
                observation = 1 - self.state
            else:
                observation = self.state
            reward = -1
            terminated = False
        else:
            observation = self.state
            reward = 10. if action == self.state else -100.
            if self.episodic:
                terminated = True
            else:
                terminated = False
                # reset() returns (observation, info); keep only the
                # observation so step() never hands back a tuple.
                observation, _ = self.reset()
        return observation, reward, terminated, False, {}
# "Tiger-v0": an episode ends as soon as a non-LISTEN action is taken.
register(id="Tiger-v0", entry_point=TigerEnv, kwargs={"episodic": True})

# "Tiger200-v0": the non-episodic variant, registered with a 200-step limit.
register(
    id="Tiger200-v0",
    entry_point=TigerEnv,
    kwargs={"episodic": False},
    max_episode_steps=200,
)

# Start with the episodic variant.
env = gym.make('Tiger-v0')
class Agent:
    """Counting policy: listen until one side leads by three observations.

    ``count`` is the number of LEFT observations minus the number of RIGHT
    observations since the last START observation (or reset).
    """

    def __init__(self, env=None):
        # `env` is accepted but unused. The counter is initialised here so
        # that step() is safe to call even before reset().
        self.count = 0

    def reset(self, mode=None):
        """Clear the observation counter; ``mode`` is ignored."""
        self.count = 0

    def step(self, observation, reward, terminated):
        """Update the counter from ``observation`` and choose an action.

        ``reward`` and ``terminated`` are not used by this policy.

        Returns:
            ``Action.LEFT`` when the counter exceeds 2, ``Action.RIGHT`` when
            it is below -2, otherwise ``Action.LISTEN``.
        """
        if observation == Observation.LEFT:
            self.count += 1
        elif observation == Observation.RIGHT:
            self.count -= 1
        else:  # observation == Observation.START
            self.count = 0

        if self.count > 2:
            action = Action.LEFT
        elif self.count < -2:
            action = Action.RIGHT
        else:
            action = Action.LISTEN
        return action

    def close(self):
        """Nothing to release."""
        pass
# One agent instance is shared by all evaluation runs below.
agent = Agent(env=env)
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env``.

    The agent is asked for an action on every observation, including the
    final one, so it always sees the terminal transition before the loop
    exits. ``agent.close()`` is called once the episode is over.

    Args:
        env: environment exposing ``reset(seed=...)``, ``step(action)`` and,
            when ``render`` is true, ``render()``.
        agent: object exposing ``reset(mode=...)``, ``step(observation,
            reward, terminated)`` and ``close()``.
        seed: forwarded to ``env.reset``.
        mode: forwarded to ``agent.reset``.
        render: call ``env.render()`` once per loop iteration when true.

    Returns:
        ``(episode_reward, elapsed_steps)``: the summed reward and the number
        of ``env.step`` calls made.
    """
    obs, _ = env.reset(seed=seed)
    last_reward, done, cut_short = 0., False, False
    agent.reset(mode=mode)
    total = 0.
    # The iteration index equals the number of env.step() calls made so far.
    for n_steps in itertools.count():
        chosen = agent.step(obs, last_reward, done)
        if render:
            env.render()
        if done or cut_short:
            break
        obs, last_reward, done, cut_short, _ = env.step(chosen)
        total += last_reward
    agent.close()
    return total, n_steps
# Evaluate the agent on the current `env` for 100 episodes, logging the
# total reward and step count of each one.
logging.info('==== test ====')
episode_rewards = []  # total reward of every episode, in play order
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
# Summarise the run as mean ± standard deviation of the episode rewards.
logging.info('average episode reward = %.2f ± %.2f',
             np.mean(episode_rewards), np.std(episode_rewards))
00:00:00 [INFO] ==== test ==== 00:00:00 [INFO] test episode 0: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 1: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 2: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 3: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 4: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 5: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 6: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 7: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 8: reward = 1.00, steps = 10 00:00:00 [INFO] test episode 9: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 10: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 11: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 12: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 13: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 14: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 15: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 16: reward = 1.00, steps = 10 00:00:00 [INFO] test episode 17: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 18: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 19: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 20: reward = 1.00, steps = 10 00:00:00 [INFO] test episode 21: reward = 1.00, steps = 10 00:00:00 [INFO] test episode 22: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 23: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 24: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 25: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 26: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 27: reward = -1.00, steps = 12 00:00:00 [INFO] test episode 28: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 29: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 30: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 31: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 32: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 33: reward = 7.00, steps = 4 
00:00:00 [INFO] test episode 34: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 35: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 36: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 37: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 38: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 39: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 40: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 41: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 42: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 43: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 44: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 45: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 46: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 47: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 48: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 49: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 50: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 51: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 52: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 53: reward = -107.00, steps = 8 00:00:00 [INFO] test episode 54: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 55: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 56: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 57: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 58: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 59: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 60: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 61: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 62: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 63: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 64: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 65: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 66: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 67: reward = 7.00, steps = 4 00:00:00 [INFO] test 
episode 68: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 69: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 70: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 71: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 72: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 73: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 74: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 75: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 76: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 77: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 78: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 79: reward = 1.00, steps = 10 00:00:00 [INFO] test episode 80: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 81: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 82: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 83: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 84: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 85: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 86: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 87: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 88: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 89: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 90: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 91: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 92: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 93: reward = 3.00, steps = 8 00:00:00 [INFO] test episode 94: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 95: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 96: reward = 5.00, steps = 6 00:00:00 [INFO] test episode 97: reward = -1.00, steps = 12 00:00:00 [INFO] test episode 98: reward = 7.00, steps = 4 00:00:00 [INFO] test episode 99: reward = 1.00, steps = 10 00:00:00 [INFO] average episode reward = 4.44 ± 11.39
# Switch to the non-episodic variant (registered with max_episode_steps=200)
# and repeat the same 100-episode evaluation with the same agent.
env = gym.make('Tiger200-v0')
logging.info('==== test ====')
episode_rewards = []  # rebinds the list used by the previous evaluation
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
# Summarise the run as mean ± standard deviation of the episode rewards.
logging.info('average episode reward = %.2f ± %.2f',
             np.mean(episode_rewards), np.std(episode_rewards))
00:00:00 [INFO] ==== test ==== 00:00:00 [INFO] test episode 0: reward = 185.00, steps = 200 00:00:00 [INFO] test episode 1: reward = 53.00, steps = 200 00:00:00 [INFO] test episode 2: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 3: reward = 207.00, steps = 200 00:00:00 [INFO] test episode 4: reward = 207.00, steps = 200 00:00:00 [INFO] test episode 5: reward = 207.00, steps = 200 00:00:00 [INFO] test episode 6: reward = 251.00, steps = 200 00:00:00 [INFO] test episode 7: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 8: reward = 251.00, steps = 200 00:00:00 [INFO] test episode 9: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 10: reward = 141.00, steps = 200 00:00:00 [INFO] test episode 11: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 12: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 13: reward = 207.00, steps = 200 00:00:00 [INFO] test episode 14: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 15: reward = 174.00, steps = 200 00:00:00 [INFO] test episode 16: reward = 251.00, steps = 200 00:00:00 [INFO] test episode 17: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 18: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 19: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 20: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 21: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 22: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 23: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 24: reward = 207.00, steps = 200 00:00:00 [INFO] test episode 25: reward = 185.00, steps = 200 00:00:00 [INFO] test episode 26: reward = 251.00, steps = 200 00:00:00 [INFO] test episode 27: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 28: reward = 163.00, steps = 200 00:00:00 [INFO] test episode 29: reward = 251.00, steps = 200 00:00:00 [INFO] test episode 30: reward = 97.00, steps = 200 00:00:00 [INFO] test episode 31: reward = 141.00, steps = 
200 00:00:00 [INFO] test episode 32: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 33: reward = 251.00, steps = 200 00:00:00 [INFO] test episode 34: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 35: reward = 174.00, steps = 200 00:00:00 [INFO] test episode 36: reward = -46.00, steps = 200 00:00:00 [INFO] test episode 37: reward = 251.00, steps = 200 00:00:00 [INFO] test episode 38: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 39: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 40: reward = 207.00, steps = 200 00:00:00 [INFO] test episode 41: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 42: reward = 207.00, steps = 200 00:00:00 [INFO] test episode 43: reward = 207.00, steps = 200 00:00:00 [INFO] test episode 44: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 45: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 46: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 47: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 48: reward = 163.00, steps = 200 00:00:00 [INFO] test episode 49: reward = 130.00, steps = 200 00:00:00 [INFO] test episode 50: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 51: reward = 75.00, steps = 200 00:00:00 [INFO] test episode 52: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 53: reward = 163.00, steps = 200 00:00:00 [INFO] test episode 54: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 55: reward = 119.00, steps = 200 00:00:00 [INFO] test episode 56: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 57: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 58: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 59: reward = 97.00, steps = 200 00:00:00 [INFO] test episode 60: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 61: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 62: reward = 152.00, steps = 200 00:00:00 [INFO] test episode 63: reward = -35.00, steps = 200 00:00:00 
[INFO] test episode 64: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 65: reward = 284.00, steps = 200 00:00:00 [INFO] test episode 66: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 67: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 68: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 69: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 70: reward = 262.00, steps = 200 00:00:00 [INFO] test episode 71: reward = 273.00, steps = 200 00:00:00 [INFO] test episode 72: reward = 207.00, steps = 200 00:00:00 [INFO] test episode 73: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 74: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 75: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 76: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 77: reward = 251.00, steps = 200 00:00:00 [INFO] test episode 78: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 79: reward = 53.00, steps = 200 00:00:00 [INFO] test episode 80: reward = 207.00, steps = 200 00:00:00 [INFO] test episode 81: reward = 97.00, steps = 200 00:00:00 [INFO] test episode 82: reward = 185.00, steps = 200 00:00:00 [INFO] test episode 83: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 84: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 85: reward = 229.00, steps = 200 00:00:00 [INFO] test episode 86: reward = 185.00, steps = 200 00:00:00 [INFO] test episode 87: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 88: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 89: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 90: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 91: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 92: reward = 196.00, steps = 200 00:00:00 [INFO] test episode 93: reward = 240.00, steps = 200 00:00:00 [INFO] test episode 94: reward = 119.00, steps = 200 00:00:00 [INFO] test episode 95: reward = 218.00, steps = 200 00:00:00 [INFO] test 
episode 96: reward = 251.00, steps = 200 00:00:00 [INFO] test episode 97: reward = 273.00, steps = 200 00:00:00 [INFO] test episode 98: reward = 218.00, steps = 200 00:00:00 [INFO] test episode 99: reward = 185.00, steps = 200 00:00:00 [INFO] average episode reward = 200.62 ± 56.69