import sys
import logging
import itertools
import numpy as np
np.random.seed(0)
import gym
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
env = gym.make('CliffWalking-v0')
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
for key in vars(env.unwrapped):
    logging.info('%s: %s', key, vars(env.unwrapped)[key])
00:00:00 [INFO] id: CliffWalking-v0
00:00:00 [INFO] entry_point: gym.envs.toy_text:CliffWalkingEnv
00:00:00 [INFO] reward_threshold: None
00:00:00 [INFO] nondeterministic: False
00:00:00 [INFO] max_episode_steps: None
00:00:00 [INFO] order_enforce: True
00:00:00 [INFO] _kwargs: {}
00:00:00 [INFO] _env_name: CliffWalking
00:00:00 [INFO] shape: (4, 12)
00:00:00 [INFO] start_state_index: 36
00:00:00 [INFO] _cliff: [[False False False False False False False False False False False False]
 [False False False False False False False False False False False False]
 [False False False False False False False False False False False False]
 [False True True True True True True True True True True False]]
00:00:00 [INFO] P: {
    0: {0: [(1.0, 0, -1, False)], 1: [(1.0, 1, -1, False)], 2: [(1.0, 12, -1, False)], 3: [(1.0, 0, -1, False)]},
    1: {0: [(1.0, 1, -1, False)], 1: [(1.0, 2, -1, False)], 2: [(1.0, 13, -1, False)], 3: [(1.0, 0, -1, False)]},
    2: {0: [(1.0, 2, -1, False)], 1: [(1.0, 3, -1, False)], 2: [(1.0, 14, -1, False)], 3: [(1.0, 1, -1, False)]},
    3: {0: [(1.0, 3, -1, False)], 1: [(1.0, 4, -1, False)], 2: [(1.0, 15, -1, False)], 3: [(1.0, 2, -1, False)]},
    4: {0: [(1.0, 4, -1, False)], 1: [(1.0, 5, -1, False)], 2: [(1.0, 16, -1, False)], 3: [(1.0, 3, -1, False)]},
    5: {0: [(1.0, 5, -1, False)], 1: [(1.0, 6, -1, False)], 2: [(1.0, 17, -1, False)], 3: [(1.0, 4, -1, False)]},
    6: {0: [(1.0, 6, -1, False)], 1: [(1.0, 7, -1, False)], 2: [(1.0, 18, -1, False)], 3: [(1.0, 5, -1, False)]},
    7: {0: [(1.0, 7, -1, False)], 1: [(1.0, 8, -1, False)], 2: [(1.0, 19, -1, False)], 3: [(1.0, 6, -1, False)]},
    8: {0: [(1.0, 8, -1, False)], 1: [(1.0, 9, -1, False)], 2: [(1.0, 20, -1, False)], 3: [(1.0, 7, -1, False)]},
    9: {0: [(1.0, 9, -1, False)], 1: [(1.0, 10, -1, False)], 2: [(1.0, 21, -1, False)], 3: [(1.0, 8, -1, False)]},
    10: {0: [(1.0, 10, -1, False)], 1: [(1.0, 11, -1, False)], 2: [(1.0, 22, -1, False)], 3: [(1.0, 9, -1, False)]},
    11: {0: [(1.0, 11, -1, False)], 1: [(1.0, 11, -1, False)], 2: [(1.0, 23, -1, False)], 3: [(1.0, 10, -1, False)]},
    12: {0: [(1.0, 0, -1, False)], 1: [(1.0, 13, -1, False)], 2: [(1.0, 24, -1, False)], 3: [(1.0, 12, -1, False)]},
    13: {0: [(1.0, 1, -1, False)], 1: [(1.0, 14, -1, False)], 2: [(1.0, 25, -1, False)], 3: [(1.0, 12, -1, False)]},
    14: {0: [(1.0, 2, -1, False)], 1: [(1.0, 15, -1, False)], 2: [(1.0, 26, -1, False)], 3: [(1.0, 13, -1, False)]},
    15: {0: [(1.0, 3, -1, False)], 1: [(1.0, 16, -1, False)], 2: [(1.0, 27, -1, False)], 3: [(1.0, 14, -1, False)]},
    16: {0: [(1.0, 4, -1, False)], 1: [(1.0, 17, -1, False)], 2: [(1.0, 28, -1, False)], 3: [(1.0, 15, -1, False)]},
    17: {0: [(1.0, 5, -1, False)], 1: [(1.0, 18, -1, False)], 2: [(1.0, 29, -1, False)], 3: [(1.0, 16, -1, False)]},
    18: {0: [(1.0, 6, -1, False)], 1: [(1.0, 19, -1, False)], 2: [(1.0, 30, -1, False)], 3: [(1.0, 17, -1, False)]},
    19: {0: [(1.0, 7, -1, False)], 1: [(1.0, 20, -1, False)], 2: [(1.0, 31, -1, False)], 3: [(1.0, 18, -1, False)]},
    20: {0: [(1.0, 8, -1, False)], 1: [(1.0, 21, -1, False)], 2: [(1.0, 32, -1, False)], 3: [(1.0, 19, -1, False)]},
    21: {0: [(1.0, 9, -1, False)], 1: [(1.0, 22, -1, False)], 2: [(1.0, 33, -1, False)], 3: [(1.0, 20, -1, False)]},
    22: {0: [(1.0, 10, -1, False)], 1: [(1.0, 23, -1, False)], 2: [(1.0, 34, -1, False)], 3: [(1.0, 21, -1, False)]},
    23: {0: [(1.0, 11, -1, False)], 1: [(1.0, 23, -1, False)], 2: [(1.0, 35, -1, False)], 3: [(1.0, 22, -1, False)]},
    24: {0: [(1.0, 12, -1, False)], 1: [(1.0, 25, -1, False)], 2: [(1.0, 36, -1, False)], 3: [(1.0, 24, -1, False)]},
    25: {0: [(1.0, 13, -1, False)], 1: [(1.0, 26, -1, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 24, -1, False)]},
    26: {0: [(1.0, 14, -1, False)], 1: [(1.0, 27, -1, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 25, -1, False)]},
    27: {0: [(1.0, 15, -1, False)], 1: [(1.0, 28, -1, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 26, -1, False)]},
    28: {0: [(1.0, 16, -1, False)], 1: [(1.0, 29, -1, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 27, -1, False)]},
    29: {0: [(1.0, 17, -1, False)], 1: [(1.0, 30, -1, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 28, -1, False)]},
    30: {0: [(1.0, 18, -1, False)], 1: [(1.0, 31, -1, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 29, -1, False)]},
    31: {0: [(1.0, 19, -1, False)], 1: [(1.0, 32, -1, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 30, -1, False)]},
    32: {0: [(1.0, 20, -1, False)], 1: [(1.0, 33, -1, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 31, -1, False)]},
    33: {0: [(1.0, 21, -1, False)], 1: [(1.0, 34, -1, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 32, -1, False)]},
    34: {0: [(1.0, 22, -1, False)], 1: [(1.0, 35, -1, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 33, -1, False)]},
    35: {0: [(1.0, 23, -1, False)], 1: [(1.0, 35, -1, False)], 2: [(1.0, 47, -1, True)], 3: [(1.0, 34, -1, False)]},
    36: {0: [(1.0, 24, -1, False)], 1: [(1.0, 36, -100, False)], 2: [(1.0, 36, -1, False)], 3: [(1.0, 36, -1, False)]},
    37: {0: [(1.0, 25, -1, False)], 1: [(1.0, 36, -100, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 36, -1, False)]},
    38: {0: [(1.0, 26, -1, False)], 1: [(1.0, 36, -100, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 36, -100, False)]},
    39: {0: [(1.0, 27, -1, False)], 1: [(1.0, 36, -100, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 36, -100, False)]},
    40: {0: [(1.0, 28, -1, False)], 1: [(1.0, 36, -100, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 36, -100, False)]},
    41: {0: [(1.0, 29, -1, False)], 1: [(1.0, 36, -100, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 36, -100, False)]},
    42: {0: [(1.0, 30, -1, False)], 1: [(1.0, 36, -100, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 36, -100, False)]},
    43: {0: [(1.0, 31, -1, False)], 1: [(1.0, 36, -100, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 36, -100, False)]},
    44: {0: [(1.0, 32, -1, False)], 1: [(1.0, 36, -100, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 36, -100, False)]},
    45: {0: [(1.0, 33, -1, False)], 1: [(1.0, 36, -100, False)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 36, -100, False)]},
    46: {0: [(1.0, 34, -1, False)], 1: [(1.0, 47, -1, True)], 2: [(1.0, 36, -100, False)], 3: [(1.0, 36, -100, False)]},
    47: {0: [(1.0, 35, -1, False)], 1: [(1.0, 47, -1, True)], 2: [(1.0, 47, -1, True)], 3: [(1.0, 36, -100, False)]}}
00:00:00 [INFO] isd: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
00:00:00 [INFO] lastaction: None
00:00:00 [INFO] nS: 48
00:00:00 [INFO] nA: 4
00:00:00 [INFO] action_space: Discrete(4)
00:00:00 [INFO] observation_space: Discrete(48)
00:00:00 [INFO] np_random: RandomState(MT19937)
00:00:00 [INFO] s: 36
00:00:00 [INFO] spec: EnvSpec(CliffWalking-v0)
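The dump is easiest to read against the (4, 12) grid: a flat state index s sits at row s // 12 and column s % 12, the start state 36 is the bottom-left corner, the goal state 47 is the bottom-right corner, and _cliff marks columns 1 through 10 of the bottom row as the cliff. The transition table P shows the penalty structure: any move onto a cliff cell sends the agent back to the start with reward -100. As a quick sanity check of the table above (state_to_coords is an ad-hoc helper for this sketch, not part of the Gym API):

def state_to_coords(state, ncols=12):
    # decode a flat state index into (row, col) on the 4 x 12 grid
    return state // ncols, state % ncols

# stepping down (action 2) from state 25 lands on a cliff cell:
# the agent is sent back to the start state 36 with reward -100
prob, next_state, reward, terminated = env.unwrapped.P[25][2][0]
assert (next_state, reward, terminated) == (36, -100, False)
logging.info('%s -down-> %s, reward = %s',
        state_to_coords(25), state_to_coords(next_state), reward)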
class ClosedFormAgent:
    def __init__(self, _):
        pass

    def reset(self, mode=None):
        pass

    def step(self, observation, reward, terminated):
        # actions: 0 = up, 1 = right, 2 = down, 3 = left
        if observation == 36:  # at the start state: move up, off the cliff edge
            action = 0
        elif observation % 12 == 11:  # in the rightmost column: move down to the goal
            action = 2
        else:  # otherwise: move right along the row above the cliff
            action = 1
        return action

    def close(self):
        pass
agent = ClosedFormAgent(env)
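This closed-form policy walks the shortest safe path: one step up out of the start, eleven steps right along the row above the cliff, and one step down into the goal. That is 13 steps at reward -1 each, so every episode should return exactly -13. As a hedged sanity check, the policy can be replayed directly through the transition table P logged above, without touching the environment's live state:

# replay the closed-form policy through the deterministic table P;
# the expected result is return = -13 over 13 steps
state, total_reward, step_count, terminated = 36, 0, 0, False
while not terminated:
    action = agent.step(state, 0., terminated)
    _, state, r, terminated = env.unwrapped.P[state][action][0]
    total_reward += r
    step_count += 1
logging.info('closed-form replay: return = %s, steps = %s',
        total_reward, step_count)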
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break  # the agent has seen the final observation and reward
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps
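A note on the render flag: under the Gym 0.26+ API this listing targets (env.reset() returning a tuple, env.step() returning five values), env.render() only produces output if a render_mode was requested from gym.make(). The sketch below is one way to watch the agent on a separate environment instance; it assumes the text renderer 'ansi' is available for this toy-text environment, in which case each render() call returns the board as a string:

render_env = gym.make('CliffWalking-v0', render_mode='ansi')  # assumes 'ansi' is supported
observation, _ = render_env.reset()
terminated, truncated = False, False
while not (terminated or truncated):
    action = agent.step(observation, 0., terminated)
    observation, _, terminated, truncated, _ = render_env.step(action)
    print(render_env.render())  # in 'ansi' mode, render() returns one text frame
render_env.close()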
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
00:00:00 [INFO] ==== test ====
00:00:00 [INFO] test episode 0: reward = -13.00, steps = 13
00:00:00 [INFO] test episode 1: reward = -13.00, steps = 13
00:00:00 [INFO] test episode 2: reward = -13.00, steps = 13
(test episodes 3 through 98 are identical: reward = -13.00, steps = 13)
00:00:00 [INFO] test episode 99: reward = -13.00, steps = 13
00:00:00 [INFO] average episode reward = -13.00 ± 0.00
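The zero standard deviation is expected: every entry of P logged above has a single outcome with probability 1.0, and isd places all initial probability on state 36, so the environment is fully deterministic and all 100 test episodes are identical. That claim can be checked directly against the transition table:

# verify determinism: every (state, action) pair has exactly one
# possible outcome, and that outcome occurs with probability 1.0
assert all(len(outcomes) == 1 and outcomes[0][0] == 1.0
        for transitions in env.unwrapped.P.values()
        for outcomes in transitions.values())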
env.close()