Use Closed-Form Policy to Play Acrobot-v1¶

In [1]:
import sys
import logging
import itertools  # NOTE(review): not used in the visible cells — confirm before removing

import numpy as np
np.random.seed(0)  # seed the legacy global NumPy RNG for reproducibility
import gym

# Route INFO-level log records to stdout with a HH:MM:SS timestamp prefix.
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
# Create the Acrobot-v1 environment and log its spec and internal attributes
# so the run record documents the exact environment configuration.
env = gym.make('Acrobot-v1')
for name, value in vars(env.spec).items():
    logging.info('%s: %s', name, value)
for name, value in vars(env.unwrapped).items():
    logging.info('%s: %s', name, value)
00:00:00 [INFO] id: Acrobot-v1
00:00:00 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv
00:00:00 [INFO] reward_threshold: -100.0
00:00:00 [INFO] nondeterministic: False
00:00:00 [INFO] max_episode_steps: 500
00:00:00 [INFO] order_enforce: True
00:00:00 [INFO] _kwargs: {}
00:00:00 [INFO] _env_name: Acrobot
00:00:00 [INFO] viewer: None
00:00:00 [INFO] observation_space: Box([ -1.        -1.        -1.        -1.       -12.566371 -28.274334], [ 1.        1.        1.        1.       12.566371 28.274334], (6,), float32)
00:00:00 [INFO] action_space: Discrete(3)
00:00:00 [INFO] state: None
00:00:00 [INFO] np_random: RandomState(MT19937)
00:00:00 [INFO] spec: EnvSpec(Acrobot-v1)
In [3]:
class ClosedFormAgent:
    """A fixed, hand-crafted (non-learning) policy for Acrobot-v1.

    The agent ignores rewards entirely; its action is a pure function of
    the current observation.
    """

    def __init__(self, _):
        # The environment argument is accepted for interface uniformity
        # but not used — this agent has no parameters to configure.
        pass

    def reset(self, mode=None):
        # Stateless policy: nothing to reset between episodes.
        pass

    def step(self, observation, reward, terminated):
        """Choose an action from the observation alone.

        observation unpacks as (x0, y0, x1, y1, v0, v1): the cos/sin pairs
        of the two joint angles followed by the two angular velocities
        (matching the 6-dim Box observation space logged above).
        Returns 0 or 2 (torque direction); action 1 (no torque) is never used.
        """
        x0, y0, x1, y1, v0, v1 = observation
        # Fast second joint: push along the direction of its swing.
        if v1 < -0.3:
            return 0
        if v1 > 0.3:
            return 2
        # Slow second joint: decide by a height-like combination of the
        # angle components and push downhill from it.
        height_term = y1 + x0 * y1 + x1 * y0
        return 0 if height_term > 0. else 2

    def close(self):
        # No resources held.
        pass


# Instantiate the closed-form agent (the env argument is ignored by __init__).
agent = ClosedFormAgent(env)
In [4]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run a single episode of `agent` in `env` (gym >= 0.26 API).

    The agent is shown the final observation/reward (its `step` is called
    once more after termination, before the loop breaks) so learning agents
    could use the terminal transition; this closed-form agent ignores it.

    Returns:
        (total_reward, step_count): the undiscounted return and the number
        of environment steps taken.
    """
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    total_reward = 0.
    step_count = 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        step_count += 1
    agent.close()
    return total_reward, step_count


# Evaluate the fixed policy over 100 episodes and report the mean +/- std
# of the episode returns (reward threshold for "solved" is -100).
logging.info('==== test ====')
test_returns = []
for episode_index in range(100):
    episode_return, n_steps = play_episode(env, agent)
    test_returns.append(episode_return)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode_index, episode_return, n_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(test_returns), np.std(test_returns))
00:00:00 [INFO] ==== test ====
00:00:00 [INFO] test episode 0: reward = -70.00, steps = 71
00:00:00 [INFO] test episode 1: reward = -264.00, steps = 265
00:00:00 [INFO] test episode 2: reward = -71.00, steps = 72
00:00:00 [INFO] test episode 3: reward = -78.00, steps = 79
00:00:00 [INFO] test episode 4: reward = -64.00, steps = 65
00:00:01 [INFO] test episode 5: reward = -157.00, steps = 158
00:00:01 [INFO] test episode 6: reward = -71.00, steps = 72
00:00:01 [INFO] test episode 7: reward = -74.00, steps = 75
00:00:01 [INFO] test episode 8: reward = -91.00, steps = 92
00:00:01 [INFO] test episode 9: reward = -138.00, steps = 139
00:00:01 [INFO] test episode 10: reward = -92.00, steps = 93
00:00:01 [INFO] test episode 11: reward = -90.00, steps = 91
00:00:01 [INFO] test episode 12: reward = -107.00, steps = 108
00:00:01 [INFO] test episode 13: reward = -85.00, steps = 86
00:00:01 [INFO] test episode 14: reward = -78.00, steps = 79
00:00:01 [INFO] test episode 15: reward = -85.00, steps = 86
00:00:01 [INFO] test episode 16: reward = -64.00, steps = 65
00:00:01 [INFO] test episode 17: reward = -72.00, steps = 73
00:00:01 [INFO] test episode 18: reward = -72.00, steps = 73
00:00:01 [INFO] test episode 19: reward = -86.00, steps = 87
00:00:01 [INFO] test episode 20: reward = -82.00, steps = 83
00:00:01 [INFO] test episode 21: reward = -177.00, steps = 178
00:00:01 [INFO] test episode 22: reward = -78.00, steps = 79
00:00:01 [INFO] test episode 23: reward = -207.00, steps = 208
00:00:01 [INFO] test episode 24: reward = -95.00, steps = 96
00:00:01 [INFO] test episode 25: reward = -80.00, steps = 81
00:00:01 [INFO] test episode 26: reward = -64.00, steps = 65
00:00:01 [INFO] test episode 27: reward = -73.00, steps = 74
00:00:01 [INFO] test episode 28: reward = -75.00, steps = 76
00:00:01 [INFO] test episode 29: reward = -143.00, steps = 144
00:00:01 [INFO] test episode 30: reward = -64.00, steps = 65
00:00:01 [INFO] test episode 31: reward = -64.00, steps = 65
00:00:01 [INFO] test episode 32: reward = -96.00, steps = 97
00:00:01 [INFO] test episode 33: reward = -80.00, steps = 81
00:00:01 [INFO] test episode 34: reward = -64.00, steps = 65
00:00:01 [INFO] test episode 35: reward = -93.00, steps = 94
00:00:01 [INFO] test episode 36: reward = -71.00, steps = 72
00:00:01 [INFO] test episode 37: reward = -71.00, steps = 72
00:00:01 [INFO] test episode 38: reward = -64.00, steps = 65
00:00:01 [INFO] test episode 39: reward = -71.00, steps = 72
00:00:01 [INFO] test episode 40: reward = -85.00, steps = 86
00:00:01 [INFO] test episode 41: reward = -71.00, steps = 72
00:00:01 [INFO] test episode 42: reward = -79.00, steps = 80
00:00:01 [INFO] test episode 43: reward = -64.00, steps = 65
00:00:01 [INFO] test episode 44: reward = -65.00, steps = 66
00:00:01 [INFO] test episode 45: reward = -96.00, steps = 97
00:00:01 [INFO] test episode 46: reward = -84.00, steps = 85
00:00:01 [INFO] test episode 47: reward = -78.00, steps = 79
00:00:01 [INFO] test episode 48: reward = -78.00, steps = 79
00:00:01 [INFO] test episode 49: reward = -91.00, steps = 92
00:00:02 [INFO] test episode 50: reward = -86.00, steps = 87
00:00:02 [INFO] test episode 51: reward = -80.00, steps = 81
00:00:02 [INFO] test episode 52: reward = -71.00, steps = 72
00:00:02 [INFO] test episode 53: reward = -87.00, steps = 88
00:00:02 [INFO] test episode 54: reward = -77.00, steps = 78
00:00:02 [INFO] test episode 55: reward = -141.00, steps = 142
00:00:02 [INFO] test episode 56: reward = -76.00, steps = 77
00:00:02 [INFO] test episode 57: reward = -77.00, steps = 78
00:00:02 [INFO] test episode 58: reward = -89.00, steps = 90
00:00:02 [INFO] test episode 59: reward = -93.00, steps = 94
00:00:02 [INFO] test episode 60: reward = -85.00, steps = 86
00:00:02 [INFO] test episode 61: reward = -93.00, steps = 94
00:00:02 [INFO] test episode 62: reward = -80.00, steps = 81
00:00:02 [INFO] test episode 63: reward = -172.00, steps = 173
00:00:02 [INFO] test episode 64: reward = -91.00, steps = 92
00:00:02 [INFO] test episode 65: reward = -65.00, steps = 66
00:00:02 [INFO] test episode 66: reward = -78.00, steps = 79
00:00:02 [INFO] test episode 67: reward = -80.00, steps = 81
00:00:02 [INFO] test episode 68: reward = -125.00, steps = 126
00:00:02 [INFO] test episode 69: reward = -85.00, steps = 86
00:00:02 [INFO] test episode 70: reward = -72.00, steps = 73
00:00:02 [INFO] test episode 71: reward = -70.00, steps = 71
00:00:02 [INFO] test episode 72: reward = -96.00, steps = 97
00:00:02 [INFO] test episode 73: reward = -100.00, steps = 101
00:00:02 [INFO] test episode 74: reward = -78.00, steps = 79
00:00:02 [INFO] test episode 75: reward = -72.00, steps = 73
00:00:02 [INFO] test episode 76: reward = -500.00, steps = 500
00:00:02 [INFO] test episode 77: reward = -177.00, steps = 178
00:00:02 [INFO] test episode 78: reward = -71.00, steps = 72
00:00:02 [INFO] test episode 79: reward = -76.00, steps = 77
00:00:02 [INFO] test episode 80: reward = -72.00, steps = 73
00:00:02 [INFO] test episode 81: reward = -85.00, steps = 86
00:00:02 [INFO] test episode 82: reward = -85.00, steps = 86
00:00:02 [INFO] test episode 83: reward = -97.00, steps = 98
00:00:02 [INFO] test episode 84: reward = -70.00, steps = 71
00:00:02 [INFO] test episode 85: reward = -94.00, steps = 95
00:00:02 [INFO] test episode 86: reward = -79.00, steps = 80
00:00:03 [INFO] test episode 87: reward = -233.00, steps = 234
00:00:03 [INFO] test episode 88: reward = -80.00, steps = 81
00:00:03 [INFO] test episode 89: reward = -71.00, steps = 72
00:00:03 [INFO] test episode 90: reward = -72.00, steps = 73
00:00:03 [INFO] test episode 91: reward = -98.00, steps = 99
00:00:03 [INFO] test episode 92: reward = -72.00, steps = 73
00:00:03 [INFO] test episode 93: reward = -84.00, steps = 85
00:00:03 [INFO] test episode 94: reward = -93.00, steps = 94
00:00:03 [INFO] test episode 95: reward = -91.00, steps = 92
00:00:03 [INFO] test episode 96: reward = -82.00, steps = 83
00:00:03 [INFO] test episode 97: reward = -90.00, steps = 91
00:00:03 [INFO] test episode 98: reward = -72.00, steps = 73
00:00:03 [INFO] test episode 99: reward = -238.00, steps = 239
00:00:03 [INFO] average episode reward = -96.13 ± 55.15
In [5]:
# Release the environment's resources now that evaluation is finished.
env.close()