Use a Closed-Form Policy to Play PongNoFrameskip-v4

In [1]:
import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import gym

# Send INFO-and-above records to stdout, each prefixed with a wall-clock
# timestamp (HH:MM:SS) and the level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
)
In [2]:
# Create the Pong environment, then log every instance attribute of the
# environment object followed by every attribute of its spec.
env = gym.make('PongNoFrameskip-v4')
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
00:00:00 [INFO] env: <AtariEnv<PongNoFrameskip-v4>>
00:00:00 [INFO] action_space: Discrete(6)
00:00:00 [INFO] observation_space: Box(0, 255, (210, 160, 3), uint8)
00:00:00 [INFO] reward_range: (-inf, inf)
00:00:00 [INFO] metadata: {'render.modes': ['human', 'rgb_array']}
00:00:00 [INFO] _max_episode_steps: 400000
00:00:00 [INFO] _elapsed_steps: None
00:00:00 [INFO] id: PongNoFrameskip-v4
00:00:00 [INFO] entry_point: gym.envs.atari:AtariEnv
00:00:00 [INFO] reward_threshold: None
00:00:00 [INFO] nondeterministic: False
00:00:00 [INFO] max_episode_steps: 400000
00:00:00 [INFO] _kwargs: {'game': 'pong', 'obs_type': 'image', 'frameskip': 1}
00:00:00 [INFO] _env_name: PongNoFrameskip
In [3]:
class ClosedFormAgent:
    """Stateless, hand-coded policy that compares racket and ball row positions.

    The agent learns nothing: every decision is computed from the current
    frame alone. ``reset`` and ``close`` are no-ops.
    """

    def __init__(self, _):
        """Accept (and ignore) one positional argument; no state is stored."""
        pass

    def reset(self, mode=None):
        """Do nothing; ``mode`` is accepted but unused."""
        pass

    def step(self, observation, reward, terminated):
        """Choose an action from a single frame.

        Args:
            observation: array indexed as ``[row, column, channel]``; only
                rows 34..192 of channel 0 are inspected.
                # assumes an RGB frame of shape (210, 160, 3) -- TODO confirm
            reward: unused.
            terminated: unused.

        Returns:
            int: 3 if the mean racket row index is strictly smaller than the
            mean ball row index, otherwise 2. Also 2 when either the racket
            or the ball cannot be found in the inspected region.
            # NOTE(review): presumably actions 2 and 3 move the racket in
            # opposite vertical directions -- confirm against the
            # environment's action meanings.
        """
        # NOTE(review): rows 34:193 presumably crop the frame to the playing
        # field, and 92 / 236 are presumably the channel-0 values of the
        # agent's racket and of the ball -- confirm against a rendered frame.
        field = observation[34:193, :, 0]
        racket_rows = np.flatnonzero((field == 92).any(axis=1))
        ball_rows = np.flatnonzero((field == 236).any(axis=1))
        if racket_rows.size == 0 or ball_rows.size == 0:
            # Taking the mean of an empty array yields NaN and emits a
            # RuntimeWarning; the NaN comparison would then be False and
            # select action 2. Return that same action directly instead.
            return 2
        return 2 + int(racket_rows.mean() < ball_rows.mean())

    def close(self):
        """Do nothing; the agent holds no resources."""
        pass


# ClosedFormAgent.__init__ ignores its argument, so passing env here has no
# effect on the agent.
agent = ClosedFormAgent(env)
In [4]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run ``agent`` in ``env`` for one episode and return the totals.

    Args:
        env: object providing ``reset(seed=...)`` returning
            ``(observation, info)``, ``step(action)`` returning
            ``(observation, reward, terminated, truncated, info)``, and
            ``render()``.
        agent: object providing ``reset(mode=...)``,
            ``step(observation, reward, terminated)`` and ``close()``.
        seed: forwarded to ``env.reset``.
        mode: forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called once per agent decision.

    Returns:
        tuple: ``(episode_reward, elapsed_steps)`` -- the sum of all rewards
        returned by ``env.step`` and the number of ``env.step`` calls.
    """
    observation, _ = env.reset(seed=seed)
    reward = 0.
    terminated = truncated = False
    agent.reset(mode=mode)
    episode_reward = 0.
    elapsed_steps = 0

    episode_over = False
    while not episode_over:
        # The agent is queried on every observation, including the last one;
        # the action it returns for the final observation is never executed.
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        episode_over = bool(terminated or truncated)
        if not episode_over:
            observation, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            elapsed_steps += 1

    agent.close()
    return episode_reward, elapsed_steps


# Evaluation: play 100 episodes with the agent, log each episode's total
# reward and step count, then log the mean and standard deviation of the
# episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards += [episode_reward]
    logging.info(
        'test episode %d: reward = %.2f, steps = %d',
        episode, episode_reward, elapsed_steps,
    )
reward_mean = np.mean(episode_rewards)
reward_std = np.std(episode_rewards)
logging.info('average episode reward = %.2f ± %.2f', reward_mean, reward_std)
00:05:52 [INFO] ==== test ====
00:06:35 [INFO] test episode 0: reward = 21.00, steps = 29819
00:07:17 [INFO] test episode 1: reward = 21.00, steps = 29819
00:07:58 [INFO] test episode 2: reward = 21.00, steps = 29819
00:08:43 [INFO] test episode 3: reward = 21.00, steps = 29819
00:09:26 [INFO] test episode 4: reward = 21.00, steps = 29819
00:10:09 [INFO] test episode 5: reward = 21.00, steps = 29819
00:10:53 [INFO] test episode 6: reward = 21.00, steps = 29819
00:11:37 [INFO] test episode 7: reward = 21.00, steps = 29819
00:12:22 [INFO] test episode 8: reward = 21.00, steps = 29819
00:13:09 [INFO] test episode 9: reward = 21.00, steps = 29819
00:13:54 [INFO] test episode 10: reward = 21.00, steps = 29819
00:14:37 [INFO] test episode 11: reward = 21.00, steps = 29819
00:15:20 [INFO] test episode 12: reward = 21.00, steps = 29819
00:16:02 [INFO] test episode 13: reward = 21.00, steps = 29819
00:16:44 [INFO] test episode 14: reward = 21.00, steps = 29819
00:17:27 [INFO] test episode 15: reward = 21.00, steps = 29819
00:18:10 [INFO] test episode 16: reward = 21.00, steps = 29819
00:18:54 [INFO] test episode 17: reward = 21.00, steps = 29819
00:19:38 [INFO] test episode 18: reward = 21.00, steps = 29819
00:20:21 [INFO] test episode 19: reward = 21.00, steps = 29819
00:21:03 [INFO] test episode 20: reward = 21.00, steps = 29819
00:21:46 [INFO] test episode 21: reward = 21.00, steps = 29819
00:22:29 [INFO] test episode 22: reward = 21.00, steps = 29819
00:23:12 [INFO] test episode 23: reward = 21.00, steps = 29819
00:23:55 [INFO] test episode 24: reward = 21.00, steps = 29819
00:24:37 [INFO] test episode 25: reward = 21.00, steps = 29819
00:25:20 [INFO] test episode 26: reward = 21.00, steps = 29819
00:26:03 [INFO] test episode 27: reward = 21.00, steps = 29819
00:26:47 [INFO] test episode 28: reward = 21.00, steps = 29819
00:27:30 [INFO] test episode 29: reward = 21.00, steps = 29819
00:28:14 [INFO] test episode 30: reward = 21.00, steps = 29819
00:28:57 [INFO] test episode 31: reward = 21.00, steps = 29819
00:29:42 [INFO] test episode 32: reward = 21.00, steps = 29819
00:30:24 [INFO] test episode 33: reward = 21.00, steps = 29819
00:31:08 [INFO] test episode 34: reward = 21.00, steps = 29819
00:31:51 [INFO] test episode 35: reward = 21.00, steps = 29819
00:32:34 [INFO] test episode 36: reward = 21.00, steps = 29819
00:33:17 [INFO] test episode 37: reward = 21.00, steps = 29819
00:34:00 [INFO] test episode 38: reward = 21.00, steps = 29819
00:34:44 [INFO] test episode 39: reward = 21.00, steps = 29819
00:35:27 [INFO] test episode 40: reward = 21.00, steps = 29819
00:36:10 [INFO] test episode 41: reward = 21.00, steps = 29819
00:36:52 [INFO] test episode 42: reward = 21.00, steps = 29819
00:37:35 [INFO] test episode 43: reward = 21.00, steps = 29819
00:38:19 [INFO] test episode 44: reward = 21.00, steps = 29819
00:39:03 [INFO] test episode 45: reward = 21.00, steps = 29819
00:39:47 [INFO] test episode 46: reward = 21.00, steps = 29819
00:40:31 [INFO] test episode 47: reward = 21.00, steps = 29819
00:41:15 [INFO] test episode 48: reward = 21.00, steps = 29819
00:41:58 [INFO] test episode 49: reward = 21.00, steps = 29819
00:42:41 [INFO] test episode 50: reward = 21.00, steps = 29819
00:43:24 [INFO] test episode 51: reward = 21.00, steps = 29819
00:44:07 [INFO] test episode 52: reward = 21.00, steps = 29819
00:44:50 [INFO] test episode 53: reward = 21.00, steps = 29819
00:45:35 [INFO] test episode 54: reward = 21.00, steps = 29819
00:46:20 [INFO] test episode 55: reward = 21.00, steps = 29819
00:47:07 [INFO] test episode 56: reward = 21.00, steps = 29819
00:47:53 [INFO] test episode 57: reward = 21.00, steps = 29819
00:48:37 [INFO] test episode 58: reward = 21.00, steps = 29819
00:49:22 [INFO] test episode 59: reward = 21.00, steps = 29819
00:50:06 [INFO] test episode 60: reward = 21.00, steps = 29819
00:50:49 [INFO] test episode 61: reward = 21.00, steps = 29819
00:51:34 [INFO] test episode 62: reward = 21.00, steps = 29819
00:52:18 [INFO] test episode 63: reward = 21.00, steps = 29819
00:53:03 [INFO] test episode 64: reward = 21.00, steps = 29819
00:53:46 [INFO] test episode 65: reward = 21.00, steps = 29819
00:54:30 [INFO] test episode 66: reward = 21.00, steps = 29819
00:55:12 [INFO] test episode 67: reward = 21.00, steps = 29819
00:55:55 [INFO] test episode 68: reward = 21.00, steps = 29819
00:56:38 [INFO] test episode 69: reward = 21.00, steps = 29819
00:57:21 [INFO] test episode 70: reward = 21.00, steps = 29819
00:58:04 [INFO] test episode 71: reward = 21.00, steps = 29819
00:58:48 [INFO] test episode 72: reward = 21.00, steps = 29819
00:59:31 [INFO] test episode 73: reward = 21.00, steps = 29819
01:00:14 [INFO] test episode 74: reward = 21.00, steps = 29819
01:00:59 [INFO] test episode 75: reward = 21.00, steps = 29819
01:01:42 [INFO] test episode 76: reward = 21.00, steps = 29819
01:02:25 [INFO] test episode 77: reward = 21.00, steps = 29819
01:03:09 [INFO] test episode 78: reward = 21.00, steps = 29819
01:03:52 [INFO] test episode 79: reward = 21.00, steps = 29819
01:04:36 [INFO] test episode 80: reward = 21.00, steps = 29819
01:05:19 [INFO] test episode 81: reward = 21.00, steps = 29819
01:06:03 [INFO] test episode 82: reward = 21.00, steps = 29819
01:06:47 [INFO] test episode 83: reward = 21.00, steps = 29819
01:07:32 [INFO] test episode 84: reward = 21.00, steps = 29819
01:08:16 [INFO] test episode 85: reward = 21.00, steps = 29819
01:09:00 [INFO] test episode 86: reward = 21.00, steps = 29819
01:09:44 [INFO] test episode 87: reward = 21.00, steps = 29819
01:10:28 [INFO] test episode 88: reward = 21.00, steps = 29819
01:11:12 [INFO] test episode 89: reward = 21.00, steps = 29819
01:11:55 [INFO] test episode 90: reward = 21.00, steps = 29819
01:12:38 [INFO] test episode 91: reward = 21.00, steps = 29819
01:13:21 [INFO] test episode 92: reward = 21.00, steps = 29819
01:14:03 [INFO] test episode 93: reward = 21.00, steps = 29819
01:14:46 [INFO] test episode 94: reward = 21.00, steps = 29819
01:15:28 [INFO] test episode 95: reward = 21.00, steps = 29819
01:16:12 [INFO] test episode 96: reward = 21.00, steps = 29819
01:16:54 [INFO] test episode 97: reward = 21.00, steps = 29819
01:17:37 [INFO] test episode 98: reward = 21.00, steps = 29819
01:18:20 [INFO] test episode 99: reward = 21.00, steps = 29819
01:18:20 [INFO] average episode reward = 21.00 ± 0.00
In [5]:
# Evaluation is finished; close the environment.
env.close()