Use DQN to Play MountainCar-v0¶

PyTorch version

In [1]:
%matplotlib inline

import sys
import logging
import itertools
import copy

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(0)

# Send INFO-level (and above) log records to stdout so they show up in the
# notebook cell output, each prefixed with an HH:MM:SS timestamp and level.
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
# Build the environment, then log every instance attribute of the environment
# object followed by every attribute of its spec, one "name: value" per line.
env = gym.make('MountainCar-v0')
for described in (env, env.spec):
    for key, value in vars(described).items():
        logging.info('%s: %s', key, value)
22:49:45 [INFO] env: <MountainCarEnv<MountainCar-v0>>
22:49:45 [INFO] action_space: Discrete(3)
22:49:45 [INFO] observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
22:49:45 [INFO] reward_range: (-inf, inf)
22:49:45 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30}
22:49:45 [INFO] _max_episode_steps: 200
22:49:45 [INFO] _elapsed_steps: None
22:49:45 [INFO] id: MountainCar-v0
22:49:45 [INFO] entry_point: gym.envs.classic_control:MountainCarEnv
22:49:45 [INFO] reward_threshold: -110.0
22:49:45 [INFO] nondeterministic: False
22:49:45 [INFO] max_episode_steps: 200
22:49:45 [INFO] _kwargs: {}
22:49:45 [INFO] _env_name: MountainCar
In [3]:
class DQNReplayer:
    """Fixed-capacity circular replay buffer for DQN transitions.

    Transitions are kept as rows of an object-typed DataFrame whose columns
    are state, action, reward, next_state and terminated. When the buffer is
    full, new transitions overwrite the oldest rows.
    """

    def __init__(self, capacity):
        """Allocate an empty buffer with room for `capacity` transitions."""
        fields = ['state', 'action', 'reward', 'next_state', 'terminated']
        self.capacity = capacity
        self.count = 0  # number of rows holding valid transitions
        self.i = 0  # row that the next store() call writes to
        self.memory = pd.DataFrame(index=range(capacity), columns=fields)

    def store(self, *args):
        """Write one transition into the current slot and advance the cursor.

        `args` must supply one value per column, in column order.
        """
        row = np.asarray(args, dtype=object)
        slot = self.i
        self.memory.loc[slot] = row
        # wrap around so the oldest entries get overwritten once full
        self.i = (slot + 1) % self.capacity
        if self.count < self.capacity:
            self.count += 1

    def sample(self, size):
        """Draw `size` stored transitions uniformly at random.

        Row indices are drawn with `np.random.choice` over the valid rows
        (with replacement, numpy's default). Returns a generator that yields
        one stacked array per column, in column order.
        """
        rows = np.random.choice(self.count, size=size)
        columns = self.memory.columns
        return (np.stack(self.memory.loc[rows, name]) for name in columns)
In [4]:
class DQNAgent:
    """Deep Q-Network agent with experience replay and a target network.

    The agent acts epsilon-greedily in 'train' mode and greedily otherwise.
    In 'train' mode every observed transition is pushed into a replay buffer,
    and once the buffer is sufficiently full one gradient step is taken per
    environment step. The target network is a frozen copy of the evaluation
    network that is refreshed at the start of every training episode.
    """

    def __init__(self, env, gamma=0.99, epsilon=0.001, learning_rate=0.001,
            replayer_capacity=10000, batch_size=1024, hidden_sizes=(64, 64),
            learn_start_fraction=0.95):
        """Build the networks, optimizer and replay buffer.

        Args:
            env: environment exposing `action_space.n` and
                `observation_space.shape`.
            gamma: discount factor used in the TD target.
            epsilon: probability of a uniformly random action in train mode.
            learning_rate: Adam learning rate for the evaluation network.
            replayer_capacity: number of transitions the replay buffer holds.
            batch_size: number of transitions sampled per learning step.
            hidden_sizes: widths of the hidden layers of the Q-network.
            learn_start_fraction: learning starts once the replay buffer holds
                at least this fraction of its capacity.
        """
        self.action_n = env.action_space.n
        self.gamma = gamma
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.learn_start_fraction = learn_start_fraction

        self.replayer = DQNReplayer(replayer_capacity)

        self.evaluate_net = self.build_net(
                input_size=env.observation_space.shape[0],
                hidden_sizes=hidden_sizes, output_size=self.action_n)
        self.optimizer = optim.Adam(self.evaluate_net.parameters(),
                lr=learning_rate)
        self.loss = nn.MSELoss()

        # Define episode state up front so that step()/learn() do not raise
        # AttributeError if they are called before the first reset().
        self.mode = None
        self.trajectory = []
        self.target_net = copy.deepcopy(self.evaluate_net)

    def build_net(self, input_size, hidden_sizes, output_size):
        """Return an MLP: Linear layers with ReLU between them, linear output.

        Args:
            input_size: number of input features.
            hidden_sizes: sequence of hidden layer widths (may be empty).
            output_size: number of outputs.
        """
        sizes = [input_size, *hidden_sizes, output_size]
        layers = []
        for in_features, out_features in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # no activation after the output layer
        return nn.Sequential(*layers)

    def reset(self, mode=None):
        """Prepare for a new episode.

        In 'train' mode the per-episode trajectory is cleared and the target
        network is re-synchronised with the evaluation network.
        """
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.target_net = copy.deepcopy(self.evaluate_net)

    def step(self, observation, reward, terminated):
        """Choose an action for `observation`; in train mode also record/learn.

        Args:
            observation: current observation.
            reward: reward received for the previous action.
            terminated: whether `observation` is a terminal state.

        Returns:
            The selected action as a Python int.
        """
        if self.mode == 'train' and np.random.rand() < self.epsilon:
            # epsilon-greedy policy in train mode
            action = np.random.randint(self.action_n)
        else:
            # squeeze(0) drops a leading singleton batch axis if there is one
            state_tensor = torch.as_tensor(observation,
                    dtype=torch.float).squeeze(0)
            with torch.no_grad():  # inference only, no graph needed
                q_tensor = self.evaluate_net(state_tensor)
            action = torch.argmax(q_tensor).item()
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # The last two (observation, reward, terminated, action)
                # records together describe one complete transition.
                state, _, _, act, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, terminated)
            # skip learning until the buffer is nearly full, for speed
            if self.replayer.count >= \
                    self.replayer.capacity * self.learn_start_fraction:
                self.learn()
        return action

    def close(self):
        """Nothing to release; present so the agent matches the play loop."""
        pass

    def learn(self):
        """Run one gradient step on a batch sampled from the replay buffer.

        The TD target is r + gamma * (1 - terminated) * max_a' Q_target(s', a')
        and the evaluation network is regressed onto it with an MSE loss.
        """
        # replay
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(self.batch_size)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)

        # The target is a constant with respect to the optimisation: compute
        # it without autograd so nothing is backpropagated into target_net.
        with torch.no_grad():
            next_q_tensor = self.target_net(next_state_tensor)
            next_max_q_tensor, _ = next_q_tensor.max(dim=-1)
            target_tensor = reward_tensor + self.gamma * \
                    (1. - terminated_tensor) * next_max_q_tensor

        # update value net
        pred_tensor = self.evaluate_net(state_tensor)
        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)
        loss_tensor = self.loss(q_tensor, target_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()


# Single agent instance shared by the training and test loops below.
agent = DQNAgent(env)
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of `agent` in `env` and report how it went.

    The agent is asked for an action on every observation, including the
    final one, so that it also sees the last reward and termination flag;
    the action returned for the final observation is not executed.

    Args:
        env: environment whose `reset(seed=...)` returns `(observation, info)`
            and whose `step(action)` returns
            `(observation, reward, terminated, truncated, info)`.
        agent: object providing `reset(mode=...)`, `step(...)` and `close()`.
        seed: seed forwarded to `env.reset`.
        mode: mode forwarded to `agent.reset` (e.g. 'train').
        render: if true, call `env.render()` after every agent step.

    Returns:
        Tuple `(episode_reward, elapsed_steps)`.
    """
    observation, _ = env.reset(seed=seed)
    agent.reset(mode=mode)
    reward, terminated, truncated = 0., False, False
    total_reward = 0.
    step_count = 0
    finished = False
    while not finished:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        finished = terminated or truncated
        if not finished:
            observation, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            step_count += 1
    agent.close()
    return total_reward, step_count


# ---- training: play seeded episodes until the mean reward of the most
# recent (up to) 10 episodes exceeds -110 ----
logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    episode_reward, elapsed_steps = play_episode(
            env, agent, seed=episode, mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    recent_rewards = episode_rewards[-10:]
    if np.mean(recent_rewards) > -110:
        break
    episode += 1
plt.plot(episode_rewards)


# ---- evaluation: 100 unseeded episodes with the agent in its default
# (non-train) mode, then report mean and standard deviation ----
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
logging.info('average episode reward = %.2f ± %.2f',
        mean_reward, std_reward)
22:49:46 [INFO] ==== train ====
22:49:46 [INFO] train episode 0: reward = -200.00, steps = 200
22:49:46 [INFO] train episode 1: reward = -200.00, steps = 200
22:49:46 [INFO] train episode 2: reward = -200.00, steps = 200
22:49:46 [INFO] train episode 3: reward = -200.00, steps = 200
22:49:47 [INFO] train episode 4: reward = -200.00, steps = 200
22:49:47 [INFO] train episode 5: reward = -200.00, steps = 200
22:49:47 [INFO] train episode 6: reward = -200.00, steps = 200
22:49:47 [INFO] train episode 7: reward = -200.00, steps = 200
22:49:47 [INFO] train episode 8: reward = -200.00, steps = 200
22:49:48 [INFO] train episode 9: reward = -200.00, steps = 200
22:49:48 [INFO] train episode 10: reward = -200.00, steps = 200
22:49:48 [INFO] train episode 11: reward = -200.00, steps = 200
22:49:48 [INFO] train episode 12: reward = -200.00, steps = 200
22:49:48 [INFO] train episode 13: reward = -200.00, steps = 200
22:49:49 [INFO] train episode 14: reward = -200.00, steps = 200
22:49:49 [INFO] train episode 15: reward = -200.00, steps = 200
22:49:49 [INFO] train episode 16: reward = -200.00, steps = 200
22:49:49 [INFO] train episode 17: reward = -200.00, steps = 200
22:49:49 [INFO] train episode 18: reward = -200.00, steps = 200
22:49:50 [INFO] train episode 19: reward = -200.00, steps = 200
22:49:50 [INFO] train episode 20: reward = -200.00, steps = 200
22:49:50 [INFO] train episode 21: reward = -200.00, steps = 200
22:49:50 [INFO] train episode 22: reward = -200.00, steps = 200
22:49:50 [INFO] train episode 23: reward = -200.00, steps = 200
22:49:51 [INFO] train episode 24: reward = -200.00, steps = 200
22:49:51 [INFO] train episode 25: reward = -200.00, steps = 200
22:49:51 [INFO] train episode 26: reward = -200.00, steps = 200
22:49:51 [INFO] train episode 27: reward = -200.00, steps = 200
22:49:52 [INFO] train episode 28: reward = -200.00, steps = 200
22:49:52 [INFO] train episode 29: reward = -200.00, steps = 200
22:49:52 [INFO] train episode 30: reward = -200.00, steps = 200
22:49:52 [INFO] train episode 31: reward = -200.00, steps = 200
22:49:52 [INFO] train episode 32: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 33: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 34: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 35: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 36: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 37: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 38: reward = -200.00, steps = 200
22:49:54 [INFO] train episode 39: reward = -200.00, steps = 200
22:49:54 [INFO] train episode 40: reward = -200.00, steps = 200
22:49:54 [INFO] train episode 41: reward = -200.00, steps = 200
22:49:54 [INFO] train episode 42: reward = -200.00, steps = 200
22:49:54 [INFO] train episode 43: reward = -200.00, steps = 200
22:49:55 [INFO] train episode 44: reward = -200.00, steps = 200
22:49:55 [INFO] train episode 45: reward = -200.00, steps = 200
22:49:55 [INFO] train episode 46: reward = -200.00, steps = 200
22:50:12 [INFO] train episode 47: reward = -200.00, steps = 200
22:51:00 [INFO] train episode 48: reward = -200.00, steps = 200
22:51:47 [INFO] train episode 49: reward = -200.00, steps = 200
22:52:34 [INFO] train episode 50: reward = -200.00, steps = 200
22:53:22 [INFO] train episode 51: reward = -200.00, steps = 200
22:54:13 [INFO] train episode 52: reward = -200.00, steps = 200
22:55:03 [INFO] train episode 53: reward = -200.00, steps = 200
22:55:51 [INFO] train episode 54: reward = -200.00, steps = 200
22:56:38 [INFO] train episode 55: reward = -200.00, steps = 200
22:57:24 [INFO] train episode 56: reward = -200.00, steps = 200
22:58:12 [INFO] train episode 57: reward = -200.00, steps = 200
22:59:00 [INFO] train episode 58: reward = -200.00, steps = 200
22:59:47 [INFO] train episode 59: reward = -200.00, steps = 200
23:00:48 [INFO] train episode 60: reward = -200.00, steps = 200
23:01:50 [INFO] train episode 61: reward = -200.00, steps = 200
23:02:40 [INFO] train episode 62: reward = -200.00, steps = 200
23:03:39 [INFO] train episode 63: reward = -200.00, steps = 200
23:04:30 [INFO] train episode 64: reward = -200.00, steps = 200
23:05:24 [INFO] train episode 65: reward = -200.00, steps = 200
23:06:18 [INFO] train episode 66: reward = -200.00, steps = 200
23:07:08 [INFO] train episode 67: reward = -200.00, steps = 200
23:07:59 [INFO] train episode 68: reward = -200.00, steps = 200
23:08:51 [INFO] train episode 69: reward = -200.00, steps = 200
23:09:44 [INFO] train episode 70: reward = -200.00, steps = 200
23:10:40 [INFO] train episode 71: reward = -200.00, steps = 200
23:11:43 [INFO] train episode 72: reward = -200.00, steps = 200
23:13:10 [INFO] train episode 73: reward = -200.00, steps = 200
23:14:46 [INFO] train episode 74: reward = -200.00, steps = 200
23:16:21 [INFO] train episode 75: reward = -200.00, steps = 200
23:17:53 [INFO] train episode 76: reward = -200.00, steps = 200
23:19:25 [INFO] train episode 77: reward = -200.00, steps = 200
23:20:59 [INFO] train episode 78: reward = -200.00, steps = 200
23:22:33 [INFO] train episode 79: reward = -200.00, steps = 200
23:24:10 [INFO] train episode 80: reward = -200.00, steps = 200
23:25:43 [INFO] train episode 81: reward = -200.00, steps = 200
23:27:19 [INFO] train episode 82: reward = -200.00, steps = 200
23:28:53 [INFO] train episode 83: reward = -200.00, steps = 200
23:30:28 [INFO] train episode 84: reward = -200.00, steps = 200
23:32:04 [INFO] train episode 85: reward = -200.00, steps = 200
23:33:36 [INFO] train episode 86: reward = -200.00, steps = 200
23:35:09 [INFO] train episode 87: reward = -200.00, steps = 200
23:36:44 [INFO] train episode 88: reward = -200.00, steps = 200
23:38:22 [INFO] train episode 89: reward = -200.00, steps = 200
23:39:57 [INFO] train episode 90: reward = -200.00, steps = 200
23:41:33 [INFO] train episode 91: reward = -200.00, steps = 200
23:43:20 [INFO] train episode 92: reward = -200.00, steps = 200
23:45:02 [INFO] train episode 93: reward = -200.00, steps = 200
23:46:41 [INFO] train episode 94: reward = -200.00, steps = 200
23:48:39 [INFO] train episode 95: reward = -200.00, steps = 200
23:50:22 [INFO] train episode 96: reward = -200.00, steps = 200
23:52:03 [INFO] train episode 97: reward = -200.00, steps = 200
23:53:47 [INFO] train episode 98: reward = -200.00, steps = 200
23:55:22 [INFO] train episode 99: reward = -200.00, steps = 200
23:56:56 [INFO] train episode 100: reward = -200.00, steps = 200
23:58:30 [INFO] train episode 101: reward = -200.00, steps = 200
00:00:03 [INFO] train episode 102: reward = -200.00, steps = 200
00:01:37 [INFO] train episode 103: reward = -200.00, steps = 200
00:03:10 [INFO] train episode 104: reward = -200.00, steps = 200
00:04:43 [INFO] train episode 105: reward = -200.00, steps = 200
00:06:16 [INFO] train episode 106: reward = -200.00, steps = 200
00:07:45 [INFO] train episode 107: reward = -200.00, steps = 200
00:09:19 [INFO] train episode 108: reward = -200.00, steps = 200
00:10:52 [INFO] train episode 109: reward = -200.00, steps = 200
00:12:26 [INFO] train episode 110: reward = -200.00, steps = 200
00:13:58 [INFO] train episode 111: reward = -200.00, steps = 200
00:15:31 [INFO] train episode 112: reward = -200.00, steps = 200
00:17:02 [INFO] train episode 113: reward = -200.00, steps = 200
00:18:34 [INFO] train episode 114: reward = -200.00, steps = 200
00:20:06 [INFO] train episode 115: reward = -200.00, steps = 200
00:21:39 [INFO] train episode 116: reward = -200.00, steps = 200
00:23:12 [INFO] train episode 117: reward = -200.00, steps = 200
00:24:44 [INFO] train episode 118: reward = -200.00, steps = 200
00:26:18 [INFO] train episode 119: reward = -200.00, steps = 200
00:27:53 [INFO] train episode 120: reward = -200.00, steps = 200
00:29:25 [INFO] train episode 121: reward = -200.00, steps = 200
00:30:56 [INFO] train episode 122: reward = -200.00, steps = 200
00:32:34 [INFO] train episode 123: reward = -200.00, steps = 200
00:34:06 [INFO] train episode 124: reward = -200.00, steps = 200
00:35:36 [INFO] train episode 125: reward = -200.00, steps = 200
00:37:11 [INFO] train episode 126: reward = -200.00, steps = 200
00:38:43 [INFO] train episode 127: reward = -200.00, steps = 200
00:40:10 [INFO] train episode 128: reward = -200.00, steps = 200
00:41:42 [INFO] train episode 129: reward = -200.00, steps = 200
00:43:15 [INFO] train episode 130: reward = -200.00, steps = 200
00:44:46 [INFO] train episode 131: reward = -200.00, steps = 200
00:46:17 [INFO] train episode 132: reward = -200.00, steps = 200
00:47:48 [INFO] train episode 133: reward = -200.00, steps = 200
00:49:19 [INFO] train episode 134: reward = -200.00, steps = 200
00:50:50 [INFO] train episode 135: reward = -200.00, steps = 200
00:52:22 [INFO] train episode 136: reward = -200.00, steps = 200
00:53:53 [INFO] train episode 137: reward = -200.00, steps = 200
00:55:24 [INFO] train episode 138: reward = -200.00, steps = 200
00:56:55 [INFO] train episode 139: reward = -200.00, steps = 200
00:58:13 [INFO] train episode 140: reward = -200.00, steps = 200
00:59:36 [INFO] train episode 141: reward = -200.00, steps = 200
01:01:07 [INFO] train episode 142: reward = -200.00, steps = 200
01:02:38 [INFO] train episode 143: reward = -200.00, steps = 200
01:04:08 [INFO] train episode 144: reward = -200.00, steps = 200
01:05:38 [INFO] train episode 145: reward = -200.00, steps = 200
01:07:09 [INFO] train episode 146: reward = -200.00, steps = 200
01:08:42 [INFO] train episode 147: reward = -200.00, steps = 200
01:10:14 [INFO] train episode 148: reward = -200.00, steps = 200
01:11:45 [INFO] train episode 149: reward = -200.00, steps = 200
01:13:16 [INFO] train episode 150: reward = -200.00, steps = 200
01:14:48 [INFO] train episode 151: reward = -200.00, steps = 200
01:16:19 [INFO] train episode 152: reward = -200.00, steps = 200
01:17:50 [INFO] train episode 153: reward = -200.00, steps = 200
01:19:20 [INFO] train episode 154: reward = -200.00, steps = 200
01:20:51 [INFO] train episode 155: reward = -200.00, steps = 200
01:22:23 [INFO] train episode 156: reward = -200.00, steps = 200
01:23:55 [INFO] train episode 157: reward = -200.00, steps = 200
01:25:26 [INFO] train episode 158: reward = -200.00, steps = 200
01:27:01 [INFO] train episode 159: reward = -200.00, steps = 200
01:28:33 [INFO] train episode 160: reward = -200.00, steps = 200
01:30:05 [INFO] train episode 161: reward = -200.00, steps = 200
01:31:38 [INFO] train episode 162: reward = -200.00, steps = 200
01:33:09 [INFO] train episode 163: reward = -200.00, steps = 200
01:34:41 [INFO] train episode 164: reward = -200.00, steps = 200
01:36:11 [INFO] train episode 165: reward = -200.00, steps = 200
01:37:43 [INFO] train episode 166: reward = -200.00, steps = 200
01:39:14 [INFO] train episode 167: reward = -200.00, steps = 200
01:40:45 [INFO] train episode 168: reward = -200.00, steps = 200
01:42:17 [INFO] train episode 169: reward = -200.00, steps = 200
01:43:49 [INFO] train episode 170: reward = -200.00, steps = 200
01:45:20 [INFO] train episode 171: reward = -200.00, steps = 200
01:46:52 [INFO] train episode 172: reward = -200.00, steps = 200
01:48:23 [INFO] train episode 173: reward = -200.00, steps = 200
01:49:53 [INFO] train episode 174: reward = -200.00, steps = 200
01:51:24 [INFO] train episode 175: reward = -200.00, steps = 200
01:52:56 [INFO] train episode 176: reward = -200.00, steps = 200
01:54:28 [INFO] train episode 177: reward = -200.00, steps = 200
01:55:59 [INFO] train episode 178: reward = -200.00, steps = 200
01:57:32 [INFO] train episode 179: reward = -200.00, steps = 200
01:59:02 [INFO] train episode 180: reward = -200.00, steps = 200
02:00:32 [INFO] train episode 181: reward = -200.00, steps = 200
02:02:03 [INFO] train episode 182: reward = -200.00, steps = 200
02:03:33 [INFO] train episode 183: reward = -200.00, steps = 200
02:05:03 [INFO] train episode 184: reward = -200.00, steps = 200
02:06:33 [INFO] train episode 185: reward = -200.00, steps = 200
02:08:06 [INFO] train episode 186: reward = -200.00, steps = 200
02:09:38 [INFO] train episode 187: reward = -200.00, steps = 200
02:11:10 [INFO] train episode 188: reward = -200.00, steps = 200
02:12:42 [INFO] train episode 189: reward = -200.00, steps = 200
02:14:13 [INFO] train episode 190: reward = -200.00, steps = 200
02:15:44 [INFO] train episode 191: reward = -200.00, steps = 200
02:17:14 [INFO] train episode 192: reward = -200.00, steps = 200
02:18:45 [INFO] train episode 193: reward = -200.00, steps = 200
02:20:16 [INFO] train episode 194: reward = -200.00, steps = 200
02:21:48 [INFO] train episode 195: reward = -200.00, steps = 200
02:23:01 [INFO] train episode 196: reward = -157.00, steps = 157
02:24:33 [INFO] train episode 197: reward = -200.00, steps = 200
02:26:05 [INFO] train episode 198: reward = -200.00, steps = 200
02:27:39 [INFO] train episode 199: reward = -200.00, steps = 200
02:29:10 [INFO] train episode 200: reward = -200.00, steps = 200
02:30:40 [INFO] train episode 201: reward = -200.00, steps = 200
02:32:11 [INFO] train episode 202: reward = -200.00, steps = 200
02:33:42 [INFO] train episode 203: reward = -200.00, steps = 200
02:35:12 [INFO] train episode 204: reward = -200.00, steps = 200
02:36:43 [INFO] train episode 205: reward = -200.00, steps = 200
02:38:14 [INFO] train episode 206: reward = -200.00, steps = 200
02:39:31 [INFO] train episode 207: reward = -200.00, steps = 200
02:41:03 [INFO] train episode 208: reward = -200.00, steps = 200
02:42:22 [INFO] train episode 209: reward = -200.00, steps = 200
02:43:53 [INFO] train episode 210: reward = -200.00, steps = 200
02:45:24 [INFO] train episode 211: reward = -200.00, steps = 200
02:46:54 [INFO] train episode 212: reward = -200.00, steps = 200
02:48:24 [INFO] train episode 213: reward = -200.00, steps = 200
02:49:54 [INFO] train episode 214: reward = -200.00, steps = 200
02:51:25 [INFO] train episode 215: reward = -200.00, steps = 200
02:52:57 [INFO] train episode 216: reward = -200.00, steps = 200
02:54:30 [INFO] train episode 217: reward = -200.00, steps = 200
02:56:01 [INFO] train episode 218: reward = -200.00, steps = 200
02:57:17 [INFO] train episode 219: reward = -200.00, steps = 200
02:58:28 [INFO] train episode 220: reward = -200.00, steps = 200
02:59:40 [INFO] train episode 221: reward = -200.00, steps = 200
03:00:52 [INFO] train episode 222: reward = -200.00, steps = 200
03:02:05 [INFO] train episode 223: reward = -200.00, steps = 200
03:03:16 [INFO] train episode 224: reward = -200.00, steps = 200
03:04:29 [INFO] train episode 225: reward = -200.00, steps = 200
03:05:40 [INFO] train episode 226: reward = -200.00, steps = 200
03:06:54 [INFO] train episode 227: reward = -200.00, steps = 200
03:08:07 [INFO] train episode 228: reward = -200.00, steps = 200
03:09:19 [INFO] train episode 229: reward = -200.00, steps = 200
03:10:32 [INFO] train episode 230: reward = -200.00, steps = 200
03:11:30 [INFO] train episode 231: reward = -160.00, steps = 160
03:12:42 [INFO] train episode 232: reward = -200.00, steps = 200
03:13:53 [INFO] train episode 233: reward = -200.00, steps = 200
03:15:01 [INFO] train episode 234: reward = -187.00, steps = 187
03:16:09 [INFO] train episode 235: reward = -187.00, steps = 187
03:17:18 [INFO] train episode 236: reward = -190.00, steps = 190
03:18:29 [INFO] train episode 237: reward = -200.00, steps = 200
03:19:34 [INFO] train episode 238: reward = -185.00, steps = 185
03:20:40 [INFO] train episode 239: reward = -188.00, steps = 188
03:21:48 [INFO] train episode 240: reward = -200.00, steps = 200
03:22:53 [INFO] train episode 241: reward = -188.00, steps = 188
03:23:58 [INFO] train episode 242: reward = -192.00, steps = 192
03:25:07 [INFO] train episode 243: reward = -200.00, steps = 200
03:26:15 [INFO] train episode 244: reward = -200.00, steps = 200
03:27:27 [INFO] train episode 245: reward = -200.00, steps = 200
03:28:20 [INFO] train episode 246: reward = -155.00, steps = 155
03:29:28 [INFO] train episode 247: reward = -199.00, steps = 199
03:30:37 [INFO] train episode 248: reward = -200.00, steps = 200
03:31:45 [INFO] train episode 249: reward = -200.00, steps = 200
03:32:52 [INFO] train episode 250: reward = -195.00, steps = 195
03:33:59 [INFO] train episode 251: reward = -200.00, steps = 200
03:35:06 [INFO] train episode 252: reward = -200.00, steps = 200
03:36:13 [INFO] train episode 253: reward = -200.00, steps = 200
03:36:56 [INFO] train episode 254: reward = -123.00, steps = 123
03:38:02 [INFO] train episode 255: reward = -191.00, steps = 191
03:39:10 [INFO] train episode 256: reward = -200.00, steps = 200
03:39:58 [INFO] train episode 257: reward = -140.00, steps = 140
03:40:41 [INFO] train episode 258: reward = -124.00, steps = 124
03:41:22 [INFO] train episode 259: reward = -121.00, steps = 121
03:42:14 [INFO] train episode 260: reward = -150.00, steps = 150
03:43:03 [INFO] train episode 261: reward = -144.00, steps = 144
03:43:38 [INFO] train episode 262: reward = -101.00, steps = 101
03:44:28 [INFO] train episode 263: reward = -145.00, steps = 145
03:45:21 [INFO] train episode 264: reward = -158.00, steps = 158
03:45:55 [INFO] train episode 265: reward = -98.00, steps = 98
03:46:29 [INFO] train episode 266: reward = -98.00, steps = 98
03:47:00 [INFO] train episode 267: reward = -92.00, steps = 92
03:47:32 [INFO] train episode 268: reward = -93.00, steps = 93
03:48:25 [INFO] train episode 269: reward = -154.00, steps = 154
03:48:55 [INFO] train episode 270: reward = -88.00, steps = 88
03:49:45 [INFO] train episode 271: reward = -149.00, steps = 149
03:50:37 [INFO] train episode 272: reward = -150.00, steps = 150
03:51:29 [INFO] train episode 273: reward = -152.00, steps = 152
03:51:59 [INFO] train episode 274: reward = -87.00, steps = 87
03:52:31 [INFO] train episode 275: reward = -92.00, steps = 92
03:53:24 [INFO] train episode 276: reward = -154.00, steps = 154
03:53:55 [INFO] train episode 277: reward = -89.00, steps = 89
03:54:25 [INFO] train episode 278: reward = -87.00, steps = 87
03:55:17 [INFO] train episode 279: reward = -153.00, steps = 153
03:56:10 [INFO] train episode 280: reward = -153.00, steps = 153
03:57:01 [INFO] train episode 281: reward = -149.00, steps = 149
03:57:57 [INFO] train episode 282: reward = -161.00, steps = 161
03:58:47 [INFO] train episode 283: reward = -147.00, steps = 147
03:59:16 [INFO] train episode 284: reward = -84.00, steps = 84
04:00:06 [INFO] train episode 285: reward = -146.00, steps = 146
04:01:14 [INFO] train episode 286: reward = -200.00, steps = 200
04:02:03 [INFO] train episode 287: reward = -145.00, steps = 145
04:02:52 [INFO] train episode 288: reward = -143.00, steps = 143
04:03:47 [INFO] train episode 289: reward = -165.00, steps = 165
04:04:19 [INFO] train episode 290: reward = -98.00, steps = 98
04:04:48 [INFO] train episode 291: reward = -85.00, steps = 85
04:05:41 [INFO] train episode 292: reward = -160.00, steps = 160
04:06:39 [INFO] train episode 293: reward = -174.00, steps = 174
04:07:28 [INFO] train episode 294: reward = -144.00, steps = 144
04:08:28 [INFO] train episode 295: reward = -176.00, steps = 176
04:09:34 [INFO] train episode 296: reward = -187.00, steps = 187
04:10:30 [INFO] train episode 297: reward = -142.00, steps = 142
04:11:18 [INFO] train episode 298: reward = -141.00, steps = 141
04:12:07 [INFO] train episode 299: reward = -146.00, steps = 146
04:12:56 [INFO] train episode 300: reward = -147.00, steps = 147
04:13:43 [INFO] train episode 301: reward = -138.00, steps = 138
04:14:24 [INFO] train episode 302: reward = -118.00, steps = 118
04:15:21 [INFO] train episode 303: reward = -169.00, steps = 169
04:16:01 [INFO] train episode 304: reward = -120.00, steps = 120
04:16:46 [INFO] train episode 305: reward = -133.00, steps = 133
04:17:23 [INFO] train episode 306: reward = -112.00, steps = 112
04:18:05 [INFO] train episode 307: reward = -124.00, steps = 124
04:18:48 [INFO] train episode 308: reward = -129.00, steps = 129
04:19:40 [INFO] train episode 309: reward = -200.00, steps = 200
04:20:13 [INFO] train episode 310: reward = -122.00, steps = 122
04:21:05 [INFO] train episode 311: reward = -200.00, steps = 200
04:21:56 [INFO] train episode 312: reward = -200.00, steps = 200
04:22:47 [INFO] train episode 313: reward = -200.00, steps = 200
04:23:40 [INFO] train episode 314: reward = -200.00, steps = 200
04:24:11 [INFO] train episode 315: reward = -120.00, steps = 120
04:24:42 [INFO] train episode 316: reward = -119.00, steps = 119
04:25:14 [INFO] train episode 317: reward = -124.00, steps = 124
04:25:47 [INFO] train episode 318: reward = -128.00, steps = 128
04:26:16 [INFO] train episode 319: reward = -114.00, steps = 114
04:26:48 [INFO] train episode 320: reward = -114.00, steps = 114
04:27:18 [INFO] train episode 321: reward = -113.00, steps = 113
04:27:48 [INFO] train episode 322: reward = -116.00, steps = 116
04:28:39 [INFO] train episode 323: reward = -200.00, steps = 200
04:29:07 [INFO] train episode 324: reward = -112.00, steps = 112
04:29:39 [INFO] train episode 325: reward = -123.00, steps = 123
04:30:10 [INFO] train episode 326: reward = -121.00, steps = 121
04:30:41 [INFO] train episode 327: reward = -121.00, steps = 121
04:31:12 [INFO] train episode 328: reward = -119.00, steps = 119
04:31:43 [INFO] train episode 329: reward = -127.00, steps = 127
04:32:08 [INFO] train episode 330: reward = -117.00, steps = 117
04:32:35 [INFO] train episode 331: reward = -119.00, steps = 119
04:33:00 [INFO] train episode 332: reward = -116.00, steps = 116
04:33:34 [INFO] train episode 333: reward = -156.00, steps = 156
04:33:59 [INFO] train episode 334: reward = -110.00, steps = 110
04:34:24 [INFO] train episode 335: reward = -114.00, steps = 114
04:34:48 [INFO] train episode 336: reward = -112.00, steps = 112
04:35:08 [INFO] train episode 337: reward = -87.00, steps = 87
04:35:32 [INFO] train episode 338: reward = -113.00, steps = 113
04:35:51 [INFO] train episode 339: reward = -93.00, steps = 93
04:36:10 [INFO] train episode 340: reward = -84.00, steps = 84
04:36:28 [INFO] train episode 341: reward = -88.00, steps = 88
04:36:28 [INFO] ==== test ====
04:36:28 [INFO] test episode 0: reward = -115.00, steps = 115
04:36:28 [INFO] test episode 1: reward = -158.00, steps = 158
04:36:28 [INFO] test episode 2: reward = -160.00, steps = 160
04:36:28 [INFO] test episode 3: reward = -111.00, steps = 111
04:36:28 [INFO] test episode 4: reward = -110.00, steps = 110
04:36:28 [INFO] test episode 5: reward = -86.00, steps = 86
04:36:28 [INFO] test episode 6: reward = -96.00, steps = 96
04:36:28 [INFO] test episode 7: reward = -116.00, steps = 116
04:36:29 [INFO] test episode 8: reward = -110.00, steps = 110
04:36:29 [INFO] test episode 9: reward = -111.00, steps = 111
04:36:29 [INFO] test episode 10: reward = -115.00, steps = 115
04:36:29 [INFO] test episode 11: reward = -145.00, steps = 145
04:36:29 [INFO] test episode 12: reward = -84.00, steps = 84
04:36:29 [INFO] test episode 13: reward = -116.00, steps = 116
04:36:29 [INFO] test episode 14: reward = -117.00, steps = 117
04:36:29 [INFO] test episode 15: reward = -115.00, steps = 115
04:36:29 [INFO] test episode 16: reward = -90.00, steps = 90
04:36:29 [INFO] test episode 17: reward = -176.00, steps = 176
04:36:29 [INFO] test episode 18: reward = -84.00, steps = 84
04:36:29 [INFO] test episode 19: reward = -157.00, steps = 157
04:36:29 [INFO] test episode 20: reward = -112.00, steps = 112
04:36:30 [INFO] test episode 21: reward = -195.00, steps = 195
04:36:30 [INFO] test episode 22: reward = -95.00, steps = 95
04:36:30 [INFO] test episode 23: reward = -143.00, steps = 143
04:36:30 [INFO] test episode 24: reward = -112.00, steps = 112
04:36:30 [INFO] test episode 25: reward = -86.00, steps = 86
04:36:30 [INFO] test episode 26: reward = -86.00, steps = 86
04:36:30 [INFO] test episode 27: reward = -160.00, steps = 160
04:36:30 [INFO] test episode 28: reward = -158.00, steps = 158
04:36:30 [INFO] test episode 29: reward = -115.00, steps = 115
04:36:30 [INFO] test episode 30: reward = -114.00, steps = 114
04:36:30 [INFO] test episode 31: reward = -115.00, steps = 115
04:36:30 [INFO] test episode 32: reward = -145.00, steps = 145
04:36:31 [INFO] test episode 33: reward = -110.00, steps = 110
04:36:31 [INFO] test episode 34: reward = -90.00, steps = 90
04:36:31 [INFO] test episode 35: reward = -198.00, steps = 198
04:36:31 [INFO] test episode 36: reward = -115.00, steps = 115
04:36:31 [INFO] test episode 37: reward = -113.00, steps = 113
04:36:31 [INFO] test episode 38: reward = -112.00, steps = 112
04:36:31 [INFO] test episode 39: reward = -111.00, steps = 111
04:36:31 [INFO] test episode 40: reward = -84.00, steps = 84
04:36:31 [INFO] test episode 41: reward = -84.00, steps = 84
04:36:31 [INFO] test episode 42: reward = -200.00, steps = 200
04:36:31 [INFO] test episode 43: reward = -83.00, steps = 83
04:36:31 [INFO] test episode 44: reward = -110.00, steps = 110
04:36:31 [INFO] test episode 45: reward = -85.00, steps = 85
04:36:32 [INFO] test episode 46: reward = -116.00, steps = 116
04:36:32 [INFO] test episode 47: reward = -145.00, steps = 145
04:36:32 [INFO] test episode 48: reward = -90.00, steps = 90
04:36:32 [INFO] test episode 49: reward = -115.00, steps = 115
04:36:32 [INFO] test episode 50: reward = -91.00, steps = 91
04:36:32 [INFO] test episode 51: reward = -111.00, steps = 111
04:36:32 [INFO] test episode 52: reward = -85.00, steps = 85
04:36:32 [INFO] test episode 53: reward = -166.00, steps = 166
04:36:32 [INFO] test episode 54: reward = -88.00, steps = 88
04:36:32 [INFO] test episode 55: reward = -112.00, steps = 112
04:36:32 [INFO] test episode 56: reward = -150.00, steps = 150
04:36:32 [INFO] test episode 57: reward = -115.00, steps = 115
04:36:32 [INFO] test episode 58: reward = -85.00, steps = 85
04:36:32 [INFO] test episode 59: reward = -114.00, steps = 114
04:36:33 [INFO] test episode 60: reward = -188.00, steps = 188
04:36:33 [INFO] test episode 61: reward = -85.00, steps = 85
04:36:33 [INFO] test episode 62: reward = -159.00, steps = 159
04:36:33 [INFO] test episode 63: reward = -110.00, steps = 110
04:36:33 [INFO] test episode 64: reward = -114.00, steps = 114
04:36:33 [INFO] test episode 65: reward = -110.00, steps = 110
04:36:33 [INFO] test episode 66: reward = -112.00, steps = 112
04:36:33 [INFO] test episode 67: reward = -88.00, steps = 88
04:36:33 [INFO] test episode 68: reward = -157.00, steps = 157
04:36:33 [INFO] test episode 69: reward = -117.00, steps = 117
04:36:33 [INFO] test episode 70: reward = -159.00, steps = 159
04:36:33 [INFO] test episode 71: reward = -110.00, steps = 110
04:36:34 [INFO] test episode 72: reward = -199.00, steps = 199
04:36:34 [INFO] test episode 73: reward = -151.00, steps = 151
04:36:34 [INFO] test episode 74: reward = -200.00, steps = 200
04:36:34 [INFO] test episode 75: reward = -95.00, steps = 95
04:36:34 [INFO] test episode 76: reward = -114.00, steps = 114
04:36:34 [INFO] test episode 77: reward = -84.00, steps = 84
04:36:34 [INFO] test episode 78: reward = -180.00, steps = 180
04:36:34 [INFO] test episode 79: reward = -111.00, steps = 111
04:36:34 [INFO] test episode 80: reward = -200.00, steps = 200
04:36:34 [INFO] test episode 81: reward = -86.00, steps = 86
04:36:34 [INFO] test episode 82: reward = -115.00, steps = 115
04:36:34 [INFO] test episode 83: reward = -110.00, steps = 110
04:36:35 [INFO] test episode 84: reward = -115.00, steps = 115
04:36:35 [INFO] test episode 85: reward = -89.00, steps = 89
04:36:35 [INFO] test episode 86: reward = -83.00, steps = 83
04:36:35 [INFO] test episode 87: reward = -158.00, steps = 158
04:36:35 [INFO] test episode 88: reward = -115.00, steps = 115
04:36:35 [INFO] test episode 89: reward = -110.00, steps = 110
04:36:35 [INFO] test episode 90: reward = -116.00, steps = 116
04:36:35 [INFO] test episode 91: reward = -84.00, steps = 84
04:36:35 [INFO] test episode 92: reward = -113.00, steps = 113
04:36:35 [INFO] test episode 93: reward = -114.00, steps = 114
04:36:35 [INFO] test episode 94: reward = -85.00, steps = 85
04:36:35 [INFO] test episode 95: reward = -115.00, steps = 115
04:36:35 [INFO] test episode 96: reward = -86.00, steps = 86
04:36:35 [INFO] test episode 97: reward = -110.00, steps = 110
04:36:35 [INFO] test episode 98: reward = -113.00, steps = 113
04:36:36 [INFO] test episode 99: reward = -114.00, steps = 114
04:36:36 [INFO] average episode reward = -119.60 ± 31.85
In [6]:
# Close the environment now that training and evaluation have finished.
env.close()