Use Double DQN to Play MountainCar-v0

PyTorch version

In [1]:
%matplotlib inline

import sys
import logging
import itertools
import copy

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(0)

# Send INFO-and-above log records to stdout, prefixed with an HH:MM:SS
# timestamp and the level name.
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
# Create the environment, then log every attribute of the environment object
# and of its spec so the task's settings are recorded in the output.
env = gym.make('MountainCar-v0')
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
22:49:52 [INFO] env: <MountainCarEnv<MountainCar-v0>>
22:49:52 [INFO] action_space: Discrete(3)
22:49:52 [INFO] observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
22:49:52 [INFO] reward_range: (-inf, inf)
22:49:52 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30}
22:49:52 [INFO] _max_episode_steps: 200
22:49:52 [INFO] _elapsed_steps: None
22:49:52 [INFO] id: MountainCar-v0
22:49:52 [INFO] entry_point: gym.envs.classic_control:MountainCarEnv
22:49:52 [INFO] reward_threshold: -110.0
22:49:52 [INFO] nondeterministic: False
22:49:52 [INFO] max_episode_steps: 200
22:49:52 [INFO] _kwargs: {}
22:49:52 [INFO] _env_name: MountainCar
In [3]:
class DQNReplayer:
    """Fixed-capacity experience replay buffer stored in a pandas DataFrame.

    Each row holds one transition with the columns ``state``, ``action``,
    ``reward``, ``next_state`` and ``terminated``. Once ``capacity`` rows have
    been written, new transitions overwrite the oldest ones (ring buffer).
    """

    def __init__(self, capacity):
        """Allocate an empty buffer with room for ``capacity`` transitions."""
        self.capacity = capacity
        self.i = 0  # row that the next call to store() writes to
        self.count = 0  # number of rows currently holding a transition
        fields = ['state', 'action', 'reward', 'next_state', 'terminated']
        self.memory = pd.DataFrame(index=range(capacity), columns=fields)

    def store(self, *args):
        """Write one transition, given positionally in column order."""
        # dtype=object keeps array-valued fields as single cells of the row.
        transition = np.asarray(args, dtype=object)
        self.memory.loc[self.i] = transition
        self.count = min(self.capacity, self.count + 1)
        self.i = (self.i + 1) % self.capacity

    def sample(self, size):
        """Draw ``size`` stored transitions uniformly, with replacement.

        Returns a generator yielding one stacked array per column, in column
        order.
        """
        indices = np.random.choice(self.count, size=size)
        fields = self.memory.columns

        def stacked_columns():
            for field in fields:
                yield np.stack(self.memory.loc[indices, field])

        return stacked_columns()
In [4]:
class DoubleDQNAgent:
    """Double DQN agent for environments with a discrete action space.

    The agent keeps an online network (``evaluate_net``) that is trained on
    every learning step and a target network (``target_net``) that is a copy of
    the online network taken at the start of each training episode. The
    bootstrap target uses the online network to choose the next action and the
    target network to value it.
    """

    def __init__(self, env, gamma=0.99, epsilon=0.001, batch_size=1024,
            replayer_capacity=10000, learning_rate=0.001):
        """Build the replay buffer, the online network and its optimizer.

        Args:
            env: environment exposing ``action_space.n`` and
                ``observation_space.shape``.
            gamma: discount factor used in the bootstrap target.
            epsilon: probability of taking a uniformly random action while
                in train mode.
            batch_size: number of transitions sampled per learning step.
            replayer_capacity: maximum number of stored transitions.
            learning_rate: learning rate of the Adam optimizer.
        """
        self.action_n = env.action_space.n
        self.gamma = gamma
        self.epsilon = epsilon
        self.batch_size = batch_size

        self.replayer = DQNReplayer(replayer_capacity)

        self.evaluate_net = self.build_net(
                input_size=env.observation_space.shape[0],
                hidden_sizes=[64, 64], output_size=self.action_n)
        self.optimizer = optim.Adam(self.evaluate_net.parameters(),
                lr=learning_rate)
        self.loss = nn.MSELoss()

    def build_net(self, input_size, hidden_sizes, output_size):
        """Return an MLP with ReLU between layers and a linear output layer.

        Args:
            input_size: number of input features.
            hidden_sizes: list with the width of each hidden layer.
            output_size: number of outputs.
        """
        layer_sizes = [input_size,] + hidden_sizes + [output_size,]
        layers = []
        for in_features, out_features in zip(layer_sizes[:-1], layer_sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # no activation after the output layer
        return nn.Sequential(*layers)

    def reset(self, mode=None):
        """Prepare for a new episode.

        In ``'train'`` mode the per-episode trajectory is cleared and the
        target network is re-synchronized with the online network.
        """
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.target_net = copy.deepcopy(self.evaluate_net)

    def step(self, observation, reward, terminated):
        """Choose an action for ``observation``; in train mode also learn.

        Args:
            observation: current observation.
            reward: reward received for the previous action.
            terminated: whether ``observation`` is a terminal state.

        Returns:
            The chosen action as a Python int.
        """
        if self.mode == 'train' and np.random.rand() < self.epsilon:
            # epsilon-greedy policy in train mode
            action = np.random.randint(self.action_n)
        else:
            state_tensor = torch.as_tensor(observation,
                    dtype=torch.float).reshape(1, -1)
            # Acting needs no gradients, so skip building the autograd graph.
            with torch.no_grad():
                q_tensor = self.evaluate_net(state_tensor)
            action = torch.argmax(q_tensor).item()
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # The last two (observation, reward, terminated, action)
                # records together form one complete transition.
                state, _, _, act, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, terminated)
            if self.replayer.count >= self.replayer.capacity * 0.95:
                # skip first few episodes for speed
                self.learn()
        return action

    def close(self):
        """Nothing to release; present so callers can always call close()."""
        pass

    def learn(self):
        """Run one gradient step on a batch sampled from the replay buffer."""
        # replay
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(self.batch_size)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)

        # The bootstrap target is a constant with respect to the loss. Build
        # it without autograd so no graph is recorded and no gradients pile up
        # on the target network's parameters.
        with torch.no_grad():
            next_eval_q_tensor = self.evaluate_net(next_state_tensor)
            next_action_tensor = next_eval_q_tensor.argmax(dim=-1)
            next_q_tensor = self.target_net(next_state_tensor)
            next_max_q_tensor = torch.gather(next_q_tensor, 1,
                    next_action_tensor.unsqueeze(1)).squeeze(1)
            target_tensor = reward_tensor + self.gamma * \
                    (1. - terminated_tensor) * next_max_q_tensor

        # update value net
        pred_tensor = self.evaluate_net(state_tensor)
        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)
        loss_tensor = self.loss(q_tensor, target_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()

# Instantiate the agent for the environment created above.
agent = DoubleDQNAgent(env)
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run a single episode of ``agent`` interacting with ``env``.

    The agent is stepped once more on the final observation (so it can see the
    last reward and the terminated flag) before the loop ends.

    Args:
        env: environment; ``reset(seed=...)`` returns ``(observation, info)``
            and ``step(action)`` returns a 5-tuple ending in ``truncated, info``.
        agent: object providing ``reset(mode=...)``, ``step(...)`` and
            ``close()``.
        seed: seed forwarded to ``env.reset``.
        mode: mode forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called after every agent step.

    Returns:
        ``(episode_reward, elapsed_steps)``: the summed reward and the number
        of environment steps taken.
    """
    observation, _ = env.reset(seed=seed)
    agent.reset(mode=mode)
    reward, terminated, truncated = 0., False, False
    total_reward = 0.
    step_count = 0
    finished = False
    while not finished:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        finished = terminated or truncated
        if not finished:
            observation, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            step_count += 1
    agent.close()
    return total_reward, step_count


# Train, seeding each episode with its index, until the mean reward over the
# 10 most recent episodes exceeds -110; then plot the per-episode rewards.
logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    episode_reward, elapsed_steps = play_episode(
            env, agent, mode='train', seed=episode)
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > -110:
        break
    episode += 1
plt.plot(episode_rewards)


# Evaluate the agent for 100 episodes without train mode, then report the
# mean and standard deviation of the episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards += [episode_reward]
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
reward_mean, reward_std = np.mean(episode_rewards), np.std(episode_rewards)
logging.info('average episode reward = %.2f ± %.2f', reward_mean, reward_std)
22:49:52 [INFO] ==== train ====
22:49:52 [INFO] train episode 0: reward = -200.00, steps = 200
22:49:52 [INFO] train episode 1: reward = -200.00, steps = 200
22:49:52 [INFO] train episode 2: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 3: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 4: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 5: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 6: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 7: reward = -200.00, steps = 200
22:49:53 [INFO] train episode 8: reward = -200.00, steps = 200
22:49:54 [INFO] train episode 9: reward = -200.00, steps = 200
22:49:54 [INFO] train episode 10: reward = -200.00, steps = 200
22:49:54 [INFO] train episode 11: reward = -200.00, steps = 200
22:49:54 [INFO] train episode 12: reward = -200.00, steps = 200
22:49:54 [INFO] train episode 13: reward = -200.00, steps = 200
22:49:55 [INFO] train episode 14: reward = -200.00, steps = 200
22:49:55 [INFO] train episode 15: reward = -200.00, steps = 200
22:49:55 [INFO] train episode 16: reward = -200.00, steps = 200
22:49:55 [INFO] train episode 17: reward = -200.00, steps = 200
22:49:55 [INFO] train episode 18: reward = -200.00, steps = 200
22:49:55 [INFO] train episode 19: reward = -200.00, steps = 200
22:49:55 [INFO] train episode 20: reward = -200.00, steps = 200
22:49:56 [INFO] train episode 21: reward = -200.00, steps = 200
22:49:56 [INFO] train episode 22: reward = -200.00, steps = 200
22:49:56 [INFO] train episode 23: reward = -200.00, steps = 200
22:49:56 [INFO] train episode 24: reward = -200.00, steps = 200
22:49:56 [INFO] train episode 25: reward = -200.00, steps = 200
22:49:56 [INFO] train episode 26: reward = -200.00, steps = 200
22:49:57 [INFO] train episode 27: reward = -200.00, steps = 200
22:49:57 [INFO] train episode 28: reward = -200.00, steps = 200
22:49:57 [INFO] train episode 29: reward = -200.00, steps = 200
22:49:57 [INFO] train episode 30: reward = -200.00, steps = 200
22:49:57 [INFO] train episode 31: reward = -200.00, steps = 200
22:49:57 [INFO] train episode 32: reward = -200.00, steps = 200
22:49:57 [INFO] train episode 33: reward = -200.00, steps = 200
22:49:58 [INFO] train episode 34: reward = -200.00, steps = 200
22:49:58 [INFO] train episode 35: reward = -200.00, steps = 200
22:49:58 [INFO] train episode 36: reward = -200.00, steps = 200
22:49:58 [INFO] train episode 37: reward = -200.00, steps = 200
22:49:58 [INFO] train episode 38: reward = -200.00, steps = 200
22:49:58 [INFO] train episode 39: reward = -200.00, steps = 200
22:49:59 [INFO] train episode 40: reward = -200.00, steps = 200
22:49:59 [INFO] train episode 41: reward = -200.00, steps = 200
22:49:59 [INFO] train episode 42: reward = -200.00, steps = 200
22:49:59 [INFO] train episode 43: reward = -200.00, steps = 200
22:49:59 [INFO] train episode 44: reward = -200.00, steps = 200
22:49:59 [INFO] train episode 45: reward = -200.00, steps = 200
22:50:00 [INFO] train episode 46: reward = -200.00, steps = 200
22:50:18 [INFO] train episode 47: reward = -200.00, steps = 200
22:51:15 [INFO] train episode 48: reward = -200.00, steps = 200
22:52:11 [INFO] train episode 49: reward = -200.00, steps = 200
22:53:09 [INFO] train episode 50: reward = -200.00, steps = 200
22:54:09 [INFO] train episode 51: reward = -200.00, steps = 200
22:55:10 [INFO] train episode 52: reward = -200.00, steps = 200
22:56:07 [INFO] train episode 53: reward = -200.00, steps = 200
22:57:03 [INFO] train episode 54: reward = -200.00, steps = 200
22:58:00 [INFO] train episode 55: reward = -200.00, steps = 200
22:58:58 [INFO] train episode 56: reward = -200.00, steps = 200
22:59:56 [INFO] train episode 57: reward = -200.00, steps = 200
23:01:12 [INFO] train episode 58: reward = -200.00, steps = 200
23:02:20 [INFO] train episode 59: reward = -200.00, steps = 200
23:03:29 [INFO] train episode 60: reward = -200.00, steps = 200
23:04:32 [INFO] train episode 61: reward = -200.00, steps = 200
23:05:36 [INFO] train episode 62: reward = -200.00, steps = 200
23:06:40 [INFO] train episode 63: reward = -200.00, steps = 200
23:07:42 [INFO] train episode 64: reward = -200.00, steps = 200
23:08:44 [INFO] train episode 65: reward = -200.00, steps = 200
23:09:48 [INFO] train episode 66: reward = -200.00, steps = 200
23:10:56 [INFO] train episode 67: reward = -200.00, steps = 200
23:12:21 [INFO] train episode 68: reward = -200.00, steps = 200
23:14:17 [INFO] train episode 69: reward = -200.00, steps = 200
23:16:11 [INFO] train episode 70: reward = -200.00, steps = 200
23:18:03 [INFO] train episode 71: reward = -200.00, steps = 200
23:19:55 [INFO] train episode 72: reward = -200.00, steps = 200
23:21:50 [INFO] train episode 73: reward = -200.00, steps = 200
23:23:47 [INFO] train episode 74: reward = -200.00, steps = 200
23:25:40 [INFO] train episode 75: reward = -200.00, steps = 200
23:27:37 [INFO] train episode 76: reward = -200.00, steps = 200
23:29:32 [INFO] train episode 77: reward = -200.00, steps = 200
23:31:29 [INFO] train episode 78: reward = -200.00, steps = 200
23:33:20 [INFO] train episode 79: reward = -200.00, steps = 200
23:35:13 [INFO] train episode 80: reward = -200.00, steps = 200
23:37:07 [INFO] train episode 81: reward = -200.00, steps = 200
23:39:07 [INFO] train episode 82: reward = -200.00, steps = 200
23:41:03 [INFO] train episode 83: reward = -200.00, steps = 200
23:43:09 [INFO] train episode 84: reward = -200.00, steps = 200
23:45:12 [INFO] train episode 85: reward = -200.00, steps = 200
23:47:18 [INFO] train episode 86: reward = -200.00, steps = 200
23:49:33 [INFO] train episode 87: reward = -200.00, steps = 200
23:51:38 [INFO] train episode 88: reward = -200.00, steps = 200
23:53:45 [INFO] train episode 89: reward = -200.00, steps = 200
23:55:39 [INFO] train episode 90: reward = -200.00, steps = 200
23:57:33 [INFO] train episode 91: reward = -200.00, steps = 200
23:59:26 [INFO] train episode 92: reward = -200.00, steps = 200
00:01:19 [INFO] train episode 93: reward = -200.00, steps = 200
00:03:13 [INFO] train episode 94: reward = -200.00, steps = 200
00:05:05 [INFO] train episode 95: reward = -200.00, steps = 200
00:06:54 [INFO] train episode 96: reward = -200.00, steps = 200
00:08:47 [INFO] train episode 97: reward = -200.00, steps = 200
00:10:41 [INFO] train episode 98: reward = -200.00, steps = 200
00:12:34 [INFO] train episode 99: reward = -200.00, steps = 200
00:14:26 [INFO] train episode 100: reward = -200.00, steps = 200
00:16:18 [INFO] train episode 101: reward = -200.00, steps = 200
00:18:09 [INFO] train episode 102: reward = -200.00, steps = 200
00:20:00 [INFO] train episode 103: reward = -200.00, steps = 200
00:21:53 [INFO] train episode 104: reward = -200.00, steps = 200
00:23:46 [INFO] train episode 105: reward = -200.00, steps = 200
00:25:39 [INFO] train episode 106: reward = -200.00, steps = 200
00:27:35 [INFO] train episode 107: reward = -200.00, steps = 200
00:29:27 [INFO] train episode 108: reward = -200.00, steps = 200
00:31:15 [INFO] train episode 109: reward = -195.00, steps = 195
00:33:12 [INFO] train episode 110: reward = -200.00, steps = 200
00:35:02 [INFO] train episode 111: reward = -200.00, steps = 200
00:36:53 [INFO] train episode 112: reward = -200.00, steps = 200
00:38:47 [INFO] train episode 113: reward = -200.00, steps = 200
00:40:33 [INFO] train episode 114: reward = -200.00, steps = 200
00:42:25 [INFO] train episode 115: reward = -200.00, steps = 200
00:44:15 [INFO] train episode 116: reward = -200.00, steps = 200
00:46:06 [INFO] train episode 117: reward = -200.00, steps = 200
00:47:56 [INFO] train episode 118: reward = -200.00, steps = 200
00:49:46 [INFO] train episode 119: reward = -200.00, steps = 200
00:51:37 [INFO] train episode 120: reward = -200.00, steps = 200
00:53:28 [INFO] train episode 121: reward = -200.00, steps = 200
00:55:18 [INFO] train episode 122: reward = -200.00, steps = 200
00:57:08 [INFO] train episode 123: reward = -200.00, steps = 200
00:58:37 [INFO] train episode 124: reward = -200.00, steps = 200
01:00:27 [INFO] train episode 125: reward = -200.00, steps = 200
01:02:17 [INFO] train episode 126: reward = -200.00, steps = 200
01:04:06 [INFO] train episode 127: reward = -200.00, steps = 200
01:05:55 [INFO] train episode 128: reward = -200.00, steps = 200
01:07:47 [INFO] train episode 129: reward = -200.00, steps = 200
01:09:19 [INFO] train episode 130: reward = -164.00, steps = 164
01:11:11 [INFO] train episode 131: reward = -200.00, steps = 200
01:13:01 [INFO] train episode 132: reward = -200.00, steps = 200
01:14:52 [INFO] train episode 133: reward = -200.00, steps = 200
01:16:42 [INFO] train episode 134: reward = -200.00, steps = 200
01:18:03 [INFO] train episode 135: reward = -146.00, steps = 146
01:19:51 [INFO] train episode 136: reward = -200.00, steps = 200
01:21:43 [INFO] train episode 137: reward = -200.00, steps = 200
01:23:33 [INFO] train episode 138: reward = -200.00, steps = 200
01:25:24 [INFO] train episode 139: reward = -200.00, steps = 200
01:27:19 [INFO] train episode 140: reward = -200.00, steps = 200
01:29:11 [INFO] train episode 141: reward = -200.00, steps = 200
01:31:03 [INFO] train episode 142: reward = -200.00, steps = 200
01:32:54 [INFO] train episode 143: reward = -200.00, steps = 200
01:34:11 [INFO] train episode 144: reward = -139.00, steps = 139
01:36:01 [INFO] train episode 145: reward = -200.00, steps = 200
01:37:53 [INFO] train episode 146: reward = -200.00, steps = 200
01:39:21 [INFO] train episode 147: reward = -159.00, steps = 159
01:41:12 [INFO] train episode 148: reward = -200.00, steps = 200
01:43:03 [INFO] train episode 149: reward = -200.00, steps = 200
01:44:54 [INFO] train episode 150: reward = -200.00, steps = 200
01:46:44 [INFO] train episode 151: reward = -200.00, steps = 200
01:48:34 [INFO] train episode 152: reward = -200.00, steps = 200
01:50:25 [INFO] train episode 153: reward = -200.00, steps = 200
01:52:16 [INFO] train episode 154: reward = -200.00, steps = 200
01:54:08 [INFO] train episode 155: reward = -200.00, steps = 200
01:55:41 [INFO] train episode 156: reward = -166.00, steps = 166
01:57:33 [INFO] train episode 157: reward = -200.00, steps = 200
01:59:22 [INFO] train episode 158: reward = -200.00, steps = 200
02:00:43 [INFO] train episode 159: reward = -145.00, steps = 145
02:02:07 [INFO] train episode 160: reward = -154.00, steps = 154
02:03:40 [INFO] train episode 161: reward = -168.00, steps = 168
02:05:29 [INFO] train episode 162: reward = -200.00, steps = 200
02:07:02 [INFO] train episode 163: reward = -167.00, steps = 167
02:08:20 [INFO] train episode 164: reward = -138.00, steps = 138
02:09:37 [INFO] train episode 165: reward = -137.00, steps = 137
02:10:54 [INFO] train episode 166: reward = -136.00, steps = 136
02:12:13 [INFO] train episode 167: reward = -142.00, steps = 142
02:13:34 [INFO] train episode 168: reward = -143.00, steps = 143
02:14:53 [INFO] train episode 169: reward = -143.00, steps = 143
02:16:13 [INFO] train episode 170: reward = -143.00, steps = 143
02:17:33 [INFO] train episode 171: reward = -145.00, steps = 145
02:19:07 [INFO] train episode 172: reward = -169.00, steps = 169
02:20:33 [INFO] train episode 173: reward = -155.00, steps = 155
02:21:59 [INFO] train episode 174: reward = -153.00, steps = 153
02:23:23 [INFO] train episode 175: reward = -150.00, steps = 150
02:24:51 [INFO] train episode 176: reward = -157.00, steps = 157
02:26:33 [INFO] train episode 177: reward = -183.00, steps = 183
02:28:04 [INFO] train episode 178: reward = -159.00, steps = 159
02:29:31 [INFO] train episode 179: reward = -158.00, steps = 158
02:31:04 [INFO] train episode 180: reward = -168.00, steps = 168
02:32:47 [INFO] train episode 181: reward = -185.00, steps = 185
02:34:37 [INFO] train episode 182: reward = -200.00, steps = 200
02:36:27 [INFO] train episode 183: reward = -200.00, steps = 200
02:38:19 [INFO] train episode 184: reward = -200.00, steps = 200
02:39:39 [INFO] train episode 185: reward = -170.00, steps = 170
02:41:15 [INFO] train episode 186: reward = -173.00, steps = 173
02:42:54 [INFO] train episode 187: reward = -200.00, steps = 200
02:44:34 [INFO] train episode 188: reward = -182.00, steps = 182
02:45:30 [INFO] train episode 189: reward = -102.00, steps = 102
02:46:20 [INFO] train episode 190: reward = -90.00, steps = 90
02:48:09 [INFO] train episode 191: reward = -200.00, steps = 200
02:49:05 [INFO] train episode 192: reward = -102.00, steps = 102
02:50:55 [INFO] train episode 193: reward = -200.00, steps = 200
02:52:47 [INFO] train episode 194: reward = -200.00, steps = 200
02:54:25 [INFO] train episode 195: reward = -174.00, steps = 174
02:55:55 [INFO] train episode 196: reward = -162.00, steps = 162
02:57:28 [INFO] train episode 197: reward = -200.00, steps = 200
02:58:05 [INFO] train episode 198: reward = -84.00, steps = 84
02:58:42 [INFO] train episode 199: reward = -86.00, steps = 86
02:59:57 [INFO] train episode 200: reward = -171.00, steps = 171
03:00:37 [INFO] train episode 201: reward = -89.00, steps = 89
03:01:16 [INFO] train episode 202: reward = -88.00, steps = 88
03:02:05 [INFO] train episode 203: reward = -111.00, steps = 111
03:03:32 [INFO] train episode 204: reward = -200.00, steps = 200
03:04:16 [INFO] train episode 205: reward = -100.00, steps = 100
03:04:53 [INFO] train episode 206: reward = -83.00, steps = 83
03:06:04 [INFO] train episode 207: reward = -163.00, steps = 163
03:07:04 [INFO] train episode 208: reward = -134.00, steps = 134
03:08:13 [INFO] train episode 209: reward = -157.00, steps = 157
03:09:22 [INFO] train episode 210: reward = -156.00, steps = 156
03:09:59 [INFO] train episode 211: reward = -84.00, steps = 84
03:11:26 [INFO] train episode 212: reward = -200.00, steps = 200
03:12:04 [INFO] train episode 213: reward = -87.00, steps = 87
03:12:42 [INFO] train episode 214: reward = -86.00, steps = 86
03:13:53 [INFO] train episode 215: reward = -162.00, steps = 162
03:15:05 [INFO] train episode 216: reward = -166.00, steps = 166
03:15:45 [INFO] train episode 217: reward = -90.00, steps = 90
03:16:22 [INFO] train episode 218: reward = -84.00, steps = 84
03:17:48 [INFO] train episode 219: reward = -200.00, steps = 200
03:18:54 [INFO] train episode 220: reward = -151.00, steps = 151
03:20:17 [INFO] train episode 221: reward = -200.00, steps = 200
03:21:20 [INFO] train episode 222: reward = -151.00, steps = 151
03:22:26 [INFO] train episode 223: reward = -156.00, steps = 156
03:23:41 [INFO] train episode 224: reward = -182.00, steps = 182
03:24:46 [INFO] train episode 225: reward = -155.00, steps = 155
03:25:32 [INFO] train episode 226: reward = -112.00, steps = 112
03:26:33 [INFO] train episode 227: reward = -146.00, steps = 146
03:27:23 [INFO] train episode 228: reward = -112.00, steps = 112
03:28:13 [INFO] train episode 229: reward = -119.00, steps = 119
03:29:01 [INFO] train episode 230: reward = -116.00, steps = 116
03:29:47 [INFO] train episode 231: reward = -110.00, steps = 110
03:30:30 [INFO] train episode 232: reward = -104.00, steps = 104
03:31:16 [INFO] train episode 233: reward = -110.00, steps = 110
03:32:05 [INFO] train episode 234: reward = -119.00, steps = 119
03:32:55 [INFO] train episode 235: reward = -120.00, steps = 120
03:34:16 [INFO] train episode 236: reward = -200.00, steps = 200
03:35:04 [INFO] train episode 237: reward = -117.00, steps = 117
03:35:51 [INFO] train episode 238: reward = -116.00, steps = 116
03:36:42 [INFO] train episode 239: reward = -122.00, steps = 122
03:38:05 [INFO] train episode 240: reward = -200.00, steps = 200
03:38:54 [INFO] train episode 241: reward = -117.00, steps = 117
03:39:42 [INFO] train episode 242: reward = -117.00, steps = 117
03:41:05 [INFO] train episode 243: reward = -200.00, steps = 200
03:41:53 [INFO] train episode 244: reward = -116.00, steps = 116
03:43:16 [INFO] train episode 245: reward = -200.00, steps = 200
03:44:38 [INFO] train episode 246: reward = -200.00, steps = 200
03:45:25 [INFO] train episode 247: reward = -116.00, steps = 116
03:46:46 [INFO] train episode 248: reward = -200.00, steps = 200
03:48:08 [INFO] train episode 249: reward = -200.00, steps = 200
03:48:56 [INFO] train episode 250: reward = -115.00, steps = 115
03:49:50 [INFO] train episode 251: reward = -132.00, steps = 132
03:50:39 [INFO] train episode 252: reward = -120.00, steps = 120
03:51:35 [INFO] train episode 253: reward = -133.00, steps = 133
03:52:25 [INFO] train episode 254: reward = -118.00, steps = 118
03:53:15 [INFO] train episode 255: reward = -122.00, steps = 122
03:54:07 [INFO] train episode 256: reward = -123.00, steps = 123
03:54:56 [INFO] train episode 257: reward = -120.00, steps = 120
03:55:43 [INFO] train episode 258: reward = -113.00, steps = 113
03:56:30 [INFO] train episode 259: reward = -111.00, steps = 111
03:57:17 [INFO] train episode 260: reward = -112.00, steps = 112
03:58:02 [INFO] train episode 261: reward = -109.00, steps = 109
03:58:48 [INFO] train episode 262: reward = -109.00, steps = 109
03:59:33 [INFO] train episode 263: reward = -109.00, steps = 109
04:00:19 [INFO] train episode 264: reward = -109.00, steps = 109
04:01:03 [INFO] train episode 265: reward = -108.00, steps = 108
04:01:48 [INFO] train episode 266: reward = -109.00, steps = 109
04:02:30 [INFO] train episode 267: reward = -101.00, steps = 101
04:02:30 [INFO] ==== test ====
04:02:30 [INFO] test episode 0: reward = -85.00, steps = 85
04:02:30 [INFO] test episode 1: reward = -107.00, steps = 107
04:02:30 [INFO] test episode 2: reward = -85.00, steps = 85
04:02:30 [INFO] test episode 3: reward = -108.00, steps = 108
04:02:30 [INFO] test episode 4: reward = -107.00, steps = 107
04:02:30 [INFO] test episode 5: reward = -107.00, steps = 107
04:02:30 [INFO] test episode 6: reward = -86.00, steps = 86
04:02:30 [INFO] test episode 7: reward = -108.00, steps = 108
04:02:30 [INFO] test episode 8: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 9: reward = -107.00, steps = 107
04:02:31 [INFO] test episode 10: reward = -87.00, steps = 87
04:02:31 [INFO] test episode 11: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 12: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 13: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 14: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 15: reward = -107.00, steps = 107
04:02:31 [INFO] test episode 16: reward = -85.00, steps = 85
04:02:31 [INFO] test episode 17: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 18: reward = -107.00, steps = 107
04:02:31 [INFO] test episode 19: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 20: reward = -107.00, steps = 107
04:02:31 [INFO] test episode 21: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 22: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 23: reward = -87.00, steps = 87
04:02:31 [INFO] test episode 24: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 25: reward = -108.00, steps = 108
04:02:31 [INFO] test episode 26: reward = -107.00, steps = 107
04:02:31 [INFO] test episode 27: reward = -108.00, steps = 108
04:02:32 [INFO] test episode 28: reward = -108.00, steps = 108
04:02:32 [INFO] test episode 29: reward = -106.00, steps = 106
04:02:32 [INFO] test episode 30: reward = -84.00, steps = 84
04:02:32 [INFO] test episode 31: reward = -106.00, steps = 106
04:02:32 [INFO] test episode 32: reward = -84.00, steps = 84
04:02:32 [INFO] test episode 33: reward = -108.00, steps = 108
04:02:32 [INFO] test episode 34: reward = -108.00, steps = 108
04:02:32 [INFO] test episode 35: reward = -108.00, steps = 108
04:02:32 [INFO] test episode 36: reward = -108.00, steps = 108
04:02:32 [INFO] test episode 37: reward = -85.00, steps = 85
04:02:32 [INFO] test episode 38: reward = -108.00, steps = 108
04:02:32 [INFO] test episode 39: reward = -108.00, steps = 108
04:02:32 [INFO] test episode 40: reward = -108.00, steps = 108
04:02:32 [INFO] test episode 41: reward = -84.00, steps = 84
04:02:32 [INFO] test episode 42: reward = -108.00, steps = 108
04:02:32 [INFO] test episode 43: reward = -86.00, steps = 86
04:02:32 [INFO] test episode 44: reward = -107.00, steps = 107
04:02:32 [INFO] test episode 45: reward = -85.00, steps = 85
04:02:32 [INFO] test episode 46: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 47: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 48: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 49: reward = -107.00, steps = 107
04:02:33 [INFO] test episode 50: reward = -107.00, steps = 107
04:02:33 [INFO] test episode 51: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 52: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 53: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 54: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 55: reward = -86.00, steps = 86
04:02:33 [INFO] test episode 56: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 57: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 58: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 59: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 60: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 61: reward = -107.00, steps = 107
04:02:33 [INFO] test episode 62: reward = -107.00, steps = 107
04:02:33 [INFO] test episode 63: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 64: reward = -108.00, steps = 108
04:02:33 [INFO] test episode 65: reward = -88.00, steps = 88
04:02:34 [INFO] test episode 66: reward = -108.00, steps = 108
04:02:34 [INFO] test episode 67: reward = -107.00, steps = 107
04:02:34 [INFO] test episode 68: reward = -108.00, steps = 108
04:02:34 [INFO] test episode 69: reward = -84.00, steps = 84
04:02:34 [INFO] test episode 70: reward = -107.00, steps = 107
04:02:34 [INFO] test episode 71: reward = -87.00, steps = 87
04:02:34 [INFO] test episode 72: reward = -85.00, steps = 85
04:02:34 [INFO] test episode 73: reward = -87.00, steps = 87
04:02:34 [INFO] test episode 74: reward = -107.00, steps = 107
04:02:34 [INFO] test episode 75: reward = -108.00, steps = 108
04:02:34 [INFO] test episode 76: reward = -108.00, steps = 108
04:02:34 [INFO] test episode 77: reward = -108.00, steps = 108
04:02:34 [INFO] test episode 78: reward = -108.00, steps = 108
04:02:34 [INFO] test episode 79: reward = -86.00, steps = 86
04:02:34 [INFO] test episode 80: reward = -107.00, steps = 107
04:02:34 [INFO] test episode 81: reward = -108.00, steps = 108
04:02:34 [INFO] test episode 82: reward = -108.00, steps = 108
04:02:34 [INFO] test episode 83: reward = -108.00, steps = 108
04:02:34 [INFO] test episode 84: reward = -108.00, steps = 108
04:02:34 [INFO] test episode 85: reward = -108.00, steps = 108
04:02:35 [INFO] test episode 86: reward = -84.00, steps = 84
04:02:35 [INFO] test episode 87: reward = -107.00, steps = 107
04:02:35 [INFO] test episode 88: reward = -107.00, steps = 107
04:02:35 [INFO] test episode 89: reward = -108.00, steps = 108
04:02:35 [INFO] test episode 90: reward = -88.00, steps = 88
04:02:35 [INFO] test episode 91: reward = -108.00, steps = 108
04:02:35 [INFO] test episode 92: reward = -84.00, steps = 84
04:02:35 [INFO] test episode 93: reward = -108.00, steps = 108
04:02:35 [INFO] test episode 94: reward = -107.00, steps = 107
04:02:35 [INFO] test episode 95: reward = -108.00, steps = 108
04:02:35 [INFO] test episode 96: reward = -107.00, steps = 107
04:02:35 [INFO] test episode 97: reward = -108.00, steps = 108
04:02:35 [INFO] test episode 98: reward = -108.00, steps = 108
04:02:35 [INFO] test episode 99: reward = -86.00, steps = 86
04:02:35 [INFO] average episode reward = -102.59 ± 9.34
In [6]:
# Close the environment once training and testing are done.
env.close()