Use Dueling DQN to Play MountainCar-v0

PyTorch version

In [1]:
%matplotlib inline

import sys
import logging
import itertools
import copy

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
torch.manual_seed(0)
import torch.nn as nn
import torch.optim as optim

logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
# Build the MountainCar-v0 environment, then log every attribute of the
# environment object and of its spec.
env = gym.make('MountainCar-v0')
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
22:49:57 [INFO] env: <MountainCarEnv<MountainCar-v0>>
22:49:57 [INFO] action_space: Discrete(3)
22:49:57 [INFO] observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
22:49:57 [INFO] reward_range: (-inf, inf)
22:49:57 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30}
22:49:57 [INFO] _max_episode_steps: 200
22:49:57 [INFO] _elapsed_steps: None
22:49:57 [INFO] id: MountainCar-v0
22:49:57 [INFO] entry_point: gym.envs.classic_control:MountainCarEnv
22:49:57 [INFO] reward_threshold: -110.0
22:49:57 [INFO] nondeterministic: False
22:49:57 [INFO] max_episode_steps: 200
22:49:57 [INFO] _kwargs: {}
22:49:57 [INFO] _env_name: MountainCar
In [3]:
class DQNReplayer:
    """Fixed-capacity circular replay buffer stored in a pandas DataFrame.

    Every row holds one transition in the columns ``state``, ``action``,
    ``reward``, ``next_state`` and ``terminated``. Once ``capacity`` rows have
    been written, new transitions overwrite the oldest ones.
    """

    def __init__(self, capacity):
        """Allocate an empty buffer with room for ``capacity`` transitions."""
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['state', 'action', 'reward', 'next_state', 'terminated'])
        self.i = 0  # row that the next store() call writes to
        self.count = 0  # number of filled rows; saturates at capacity
        self.capacity = capacity

    def store(self, *args):
        """Write one transition, given as one positional value per column."""
        # Fill the object array element by element: np.asarray(args, dtype=object)
        # would build a 2-D array whenever all fields happen to share a shape.
        row = np.empty(len(args), dtype=object)
        for position, value in enumerate(args):
            row[position] = value
        self.memory.loc[self.i] = row
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        """Draw ``size`` stored transitions uniformly, with replacement.

        Returns a tuple with one stacked numpy array per column, in column
        order. The tuple is built eagerly, so the batch reflects the buffer
        content at the time of the call and can be iterated more than once.

        Raises:
            ValueError: if nothing has been stored yet.
        """
        if self.count == 0:
            raise ValueError('cannot sample from an empty replayer')
        indices = np.random.choice(self.count, size=size)
        return tuple(np.stack(self.memory.loc[indices, field])
                for field in self.memory.columns)
In [4]:
class DuelNet(nn.Module):
    """Dueling Q-network: a shared trunk feeding a value head and an advantage head.

    The forward pass returns ``v + (adv - mean(adv))``, where the mean is taken
    over dimension 1 of the advantage head's output.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        trunk_width, head_width = 64, 32
        # Shared feature extractor used by both heads.
        self.common_net = nn.Sequential(
            nn.Linear(input_size, trunk_width),
            nn.ReLU(),
        )
        # Advantage head: one output per action.
        self.advantage_net = nn.Sequential(
            nn.Linear(trunk_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, output_size),
        )
        # Value head: a single scalar per input row.
        self.v_net = nn.Sequential(
            nn.Linear(trunk_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, 1),
        )

    def forward(self, s):
        """Return the Q-values for the batch ``s`` (rows are indexed by dimension 0)."""
        features = self.common_net(s)
        raw_advantage = self.advantage_net(features)
        # Centre the advantages so that they average to zero for every row.
        centred_advantage = raw_advantage - raw_advantage.mean(dim=1, keepdim=True)
        return self.v_net(features) + centred_advantage


class DuelDQNAgent:
    """Epsilon-greedy agent that trains a dueling Q-network from replayed transitions.

    ``evaluate_net`` is the network being optimised. ``target_net`` is a frozen
    copy of it, refreshed at every ``reset(mode='train')``. When forming the
    bootstrap target, the greedy next action is picked by ``evaluate_net`` and
    its value is read from ``target_net``.

    Args:
        env: environment exposing ``action_space.n`` and
            ``observation_space.shape``.
        gamma: discount factor used in the bootstrap target.
        epsilon: probability of taking a uniformly random action in train mode.
        learning_rate: learning rate passed to the Adam optimizer.
        replayer_capacity: number of transitions the replay buffer holds.
        batch_size: number of transitions sampled per learning step.
    """

    def __init__(self, env, gamma=0.99, epsilon=0.001, learning_rate=0.001,
            replayer_capacity=10000, batch_size=1024):
        self.action_n = env.action_space.n
        self.gamma = gamma
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.mode = None  # defined up front so step() works before reset()

        self.replayer = DQNReplayer(replayer_capacity)

        self.evaluate_net = DuelNet(input_size=env.observation_space.shape[0],
                output_size=self.action_n)
        self.optimizer = optim.Adam(self.evaluate_net.parameters(),
                lr=learning_rate)
        self.loss = nn.MSELoss()

    def reset(self, mode=None):
        """Start a new episode; in 'train' mode also refresh the target network."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.target_net = copy.deepcopy(self.evaluate_net)

    def step(self, observation, reward, terminated):
        """Choose an action for ``observation``; in train mode also store and learn.

        ``reward`` and ``terminated`` describe the transition that led to
        ``observation``. Returns the chosen action as a Python int.
        """
        if self.mode == 'train' and np.random.rand() < self.epsilon:
            # epsilon-greedy policy in train mode
            action = np.random.randint(self.action_n)
        else:
            state_tensor = torch.as_tensor(observation,
                    dtype=torch.float).reshape(1, -1)
            # Acting needs no gradients; skip building the autograd graph.
            with torch.no_grad():
                q_tensor = self.evaluate_net(state_tensor)
            action = torch.argmax(q_tensor).item()
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # The last two (observation, reward, terminated, action)
                # records together form one complete transition.
                state, _, _, act, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, terminated)
            if self.replayer.count >= self.replayer.capacity * 0.95:
                # skip first few episodes for speed
                self.learn()
        return action

    def close(self):
        """Nothing to release."""
        pass

    def learn(self):
        """Run one gradient step on ``evaluate_net`` using a replayed batch."""
        # replay
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(self.batch_size)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)

        # The bootstrap target is a constant for this update: computing it
        # without autograd avoids backpropagating into target_net and piling
        # up unused gradients on its parameters.
        with torch.no_grad():
            next_eval_q_tensor = self.evaluate_net(next_state_tensor)
            next_action_tensor = next_eval_q_tensor.argmax(dim=-1, keepdim=True)
            next_q_tensor = self.target_net(next_state_tensor)
            next_max_q_tensor = next_q_tensor.gather(1,
                    next_action_tensor).squeeze(1)
            target_tensor = reward_tensor + self.gamma * \
                    (1. - terminated_tensor) * next_max_q_tensor

        # update value net
        pred_tensor = self.evaluate_net(state_tensor)
        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)
        loss_tensor = self.loss(q_tensor, target_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()


agent = DuelDQNAgent(env)
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run a single episode and return ``(total_reward, number_of_steps)``.

    The agent is asked for an action on every observation, including the final
    one, so that it can see the last reward and termination flag. ``seed`` is
    forwarded to ``env.reset`` and ``mode`` to ``agent.reset``. When ``render``
    is true, ``env.render()`` is called after every agent step.
    """
    observation, _ = env.reset(seed=seed)
    reward = 0.
    terminated = truncated = False
    agent.reset(mode=mode)
    total_reward = 0.
    step_count = 0
    done = False
    while not done:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        done = terminated or truncated
        if not done:
            observation, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            step_count += 1
    agent.close()
    return total_reward, step_count


# Training phase: play episodes seeded by their index until the mean reward
# over the most recent episodes (at most ten) exceeds -110, then plot the
# per-episode rewards.
logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    episode_reward, elapsed_steps = play_episode(
            env, agent, seed=episode, mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > -110:
        break
    episode += 1
plt.plot(episode_rewards)


# Evaluation phase: play 100 episodes with the agent in its default
# (non-training) mode and log the mean and standard deviation of the rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    episode_rewards.append(episode_reward)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
22:49:57 [INFO] ==== train ====
22:49:58 [INFO] train episode 0: reward = -200.00, steps = 200
22:49:58 [INFO] train episode 1: reward = -200.00, steps = 200
22:49:58 [INFO] train episode 2: reward = -200.00, steps = 200
22:49:58 [INFO] train episode 3: reward = -200.00, steps = 200
22:49:59 [INFO] train episode 4: reward = -200.00, steps = 200
22:49:59 [INFO] train episode 5: reward = -200.00, steps = 200
22:49:59 [INFO] train episode 6: reward = -200.00, steps = 200
22:49:59 [INFO] train episode 7: reward = -200.00, steps = 200
22:50:00 [INFO] train episode 8: reward = -200.00, steps = 200
22:50:00 [INFO] train episode 9: reward = -200.00, steps = 200
22:50:00 [INFO] train episode 10: reward = -200.00, steps = 200
22:50:00 [INFO] train episode 11: reward = -200.00, steps = 200
22:50:01 [INFO] train episode 12: reward = -200.00, steps = 200
22:50:01 [INFO] train episode 13: reward = -200.00, steps = 200
22:50:01 [INFO] train episode 14: reward = -200.00, steps = 200
22:50:01 [INFO] train episode 15: reward = -200.00, steps = 200
22:50:02 [INFO] train episode 16: reward = -200.00, steps = 200
22:50:02 [INFO] train episode 17: reward = -200.00, steps = 200
22:50:02 [INFO] train episode 18: reward = -200.00, steps = 200
22:50:02 [INFO] train episode 19: reward = -200.00, steps = 200
22:50:03 [INFO] train episode 20: reward = -200.00, steps = 200
22:50:03 [INFO] train episode 21: reward = -200.00, steps = 200
22:50:03 [INFO] train episode 22: reward = -200.00, steps = 200
22:50:03 [INFO] train episode 23: reward = -200.00, steps = 200
22:50:04 [INFO] train episode 24: reward = -200.00, steps = 200
22:50:04 [INFO] train episode 25: reward = -200.00, steps = 200
22:50:04 [INFO] train episode 26: reward = -200.00, steps = 200
22:50:04 [INFO] train episode 27: reward = -200.00, steps = 200
22:50:05 [INFO] train episode 28: reward = -200.00, steps = 200
22:50:05 [INFO] train episode 29: reward = -200.00, steps = 200
22:50:05 [INFO] train episode 30: reward = -200.00, steps = 200
22:50:05 [INFO] train episode 31: reward = -200.00, steps = 200
22:50:06 [INFO] train episode 32: reward = -200.00, steps = 200
22:50:06 [INFO] train episode 33: reward = -200.00, steps = 200
22:50:06 [INFO] train episode 34: reward = -200.00, steps = 200
22:50:06 [INFO] train episode 35: reward = -200.00, steps = 200
22:50:07 [INFO] train episode 36: reward = -200.00, steps = 200
22:50:07 [INFO] train episode 37: reward = -200.00, steps = 200
22:50:07 [INFO] train episode 38: reward = -200.00, steps = 200
22:50:07 [INFO] train episode 39: reward = -200.00, steps = 200
22:50:08 [INFO] train episode 40: reward = -200.00, steps = 200
22:50:08 [INFO] train episode 41: reward = -200.00, steps = 200
22:50:08 [INFO] train episode 42: reward = -200.00, steps = 200
22:50:08 [INFO] train episode 43: reward = -200.00, steps = 200
22:50:08 [INFO] train episode 44: reward = -200.00, steps = 200
22:50:09 [INFO] train episode 45: reward = -200.00, steps = 200
22:50:09 [INFO] train episode 46: reward = -200.00, steps = 200
22:50:30 [INFO] train episode 47: reward = -200.00, steps = 200
22:51:47 [INFO] train episode 48: reward = -200.00, steps = 200
22:53:04 [INFO] train episode 49: reward = -200.00, steps = 200
22:54:25 [INFO] train episode 50: reward = -200.00, steps = 200
22:55:44 [INFO] train episode 51: reward = -200.00, steps = 200
22:56:59 [INFO] train episode 52: reward = -200.00, steps = 200
22:58:16 [INFO] train episode 53: reward = -200.00, steps = 200
22:59:33 [INFO] train episode 54: reward = -200.00, steps = 200
23:01:09 [INFO] train episode 55: reward = -200.00, steps = 200
23:02:37 [INFO] train episode 56: reward = -200.00, steps = 200
23:04:08 [INFO] train episode 57: reward = -200.00, steps = 200
23:05:34 [INFO] train episode 58: reward = -200.00, steps = 200
23:06:59 [INFO] train episode 59: reward = -200.00, steps = 200
23:08:21 [INFO] train episode 60: reward = -200.00, steps = 200
23:09:46 [INFO] train episode 61: reward = -200.00, steps = 200
23:11:20 [INFO] train episode 62: reward = -200.00, steps = 200
23:13:39 [INFO] train episode 63: reward = -200.00, steps = 200
23:16:13 [INFO] train episode 64: reward = -200.00, steps = 200
23:18:45 [INFO] train episode 65: reward = -200.00, steps = 200
23:21:17 [INFO] train episode 66: reward = -200.00, steps = 200
23:23:54 [INFO] train episode 67: reward = -200.00, steps = 200
23:26:30 [INFO] train episode 68: reward = -200.00, steps = 200
23:29:03 [INFO] train episode 69: reward = -200.00, steps = 200
23:31:40 [INFO] train episode 70: reward = -200.00, steps = 200
23:34:10 [INFO] train episode 71: reward = -200.00, steps = 200
23:36:44 [INFO] train episode 72: reward = -200.00, steps = 200
23:39:23 [INFO] train episode 73: reward = -200.00, steps = 200
23:41:58 [INFO] train episode 74: reward = -200.00, steps = 200
23:44:52 [INFO] train episode 75: reward = -200.00, steps = 200
23:47:43 [INFO] train episode 76: reward = -200.00, steps = 200
23:50:38 [INFO] train episode 77: reward = -200.00, steps = 200
23:53:27 [INFO] train episode 78: reward = -200.00, steps = 200
23:56:03 [INFO] train episode 79: reward = -200.00, steps = 200
23:58:36 [INFO] train episode 80: reward = -200.00, steps = 200
00:01:10 [INFO] train episode 81: reward = -200.00, steps = 200
00:03:43 [INFO] train episode 82: reward = -200.00, steps = 200
00:06:14 [INFO] train episode 83: reward = -200.00, steps = 200
00:08:43 [INFO] train episode 84: reward = -200.00, steps = 200
00:11:15 [INFO] train episode 85: reward = -200.00, steps = 200
00:13:47 [INFO] train episode 86: reward = -200.00, steps = 200
00:16:17 [INFO] train episode 87: reward = -200.00, steps = 200
00:18:48 [INFO] train episode 88: reward = -200.00, steps = 200
00:21:19 [INFO] train episode 89: reward = -200.00, steps = 200
00:23:51 [INFO] train episode 90: reward = -200.00, steps = 200
00:26:22 [INFO] train episode 91: reward = -200.00, steps = 200
00:28:56 [INFO] train episode 92: reward = -200.00, steps = 200
00:31:25 [INFO] train episode 93: reward = -200.00, steps = 200
00:34:01 [INFO] train episode 94: reward = -200.00, steps = 200
00:36:30 [INFO] train episode 95: reward = -200.00, steps = 200
00:39:03 [INFO] train episode 96: reward = -200.00, steps = 200
00:41:28 [INFO] train episode 97: reward = -200.00, steps = 200
00:43:58 [INFO] train episode 98: reward = -200.00, steps = 200
00:46:27 [INFO] train episode 99: reward = -200.00, steps = 200
00:48:55 [INFO] train episode 100: reward = -200.00, steps = 200
00:51:24 [INFO] train episode 101: reward = -200.00, steps = 200
00:53:54 [INFO] train episode 102: reward = -200.00, steps = 200
00:56:23 [INFO] train episode 103: reward = -200.00, steps = 200
00:58:28 [INFO] train episode 104: reward = -200.00, steps = 200
01:00:56 [INFO] train episode 105: reward = -200.00, steps = 200
01:03:24 [INFO] train episode 106: reward = -200.00, steps = 200
01:05:52 [INFO] train episode 107: reward = -200.00, steps = 200
01:08:23 [INFO] train episode 108: reward = -200.00, steps = 200
01:10:53 [INFO] train episode 109: reward = -200.00, steps = 200
01:13:22 [INFO] train episode 110: reward = -200.00, steps = 200
01:15:52 [INFO] train episode 111: reward = -200.00, steps = 200
01:18:20 [INFO] train episode 112: reward = -200.00, steps = 200
01:20:49 [INFO] train episode 113: reward = -200.00, steps = 200
01:23:18 [INFO] train episode 114: reward = -200.00, steps = 200
01:25:46 [INFO] train episode 115: reward = -200.00, steps = 200
01:28:20 [INFO] train episode 116: reward = -200.00, steps = 200
01:30:50 [INFO] train episode 117: reward = -200.00, steps = 200
01:33:20 [INFO] train episode 118: reward = -200.00, steps = 200
01:35:48 [INFO] train episode 119: reward = -200.00, steps = 200
01:38:18 [INFO] train episode 120: reward = -200.00, steps = 200
01:40:48 [INFO] train episode 121: reward = -200.00, steps = 200
01:43:18 [INFO] train episode 122: reward = -200.00, steps = 200
01:45:47 [INFO] train episode 123: reward = -200.00, steps = 200
01:48:15 [INFO] train episode 124: reward = -200.00, steps = 200
01:50:44 [INFO] train episode 125: reward = -200.00, steps = 200
01:53:13 [INFO] train episode 126: reward = -200.00, steps = 200
01:55:44 [INFO] train episode 127: reward = -200.00, steps = 200
01:58:13 [INFO] train episode 128: reward = -200.00, steps = 200
02:00:41 [INFO] train episode 129: reward = -200.00, steps = 200
02:03:09 [INFO] train episode 130: reward = -200.00, steps = 200
02:05:37 [INFO] train episode 131: reward = -200.00, steps = 200
02:08:07 [INFO] train episode 132: reward = -200.00, steps = 200
02:10:37 [INFO] train episode 133: reward = -200.00, steps = 200
02:13:07 [INFO] train episode 134: reward = -200.00, steps = 200
02:15:35 [INFO] train episode 135: reward = -200.00, steps = 200
02:18:04 [INFO] train episode 136: reward = -200.00, steps = 200
02:20:33 [INFO] train episode 137: reward = -200.00, steps = 200
02:23:03 [INFO] train episode 138: reward = -200.00, steps = 200
02:25:33 [INFO] train episode 139: reward = -200.00, steps = 200
02:28:06 [INFO] train episode 140: reward = -200.00, steps = 200
02:30:33 [INFO] train episode 141: reward = -200.00, steps = 200
02:33:00 [INFO] train episode 142: reward = -200.00, steps = 200
02:35:28 [INFO] train episode 143: reward = -200.00, steps = 200
02:37:57 [INFO] train episode 144: reward = -200.00, steps = 200
02:40:11 [INFO] train episode 145: reward = -200.00, steps = 200
02:42:28 [INFO] train episode 146: reward = -200.00, steps = 200
02:44:55 [INFO] train episode 147: reward = -200.00, steps = 200
02:47:22 [INFO] train episode 148: reward = -200.00, steps = 200
02:49:50 [INFO] train episode 149: reward = -200.00, steps = 200
02:52:19 [INFO] train episode 150: reward = -200.00, steps = 200
02:54:50 [INFO] train episode 151: reward = -200.00, steps = 200
02:57:05 [INFO] train episode 152: reward = -200.00, steps = 200
02:59:02 [INFO] train episode 153: reward = -200.00, steps = 200
03:01:00 [INFO] train episode 154: reward = -200.00, steps = 200
03:02:58 [INFO] train episode 155: reward = -200.00, steps = 200
03:04:56 [INFO] train episode 156: reward = -200.00, steps = 200
03:06:54 [INFO] train episode 157: reward = -200.00, steps = 200
03:08:52 [INFO] train episode 158: reward = -200.00, steps = 200
03:10:50 [INFO] train episode 159: reward = -200.00, steps = 200
03:12:48 [INFO] train episode 160: reward = -200.00, steps = 200
03:14:45 [INFO] train episode 161: reward = -200.00, steps = 200
03:16:42 [INFO] train episode 162: reward = -200.00, steps = 200
03:18:38 [INFO] train episode 163: reward = -200.00, steps = 200
03:20:31 [INFO] train episode 164: reward = -200.00, steps = 200
03:22:23 [INFO] train episode 165: reward = -200.00, steps = 200
03:24:15 [INFO] train episode 166: reward = -200.00, steps = 200
03:26:07 [INFO] train episode 167: reward = -200.00, steps = 200
03:28:01 [INFO] train episode 168: reward = -200.00, steps = 200
03:29:52 [INFO] train episode 169: reward = -200.00, steps = 200
03:31:44 [INFO] train episode 170: reward = -200.00, steps = 200
03:33:34 [INFO] train episode 171: reward = -200.00, steps = 200
03:35:24 [INFO] train episode 172: reward = -200.00, steps = 200
03:37:14 [INFO] train episode 173: reward = -200.00, steps = 200
03:39:05 [INFO] train episode 174: reward = -200.00, steps = 200
03:40:57 [INFO] train episode 175: reward = -200.00, steps = 200
03:42:48 [INFO] train episode 176: reward = -200.00, steps = 200
03:44:38 [INFO] train episode 177: reward = -200.00, steps = 200
03:46:28 [INFO] train episode 178: reward = -200.00, steps = 200
03:48:18 [INFO] train episode 179: reward = -200.00, steps = 200
03:50:08 [INFO] train episode 180: reward = -200.00, steps = 200
03:51:59 [INFO] train episode 181: reward = -200.00, steps = 200
03:53:50 [INFO] train episode 182: reward = -200.00, steps = 200
03:55:41 [INFO] train episode 183: reward = -200.00, steps = 200
03:57:33 [INFO] train episode 184: reward = -200.00, steps = 200
03:59:23 [INFO] train episode 185: reward = -200.00, steps = 200
04:01:15 [INFO] train episode 186: reward = -200.00, steps = 200
04:03:04 [INFO] train episode 187: reward = -200.00, steps = 200
04:04:51 [INFO] train episode 188: reward = -200.00, steps = 200
04:06:39 [INFO] train episode 189: reward = -200.00, steps = 200
04:08:29 [INFO] train episode 190: reward = -200.00, steps = 200
04:10:29 [INFO] train episode 191: reward = -200.00, steps = 200
04:12:17 [INFO] train episode 192: reward = -200.00, steps = 200
04:14:07 [INFO] train episode 193: reward = -200.00, steps = 200
04:15:56 [INFO] train episode 194: reward = -200.00, steps = 200
04:17:44 [INFO] train episode 195: reward = -200.00, steps = 200
04:19:23 [INFO] train episode 196: reward = -200.00, steps = 200
04:20:48 [INFO] train episode 197: reward = -200.00, steps = 200
04:22:11 [INFO] train episode 198: reward = -200.00, steps = 200
04:23:34 [INFO] train episode 199: reward = -200.00, steps = 200
04:24:57 [INFO] train episode 200: reward = -200.00, steps = 200
04:26:19 [INFO] train episode 201: reward = -200.00, steps = 200
04:27:45 [INFO] train episode 202: reward = -200.00, steps = 200
04:29:08 [INFO] train episode 203: reward = -200.00, steps = 200
04:30:30 [INFO] train episode 204: reward = -200.00, steps = 200
04:31:49 [INFO] train episode 205: reward = -200.00, steps = 200
04:33:01 [INFO] train episode 206: reward = -200.00, steps = 200
04:34:13 [INFO] train episode 207: reward = -200.00, steps = 200
04:35:24 [INFO] train episode 208: reward = -200.00, steps = 200
04:36:34 [INFO] train episode 209: reward = -200.00, steps = 200
04:37:33 [INFO] train episode 210: reward = -200.00, steps = 200
04:38:32 [INFO] train episode 211: reward = -200.00, steps = 200
04:39:30 [INFO] train episode 212: reward = -200.00, steps = 200
04:40:29 [INFO] train episode 213: reward = -200.00, steps = 200
04:41:28 [INFO] train episode 214: reward = -200.00, steps = 200
04:42:27 [INFO] train episode 215: reward = -200.00, steps = 200
04:43:27 [INFO] train episode 216: reward = -200.00, steps = 200
04:44:26 [INFO] train episode 217: reward = -200.00, steps = 200
04:45:24 [INFO] train episode 218: reward = -200.00, steps = 200
04:46:23 [INFO] train episode 219: reward = -200.00, steps = 200
04:47:23 [INFO] train episode 220: reward = -200.00, steps = 200
04:48:22 [INFO] train episode 221: reward = -200.00, steps = 200
04:49:20 [INFO] train episode 222: reward = -200.00, steps = 200
04:50:14 [INFO] train episode 223: reward = -200.00, steps = 200
04:50:59 [INFO] train episode 224: reward = -200.00, steps = 200
04:51:44 [INFO] train episode 225: reward = -200.00, steps = 200
04:52:29 [INFO] train episode 226: reward = -200.00, steps = 200
04:53:14 [INFO] train episode 227: reward = -200.00, steps = 200
04:53:59 [INFO] train episode 228: reward = -200.00, steps = 200
04:54:42 [INFO] train episode 229: reward = -200.00, steps = 200
04:55:21 [INFO] train episode 230: reward = -200.00, steps = 200
04:55:58 [INFO] train episode 231: reward = -200.00, steps = 200
04:56:37 [INFO] train episode 232: reward = -200.00, steps = 200
04:57:15 [INFO] train episode 233: reward = -200.00, steps = 200
04:57:54 [INFO] train episode 234: reward = -200.00, steps = 200
04:58:33 [INFO] train episode 235: reward = -200.00, steps = 200
04:59:12 [INFO] train episode 236: reward = -200.00, steps = 200
04:59:41 [INFO] train episode 237: reward = -200.00, steps = 200
05:00:07 [INFO] train episode 238: reward = -200.00, steps = 200
05:00:29 [INFO] train episode 239: reward = -200.00, steps = 200
05:01:00 [INFO] train episode 240: reward = -200.00, steps = 200
05:01:39 [INFO] train episode 241: reward = -200.00, steps = 200
05:02:17 [INFO] train episode 242: reward = -200.00, steps = 200
05:02:56 [INFO] train episode 243: reward = -200.00, steps = 200
05:03:24 [INFO] train episode 244: reward = -200.00, steps = 200
05:04:02 [INFO] train episode 245: reward = -200.00, steps = 200
05:04:40 [INFO] train episode 246: reward = -200.00, steps = 200
05:05:18 [INFO] train episode 247: reward = -200.00, steps = 200
05:05:46 [INFO] train episode 248: reward = -200.00, steps = 200
05:06:26 [INFO] train episode 249: reward = -200.00, steps = 200
05:07:04 [INFO] train episode 250: reward = -200.00, steps = 200
05:07:33 [INFO] train episode 251: reward = -200.00, steps = 200
05:07:51 [INFO] train episode 252: reward = -200.00, steps = 200
05:08:16 [INFO] train episode 253: reward = -200.00, steps = 200
05:08:55 [INFO] train episode 254: reward = -200.00, steps = 200
05:09:20 [INFO] train episode 255: reward = -200.00, steps = 200
05:09:50 [INFO] train episode 256: reward = -200.00, steps = 200
05:10:09 [INFO] train episode 257: reward = -200.00, steps = 200
05:10:44 [INFO] train episode 258: reward = -200.00, steps = 200
05:11:23 [INFO] train episode 259: reward = -200.00, steps = 200
05:11:52 [INFO] train episode 260: reward = -200.00, steps = 200
05:12:30 [INFO] train episode 261: reward = -200.00, steps = 200
05:13:09 [INFO] train episode 262: reward = -200.00, steps = 200
05:13:47 [INFO] train episode 263: reward = -200.00, steps = 200
05:14:15 [INFO] train episode 264: reward = -200.00, steps = 200
05:14:45 [INFO] train episode 265: reward = -200.00, steps = 200
05:15:24 [INFO] train episode 266: reward = -200.00, steps = 200
05:15:49 [INFO] train episode 267: reward = -200.00, steps = 200
05:16:18 [INFO] train episode 268: reward = -200.00, steps = 200
05:16:56 [INFO] train episode 269: reward = -200.00, steps = 200
05:17:23 [INFO] train episode 270: reward = -200.00, steps = 200
05:17:54 [INFO] train episode 271: reward = -200.00, steps = 200
05:18:23 [INFO] train episode 272: reward = -200.00, steps = 200
05:19:01 [INFO] train episode 273: reward = -200.00, steps = 200
05:19:39 [INFO] train episode 274: reward = -200.00, steps = 200
05:20:18 [INFO] train episode 275: reward = -200.00, steps = 200
05:20:57 [INFO] train episode 276: reward = -200.00, steps = 200
05:21:36 [INFO] train episode 277: reward = -200.00, steps = 200
05:22:14 [INFO] train episode 278: reward = -200.00, steps = 200
05:22:53 [INFO] train episode 279: reward = -200.00, steps = 200
05:23:31 [INFO] train episode 280: reward = -200.00, steps = 200
05:24:09 [INFO] train episode 281: reward = -200.00, steps = 200
05:24:48 [INFO] train episode 282: reward = -200.00, steps = 200
05:25:14 [INFO] train episode 283: reward = -200.00, steps = 200
05:25:52 [INFO] train episode 284: reward = -200.00, steps = 200
05:26:31 [INFO] train episode 285: reward = -200.00, steps = 200
05:26:57 [INFO] train episode 286: reward = -200.00, steps = 200
05:27:34 [INFO] train episode 287: reward = -200.00, steps = 200
05:28:07 [INFO] train episode 288: reward = -200.00, steps = 200
05:28:42 [INFO] train episode 289: reward = -200.00, steps = 200
05:29:22 [INFO] train episode 290: reward = -200.00, steps = 200
05:29:56 [INFO] train episode 291: reward = -200.00, steps = 200
05:30:31 [INFO] train episode 292: reward = -200.00, steps = 200
05:31:04 [INFO] train episode 293: reward = -200.00, steps = 200
05:31:17 [INFO] train episode 294: reward = -200.00, steps = 200
05:31:29 [INFO] train episode 295: reward = -200.00, steps = 200
05:31:42 [INFO] train episode 296: reward = -200.00, steps = 200
05:31:54 [INFO] train episode 297: reward = -200.00, steps = 200
05:32:06 [INFO] train episode 298: reward = -200.00, steps = 200
05:32:19 [INFO] train episode 299: reward = -200.00, steps = 200
05:32:25 [INFO] train episode 300: reward = -92.00, steps = 92
05:32:37 [INFO] train episode 301: reward = -200.00, steps = 200
05:32:49 [INFO] train episode 302: reward = -200.00, steps = 200
05:33:08 [INFO] train episode 303: reward = -200.00, steps = 200
05:33:28 [INFO] train episode 304: reward = -200.00, steps = 200
05:33:49 [INFO] train episode 305: reward = -200.00, steps = 200
05:34:10 [INFO] train episode 306: reward = -200.00, steps = 200
05:34:28 [INFO] train episode 307: reward = -200.00, steps = 200
05:34:48 [INFO] train episode 308: reward = -200.00, steps = 200
05:35:08 [INFO] train episode 309: reward = -200.00, steps = 200
05:35:28 [INFO] train episode 310: reward = -200.00, steps = 200
05:35:46 [INFO] train episode 311: reward = -200.00, steps = 200
05:36:07 [INFO] train episode 312: reward = -200.00, steps = 200
05:36:27 [INFO] train episode 313: reward = -200.00, steps = 200
05:36:48 [INFO] train episode 314: reward = -200.00, steps = 200
05:37:09 [INFO] train episode 315: reward = -200.00, steps = 200
05:37:30 [INFO] train episode 316: reward = -200.00, steps = 200
05:37:50 [INFO] train episode 317: reward = -200.00, steps = 200
05:38:11 [INFO] train episode 318: reward = -200.00, steps = 200
05:38:31 [INFO] train episode 319: reward = -200.00, steps = 200
05:38:52 [INFO] train episode 320: reward = -200.00, steps = 200
05:39:14 [INFO] train episode 321: reward = -200.00, steps = 200
05:39:33 [INFO] train episode 322: reward = -200.00, steps = 200
05:39:54 [INFO] train episode 323: reward = -200.00, steps = 200
05:40:15 [INFO] train episode 324: reward = -200.00, steps = 200
05:40:35 [INFO] train episode 325: reward = -200.00, steps = 200
05:40:54 [INFO] train episode 326: reward = -200.00, steps = 200
05:41:14 [INFO] train episode 327: reward = -200.00, steps = 200
05:41:35 [INFO] train episode 328: reward = -200.00, steps = 200
05:41:55 [INFO] train episode 329: reward = -200.00, steps = 200
05:42:16 [INFO] train episode 330: reward = -200.00, steps = 200
05:42:36 [INFO] train episode 331: reward = -200.00, steps = 200
05:42:56 [INFO] train episode 332: reward = -200.00, steps = 200
05:43:16 [INFO] train episode 333: reward = -200.00, steps = 200
05:43:36 [INFO] train episode 334: reward = -200.00, steps = 200
05:43:56 [INFO] train episode 335: reward = -200.00, steps = 200
05:44:17 [INFO] train episode 336: reward = -200.00, steps = 200
05:44:38 [INFO] train episode 337: reward = -200.00, steps = 200
05:44:58 [INFO] train episode 338: reward = -200.00, steps = 200
05:45:18 [INFO] train episode 339: reward = -200.00, steps = 200
05:45:38 [INFO] train episode 340: reward = -200.00, steps = 200
05:45:59 [INFO] train episode 341: reward = -200.00, steps = 200
05:46:19 [INFO] train episode 342: reward = -200.00, steps = 200
05:46:40 [INFO] train episode 343: reward = -200.00, steps = 200
05:47:00 [INFO] train episode 344: reward = -200.00, steps = 200
05:47:20 [INFO] train episode 345: reward = -200.00, steps = 200
05:47:37 [INFO] train episode 346: reward = -200.00, steps = 200
05:47:47 [INFO] train episode 347: reward = -200.00, steps = 200
05:47:57 [INFO] train episode 348: reward = -200.00, steps = 200
05:48:07 [INFO] train episode 349: reward = -200.00, steps = 200
05:48:17 [INFO] train episode 350: reward = -200.00, steps = 200
05:48:26 [INFO] train episode 351: reward = -200.00, steps = 200
05:48:37 [INFO] train episode 352: reward = -200.00, steps = 200
05:48:46 [INFO] train episode 353: reward = -200.00, steps = 200
05:48:54 [INFO] train episode 354: reward = -200.00, steps = 200
05:49:02 [INFO] train episode 355: reward = -200.00, steps = 200
05:49:10 [INFO] train episode 356: reward = -200.00, steps = 200
05:49:17 [INFO] train episode 357: reward = -200.00, steps = 200
05:49:25 [INFO] train episode 358: reward = -200.00, steps = 200
05:49:30 [INFO] train episode 359: reward = -114.00, steps = 114
05:49:34 [INFO] train episode 360: reward = -113.00, steps = 113
05:49:38 [INFO] train episode 361: reward = -102.00, steps = 102
05:49:45 [INFO] train episode 362: reward = -178.00, steps = 178
05:49:49 [INFO] train episode 363: reward = -98.00, steps = 98
05:49:53 [INFO] train episode 364: reward = -94.00, steps = 94
05:50:01 [INFO] train episode 365: reward = -191.00, steps = 191
05:50:08 [INFO] train episode 366: reward = -174.00, steps = 174
05:50:16 [INFO] train episode 367: reward = -200.00, steps = 200
05:50:26 [INFO] train episode 368: reward = -200.00, steps = 200
05:50:34 [INFO] train episode 369: reward = -200.00, steps = 200
05:50:42 [INFO] train episode 370: reward = -200.00, steps = 200
05:50:50 [INFO] train episode 371: reward = -200.00, steps = 200
05:50:57 [INFO] train episode 372: reward = -165.00, steps = 165
05:51:04 [INFO] train episode 373: reward = -172.00, steps = 172
05:51:12 [INFO] train episode 374: reward = -181.00, steps = 181
05:51:20 [INFO] train episode 375: reward = -200.00, steps = 200
05:51:28 [INFO] train episode 376: reward = -200.00, steps = 200
05:51:37 [INFO] train episode 377: reward = -200.00, steps = 200
05:51:45 [INFO] train episode 378: reward = -200.00, steps = 200
05:51:53 [INFO] train episode 379: reward = -200.00, steps = 200
05:52:01 [INFO] train episode 380: reward = -183.00, steps = 183
05:52:09 [INFO] train episode 381: reward = -200.00, steps = 200
05:52:13 [INFO] train episode 382: reward = -93.00, steps = 93
05:52:17 [INFO] train episode 383: reward = -92.00, steps = 92
05:52:22 [INFO] train episode 384: reward = -108.00, steps = 108
05:52:25 [INFO] train episode 385: reward = -90.00, steps = 90
05:52:34 [INFO] train episode 386: reward = -200.00, steps = 200
05:52:38 [INFO] train episode 387: reward = -90.00, steps = 90
05:52:44 [INFO] train episode 388: reward = -156.00, steps = 156
05:52:52 [INFO] train episode 389: reward = -170.00, steps = 170
05:52:55 [INFO] train episode 390: reward = -91.00, steps = 91
05:53:02 [INFO] train episode 391: reward = -153.00, steps = 153
05:53:05 [INFO] train episode 392: reward = -88.00, steps = 88
05:53:11 [INFO] train episode 393: reward = -146.00, steps = 146
05:53:15 [INFO] train episode 394: reward = -85.00, steps = 85
05:53:19 [INFO] train episode 395: reward = -96.00, steps = 96
05:53:22 [INFO] train episode 396: reward = -86.00, steps = 86
05:53:28 [INFO] train episode 397: reward = -140.00, steps = 140
05:53:34 [INFO] train episode 398: reward = -146.00, steps = 146
05:53:41 [INFO] train episode 399: reward = -137.00, steps = 137
05:53:44 [INFO] train episode 400: reward = -85.00, steps = 85
05:53:50 [INFO] train episode 401: reward = -137.00, steps = 137
05:53:56 [INFO] train episode 402: reward = -145.00, steps = 145
05:53:59 [INFO] train episode 403: reward = -85.00, steps = 85
05:54:05 [INFO] train episode 404: reward = -142.00, steps = 142
05:54:11 [INFO] train episode 405: reward = -140.00, steps = 140
05:54:17 [INFO] train episode 406: reward = -137.00, steps = 137
05:54:22 [INFO] train episode 407: reward = -139.00, steps = 139
05:54:28 [INFO] train episode 408: reward = -140.00, steps = 140
05:54:32 [INFO] train episode 409: reward = -87.00, steps = 87
05:54:38 [INFO] train episode 410: reward = -149.00, steps = 149
05:54:44 [INFO] train episode 411: reward = -142.00, steps = 142
05:54:48 [INFO] train episode 412: reward = -103.00, steps = 103
05:54:55 [INFO] train episode 413: reward = -144.00, steps = 144
05:54:58 [INFO] train episode 414: reward = -92.00, steps = 92
05:55:03 [INFO] train episode 415: reward = -106.00, steps = 106
05:55:07 [INFO] train episode 416: reward = -92.00, steps = 92
05:55:11 [INFO] train episode 417: reward = -88.00, steps = 88
05:55:18 [INFO] train episode 418: reward = -149.00, steps = 149
05:55:22 [INFO] train episode 419: reward = -84.00, steps = 84
05:55:25 [INFO] train episode 420: reward = -95.00, steps = 95
05:55:25 [INFO] ==== test ====
05:55:26 [INFO] test episode 0: reward = -156.00, steps = 156
05:55:26 [INFO] test episode 1: reward = -94.00, steps = 94
05:55:26 [INFO] test episode 2: reward = -87.00, steps = 87
05:55:26 [INFO] test episode 3: reward = -147.00, steps = 147
05:55:26 [INFO] test episode 4: reward = -157.00, steps = 157
05:55:26 [INFO] test episode 5: reward = -146.00, steps = 146
05:55:26 [INFO] test episode 6: reward = -89.00, steps = 89
05:55:26 [INFO] test episode 7: reward = -85.00, steps = 85
05:55:26 [INFO] test episode 8: reward = -158.00, steps = 158
05:55:27 [INFO] test episode 9: reward = -146.00, steps = 146
05:55:27 [INFO] test episode 10: reward = -157.00, steps = 157
05:55:27 [INFO] test episode 11: reward = -146.00, steps = 146
05:55:27 [INFO] test episode 12: reward = -86.00, steps = 86
05:55:27 [INFO] test episode 13: reward = -149.00, steps = 149
05:55:27 [INFO] test episode 14: reward = -148.00, steps = 148
05:55:27 [INFO] test episode 15: reward = -86.00, steps = 86
05:55:27 [INFO] test episode 16: reward = -146.00, steps = 146
05:55:27 [INFO] test episode 17: reward = -87.00, steps = 87
05:55:27 [INFO] test episode 18: reward = -157.00, steps = 157
05:55:28 [INFO] test episode 19: reward = -148.00, steps = 148
05:55:28 [INFO] test episode 20: reward = -147.00, steps = 147
05:55:28 [INFO] test episode 21: reward = -98.00, steps = 98
05:55:28 [INFO] test episode 22: reward = -155.00, steps = 155
05:55:28 [INFO] test episode 23: reward = -146.00, steps = 146
05:55:28 [INFO] test episode 24: reward = -89.00, steps = 89
05:55:28 [INFO] test episode 25: reward = -157.00, steps = 157
05:55:28 [INFO] test episode 26: reward = -146.00, steps = 146
05:55:28 [INFO] test episode 27: reward = -88.00, steps = 88
05:55:29 [INFO] test episode 28: reward = -146.00, steps = 146
05:55:29 [INFO] test episode 29: reward = -146.00, steps = 146
05:55:29 [INFO] test episode 30: reward = -95.00, steps = 95
05:55:29 [INFO] test episode 31: reward = -86.00, steps = 86
05:55:29 [INFO] test episode 32: reward = -147.00, steps = 147
05:55:29 [INFO] test episode 33: reward = -91.00, steps = 91
05:55:29 [INFO] test episode 34: reward = -155.00, steps = 155
05:55:29 [INFO] test episode 35: reward = -86.00, steps = 86
05:55:29 [INFO] test episode 36: reward = -146.00, steps = 146
05:55:29 [INFO] test episode 37: reward = -146.00, steps = 146
05:55:30 [INFO] test episode 38: reward = -157.00, steps = 157
05:55:30 [INFO] test episode 39: reward = -90.00, steps = 90
05:55:30 [INFO] test episode 40: reward = -87.00, steps = 87
05:55:30 [INFO] test episode 41: reward = -89.00, steps = 89
05:55:30 [INFO] test episode 42: reward = -147.00, steps = 147
05:55:30 [INFO] test episode 43: reward = -147.00, steps = 147
05:55:30 [INFO] test episode 44: reward = -103.00, steps = 103
05:55:30 [INFO] test episode 45: reward = -146.00, steps = 146
05:55:30 [INFO] test episode 46: reward = -146.00, steps = 146
05:55:30 [INFO] test episode 47: reward = -155.00, steps = 155
05:55:31 [INFO] test episode 48: reward = -89.00, steps = 89
05:55:31 [INFO] test episode 49: reward = -159.00, steps = 159
05:55:31 [INFO] test episode 50: reward = -88.00, steps = 88
05:55:31 [INFO] test episode 51: reward = -98.00, steps = 98
05:55:31 [INFO] test episode 52: reward = -92.00, steps = 92
05:55:31 [INFO] test episode 53: reward = -145.00, steps = 145
05:55:31 [INFO] test episode 54: reward = -94.00, steps = 94
05:55:31 [INFO] test episode 55: reward = -156.00, steps = 156
05:55:31 [INFO] test episode 56: reward = -145.00, steps = 145
05:55:31 [INFO] test episode 57: reward = -90.00, steps = 90
05:55:32 [INFO] test episode 58: reward = -86.00, steps = 86
05:55:32 [INFO] test episode 59: reward = -98.00, steps = 98
05:55:32 [INFO] test episode 60: reward = -155.00, steps = 155
05:55:32 [INFO] test episode 61: reward = -158.00, steps = 158
05:55:32 [INFO] test episode 62: reward = -88.00, steps = 88
05:55:32 [INFO] test episode 63: reward = -146.00, steps = 146
05:55:32 [INFO] test episode 64: reward = -147.00, steps = 147
05:55:32 [INFO] test episode 65: reward = -156.00, steps = 156
05:55:32 [INFO] test episode 66: reward = -91.00, steps = 91
05:55:32 [INFO] test episode 67: reward = -85.00, steps = 85
05:55:33 [INFO] test episode 68: reward = -146.00, steps = 146
05:55:33 [INFO] test episode 69: reward = -146.00, steps = 146
05:55:33 [INFO] test episode 70: reward = -85.00, steps = 85
05:55:33 [INFO] test episode 71: reward = -86.00, steps = 86
05:55:33 [INFO] test episode 72: reward = -147.00, steps = 147
05:55:33 [INFO] test episode 73: reward = -86.00, steps = 86
05:55:33 [INFO] test episode 74: reward = -88.00, steps = 88
05:55:33 [INFO] test episode 75: reward = -158.00, steps = 158
05:55:33 [INFO] test episode 76: reward = -91.00, steps = 91
05:55:33 [INFO] test episode 77: reward = -156.00, steps = 156
05:55:34 [INFO] test episode 78: reward = -147.00, steps = 147
05:55:34 [INFO] test episode 79: reward = -147.00, steps = 147
05:55:34 [INFO] test episode 80: reward = -146.00, steps = 146
05:55:34 [INFO] test episode 81: reward = -89.00, steps = 89
05:55:34 [INFO] test episode 82: reward = -146.00, steps = 146
05:55:34 [INFO] test episode 83: reward = -146.00, steps = 146
05:55:34 [INFO] test episode 84: reward = -152.00, steps = 152
05:55:34 [INFO] test episode 85: reward = -87.00, steps = 87
05:55:34 [INFO] test episode 86: reward = -156.00, steps = 156
05:55:35 [INFO] test episode 87: reward = -158.00, steps = 158
05:55:35 [INFO] test episode 88: reward = -147.00, steps = 147
05:55:35 [INFO] test episode 89: reward = -156.00, steps = 156
05:55:35 [INFO] test episode 90: reward = -157.00, steps = 157
05:55:35 [INFO] test episode 91: reward = -153.00, steps = 153
05:55:35 [INFO] test episode 92: reward = -148.00, steps = 148
05:55:35 [INFO] test episode 93: reward = -147.00, steps = 147
05:55:35 [INFO] test episode 94: reward = -145.00, steps = 145
05:55:36 [INFO] test episode 95: reward = -147.00, steps = 147
05:55:36 [INFO] test episode 96: reward = -159.00, steps = 159
05:55:36 [INFO] test episode 97: reward = -89.00, steps = 89
05:55:36 [INFO] test episode 98: reward = -146.00, steps = 146
05:55:36 [INFO] test episode 99: reward = -147.00, steps = 147
05:55:36 [INFO] average episode reward = -127.82 ± 29.65
In [6]:
# Training and testing are finished: close the MountainCar-v0 environment
# that was created with gym.make() near the top of the notebook.
env.close()