Use Dueling DQN to Play MountainCar-v0

TensorFlow version
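
As a brief recap of the idea (matching what DuelNet.call below computes): the dueling network splits the state-action value into a state value and a mean-centered advantage,

$$q(s, a) = v(s) + a(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} a(s, a'),$$

where subtracting the mean advantage keeps the value/advantage decomposition identifiable.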

In [1]:
%matplotlib inline

import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
from tensorflow import nn
from tensorflow import losses
from tensorflow import optimizers
from tensorflow import keras
from tensorflow.keras import layers

logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
env = gym.make('MountainCar-v0')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
22:22:06 [INFO] env: <MountainCarEnv<MountainCar-v0>>
22:22:06 [INFO] action_space: Discrete(3)
22:22:06 [INFO] observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
22:22:06 [INFO] reward_range: (-inf, inf)
22:22:06 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30}
22:22:06 [INFO] _max_episode_steps: 200
22:22:06 [INFO] _elapsed_steps: None
22:22:06 [INFO] id: MountainCar-v0
22:22:06 [INFO] entry_point: gym.envs.classic_control:MountainCarEnv
22:22:06 [INFO] reward_threshold: -110.0
22:22:06 [INFO] nondeterministic: False
22:22:06 [INFO] max_episode_steps: 200
22:22:06 [INFO] _kwargs: {}
22:22:06 [INFO] _env_name: MountainCar
In [3]:
class DQNReplayer:
    def __init__(self, capacity):
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['state', 'action', 'reward', 'next_state', 'terminated'])
        self.i = 0
        self.count = 0
        self.capacity = capacity

    def store(self, *args):
        self.memory.loc[self.i] = np.asarray(args, dtype=object)
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.memory.columns)
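For illustration only (not part of the original training run; the capacity and transition values below are made up), this is how the replay buffer is meant to be used:

replayer = DQNReplayer(capacity=100)
# store one (state, action, reward, next_state, terminated) transition
replayer.store(np.zeros(2), 0, -1., np.ones(2), False)
# draw a batch; each returned array is stacked along the batch dimension
states, actions, rewards, next_states, terminateds = replayer.sample(1)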
In [4]:
class DuelNet(keras.Model):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.common_net = keras.Sequential([
                layers.Dense(64, input_shape=(input_size,), activation=nn.relu)])
        self.advantage_net = keras.Sequential([
                layers.Dense(32, input_shape=(64,), activation=nn.relu),
                layers.Dense(output_size)])
        self.v_net = keras.Sequential([
                layers.Dense(32, input_shape=(64,), activation=nn.relu),
                layers.Dense(1)])

    def call(self, s):
        h = self.common_net(s)
        adv = self.advantage_net(h)
        # center the advantages so the value/advantage split is identifiable
        adv = adv - tf.math.reduce_mean(adv, axis=1, keepdims=True)
        v = self.v_net(h)
        q = v + adv
        return q


class DuelDQNAgent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.gamma = 0.99

        self.replayer = DQNReplayer(10000)

        self.evaluate_net = self.build_net(
                input_size=env.observation_space.shape[0],
                output_size=self.action_n)
        self.target_net = self.build_net(
                input_size=env.observation_space.shape[0],
                output_size=self.action_n)

    def build_net(self, input_size, output_size):
        net = DuelNet(input_size=input_size, output_size=output_size)
        optimizer = optimizers.Adam(0.001)
        net.compile(loss=losses.mse, optimizer=optimizer)
        return net

    def reset(self, mode=None):
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.target_net.set_weights(self.evaluate_net.get_weights())

    def step(self, observation, reward, terminated):
        if self.mode == 'train' and np.random.rand() < 0.001:
            # epsilon-greedy policy in train mode
            action = np.random.randint(self.action_n)
        else:
            qs = self.evaluate_net.predict(observation[np.newaxis], verbose=0)
            action = np.argmax(qs)
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                state, _, _, act, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, terminated)
            if self.replayer.count >= self.replayer.capacity * 0.95:
                # learn only once the replay buffer is nearly full
                # (skips learning in the first few episodes, for speed)
                self.learn()
        return action

    def close(self):
        pass

    def learn(self):
        # replay
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(1024)

        # update value net: the evaluate net selects the greedy next action,
        # the target net evaluates it (Double DQN style target)
        next_eval_qs = self.evaluate_net.predict(next_states, verbose=0)
        next_actions = next_eval_qs.argmax(axis=-1)
        next_qs = self.target_net.predict(next_states, verbose=0)
        next_max_qs = next_qs[np.arange(next_qs.shape[0]), next_actions]
        us = rewards + self.gamma * next_max_qs * (1. - terminateds)
        targets = self.evaluate_net.predict(states, verbose=0)
        targets[np.arange(us.shape[0]), actions] = us
        self.evaluate_net.fit(states, targets, verbose=0)


agent = DuelDQNAgent(env)
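
In learn() above, the regression target for the sampled action can be written as (with $D$ the termination flag):

$$U = R + \gamma\,(1 - D)\, q_{\mathrm{target}}\!\left(S', \operatorname*{arg\,max}_{a'} q_{\mathrm{eval}}(S', a')\right)$$

The evaluate net selects the greedy next action while the target net scores it, as in Double DQN; the remaining entries of the target vector keep the evaluate net's own predictions, so only the taken action's value is regressed toward $U$.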
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps


logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
            mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > -110:
        break
plt.plot(episode_rewards)


logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
22:22:07 [INFO] ==== train ====
22:22:18 [INFO] train episode 0: reward = -200.00, steps = 200
22:22:32 [INFO] train episode 1: reward = -200.00, steps = 200
22:22:44 [INFO] train episode 2: reward = -200.00, steps = 200
22:22:59 [INFO] train episode 3: reward = -200.00, steps = 200
22:23:13 [INFO] train episode 4: reward = -200.00, steps = 200
22:23:27 [INFO] train episode 5: reward = -200.00, steps = 200
22:23:46 [INFO] train episode 6: reward = -200.00, steps = 200
22:24:08 [INFO] train episode 7: reward = -200.00, steps = 200
22:24:25 [INFO] train episode 8: reward = -200.00, steps = 200
22:24:42 [INFO] train episode 9: reward = -200.00, steps = 200
22:25:09 [INFO] train episode 10: reward = -200.00, steps = 200
22:25:26 [INFO] train episode 11: reward = -200.00, steps = 200
22:25:40 [INFO] train episode 12: reward = -200.00, steps = 200
22:25:54 [INFO] train episode 13: reward = -200.00, steps = 200
22:26:07 [INFO] train episode 14: reward = -200.00, steps = 200
22:26:20 [INFO] train episode 15: reward = -200.00, steps = 200
22:26:34 [INFO] train episode 16: reward = -200.00, steps = 200
22:26:49 [INFO] train episode 17: reward = -200.00, steps = 200
22:27:03 [INFO] train episode 18: reward = -200.00, steps = 200
22:27:16 [INFO] train episode 19: reward = -200.00, steps = 200
22:27:31 [INFO] train episode 20: reward = -200.00, steps = 200
22:27:46 [INFO] train episode 21: reward = -200.00, steps = 200
22:28:00 [INFO] train episode 22: reward = -200.00, steps = 200
22:28:15 [INFO] train episode 23: reward = -200.00, steps = 200
22:28:30 [INFO] train episode 24: reward = -200.00, steps = 200
22:28:44 [INFO] train episode 25: reward = -200.00, steps = 200
22:28:59 [INFO] train episode 26: reward = -200.00, steps = 200
22:29:13 [INFO] train episode 27: reward = -200.00, steps = 200
22:29:27 [INFO] train episode 28: reward = -200.00, steps = 200
22:29:40 [INFO] train episode 29: reward = -200.00, steps = 200
22:29:53 [INFO] train episode 30: reward = -200.00, steps = 200
22:30:06 [INFO] train episode 31: reward = -200.00, steps = 200
22:30:19 [INFO] train episode 32: reward = -200.00, steps = 200
22:30:37 [INFO] train episode 33: reward = -200.00, steps = 200
22:30:58 [INFO] train episode 34: reward = -200.00, steps = 200
22:31:12 [INFO] train episode 35: reward = -200.00, steps = 200
22:31:26 [INFO] train episode 36: reward = -200.00, steps = 200
22:31:41 [INFO] train episode 37: reward = -200.00, steps = 200
22:31:55 [INFO] train episode 38: reward = -200.00, steps = 200
22:32:12 [INFO] train episode 39: reward = -200.00, steps = 200
22:32:26 [INFO] train episode 40: reward = -200.00, steps = 200
22:32:40 [INFO] train episode 41: reward = -200.00, steps = 200
22:32:55 [INFO] train episode 42: reward = -200.00, steps = 200
22:33:12 [INFO] train episode 43: reward = -200.00, steps = 200
22:33:26 [INFO] train episode 44: reward = -200.00, steps = 200
22:33:40 [INFO] train episode 45: reward = -200.00, steps = 200
22:33:53 [INFO] train episode 46: reward = -200.00, steps = 200
22:34:29 [INFO] train episode 47: reward = -200.00, steps = 200
22:36:05 [INFO] train episode 48: reward = -200.00, steps = 200
22:38:02 [INFO] train episode 49: reward = -200.00, steps = 200
22:40:03 [INFO] train episode 50: reward = -200.00, steps = 200
22:42:20 [INFO] train episode 51: reward = -200.00, steps = 200
22:44:42 [INFO] train episode 52: reward = -200.00, steps = 200
22:47:03 [INFO] train episode 53: reward = -200.00, steps = 200
22:49:07 [INFO] train episode 54: reward = -200.00, steps = 200
22:51:20 [INFO] train episode 55: reward = -200.00, steps = 200
22:53:29 [INFO] train episode 56: reward = -200.00, steps = 200
22:55:46 [INFO] train episode 57: reward = -200.00, steps = 200
22:56:43 [INFO] train episode 58: reward = -87.00, steps = 87
22:58:53 [INFO] train episode 59: reward = -200.00, steps = 200
23:01:17 [INFO] train episode 60: reward = -200.00, steps = 200
23:03:43 [INFO] train episode 61: reward = -200.00, steps = 200
23:06:02 [INFO] train episode 62: reward = -200.00, steps = 200
23:08:17 [INFO] train episode 63: reward = -200.00, steps = 200
23:10:37 [INFO] train episode 64: reward = -200.00, steps = 200
23:13:29 [INFO] train episode 65: reward = -200.00, steps = 200
23:15:04 [INFO] train episode 66: reward = -104.00, steps = 104
23:18:01 [INFO] train episode 67: reward = -200.00, steps = 200
23:19:31 [INFO] train episode 68: reward = -101.00, steps = 101
23:22:29 [INFO] train episode 69: reward = -200.00, steps = 200
23:25:31 [INFO] train episode 70: reward = -200.00, steps = 200
23:28:31 [INFO] train episode 71: reward = -200.00, steps = 200
23:31:33 [INFO] train episode 72: reward = -200.00, steps = 200
23:34:29 [INFO] train episode 73: reward = -200.00, steps = 200
23:37:26 [INFO] train episode 74: reward = -200.00, steps = 200
23:40:30 [INFO] train episode 75: reward = -200.00, steps = 200
23:43:46 [INFO] train episode 76: reward = -200.00, steps = 200
23:45:09 [INFO] train episode 77: reward = -89.00, steps = 89
23:48:31 [INFO] train episode 78: reward = -200.00, steps = 200
23:51:46 [INFO] train episode 79: reward = -200.00, steps = 200
23:54:56 [INFO] train episode 80: reward = -200.00, steps = 200
23:57:55 [INFO] train episode 81: reward = -200.00, steps = 200
00:00:52 [INFO] train episode 82: reward = -200.00, steps = 200
00:03:49 [INFO] train episode 83: reward = -200.00, steps = 200
00:06:43 [INFO] train episode 84: reward = -200.00, steps = 200
00:09:39 [INFO] train episode 85: reward = -200.00, steps = 200
00:12:35 [INFO] train episode 86: reward = -200.00, steps = 200
00:15:31 [INFO] train episode 87: reward = -200.00, steps = 200
00:18:26 [INFO] train episode 88: reward = -200.00, steps = 200
00:21:22 [INFO] train episode 89: reward = -200.00, steps = 200
00:24:18 [INFO] train episode 90: reward = -200.00, steps = 200
00:27:18 [INFO] train episode 91: reward = -200.00, steps = 200
00:30:13 [INFO] train episode 92: reward = -200.00, steps = 200
00:33:14 [INFO] train episode 93: reward = -200.00, steps = 200
00:36:09 [INFO] train episode 94: reward = -200.00, steps = 200
00:39:06 [INFO] train episode 95: reward = -200.00, steps = 200
00:42:05 [INFO] train episode 96: reward = -200.00, steps = 200
00:45:00 [INFO] train episode 97: reward = -200.00, steps = 200
00:47:54 [INFO] train episode 98: reward = -200.00, steps = 200
00:50:48 [INFO] train episode 99: reward = -200.00, steps = 200
00:53:43 [INFO] train episode 100: reward = -200.00, steps = 200
00:56:39 [INFO] train episode 101: reward = -200.00, steps = 200
00:59:34 [INFO] train episode 102: reward = -200.00, steps = 200
01:02:29 [INFO] train episode 103: reward = -200.00, steps = 200
01:05:23 [INFO] train episode 104: reward = -200.00, steps = 200
01:08:18 [INFO] train episode 105: reward = -200.00, steps = 200
01:11:13 [INFO] train episode 106: reward = -200.00, steps = 200
01:14:08 [INFO] train episode 107: reward = -200.00, steps = 200
01:17:03 [INFO] train episode 108: reward = -200.00, steps = 200
01:19:56 [INFO] train episode 109: reward = -200.00, steps = 200
01:22:51 [INFO] train episode 110: reward = -200.00, steps = 200
01:25:44 [INFO] train episode 111: reward = -200.00, steps = 200
01:28:42 [INFO] train episode 112: reward = -200.00, steps = 200
01:31:37 [INFO] train episode 113: reward = -200.00, steps = 200
01:34:33 [INFO] train episode 114: reward = -200.00, steps = 200
01:37:02 [INFO] train episode 115: reward = -171.00, steps = 171
01:38:50 [INFO] train episode 116: reward = -122.00, steps = 122
01:41:46 [INFO] train episode 117: reward = -200.00, steps = 200
01:44:34 [INFO] train episode 118: reward = -191.00, steps = 191
01:47:05 [INFO] train episode 119: reward = -173.00, steps = 173
01:49:59 [INFO] train episode 120: reward = -200.00, steps = 200
01:52:54 [INFO] train episode 121: reward = -200.00, steps = 200
01:55:48 [INFO] train episode 122: reward = -200.00, steps = 200
01:58:35 [INFO] train episode 123: reward = -190.00, steps = 190
02:01:23 [INFO] train episode 124: reward = -192.00, steps = 192
02:04:16 [INFO] train episode 125: reward = -200.00, steps = 200
02:07:10 [INFO] train episode 126: reward = -200.00, steps = 200
02:10:05 [INFO] train episode 127: reward = -200.00, steps = 200
02:13:00 [INFO] train episode 128: reward = -200.00, steps = 200
02:15:54 [INFO] train episode 129: reward = -200.00, steps = 200
02:18:47 [INFO] train episode 130: reward = -200.00, steps = 200
02:21:41 [INFO] train episode 131: reward = -200.00, steps = 200
02:24:34 [INFO] train episode 132: reward = -200.00, steps = 200
02:27:30 [INFO] train episode 133: reward = -200.00, steps = 200
02:29:13 [INFO] train episode 134: reward = -117.00, steps = 117
02:30:45 [INFO] train episode 135: reward = -105.00, steps = 105
02:33:38 [INFO] train episode 136: reward = -200.00, steps = 200
02:35:18 [INFO] train episode 137: reward = -114.00, steps = 114
02:38:13 [INFO] train episode 138: reward = -200.00, steps = 200
02:39:58 [INFO] train episode 139: reward = -118.00, steps = 118
02:42:56 [INFO] train episode 140: reward = -200.00, steps = 200
02:44:47 [INFO] train episode 141: reward = -127.00, steps = 127
02:47:04 [INFO] train episode 142: reward = -157.00, steps = 157
02:48:44 [INFO] train episode 143: reward = -114.00, steps = 114
02:51:27 [INFO] train episode 144: reward = -185.00, steps = 185
02:53:04 [INFO] train episode 145: reward = -110.00, steps = 110
02:54:37 [INFO] train episode 146: reward = -106.00, steps = 106
02:56:13 [INFO] train episode 147: reward = -108.00, steps = 108
02:57:28 [INFO] train episode 148: reward = -96.00, steps = 96
02:58:37 [INFO] train episode 149: reward = -87.00, steps = 87
03:00:33 [INFO] train episode 150: reward = -150.00, steps = 150
03:02:10 [INFO] train episode 151: reward = -122.00, steps = 122
03:04:41 [INFO] train episode 152: reward = -194.00, steps = 194
03:06:02 [INFO] train episode 153: reward = -104.00, steps = 104
03:07:34 [INFO] train episode 154: reward = -116.00, steps = 116
03:10:09 [INFO] train episode 155: reward = -200.00, steps = 200
03:11:22 [INFO] train episode 156: reward = -93.00, steps = 93
03:13:18 [INFO] train episode 157: reward = -149.00, steps = 149
03:15:12 [INFO] train episode 158: reward = -147.00, steps = 147
03:17:47 [INFO] train episode 159: reward = -200.00, steps = 200
03:20:21 [INFO] train episode 160: reward = -200.00, steps = 200
03:21:27 [INFO] train episode 161: reward = -86.00, steps = 86
03:23:28 [INFO] train episode 162: reward = -161.00, steps = 161
03:24:35 [INFO] train episode 163: reward = -87.00, steps = 87
03:27:08 [INFO] train episode 164: reward = -200.00, steps = 200
03:29:14 [INFO] train episode 165: reward = -166.00, steps = 166
03:31:46 [INFO] train episode 166: reward = -200.00, steps = 200
03:32:53 [INFO] train episode 167: reward = -88.00, steps = 88
03:34:09 [INFO] train episode 168: reward = -100.00, steps = 100
03:36:39 [INFO] train episode 169: reward = -200.00, steps = 200
03:39:11 [INFO] train episode 170: reward = -200.00, steps = 200
03:41:41 [INFO] train episode 171: reward = -200.00, steps = 200
03:42:44 [INFO] train episode 172: reward = -83.00, steps = 83
03:44:30 [INFO] train episode 173: reward = -140.00, steps = 140
03:45:38 [INFO] train episode 174: reward = -90.00, steps = 90
03:47:33 [INFO] train episode 175: reward = -151.00, steps = 151
03:49:37 [INFO] train episode 176: reward = -163.00, steps = 163
03:50:42 [INFO] train episode 177: reward = -85.00, steps = 85
03:52:21 [INFO] train episode 178: reward = -131.00, steps = 131
03:53:45 [INFO] train episode 179: reward = -111.00, steps = 111
03:55:13 [INFO] train episode 180: reward = -115.00, steps = 115
03:56:49 [INFO] train episode 181: reward = -127.00, steps = 127
03:58:33 [INFO] train episode 182: reward = -136.00, steps = 136
04:00:03 [INFO] train episode 183: reward = -120.00, steps = 120
04:01:36 [INFO] train episode 184: reward = -123.00, steps = 123
04:03:08 [INFO] train episode 185: reward = -121.00, steps = 121
04:04:33 [INFO] train episode 186: reward = -113.00, steps = 113
04:06:02 [INFO] train episode 187: reward = -119.00, steps = 119
04:07:31 [INFO] train episode 188: reward = -118.00, steps = 118
04:08:58 [INFO] train episode 189: reward = -115.00, steps = 115
04:10:42 [INFO] train episode 190: reward = -124.00, steps = 124
04:12:09 [INFO] train episode 191: reward = -115.00, steps = 115
04:13:37 [INFO] train episode 192: reward = -116.00, steps = 116
04:15:06 [INFO] train episode 193: reward = -118.00, steps = 118
04:16:33 [INFO] train episode 194: reward = -115.00, steps = 115
04:17:55 [INFO] train episode 195: reward = -109.00, steps = 109
04:19:17 [INFO] train episode 196: reward = -111.00, steps = 111
04:20:35 [INFO] train episode 197: reward = -116.00, steps = 116
04:21:32 [INFO] train episode 198: reward = -84.00, steps = 84
04:23:29 [INFO] train episode 199: reward = -174.00, steps = 174
04:24:48 [INFO] train episode 200: reward = -119.00, steps = 119
04:26:01 [INFO] train episode 201: reward = -108.00, steps = 108
04:27:04 [INFO] train episode 202: reward = -92.00, steps = 92
04:28:14 [INFO] train episode 203: reward = -105.00, steps = 105
04:29:27 [INFO] train episode 204: reward = -109.00, steps = 109
04:30:36 [INFO] train episode 205: reward = -102.00, steps = 102
04:31:33 [INFO] train episode 206: reward = -85.00, steps = 85
04:31:33 [INFO] ==== test ====
04:31:44 [INFO] test episode 0: reward = -106.00, steps = 106
04:31:56 [INFO] test episode 1: reward = -106.00, steps = 106
04:32:07 [INFO] test episode 2: reward = -106.00, steps = 106
04:32:18 [INFO] test episode 3: reward = -106.00, steps = 106
04:32:28 [INFO] test episode 4: reward = -88.00, steps = 88
04:32:40 [INFO] test episode 5: reward = -105.00, steps = 105
04:32:53 [INFO] test episode 6: reward = -128.00, steps = 128
04:33:03 [INFO] test episode 7: reward = -90.00, steps = 90
04:33:14 [INFO] test episode 8: reward = -106.00, steps = 106
04:33:26 [INFO] test episode 9: reward = -104.00, steps = 104
04:33:40 [INFO] test episode 10: reward = -138.00, steps = 138
04:33:50 [INFO] test episode 11: reward = -87.00, steps = 87
04:34:02 [INFO] test episode 12: reward = -106.00, steps = 106
04:34:13 [INFO] test episode 13: reward = -106.00, steps = 106
04:34:24 [INFO] test episode 14: reward = -106.00, steps = 106
04:34:36 [INFO] test episode 15: reward = -105.00, steps = 105
04:34:47 [INFO] test episode 16: reward = -104.00, steps = 104
04:34:56 [INFO] test episode 17: reward = -91.00, steps = 91
04:35:08 [INFO] test episode 18: reward = -103.00, steps = 103
04:35:19 [INFO] test episode 19: reward = -107.00, steps = 107
04:35:30 [INFO] test episode 20: reward = -106.00, steps = 106
04:35:42 [INFO] test episode 21: reward = -106.00, steps = 106
04:35:53 [INFO] test episode 22: reward = -106.00, steps = 106
04:36:04 [INFO] test episode 23: reward = -106.00, steps = 106
04:36:16 [INFO] test episode 24: reward = -106.00, steps = 106
04:36:26 [INFO] test episode 25: reward = -91.00, steps = 91
04:36:38 [INFO] test episode 26: reward = -106.00, steps = 106
04:36:49 [INFO] test episode 27: reward = -106.00, steps = 106
04:36:59 [INFO] test episode 28: reward = -105.00, steps = 105
04:37:13 [INFO] test episode 29: reward = -135.00, steps = 135
04:37:24 [INFO] test episode 30: reward = -107.00, steps = 107
04:37:35 [INFO] test episode 31: reward = -105.00, steps = 105
04:37:46 [INFO] test episode 32: reward = -106.00, steps = 106
04:37:59 [INFO] test episode 33: reward = -128.00, steps = 128
04:38:10 [INFO] test episode 34: reward = -105.00, steps = 105
04:38:21 [INFO] test episode 35: reward = -107.00, steps = 107
04:38:32 [INFO] test episode 36: reward = -107.00, steps = 107
04:38:43 [INFO] test episode 37: reward = -105.00, steps = 105
04:38:53 [INFO] test episode 38: reward = -104.00, steps = 104
04:39:02 [INFO] test episode 39: reward = -88.00, steps = 88
04:39:13 [INFO] test episode 40: reward = -106.00, steps = 106
04:39:24 [INFO] test episode 41: reward = -104.00, steps = 104
04:39:35 [INFO] test episode 42: reward = -106.00, steps = 106
04:39:46 [INFO] test episode 43: reward = -105.00, steps = 105
04:39:57 [INFO] test episode 44: reward = -105.00, steps = 105
04:40:08 [INFO] test episode 45: reward = -105.00, steps = 105
04:40:18 [INFO] test episode 46: reward = -104.00, steps = 104
04:40:29 [INFO] test episode 47: reward = -105.00, steps = 105
04:40:40 [INFO] test episode 48: reward = -106.00, steps = 106
04:40:51 [INFO] test episode 49: reward = -104.00, steps = 104
04:41:01 [INFO] test episode 50: reward = -105.00, steps = 105
04:41:12 [INFO] test episode 51: reward = -106.00, steps = 106
04:41:23 [INFO] test episode 52: reward = -106.00, steps = 106
04:41:34 [INFO] test episode 53: reward = -106.00, steps = 106
04:41:45 [INFO] test episode 54: reward = -106.00, steps = 106
04:41:58 [INFO] test episode 55: reward = -128.00, steps = 128
04:42:09 [INFO] test episode 56: reward = -105.00, steps = 105
04:42:20 [INFO] test episode 57: reward = -105.00, steps = 105
04:42:31 [INFO] test episode 58: reward = -104.00, steps = 104
04:42:42 [INFO] test episode 59: reward = -104.00, steps = 104
04:42:55 [INFO] test episode 60: reward = -128.00, steps = 128
04:43:04 [INFO] test episode 61: reward = -88.00, steps = 88
04:43:15 [INFO] test episode 62: reward = -106.00, steps = 106
04:43:25 [INFO] test episode 63: reward = -87.00, steps = 87
04:43:36 [INFO] test episode 64: reward = -107.00, steps = 107
04:43:46 [INFO] test episode 65: reward = -105.00, steps = 105
04:43:57 [INFO] test episode 66: reward = -106.00, steps = 106
04:44:06 [INFO] test episode 67: reward = -88.00, steps = 88
04:44:17 [INFO] test episode 68: reward = -104.00, steps = 104
04:44:28 [INFO] test episode 69: reward = -105.00, steps = 105
04:44:42 [INFO] test episode 70: reward = -133.00, steps = 133
04:44:51 [INFO] test episode 71: reward = -92.00, steps = 92
04:45:02 [INFO] test episode 72: reward = -107.00, steps = 107
04:45:13 [INFO] test episode 73: reward = -107.00, steps = 107
04:45:24 [INFO] test episode 74: reward = -105.00, steps = 105
04:45:35 [INFO] test episode 75: reward = -105.00, steps = 105
04:45:46 [INFO] test episode 76: reward = -107.00, steps = 107
04:45:55 [INFO] test episode 77: reward = -87.00, steps = 87
04:46:06 [INFO] test episode 78: reward = -105.00, steps = 105
04:46:17 [INFO] test episode 79: reward = -106.00, steps = 106
04:46:28 [INFO] test episode 80: reward = -106.00, steps = 106
04:46:39 [INFO] test episode 81: reward = -105.00, steps = 105
04:46:50 [INFO] test episode 82: reward = -106.00, steps = 106
04:47:01 [INFO] test episode 83: reward = -104.00, steps = 104
04:47:10 [INFO] test episode 84: reward = -90.00, steps = 90
04:47:21 [INFO] test episode 85: reward = -107.00, steps = 107
04:47:32 [INFO] test episode 86: reward = -107.00, steps = 107
04:47:43 [INFO] test episode 87: reward = -103.00, steps = 103
04:47:54 [INFO] test episode 88: reward = -106.00, steps = 106
04:48:05 [INFO] test episode 89: reward = -106.00, steps = 106
04:48:18 [INFO] test episode 90: reward = -127.00, steps = 127
04:48:27 [INFO] test episode 91: reward = -88.00, steps = 88
04:48:40 [INFO] test episode 92: reward = -127.00, steps = 127
04:48:50 [INFO] test episode 93: reward = -88.00, steps = 88
04:49:00 [INFO] test episode 94: reward = -106.00, steps = 106
04:49:11 [INFO] test episode 95: reward = -105.00, steps = 105
04:49:22 [INFO] test episode 96: reward = -104.00, steps = 104
04:49:33 [INFO] test episode 97: reward = -107.00, steps = 107
04:49:42 [INFO] test episode 98: reward = -88.00, steps = 88
04:49:53 [INFO] test episode 99: reward = -105.00, steps = 105
04:49:53 [INFO] average episode reward = -105.21 ± 9.98
In [6]:
env.close()