Use Implicit Quantile Network to Play Pong

TensorFlow version

In [1]:
%matplotlib inline

import copy
import logging
import itertools
import sys

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
from gym.wrappers.atari_preprocessing import AtariPreprocessing
from gym.wrappers.frame_stack import FrameStack
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow import losses
from tensorflow.keras import layers
from tensorflow.keras import models

logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

Environment

In [2]:
env = gym.make('PongNoFrameskip-v4')
env = FrameStack(AtariPreprocessing(env), num_stack=4)
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
00:00:52 [INFO] env: <AtariPreprocessing<TimeLimit<AtariEnv<PongNoFrameskip-v4>>>>
00:00:52 [INFO] action_space: Discrete(6)
00:00:52 [INFO] observation_space: Box(0, 255, (4, 84, 84), uint8)
00:00:52 [INFO] reward_range: (-inf, inf)
00:00:52 [INFO] metadata: {'render.modes': ['human', 'rgb_array']}
00:00:52 [INFO] num_stack: 4
00:00:52 [INFO] lz4_compress: False
00:00:52 [INFO] frames: deque([], maxlen=4)
00:00:52 [INFO] id: PongNoFrameskip-v4
00:00:52 [INFO] entry_point: gym.envs.atari:AtariEnv
00:00:52 [INFO] reward_threshold: None
00:00:52 [INFO] nondeterministic: False
00:00:52 [INFO] max_episode_steps: 400000
00:00:52 [INFO] _kwargs: {'game': 'pong', 'obs_type': 'image', 'frameskip': 1}
00:00:52 [INFO] _env_name: PongNoFrameskip
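
The two wrappers give the standard DQN-style input: AtariPreprocessing turns each raw frame into an 84x84 grayscale image (with its default frame skipping), and FrameStack keeps the 4 most recent frames, so every observation has shape (4, 84, 84) with uint8 values in [0, 255]. A minimal sketch to confirm this, assuming the env built above (FrameStack returns a LazyFrames object, hence the conversion to an array):

observation, _ = env.reset(seed=0)
frames = np.array(observation)      # LazyFrames -> ndarray
print(frames.shape, frames.dtype)   # expected: (4, 84, 84) uint8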

Agent

In [3]:
class DQNReplayer:
    def __init__(self, capacity):
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['state', 'action', 'reward', 'next_state', 'terminated'])
        self.i = 0
        self.count = 0
        self.capacity = capacity

    def store(self, *args):
        self.memory.loc[self.i] = np.asarray(args, dtype=object)
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.memory.columns)
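
The replayer stores at most capacity transitions in a DataFrame, overwriting the oldest row once full, and sample draws a batch with replacement from the rows stored so far. A minimal usage sketch, assuming the imports and the class above (the stored arrays are placeholders):

replayer = DQNReplayer(capacity=4)
for _ in range(6):   # the 5th and 6th stores overwrite the oldest rows
    replayer.store(np.zeros((4, 84, 84), dtype=np.uint8), 0, 0.,
            np.zeros((4, 84, 84), dtype=np.uint8), False)
states, actions, rewards, next_states, terminateds = replayer.sample(size=2)
print(states.shape)   # expected: (2, 4, 84, 84)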
In [4]:
class Net(keras.Model):
    def __init__(self, action_n, sample_count, cosine_count):
        super().__init__()
        self.cosine_count = cosine_count
        self.conv = keras.Sequential([
                layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),
                layers.Conv2D(32, kernel_size=8, strides=4, activation=nn.relu),
                layers.Conv2D(64, kernel_size=4, strides=2, activation=nn.relu),
                layers.Conv2D(64, kernel_size=3, strides=1, activation=nn.relu),
                layers.Reshape((1, 3136))])
        self.emb = keras.Sequential([
                layers.Dense(3136, activation=nn.relu,
                input_shape=(sample_count, cosine_count))])
        self.fc = keras.Sequential([
                layers.Dense(512, activation=nn.relu),
                layers.Dense(action_n),
                layers.Permute((2, 1))])

    def call(self, input_tensor, cumprob_tensor):
        logit_tensor = self.conv(input_tensor)  # state features, shape (batch, 1, 3136)
        # cosine embedding of the sampled quantile fractions
        index_tensor = tf.range(1, self.cosine_count + 1, dtype=tf.float32)[
                np.newaxis, np.newaxis, :]
        cosine_tensor = tf.math.cos(index_tensor * np.pi * cumprob_tensor)
        emb_tensor = self.emb(cosine_tensor)  # shape (batch, sample_count, 3136)
        prod_tensor = logit_tensor * emb_tensor  # elementwise combination
        output_tensor = self.fc(prod_tensor)  # shape (batch, action_n, sample_count)
        return output_tensor
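
The network follows the IQN architecture: the convolutional stack maps the 4 stacked frames to a 3136-dimensional feature vector, each sampled quantile fraction tau is embedded through cos(i * pi * tau) for i = 1, ..., cosine_count followed by a dense layer, the features and the embedding are combined by elementwise multiplication, and the fully connected head outputs one quantile value per action and per sample. A minimal shape check, assuming the class above:

net = Net(action_n=6, sample_count=8, cosine_count=64)
state_tensor = tf.zeros((1, 4, 84, 84))
cumprob_tensor = tf.random.uniform((1, 8, 1))
print(net(state_tensor, cumprob_tensor).shape)   # expected: (1, 6, 8)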
In [5]:
class IQNAgent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.gamma = 0.99
        self.epsilon = 1.

        self.replayer = DQNReplayer(capacity=100000)

        self.sample_count = 8
        self.evaluate_net = self.build_net(action_n=self.action_n,
                sample_count=self.sample_count)
        self.target_net = self.build_net(action_n=self.action_n,
                sample_count=self.sample_count)

    def build_net(self, action_n, sample_count, cosine_count=64):
        net = Net(action_n, sample_count, cosine_count)
        loss = losses.Huber(reduction="none")
        optimizer = optimizers.Adam(0.0001)
        net.compile(loss=loss, optimizer=optimizer)
        return net

    def reset(self, mode=None):
        self.mode = mode
        if mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        state_tensor = tf.convert_to_tensor(np.array(observation)[np.newaxis],
                dtype=tf.float32)
        prob_tensor = tf.random.uniform((1, self.sample_count, 1))
        q_component_tensor = self.evaluate_net(state_tensor, prob_tensor)
        q_tensor = tf.reduce_mean(q_component_tensor, axis=2)
        action_tensor = tf.math.argmax(q_tensor, axis=1)
        actions = action_tensor.numpy()
        action = actions[0]
        if self.mode == 'train':
            if np.random.rand() < self.epsilon:
                action = np.random.randint(0, self.action_n)
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # the last 8 entries span two consecutive steps:
                # (state, _, _, action, next_state, reward, terminated, _)
                state, _, _, act, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, terminated)
            if self.replayer.count >= 1024 and self.replayer.count % 10 == 0:
                self.learn()
        return action

    def close(self):
        pass

    def update_net(self, target_net, evaluate_net, learning_rate=0.005):
        average_weights = [(1. - learning_rate) * t + learning_rate * e for t, e
                in zip(target_net.get_weights(), evaluate_net.get_weights())]
        target_net.set_weights(average_weights)

    def learn(self):
        # replay
        batch_size = 32
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(batch_size)
        state_tensor = tf.convert_to_tensor(states, dtype=tf.float32)
        reward_tensor = tf.convert_to_tensor(rewards[:, np.newaxis],
                dtype=tf.float32)
        terminated_tensor = tf.convert_to_tensor(terminateds[:, np.newaxis],
                dtype=tf.float32)
        next_state_tensor = tf.convert_to_tensor(next_states, dtype=tf.float32)

        # calculate target
        next_cumprob_tensor = tf.random.uniform((batch_size, self.sample_count, 1))
        next_q_component_tensor = self.evaluate_net(next_state_tensor,
                next_cumprob_tensor)
        next_q_tensor = tf.reduce_mean(next_q_component_tensor, axis=2)
        next_action_tensor = tf.math.argmax(next_q_tensor, axis=1)
        next_actions = next_action_tensor.numpy()
        next_cumprob_tensor = tf.random.uniform((batch_size, self.sample_count, 1))
        all_next_q_quantile_tensor = self.target_net(next_state_tensor,
                next_cumprob_tensor)
        indices = [[idx, next_action] for idx, next_action in
                enumerate(next_actions)]
        next_q_quantile_tensor = tf.gather_nd(all_next_q_quantile_tensor,
                indices)
        target_quantile_tensor = reward_tensor + self.gamma \
                * next_q_quantile_tensor * (1. - terminated_tensor)

        with tf.GradientTape() as tape:
            cumprob_tensor = tf.random.uniform((batch_size,
                    self.sample_count, 1))
            all_q_quantile_tensor = self.evaluate_net(state_tensor,
                    cumprob_tensor)
            indices = [[idx, action] for idx, action in enumerate(actions)]
            q_quantile_tensor = tf.gather_nd(all_q_quantile_tensor, indices)
            target_quantile_tensor = target_quantile_tensor[:, np.newaxis, :]
            q_quantile_tensor = q_quantile_tensor[:, :, np.newaxis]
            td_error_tensor = target_quantile_tensor - q_quantile_tensor
            abs_td_error_tensor = tf.math.abs(td_error_tensor)
            # Huber loss of the pairwise TD errors (delta = 1)
            huber_delta = 1.
            huber_loss_tensor = tf.where(abs_td_error_tensor < huber_delta,
                    0.5 * tf.square(td_error_tensor),
                    huber_delta * (abs_td_error_tensor - 0.5 * huber_delta))
            # quantile regression weight |tau - 1{td_error < 0}|
            comparison_tensor = tf.cast(td_error_tensor < 0, dtype=tf.float32)
            quantile_regression_tensor = tf.math.abs(cumprob_tensor -
                    comparison_tensor)
            # sum over target samples, average over evaluation samples
            quantile_huber_loss_tensor = tf.reduce_mean(tf.reduce_sum(
                    huber_loss_tensor * quantile_regression_tensor, axis=-1),
                    axis=1)
            loss_tensor = tf.reduce_mean(quantile_huber_loss_tensor)
        grads = tape.gradient(loss_tensor, self.evaluate_net.variables)
        self.evaluate_net.optimizer.apply_gradients(
                zip(grads, self.evaluate_net.variables))

        self.update_net(self.target_net, self.evaluate_net)

        self.epsilon = max(self.epsilon - 1e-5, 0.05)


agent = IQNAgent(env)
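
In learn(), the next action is selected with the evaluate network (a double-DQN style choice) while the target network supplies the quantile values for the target, and the training objective is the quantile Huber loss of IQN: for each sampled fraction tau_i and each pairwise TD error delta_ij = target_j - prediction_i, the per-pair loss is |tau_i - 1{delta_ij < 0}| * Huber_1(delta_ij), summed over the target samples j and averaged over the evaluation samples i and the batch. A standalone toy sketch of that pairwise computation, with hand-picked values (the names mirror the learn() code, the numbers are illustrative):

cumprob_tensor = tf.constant([[[0.25], [0.75]]])             # (1, 2, 1)
td_error_tensor = tf.constant([[[0.5, -1.5], [2.0, -0.5]]])  # (1, 2, 2)
abs_td_error_tensor = tf.math.abs(td_error_tensor)
huber_loss_tensor = tf.where(abs_td_error_tensor < 1.,
        0.5 * tf.square(td_error_tensor), abs_td_error_tensor - 0.5)
quantile_regression_tensor = tf.math.abs(cumprob_tensor -
        tf.cast(td_error_tensor < 0, tf.float32))
loss_tensor = tf.reduce_mean(tf.reduce_sum(
        huber_loss_tensor * quantile_regression_tensor, axis=-1), axis=1)
print(loss_tensor.numpy())   # expected: [0.96875]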

Train & Test

In [6]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps


logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-5:]) > 16.:
        break
plt.plot(episode_rewards)


logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
00:00:53 [INFO] ==== train ====
00:01:17 [INFO] train episode 0: reward = -18.00, steps = 1208
00:01:51 [INFO] train episode 1: reward = -19.00, steps = 981
00:02:24 [INFO] train episode 2: reward = -21.00, steps = 940
00:02:53 [INFO] train episode 3: reward = -21.00, steps = 819
00:03:30 [INFO] train episode 4: reward = -19.00, steps = 1037
00:04:02 [INFO] train episode 5: reward = -21.00, steps = 875
00:04:33 [INFO] train episode 6: reward = -21.00, steps = 851
00:05:07 [INFO] train episode 7: reward = -21.00, steps = 939
00:05:39 [INFO] train episode 8: reward = -20.00, steps = 875
00:06:10 [INFO] train episode 9: reward = -21.00, steps = 853
00:06:41 [INFO] train episode 10: reward = -21.00, steps = 852
00:07:15 [INFO] train episode 11: reward = -21.00, steps = 940
00:07:45 [INFO] train episode 12: reward = -21.00, steps = 848
00:08:14 [INFO] train episode 13: reward = -21.00, steps = 806
00:08:44 [INFO] train episode 14: reward = -20.00, steps = 836
00:09:17 [INFO] train episode 15: reward = -21.00, steps = 939
00:09:51 [INFO] train episode 16: reward = -19.00, steps = 931
00:10:20 [INFO] train episode 17: reward = -21.00, steps = 791
00:10:57 [INFO] train episode 18: reward = -20.00, steps = 1029
00:11:29 [INFO] train episode 19: reward = -20.00, steps = 899
00:12:06 [INFO] train episode 20: reward = -19.00, steps = 1022
00:12:37 [INFO] train episode 21: reward = -20.00, steps = 862
00:13:08 [INFO] train episode 22: reward = -21.00, steps = 838
00:13:40 [INFO] train episode 23: reward = -21.00, steps = 869
00:14:12 [INFO] train episode 24: reward = -20.00, steps = 891
00:14:43 [INFO] train episode 25: reward = -20.00, steps = 837
00:15:33 [INFO] train episode 26: reward = -18.00, steps = 1216
00:16:05 [INFO] train episode 27: reward = -20.00, steps = 878
00:16:44 [INFO] train episode 28: reward = -19.00, steps = 1036
00:17:21 [INFO] train episode 29: reward = -19.00, steps = 996
00:17:53 [INFO] train episode 30: reward = -21.00, steps = 851
00:18:31 [INFO] train episode 31: reward = -20.00, steps = 1007
00:19:05 [INFO] train episode 32: reward = -21.00, steps = 848
00:19:39 [INFO] train episode 33: reward = -21.00, steps = 926
00:20:12 [INFO] train episode 34: reward = -21.00, steps = 900
00:20:44 [INFO] train episode 35: reward = -21.00, steps = 868
00:21:30 [INFO] train episode 36: reward = -18.00, steps = 1177
00:22:07 [INFO] train episode 37: reward = -20.00, steps = 1007
00:22:46 [INFO] train episode 38: reward = -19.00, steps = 1002
00:23:19 [INFO] train episode 39: reward = -21.00, steps = 884
00:23:52 [INFO] train episode 40: reward = -21.00, steps = 880
00:24:25 [INFO] train episode 41: reward = -20.00, steps = 895
00:25:05 [INFO] train episode 42: reward = -19.00, steps = 1076
00:25:41 [INFO] train episode 43: reward = -21.00, steps = 968
00:26:10 [INFO] train episode 44: reward = -21.00, steps = 763
00:26:40 [INFO] train episode 45: reward = -21.00, steps = 822
00:27:16 [INFO] train episode 46: reward = -20.00, steps = 956
00:27:52 [INFO] train episode 47: reward = -20.00, steps = 957
00:28:25 [INFO] train episode 48: reward = -21.00, steps = 882
00:28:58 [INFO] train episode 49: reward = -21.00, steps = 881
00:29:35 [INFO] train episode 50: reward = -19.00, steps = 936
00:30:11 [INFO] train episode 51: reward = -20.00, steps = 963
00:30:45 [INFO] train episode 52: reward = -21.00, steps = 895
00:31:23 [INFO] train episode 53: reward = -20.00, steps = 976
00:31:58 [INFO] train episode 54: reward = -20.00, steps = 914
00:32:34 [INFO] train episode 55: reward = -20.00, steps = 929
00:33:09 [INFO] train episode 56: reward = -21.00, steps = 905
00:33:40 [INFO] train episode 57: reward = -21.00, steps = 785
00:34:18 [INFO] train episode 58: reward = -20.00, steps = 963
00:34:54 [INFO] train episode 59: reward = -21.00, steps = 911
00:35:39 [INFO] train episode 60: reward = -20.00, steps = 1123
00:36:12 [INFO] train episode 61: reward = -20.00, steps = 839
00:36:47 [INFO] train episode 62: reward = -21.00, steps = 843
00:37:20 [INFO] train episode 63: reward = -21.00, steps = 822
00:38:01 [INFO] train episode 64: reward = -19.00, steps = 982
00:38:35 [INFO] train episode 65: reward = -21.00, steps = 830
00:39:26 [INFO] train episode 66: reward = -19.00, steps = 1224
00:40:03 [INFO] train episode 67: reward = -21.00, steps = 884
00:40:36 [INFO] train episode 68: reward = -21.00, steps = 787
00:41:13 [INFO] train episode 69: reward = -21.00, steps = 878
00:41:55 [INFO] train episode 70: reward = -19.00, steps = 1002
00:42:32 [INFO] train episode 71: reward = -21.00, steps = 848
00:43:11 [INFO] train episode 72: reward = -20.00, steps = 915
00:43:52 [INFO] train episode 73: reward = -21.00, steps = 841
00:44:33 [INFO] train episode 74: reward = -21.00, steps = 824
00:45:19 [INFO] train episode 75: reward = -21.00, steps = 911
00:46:01 [INFO] train episode 76: reward = -21.00, steps = 817
00:46:54 [INFO] train episode 77: reward = -19.00, steps = 1057
00:47:47 [INFO] train episode 78: reward = -18.00, steps = 1042
00:48:32 [INFO] train episode 79: reward = -21.00, steps = 867
00:49:17 [INFO] train episode 80: reward = -21.00, steps = 868
00:50:02 [INFO] train episode 81: reward = -20.00, steps = 894
00:50:59 [INFO] train episode 82: reward = -21.00, steps = 1091
00:51:49 [INFO] train episode 83: reward = -21.00, steps = 955
00:52:32 [INFO] train episode 84: reward = -21.00, steps = 808
00:53:27 [INFO] train episode 85: reward = -20.00, steps = 1041
00:54:14 [INFO] train episode 86: reward = -21.00, steps = 881
00:55:02 [INFO] train episode 87: reward = -20.00, steps = 900
00:55:52 [INFO] train episode 88: reward = -20.00, steps = 923
00:56:34 [INFO] train episode 89: reward = -21.00, steps = 788
00:57:20 [INFO] train episode 90: reward = -21.00, steps = 851
00:58:04 [INFO] train episode 91: reward = -21.00, steps = 820
00:58:58 [INFO] train episode 92: reward = -20.00, steps = 985
00:59:48 [INFO] train episode 93: reward = -21.00, steps = 911
01:00:38 [INFO] train episode 94: reward = -21.00, steps = 925
01:01:30 [INFO] train episode 95: reward = -21.00, steps = 932
01:02:20 [INFO] train episode 96: reward = -20.00, steps = 897
01:03:23 [INFO] train episode 97: reward = -19.00, steps = 1122
01:04:11 [INFO] train episode 98: reward = -21.00, steps = 817
01:05:00 [INFO] train episode 99: reward = -21.00, steps = 825
01:06:01 [INFO] train episode 100: reward = -20.00, steps = 1022
01:06:55 [INFO] train episode 101: reward = -21.00, steps = 913
01:07:48 [INFO] train episode 102: reward = -20.00, steps = 884
01:08:46 [INFO] train episode 103: reward = -19.00, steps = 960
01:09:49 [INFO] train episode 104: reward = -20.00, steps = 1039
01:10:40 [INFO] train episode 105: reward = -21.00, steps = 852
01:11:33 [INFO] train episode 106: reward = -21.00, steps = 866
01:12:40 [INFO] train episode 107: reward = -21.00, steps = 1094
01:15:46 [INFO] train episode 108: reward = -21.00, steps = 791
01:22:24 [INFO] train episode 109: reward = -20.00, steps = 999
01:27:41 [INFO] train episode 110: reward = -21.00, steps = 789
01:35:42 [INFO] train episode 111: reward = -20.00, steps = 1164
01:42:33 [INFO] train episode 112: reward = -20.00, steps = 975
01:50:00 [INFO] train episode 113: reward = -19.00, steps = 1038
01:56:35 [INFO] train episode 114: reward = -21.00, steps = 910
02:02:34 [INFO] train episode 115: reward = -21.00, steps = 818
02:09:58 [INFO] train episode 116: reward = -21.00, steps = 1036
02:17:31 [INFO] train episode 117: reward = -20.00, steps = 1064
02:23:28 [INFO] train episode 118: reward = -21.00, steps = 841
02:30:00 [INFO] train episode 119: reward = -21.00, steps = 929
02:37:19 [INFO] train episode 120: reward = -20.00, steps = 1026
02:45:27 [INFO] train episode 121: reward = -19.00, steps = 1130
02:54:23 [INFO] train episode 122: reward = -19.00, steps = 1232
03:01:08 [INFO] train episode 123: reward = -21.00, steps = 917
03:09:07 [INFO] train episode 124: reward = -20.00, steps = 1082
03:16:12 [INFO] train episode 125: reward = -20.00, steps = 956
03:24:09 [INFO] train episode 126: reward = -19.00, steps = 1067
03:31:37 [INFO] train episode 127: reward = -21.00, steps = 991
03:40:52 [INFO] train episode 128: reward = -17.00, steps = 1217
03:49:54 [INFO] train episode 129: reward = -19.00, steps = 1182
04:00:15 [INFO] train episode 130: reward = -20.00, steps = 1351
04:10:54 [INFO] train episode 131: reward = -19.00, steps = 1376
04:18:58 [INFO] train episode 132: reward = -21.00, steps = 1033
04:29:32 [INFO] train episode 133: reward = -20.00, steps = 1346
04:40:01 [INFO] train episode 134: reward = -20.00, steps = 1326
04:50:47 [INFO] train episode 135: reward = -18.00, steps = 1346
05:01:32 [INFO] train episode 136: reward = -18.00, steps = 1319
05:10:00 [INFO] train episode 137: reward = -20.00, steps = 1050
05:20:00 [INFO] train episode 138: reward = -21.00, steps = 1234
05:33:39 [INFO] train episode 139: reward = -17.00, steps = 1664
05:46:50 [INFO] train episode 140: reward = -15.00, steps = 1591
05:59:29 [INFO] train episode 141: reward = -15.00, steps = 1513
06:09:02 [INFO] train episode 142: reward = -20.00, steps = 1131
06:21:46 [INFO] train episode 143: reward = -19.00, steps = 1494
06:31:41 [INFO] train episode 144: reward = -20.00, steps = 1170
06:45:19 [INFO] train episode 145: reward = -19.00, steps = 1593
06:58:50 [INFO] train episode 146: reward = -16.00, steps = 1558
07:15:10 [INFO] train episode 147: reward = -15.00, steps = 1864
07:27:11 [INFO] train episode 148: reward = -17.00, steps = 1353
07:43:16 [INFO] train episode 149: reward = -16.00, steps = 1791
08:00:18 [INFO] train episode 150: reward = -17.00, steps = 1872
08:13:27 [INFO] train episode 151: reward = -20.00, steps = 1430
08:28:15 [INFO] train episode 152: reward = -17.00, steps = 1591
08:41:53 [INFO] train episode 153: reward = -18.00, steps = 1453
08:56:28 [INFO] train episode 154: reward = -20.00, steps = 1533
09:11:17 [INFO] train episode 155: reward = -20.00, steps = 1546
09:25:55 [INFO] train episode 156: reward = -18.00, steps = 1514
09:42:44 [INFO] train episode 157: reward = -18.00, steps = 1619
10:00:54 [INFO] train episode 158: reward = -16.00, steps = 1828
10:20:00 [INFO] train episode 159: reward = -16.00, steps = 1890
10:39:15 [INFO] train episode 160: reward = -16.00, steps = 1831
10:59:03 [INFO] train episode 161: reward = -19.00, steps = 1765
11:24:36 [INFO] train episode 162: reward = -15.00, steps = 2275
11:44:55 [INFO] train episode 163: reward = -14.00, steps = 1774
12:09:16 [INFO] train episode 164: reward = -12.00, steps = 2113
12:32:19 [INFO] train episode 165: reward = -14.00, steps = 2024
12:56:01 [INFO] train episode 166: reward = -17.00, steps = 2057
13:20:53 [INFO] train episode 167: reward = -13.00, steps = 2128
13:48:35 [INFO] train episode 168: reward = -9.00, steps = 2337
14:05:50 [INFO] train episode 169: reward = -19.00, steps = 1436
14:32:56 [INFO] train episode 170: reward = -13.00, steps = 2220
15:02:40 [INFO] train episode 171: reward = -14.00, steps = 2309
15:38:23 [INFO] train episode 172: reward = -4.00, steps = 2789
16:08:33 [INFO] train episode 173: reward = -10.00, steps = 2316
16:42:58 [INFO] train episode 174: reward = -7.00, steps = 2641
17:22:27 [INFO] train episode 175: reward = -5.00, steps = 2895
17:57:45 [INFO] train episode 176: reward = -13.00, steps = 2493
18:41:00 [INFO] train episode 177: reward = -4.00, steps = 3102
19:26:31 [INFO] train episode 178: reward = -4.00, steps = 3180
19:53:44 [INFO] train episode 179: reward = -13.00, steps = 1860
20:09:31 [INFO] train episode 180: reward = -19.00, steps = 1074
20:47:31 [INFO] train episode 181: reward = -7.00, steps = 2388
21:29:32 [INFO] train episode 182: reward = -3.00, steps = 2657
22:12:37 [INFO] train episode 183: reward = -3.00, steps = 2618
23:05:50 [INFO] train episode 184: reward = -2.00, steps = 2873
23:46:56 [INFO] train episode 185: reward = -7.00, steps = 2349
00:22:10 [INFO] train episode 186: reward = -16.00, steps = 1689
00:42:53 [INFO] train episode 187: reward = -20.00, steps = 836
01:13:55 [INFO] train episode 188: reward = -12.00, steps = 1716
01:33:40 [INFO] train episode 189: reward = -19.00, steps = 1102
02:30:10 [INFO] train episode 190: reward = -2.00, steps = 3123
03:18:06 [INFO] train episode 191: reward = -4.00, steps = 2569
03:54:56 [INFO] train episode 192: reward = -9.00, steps = 1949
04:22:56 [INFO] train episode 193: reward = -16.00, steps = 1469
04:45:26 [INFO] train episode 194: reward = -18.00, steps = 1153
05:42:10 [INFO] train episode 195: reward = -2.00, steps = 2807
06:13:27 [INFO] train episode 196: reward = -15.00, steps = 1446
07:08:39 [INFO] train episode 197: reward = -7.00, steps = 2641
07:59:03 [INFO] train episode 198: reward = -8.00, steps = 2357
08:38:35 [INFO] train episode 199: reward = -13.00, steps = 1801
09:30:34 [INFO] train episode 200: reward = -7.00, steps = 2415
10:33:12 [INFO] train episode 201: reward = 3.00, steps = 2829
11:25:00 [INFO] train episode 202: reward = -7.00, steps = 2322
12:25:23 [INFO] train episode 203: reward = -2.00, steps = 2653
13:34:15 [INFO] train episode 204: reward = -1.00, steps = 3036
14:40:30 [INFO] train episode 205: reward = -2.00, steps = 2790
15:49:57 [INFO] train episode 206: reward = -3.00, steps = 2863
16:56:35 [INFO] train episode 207: reward = -5.00, steps = 2679
18:08:09 [INFO] train episode 208: reward = -2.00, steps = 2978
18:45:50 [INFO] train episode 209: reward = -17.00, steps = 1557
19:20:15 [INFO] train episode 210: reward = -16.00, steps = 1410
20:10:49 [INFO] train episode 211: reward = -13.00, steps = 1788
20:59:06 [INFO] train episode 212: reward = -11.00, steps = 1752
22:03:31 [INFO] train episode 213: reward = -7.00, steps = 2242
23:18:04 [INFO] train episode 214: reward = -8.00, steps = 2478
23:52:46 [INFO] train episode 215: reward = -17.00, steps = 1200
00:45:51 [INFO] train episode 216: reward = -12.00, steps = 1758
01:43:28 [INFO] train episode 217: reward = -9.00, steps = 2070
02:25:20 [INFO] train episode 218: reward = -15.00, steps = 1622
03:06:23 [INFO] train episode 219: reward = -14.00, steps = 1559
03:53:05 [INFO] train episode 220: reward = -13.00, steps = 1753
05:01:47 [INFO] train episode 221: reward = -3.00, steps = 2642
05:58:25 [INFO] train episode 222: reward = -11.00, steps = 2106
06:33:23 [INFO] train episode 223: reward = -17.00, steps = 1293
07:47:11 [INFO] train episode 224: reward = -5.00, steps = 2418
09:06:21 [INFO] train episode 225: reward = -2.00, steps = 2706
10:22:29 [INFO] train episode 226: reward = 5.00, steps = 2614
11:38:14 [INFO] train episode 227: reward = -3.00, steps = 2601
12:59:05 [INFO] train episode 228: reward = 3.00, steps = 2684
14:20:42 [INFO] train episode 229: reward = -3.00, steps = 2568
15:25:50 [INFO] train episode 230: reward = -10.00, steps = 2027
17:05:03 [INFO] train episode 231: reward = 3.00, steps = 3062
18:32:02 [INFO] train episode 232: reward = -2.00, steps = 2576
19:48:16 [INFO] train episode 233: reward = -6.00, steps = 2220
21:18:08 [INFO] train episode 234: reward = -1.00, steps = 2638
23:10:13 [INFO] train episode 235: reward = -2.00, steps = 2895
00:44:41 [INFO] train episode 236: reward = -8.00, steps = 2386
01:58:39 [INFO] train episode 237: reward = -10.00, steps = 1939
03:36:58 [INFO] train episode 238: reward = -2.00, steps = 2569
05:09:47 [INFO] train episode 239: reward = -5.00, steps = 2561
06:38:30 [INFO] train episode 240: reward = -4.00, steps = 2516
08:32:04 [INFO] train episode 241: reward = -2.00, steps = 3077
10:37:35 [INFO] train episode 242: reward = 4.00, steps = 2885
12:35:12 [INFO] train episode 243: reward = -4.00, steps = 2656
14:48:24 [INFO] train episode 244: reward = -2.00, steps = 2910
16:36:19 [INFO] train episode 245: reward = -7.00, steps = 2345
18:19:44 [INFO] train episode 246: reward = 13.00, steps = 2188
20:27:47 [INFO] train episode 247: reward = -2.00, steps = 2690
22:44:10 [INFO] train episode 248: reward = 4.00, steps = 2824
00:38:19 [INFO] train episode 249: reward = 15.00, steps = 2061
02:54:55 [INFO] train episode 250: reward = 1.00, steps = 2639
04:20:25 [INFO] train episode 251: reward = 20.00, steps = 1656
05:57:03 [INFO] train episode 252: reward = 16.00, steps = 1883
07:33:20 [INFO] train episode 253: reward = 17.00, steps = 1849
09:10:45 [INFO] train episode 254: reward = 17.00, steps = 1813
11:33:50 [INFO] train episode 255: reward = 8.00, steps = 2523
13:51:38 [INFO] train episode 256: reward = 1.00, steps = 2391
16:26:04 [INFO] train episode 257: reward = 1.00, steps = 2633
18:15:10 [INFO] train episode 258: reward = 18.00, steps = 1848
20:21:14 [INFO] train episode 259: reward = 13.00, steps = 2082
22:27:27 [INFO] train episode 260: reward = 16.00, steps = 1960
00:36:49 [INFO] train episode 261: reward = 16.00, steps = 2123
03:14:42 [INFO] train episode 262: reward = 3.00, steps = 2753
05:31:44 [INFO] train episode 263: reward = 6.00, steps = 2375
07:54:26 [INFO] train episode 264: reward = 8.00, steps = 2433
10:30:39 [INFO] train episode 265: reward = 2.00, steps = 2538
12:42:53 [INFO] train episode 266: reward = 14.00, steps = 2090
14:46:01 [INFO] train episode 267: reward = 16.00, steps = 1920
16:43:33 [INFO] train episode 268: reward = 17.00, steps = 1938
18:28:54 [INFO] train episode 269: reward = 17.00, steps = 1916
20:24:59 [INFO] train episode 270: reward = 15.00, steps = 2072
22:13:45 [INFO] train episode 271: reward = 15.00, steps = 1894
00:03:41 [INFO] train episode 272: reward = 17.00, steps = 1874
00:03:45 [INFO] ==== test ====
00:06:32 [INFO] test episode 0: reward = 20.00, steps = 1670
00:09:18 [INFO] test episode 1: reward = 20.00, steps = 1667
00:12:00 [INFO] test episode 2: reward = 20.00, steps = 1663
00:14:43 [INFO] test episode 3: reward = 20.00, steps = 1663
00:17:58 [INFO] test episode 4: reward = 19.00, steps = 1701
00:20:44 [INFO] test episode 5: reward = 20.00, steps = 1748
00:23:22 [INFO] test episode 6: reward = 20.00, steps = 1664
00:25:59 [INFO] test episode 7: reward = 20.00, steps = 1662
00:28:39 [INFO] test episode 8: reward = 19.00, steps = 1720
00:31:13 [INFO] test episode 9: reward = 20.00, steps = 1665
00:34:01 [INFO] test episode 10: reward = 18.00, steps = 1792
00:36:35 [INFO] test episode 11: reward = 20.00, steps = 1662
00:39:23 [INFO] test episode 12: reward = 18.00, steps = 1783
00:41:57 [INFO] test episode 13: reward = 20.00, steps = 1665
00:44:33 [INFO] test episode 14: reward = 20.00, steps = 1670
00:47:18 [INFO] test episode 15: reward = 19.00, steps = 1758
00:50:51 [INFO] test episode 16: reward = 14.00, steps = 2264
00:53:29 [INFO] test episode 17: reward = 20.00, steps = 1671
00:56:07 [INFO] test episode 18: reward = 20.00, steps = 1668
00:58:44 [INFO] test episode 19: reward = 20.00, steps = 1671
01:01:30 [INFO] test episode 20: reward = 19.00, steps = 1760
01:04:08 [INFO] test episode 21: reward = 20.00, steps = 1667
01:07:11 [INFO] test episode 22: reward = 14.00, steps = 1965
01:09:48 [INFO] test episode 23: reward = 20.00, steps = 1665
01:12:23 [INFO] test episode 24: reward = 20.00, steps = 1669
01:14:58 [INFO] test episode 25: reward = 20.00, steps = 1663
01:17:45 [INFO] test episode 26: reward = 18.00, steps = 1783
01:20:23 [INFO] test episode 27: reward = 20.00, steps = 1669
01:22:59 [INFO] test episode 28: reward = 20.00, steps = 1662
01:25:36 [INFO] test episode 29: reward = 20.00, steps = 1668
01:28:13 [INFO] test episode 30: reward = 20.00, steps = 1666
01:30:55 [INFO] test episode 31: reward = 19.00, steps = 1734
01:33:31 [INFO] test episode 32: reward = 20.00, steps = 1666
01:36:53 [INFO] test episode 33: reward = 8.00, steps = 2149
01:39:29 [INFO] test episode 34: reward = 20.00, steps = 1666
01:42:06 [INFO] test episode 35: reward = 20.00, steps = 1667
01:44:48 [INFO] test episode 36: reward = 19.00, steps = 1725
01:47:25 [INFO] test episode 37: reward = 20.00, steps = 1664
01:50:19 [INFO] test episode 38: reward = 16.00, steps = 1848
01:52:56 [INFO] test episode 39: reward = 20.00, steps = 1665
01:55:42 [INFO] test episode 40: reward = 19.00, steps = 1760
01:58:21 [INFO] test episode 41: reward = 20.00, steps = 1665
02:00:59 [INFO] test episode 42: reward = 20.00, steps = 1668
02:03:36 [INFO] test episode 43: reward = 20.00, steps = 1666
02:06:20 [INFO] test episode 44: reward = 18.00, steps = 1736
02:08:57 [INFO] test episode 45: reward = 20.00, steps = 1667
02:11:43 [INFO] test episode 46: reward = 19.00, steps = 1759
02:14:28 [INFO] test episode 47: reward = 19.00, steps = 1741
02:17:05 [INFO] test episode 48: reward = 20.00, steps = 1664
02:19:49 [INFO] test episode 49: reward = 19.00, steps = 1746
02:22:58 [INFO] test episode 50: reward = 14.00, steps = 1991
02:25:35 [INFO] test episode 51: reward = 20.00, steps = 1665
02:28:19 [INFO] test episode 52: reward = 19.00, steps = 1743
02:30:56 [INFO] test episode 53: reward = 20.00, steps = 1666
02:33:43 [INFO] test episode 54: reward = 19.00, steps = 1766
02:36:19 [INFO] test episode 55: reward = 20.00, steps = 1666
02:38:55 [INFO] test episode 56: reward = 20.00, steps = 1660
02:41:31 [INFO] test episode 57: reward = 20.00, steps = 1667
02:44:07 [INFO] test episode 58: reward = 20.00, steps = 1661
02:46:53 [INFO] test episode 59: reward = 19.00, steps = 1757
02:49:30 [INFO] test episode 60: reward = 20.00, steps = 1671
02:52:13 [INFO] test episode 61: reward = 19.00, steps = 1725
02:54:50 [INFO] test episode 62: reward = 20.00, steps = 1667
02:57:27 [INFO] test episode 63: reward = 20.00, steps = 1662
03:00:04 [INFO] test episode 64: reward = 20.00, steps = 1669
03:03:14 [INFO] test episode 65: reward = 18.00, steps = 2025
03:05:56 [INFO] test episode 66: reward = 19.00, steps = 1727
03:08:43 [INFO] test episode 67: reward = 18.00, steps = 1777
03:11:19 [INFO] test episode 68: reward = 20.00, steps = 1660
03:13:57 [INFO] test episode 69: reward = 20.00, steps = 1660
03:16:33 [INFO] test episode 70: reward = 20.00, steps = 1669
03:19:09 [INFO] test episode 71: reward = 20.00, steps = 1666
03:21:47 [INFO] test episode 72: reward = 20.00, steps = 1671
03:24:34 [INFO] test episode 73: reward = 19.00, steps = 1781
03:27:15 [INFO] test episode 74: reward = 18.00, steps = 1731
03:29:59 [INFO] test episode 75: reward = 19.00, steps = 1744
03:32:45 [INFO] test episode 76: reward = 19.00, steps = 1756
03:35:21 [INFO] test episode 77: reward = 20.00, steps = 1660
03:37:58 [INFO] test episode 78: reward = 20.00, steps = 1666
03:40:33 [INFO] test episode 79: reward = 20.00, steps = 1665
03:43:09 [INFO] test episode 80: reward = 20.00, steps = 1669
03:46:14 [INFO] test episode 81: reward = 18.00, steps = 1993
03:48:48 [INFO] test episode 82: reward = 20.00, steps = 1666
03:51:24 [INFO] test episode 83: reward = 20.00, steps = 1670
03:54:18 [INFO] test episode 84: reward = 18.00, steps = 1853
03:57:00 [INFO] test episode 85: reward = 20.00, steps = 1726
03:59:48 [INFO] test episode 86: reward = 18.00, steps = 1781
04:02:24 [INFO] test episode 87: reward = 20.00, steps = 1671
04:05:07 [INFO] test episode 88: reward = 20.00, steps = 1728
04:07:51 [INFO] test episode 89: reward = 19.00, steps = 1730
04:10:27 [INFO] test episode 90: reward = 20.00, steps = 1662
04:13:04 [INFO] test episode 91: reward = 20.00, steps = 1666
04:15:41 [INFO] test episode 92: reward = 20.00, steps = 1671
04:18:24 [INFO] test episode 93: reward = 19.00, steps = 1758
04:21:08 [INFO] test episode 94: reward = 19.00, steps = 1719
04:23:45 [INFO] test episode 95: reward = 20.00, steps = 1669
04:26:22 [INFO] test episode 96: reward = 20.00, steps = 1669
04:28:59 [INFO] test episode 97: reward = 20.00, steps = 1666
04:31:35 [INFO] test episode 98: reward = 20.00, steps = 1661
04:34:21 [INFO] test episode 99: reward = 19.00, steps = 1766
04:34:21 [INFO] average episode reward = 19.24 ± 1.65