Use Categorical DQN to Play Pong

TensorFlow version

In [1]:
%matplotlib inline

import copy
import logging
import itertools
import sys

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
from gym.wrappers.atari_preprocessing import AtariPreprocessing
from gym.wrappers.frame_stack import FrameStack
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow import losses
from tensorflow.keras import layers
from tensorflow.keras import models

# Send INFO-and-above log records to stdout so they show up in the cell
# output, prefixed with a wall-clock timestamp and the level name.
logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        datefmt='%H:%M:%S',
        format='%(asctime)s [%(levelname)s] %(message)s')

Environment

In [2]:
# Build the raw Pong environment, then wrap it with the Atari preprocessing
# wrapper and stack the 4 most recent frames into one observation.
env = gym.make('PongNoFrameskip-v4')
env = FrameStack(AtariPreprocessing(env), num_stack=4)

# Log every attribute of the wrapped environment and of its spec.
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
00:34:27 [INFO] env: <AtariPreprocessing<TimeLimit<AtariEnv<PongNoFrameskip-v4>>>>
00:34:27 [INFO] action_space: Discrete(6)
00:34:27 [INFO] observation_space: : Box([[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]], [[[255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  ...
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]]

 [[255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  ...
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]]

 [[255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  ...
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]]

 [[255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  ...
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]]], (4, 84, 84), uint8)
00:34:27 [INFO] reward_range: (-inf, inf)
00:34:27 [INFO] metadata: {'render.modes': ['human', 'rgb_array']}
00:34:27 [INFO] num_stack: 4
00:34:27 [INFO] lz4_compress: False
00:34:27 [INFO] frames: deque([], maxlen=4)
00:34:27 [INFO] id: PongNoFrameskip-v4
00:34:27 [INFO] entry_point: gym.envs.atari:AtariEnv
00:34:27 [INFO] reward_threshold: None
00:34:27 [INFO] nondeterministic: False
00:34:27 [INFO] max_episode_steps: 400000
00:34:27 [INFO] _kwargs: {'game': 'pong', 'obs_type': 'image', 'frameskip': 1}
00:34:27 [INFO] _env_name: PongNoFrameskip

Agent

In [3]:
class DQNReplayer:
    """Fixed-capacity experience replay buffer backed by a pandas DataFrame.

    Rows are written in ring-buffer order: once `capacity` transitions have
    been stored, the oldest row is overwritten next.
    """

    def __init__(self, capacity):
        """Allocate an empty buffer with room for `capacity` transitions."""
        self.capacity = capacity
        self.count = 0  # number of rows that currently hold a transition
        self.i = 0  # row that the next call to `store` writes to
        fields = ['state', 'action', 'reward', 'next_state', 'terminated']
        self.memory = pd.DataFrame(columns=fields, index=range(capacity))

    def store(self, *args):
        """Write one transition, given in column order, into the next slot."""
        transition = np.asarray(args, dtype=object)
        self.memory.loc[self.i] = transition
        self.count = min(self.capacity, self.count + 1)
        self.i = (self.i + 1) % self.capacity

    def sample(self, size):
        """Draw `size` stored transitions uniformly at random, with replacement.

        Returns a generator that yields one stacked array per column, in
        column order.
        """
        rows = np.random.choice(self.count, size=size)
        frame = self.memory
        return (np.stack(frame.loc[rows, name]) for name in frame.columns)
In [4]:
class CategoricalDQNAgent:
    """Categorical DQN agent for a discrete-action environment.

    For every action the network outputs a probability distribution over a
    fixed set of return "atoms"; the action value is the expectation of that
    distribution.  An online network (`evaluate_net`) is trained from replayed
    transitions and a slowly tracking copy (`target_net`) provides the
    bootstrap targets.
    """

    def __init__(self, env):
        """Build the networks and replay buffer for `env`'s action space."""
        self.action_n = env.action_space.n
        self.gamma = 0.99
        self.epsilon = 1. # exploration

        self.replayer = DQNReplayer(capacity=100000)

        atom_count = 51
        self.atom_min = -10.
        self.atom_max = 10.
        self.atom_difference = (self.atom_max - self.atom_min) / (atom_count - 1)
        self.atom_tensor = tf.linspace(self.atom_min, self.atom_max, atom_count)

        self.evaluate_net = self.build_net(self.action_n, atom_count)
        self.target_net = models.clone_model(self.evaluate_net)
        # clone_model builds new layers with freshly initialised weights, so
        # copy the online weights explicitly to start both networks in sync.
        self.target_net.set_weights(self.evaluate_net.get_weights())

        # Defined up front so that `step` is usable before the first `reset`.
        self.mode = None
        self.trajectory = []

    def build_net(self, action_n, atom_count):
        """Create the convolutional network.

        Input is a (4, 84, 84) frame stack; output has shape
        (action_n, atom_count) with a softmax over the last (atom) axis.
        """
        net = keras.Sequential([
                layers.Permute((2, 3, 1), input_shape=(4, 84, 84)),
                layers.Conv2D(32, kernel_size=8, strides=4, activation=nn.relu),
                layers.Conv2D(64, kernel_size=4, strides=2, activation=nn.relu),
                layers.Conv2D(64, kernel_size=3, strides=1, activation=nn.relu),
                layers.Flatten(),
                layers.Dense(512, activation=nn.relu),
                layers.Dense(action_n * atom_count),
                layers.Reshape((action_n, atom_count)), layers.Softmax()])
        optimizer = optimizers.Adam(0.0001)
        # `learn` runs its own gradient step with a cross-entropy loss; compile
        # is only used to attach the optimizer, the mse loss is never evaluated
        # by this class.
        net.compile(loss=losses.mse, optimizer=optimizer)
        return net

    def reset(self, mode=None):
        """Start a new episode; `mode='train'` enables exploration and learning."""
        self.mode = mode
        if mode == 'train':
            self.trajectory = []

    def _greedy_action(self, observation):
        """Return the action with the highest expected return for `observation`."""
        state_tensor = tf.convert_to_tensor(np.array(observation)[np.newaxis],
                dtype=tf.float32)
        prob_tensor = self.evaluate_net(state_tensor)
        # Expected return per action: sum over atoms of probability * atom value.
        q_tensor = tf.reduce_sum(prob_tensor * self.atom_tensor, axis=2)
        action_tensor = tf.math.argmax(q_tensor, axis=1)
        return action_tensor.numpy()[0]

    def _select_action_rows(self, prob_tensor, action_tensor):
        """Pick, for each batch row, the atom distribution of the given action.

        `prob_tensor` is (batch, action, atom) and `action_tensor` is an int64
        vector of length `batch`; the result is (batch, atom).
        """
        batch_size = prob_tensor.shape[0]
        indices = tf.stack([tf.range(batch_size, dtype=tf.int64),
                action_tensor], axis=1)
        return tf.gather_nd(prob_tensor, indices)

    def step(self, observation, reward, terminated):
        """Choose an action and, in train mode, record the transition and learn.

        `reward` and `terminated` describe the outcome of the previous action
        that led to `observation`.
        """
        training = self.mode == 'train'
        # Epsilon-greedy in train mode; the network is only evaluated when the
        # greedy action is actually needed.
        if training and np.random.rand() < self.epsilon:
            action = np.random.randint(0, self.action_n)
        else:
            action = self._greedy_action(observation)

        if training:
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # The last two recorded steps form one complete transition.
                state, _, _, act, next_state, step_reward, step_terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, step_reward, next_state,
                        step_terminated)
                # Only the latest step is needed to build the next transition.
                del self.trajectory[:-4]
            # NOTE(review): `count` saturates at the buffer capacity (100000,
            # a multiple of 10), so once the buffer is full this condition
            # holds on every step and learning runs every step rather than
            # every 10th stored transition. Behaviour kept unchanged; confirm
            # the intended update frequency.
            if self.replayer.count >= 1024 and self.replayer.count % 10 == 0:
                self.learn()
        return action

    def close(self):
        """Nothing to release; present so callers can always call it."""
        pass

    def update_net(self, target_net, evaluate_net, learning_rate=0.005):
        """Move `target_net` a fraction `learning_rate` towards `evaluate_net`.

        Each target weight becomes
        (1 - learning_rate) * target + learning_rate * evaluate.
        """
        average_weights = [(1. - learning_rate) * t + learning_rate * e for t, e
                in zip(target_net.get_weights(), evaluate_net.get_weights())]
        target_net.set_weights(average_weights)

    def learn(self):
        """Run one gradient step on a replayed batch and update the target net."""
        # replay
        batch_size = 32
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(batch_size)
        state_tensor = tf.convert_to_tensor(states, dtype=tf.float32)
        action_tensor = tf.convert_to_tensor(actions.astype(np.int64))
        reward_tensor = tf.convert_to_tensor(rewards[:, np.newaxis],
                dtype=tf.float32)
        terminated_tensor = tf.convert_to_tensor(terminateds[:, np.newaxis],
                dtype=tf.float32)
        next_state_tensor = tf.convert_to_tensor(next_states, dtype=tf.float32)

        # compute target: distribution of the greedy next action under the
        # target network
        next_prob_tensor = self.target_net(next_state_tensor)
        next_q_tensor = tf.reduce_sum(next_prob_tensor * self.atom_tensor,
                axis=2)
        next_action_tensor = tf.math.argmax(next_q_tensor, axis=1)
        next_dist_tensor = self._select_action_rows(next_prob_tensor,
                next_action_tensor)
        next_dist_tensor = next_dist_tensor[:, np.newaxis, :]  # (batch, 1, atom)

        # project: shift/scale every atom by the Bellman update, clip it to the
        # support, and spread its probability linearly over the neighbouring
        # atoms of the fixed support
        target_tensor = reward_tensor + self.gamma * tf.reshape(
                self.atom_tensor, (1, -1)) * (1. - terminated_tensor)  # broadcast
        clipped_target_tensor = tf.clip_by_value(target_tensor,
                self.atom_min, self.atom_max)
        projection_tensor = tf.clip_by_value(1. - tf.math.abs(
                clipped_target_tensor[:, np.newaxis, ...]
                - tf.reshape(self.atom_tensor, shape=(1, -1, 1)))
                / self.atom_difference, 0, 1)
        projected_tensor = tf.reduce_sum(projection_tensor * next_dist_tensor,
                axis=-1)

        with tf.GradientTape() as tape:
            all_q_prob_tensor = self.evaluate_net(state_tensor)
            q_prob_tensor = self._select_action_rows(all_q_prob_tensor,
                    action_tensor)

            # Cross-entropy per sample (sum over atoms), then mean over batch.
            cross_entropy_tensor = -tf.reduce_sum(
                    tf.math.xlogy(projected_tensor, q_prob_tensor + 1e-8),
                    axis=-1)
            loss_tensor = tf.reduce_mean(cross_entropy_tensor)
        variables = self.evaluate_net.trainable_variables
        grads = tape.gradient(loss_tensor, variables)
        self.evaluate_net.optimizer.apply_gradients(zip(grads, variables))

        self.update_net(self.target_net, self.evaluate_net)

        # Linearly anneal exploration down to a floor of 0.05.
        self.epsilon = max(self.epsilon - 1e-5, 0.05)


# Create the agent for the environment built above; its constructor reads the
# number of actions from env.action_space.n.
agent = CategoricalDQNAgent(env)

Train & Test

In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of `agent` acting in `env`.

    The agent is consulted once more after the episode has ended, so that it
    sees the final observation, reward and termination flag.

    Returns a tuple `(episode_reward, elapsed_steps)`: the summed reward and
    the number of environment steps taken.
    """
    observation, _ = env.reset(seed=seed)
    agent.reset(mode=mode)

    reward = 0.
    terminated = False
    truncated = False
    episode_reward = 0.
    for elapsed_steps in itertools.count():
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward

    agent.close()
    return episode_reward, elapsed_steps


# Training phase: keep playing training episodes until the mean reward of the
# most recent (up to) 5 episodes exceeds 16, then plot the reward curve.
logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    episode_reward, elapsed_steps = play_episode(env, agent, mode='train')
    episode_rewards += [episode_reward]
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-5:]) > 16.:
        break
    episode += 1
plt.plot(episode_rewards)


# Test phase: play 100 episodes with the default mode (no 'train' flag) and
# report the mean and standard deviation of the episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    episode_rewards += [episode_reward]
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
00:34:30 [INFO] ==== train ====
00:34:44 [INFO] train episode 0: reward = -20.00, steps = 935
00:35:13 [INFO] train episode 1: reward = -19.00, steps = 1009
00:35:36 [INFO] train episode 2: reward = -21.00, steps = 757
00:36:04 [INFO] train episode 3: reward = -19.00, steps = 936
00:36:29 [INFO] train episode 4: reward = -21.00, steps = 851
00:36:56 [INFO] train episode 5: reward = -21.00, steps = 871
00:37:23 [INFO] train episode 6: reward = -20.00, steps = 896
00:37:55 [INFO] train episode 7: reward = -21.00, steps = 1064
00:38:21 [INFO] train episode 8: reward = -21.00, steps = 861
00:38:50 [INFO] train episode 9: reward = -19.00, steps = 961
00:39:16 [INFO] train episode 10: reward = -21.00, steps = 881
00:39:42 [INFO] train episode 11: reward = -21.00, steps = 853
00:40:12 [INFO] train episode 12: reward = -21.00, steps = 985
00:40:40 [INFO] train episode 13: reward = -19.00, steps = 933
00:41:06 [INFO] train episode 14: reward = -20.00, steps = 843
00:41:29 [INFO] train episode 15: reward = -21.00, steps = 762
00:41:57 [INFO] train episode 16: reward = -20.00, steps = 926
00:42:29 [INFO] train episode 17: reward = -20.00, steps = 1078
00:42:59 [INFO] train episode 18: reward = -21.00, steps = 971
00:43:37 [INFO] train episode 19: reward = -18.00, steps = 1252
00:44:02 [INFO] train episode 20: reward = -21.00, steps = 820
00:44:30 [INFO] train episode 21: reward = -21.00, steps = 926
00:44:58 [INFO] train episode 22: reward = -20.00, steps = 942
00:45:23 [INFO] train episode 23: reward = -21.00, steps = 824
00:45:54 [INFO] train episode 24: reward = -20.00, steps = 1013
00:46:20 [INFO] train episode 25: reward = -21.00, steps = 845
00:46:48 [INFO] train episode 26: reward = -20.00, steps = 947
00:47:16 [INFO] train episode 27: reward = -20.00, steps = 926
00:47:48 [INFO] train episode 28: reward = -20.00, steps = 1016
00:48:17 [INFO] train episode 29: reward = -21.00, steps = 941
00:48:50 [INFO] train episode 30: reward = -19.00, steps = 1108
00:49:16 [INFO] train episode 31: reward = -21.00, steps = 837
00:49:43 [INFO] train episode 32: reward = -20.00, steps = 883
00:50:07 [INFO] train episode 33: reward = -21.00, steps = 779
00:50:32 [INFO] train episode 34: reward = -20.00, steps = 836
00:51:01 [INFO] train episode 35: reward = -20.00, steps = 914
00:51:28 [INFO] train episode 36: reward = -21.00, steps = 908
00:51:58 [INFO] train episode 37: reward = -20.00, steps = 961
00:52:23 [INFO] train episode 38: reward = -20.00, steps = 837
00:52:54 [INFO] train episode 39: reward = -19.00, steps = 1008
00:53:21 [INFO] train episode 40: reward = -21.00, steps = 882
00:53:44 [INFO] train episode 41: reward = -21.00, steps = 760
00:54:09 [INFO] train episode 42: reward = -21.00, steps = 790
00:54:35 [INFO] train episode 43: reward = -21.00, steps = 859
00:54:58 [INFO] train episode 44: reward = -21.00, steps = 763
00:55:22 [INFO] train episode 45: reward = -21.00, steps = 776
00:55:48 [INFO] train episode 46: reward = -21.00, steps = 864
00:56:14 [INFO] train episode 47: reward = -21.00, steps = 851
00:56:47 [INFO] train episode 48: reward = -20.00, steps = 1077
00:57:17 [INFO] train episode 49: reward = -21.00, steps = 966
00:57:44 [INFO] train episode 50: reward = -20.00, steps = 895
00:58:13 [INFO] train episode 51: reward = -20.00, steps = 937
00:58:38 [INFO] train episode 52: reward = -21.00, steps = 821
00:59:08 [INFO] train episode 53: reward = -21.00, steps = 977
00:59:36 [INFO] train episode 54: reward = -21.00, steps = 909
01:00:07 [INFO] train episode 55: reward = -20.00, steps = 975
01:00:35 [INFO] train episode 56: reward = -20.00, steps = 899
01:01:01 [INFO] train episode 57: reward = -21.00, steps = 851
01:01:35 [INFO] train episode 58: reward = -19.00, steps = 1099
01:02:02 [INFO] train episode 59: reward = -21.00, steps = 866
01:02:32 [INFO] train episode 60: reward = -20.00, steps = 971
01:03:01 [INFO] train episode 61: reward = -20.00, steps = 928
01:03:35 [INFO] train episode 62: reward = -20.00, steps = 1085
01:04:04 [INFO] train episode 63: reward = -21.00, steps = 916
01:04:30 [INFO] train episode 64: reward = -21.00, steps = 861
01:05:01 [INFO] train episode 65: reward = -20.00, steps = 984
01:05:32 [INFO] train episode 66: reward = -21.00, steps = 972
01:06:04 [INFO] train episode 67: reward = -19.00, steps = 1034
01:06:33 [INFO] train episode 68: reward = -20.00, steps = 897
01:07:00 [INFO] train episode 69: reward = -20.00, steps = 866
01:07:29 [INFO] train episode 70: reward = -21.00, steps = 940
01:07:57 [INFO] train episode 71: reward = -20.00, steps = 863
01:08:27 [INFO] train episode 72: reward = -19.00, steps = 965
01:08:54 [INFO] train episode 73: reward = -20.00, steps = 855
01:09:24 [INFO] train episode 74: reward = -20.00, steps = 919
01:09:58 [INFO] train episode 75: reward = -19.00, steps = 1063
01:10:36 [INFO] train episode 76: reward = -18.00, steps = 1188
01:11:02 [INFO] train episode 77: reward = -21.00, steps = 824
01:11:33 [INFO] train episode 78: reward = -19.00, steps = 1002
01:12:04 [INFO] train episode 79: reward = -20.00, steps = 944
01:12:36 [INFO] train episode 80: reward = -19.00, steps = 1004
01:13:03 [INFO] train episode 81: reward = -21.00, steps = 854
01:13:36 [INFO] train episode 82: reward = -19.00, steps = 1012
01:14:03 [INFO] train episode 83: reward = -21.00, steps = 821
01:14:31 [INFO] train episode 84: reward = -20.00, steps = 893
01:15:04 [INFO] train episode 85: reward = -20.00, steps = 1023
01:15:32 [INFO] train episode 86: reward = -21.00, steps = 868
01:16:05 [INFO] train episode 87: reward = -21.00, steps = 1005
01:16:41 [INFO] train episode 88: reward = -19.00, steps = 1116
01:17:13 [INFO] train episode 89: reward = -19.00, steps = 998
01:17:48 [INFO] train episode 90: reward = -19.00, steps = 1078
01:18:14 [INFO] train episode 91: reward = -21.00, steps = 805
01:18:43 [INFO] train episode 92: reward = -20.00, steps = 863
01:19:07 [INFO] train episode 93: reward = -21.00, steps = 764
01:19:33 [INFO] train episode 94: reward = -21.00, steps = 809
01:19:58 [INFO] train episode 95: reward = -21.00, steps = 760
01:20:27 [INFO] train episode 96: reward = -21.00, steps = 909
01:20:55 [INFO] train episode 97: reward = -21.00, steps = 877
01:21:21 [INFO] train episode 98: reward = -21.00, steps = 789
01:21:56 [INFO] train episode 99: reward = -20.00, steps = 1086
01:22:21 [INFO] train episode 100: reward = -21.00, steps = 758
01:22:45 [INFO] train episode 101: reward = -21.00, steps = 763
01:23:17 [INFO] train episode 102: reward = -19.00, steps = 980
01:23:52 [INFO] train episode 103: reward = -19.00, steps = 1093
01:24:21 [INFO] train episode 104: reward = -21.00, steps = 879
01:24:53 [INFO] train episode 105: reward = -20.00, steps = 1002
01:25:22 [INFO] train episode 106: reward = -21.00, steps = 905
01:25:51 [INFO] train episode 107: reward = -21.00, steps = 885
01:26:44 [INFO] train episode 108: reward = -21.00, steps = 810
01:29:29 [INFO] train episode 109: reward = -21.00, steps = 970
01:31:38 [INFO] train episode 110: reward = -21.00, steps = 760
01:33:58 [INFO] train episode 111: reward = -21.00, steps = 818
01:36:46 [INFO] train episode 112: reward = -20.00, steps = 986
01:39:33 [INFO] train episode 113: reward = -20.00, steps = 985
01:42:11 [INFO] train episode 114: reward = -19.00, steps = 929
01:44:52 [INFO] train episode 115: reward = -20.00, steps = 945
01:47:35 [INFO] train episode 116: reward = -20.00, steps = 960
01:50:10 [INFO] train episode 117: reward = -21.00, steps = 910
01:52:47 [INFO] train episode 118: reward = -20.00, steps = 926
01:55:17 [INFO] train episode 119: reward = -21.00, steps = 879
01:58:20 [INFO] train episode 120: reward = -18.00, steps = 1082
02:00:42 [INFO] train episode 121: reward = -20.00, steps = 841
02:03:26 [INFO] train episode 122: reward = -20.00, steps = 963
02:05:53 [INFO] train episode 123: reward = -20.00, steps = 864
02:08:31 [INFO] train episode 124: reward = -19.00, steps = 928
02:10:43 [INFO] train episode 125: reward = -21.00, steps = 777
02:13:21 [INFO] train episode 126: reward = -21.00, steps = 922
02:15:48 [INFO] train episode 127: reward = -20.00, steps = 869
02:19:04 [INFO] train episode 128: reward = -18.00, steps = 1156
02:21:51 [INFO] train episode 129: reward = -20.00, steps = 977
02:24:13 [INFO] train episode 130: reward = -20.00, steps = 837
02:27:12 [INFO] train episode 131: reward = -20.00, steps = 1052
02:29:44 [INFO] train episode 132: reward = -20.00, steps = 894
02:32:46 [INFO] train episode 133: reward = -19.00, steps = 1066
02:35:16 [INFO] train episode 134: reward = -20.00, steps = 880
02:37:46 [INFO] train episode 135: reward = -21.00, steps = 879
02:40:15 [INFO] train episode 136: reward = -20.00, steps = 872
02:42:38 [INFO] train episode 137: reward = -21.00, steps = 842
02:45:30 [INFO] train episode 138: reward = -21.00, steps = 1016
02:48:28 [INFO] train episode 139: reward = -20.00, steps = 1047
02:51:42 [INFO] train episode 140: reward = -19.00, steps = 1145
02:55:21 [INFO] train episode 141: reward = -17.00, steps = 1288
02:58:09 [INFO] train episode 142: reward = -20.00, steps = 976
03:01:57 [INFO] train episode 143: reward = -18.00, steps = 1280
03:04:55 [INFO] train episode 144: reward = -21.00, steps = 1030
03:08:15 [INFO] train episode 145: reward = -18.00, steps = 1154
03:11:54 [INFO] train episode 146: reward = -20.00, steps = 1286
03:16:20 [INFO] train episode 147: reward = -18.00, steps = 1566
03:20:42 [INFO] train episode 148: reward = -18.00, steps = 1542
03:24:44 [INFO] train episode 149: reward = -18.00, steps = 1425
03:28:56 [INFO] train episode 150: reward = -18.00, steps = 1491
03:33:11 [INFO] train episode 151: reward = -20.00, steps = 1498
03:37:51 [INFO] train episode 152: reward = -14.00, steps = 1637
03:42:53 [INFO] train episode 153: reward = -15.00, steps = 1776
03:47:11 [INFO] train episode 154: reward = -16.00, steps = 1514
03:51:04 [INFO] train episode 155: reward = -19.00, steps = 1375
03:55:32 [INFO] train episode 156: reward = -15.00, steps = 1574
04:00:45 [INFO] train episode 157: reward = -15.00, steps = 1834
04:05:13 [INFO] train episode 158: reward = -17.00, steps = 1571
04:09:36 [INFO] train episode 159: reward = -19.00, steps = 1540
04:14:57 [INFO] train episode 160: reward = -16.00, steps = 1879
04:21:00 [INFO] train episode 161: reward = -10.00, steps = 2106
04:25:35 [INFO] train episode 162: reward = -15.00, steps = 1603
04:31:47 [INFO] train episode 163: reward = -12.00, steps = 2167
04:38:45 [INFO] train episode 164: reward = -11.00, steps = 2456
04:44:03 [INFO] train episode 165: reward = -13.00, steps = 1865
04:49:12 [INFO] train episode 166: reward = -16.00, steps = 1805
04:55:34 [INFO] train episode 167: reward = -13.00, steps = 2236
05:01:06 [INFO] train episode 168: reward = -12.00, steps = 1935
05:06:59 [INFO] train episode 169: reward = -9.00, steps = 2055
05:13:18 [INFO] train episode 170: reward = -12.00, steps = 2208
05:20:44 [INFO] train episode 171: reward = -5.00, steps = 2551
05:27:59 [INFO] train episode 172: reward = -2.00, steps = 2969
05:35:16 [INFO] train episode 173: reward = -3.00, steps = 3039
05:40:48 [INFO] train episode 174: reward = -7.00, steps = 2311
05:47:56 [INFO] train episode 175: reward = -4.00, steps = 2988
05:55:38 [INFO] train episode 176: reward = 5.00, steps = 3198
06:04:04 [INFO] train episode 177: reward = -1.00, steps = 3519
06:11:55 [INFO] train episode 178: reward = 3.00, steps = 3294
06:19:31 [INFO] train episode 179: reward = -6.00, steps = 3197
06:27:16 [INFO] train episode 180: reward = 1.00, steps = 3234
06:35:17 [INFO] train episode 181: reward = 4.00, steps = 3359
06:41:55 [INFO] train episode 182: reward = -6.00, steps = 2773
06:50:12 [INFO] train episode 183: reward = -2.00, steps = 3472
06:57:08 [INFO] train episode 184: reward = 4.00, steps = 2902
07:04:13 [INFO] train episode 185: reward = 3.00, steps = 2964
07:11:53 [INFO] train episode 186: reward = -4.00, steps = 3199
07:19:21 [INFO] train episode 187: reward = 2.00, steps = 3125
07:26:49 [INFO] train episode 188: reward = 4.00, steps = 3108
07:35:17 [INFO] train episode 189: reward = -1.00, steps = 3542
07:42:10 [INFO] train episode 190: reward = 5.00, steps = 2921
07:49:08 [INFO] train episode 191: reward = -4.00, steps = 3067
07:56:39 [INFO] train episode 192: reward = -2.00, steps = 3286
08:03:39 [INFO] train episode 193: reward = 3.00, steps = 3075
08:11:01 [INFO] train episode 194: reward = -4.00, steps = 3235
08:17:34 [INFO] train episode 195: reward = 4.00, steps = 2892
08:24:35 [INFO] train episode 196: reward = -4.00, steps = 3042
08:31:19 [INFO] train episode 197: reward = 6.00, steps = 2968
08:37:32 [INFO] train episode 198: reward = 9.00, steps = 2749
08:43:31 [INFO] train episode 199: reward = 3.00, steps = 2678
08:49:35 [INFO] train episode 200: reward = 1.00, steps = 3288
08:54:54 [INFO] train episode 201: reward = 6.00, steps = 2881
09:00:16 [INFO] train episode 202: reward = -3.00, steps = 2914
09:05:58 [INFO] train episode 203: reward = -1.00, steps = 3089
09:10:46 [INFO] train episode 204: reward = -3.00, steps = 2600
09:16:45 [INFO] train episode 205: reward = 1.00, steps = 3255
09:22:47 [INFO] train episode 206: reward = 3.00, steps = 3276
09:29:10 [INFO] train episode 207: reward = 1.00, steps = 3466
09:35:06 [INFO] train episode 208: reward = 2.00, steps = 3225
09:41:24 [INFO] train episode 209: reward = -1.00, steps = 3416
09:47:00 [INFO] train episode 210: reward = 2.00, steps = 3187
09:52:06 [INFO] train episode 211: reward = 2.00, steps = 2934
09:57:26 [INFO] train episode 212: reward = 7.00, steps = 3070
10:02:24 [INFO] train episode 213: reward = 5.00, steps = 2857
10:08:12 [INFO] train episode 214: reward = 3.00, steps = 3335
10:13:00 [INFO] train episode 215: reward = 7.00, steps = 2773
10:18:11 [INFO] train episode 216: reward = 6.00, steps = 2974
10:22:17 [INFO] train episode 217: reward = -9.00, steps = 2357
10:27:54 [INFO] train episode 218: reward = -2.00, steps = 3226
10:33:04 [INFO] train episode 219: reward = 5.00, steps = 2969
10:38:11 [INFO] train episode 220: reward = 3.00, steps = 2947
10:43:17 [INFO] train episode 221: reward = 2.00, steps = 2948
10:48:17 [INFO] train episode 222: reward = -2.00, steps = 2872
10:53:16 [INFO] train episode 223: reward = 3.00, steps = 2871
10:58:36 [INFO] train episode 224: reward = 4.00, steps = 3073
11:03:49 [INFO] train episode 225: reward = 1.00, steps = 2999
11:09:39 [INFO] train episode 226: reward = -1.00, steps = 3347
11:15:26 [INFO] train episode 227: reward = 6.00, steps = 3338
11:20:15 [INFO] train episode 228: reward = 7.00, steps = 2773
11:25:38 [INFO] train episode 229: reward = -4.00, steps = 3076
11:31:10 [INFO] train episode 230: reward = 5.00, steps = 3141
11:36:51 [INFO] train episode 231: reward = 6.00, steps = 3221
11:42:11 [INFO] train episode 232: reward = 2.00, steps = 3035
11:46:39 [INFO] train episode 233: reward = 10.00, steps = 2529
11:52:15 [INFO] train episode 234: reward = -1.00, steps = 3180
11:57:01 [INFO] train episode 235: reward = 6.00, steps = 2684
12:02:29 [INFO] train episode 236: reward = 6.00, steps = 3069
12:07:39 [INFO] train episode 237: reward = 3.00, steps = 2882
12:12:58 [INFO] train episode 238: reward = 3.00, steps = 2967
12:17:40 [INFO] train episode 239: reward = 8.00, steps = 2625
12:23:52 [INFO] train episode 240: reward = -1.00, steps = 3477
12:28:02 [INFO] train episode 241: reward = 13.00, steps = 2336
12:32:50 [INFO] train episode 242: reward = -3.00, steps = 2662
12:38:04 [INFO] train episode 243: reward = 3.00, steps = 2933
12:43:42 [INFO] train episode 244: reward = -2.00, steps = 3145
12:48:24 [INFO] train episode 245: reward = -3.00, steps = 2621
12:53:40 [INFO] train episode 246: reward = -2.00, steps = 2949
12:58:31 [INFO] train episode 247: reward = -3.00, steps = 2616
13:04:16 [INFO] train episode 248: reward = -2.00, steps = 3147
13:09:44 [INFO] train episode 249: reward = -1.00, steps = 2840
13:15:28 [INFO] train episode 250: reward = -1.00, steps = 3248
13:21:02 [INFO] train episode 251: reward = -2.00, steps = 3161
13:25:26 [INFO] train episode 252: reward = 9.00, steps = 2510
13:30:54 [INFO] train episode 253: reward = -2.00, steps = 3108
13:35:38 [INFO] train episode 254: reward = -3.00, steps = 2707
13:39:49 [INFO] train episode 255: reward = 12.00, steps = 2385
13:44:56 [INFO] train episode 256: reward = -3.00, steps = 2907
13:49:54 [INFO] train episode 257: reward = -3.00, steps = 2834
13:54:17 [INFO] train episode 258: reward = -4.00, steps = 2516
13:58:31 [INFO] train episode 259: reward = 10.00, steps = 2428
14:03:37 [INFO] train episode 260: reward = 1.00, steps = 2923
14:08:36 [INFO] train episode 261: reward = -1.00, steps = 2843
14:13:58 [INFO] train episode 262: reward = 1.00, steps = 3050
14:18:14 [INFO] train episode 263: reward = 12.00, steps = 2408
14:23:58 [INFO] train episode 264: reward = -1.00, steps = 3250
14:28:25 [INFO] train episode 265: reward = 13.00, steps = 2515
14:33:35 [INFO] train episode 266: reward = -3.00, steps = 2913
14:38:08 [INFO] train episode 267: reward = 7.00, steps = 2588
14:42:59 [INFO] train episode 268: reward = -2.00, steps = 2731
14:47:51 [INFO] train episode 269: reward = 8.00, steps = 2760
14:52:47 [INFO] train episode 270: reward = 9.00, steps = 2776
14:57:11 [INFO] train episode 271: reward = 10.00, steps = 2482
15:01:51 [INFO] train episode 272: reward = 11.00, steps = 2630
15:06:07 [INFO] train episode 273: reward = 13.00, steps = 2401
15:09:28 [INFO] train episode 274: reward = 20.00, steps = 1895
15:13:20 [INFO] train episode 275: reward = 15.00, steps = 2175
15:17:34 [INFO] train episode 276: reward = 13.00, steps = 2385
15:23:16 [INFO] train episode 277: reward = 2.00, steps = 3196
15:27:50 [INFO] train episode 278: reward = 9.00, steps = 2578
15:31:48 [INFO] train episode 279: reward = 16.00, steps = 2231
15:37:32 [INFO] train episode 280: reward = 3.00, steps = 3229
15:42:19 [INFO] train episode 281: reward = 7.00, steps = 2699
15:46:41 [INFO] train episode 282: reward = 13.00, steps = 2472
15:51:07 [INFO] train episode 283: reward = 10.00, steps = 2508
15:55:21 [INFO] train episode 284: reward = 13.00, steps = 2377
16:00:24 [INFO] train episode 285: reward = 2.00, steps = 2854
16:04:58 [INFO] train episode 286: reward = 10.00, steps = 2572
16:08:22 [INFO] train episode 287: reward = 18.00, steps = 1916
16:12:48 [INFO] train episode 288: reward = 15.00, steps = 2499
16:16:56 [INFO] train episode 289: reward = 16.00, steps = 2341
16:22:02 [INFO] train episode 290: reward = 5.00, steps = 2898
16:26:20 [INFO] train episode 291: reward = 13.00, steps = 2428
16:31:26 [INFO] train episode 292: reward = 7.00, steps = 2897
16:35:09 [INFO] train episode 293: reward = 15.00, steps = 2125
16:40:35 [INFO] train episode 294: reward = 3.00, steps = 3075
16:44:02 [INFO] train episode 295: reward = 18.00, steps = 1967
16:47:44 [INFO] train episode 296: reward = 16.00, steps = 2104
16:52:43 [INFO] train episode 297: reward = 8.00, steps = 2826
16:57:24 [INFO] train episode 298: reward = 11.00, steps = 2648
17:02:50 [INFO] train episode 299: reward = 5.00, steps = 3076
17:06:57 [INFO] train episode 300: reward = 12.00, steps = 2330
17:11:09 [INFO] train episode 301: reward = 11.00, steps = 2371
17:15:14 [INFO] train episode 302: reward = 13.00, steps = 2305
17:18:52 [INFO] train episode 303: reward = 16.00, steps = 2074
17:23:24 [INFO] train episode 304: reward = 10.00, steps = 2562
17:27:43 [INFO] train episode 305: reward = 12.00, steps = 2433
17:32:14 [INFO] train episode 306: reward = 11.00, steps = 2549
17:35:43 [INFO] train episode 307: reward = 16.00, steps = 1960
17:40:22 [INFO] train episode 308: reward = 9.00, steps = 2611
17:43:57 [INFO] train episode 309: reward = 17.00, steps = 2011
17:48:16 [INFO] train episode 310: reward = 11.00, steps = 2432
17:52:09 [INFO] train episode 311: reward = 15.00, steps = 2176
17:56:19 [INFO] train episode 312: reward = 14.00, steps = 2336
18:00:17 [INFO] train episode 313: reward = 15.00, steps = 2235
18:03:44 [INFO] train episode 314: reward = 18.00, steps = 1957
18:08:16 [INFO] train episode 315: reward = 13.00, steps = 2566
18:13:14 [INFO] train episode 316: reward = 8.00, steps = 2809
18:17:21 [INFO] train episode 317: reward = 13.00, steps = 2340
18:20:56 [INFO] train episode 318: reward = 18.00, steps = 1942
18:24:31 [INFO] train episode 319: reward = 16.00, steps = 1978
18:28:11 [INFO] train episode 320: reward = 16.00, steps = 2062
18:32:50 [INFO] train episode 321: reward = 8.00, steps = 2604
18:37:28 [INFO] train episode 322: reward = 10.00, steps = 2592
18:40:58 [INFO] train episode 323: reward = 18.00, steps = 1943
18:45:18 [INFO] train episode 324: reward = 12.00, steps = 2420
18:48:35 [INFO] train episode 325: reward = 19.00, steps = 1825
18:51:46 [INFO] train episode 326: reward = 19.00, steps = 1761
18:55:08 [INFO] train episode 327: reward = 16.00, steps = 1870
18:55:09 [INFO] ==== test ====
18:55:28 [INFO] test episode 0: reward = 10.00, steps = 2217
18:55:50 [INFO] test episode 1: reward = 10.00, steps = 2210
18:56:14 [INFO] test episode 2: reward = 10.00, steps = 2214
18:56:34 [INFO] test episode 3: reward = 18.00, steps = 1914
18:56:52 [INFO] test episode 4: reward = 19.00, steps = 1785
18:57:11 [INFO] test episode 5: reward = 19.00, steps = 1797
18:57:29 [INFO] test episode 6: reward = 19.00, steps = 1781
18:57:47 [INFO] test episode 7: reward = 19.00, steps = 1788
18:58:05 [INFO] test episode 8: reward = 19.00, steps = 1784
18:58:27 [INFO] test episode 9: reward = 10.00, steps = 2216
18:58:45 [INFO] test episode 10: reward = 19.00, steps = 1798
18:59:02 [INFO] test episode 11: reward = 18.00, steps = 1916
18:59:19 [INFO] test episode 12: reward = 18.00, steps = 1913
18:59:35 [INFO] test episode 13: reward = 19.00, steps = 1798
18:59:51 [INFO] test episode 14: reward = 19.00, steps = 1797
19:00:07 [INFO] test episode 15: reward = 19.00, steps = 1787
19:00:26 [INFO] test episode 16: reward = 10.00, steps = 2215
19:00:46 [INFO] test episode 17: reward = 10.00, steps = 2215
19:01:02 [INFO] test episode 18: reward = 19.00, steps = 1799
19:01:19 [INFO] test episode 19: reward = 18.00, steps = 1910
19:01:35 [INFO] test episode 20: reward = 19.00, steps = 1793
19:01:52 [INFO] test episode 21: reward = 18.00, steps = 1911
19:02:08 [INFO] test episode 22: reward = 19.00, steps = 1782
19:02:23 [INFO] test episode 23: reward = 19.00, steps = 1796
19:02:43 [INFO] test episode 24: reward = 10.00, steps = 2216
19:02:59 [INFO] test episode 25: reward = 19.00, steps = 1785
19:03:15 [INFO] test episode 26: reward = 19.00, steps = 1781
19:03:32 [INFO] test episode 27: reward = 18.00, steps = 1916
19:03:47 [INFO] test episode 28: reward = 19.00, steps = 1784
19:04:03 [INFO] test episode 29: reward = 19.00, steps = 1787
19:04:19 [INFO] test episode 30: reward = 19.00, steps = 1786
19:04:35 [INFO] test episode 31: reward = 19.00, steps = 1795
19:04:51 [INFO] test episode 32: reward = 19.00, steps = 1798
19:05:08 [INFO] test episode 33: reward = 18.00, steps = 1912
19:05:24 [INFO] test episode 34: reward = 19.00, steps = 1785
19:05:40 [INFO] test episode 35: reward = 19.00, steps = 1795
19:05:56 [INFO] test episode 36: reward = 19.00, steps = 1796
19:06:13 [INFO] test episode 37: reward = 18.00, steps = 1911
19:06:29 [INFO] test episode 38: reward = 19.00, steps = 1793
19:06:45 [INFO] test episode 39: reward = 19.00, steps = 1798
19:07:02 [INFO] test episode 40: reward = 18.00, steps = 1916
19:07:18 [INFO] test episode 41: reward = 19.00, steps = 1788
19:07:37 [INFO] test episode 42: reward = 10.00, steps = 2216
19:07:55 [INFO] test episode 43: reward = 18.00, steps = 1916
19:08:10 [INFO] test episode 44: reward = 19.00, steps = 1783
19:08:30 [INFO] test episode 45: reward = 10.00, steps = 2215
19:08:50 [INFO] test episode 46: reward = 10.00, steps = 2216
19:09:06 [INFO] test episode 47: reward = 19.00, steps = 1799
19:09:26 [INFO] test episode 48: reward = 10.00, steps = 2213
19:09:41 [INFO] test episode 49: reward = 19.00, steps = 1785
19:09:57 [INFO] test episode 50: reward = 19.00, steps = 1796
19:10:13 [INFO] test episode 51: reward = 19.00, steps = 1786
19:10:29 [INFO] test episode 52: reward = 19.00, steps = 1786
19:10:49 [INFO] test episode 53: reward = 10.00, steps = 2213
19:11:06 [INFO] test episode 54: reward = 18.00, steps = 1910
19:11:22 [INFO] test episode 55: reward = 19.00, steps = 1787
19:11:40 [INFO] test episode 56: reward = 18.00, steps = 1915
19:11:55 [INFO] test episode 57: reward = 19.00, steps = 1781
19:12:15 [INFO] test episode 58: reward = 10.00, steps = 2215
19:12:31 [INFO] test episode 59: reward = 19.00, steps = 1781
19:12:48 [INFO] test episode 60: reward = 18.00, steps = 1916
19:13:08 [INFO] test episode 61: reward = 10.00, steps = 2213
19:13:24 [INFO] test episode 62: reward = 19.00, steps = 1796
19:13:43 [INFO] test episode 63: reward = 10.00, steps = 2212
19:13:59 [INFO] test episode 64: reward = 19.00, steps = 1785
19:14:15 [INFO] test episode 65: reward = 19.00, steps = 1784
19:14:35 [INFO] test episode 66: reward = 10.00, steps = 2217
19:14:51 [INFO] test episode 67: reward = 19.00, steps = 1798
19:15:07 [INFO] test episode 68: reward = 19.00, steps = 1798
19:15:23 [INFO] test episode 69: reward = 19.00, steps = 1784
19:15:42 [INFO] test episode 70: reward = 10.00, steps = 2216
19:15:58 [INFO] test episode 71: reward = 19.00, steps = 1799
19:16:16 [INFO] test episode 72: reward = 18.00, steps = 1910
19:16:32 [INFO] test episode 73: reward = 19.00, steps = 1796
19:16:49 [INFO] test episode 74: reward = 18.00, steps = 1910
19:17:06 [INFO] test episode 75: reward = 18.00, steps = 1910
19:17:25 [INFO] test episode 76: reward = 10.00, steps = 2213
19:17:45 [INFO] test episode 77: reward = 10.00, steps = 2213
19:18:05 [INFO] test episode 78: reward = 10.00, steps = 2217
19:18:22 [INFO] test episode 79: reward = 18.00, steps = 1914
19:18:41 [INFO] test episode 80: reward = 10.00, steps = 2213
19:19:01 [INFO] test episode 81: reward = 10.00, steps = 2210
19:19:20 [INFO] test episode 82: reward = 10.00, steps = 2217
19:19:37 [INFO] test episode 83: reward = 18.00, steps = 1915
19:19:54 [INFO] test episode 84: reward = 18.00, steps = 1915
19:20:10 [INFO] test episode 85: reward = 19.00, steps = 1788
19:20:30 [INFO] test episode 86: reward = 10.00, steps = 2211
19:20:46 [INFO] test episode 87: reward = 19.00, steps = 1798
19:21:02 [INFO] test episode 88: reward = 19.00, steps = 1788
19:21:19 [INFO] test episode 89: reward = 18.00, steps = 1911
19:21:36 [INFO] test episode 90: reward = 18.00, steps = 1914
19:21:52 [INFO] test episode 91: reward = 19.00, steps = 1794
19:22:08 [INFO] test episode 92: reward = 19.00, steps = 1794
19:22:23 [INFO] test episode 93: reward = 19.00, steps = 1788
19:22:40 [INFO] test episode 94: reward = 19.00, steps = 1793
19:22:55 [INFO] test episode 95: reward = 19.00, steps = 1784
19:23:11 [INFO] test episode 96: reward = 19.00, steps = 1799
19:23:28 [INFO] test episode 97: reward = 18.00, steps = 1912
19:23:46 [INFO] test episode 98: reward = 18.00, steps = 1912
19:24:05 [INFO] test episode 99: reward = 10.00, steps = 2213
19:24:05 [INFO] average episode reward = 16.52 ± 3.79