Use an Implicit Quantile Network (IQN) to Play Pong-v4¶

PyTorch version

In [1]:
%matplotlib inline

import copy
import logging
import itertools
import sys

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
from gym.wrappers.atari_preprocessing import AtariPreprocessing
from gym.wrappers.frame_stack import FrameStack
import matplotlib.pyplot as plt
import torch
torch.manual_seed(0)
from torch import nn
from torch import optim

# Send INFO-and-above log records to stdout, formatted as
# "HH:MM:SS [LEVEL] message".
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

Environment

In [2]:
# Build the Pong environment and wrap it with AtariPreprocessing and a
# 4-frame FrameStack, so every observation holds the 4 most recent frames.
env = FrameStack(AtariPreprocessing(gym.make('PongNoFrameskip-v4')),
        num_stack=4)

# Log every attribute of the wrapped environment and of its spec.
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
00:01:01 [INFO] env: <AtariPreprocessing<TimeLimit<AtariEnv<PongNoFrameskip-v4>>>>
00:01:01 [INFO] action_space: Discrete(6)
00:01:01 [INFO] observation_space: : Box([[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]], [[[255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  ...
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]]

 [[255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  ...
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]]

 [[255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  ...
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]]

 [[255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  ...
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]
  [255 255 255 ... 255 255 255]]], (4, 84, 84), uint8)
00:01:01 [INFO] reward_range: (-inf, inf)
00:01:01 [INFO] metadata: {'render.modes': ['human', 'rgb_array']}
00:01:01 [INFO] num_stack: 4
00:01:01 [INFO] lz4_compress: False
00:01:01 [INFO] frames: deque([], maxlen=4)
00:01:01 [INFO] id: PongNoFrameskip-v4
00:01:01 [INFO] entry_point: gym.envs.atari:AtariEnv
00:01:01 [INFO] reward_threshold: None
00:01:01 [INFO] nondeterministic: False
00:01:01 [INFO] max_episode_steps: 400000
00:01:01 [INFO] _kwargs: {'game': 'pong', 'obs_type': 'image', 'frameskip': 1}
00:01:01 [INFO] _env_name: PongNoFrameskip

Agent

In [3]:
class DQNReplayer:
    """Fixed-capacity experience replay buffer that overwrites old entries.

    Transitions live in the rows of a ``pandas.DataFrame`` whose columns are
    ``state``, ``action``, ``reward``, ``next_state`` and ``terminated``.
    """

    def __init__(self, capacity):
        """Allocate an empty buffer with ``capacity`` rows."""
        fields = ['state', 'action', 'reward', 'next_state', 'terminated']
        self.memory = pd.DataFrame(index=range(capacity), columns=fields)
        self.i = 0  # row that the next store() call writes to
        self.count = 0  # number of rows filled so far, capped at capacity
        self.capacity = capacity

    def store(self, *args):
        """Write one transition into the current row and advance the cursor.

        ``args`` supplies one value per column, in column order. When the
        cursor passes the last row it wraps around to row 0.
        """
        row = np.asarray(args, dtype=object)
        self.memory.loc[self.i] = row
        if self.count < self.capacity:
            self.count += 1
        self.i = (self.i + 1) % self.capacity

    def sample(self, size):
        """Randomly pick ``size`` of the filled rows (repeats are possible).

        Returns a generator that yields one stacked array per column, in
        column order, so the caller can unpack it into five arrays.
        """
        picked = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[picked, name]) for name in
                self.memory.columns)
In [4]:
class Net(nn.Module):
    """Implicit Quantile Network: maps a state and quantile fractions to
    per-action quantile values.

    The convolutional features of the state are multiplied element-wise with
    a cosine embedding of each quantile fraction, and the product is passed
    through a fully connected head that outputs one value per action.
    """

    def __init__(self, action_n, sample_count, cosine_count=64):
        """
        Args:
            action_n: number of discrete actions (size of the output head).
            sample_count: number of quantile fractions per state; stored on
                the module but not used by ``forward``, which takes the
                sample dimension from ``cumprob_tensor``.
            cosine_count: number of cosine basis functions used to embed a
                quantile fraction.
        """
        super().__init__()
        self.sample_count = sample_count
        self.cosine_count = cosine_count
        self.conv = nn.Sequential(
                nn.Conv2d(4, 32, kernel_size=8, stride=4), nn.ReLU(),
                nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
                nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
                nn.Flatten())
        # 3136 = 64 channels * 7 * 7, the flattened conv output for 84x84
        # inputs; the embedding must have the same width to be multiplied
        # with it. The input width follows cosine_count (it used to be
        # hard-coded to 64, which broke any non-default cosine_count).
        self.emb = nn.Sequential(
                nn.Linear(in_features=cosine_count, out_features=3136),
                nn.ReLU())
        self.fc = nn.Sequential(
                nn.Linear(in_features=3136, out_features=512), nn.ReLU(),
                nn.Linear(in_features=512, out_features=action_n))

    def forward(self, input_tensor, cumprob_tensor):
        """Evaluate quantile values for every action.

        Args:
            input_tensor: batch of stacked frames, (batch, 4, height, width);
                the spatial size must flatten to 3136 conv features
                (e.g. 84x84).
            cumprob_tensor: quantile fractions, (batch, samples, 1).

        Returns:
            Tensor of shape (batch, action_n, samples).
        """
        # (batch, 1, 3136): broadcast against one embedding per sample.
        logit_tensor = self.conv(input_tensor).unsqueeze(1)
        # Basis indices 1..cosine_count, created on the same device as the
        # fractions so the module also works when moved off the CPU.
        index_tensor = torch.arange(start=1, end=self.cosine_count + 1,
                device=cumprob_tensor.device).view(1, 1, self.cosine_count)
        cosine_tensor = torch.cos(index_tensor * np.pi * cumprob_tensor)
        emb_tensor = self.emb(cosine_tensor)  # (batch, samples, 3136)
        prod_tensor = logit_tensor * emb_tensor
        # (batch, samples, action_n) -> (batch, action_n, samples)
        output_tensor = self.fc(prod_tensor).transpose(1, 2)
        return output_tensor
In [5]:
class IQNAgent:
    """Epsilon-greedy agent trained with an Implicit Quantile Network.

    The agent keeps an evaluation network that is optimized and a target
    network that slowly tracks it, plus a replay buffer of transitions.
    """

    def __init__(self, env):
        """Create the replay buffer, both networks, the optimizer and loss.

        Args:
            env: environment; only ``env.action_space.n`` is read.
        """
        self.action_n = env.action_space.n
        self.gamma = 0.99  # discount factor
        self.epsilon = 1.  # exploration rate, decayed inside learn()

        self.replayer = DQNReplayer(capacity=100000)

        self.sample_count = 8  # quantile fractions drawn per forward pass
        self.evaluate_net = Net(action_n=self.action_n,
                sample_count=self.sample_count)
        self.target_net = copy.deepcopy(self.evaluate_net)
        self.optimizer = optim.Adam(self.evaluate_net.parameters(), lr=0.0001)
        # Element-wise Huber loss; it is weighted per quantile in learn().
        self.loss = nn.SmoothL1Loss(reduction="none")

    def reset(self, mode=None):
        """Start a new episode; ``mode='train'`` enables exploration,
        transition recording and learning."""
        self.mode = mode
        if mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        """Choose an action and, in train mode, record/learn from the step.

        Args:
            observation: current observation.
            reward: reward received for the previous action.
            terminated: whether ``observation`` is a terminal state.

        Returns:
            The chosen action index.
        """
        state_tensor = torch.as_tensor(observation,
                dtype=torch.float).unsqueeze(0)
        cumprob_tensor = torch.rand(1, self.sample_count, 1)
        # Action selection needs no gradients, so skip building the graph.
        with torch.no_grad():
            q_component_tensor = self.evaluate_net(state_tensor,
                    cumprob_tensor)
        # Average the sampled quantiles to estimate each action's value.
        q_tensor = q_component_tensor.mean(2)
        action = q_tensor.argmax(dim=1).numpy()[0]
        if self.mode == 'train':
            if np.random.rand() < self.epsilon:
                action = np.random.randint(0, self.action_n)

            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # The last two (observation, reward, terminated, action)
                # records form one transition.
                state, _, _, act, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, terminated)
            # NOTE(review): replayer.count stops growing once the buffer is
            # full; with capacity 100000 (a multiple of 10) the modulo test
            # is then always true and learn() runs on every step instead of
            # every 10th. Kept as-is because it changes training dynamics;
            # confirm whether this cadence is intended.
            if self.replayer.count >= 1024 and self.replayer.count % 10 == 0:
                self.learn()
        return action

    def close(self):
        """Nothing to release."""
        pass

    def update_net(self, target_net, evaluate_net, learning_rate=0.005):
        """Move ``target_net`` parameters a fraction ``learning_rate`` of the
        way towards those of ``evaluate_net`` (soft update)."""
        with torch.no_grad():
            for target_param, evaluate_param in zip(
                    target_net.parameters(), evaluate_net.parameters()):
                target_param.copy_(learning_rate * evaluate_param
                        + (1 - learning_rate) * target_param)

    def learn(self):
        """Run one optimization step on a replayed batch, then soft-update
        the target network and decay epsilon."""
        # replay
        batch_size = 32
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(batch_size)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)

        # calculate target
        # The target is a constant for the optimizer: compute it without
        # autograd so backward() does not run through target_net and leave
        # stale gradients on its parameters.
        with torch.no_grad():
            # The evaluation network picks the next action ...
            next_cumprob_tensor = torch.rand(batch_size, self.sample_count, 1)
            next_q_component_tensor = self.evaluate_net(next_state_tensor,
                    next_cumprob_tensor)
            next_q_tensor = next_q_component_tensor.mean(2)
            next_actions = next_q_tensor.argmax(dim=1).numpy()
            # ... and the target network supplies its quantile values.
            next_cumprob_tensor = torch.rand(batch_size, self.sample_count, 1)
            all_next_q_quantile_tensor = self.target_net(next_state_tensor,
                    next_cumprob_tensor)
            next_q_quantile_tensor = all_next_q_quantile_tensor[
                    range(batch_size), next_actions, :]
            target_quantile_tensor = reward_tensor.reshape(batch_size, 1) \
                    + self.gamma * next_q_quantile_tensor \
                    * (1. - terminated_tensor).reshape(-1, 1)

        cumprob_tensor = torch.rand(batch_size, self.sample_count, 1)
        all_q_quantile_tensor = self.evaluate_net(state_tensor, cumprob_tensor)
        q_quantile_tensor = all_q_quantile_tensor[range(batch_size), actions, :]
        # Compare every predicted quantile (dim 1) with every target
        # quantile (dim 2): both tensors broadcast to (batch, N, N).
        target_quantile_tensor = target_quantile_tensor.unsqueeze(1)
        q_quantile_tensor = q_quantile_tensor.unsqueeze(2)
        huber_loss_tensor = self.loss(target_quantile_tensor, q_quantile_tensor)
        comparison_tensor = (target_quantile_tensor <
                q_quantile_tensor).detach().float()
        # Quantile-regression weight |tau - 1{target < prediction}|.
        quantile_regression_tensor = (cumprob_tensor -
                comparison_tensor).abs()
        quantile_huber_loss_tensor = (huber_loss_tensor *
                quantile_regression_tensor).sum(-1).mean(1)
        loss_tensor = quantile_huber_loss_tensor.mean()
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()

        self.update_net(self.target_net, self.evaluate_net)

        # Linear epsilon decay per learning step, floored at 0.05.
        self.epsilon = max(self.epsilon - 1e-5, 0.05)


# Instantiate the IQN agent for the environment `env`.
agent = IQNAgent(env)

Train & Test

In [6]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Play one episode of ``env`` with ``agent`` and report the outcome.

    The agent is consulted once more after the episode ends, so it can
    observe the final reward and termination flag; the action it returns
    for that last call is not executed.

    Args:
        env: environment providing ``reset(seed=...)``, ``step(action)`` and
            ``render()``.
        agent: object providing ``reset(mode=...)``, ``step(observation,
            reward, terminated)`` and ``close()``.
        seed: forwarded to ``env.reset``.
        mode: forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called after every agent step.

    Returns:
        Tuple ``(episode_reward, elapsed_steps)``: the summed reward and the
        number of environment steps taken.
    """
    observation, _ = env.reset(seed=seed)
    agent.reset(mode=mode)
    reward = 0.
    terminated = truncated = False
    episode_reward = 0.
    # The loop index doubles as the count of completed environment steps.
    for elapsed_steps in itertools.count():
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
    agent.close()
    return episode_reward, elapsed_steps


# Training: keep playing episodes in train mode until the mean reward of the
# most recent episodes (up to 5) exceeds 16.
logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    episode_reward, elapsed_steps = play_episode(env, agent, mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-5:]) > 16.:
        break
    episode += 1
# Learning curve: reward of each training episode.
plt.plot(episode_rewards)


# Evaluation: 100 episodes with the default mode (no exploration or
# learning in the agent), then report mean and standard deviation.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
00:01:02 [INFO] ==== train ====
00:01:16 [INFO] train episode 0: reward = -19.00, steps = 1010
00:01:53 [INFO] train episode 1: reward = -21.00, steps = 998
00:02:30 [INFO] train episode 2: reward = -19.00, steps = 987
00:03:02 [INFO] train episode 3: reward = -21.00, steps = 848
00:03:36 [INFO] train episode 4: reward = -21.00, steps = 880
00:04:09 [INFO] train episode 5: reward = -20.00, steps = 881
00:04:45 [INFO] train episode 6: reward = -20.00, steps = 943
00:05:20 [INFO] train episode 7: reward = -21.00, steps = 924
00:05:58 [INFO] train episode 8: reward = -20.00, steps = 991
00:06:31 [INFO] train episode 9: reward = -21.00, steps = 819
00:07:12 [INFO] train episode 10: reward = -20.00, steps = 1033
00:07:45 [INFO] train episode 11: reward = -21.00, steps = 806
00:08:18 [INFO] train episode 12: reward = -21.00, steps = 825
00:08:56 [INFO] train episode 13: reward = -20.00, steps = 969
00:09:30 [INFO] train episode 14: reward = -21.00, steps = 851
00:10:06 [INFO] train episode 15: reward = -21.00, steps = 899
00:10:43 [INFO] train episode 16: reward = -20.00, steps = 917
00:11:18 [INFO] train episode 17: reward = -21.00, steps = 877
00:11:52 [INFO] train episode 18: reward = -21.00, steps = 879
00:12:38 [INFO] train episode 19: reward = -18.00, steps = 1160
00:13:09 [INFO] train episode 20: reward = -21.00, steps = 777
00:13:43 [INFO] train episode 21: reward = -20.00, steps = 864
00:14:18 [INFO] train episode 22: reward = -20.00, steps = 865
00:14:56 [INFO] train episode 23: reward = -20.00, steps = 959
00:15:29 [INFO] train episode 24: reward = -21.00, steps = 821
00:16:03 [INFO] train episode 25: reward = -21.00, steps = 852
00:16:42 [INFO] train episode 26: reward = -20.00, steps = 973
00:17:26 [INFO] train episode 27: reward = -18.00, steps = 1117
00:18:01 [INFO] train episode 28: reward = -20.00, steps = 878
00:18:31 [INFO] train episode 29: reward = -21.00, steps = 786
00:19:16 [INFO] train episode 30: reward = -20.00, steps = 1139
00:19:53 [INFO] train episode 31: reward = -21.00, steps = 942
00:20:33 [INFO] train episode 32: reward = -21.00, steps = 991
00:21:06 [INFO] train episode 33: reward = -21.00, steps = 820
00:21:41 [INFO] train episode 34: reward = -20.00, steps = 904
00:22:23 [INFO] train episode 35: reward = -19.00, steps = 1063
00:22:57 [INFO] train episode 36: reward = -21.00, steps = 884
00:23:39 [INFO] train episode 37: reward = -19.00, steps = 1066
00:24:19 [INFO] train episode 38: reward = -20.00, steps = 999
00:24:59 [INFO] train episode 39: reward = -19.00, steps = 1036
00:25:30 [INFO] train episode 40: reward = -21.00, steps = 790
00:26:03 [INFO] train episode 41: reward = -21.00, steps = 824
00:26:42 [INFO] train episode 42: reward = -20.00, steps = 987
00:27:19 [INFO] train episode 43: reward = -21.00, steps = 937
00:27:52 [INFO] train episode 44: reward = -20.00, steps = 835
00:28:24 [INFO] train episode 45: reward = -21.00, steps = 823
00:28:56 [INFO] train episode 46: reward = -21.00, steps = 823
00:29:35 [INFO] train episode 47: reward = -19.00, steps = 997
00:30:12 [INFO] train episode 48: reward = -20.00, steps = 938
00:30:46 [INFO] train episode 49: reward = -20.00, steps = 866
00:31:16 [INFO] train episode 50: reward = -21.00, steps = 760
00:31:50 [INFO] train episode 51: reward = -21.00, steps = 866
00:32:27 [INFO] train episode 52: reward = -20.00, steps = 929
00:33:01 [INFO] train episode 53: reward = -20.00, steps = 864
00:33:40 [INFO] train episode 54: reward = -21.00, steps = 984
00:34:14 [INFO] train episode 55: reward = -21.00, steps = 882
00:34:46 [INFO] train episode 56: reward = -21.00, steps = 809
00:35:20 [INFO] train episode 57: reward = -21.00, steps = 850
00:35:52 [INFO] train episode 58: reward = -21.00, steps = 815
00:36:26 [INFO] train episode 59: reward = -21.00, steps = 851
00:37:05 [INFO] train episode 60: reward = -21.00, steps = 995
00:37:42 [INFO] train episode 61: reward = -20.00, steps = 947
00:38:18 [INFO] train episode 62: reward = -21.00, steps = 896
00:38:53 [INFO] train episode 63: reward = -21.00, steps = 896
00:39:32 [INFO] train episode 64: reward = -20.00, steps = 977
00:40:11 [INFO] train episode 65: reward = -21.00, steps = 971
00:40:46 [INFO] train episode 66: reward = -21.00, steps = 883
00:41:22 [INFO] train episode 67: reward = -21.00, steps = 877
00:41:57 [INFO] train episode 68: reward = -21.00, steps = 885
00:42:30 [INFO] train episode 69: reward = -21.00, steps = 847
00:43:05 [INFO] train episode 70: reward = -21.00, steps = 865
00:43:38 [INFO] train episode 71: reward = -21.00, steps = 817
00:44:09 [INFO] train episode 72: reward = -21.00, steps = 786
00:44:39 [INFO] train episode 73: reward = -21.00, steps = 782
00:45:10 [INFO] train episode 74: reward = -21.00, steps = 780
00:45:47 [INFO] train episode 75: reward = -19.00, steps = 938
00:46:23 [INFO] train episode 76: reward = -21.00, steps = 908
00:47:00 [INFO] train episode 77: reward = -20.00, steps = 947
00:47:35 [INFO] train episode 78: reward = -21.00, steps = 862
00:48:13 [INFO] train episode 79: reward = -20.00, steps = 986
00:48:51 [INFO] train episode 80: reward = -19.00, steps = 961
00:49:24 [INFO] train episode 81: reward = -20.00, steps = 841
00:50:03 [INFO] train episode 82: reward = -19.00, steps = 977
00:50:34 [INFO] train episode 83: reward = -21.00, steps = 789
00:51:16 [INFO] train episode 84: reward = -18.00, steps = 1067
00:51:49 [INFO] train episode 85: reward = -21.00, steps = 839
00:52:32 [INFO] train episode 86: reward = -19.00, steps = 1095
00:53:05 [INFO] train episode 87: reward = -21.00, steps = 850
00:53:49 [INFO] train episode 88: reward = -21.00, steps = 1106
00:54:19 [INFO] train episode 89: reward = -21.00, steps = 776
00:54:56 [INFO] train episode 90: reward = -20.00, steps = 926
00:55:36 [INFO] train episode 91: reward = -18.00, steps = 1026
00:56:12 [INFO] train episode 92: reward = -21.00, steps = 927
00:56:55 [INFO] train episode 93: reward = -20.00, steps = 1097
00:57:27 [INFO] train episode 94: reward = -21.00, steps = 821
00:58:03 [INFO] train episode 95: reward = -21.00, steps = 925
00:58:38 [INFO] train episode 96: reward = -20.00, steps = 899
00:59:13 [INFO] train episode 97: reward = -21.00, steps = 907
00:59:54 [INFO] train episode 98: reward = -21.00, steps = 1057
01:00:29 [INFO] train episode 99: reward = -21.00, steps = 908
01:01:06 [INFO] train episode 100: reward = -19.00, steps = 942
01:01:43 [INFO] train episode 101: reward = -20.00, steps = 963
01:02:21 [INFO] train episode 102: reward = -21.00, steps = 970
01:02:59 [INFO] train episode 103: reward = -21.00, steps = 989
01:03:32 [INFO] train episode 104: reward = -20.00, steps = 839
01:04:03 [INFO] train episode 105: reward = -21.00, steps = 788
01:04:44 [INFO] train episode 106: reward = -19.00, steps = 1060
01:05:17 [INFO] train episode 107: reward = -21.00, steps = 851
01:05:49 [INFO] train episode 108: reward = -21.00, steps = 824
01:08:00 [INFO] train episode 109: reward = -21.00, steps = 992
01:12:43 [INFO] train episode 110: reward = -17.00, steps = 1230
01:17:14 [INFO] train episode 111: reward = -19.00, steps = 1174
01:21:43 [INFO] train episode 112: reward = -19.00, steps = 1150
01:25:13 [INFO] train episode 113: reward = -20.00, steps = 908
01:29:21 [INFO] train episode 114: reward = -19.00, steps = 1067
01:32:23 [INFO] train episode 115: reward = -21.00, steps = 781
01:36:29 [INFO] train episode 116: reward = -19.00, steps = 1058
01:39:40 [INFO] train episode 117: reward = -21.00, steps = 825
01:42:43 [INFO] train episode 118: reward = -21.00, steps = 787
01:47:57 [INFO] train episode 119: reward = -16.00, steps = 1356
01:51:49 [INFO] train episode 120: reward = -21.00, steps = 998
01:54:59 [INFO] train episode 121: reward = -21.00, steps = 818
01:58:06 [INFO] train episode 122: reward = -21.00, steps = 807
02:01:40 [INFO] train episode 123: reward = -20.00, steps = 924
02:05:13 [INFO] train episode 124: reward = -21.00, steps = 919
02:08:50 [INFO] train episode 125: reward = -19.00, steps = 938
02:13:40 [INFO] train episode 126: reward = -19.00, steps = 1255
02:18:06 [INFO] train episode 127: reward = -20.00, steps = 1147
02:23:05 [INFO] train episode 128: reward = -18.00, steps = 1283
02:28:26 [INFO] train episode 129: reward = -20.00, steps = 1391
02:33:49 [INFO] train episode 130: reward = -16.00, steps = 1393
02:37:48 [INFO] train episode 131: reward = -20.00, steps = 1038
02:43:50 [INFO] train episode 132: reward = -18.00, steps = 1577
02:50:02 [INFO] train episode 133: reward = -19.00, steps = 1617
02:55:08 [INFO] train episode 134: reward = -19.00, steps = 1332
02:59:23 [INFO] train episode 135: reward = -19.00, steps = 1114
03:04:15 [INFO] train episode 136: reward = -20.00, steps = 1279
03:08:20 [INFO] train episode 137: reward = -20.00, steps = 1071
03:14:08 [INFO] train episode 138: reward = -16.00, steps = 1528
03:19:22 [INFO] train episode 139: reward = -18.00, steps = 1379
03:25:01 [INFO] train episode 140: reward = -17.00, steps = 1480
03:29:53 [INFO] train episode 141: reward = -20.00, steps = 1284
03:35:27 [INFO] train episode 142: reward = -19.00, steps = 1472
03:40:10 [INFO] train episode 143: reward = -20.00, steps = 1253
03:45:41 [INFO] train episode 144: reward = -17.00, steps = 1469
03:49:55 [INFO] train episode 145: reward = -20.00, steps = 1125
03:54:47 [INFO] train episode 146: reward = -20.00, steps = 1295
03:59:57 [INFO] train episode 147: reward = -19.00, steps = 1381
04:05:16 [INFO] train episode 148: reward = -19.00, steps = 1368
04:10:10 [INFO] train episode 149: reward = -21.00, steps = 1309
04:16:40 [INFO] train episode 150: reward = -18.00, steps = 1740
04:23:01 [INFO] train episode 151: reward = -21.00, steps = 1696
04:28:02 [INFO] train episode 152: reward = -21.00, steps = 1348
04:33:22 [INFO] train episode 153: reward = -19.00, steps = 1427
04:40:08 [INFO] train episode 154: reward = -17.00, steps = 1832
04:48:20 [INFO] train episode 155: reward = -15.00, steps = 2212
04:54:04 [INFO] train episode 156: reward = -18.00, steps = 1550
05:00:19 [INFO] train episode 157: reward = -17.00, steps = 1684
05:06:23 [INFO] train episode 158: reward = -18.00, steps = 1624
05:11:37 [INFO] train episode 159: reward = -19.00, steps = 1384
05:18:39 [INFO] train episode 160: reward = -15.00, steps = 1857
05:25:22 [INFO] train episode 161: reward = -17.00, steps = 1756
05:31:13 [INFO] train episode 162: reward = -20.00, steps = 1531
05:38:21 [INFO] train episode 163: reward = -15.00, steps = 1874
05:46:18 [INFO] train episode 164: reward = -15.00, steps = 2089
05:53:03 [INFO] train episode 165: reward = -14.00, steps = 1784
06:02:27 [INFO] train episode 166: reward = -8.00, steps = 2475
06:10:22 [INFO] train episode 167: reward = -13.00, steps = 2094
06:17:06 [INFO] train episode 168: reward = -18.00, steps = 1777
06:27:33 [INFO] train episode 169: reward = -8.00, steps = 2764
06:33:12 [INFO] train episode 170: reward = -17.00, steps = 1489
06:39:08 [INFO] train episode 171: reward = -17.00, steps = 1565
06:45:55 [INFO] train episode 172: reward = -15.00, steps = 1793
06:55:21 [INFO] train episode 173: reward = -9.00, steps = 2487
07:02:53 [INFO] train episode 174: reward = -13.00, steps = 1994
07:09:25 [INFO] train episode 175: reward = -14.00, steps = 1722
07:17:58 [INFO] train episode 176: reward = -8.00, steps = 2260
07:27:16 [INFO] train episode 177: reward = -5.00, steps = 2458
07:32:49 [INFO] train episode 178: reward = -16.00, steps = 1462
07:38:38 [INFO] train episode 179: reward = -15.00, steps = 1536
07:45:17 [INFO] train episode 180: reward = -15.00, steps = 1750
07:53:46 [INFO] train episode 181: reward = -7.00, steps = 2240
08:03:30 [INFO] train episode 182: reward = -7.00, steps = 2569
08:16:16 [INFO] train episode 183: reward = -3.00, steps = 3346
08:26:03 [INFO] train episode 184: reward = -4.00, steps = 2609
08:33:30 [INFO] train episode 185: reward = -9.00, steps = 1992
08:40:22 [INFO] train episode 186: reward = -11.00, steps = 1843
08:46:30 [INFO] train episode 187: reward = -14.00, steps = 1647
08:52:25 [INFO] train episode 188: reward = -14.00, steps = 1587
08:56:32 [INFO] train episode 189: reward = -19.00, steps = 1106
09:06:26 [INFO] train episode 190: reward = -5.00, steps = 2545
09:13:08 [INFO] train episode 191: reward = -11.00, steps = 1818
09:22:18 [INFO] train episode 192: reward = -6.00, steps = 2472
09:29:00 [INFO] train episode 193: reward = -13.00, steps = 1810
09:37:50 [INFO] train episode 194: reward = -8.00, steps = 2387
09:44:21 [INFO] train episode 195: reward = -14.00, steps = 1754
09:53:44 [INFO] train episode 196: reward = -6.00, steps = 2533
10:03:03 [INFO] train episode 197: reward = -4.00, steps = 2511
10:11:45 [INFO] train episode 198: reward = -8.00, steps = 2351
10:18:14 [INFO] train episode 199: reward = -13.00, steps = 1755
10:27:32 [INFO] train episode 200: reward = -7.00, steps = 2485
10:34:10 [INFO] train episode 201: reward = -12.00, steps = 1789
10:44:14 [INFO] train episode 202: reward = -4.00, steps = 2719
10:50:20 [INFO] train episode 203: reward = -12.00, steps = 1652
10:57:59 [INFO] train episode 204: reward = -10.00, steps = 2065
11:02:51 [INFO] train episode 205: reward = -17.00, steps = 1313
11:11:48 [INFO] train episode 206: reward = -7.00, steps = 2423
11:21:12 [INFO] train episode 207: reward = -7.00, steps = 2535
11:28:05 [INFO] train episode 208: reward = -12.00, steps = 1865
11:33:23 [INFO] train episode 209: reward = -16.00, steps = 1432
11:38:41 [INFO] train episode 210: reward = -15.00, steps = 1441
11:48:28 [INFO] train episode 211: reward = -4.00, steps = 2664
11:54:04 [INFO] train episode 212: reward = -15.00, steps = 1527
12:03:45 [INFO] train episode 213: reward = -6.00, steps = 2640
12:09:56 [INFO] train episode 214: reward = -14.00, steps = 1688
12:15:53 [INFO] train episode 215: reward = -13.00, steps = 1627
12:24:53 [INFO] train episode 216: reward = -7.00, steps = 2441
12:31:15 [INFO] train episode 217: reward = -13.00, steps = 1731
12:38:12 [INFO] train episode 218: reward = -12.00, steps = 1895
12:45:46 [INFO] train episode 219: reward = -10.00, steps = 2062
12:53:48 [INFO] train episode 220: reward = -8.00, steps = 2192
13:02:22 [INFO] train episode 221: reward = -4.00, steps = 2350
13:08:46 [INFO] train episode 222: reward = -11.00, steps = 1751
13:15:57 [INFO] train episode 223: reward = -11.00, steps = 1969
13:22:44 [INFO] train episode 224: reward = -13.00, steps = 1848
13:30:07 [INFO] train episode 225: reward = -9.00, steps = 2022
13:39:23 [INFO] train episode 226: reward = -6.00, steps = 2545
13:47:45 [INFO] train episode 227: reward = -8.00, steps = 2300
13:55:39 [INFO] train episode 228: reward = -8.00, steps = 2178
14:01:37 [INFO] train episode 229: reward = -14.00, steps = 1636
14:09:04 [INFO] train episode 230: reward = -8.00, steps = 2048
14:16:44 [INFO] train episode 231: reward = -9.00, steps = 2111
14:26:00 [INFO] train episode 232: reward = -5.00, steps = 2545
14:33:34 [INFO] train episode 233: reward = -9.00, steps = 2081
14:38:49 [INFO] train episode 234: reward = -16.00, steps = 1437
14:47:27 [INFO] train episode 235: reward = -3.00, steps = 2377
14:55:44 [INFO] train episode 236: reward = -7.00, steps = 2286
15:01:07 [INFO] train episode 237: reward = -14.00, steps = 1486
15:09:14 [INFO] train episode 238: reward = -8.00, steps = 2237
15:16:58 [INFO] train episode 239: reward = -7.00, steps = 2140
15:23:32 [INFO] train episode 240: reward = -11.00, steps = 1806
15:32:16 [INFO] train episode 241: reward = -7.00, steps = 2414
15:40:12 [INFO] train episode 242: reward = -10.00, steps = 2193
15:50:41 [INFO] train episode 243: reward = -3.00, steps = 2900
16:01:38 [INFO] train episode 244: reward = 1.00, steps = 3027
16:12:12 [INFO] train episode 245: reward = 2.00, steps = 2924
16:22:00 [INFO] train episode 246: reward = 5.00, steps = 2698
16:30:19 [INFO] train episode 247: reward = -6.00, steps = 2307
16:39:20 [INFO] train episode 248: reward = -3.00, steps = 2501
16:46:44 [INFO] train episode 249: reward = -8.00, steps = 2050
16:56:31 [INFO] train episode 250: reward = 7.00, steps = 2715
17:04:06 [INFO] train episode 251: reward = 14.00, steps = 2106
17:12:43 [INFO] train episode 252: reward = -4.00, steps = 2396
17:20:36 [INFO] train episode 253: reward = 11.00, steps = 2190
17:28:15 [INFO] train episode 254: reward = 14.00, steps = 2112
17:37:49 [INFO] train episode 255: reward = 5.00, steps = 2654
17:45:13 [INFO] train episode 256: reward = 15.00, steps = 2059
17:52:22 [INFO] train episode 257: reward = 15.00, steps = 1996
18:00:37 [INFO] train episode 258: reward = 13.00, steps = 2299
18:09:32 [INFO] train episode 259: reward = 9.00, steps = 2479
18:17:42 [INFO] train episode 260: reward = 10.00, steps = 2273
18:26:23 [INFO] train episode 261: reward = -1.00, steps = 2420
18:33:13 [INFO] train episode 262: reward = 16.00, steps = 1898
18:41:09 [INFO] train episode 263: reward = 10.00, steps = 2214
18:47:55 [INFO] train episode 264: reward = 17.00, steps = 1892
18:55:56 [INFO] train episode 265: reward = 14.00, steps = 2238
19:03:47 [INFO] train episode 266: reward = 9.00, steps = 2189
19:10:55 [INFO] train episode 267: reward = 15.00, steps = 1988
19:19:26 [INFO] train episode 268: reward = 1.00, steps = 2371
19:28:20 [INFO] train episode 269: reward = 7.00, steps = 2471
19:36:47 [INFO] train episode 270: reward = -5.00, steps = 2360
19:43:53 [INFO] train episode 271: reward = 16.00, steps = 1978
19:53:39 [INFO] train episode 272: reward = 1.00, steps = 2621
20:01:39 [INFO] train episode 273: reward = 16.00, steps = 2120
20:11:26 [INFO] train episode 274: reward = -5.00, steps = 2511
20:18:54 [INFO] train episode 275: reward = 17.00, steps = 1895
20:26:55 [INFO] train episode 276: reward = 16.00, steps = 1982
20:35:59 [INFO] train episode 277: reward = 12.00, steps = 2214
20:48:07 [INFO] train episode 278: reward = 5.00, steps = 2548
20:56:10 [INFO] train episode 279: reward = 16.00, steps = 1904
21:04:37 [INFO] train episode 280: reward = 14.00, steps = 2015
21:14:03 [INFO] train episode 281: reward = 12.00, steps = 2202
21:24:20 [INFO] train episode 282: reward = 7.00, steps = 2563
21:31:39 [INFO] train episode 283: reward = 18.00, steps = 1840
21:39:22 [INFO] train episode 284: reward = 15.00, steps = 1983
21:47:20 [INFO] train episode 285: reward = 11.00, steps = 2054
21:54:58 [INFO] train episode 286: reward = 17.00, steps = 1934
22:03:09 [INFO] train episode 287: reward = 15.00, steps = 2069
22:10:30 [INFO] train episode 288: reward = 17.00, steps = 1851
22:19:17 [INFO] train episode 289: reward = 12.00, steps = 2190
22:27:54 [INFO] train episode 290: reward = 14.00, steps = 2149
22:37:09 [INFO] train episode 291: reward = 9.00, steps = 2289
22:45:05 [INFO] train episode 292: reward = 13.00, steps = 2046
22:52:17 [INFO] train episode 293: reward = 17.00, steps = 1820
23:01:21 [INFO] train episode 294: reward = 16.00, steps = 2217
23:08:47 [INFO] train episode 295: reward = 18.00, steps = 1843
23:16:19 [INFO] train episode 296: reward = 17.00, steps = 1865
23:16:19 [INFO] ==== test ====
23:16:43 [INFO] test episode 0: reward = 20.00, steps = 1664
23:17:09 [INFO] test episode 1: reward = 20.00, steps = 1732
23:17:33 [INFO] test episode 2: reward = 19.00, steps = 1700
23:17:59 [INFO] test episode 3: reward = 19.00, steps = 1787
23:18:25 [INFO] test episode 4: reward = 20.00, steps = 1664
23:18:59 [INFO] test episode 5: reward = 20.00, steps = 1665
23:19:37 [INFO] test episode 6: reward = 19.00, steps = 1765
23:20:10 [INFO] test episode 7: reward = 19.00, steps = 1696
23:20:39 [INFO] test episode 8: reward = 19.00, steps = 1697
23:21:07 [INFO] test episode 9: reward = 20.00, steps = 1725
23:21:32 [INFO] test episode 10: reward = 20.00, steps = 1662
23:21:57 [INFO] test episode 11: reward = 19.00, steps = 1740
23:22:21 [INFO] test episode 12: reward = 19.00, steps = 1699
23:22:46 [INFO] test episode 13: reward = 19.00, steps = 1700
23:23:12 [INFO] test episode 14: reward = 19.00, steps = 1763
23:23:37 [INFO] test episode 15: reward = 20.00, steps = 1728
23:24:03 [INFO] test episode 16: reward = 19.00, steps = 1745
23:24:31 [INFO] test episode 17: reward = 19.00, steps = 1726
23:24:57 [INFO] test episode 18: reward = 19.00, steps = 1702
23:25:22 [INFO] test episode 19: reward = 19.00, steps = 1736
23:25:46 [INFO] test episode 20: reward = 19.00, steps = 1699
23:26:11 [INFO] test episode 21: reward = 19.00, steps = 1706
23:26:36 [INFO] test episode 22: reward = 19.00, steps = 1699
23:27:00 [INFO] test episode 23: reward = 20.00, steps = 1670
23:27:25 [INFO] test episode 24: reward = 19.00, steps = 1702
23:27:51 [INFO] test episode 25: reward = 19.00, steps = 1699
23:28:16 [INFO] test episode 26: reward = 19.00, steps = 1766
23:28:42 [INFO] test episode 27: reward = 20.00, steps = 1716
23:29:08 [INFO] test episode 28: reward = 20.00, steps = 1661
23:29:33 [INFO] test episode 29: reward = 19.00, steps = 1762
23:30:01 [INFO] test episode 30: reward = 19.00, steps = 1741
23:30:29 [INFO] test episode 31: reward = 19.00, steps = 1846
23:30:53 [INFO] test episode 32: reward = 20.00, steps = 1662
23:31:20 [INFO] test episode 33: reward = 19.00, steps = 1821
23:31:46 [INFO] test episode 34: reward = 19.00, steps = 1706
23:32:11 [INFO] test episode 35: reward = 19.00, steps = 1702
23:32:36 [INFO] test episode 36: reward = 19.00, steps = 1761
23:33:00 [INFO] test episode 37: reward = 19.00, steps = 1702
23:33:24 [INFO] test episode 38: reward = 20.00, steps = 1666
23:33:49 [INFO] test episode 39: reward = 19.00, steps = 1759
23:34:13 [INFO] test episode 40: reward = 20.00, steps = 1665
23:34:39 [INFO] test episode 41: reward = 19.00, steps = 1759
23:35:04 [INFO] test episode 42: reward = 19.00, steps = 1746
23:35:28 [INFO] test episode 43: reward = 19.00, steps = 1700
23:35:54 [INFO] test episode 44: reward = 19.00, steps = 1768
23:36:21 [INFO] test episode 45: reward = 19.00, steps = 1878
23:36:46 [INFO] test episode 46: reward = 19.00, steps = 1762
23:37:11 [INFO] test episode 47: reward = 20.00, steps = 1670
23:37:35 [INFO] test episode 48: reward = 19.00, steps = 1698
23:37:59 [INFO] test episode 49: reward = 20.00, steps = 1666
23:38:24 [INFO] test episode 50: reward = 19.00, steps = 1763
23:38:49 [INFO] test episode 51: reward = 19.00, steps = 1729
23:39:13 [INFO] test episode 52: reward = 19.00, steps = 1701
23:39:38 [INFO] test episode 53: reward = 19.00, steps = 1723
23:40:02 [INFO] test episode 54: reward = 20.00, steps = 1661
23:40:26 [INFO] test episode 55: reward = 20.00, steps = 1662
23:40:50 [INFO] test episode 56: reward = 20.00, steps = 1660
23:41:14 [INFO] test episode 57: reward = 20.00, steps = 1661
23:41:40 [INFO] test episode 58: reward = 19.00, steps = 1825
23:42:04 [INFO] test episode 59: reward = 19.00, steps = 1701
23:42:29 [INFO] test episode 60: reward = 19.00, steps = 1759
23:42:54 [INFO] test episode 61: reward = 20.00, steps = 1660
23:43:21 [INFO] test episode 62: reward = 19.00, steps = 1766
23:43:47 [INFO] test episode 63: reward = 19.00, steps = 1702
23:44:11 [INFO] test episode 64: reward = 20.00, steps = 1662
23:44:35 [INFO] test episode 65: reward = 20.00, steps = 1728
23:44:59 [INFO] test episode 66: reward = 20.00, steps = 1670
23:45:23 [INFO] test episode 67: reward = 19.00, steps = 1703
23:45:48 [INFO] test episode 68: reward = 19.00, steps = 1800
23:46:13 [INFO] test episode 69: reward = 20.00, steps = 1724
23:46:37 [INFO] test episode 70: reward = 19.00, steps = 1700
23:47:01 [INFO] test episode 71: reward = 20.00, steps = 1666
23:47:25 [INFO] test episode 72: reward = 20.00, steps = 1668
23:47:48 [INFO] test episode 73: reward = 20.00, steps = 1670
23:48:12 [INFO] test episode 74: reward = 20.00, steps = 1666
23:48:36 [INFO] test episode 75: reward = 20.00, steps = 1670
23:49:00 [INFO] test episode 76: reward = 19.00, steps = 1702
23:49:24 [INFO] test episode 77: reward = 20.00, steps = 1664
23:49:48 [INFO] test episode 78: reward = 20.00, steps = 1660
23:50:12 [INFO] test episode 79: reward = 19.00, steps = 1704
23:50:37 [INFO] test episode 80: reward = 19.00, steps = 1701
23:51:02 [INFO] test episode 81: reward = 19.00, steps = 1702
23:51:27 [INFO] test episode 82: reward = 19.00, steps = 1759
23:51:53 [INFO] test episode 83: reward = 19.00, steps = 1780
23:52:17 [INFO] test episode 84: reward = 20.00, steps = 1664
23:52:42 [INFO] test episode 85: reward = 19.00, steps = 1763
23:53:06 [INFO] test episode 86: reward = 19.00, steps = 1704
23:53:31 [INFO] test episode 87: reward = 20.00, steps = 1728
23:53:55 [INFO] test episode 88: reward = 20.00, steps = 1664
23:54:21 [INFO] test episode 89: reward = 19.00, steps = 1759
23:54:45 [INFO] test episode 90: reward = 20.00, steps = 1665
23:55:10 [INFO] test episode 91: reward = 19.00, steps = 1700
23:55:35 [INFO] test episode 92: reward = 19.00, steps = 1759
23:56:00 [INFO] test episode 93: reward = 20.00, steps = 1729
23:56:27 [INFO] test episode 94: reward = 20.00, steps = 1804
23:56:51 [INFO] test episode 95: reward = 20.00, steps = 1660
23:57:15 [INFO] test episode 96: reward = 20.00, steps = 1665
23:57:40 [INFO] test episode 97: reward = 19.00, steps = 1758
23:58:04 [INFO] test episode 98: reward = 20.00, steps = 1661
23:58:30 [INFO] test episode 99: reward = 19.00, steps = 1829
23:58:30 [INFO] average episode reward = 19.40 ± 0.49