Use Soft Actor-Critic with Auto $\alpha$ Tuning to Play LunarLander-v2¶

PyTorch version

In [1]:
%matplotlib inline

import sys
import logging
import itertools
import copy

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
torch.manual_seed(0)
import torch.nn as nn
import torch.optim as optim
import torch.distributions as distributions

logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
env = gym.make('LunarLander-v2')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
08:49:53 [INFO] env: <LunarLander<LunarLander-v2>>
08:49:53 [INFO] action_space: Discrete(4)
08:49:53 [INFO] observation_space: Box(-inf, inf, (8,), float32)
08:49:53 [INFO] reward_range: (-inf, inf)
08:49:53 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}
08:49:53 [INFO] _max_episode_steps: 1000
08:49:53 [INFO] _elapsed_steps: None
08:49:53 [INFO] id: LunarLander-v2
08:49:53 [INFO] entry_point: gym.envs.box2d:LunarLander
08:49:53 [INFO] reward_threshold: 200
08:49:53 [INFO] nondeterministic: False
08:49:53 [INFO] max_episode_steps: 1000
08:49:53 [INFO] _kwargs: {}
08:49:53 [INFO] _env_name: LunarLander
In [3]:
class DQNReplayer:
    def __init__(self, capacity):
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['state', 'action', 'reward', 'next_state', 'terminated'])
        self.i = 0
        self.count = 0
        self.capacity = capacity

    def store(self, *args):
        self.memory.loc[self.i] = np.asarray(args, dtype=object)
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.memory.columns)
In [4]:
class SACAgent:
    def __init__(self, env):
        state_dim = env.observation_space.shape[0]
        self.action_n = env.action_space.n
        self.gamma = 0.99

        self.replayer = DQNReplayer(10000)

        # create alpha
        self.target_entropy = np.log(self.action_n) / 4.
        self.ln_alpha_tensor = torch.zeros(1, requires_grad=True)
        self.alpha_optimizer = optim.Adam([self.ln_alpha_tensor,], lr=3e-4)

        # create actor
        self.actor_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256],
                output_size=self.action_n, output_activator=nn.Softmax(-1))
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=3e-4)

        # create V critic
        self.v_evaluate_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256])
        self.v_target_net = copy.deepcopy(self.v_evaluate_net)
        self.v_optimizer = optim.Adam(self.v_evaluate_net.parameters(), lr=3e-4)
        self.v_loss = nn.MSELoss()

        # create Q critic
        self.q0_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256], output_size=self.action_n)
        self.q1_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256], output_size=self.action_n)
        self.q0_loss = nn.MSELoss()
        self.q1_loss = nn.MSELoss()
        self.q0_optimizer = optim.Adam(self.q0_net.parameters(), lr=3e-4)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=3e-4)

    def build_net(self, input_size, hidden_sizes, output_size=1,
            output_activator=None):
        layers = []
        for input_size, output_size in zip(
                [input_size,] + hidden_sizes, hidden_sizes + [output_size,]):
            layers.append(nn.Linear(input_size, output_size))
            layers.append(nn.ReLU())
        layers = layers[:-1]
        if output_activator:
            layers.append(output_activator)
        net = nn.Sequential(*layers)
        return net

    def reset(self, mode=None):
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        state_tensor = torch.as_tensor(observation, dtype=torch.float).unsqueeze(0)
        prob_tensor = self.actor_net(state_tensor)
        action_tensor = distributions.Categorical(prob_tensor).sample()
        action = action_tensor.numpy()[0]
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                state, _, _, action, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, action, reward, next_state, terminated)
            if self.replayer.count >= 500:
                self.learn()
        return action

    def close(self):
        pass

    def update_net(self, target_net, evaluate_net, learning_rate=0.0025):
        for target_param, evaluate_param in zip(
                target_net.parameters(), evaluate_net.parameters()):
            target_param.data.copy_(learning_rate * evaluate_param.data
                    + (1 - learning_rate) * target_param.data)

    def learn(self):
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(128)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)

        # update alpha
        prob_tensor = self.actor_net(state_tensor)
        ln_prob_tensor = torch.log(prob_tensor.clamp(1e-6, 1))
        neg_entropy_tensor = (prob_tensor * ln_prob_tensor).sum()
        # OR neg_entropy_tensor = torch.xlogy(prob_tensor, prob_tensor).sum()
        grad_tensor = neg_entropy_tensor + self.target_entropy
        alpha_loss_tensor = -self.ln_alpha_tensor * grad_tensor.detach()
        self.alpha_optimizer.zero_grad()
        alpha_loss_tensor.backward()
        self.alpha_optimizer.step()

        # update Q critic
        next_v_tensor = self.v_target_net(next_state_tensor)
        q_target_tensor = reward_tensor.unsqueeze(1) + self.gamma * \
                (1. - terminated_tensor.unsqueeze(1)) * next_v_tensor

        all_q0_pred_tensor = self.q0_net(state_tensor)
        q0_pred_tensor = torch.gather(all_q0_pred_tensor, 1,
                action_tensor.unsqueeze(1))
        q0_loss_tensor = self.q0_loss(q0_pred_tensor, q_target_tensor.detach())
        self.q0_optimizer.zero_grad()
        q0_loss_tensor.backward()
        self.q0_optimizer.step()

        all_q1_pred_tensor = self.q1_net(state_tensor)
        q1_pred_tensor = torch.gather(all_q1_pred_tensor, 1,
                action_tensor.unsqueeze(1))
        q1_loss_tensor = self.q1_loss(q1_pred_tensor, q_target_tensor.detach())
        self.q1_optimizer.zero_grad()
        q1_loss_tensor.backward()
        self.q1_optimizer.step()

        # update V critic
        q0_tensor = self.q0_net(state_tensor)
        q1_tensor = self.q1_net(state_tensor)
        q01_tensor = torch.min(q0_tensor, q1_tensor)
        prob_tensor = self.actor_net(state_tensor)
        ln_prob_tensor = torch.log(prob_tensor.clamp(1e-6, 1.))
        alpha = self.ln_alpha_tensor.exp().detach().item()
        entropic_q01_tensor = prob_tensor * (q01_tensor - alpha * ln_prob_tensor)
        # OR entropic_q01_tensor = prob_tensor * (q01_tensor -
        #         alpha * torch.xlogy(prob_tensor, prob_tensor)
        v_target_tensor = torch.sum(entropic_q01_tensor, dim=-1, keepdim=True)
        v_pred_tensor = self.v_evaluate_net(state_tensor)
        v_loss_tensor = self.v_loss(v_pred_tensor, v_target_tensor.detach())
        self.v_optimizer.zero_grad()
        v_loss_tensor.backward()
        self.v_optimizer.step()

        self.update_net(self.v_target_net, self.v_evaluate_net)

        # update actor
        prob_q_tensor = prob_tensor * (alpha * ln_prob_tensor - q0_tensor)
        actor_loss_tensor = prob_q_tensor.sum(axis=-1).mean()
        self.actor_optimizer.zero_grad()
        actor_loss_tensor.backward()
        self.actor_optimizer.step()


agent = SACAgent(env)
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps


logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
            mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > 250:
        break
plt.plot(episode_rewards)


logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
08:49:53 [INFO] ==== train ====
08:49:53 [INFO] train episode 0: reward = -119.74, steps = 100
08:49:53 [INFO] train episode 1: reward = -53.41, steps = 63
08:49:53 [INFO] train episode 2: reward = -136.45, steps = 87
08:49:53 [INFO] train episode 3: reward = -326.87, steps = 70
08:49:54 [INFO] train episode 4: reward = -117.95, steps = 113
08:49:55 [INFO] train episode 5: reward = -336.63, steps = 96
08:49:58 [INFO] train episode 6: reward = -181.90, steps = 99
08:50:02 [INFO] train episode 7: reward = -145.90, steps = 117
08:50:05 [INFO] train episode 8: reward = -310.74, steps = 100
08:50:07 [INFO] train episode 9: reward = -144.71, steps = 66
08:50:11 [INFO] train episode 10: reward = -2.77, steps = 150
08:50:14 [INFO] train episode 11: reward = -162.35, steps = 91
08:50:16 [INFO] train episode 12: reward = -315.83, steps = 72
08:50:18 [INFO] train episode 13: reward = -447.24, steps = 69
08:50:21 [INFO] train episode 14: reward = -351.69, steps = 106
08:50:23 [INFO] train episode 15: reward = -59.90, steps = 78
08:50:26 [INFO] train episode 16: reward = -429.55, steps = 89
08:50:30 [INFO] train episode 17: reward = -103.32, steps = 135
08:50:33 [INFO] train episode 18: reward = -181.24, steps = 84
08:50:36 [INFO] train episode 19: reward = -155.00, steps = 76
08:50:40 [INFO] train episode 20: reward = -376.96, steps = 149
08:50:43 [INFO] train episode 21: reward = -251.47, steps = 102
08:50:46 [INFO] train episode 22: reward = -122.12, steps = 75
08:50:49 [INFO] train episode 23: reward = -192.81, steps = 113
08:50:54 [INFO] train episode 24: reward = -298.56, steps = 144
08:50:58 [INFO] train episode 25: reward = -326.07, steps = 124
08:51:01 [INFO] train episode 26: reward = -322.35, steps = 88
08:51:05 [INFO] train episode 27: reward = -275.14, steps = 117
08:51:07 [INFO] train episode 28: reward = -60.64, steps = 88
08:51:11 [INFO] train episode 29: reward = -227.72, steps = 100
08:51:13 [INFO] train episode 30: reward = -67.02, steps = 92
08:51:16 [INFO] train episode 31: reward = -254.04, steps = 83
08:51:19 [INFO] train episode 32: reward = -349.88, steps = 107
08:51:22 [INFO] train episode 33: reward = -37.67, steps = 81
08:51:24 [INFO] train episode 34: reward = -77.48, steps = 72
08:51:27 [INFO] train episode 35: reward = -55.36, steps = 99
08:51:30 [INFO] train episode 36: reward = -90.90, steps = 78
08:51:33 [INFO] train episode 37: reward = -284.01, steps = 103
08:51:35 [INFO] train episode 38: reward = -188.93, steps = 62
08:51:38 [INFO] train episode 39: reward = -155.66, steps = 90
08:51:40 [INFO] train episode 40: reward = -153.28, steps = 81
08:51:42 [INFO] train episode 41: reward = -100.22, steps = 58
08:51:45 [INFO] train episode 42: reward = -77.63, steps = 94
08:51:48 [INFO] train episode 43: reward = -320.48, steps = 81
08:51:49 [INFO] train episode 44: reward = -107.22, steps = 54
08:51:52 [INFO] train episode 45: reward = -111.05, steps = 74
08:51:55 [INFO] train episode 46: reward = -266.53, steps = 118
08:51:59 [INFO] train episode 47: reward = -99.81, steps = 97
08:52:02 [INFO] train episode 48: reward = -280.52, steps = 114
08:52:04 [INFO] train episode 49: reward = -89.81, steps = 69
08:52:07 [INFO] train episode 50: reward = -229.07, steps = 99
08:52:10 [INFO] train episode 51: reward = -6.66, steps = 79
08:52:13 [INFO] train episode 52: reward = -117.21, steps = 88
08:52:15 [INFO] train episode 53: reward = -106.00, steps = 75
08:52:17 [INFO] train episode 54: reward = -175.23, steps = 76
08:52:20 [INFO] train episode 55: reward = -159.31, steps = 90
08:52:23 [INFO] train episode 56: reward = -37.40, steps = 95
08:52:26 [INFO] train episode 57: reward = -146.62, steps = 94
08:52:29 [INFO] train episode 58: reward = -273.68, steps = 87
08:52:32 [INFO] train episode 59: reward = -58.22, steps = 92
08:52:36 [INFO] train episode 60: reward = -182.84, steps = 124
08:52:39 [INFO] train episode 61: reward = -129.32, steps = 102
08:52:43 [INFO] train episode 62: reward = -53.85, steps = 119
08:52:46 [INFO] train episode 63: reward = -220.74, steps = 90
08:52:49 [INFO] train episode 64: reward = -165.93, steps = 110
08:52:52 [INFO] train episode 65: reward = -175.62, steps = 75
08:52:55 [INFO] train episode 66: reward = -36.51, steps = 90
08:52:59 [INFO] train episode 67: reward = -210.49, steps = 103
08:53:04 [INFO] train episode 68: reward = -73.05, steps = 154
08:53:07 [INFO] train episode 69: reward = -151.31, steps = 102
08:53:10 [INFO] train episode 70: reward = -82.39, steps = 87
08:53:15 [INFO] train episode 71: reward = -381.80, steps = 174
08:53:19 [INFO] train episode 72: reward = -319.34, steps = 114
08:53:23 [INFO] train episode 73: reward = -19.32, steps = 125
08:53:27 [INFO] train episode 74: reward = -195.74, steps = 125
08:53:31 [INFO] train episode 75: reward = 23.41, steps = 128
08:53:33 [INFO] train episode 76: reward = -120.23, steps = 83
08:53:36 [INFO] train episode 77: reward = -18.84, steps = 89
08:53:40 [INFO] train episode 78: reward = -101.72, steps = 111
08:53:43 [INFO] train episode 79: reward = -92.64, steps = 102
08:53:46 [INFO] train episode 80: reward = -140.65, steps = 97
08:53:53 [INFO] train episode 81: reward = -54.77, steps = 226
08:53:57 [INFO] train episode 82: reward = -236.75, steps = 123
08:54:01 [INFO] train episode 83: reward = -30.38, steps = 112
08:54:03 [INFO] train episode 84: reward = -88.70, steps = 76
08:54:08 [INFO] train episode 85: reward = -207.05, steps = 158
08:54:12 [INFO] train episode 86: reward = -96.84, steps = 125
08:54:16 [INFO] train episode 87: reward = -219.92, steps = 118
08:54:20 [INFO] train episode 88: reward = -79.93, steps = 116
08:54:27 [INFO] train episode 89: reward = -80.47, steps = 232
08:54:32 [INFO] train episode 90: reward = -179.80, steps = 138
08:54:36 [INFO] train episode 91: reward = -238.42, steps = 119
08:54:41 [INFO] train episode 92: reward = -198.85, steps = 164
08:54:48 [INFO] train episode 93: reward = -221.22, steps = 236
08:54:57 [INFO] train episode 94: reward = -74.74, steps = 274
08:55:01 [INFO] train episode 95: reward = -257.51, steps = 119
08:55:08 [INFO] train episode 96: reward = -177.59, steps = 236
08:55:14 [INFO] train episode 97: reward = -412.11, steps = 181
08:55:17 [INFO] train episode 98: reward = -207.53, steps = 118
08:55:28 [INFO] train episode 99: reward = -240.19, steps = 312
08:55:34 [INFO] train episode 100: reward = 9.24, steps = 182
08:55:58 [INFO] train episode 101: reward = -382.12, steps = 765
08:56:02 [INFO] train episode 102: reward = -108.51, steps = 107
08:56:08 [INFO] train episode 103: reward = -328.58, steps = 207
08:56:14 [INFO] train episode 104: reward = -258.20, steps = 180
08:56:18 [INFO] train episode 105: reward = -314.33, steps = 114
08:56:24 [INFO] train episode 106: reward = -322.31, steps = 190
08:56:37 [INFO] train episode 107: reward = -206.53, steps = 395
08:56:45 [INFO] train episode 108: reward = -98.26, steps = 256
08:57:13 [INFO] train episode 109: reward = -174.85, steps = 877
08:57:22 [INFO] train episode 110: reward = -26.02, steps = 255
08:57:27 [INFO] train episode 111: reward = -17.03, steps = 155
08:57:32 [INFO] train episode 112: reward = -75.25, steps = 169
08:57:40 [INFO] train episode 113: reward = 56.28, steps = 252
08:57:46 [INFO] train episode 114: reward = -56.24, steps = 187
08:57:55 [INFO] train episode 115: reward = 41.91, steps = 293
08:58:02 [INFO] train episode 116: reward = -94.69, steps = 211
08:58:09 [INFO] train episode 117: reward = -127.31, steps = 190
08:58:16 [INFO] train episode 118: reward = -142.80, steps = 240
08:58:23 [INFO] train episode 119: reward = -124.74, steps = 211
08:58:33 [INFO] train episode 120: reward = -60.55, steps = 320
08:58:41 [INFO] train episode 121: reward = -125.80, steps = 249
08:58:52 [INFO] train episode 122: reward = -86.13, steps = 355
08:58:59 [INFO] train episode 123: reward = -137.90, steps = 198
08:59:12 [INFO] train episode 124: reward = -46.64, steps = 428
08:59:22 [INFO] train episode 125: reward = -105.46, steps = 296
08:59:28 [INFO] train episode 126: reward = -137.83, steps = 184
08:59:37 [INFO] train episode 127: reward = -120.26, steps = 277
08:59:48 [INFO] train episode 128: reward = -44.16, steps = 365
09:00:04 [INFO] train episode 129: reward = -170.22, steps = 478
09:00:13 [INFO] train episode 130: reward = -112.95, steps = 290
09:00:17 [INFO] train episode 131: reward = -134.92, steps = 115
09:00:26 [INFO] train episode 132: reward = -148.71, steps = 281
09:00:32 [INFO] train episode 133: reward = -87.77, steps = 183
09:00:50 [INFO] train episode 134: reward = -179.98, steps = 548
09:00:54 [INFO] train episode 135: reward = -86.95, steps = 150
09:01:08 [INFO] train episode 136: reward = -185.02, steps = 431
09:01:21 [INFO] train episode 137: reward = -107.17, steps = 407
09:01:35 [INFO] train episode 138: reward = -217.35, steps = 427
09:01:40 [INFO] train episode 139: reward = -115.51, steps = 157
09:01:59 [INFO] train episode 140: reward = -239.48, steps = 602
09:02:09 [INFO] train episode 141: reward = -118.03, steps = 307
09:02:14 [INFO] train episode 142: reward = -94.33, steps = 162
09:02:27 [INFO] train episode 143: reward = -151.36, steps = 408
09:02:33 [INFO] train episode 144: reward = -113.35, steps = 194
09:03:06 [INFO] train episode 145: reward = -80.29, steps = 1000
09:03:15 [INFO] train episode 146: reward = -114.29, steps = 307
09:03:30 [INFO] train episode 147: reward = -123.43, steps = 447
09:03:37 [INFO] train episode 148: reward = -72.17, steps = 230
09:03:56 [INFO] train episode 149: reward = -84.20, steps = 594
09:04:17 [INFO] train episode 150: reward = -188.51, steps = 649
09:04:24 [INFO] train episode 151: reward = -90.44, steps = 227
09:04:58 [INFO] train episode 152: reward = 49.20, steps = 1000
09:05:24 [INFO] train episode 153: reward = -176.41, steps = 791
09:05:52 [INFO] train episode 154: reward = -214.44, steps = 873
09:06:25 [INFO] train episode 155: reward = -117.69, steps = 1000
09:06:59 [INFO] train episode 156: reward = -79.93, steps = 1000
09:07:34 [INFO] train episode 157: reward = -125.30, steps = 1000
09:08:08 [INFO] train episode 158: reward = -77.60, steps = 1000
09:08:41 [INFO] train episode 159: reward = -66.79, steps = 1000
09:09:15 [INFO] train episode 160: reward = -54.37, steps = 1000
09:09:31 [INFO] train episode 161: reward = -91.15, steps = 488
09:10:05 [INFO] train episode 162: reward = -70.56, steps = 1000
09:10:39 [INFO] train episode 163: reward = -18.33, steps = 1000
09:11:12 [INFO] train episode 164: reward = 1.63, steps = 1000
09:11:44 [INFO] train episode 165: reward = -39.78, steps = 1000
09:12:18 [INFO] train episode 166: reward = 2.87, steps = 1000
09:12:50 [INFO] train episode 167: reward = -26.18, steps = 1000
09:13:23 [INFO] train episode 168: reward = -71.67, steps = 1000
09:13:57 [INFO] train episode 169: reward = -36.45, steps = 1000
09:14:30 [INFO] train episode 170: reward = -41.07, steps = 1000
09:15:04 [INFO] train episode 171: reward = -31.90, steps = 1000
09:15:37 [INFO] train episode 172: reward = -73.11, steps = 1000
09:16:12 [INFO] train episode 173: reward = -44.80, steps = 1000
09:16:47 [INFO] train episode 174: reward = -12.23, steps = 1000
09:17:21 [INFO] train episode 175: reward = -6.38, steps = 1000
09:17:56 [INFO] train episode 176: reward = -4.75, steps = 1000
09:18:31 [INFO] train episode 177: reward = -17.49, steps = 1000
09:19:05 [INFO] train episode 178: reward = -31.29, steps = 1000
09:19:40 [INFO] train episode 179: reward = -2.97, steps = 1000
09:20:15 [INFO] train episode 180: reward = 12.63, steps = 1000
09:20:51 [INFO] train episode 181: reward = 4.95, steps = 1000
09:21:27 [INFO] train episode 182: reward = 23.84, steps = 1000
09:22:02 [INFO] train episode 183: reward = 5.81, steps = 1000
09:22:40 [INFO] train episode 184: reward = -25.18, steps = 1000
09:23:15 [INFO] train episode 185: reward = -12.47, steps = 1000
09:23:52 [INFO] train episode 186: reward = -75.48, steps = 1000
09:24:28 [INFO] train episode 187: reward = -11.71, steps = 1000
09:25:04 [INFO] train episode 188: reward = -32.70, steps = 1000
09:25:41 [INFO] train episode 189: reward = 14.71, steps = 1000
09:26:18 [INFO] train episode 190: reward = 4.37, steps = 1000
09:26:55 [INFO] train episode 191: reward = -25.78, steps = 1000
09:27:33 [INFO] train episode 192: reward = -21.27, steps = 1000
09:28:11 [INFO] train episode 193: reward = -49.02, steps = 1000
09:28:48 [INFO] train episode 194: reward = -14.77, steps = 1000
09:29:27 [INFO] train episode 195: reward = -36.21, steps = 1000
09:30:05 [INFO] train episode 196: reward = 0.20, steps = 1000
09:30:44 [INFO] train episode 197: reward = 19.84, steps = 1000
09:31:22 [INFO] train episode 198: reward = -30.18, steps = 1000
09:32:02 [INFO] train episode 199: reward = 6.53, steps = 1000
09:32:41 [INFO] train episode 200: reward = 7.37, steps = 1000
09:33:20 [INFO] train episode 201: reward = 13.28, steps = 1000
09:34:01 [INFO] train episode 202: reward = 7.84, steps = 1000
09:34:40 [INFO] train episode 203: reward = 11.01, steps = 1000
09:35:20 [INFO] train episode 204: reward = -1.73, steps = 1000
09:36:01 [INFO] train episode 205: reward = -55.33, steps = 1000
09:36:41 [INFO] train episode 206: reward = -37.24, steps = 1000
09:37:21 [INFO] train episode 207: reward = -67.50, steps = 1000
09:38:01 [INFO] train episode 208: reward = 3.71, steps = 1000
09:38:42 [INFO] train episode 209: reward = -39.68, steps = 1000
09:39:13 [INFO] train episode 210: reward = 168.03, steps = 794
09:39:52 [INFO] train episode 211: reward = -36.16, steps = 1000
09:40:33 [INFO] train episode 212: reward = -54.07, steps = 1000
09:41:11 [INFO] train episode 213: reward = 28.84, steps = 1000
09:41:50 [INFO] train episode 214: reward = 8.41, steps = 1000
09:41:53 [INFO] train episode 215: reward = -190.62, steps = 84
09:42:33 [INFO] train episode 216: reward = -45.12, steps = 1000
09:43:12 [INFO] train episode 217: reward = -6.79, steps = 1000
09:43:15 [INFO] train episode 218: reward = -52.57, steps = 72
09:43:54 [INFO] train episode 219: reward = 7.98, steps = 1000
09:44:33 [INFO] train episode 220: reward = -60.27, steps = 1000
09:45:14 [INFO] train episode 221: reward = -34.45, steps = 1000
09:45:23 [INFO] train episode 222: reward = -15.16, steps = 256
09:45:31 [INFO] train episode 223: reward = -40.23, steps = 209
09:46:10 [INFO] train episode 224: reward = -40.07, steps = 1000
09:46:49 [INFO] train episode 225: reward = -30.90, steps = 1000
09:47:28 [INFO] train episode 226: reward = -65.34, steps = 1000
09:48:08 [INFO] train episode 227: reward = 0.33, steps = 1000
09:48:47 [INFO] train episode 228: reward = -0.08, steps = 1000
09:49:25 [INFO] train episode 229: reward = -23.53, steps = 1000
09:50:04 [INFO] train episode 230: reward = -32.84, steps = 1000
09:50:42 [INFO] train episode 231: reward = 8.43, steps = 1000
09:51:21 [INFO] train episode 232: reward = 22.71, steps = 1000
09:52:00 [INFO] train episode 233: reward = 5.20, steps = 1000
09:52:41 [INFO] train episode 234: reward = -53.36, steps = 1000
09:53:22 [INFO] train episode 235: reward = 1.53, steps = 1000
09:54:01 [INFO] train episode 236: reward = 44.41, steps = 1000
09:54:41 [INFO] train episode 237: reward = -44.29, steps = 1000
09:55:17 [INFO] train episode 238: reward = -172.51, steps = 905
09:55:57 [INFO] train episode 239: reward = -29.64, steps = 1000
09:56:36 [INFO] train episode 240: reward = -64.17, steps = 1000
09:57:16 [INFO] train episode 241: reward = -24.15, steps = 1000
09:57:43 [INFO] train episode 242: reward = -125.58, steps = 694
09:58:22 [INFO] train episode 243: reward = -27.99, steps = 1000
09:59:01 [INFO] train episode 244: reward = -6.29, steps = 1000
09:59:40 [INFO] train episode 245: reward = -24.44, steps = 1000
10:00:18 [INFO] train episode 246: reward = -1.26, steps = 1000
10:00:57 [INFO] train episode 247: reward = 11.06, steps = 1000
10:01:36 [INFO] train episode 248: reward = -63.15, steps = 1000
10:02:15 [INFO] train episode 249: reward = -44.60, steps = 1000
10:02:54 [INFO] train episode 250: reward = 18.97, steps = 1000
10:03:33 [INFO] train episode 251: reward = -31.76, steps = 1000
10:04:13 [INFO] train episode 252: reward = -42.31, steps = 1000
10:04:16 [INFO] train episode 253: reward = -149.63, steps = 92
10:04:56 [INFO] train episode 254: reward = -48.89, steps = 1000
10:05:35 [INFO] train episode 255: reward = -5.55, steps = 1000
10:06:14 [INFO] train episode 256: reward = -30.94, steps = 1000
10:06:53 [INFO] train episode 257: reward = -9.88, steps = 1000
10:07:32 [INFO] train episode 258: reward = -41.72, steps = 1000
10:08:12 [INFO] train episode 259: reward = 86.70, steps = 987
10:08:52 [INFO] train episode 260: reward = -39.33, steps = 1000
10:09:32 [INFO] train episode 261: reward = 7.94, steps = 1000
10:10:12 [INFO] train episode 262: reward = -6.40, steps = 1000
10:10:51 [INFO] train episode 263: reward = -27.86, steps = 1000
10:11:30 [INFO] train episode 264: reward = -71.48, steps = 1000
10:12:10 [INFO] train episode 265: reward = -21.86, steps = 1000
10:12:49 [INFO] train episode 266: reward = -45.56, steps = 1000
10:13:29 [INFO] train episode 267: reward = -64.30, steps = 1000
10:14:09 [INFO] train episode 268: reward = 0.52, steps = 1000
10:14:49 [INFO] train episode 269: reward = -20.07, steps = 1000
10:15:30 [INFO] train episode 270: reward = -28.48, steps = 1000
10:16:10 [INFO] train episode 271: reward = -23.65, steps = 1000
10:16:50 [INFO] train episode 272: reward = -1.27, steps = 1000
10:17:31 [INFO] train episode 273: reward = 3.31, steps = 1000
10:18:12 [INFO] train episode 274: reward = -1.39, steps = 1000
10:18:52 [INFO] train episode 275: reward = -32.75, steps = 1000
10:19:34 [INFO] train episode 276: reward = -39.80, steps = 1000
10:20:14 [INFO] train episode 277: reward = -24.09, steps = 1000
10:20:55 [INFO] train episode 278: reward = -39.08, steps = 1000
10:21:36 [INFO] train episode 279: reward = -47.51, steps = 1000
10:22:16 [INFO] train episode 280: reward = -23.01, steps = 1000
10:23:00 [INFO] train episode 281: reward = -27.71, steps = 1000
10:23:40 [INFO] train episode 282: reward = 3.29, steps = 1000
10:24:21 [INFO] train episode 283: reward = -18.14, steps = 1000
10:25:02 [INFO] train episode 284: reward = -47.98, steps = 1000
10:25:43 [INFO] train episode 285: reward = -20.43, steps = 1000
10:26:24 [INFO] train episode 286: reward = -8.02, steps = 1000
10:27:05 [INFO] train episode 287: reward = -27.27, steps = 1000
10:27:47 [INFO] train episode 288: reward = -56.75, steps = 1000
10:28:27 [INFO] train episode 289: reward = 9.80, steps = 1000
10:29:08 [INFO] train episode 290: reward = -55.58, steps = 1000
10:29:50 [INFO] train episode 291: reward = -43.32, steps = 1000
10:30:32 [INFO] train episode 292: reward = -27.83, steps = 1000
10:30:38 [INFO] train episode 293: reward = -77.91, steps = 145
10:31:18 [INFO] train episode 294: reward = -59.44, steps = 1000
10:31:59 [INFO] train episode 295: reward = -4.06, steps = 1000
10:32:40 [INFO] train episode 296: reward = 11.90, steps = 1000
10:33:20 [INFO] train episode 297: reward = -49.49, steps = 1000
10:34:01 [INFO] train episode 298: reward = 7.90, steps = 1000
10:34:42 [INFO] train episode 299: reward = -63.10, steps = 1000
10:35:22 [INFO] train episode 300: reward = -16.20, steps = 1000
10:36:03 [INFO] train episode 301: reward = -1.74, steps = 1000
10:36:44 [INFO] train episode 302: reward = -35.07, steps = 1000
10:36:49 [INFO] train episode 303: reward = -6.43, steps = 144
10:37:30 [INFO] train episode 304: reward = 3.82, steps = 1000
10:38:10 [INFO] train episode 305: reward = -13.07, steps = 1000
10:38:51 [INFO] train episode 306: reward = -21.62, steps = 1000
10:39:32 [INFO] train episode 307: reward = -37.21, steps = 1000
10:40:14 [INFO] train episode 308: reward = -5.40, steps = 1000
10:40:19 [INFO] train episode 309: reward = -47.19, steps = 136
10:41:01 [INFO] train episode 310: reward = -34.44, steps = 1000
10:41:06 [INFO] train episode 311: reward = -53.53, steps = 126
10:41:47 [INFO] train episode 312: reward = 8.08, steps = 1000
10:42:28 [INFO] train episode 313: reward = -22.24, steps = 1000
10:43:09 [INFO] train episode 314: reward = -18.65, steps = 1000
10:43:50 [INFO] train episode 315: reward = -9.58, steps = 1000
10:44:21 [INFO] train episode 316: reward = 200.26, steps = 753
10:45:02 [INFO] train episode 317: reward = -30.10, steps = 1000
10:45:42 [INFO] train episode 318: reward = -60.28, steps = 1000
10:46:22 [INFO] train episode 319: reward = 17.45, steps = 1000
10:47:03 [INFO] train episode 320: reward = -33.42, steps = 1000
10:47:44 [INFO] train episode 321: reward = -19.99, steps = 1000
10:48:25 [INFO] train episode 322: reward = 13.88, steps = 1000
10:49:05 [INFO] train episode 323: reward = -35.00, steps = 1000
10:49:47 [INFO] train episode 324: reward = -0.80, steps = 1000
10:49:53 [INFO] train episode 325: reward = -31.59, steps = 174
10:49:58 [INFO] train episode 326: reward = -93.30, steps = 124
10:50:38 [INFO] train episode 327: reward = -4.55, steps = 1000
10:51:18 [INFO] train episode 328: reward = -22.92, steps = 1000
10:51:59 [INFO] train episode 329: reward = -66.67, steps = 1000
10:52:06 [INFO] train episode 330: reward = -91.46, steps = 199
10:52:48 [INFO] train episode 331: reward = 13.11, steps = 1000
10:53:28 [INFO] train episode 332: reward = 19.52, steps = 1000
10:54:09 [INFO] train episode 333: reward = -23.95, steps = 1000
10:54:50 [INFO] train episode 334: reward = -26.49, steps = 1000
10:55:31 [INFO] train episode 335: reward = 15.47, steps = 1000
10:56:13 [INFO] train episode 336: reward = -17.99, steps = 1000
10:56:53 [INFO] train episode 337: reward = -14.47, steps = 1000
10:57:34 [INFO] train episode 338: reward = -58.33, steps = 1000
10:58:14 [INFO] train episode 339: reward = -11.58, steps = 1000
10:58:45 [INFO] train episode 340: reward = -90.19, steps = 769
10:59:25 [INFO] train episode 341: reward = -11.28, steps = 1000
11:00:06 [INFO] train episode 342: reward = 8.60, steps = 1000
11:00:47 [INFO] train episode 343: reward = -26.04, steps = 1000
11:01:27 [INFO] train episode 344: reward = 20.18, steps = 1000
11:02:08 [INFO] train episode 345: reward = -22.66, steps = 1000
11:02:49 [INFO] train episode 346: reward = -31.68, steps = 1000
11:03:31 [INFO] train episode 347: reward = -26.41, steps = 1000
11:04:11 [INFO] train episode 348: reward = -6.08, steps = 1000
11:04:37 [INFO] train episode 349: reward = 146.24, steps = 651
11:05:09 [INFO] train episode 350: reward = 155.83, steps = 780
11:05:18 [INFO] train episode 351: reward = 7.82, steps = 245
11:05:51 [INFO] train episode 352: reward = 80.29, steps = 809
11:06:32 [INFO] train episode 353: reward = -36.24, steps = 1000
11:07:04 [INFO] train episode 354: reward = -61.48, steps = 801
11:07:46 [INFO] train episode 355: reward = -19.42, steps = 1000
11:08:27 [INFO] train episode 356: reward = -27.06, steps = 1000
11:09:07 [INFO] train episode 357: reward = -21.69, steps = 1000
11:09:50 [INFO] train episode 358: reward = -25.17, steps = 1000
11:10:29 [INFO] train episode 359: reward = 97.53, steps = 963
11:11:10 [INFO] train episode 360: reward = -8.54, steps = 1000
11:11:37 [INFO] train episode 361: reward = 185.19, steps = 682
11:12:18 [INFO] train episode 362: reward = 23.42, steps = 1000
11:12:59 [INFO] train episode 363: reward = 3.70, steps = 1000
11:13:40 [INFO] train episode 364: reward = 40.67, steps = 1000
11:14:20 [INFO] train episode 365: reward = -17.96, steps = 1000
11:15:02 [INFO] train episode 366: reward = -22.07, steps = 1000
11:15:26 [INFO] train episode 367: reward = 197.98, steps = 616
11:16:07 [INFO] train episode 368: reward = -29.95, steps = 1000
11:16:48 [INFO] train episode 369: reward = -14.75, steps = 1000
11:17:29 [INFO] train episode 370: reward = -29.98, steps = 1000
11:18:10 [INFO] train episode 371: reward = 17.82, steps = 1000
11:18:52 [INFO] train episode 372: reward = -39.24, steps = 1000
11:19:34 [INFO] train episode 373: reward = -39.12, steps = 1000
11:20:15 [INFO] train episode 374: reward = -10.31, steps = 1000
11:20:56 [INFO] train episode 375: reward = -15.00, steps = 1000
11:21:21 [INFO] train episode 376: reward = 187.56, steps = 615
11:21:33 [INFO] train episode 377: reward = 268.76, steps = 301
11:22:14 [INFO] train episode 378: reward = -53.70, steps = 1000
11:22:56 [INFO] train episode 379: reward = 39.52, steps = 1000
11:23:31 [INFO] train episode 380: reward = 104.65, steps = 850
11:24:12 [INFO] train episode 381: reward = -59.91, steps = 1000
11:24:52 [INFO] train episode 382: reward = 21.33, steps = 1000
11:25:33 [INFO] train episode 383: reward = -19.80, steps = 1000
11:25:39 [INFO] train episode 384: reward = 28.69, steps = 151
11:26:21 [INFO] train episode 385: reward = -57.24, steps = 1000
11:26:30 [INFO] train episode 386: reward = 249.79, steps = 220
11:26:36 [INFO] train episode 387: reward = 7.72, steps = 172
11:27:18 [INFO] train episode 388: reward = 35.42, steps = 1000
11:27:35 [INFO] train episode 389: reward = 239.86, steps = 453
11:28:16 [INFO] train episode 390: reward = 70.65, steps = 1000
11:28:35 [INFO] train episode 391: reward = 182.73, steps = 483
11:29:16 [INFO] train episode 392: reward = 115.86, steps = 1000
11:29:56 [INFO] train episode 393: reward = 106.90, steps = 1000
11:30:01 [INFO] train episode 394: reward = 30.09, steps = 129
11:30:08 [INFO] train episode 395: reward = 19.33, steps = 170
11:30:18 [INFO] train episode 396: reward = 275.51, steps = 251
11:30:24 [INFO] train episode 397: reward = 13.79, steps = 154
11:30:30 [INFO] train episode 398: reward = 26.55, steps = 149
11:30:33 [INFO] train episode 399: reward = -3.53, steps = 91
11:30:35 [INFO] train episode 400: reward = -53.04, steps = 56
11:30:41 [INFO] train episode 401: reward = -15.88, steps = 142
11:30:46 [INFO] train episode 402: reward = 31.50, steps = 136
11:31:26 [INFO] train episode 403: reward = 135.86, steps = 1000
11:31:31 [INFO] train episode 404: reward = -1.47, steps = 108
11:32:10 [INFO] train episode 405: reward = 85.22, steps = 1000
11:32:16 [INFO] train episode 406: reward = 12.88, steps = 141
11:32:22 [INFO] train episode 407: reward = 9.83, steps = 156
11:32:31 [INFO] train episode 408: reward = 245.56, steps = 239
11:32:38 [INFO] train episode 409: reward = 43.41, steps = 162
11:33:18 [INFO] train episode 410: reward = 122.64, steps = 1000
11:33:25 [INFO] train episode 411: reward = -3.57, steps = 182
11:34:05 [INFO] train episode 412: reward = 143.42, steps = 1000
11:34:09 [INFO] train episode 413: reward = -25.21, steps = 124
11:34:15 [INFO] train episode 414: reward = -18.99, steps = 140
11:34:54 [INFO] train episode 415: reward = 100.31, steps = 1000
11:35:33 [INFO] train episode 416: reward = 114.46, steps = 1000
11:35:41 [INFO] train episode 417: reward = 232.46, steps = 201
11:35:55 [INFO] train episode 418: reward = 266.54, steps = 366
11:36:34 [INFO] train episode 419: reward = 110.42, steps = 1000
11:37:13 [INFO] train episode 420: reward = 121.11, steps = 1000
11:37:19 [INFO] train episode 421: reward = -9.14, steps = 158
11:37:27 [INFO] train episode 422: reward = 239.80, steps = 207
11:37:45 [INFO] train episode 423: reward = 245.35, steps = 458
11:37:53 [INFO] train episode 424: reward = 257.88, steps = 224
11:38:32 [INFO] train episode 425: reward = 76.03, steps = 1000
11:38:38 [INFO] train episode 426: reward = 241.15, steps = 163
11:39:17 [INFO] train episode 427: reward = 121.12, steps = 1000
11:39:22 [INFO] train episode 428: reward = 60.61, steps = 138
11:39:31 [INFO] train episode 429: reward = 245.53, steps = 226
11:40:10 [INFO] train episode 430: reward = 70.72, steps = 1000
11:40:16 [INFO] train episode 431: reward = 69.00, steps = 149
11:40:21 [INFO] train episode 432: reward = -17.51, steps = 156
11:40:31 [INFO] train episode 433: reward = 232.31, steps = 251
11:40:48 [INFO] train episode 434: reward = 256.14, steps = 439
11:41:05 [INFO] train episode 435: reward = 270.95, steps = 450
11:41:25 [INFO] train episode 436: reward = 278.93, steps = 513
11:41:31 [INFO] train episode 437: reward = -26.74, steps = 164
11:41:35 [INFO] train episode 438: reward = -0.10, steps = 101
11:42:13 [INFO] train episode 439: reward = 129.38, steps = 1000
11:42:53 [INFO] train episode 440: reward = 105.27, steps = 1000
11:43:05 [INFO] train episode 441: reward = 210.13, steps = 303
11:43:13 [INFO] train episode 442: reward = 223.78, steps = 220
11:43:19 [INFO] train episode 443: reward = 65.32, steps = 174
11:43:32 [INFO] train episode 444: reward = 243.66, steps = 340
11:44:12 [INFO] train episode 445: reward = 51.24, steps = 1000
11:44:21 [INFO] train episode 446: reward = 245.63, steps = 232
11:45:00 [INFO] train episode 447: reward = 137.79, steps = 1000
11:45:39 [INFO] train episode 448: reward = 113.79, steps = 1000
11:45:56 [INFO] train episode 449: reward = 229.94, steps = 458
11:46:35 [INFO] train episode 450: reward = 137.03, steps = 1000
11:46:46 [INFO] train episode 451: reward = 267.52, steps = 298
11:47:23 [INFO] train episode 452: reward = 288.69, steps = 950
11:48:02 [INFO] train episode 453: reward = 145.29, steps = 1000
11:48:40 [INFO] train episode 454: reward = 160.21, steps = 1000
11:49:19 [INFO] train episode 455: reward = 130.20, steps = 1000
11:49:59 [INFO] train episode 456: reward = 120.77, steps = 1000
11:50:37 [INFO] train episode 457: reward = 126.23, steps = 1000
11:51:08 [INFO] train episode 458: reward = 215.63, steps = 802
11:51:46 [INFO] train episode 459: reward = 101.99, steps = 1000
11:52:25 [INFO] train episode 460: reward = 131.59, steps = 1000
11:52:33 [INFO] train episode 461: reward = 244.61, steps = 205
11:52:44 [INFO] train episode 462: reward = 250.38, steps = 310
11:53:23 [INFO] train episode 463: reward = 108.69, steps = 1000
11:53:31 [INFO] train episode 464: reward = 255.06, steps = 211
11:54:10 [INFO] train episode 465: reward = 139.20, steps = 1000
11:54:48 [INFO] train episode 466: reward = 120.25, steps = 1000
11:55:27 [INFO] train episode 467: reward = 124.46, steps = 1000
11:56:06 [INFO] train episode 468: reward = 147.71, steps = 1000
11:56:19 [INFO] train episode 469: reward = 207.01, steps = 339
11:56:58 [INFO] train episode 470: reward = 145.63, steps = 1000
11:57:37 [INFO] train episode 471: reward = 136.65, steps = 1000
11:57:50 [INFO] train episode 472: reward = 246.61, steps = 323
11:58:28 [INFO] train episode 473: reward = 151.00, steps = 1000
11:58:50 [INFO] train episode 474: reward = 238.25, steps = 562
11:58:52 [INFO] train episode 475: reward = -34.60, steps = 76
11:59:30 [INFO] train episode 476: reward = 136.21, steps = 1000
12:00:09 [INFO] train episode 477: reward = 142.19, steps = 1000
12:00:47 [INFO] train episode 478: reward = 125.15, steps = 1000
12:01:25 [INFO] train episode 479: reward = 133.39, steps = 1000
12:02:03 [INFO] train episode 480: reward = 142.66, steps = 1000
12:02:42 [INFO] train episode 481: reward = 161.41, steps = 1000
12:03:20 [INFO] train episode 482: reward = 138.70, steps = 1000
12:03:58 [INFO] train episode 483: reward = 149.07, steps = 1000
12:04:36 [INFO] train episode 484: reward = 117.19, steps = 1000
12:04:40 [INFO] train episode 485: reward = 6.27, steps = 104
12:05:18 [INFO] train episode 486: reward = 112.67, steps = 1000
12:05:23 [INFO] train episode 487: reward = 224.83, steps = 149
12:05:29 [INFO] train episode 488: reward = 67.36, steps = 139
12:06:01 [INFO] train episode 489: reward = 158.32, steps = 834
12:06:39 [INFO] train episode 490: reward = 178.29, steps = 1000
12:06:47 [INFO] train episode 491: reward = 224.99, steps = 224
12:07:25 [INFO] train episode 492: reward = 135.58, steps = 1000
12:08:04 [INFO] train episode 493: reward = 171.78, steps = 1000
12:08:42 [INFO] train episode 494: reward = 141.74, steps = 1000
12:09:21 [INFO] train episode 495: reward = 143.79, steps = 1000
12:09:27 [INFO] train episode 496: reward = 1.35, steps = 147
12:09:35 [INFO] train episode 497: reward = -30.79, steps = 211
12:10:15 [INFO] train episode 498: reward = 143.50, steps = 1000
12:10:53 [INFO] train episode 499: reward = 140.86, steps = 1000
12:11:32 [INFO] train episode 500: reward = 126.08, steps = 1000
12:12:11 [INFO] train episode 501: reward = 128.06, steps = 1000
12:12:21 [INFO] train episode 502: reward = 201.06, steps = 260
12:12:59 [INFO] train episode 503: reward = 144.96, steps = 1000
12:13:15 [INFO] train episode 504: reward = 214.63, steps = 416
12:13:22 [INFO] train episode 505: reward = 6.61, steps = 179
12:14:00 [INFO] train episode 506: reward = 89.71, steps = 1000
12:14:39 [INFO] train episode 507: reward = 202.74, steps = 986
12:14:44 [INFO] train episode 508: reward = -1.13, steps = 135
12:15:24 [INFO] train episode 509: reward = 118.31, steps = 1000
12:16:03 [INFO] train episode 510: reward = 108.18, steps = 1000
12:16:15 [INFO] train episode 511: reward = 209.12, steps = 315
12:16:54 [INFO] train episode 512: reward = 136.71, steps = 1000
12:17:33 [INFO] train episode 513: reward = 113.80, steps = 1000
12:18:12 [INFO] train episode 514: reward = 168.73, steps = 1000
12:18:34 [INFO] train episode 515: reward = 211.40, steps = 554
12:18:38 [INFO] train episode 516: reward = -33.62, steps = 105
12:19:17 [INFO] train episode 517: reward = 116.70, steps = 1000
12:19:57 [INFO] train episode 518: reward = 119.32, steps = 1000
12:20:36 [INFO] train episode 519: reward = 92.17, steps = 1000
12:20:45 [INFO] train episode 520: reward = 199.84, steps = 225
12:21:01 [INFO] train episode 521: reward = 186.73, steps = 430
12:21:40 [INFO] train episode 522: reward = 123.72, steps = 1000
12:22:20 [INFO] train episode 523: reward = 114.84, steps = 1000
12:23:00 [INFO] train episode 524: reward = 134.94, steps = 1000
12:23:22 [INFO] train episode 525: reward = 237.49, steps = 579
12:24:01 [INFO] train episode 526: reward = 113.79, steps = 1000
12:24:07 [INFO] train episode 527: reward = 3.46, steps = 170
12:24:11 [INFO] train episode 528: reward = 19.38, steps = 106
12:24:17 [INFO] train episode 529: reward = 50.39, steps = 151
12:24:22 [INFO] train episode 530: reward = -35.07, steps = 124
12:24:36 [INFO] train episode 531: reward = 203.90, steps = 360
12:24:45 [INFO] train episode 532: reward = -87.35, steps = 239
12:24:55 [INFO] train episode 533: reward = 230.81, steps = 262
12:25:06 [INFO] train episode 534: reward = 191.13, steps = 293
12:25:13 [INFO] train episode 535: reward = -53.85, steps = 182
12:25:23 [INFO] train episode 536: reward = -101.43, steps = 253
12:25:37 [INFO] train episode 537: reward = 175.83, steps = 353
12:25:48 [INFO] train episode 538: reward = 218.88, steps = 288
12:25:53 [INFO] train episode 539: reward = -22.08, steps = 132
12:26:00 [INFO] train episode 540: reward = -40.94, steps = 201
12:26:09 [INFO] train episode 541: reward = -17.87, steps = 214
12:26:48 [INFO] train episode 542: reward = 158.65, steps = 1000
12:27:28 [INFO] train episode 543: reward = 115.29, steps = 1000
12:27:45 [INFO] train episode 544: reward = 162.96, steps = 430
12:27:56 [INFO] train episode 545: reward = 224.67, steps = 281
12:28:28 [INFO] train episode 546: reward = 203.53, steps = 808
12:28:37 [INFO] train episode 547: reward = 4.79, steps = 238
12:29:17 [INFO] train episode 548: reward = 129.84, steps = 1000
12:29:26 [INFO] train episode 549: reward = 235.79, steps = 248
12:29:38 [INFO] train episode 550: reward = 231.07, steps = 288
12:29:47 [INFO] train episode 551: reward = 242.58, steps = 256
12:29:53 [INFO] train episode 552: reward = 26.45, steps = 137
12:30:04 [INFO] train episode 553: reward = 268.18, steps = 287
12:30:20 [INFO] train episode 554: reward = 210.02, steps = 416
12:30:28 [INFO] train episode 555: reward = 241.36, steps = 208
12:30:41 [INFO] train episode 556: reward = 239.09, steps = 314
12:31:06 [INFO] train episode 557: reward = 215.17, steps = 631
12:31:46 [INFO] train episode 558: reward = 133.01, steps = 1000
12:31:58 [INFO] train episode 559: reward = 205.48, steps = 309
12:32:04 [INFO] train episode 560: reward = -5.08, steps = 161
12:32:09 [INFO] train episode 561: reward = 5.65, steps = 133
12:32:49 [INFO] train episode 562: reward = 138.11, steps = 1000
12:33:28 [INFO] train episode 563: reward = 109.73, steps = 1000
12:34:06 [INFO] train episode 564: reward = 146.65, steps = 1000
12:34:45 [INFO] train episode 565: reward = 173.19, steps = 1000
12:34:49 [INFO] train episode 566: reward = 5.80, steps = 122
12:34:53 [INFO] train episode 567: reward = -52.19, steps = 103
12:35:12 [INFO] train episode 568: reward = 168.04, steps = 490
12:35:51 [INFO] train episode 569: reward = 144.85, steps = 1000
12:35:55 [INFO] train episode 570: reward = 5.84, steps = 125
12:36:33 [INFO] train episode 571: reward = 83.16, steps = 1000
12:36:39 [INFO] train episode 572: reward = 3.79, steps = 158
12:36:47 [INFO] train episode 573: reward = -29.89, steps = 210
12:37:25 [INFO] train episode 574: reward = 155.09, steps = 1000
12:37:32 [INFO] train episode 575: reward = 261.25, steps = 182
12:37:38 [INFO] train episode 576: reward = 33.45, steps = 163
12:37:49 [INFO] train episode 577: reward = 251.77, steps = 290
12:37:57 [INFO] train episode 578: reward = 202.44, steps = 210
12:38:36 [INFO] train episode 579: reward = 140.15, steps = 1000
12:39:14 [INFO] train episode 580: reward = 116.79, steps = 1000
12:39:22 [INFO] train episode 581: reward = -28.87, steps = 209
12:39:34 [INFO] train episode 582: reward = 230.60, steps = 319
12:39:42 [INFO] train episode 583: reward = 15.11, steps = 223
12:40:21 [INFO] train episode 584: reward = 109.42, steps = 1000
12:41:00 [INFO] train episode 585: reward = 119.07, steps = 1000
12:41:05 [INFO] train episode 586: reward = 39.63, steps = 139
12:41:12 [INFO] train episode 587: reward = -47.84, steps = 183
12:41:36 [INFO] train episode 588: reward = 233.27, steps = 611
12:41:46 [INFO] train episode 589: reward = 288.73, steps = 270
12:42:25 [INFO] train episode 590: reward = 132.62, steps = 1000
12:42:33 [INFO] train episode 591: reward = 251.68, steps = 228
12:43:12 [INFO] train episode 592: reward = 117.34, steps = 1000
12:43:17 [INFO] train episode 593: reward = 18.96, steps = 134
12:43:30 [INFO] train episode 594: reward = 255.76, steps = 357
12:44:08 [INFO] train episode 595: reward = 135.59, steps = 1000
12:44:46 [INFO] train episode 596: reward = 146.12, steps = 1000
12:45:24 [INFO] train episode 597: reward = 163.82, steps = 1000
12:46:02 [INFO] train episode 598: reward = 121.32, steps = 1000
12:46:40 [INFO] train episode 599: reward = 146.93, steps = 1000
12:47:18 [INFO] train episode 600: reward = 155.23, steps = 1000
12:47:56 [INFO] train episode 601: reward = 158.77, steps = 1000
12:48:34 [INFO] train episode 602: reward = 135.33, steps = 1000
12:48:42 [INFO] train episode 603: reward = 296.22, steps = 224
12:49:20 [INFO] train episode 604: reward = 118.72, steps = 1000
12:49:57 [INFO] train episode 605: reward = 240.88, steps = 986
12:50:05 [INFO] train episode 606: reward = 286.03, steps = 207
12:50:11 [INFO] train episode 607: reward = -76.28, steps = 161
12:50:20 [INFO] train episode 608: reward = 280.56, steps = 233
12:50:31 [INFO] train episode 609: reward = 259.10, steps = 311
12:50:40 [INFO] train episode 610: reward = 270.23, steps = 235
12:50:50 [INFO] train episode 611: reward = 276.68, steps = 251
12:50:56 [INFO] train episode 612: reward = 248.49, steps = 176
12:51:08 [INFO] train episode 613: reward = 292.11, steps = 305
12:51:32 [INFO] train episode 614: reward = 266.76, steps = 633
12:51:38 [INFO] train episode 615: reward = 249.82, steps = 171
12:51:45 [INFO] train episode 616: reward = 262.73, steps = 176
12:51:56 [INFO] train episode 617: reward = 247.55, steps = 308
12:51:57 [INFO] ==== test ====
12:51:57 [INFO] test episode 0: reward = 261.92, steps = 243
12:51:57 [INFO] test episode 1: reward = 254.51, steps = 231
12:51:59 [INFO] test episode 2: reward = 247.75, steps = 847
12:51:59 [INFO] test episode 3: reward = -4.16, steps = 326
12:52:00 [INFO] test episode 4: reward = 225.08, steps = 267
12:52:00 [INFO] test episode 5: reward = 282.01, steps = 256
12:52:00 [INFO] test episode 6: reward = 256.62, steps = 231
12:52:00 [INFO] test episode 7: reward = 269.03, steps = 205
12:52:00 [INFO] test episode 8: reward = 288.36, steps = 218
12:52:01 [INFO] test episode 9: reward = 24.78, steps = 265
12:52:01 [INFO] test episode 10: reward = 239.47, steps = 221
12:52:01 [INFO] test episode 11: reward = 1.40, steps = 238
12:52:01 [INFO] test episode 12: reward = 274.42, steps = 208
12:52:02 [INFO] test episode 13: reward = 261.07, steps = 299
12:52:02 [INFO] test episode 14: reward = 253.14, steps = 218
12:52:02 [INFO] test episode 15: reward = 288.40, steps = 234
12:52:03 [INFO] test episode 16: reward = -130.06, steps = 650
12:52:05 [INFO] test episode 17: reward = 166.18, steps = 1000
12:52:05 [INFO] test episode 18: reward = 265.44, steps = 195
12:52:05 [INFO] test episode 19: reward = 271.89, steps = 238
12:52:05 [INFO] test episode 20: reward = 278.74, steps = 245
12:52:05 [INFO] test episode 21: reward = 271.36, steps = 233
12:52:06 [INFO] test episode 22: reward = -12.77, steps = 178
12:52:06 [INFO] test episode 23: reward = 247.23, steps = 218
12:52:06 [INFO] test episode 24: reward = 201.95, steps = 434
12:52:07 [INFO] test episode 25: reward = 245.91, steps = 195
12:52:07 [INFO] test episode 26: reward = 243.44, steps = 195
12:52:07 [INFO] test episode 27: reward = 274.77, steps = 226
12:52:07 [INFO] test episode 28: reward = 2.28, steps = 227
12:52:09 [INFO] test episode 29: reward = 94.40, steps = 1000
12:52:09 [INFO] test episode 30: reward = 286.08, steps = 301
12:52:09 [INFO] test episode 31: reward = 259.12, steps = 249
12:52:10 [INFO] test episode 32: reward = 272.56, steps = 232
12:52:10 [INFO] test episode 33: reward = 256.48, steps = 219
12:52:10 [INFO] test episode 34: reward = 276.92, steps = 242
12:52:10 [INFO] test episode 35: reward = 250.72, steps = 250
12:52:11 [INFO] test episode 36: reward = 274.10, steps = 180
12:52:12 [INFO] test episode 37: reward = 135.72, steps = 1000
12:52:12 [INFO] test episode 38: reward = 276.21, steps = 228
12:52:13 [INFO] test episode 39: reward = 271.82, steps = 261
12:52:13 [INFO] test episode 40: reward = 272.34, steps = 231
12:52:13 [INFO] test episode 41: reward = 278.68, steps = 315
12:52:13 [INFO] test episode 42: reward = 289.59, steps = 229
12:52:14 [INFO] test episode 43: reward = 19.72, steps = 197
12:52:14 [INFO] test episode 44: reward = 269.59, steps = 216
12:52:15 [INFO] test episode 45: reward = 230.64, steps = 939
12:52:15 [INFO] test episode 46: reward = 265.88, steps = 262
12:52:16 [INFO] test episode 47: reward = 273.02, steps = 245
12:52:16 [INFO] test episode 48: reward = 279.87, steps = 193
12:52:16 [INFO] test episode 49: reward = 256.61, steps = 267
12:52:16 [INFO] test episode 50: reward = 8.28, steps = 224
12:52:17 [INFO] test episode 51: reward = 282.33, steps = 216
12:52:18 [INFO] test episode 52: reward = 134.88, steps = 1000
12:52:18 [INFO] test episode 53: reward = 286.73, steps = 241
12:52:19 [INFO] test episode 54: reward = 280.60, steps = 222
12:52:19 [INFO] test episode 55: reward = 250.07, steps = 233
12:52:19 [INFO] test episode 56: reward = 258.26, steps = 284
12:52:19 [INFO] test episode 57: reward = 258.39, steps = 225
12:52:20 [INFO] test episode 58: reward = 252.15, steps = 230
12:52:21 [INFO] test episode 59: reward = 177.74, steps = 754
12:52:21 [INFO] test episode 60: reward = 250.30, steps = 275
12:52:21 [INFO] test episode 61: reward = 252.47, steps = 178
12:52:21 [INFO] test episode 62: reward = 237.86, steps = 198
12:52:21 [INFO] test episode 63: reward = 276.02, steps = 226
12:52:21 [INFO] test episode 64: reward = 247.22, steps = 203
12:52:22 [INFO] test episode 65: reward = -30.39, steps = 412
12:52:22 [INFO] test episode 66: reward = 265.27, steps = 279
12:52:22 [INFO] test episode 67: reward = 280.11, steps = 200
12:52:23 [INFO] test episode 68: reward = 259.68, steps = 250
12:52:23 [INFO] test episode 69: reward = 219.61, steps = 288
12:52:23 [INFO] test episode 70: reward = 240.97, steps = 228
12:52:23 [INFO] test episode 71: reward = 263.67, steps = 254
12:52:24 [INFO] test episode 72: reward = 280.41, steps = 219
12:52:24 [INFO] test episode 73: reward = 270.40, steps = 163
12:52:24 [INFO] test episode 74: reward = 234.56, steps = 190
12:52:24 [INFO] test episode 75: reward = 278.69, steps = 245
12:52:24 [INFO] test episode 76: reward = 246.37, steps = 277
12:52:25 [INFO] test episode 77: reward = 36.31, steps = 283
12:52:25 [INFO] test episode 78: reward = 250.18, steps = 378
12:52:25 [INFO] test episode 79: reward = 236.87, steps = 241
12:52:26 [INFO] test episode 80: reward = 284.36, steps = 261
12:52:26 [INFO] test episode 81: reward = 264.29, steps = 250
12:52:26 [INFO] test episode 82: reward = 269.65, steps = 232
12:52:26 [INFO] test episode 83: reward = 275.08, steps = 248
12:52:27 [INFO] test episode 84: reward = 261.39, steps = 245
12:52:27 [INFO] test episode 85: reward = 249.30, steps = 182
12:52:27 [INFO] test episode 86: reward = 222.31, steps = 350
12:52:27 [INFO] test episode 87: reward = 259.59, steps = 171
12:52:28 [INFO] test episode 88: reward = 251.40, steps = 180
12:52:28 [INFO] test episode 89: reward = 244.87, steps = 193
12:52:29 [INFO] test episode 90: reward = 149.38, steps = 1000
12:52:29 [INFO] test episode 91: reward = 270.59, steps = 274
12:52:29 [INFO] test episode 92: reward = 6.33, steps = 182
12:52:30 [INFO] test episode 93: reward = -13.06, steps = 269
12:52:30 [INFO] test episode 94: reward = 256.70, steps = 204
12:52:30 [INFO] test episode 95: reward = 267.60, steps = 262
12:52:31 [INFO] test episode 96: reward = 261.67, steps = 850
12:52:31 [INFO] test episode 97: reward = 297.19, steps = 229
12:52:32 [INFO] test episode 98: reward = -91.23, steps = 204
12:52:32 [INFO] test episode 99: reward = 252.15, steps = 185
12:52:32 [INFO] average episode reward = 218.66 ± 97.16
In [6]:
env.close()