Use Soft Actor-Critic with Auto $\alpha$ Tuning to Play LunarLanderContinuous-v2

PyTorch version

In [1]:
%matplotlib inline

import sys
import logging
import itertools
import copy

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
torch.manual_seed(0)
import torch.nn as nn
import torch.optim as optim
import torch.distributions as distributions

logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
# Build the environment and log every attribute of it and of its spec,
# so the run configuration is recorded at the top of the log.
env = gym.make('LunarLanderContinuous-v2')
for attr_name, attr_value in vars(env).items():
    logging.info('%s: %s', attr_name, attr_value)
for attr_name, attr_value in vars(env.spec).items():
    logging.info('%s: %s', attr_name, attr_value)
08:18:11 [INFO] env: <LunarLanderContinuous<LunarLanderContinuous-v2>>
08:18:11 [INFO] action_space: Box(-1.0, 1.0, (2,), float32)
08:18:11 [INFO] observation_space: Box(-inf, inf, (8,), float32)
08:18:11 [INFO] reward_range: (-inf, inf)
08:18:11 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}
08:18:11 [INFO] _max_episode_steps: 1000
08:18:11 [INFO] _elapsed_steps: None
08:18:11 [INFO] id: LunarLanderContinuous-v2
08:18:11 [INFO] entry_point: gym.envs.box2d:LunarLanderContinuous
08:18:11 [INFO] reward_threshold: 200
08:18:11 [INFO] nondeterministic: False
08:18:11 [INFO] max_episode_steps: 1000
08:18:11 [INFO] _kwargs: {}
08:18:11 [INFO] _env_name: LunarLanderContinuous
In [3]:
class DQNReplayer:
    """Fixed-capacity cyclic experience-replay buffer backed by a DataFrame."""

    def __init__(self, capacity):
        # One row per transition; columns hold arbitrary Python objects.
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['state', 'action', 'reward', 'next_state', 'terminated'])
        self.i = 0        # next write slot (wraps around at capacity)
        self.count = 0    # number of valid rows currently stored
        self.capacity = capacity

    def store(self, *args):
        """Write one transition into the current slot, evicting the oldest."""
        self.memory.loc[self.i] = np.asarray(args, dtype=object)
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        """Yield one stacked array per column for `size` random rows.

        Rows are drawn uniformly (with replacement) from the stored ones.
        """
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, column])
                for column in self.memory.columns)
In [4]:
class SACAgent:
    """Soft Actor-Critic agent with automatically tuned temperature alpha.

    Original SAC formulation: a squashed-Gaussian actor, twin Q critics,
    and a separate state-value (V) critic with a Polyak-averaged target
    copy.  log(alpha) is the learned parameter, so the entropy temperature
    alpha = exp(ln_alpha) is always positive.
    """

    def __init__(self, env):
        state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
        self.gamma = 0.99  # discount factor

        self.replayer = DQNReplayer(100000)  # experience replay buffer

        # create alpha
        # Heuristic entropy target: -dim(action space).
        self.target_entropy = -self.action_dim
        self.ln_alpha_tensor = torch.zeros(1, requires_grad=True)
        self.alpha_optimizer = optim.Adam([self.ln_alpha_tensor,], lr=0.0003)

        # create actor
        # Output is mean and ln(std) concatenated (action_dim * 2); the
        # Tanh output activator bounds both halves to (-1, 1).
        self.actor_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256], output_size=self.action_dim*2,
                output_activator=nn.Tanh())
        # NOTE(review): "optimizier" is a typo, but it is used consistently
        # within this class, so behavior is unaffected.
        self.actor_optimizier = optim.Adam(self.actor_net.parameters(), lr=0.0003)

        # create V critic
        # v_evaluate_net is trained directly; v_target_net is a slowly
        # tracking copy updated by Polyak averaging in update_net().
        self.v_evaluate_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256])
        self.v_target_net = copy.deepcopy(self.v_evaluate_net)
        self.v_loss = nn.MSELoss()
        self.v_optimizer = optim.Adam(self.v_evaluate_net.parameters(), lr=0.0003)

        # create Q critic
        # Twin Q networks; their minimum is used when forming the V target
        # to reduce value overestimation.
        self.q0_net = self.build_net(input_size=state_dim+self.action_dim,
                hidden_sizes=[256, 256])
        self.q1_net = self.build_net(input_size=state_dim+self.action_dim,
                hidden_sizes=[256, 256])
        self.q0_loss = nn.MSELoss()
        self.q1_loss = nn.MSELoss()
        self.q0_optimizer = optim.Adam(self.q0_net.parameters(), lr=0.0003)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=0.0003)

    def build_net(self, input_size, hidden_sizes, output_size=1,
            output_activator=None):
        """Build an MLP with ReLU hidden activations.

        The loop variables deliberately reuse the names input_size and
        output_size, shadowing the parameters after the first iteration.
        """
        layers = []
        for input_size, output_size in zip(
                [input_size,] + hidden_sizes, hidden_sizes + [output_size,]):
            layers.append(nn.Linear(input_size, output_size))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # drop the ReLU appended after the output layer
        if output_activator:
            layers.append(output_activator)
        net = nn.Sequential(*layers)
        return net

    def get_action_ln_prob_tensors(self, state_tensor):
        """Return a (possibly stochastic) action and its log-probability.

        In 'train' mode the action is tanh(u), u ~ N(mean, std), drawn with
        the reparameterization trick (rsample), and the log-probability
        carries the tanh change-of-variables correction
        log pi(a) = log N(u) - sum log(1 - tanh(u)^2 + 1e-6).
        In any other mode the action is the deterministic tanh(mean) and
        the returned log-prob is a placeholder of ones (unused outside
        training).
        """
        mean_ln_std_tensor = self.actor_net(state_tensor)
        mean_tensor, ln_std_tensor = torch.split(mean_ln_std_tensor,
                self.action_dim, dim=-1)
        if self.mode == 'train':
            std_tensor = torch.exp(ln_std_tensor)
            normal_dist = distributions.Normal(mean_tensor, std_tensor)
            rsample_tensor = normal_dist.rsample()  # differentiable sample
            action_tensor = torch.tanh(rsample_tensor)
            # log1p(1e-6 - a^2) == log(1 - a^2 + 1e-6): numerically safe
            # tanh Jacobian correction.
            ln_prob_tensor = normal_dist.log_prob(rsample_tensor) - \
                    torch.log1p(1e-6 - action_tensor.pow(2))
            ln_prob_tensor = ln_prob_tensor.sum(-1, keepdim=True)
        else:
            action_tensor = torch.tanh(mean_tensor)
            ln_prob_tensor = torch.ones_like(action_tensor)
        return action_tensor, ln_prob_tensor

    def reset(self, mode):
        """Start an episode; mode 'train' also starts a fresh trajectory."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        """Select an action; in training, also store transitions and learn.

        The first 5000 transitions are collected with uniformly random
        actions to seed the replay buffer before the policy takes over.
        """
        if self.mode == 'train' and self.replayer.count < 5000:
            action = np.random.uniform(self.action_low, self.action_high)
        else:
            state_tensor = torch.as_tensor(observation, dtype=torch.float
                    ).unsqueeze(0)
            action_tensor, _ = self.get_action_ln_prob_tensors(state_tensor)
            action = action_tensor[0].detach().numpy()
        if self.mode == 'train':
            # Trajectory is a flat list of (obs, reward, terminated, action)
            # quadruples; the last 8 entries span one full transition
            # (s, a) -> (s', r', terminated').
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                state, _, _, act, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, terminated)
            if self.replayer.count >= 128:
                self.learn()
        return action

    def close(self):
        """End-of-episode hook; nothing to clean up."""
        pass

    def update_net(self, target_net, evaluate_net, learning_rate=0.005):
        """Polyak-average evaluate_net parameters into target_net."""
        for target_param, evaluate_param in zip(
                target_net.parameters(), evaluate_net.parameters()):
            target_param.data.copy_(learning_rate * evaluate_param.data
                    + (1 - learning_rate) * target_param.data)

    def learn(self):
        """One SAC gradient step on a minibatch of 128 transitions.

        Update order matters: alpha first (its loss detaches the log-probs),
        then the twin Q critics against a detached V-target value, then the
        V critic against a detached min-Q target plus its Polyak update,
        and finally the actor (gradients flow through q0_net into the
        reparameterized actions).
        """
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(128)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.float)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)

        # update alpha
        # Only ln_alpha receives gradient: the (log pi + target entropy)
        # factor is detached.
        act_tensor, ln_prob_tensor = self.get_action_ln_prob_tensors(state_tensor)
        alpha_loss_tensor = (-self.ln_alpha_tensor * (ln_prob_tensor +
                self.target_entropy).detach()).mean()

        self.alpha_optimizer.zero_grad()
        alpha_loss_tensor.backward()
        self.alpha_optimizer.step()

        # update Q critic
        # Target: r + gamma * V_target(s'), zeroed on terminal transitions.
        states_action_tensor = torch.cat((state_tensor, action_tensor), dim=-1)
        q0_tensor = self.q0_net(states_action_tensor)
        q1_tensor = self.q1_net(states_action_tensor)
        next_v_tensor = self.v_target_net(next_state_tensor)
        q_target = reward_tensor.unsqueeze(1) + \
                self.gamma * next_v_tensor * (1. - terminated_tensor.unsqueeze(1))
        q0_loss_tensor = self.q0_loss(q0_tensor, q_target.detach())
        q1_loss_tensor = self.q1_loss(q1_tensor, q_target.detach())

        self.q0_optimizer.zero_grad()
        q0_loss_tensor.backward()
        self.q0_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss_tensor.backward()
        self.q1_optimizer.step()

        # update V critic
        # Target: min(Q0, Q1)(s, a~pi) - alpha * log pi(a|s), detached.
        state_act_tensor = torch.cat((state_tensor, act_tensor), dim=-1)
        v_pred_tensor = self.v_evaluate_net(state_tensor)
        q0_pred_tensor = self.q0_net(state_act_tensor)
        q1_pred_tensor = self.q1_net(state_act_tensor)
        q_pred_tensor = torch.min(q0_pred_tensor, q1_pred_tensor)
        alpha_tensor = self.ln_alpha_tensor.exp()
        v_target_tensor = q_pred_tensor - alpha_tensor * ln_prob_tensor
        v_loss_tensor = self.v_loss(v_pred_tensor, v_target_tensor.detach())

        self.v_optimizer.zero_grad()
        v_loss_tensor.backward()
        self.v_optimizer.step()

        self.update_net(self.v_target_net, self.v_evaluate_net)

        # update actor
        # Minimize E[alpha * log pi - q0]; the backward pass also writes
        # gradients into q0_net, which are cleared by zero_grad() on the
        # next learn() call before they are ever applied.
        actor_loss_tensor = (alpha_tensor * ln_prob_tensor
                - q0_pred_tensor).mean()

        self.actor_optimizier.zero_grad()
        actor_loss_tensor.backward()
        self.actor_optimizier.step()


# Instantiate the SAC agent for the environment created above.
agent = SACAgent(env)
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode and return (total reward, number of env steps).

    The agent is shown the terminal observation one final time (so it can
    store the closing transition in training mode) before the loop ends;
    that last action is never executed in the environment.
    """
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    finished = False
    while not finished:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        finished = terminated or truncated
        if not finished:
            observation, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps


# ---- Training ----
logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    # The episode index doubles as the environment reset seed, making
    # each run reproducible.
    episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
            mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    # Stop once the mean of the last 10 episode rewards exceeds 250
    # (above the env spec's 200 reward threshold, logged above).
    if np.mean(episode_rewards[-10:]) > 250:
        break
plt.plot(episode_rewards)


# ---- Evaluation ----
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    # mode=None: the agent acts deterministically (tanh of the policy
    # mean) and does no learning.
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
08:18:11 [INFO] ==== train ====
08:18:12 [INFO] train episode 0: reward = -112.08, steps = 140
08:18:18 [INFO] train episode 1: reward = -281.50, steps = 159
08:18:23 [INFO] train episode 2: reward = -234.58, steps = 137
08:18:30 [INFO] train episode 3: reward = -158.37, steps = 187
08:18:37 [INFO] train episode 4: reward = -298.07, steps = 170
08:18:40 [INFO] train episode 5: reward = -335.86, steps = 75
08:18:44 [INFO] train episode 6: reward = -426.55, steps = 108
08:18:48 [INFO] train episode 7: reward = -149.63, steps = 111
08:18:51 [INFO] train episode 8: reward = -69.06, steps = 73
08:18:55 [INFO] train episode 9: reward = -271.07, steps = 96
08:18:59 [INFO] train episode 10: reward = -408.78, steps = 105
08:19:04 [INFO] train episode 11: reward = -102.08, steps = 137
08:19:10 [INFO] train episode 12: reward = -390.09, steps = 145
08:19:15 [INFO] train episode 13: reward = -408.79, steps = 107
08:19:17 [INFO] train episode 14: reward = -92.82, steps = 69
08:19:21 [INFO] train episode 15: reward = -352.78, steps = 87
08:19:24 [INFO] train episode 16: reward = -234.04, steps = 89
08:19:29 [INFO] train episode 17: reward = -250.68, steps = 127
08:19:34 [INFO] train episode 18: reward = -250.59, steps = 119
08:19:40 [INFO] train episode 19: reward = -320.67, steps = 152
08:19:43 [INFO] train episode 20: reward = -373.46, steps = 81
08:19:46 [INFO] train episode 21: reward = -88.18, steps = 83
08:19:49 [INFO] train episode 22: reward = -36.75, steps = 80
08:19:53 [INFO] train episode 23: reward = -139.52, steps = 97
08:19:57 [INFO] train episode 24: reward = -352.28, steps = 106
08:20:02 [INFO] train episode 25: reward = -111.02, steps = 121
08:20:06 [INFO] train episode 26: reward = -173.87, steps = 104
08:20:10 [INFO] train episode 27: reward = -357.54, steps = 101
08:20:15 [INFO] train episode 28: reward = -187.84, steps = 125
08:20:23 [INFO] train episode 29: reward = -259.05, steps = 213
08:20:27 [INFO] train episode 30: reward = -189.09, steps = 113
08:20:31 [INFO] train episode 31: reward = -233.59, steps = 106
08:20:35 [INFO] train episode 32: reward = -447.59, steps = 104
08:20:41 [INFO] train episode 33: reward = -298.22, steps = 157
08:20:46 [INFO] train episode 34: reward = -312.16, steps = 106
08:20:50 [INFO] train episode 35: reward = -99.37, steps = 114
08:20:54 [INFO] train episode 36: reward = -402.26, steps = 115
08:20:58 [INFO] train episode 37: reward = -280.93, steps = 104
08:21:02 [INFO] train episode 38: reward = -77.80, steps = 104
08:21:05 [INFO] train episode 39: reward = -194.40, steps = 82
08:21:10 [INFO] train episode 40: reward = -260.22, steps = 120
08:21:15 [INFO] train episode 41: reward = -267.42, steps = 117
08:21:20 [INFO] train episode 42: reward = -116.98, steps = 134
08:21:30 [INFO] train episode 43: reward = -338.80, steps = 225
08:21:40 [INFO] train episode 44: reward = -270.31, steps = 233
08:21:47 [INFO] train episode 45: reward = -133.80, steps = 155
08:21:54 [INFO] train episode 46: reward = -63.22, steps = 150
08:22:01 [INFO] train episode 47: reward = -290.41, steps = 176
08:22:12 [INFO] train episode 48: reward = -46.56, steps = 252
08:22:30 [INFO] train episode 49: reward = -115.06, steps = 431
08:22:46 [INFO] train episode 50: reward = -102.78, steps = 392
08:23:29 [INFO] train episode 51: reward = -150.02, steps = 1000
08:24:12 [INFO] train episode 52: reward = -58.18, steps = 1000
08:24:55 [INFO] train episode 53: reward = -137.97, steps = 1000
08:25:16 [INFO] train episode 54: reward = 189.33, steps = 514
08:25:58 [INFO] train episode 55: reward = -79.11, steps = 1000
08:26:41 [INFO] train episode 56: reward = -62.32, steps = 1000
08:27:09 [INFO] train episode 57: reward = -146.16, steps = 657
08:27:18 [INFO] train episode 58: reward = -310.16, steps = 233
08:27:26 [INFO] train episode 59: reward = -141.95, steps = 207
08:27:33 [INFO] train episode 60: reward = -24.10, steps = 171
08:27:39 [INFO] train episode 61: reward = 10.28, steps = 156
08:27:47 [INFO] train episode 62: reward = -72.96, steps = 177
08:28:09 [INFO] train episode 63: reward = 189.82, steps = 562
08:28:19 [INFO] train episode 64: reward = -45.08, steps = 249
08:28:36 [INFO] train episode 65: reward = 202.26, steps = 395
08:29:18 [INFO] train episode 66: reward = -1.59, steps = 1000
08:29:49 [INFO] train episode 67: reward = 85.66, steps = 765
08:30:31 [INFO] train episode 68: reward = 49.17, steps = 1000
08:31:14 [INFO] train episode 69: reward = -9.73, steps = 1000
08:31:59 [INFO] train episode 70: reward = -43.56, steps = 1000
08:32:32 [INFO] train episode 71: reward = 92.58, steps = 775
08:33:17 [INFO] train episode 72: reward = 104.25, steps = 1000
08:34:00 [INFO] train episode 73: reward = -2.90, steps = 1000
08:34:44 [INFO] train episode 74: reward = -16.13, steps = 1000
08:35:27 [INFO] train episode 75: reward = -81.60, steps = 1000
08:36:12 [INFO] train episode 76: reward = -51.90, steps = 1000
08:36:56 [INFO] train episode 77: reward = -34.58, steps = 1000
08:37:39 [INFO] train episode 78: reward = -8.94, steps = 1000
08:38:21 [INFO] train episode 79: reward = -138.96, steps = 992
08:39:03 [INFO] train episode 80: reward = -37.00, steps = 1000
08:39:45 [INFO] train episode 81: reward = -26.09, steps = 1000
08:40:29 [INFO] train episode 82: reward = -38.16, steps = 1000
08:41:11 [INFO] train episode 83: reward = -28.83, steps = 1000
08:41:54 [INFO] train episode 84: reward = -32.24, steps = 1000
08:42:31 [INFO] train episode 85: reward = 1.92, steps = 1000
08:43:09 [INFO] train episode 86: reward = -41.07, steps = 1000
08:43:49 [INFO] train episode 87: reward = -77.63, steps = 1000
08:44:24 [INFO] train episode 88: reward = -144.59, steps = 926
08:45:02 [INFO] train episode 89: reward = -51.94, steps = 1000
08:45:40 [INFO] train episode 90: reward = -64.39, steps = 1000
08:46:19 [INFO] train episode 91: reward = -30.99, steps = 1000
08:46:56 [INFO] train episode 92: reward = -35.70, steps = 1000
08:47:34 [INFO] train episode 93: reward = 40.63, steps = 1000
08:48:11 [INFO] train episode 94: reward = -48.49, steps = 1000
08:48:51 [INFO] train episode 95: reward = 14.42, steps = 1000
08:49:30 [INFO] train episode 96: reward = -56.95, steps = 1000
08:50:07 [INFO] train episode 97: reward = -1.87, steps = 1000
08:50:44 [INFO] train episode 98: reward = -29.84, steps = 1000
08:51:23 [INFO] train episode 99: reward = -29.44, steps = 1000
08:52:00 [INFO] train episode 100: reward = -3.35, steps = 1000
08:52:40 [INFO] train episode 101: reward = 4.47, steps = 1000
08:53:17 [INFO] train episode 102: reward = -28.23, steps = 1000
08:53:59 [INFO] train episode 103: reward = -36.16, steps = 1000
08:54:18 [INFO] train episode 104: reward = -80.85, steps = 499
08:54:55 [INFO] train episode 105: reward = -1.22, steps = 1000
08:55:24 [INFO] train episode 106: reward = -147.22, steps = 761
08:55:45 [INFO] train episode 107: reward = -113.34, steps = 574
08:56:21 [INFO] train episode 108: reward = -236.57, steps = 919
08:56:53 [INFO] train episode 109: reward = -208.71, steps = 845
08:57:31 [INFO] train episode 110: reward = -76.35, steps = 1000
08:58:10 [INFO] train episode 111: reward = -48.00, steps = 1000
08:58:51 [INFO] train episode 112: reward = -35.55, steps = 1000
08:59:32 [INFO] train episode 113: reward = -36.37, steps = 1000
09:00:12 [INFO] train episode 114: reward = -24.10, steps = 1000
09:00:53 [INFO] train episode 115: reward = -8.93, steps = 1000
09:01:34 [INFO] train episode 116: reward = -32.26, steps = 1000
09:02:16 [INFO] train episode 117: reward = -20.86, steps = 1000
09:02:55 [INFO] train episode 118: reward = -23.36, steps = 1000
09:03:35 [INFO] train episode 119: reward = -19.84, steps = 1000
09:04:16 [INFO] train episode 120: reward = 4.26, steps = 1000
09:04:57 [INFO] train episode 121: reward = 13.64, steps = 1000
09:05:37 [INFO] train episode 122: reward = -49.42, steps = 1000
09:06:20 [INFO] train episode 123: reward = 12.37, steps = 1000
09:07:04 [INFO] train episode 124: reward = -3.07, steps = 1000
09:07:39 [INFO] train episode 125: reward = 192.29, steps = 816
09:08:20 [INFO] train episode 126: reward = -24.69, steps = 1000
09:09:05 [INFO] train episode 127: reward = -73.62, steps = 1000
09:09:46 [INFO] train episode 128: reward = 69.85, steps = 1000
09:10:27 [INFO] train episode 129: reward = -47.97, steps = 1000
09:11:09 [INFO] train episode 130: reward = -33.53, steps = 1000
09:11:53 [INFO] train episode 131: reward = -11.22, steps = 1000
09:12:24 [INFO] train episode 132: reward = -98.74, steps = 753
09:13:07 [INFO] train episode 133: reward = -31.19, steps = 1000
09:13:51 [INFO] train episode 134: reward = -25.83, steps = 1000
09:14:34 [INFO] train episode 135: reward = -33.07, steps = 1000
09:15:15 [INFO] train episode 136: reward = -47.70, steps = 1000
09:15:59 [INFO] train episode 137: reward = 73.33, steps = 1000
09:16:44 [INFO] train episode 138: reward = -27.60, steps = 1000
09:17:29 [INFO] train episode 139: reward = -2.45, steps = 1000
09:18:11 [INFO] train episode 140: reward = 30.67, steps = 1000
09:18:53 [INFO] train episode 141: reward = 3.14, steps = 1000
09:19:36 [INFO] train episode 142: reward = -11.03, steps = 1000
09:20:21 [INFO] train episode 143: reward = -8.69, steps = 1000
09:21:05 [INFO] train episode 144: reward = -23.17, steps = 1000
09:21:53 [INFO] train episode 145: reward = -19.40, steps = 1000
09:22:37 [INFO] train episode 146: reward = 63.90, steps = 1000
09:23:20 [INFO] train episode 147: reward = -27.68, steps = 1000
09:24:02 [INFO] train episode 148: reward = -14.27, steps = 1000
09:24:45 [INFO] train episode 149: reward = -24.26, steps = 1000
09:25:28 [INFO] train episode 150: reward = -26.83, steps = 1000
09:26:13 [INFO] train episode 151: reward = -27.14, steps = 1000
09:26:56 [INFO] train episode 152: reward = -15.51, steps = 1000
09:27:39 [INFO] train episode 153: reward = -34.58, steps = 1000
09:28:21 [INFO] train episode 154: reward = 37.20, steps = 1000
09:29:03 [INFO] train episode 155: reward = 34.22, steps = 1000
09:29:48 [INFO] train episode 156: reward = -5.36, steps = 1000
09:30:31 [INFO] train episode 157: reward = -50.48, steps = 1000
09:31:13 [INFO] train episode 158: reward = 77.25, steps = 1000
09:31:28 [INFO] train episode 159: reward = -64.55, steps = 361
09:32:12 [INFO] train episode 160: reward = -50.94, steps = 1000
09:32:54 [INFO] train episode 161: reward = -24.62, steps = 1000
09:33:38 [INFO] train episode 162: reward = -39.87, steps = 1000
09:34:22 [INFO] train episode 163: reward = 113.43, steps = 1000
09:35:05 [INFO] train episode 164: reward = -18.25, steps = 1000
09:35:52 [INFO] train episode 165: reward = -32.68, steps = 1000
09:36:19 [INFO] train episode 166: reward = 251.94, steps = 639
09:37:03 [INFO] train episode 167: reward = 132.67, steps = 996
09:37:48 [INFO] train episode 168: reward = 26.10, steps = 1000
09:38:31 [INFO] train episode 169: reward = -23.84, steps = 1000
09:39:16 [INFO] train episode 170: reward = -34.61, steps = 1000
09:39:59 [INFO] train episode 171: reward = 16.13, steps = 1000
09:40:10 [INFO] train episode 172: reward = -28.65, steps = 267
09:40:55 [INFO] train episode 173: reward = -6.24, steps = 1000
09:41:41 [INFO] train episode 174: reward = -37.24, steps = 1000
09:42:17 [INFO] train episode 175: reward = 153.42, steps = 838
09:42:48 [INFO] train episode 176: reward = -106.54, steps = 704
09:43:32 [INFO] train episode 177: reward = -15.26, steps = 1000
09:44:15 [INFO] train episode 178: reward = -33.40, steps = 1000
09:45:01 [INFO] train episode 179: reward = -33.85, steps = 1000
09:45:47 [INFO] train episode 180: reward = -15.62, steps = 1000
09:46:34 [INFO] train episode 181: reward = -26.81, steps = 1000
09:47:19 [INFO] train episode 182: reward = 0.83, steps = 1000
09:48:04 [INFO] train episode 183: reward = 128.82, steps = 981
09:48:49 [INFO] train episode 184: reward = -46.66, steps = 1000
09:49:35 [INFO] train episode 185: reward = -45.17, steps = 1000
09:50:22 [INFO] train episode 186: reward = -39.43, steps = 1000
09:51:06 [INFO] train episode 187: reward = 79.79, steps = 1000
09:51:53 [INFO] train episode 188: reward = -23.49, steps = 1000
09:52:38 [INFO] train episode 189: reward = -37.61, steps = 1000
09:53:22 [INFO] train episode 190: reward = -12.40, steps = 1000
09:54:06 [INFO] train episode 191: reward = -10.87, steps = 1000
09:54:48 [INFO] train episode 192: reward = -18.93, steps = 1000
09:55:35 [INFO] train episode 193: reward = -30.56, steps = 1000
09:56:21 [INFO] train episode 194: reward = -40.22, steps = 1000
09:57:09 [INFO] train episode 195: reward = -10.30, steps = 1000
09:57:55 [INFO] train episode 196: reward = -1.69, steps = 1000
09:58:42 [INFO] train episode 197: reward = 10.05, steps = 1000
09:59:23 [INFO] train episode 198: reward = 167.86, steps = 914
10:00:10 [INFO] train episode 199: reward = 15.12, steps = 1000
10:00:54 [INFO] train episode 200: reward = -11.66, steps = 1000
10:01:40 [INFO] train episode 201: reward = -19.04, steps = 1000
10:02:06 [INFO] train episode 202: reward = 245.18, steps = 589
10:02:54 [INFO] train episode 203: reward = 32.02, steps = 1000
10:03:42 [INFO] train episode 204: reward = 56.35, steps = 1000
10:04:20 [INFO] train episode 205: reward = 186.69, steps = 836
10:05:08 [INFO] train episode 206: reward = -14.67, steps = 1000
10:05:41 [INFO] train episode 207: reward = 203.12, steps = 736
10:05:57 [INFO] train episode 208: reward = 262.30, steps = 366
10:06:29 [INFO] train episode 209: reward = 265.60, steps = 726
10:07:11 [INFO] train episode 210: reward = 180.46, steps = 914
10:07:36 [INFO] train episode 211: reward = 188.73, steps = 580
10:08:05 [INFO] train episode 212: reward = 170.45, steps = 661
10:08:24 [INFO] train episode 213: reward = 278.07, steps = 428
10:08:48 [INFO] train episode 214: reward = 276.87, steps = 559
10:09:33 [INFO] train episode 215: reward = -13.98, steps = 1000
10:09:57 [INFO] train episode 216: reward = 215.50, steps = 537
10:10:43 [INFO] train episode 217: reward = 1.84, steps = 1000
10:11:11 [INFO] train episode 218: reward = 217.29, steps = 636
10:11:34 [INFO] train episode 219: reward = 255.23, steps = 514
10:12:01 [INFO] train episode 220: reward = 186.56, steps = 597
10:12:48 [INFO] train episode 221: reward = 81.03, steps = 1000
10:13:25 [INFO] train episode 222: reward = 153.45, steps = 857
10:13:31 [INFO] train episode 223: reward = 18.37, steps = 123
10:14:03 [INFO] train episode 224: reward = 275.56, steps = 744
10:14:34 [INFO] train episode 225: reward = 240.97, steps = 700
10:15:01 [INFO] train episode 226: reward = 274.98, steps = 623
10:15:07 [INFO] train episode 227: reward = 20.80, steps = 144
10:15:14 [INFO] train episode 228: reward = -1.63, steps = 166
10:15:31 [INFO] train episode 229: reward = 233.82, steps = 387
10:15:47 [INFO] train episode 230: reward = 259.04, steps = 363
10:16:33 [INFO] train episode 231: reward = 107.03, steps = 1000
10:16:54 [INFO] train episode 232: reward = 224.22, steps = 488
10:17:20 [INFO] train episode 233: reward = 205.25, steps = 563
10:18:07 [INFO] train episode 234: reward = 4.50, steps = 1000
10:18:55 [INFO] train episode 235: reward = 119.17, steps = 993
10:19:41 [INFO] train episode 236: reward = -31.98, steps = 1000
10:20:28 [INFO] train episode 237: reward = -49.84, steps = 1000
10:21:13 [INFO] train episode 238: reward = -43.70, steps = 1000
10:21:54 [INFO] train episode 239: reward = -112.50, steps = 860
10:22:38 [INFO] train episode 240: reward = 158.07, steps = 1000
10:23:22 [INFO] train episode 241: reward = -66.71, steps = 1000
10:23:44 [INFO] train episode 242: reward = 222.56, steps = 490
10:24:32 [INFO] train episode 243: reward = -27.62, steps = 1000
10:24:44 [INFO] train episode 244: reward = 273.56, steps = 298
10:25:32 [INFO] train episode 245: reward = -53.45, steps = 1000
10:26:17 [INFO] train episode 246: reward = -3.71, steps = 1000
10:26:42 [INFO] train episode 247: reward = 204.70, steps = 562
10:27:12 [INFO] train episode 248: reward = 210.68, steps = 668
10:27:57 [INFO] train episode 249: reward = -24.15, steps = 1000
10:28:13 [INFO] train episode 250: reward = 236.60, steps = 371
10:28:26 [INFO] train episode 251: reward = 275.84, steps = 307
10:29:00 [INFO] train episode 252: reward = 214.70, steps = 764
10:29:29 [INFO] train episode 253: reward = 173.57, steps = 651
10:29:48 [INFO] train episode 254: reward = 265.24, steps = 453
10:30:32 [INFO] train episode 255: reward = 132.43, steps = 1000
10:31:19 [INFO] train episode 256: reward = -21.95, steps = 1000
10:32:04 [INFO] train episode 257: reward = -26.56, steps = 1000
10:32:30 [INFO] train episode 258: reward = 188.48, steps = 577
10:33:00 [INFO] train episode 259: reward = -106.38, steps = 660
10:33:46 [INFO] train episode 260: reward = -34.32, steps = 1000
10:34:18 [INFO] train episode 261: reward = 203.41, steps = 725
10:34:36 [INFO] train episode 262: reward = 255.83, steps = 408
10:35:22 [INFO] train episode 263: reward = -9.11, steps = 1000
10:36:10 [INFO] train episode 264: reward = 39.38, steps = 1000
10:36:46 [INFO] train episode 265: reward = 211.19, steps = 808
10:37:04 [INFO] train episode 266: reward = 261.64, steps = 413
10:37:48 [INFO] train episode 267: reward = 136.15, steps = 1000
10:38:33 [INFO] train episode 268: reward = -15.21, steps = 1000
10:39:05 [INFO] train episode 269: reward = 232.94, steps = 726
10:39:22 [INFO] train episode 270: reward = 236.03, steps = 375
10:39:50 [INFO] train episode 271: reward = 237.10, steps = 636
10:40:06 [INFO] train episode 272: reward = 248.89, steps = 359
10:40:22 [INFO] train episode 273: reward = 266.12, steps = 378
10:41:07 [INFO] train episode 274: reward = 1.84, steps = 1000
10:41:45 [INFO] train episode 275: reward = 157.79, steps = 825
10:42:17 [INFO] train episode 276: reward = 189.86, steps = 719
10:42:24 [INFO] train episode 277: reward = -35.46, steps = 155
10:42:33 [INFO] train episode 278: reward = 3.49, steps = 211
10:43:01 [INFO] train episode 279: reward = 238.85, steps = 641
10:43:33 [INFO] train episode 280: reward = 180.11, steps = 713
10:43:54 [INFO] train episode 281: reward = 232.95, steps = 470
10:44:11 [INFO] train episode 282: reward = 257.30, steps = 383
10:44:35 [INFO] train episode 283: reward = 192.55, steps = 533
10:45:00 [INFO] train episode 284: reward = 256.59, steps = 578
10:45:30 [INFO] train episode 285: reward = 253.76, steps = 650
10:45:57 [INFO] train episode 286: reward = 215.79, steps = 626
10:46:42 [INFO] train episode 287: reward = 127.63, steps = 1000
10:47:10 [INFO] train episode 288: reward = 237.74, steps = 639
10:47:24 [INFO] train episode 289: reward = 257.29, steps = 314
10:47:44 [INFO] train episode 290: reward = 219.75, steps = 454
10:48:27 [INFO] train episode 291: reward = -34.80, steps = 1000
10:48:45 [INFO] train episode 292: reward = 225.38, steps = 415
10:49:30 [INFO] train episode 293: reward = 132.93, steps = 1000
10:49:42 [INFO] train episode 294: reward = 239.37, steps = 267
10:50:08 [INFO] train episode 295: reward = 228.38, steps = 598
10:50:26 [INFO] train episode 296: reward = 288.36, steps = 422
10:51:11 [INFO] train episode 297: reward = 29.07, steps = 1000
10:51:27 [INFO] train episode 298: reward = 233.57, steps = 359
10:51:48 [INFO] train episode 299: reward = 205.36, steps = 478
10:52:06 [INFO] train episode 300: reward = 233.38, steps = 414
10:52:24 [INFO] train episode 301: reward = 214.90, steps = 402
10:52:42 [INFO] train episode 302: reward = 216.78, steps = 420
10:52:54 [INFO] train episode 303: reward = 272.85, steps = 274
10:53:27 [INFO] train episode 304: reward = 273.54, steps = 724
10:54:00 [INFO] train episode 305: reward = 201.24, steps = 734
10:54:44 [INFO] train episode 306: reward = 51.44, steps = 1000
10:54:56 [INFO] train episode 307: reward = 243.04, steps = 278
10:55:11 [INFO] train episode 308: reward = 259.15, steps = 352
10:55:25 [INFO] train episode 309: reward = 203.67, steps = 319
10:56:04 [INFO] train episode 310: reward = 229.39, steps = 901
10:56:25 [INFO] train episode 311: reward = 268.76, steps = 466
10:57:09 [INFO] train episode 312: reward = 144.95, steps = 1000
10:57:53 [INFO] train episode 313: reward = -7.47, steps = 1000
10:58:37 [INFO] train episode 314: reward = 148.34, steps = 1000
10:59:20 [INFO] train episode 315: reward = 143.54, steps = 1000
11:00:07 [INFO] train episode 316: reward = 86.12, steps = 1000
11:00:40 [INFO] train episode 317: reward = 205.33, steps = 744
11:01:24 [INFO] train episode 318: reward = 162.79, steps = 1000
11:01:47 [INFO] train episode 319: reward = 204.85, steps = 510
11:02:03 [INFO] train episode 320: reward = 242.95, steps = 361
11:02:15 [INFO] train episode 321: reward = 227.31, steps = 292
11:03:00 [INFO] train episode 322: reward = 67.16, steps = 1000
11:03:44 [INFO] train episode 323: reward = 164.35, steps = 1000
11:04:03 [INFO] train episode 324: reward = 226.79, steps = 440
11:04:17 [INFO] train episode 325: reward = 253.51, steps = 320
11:04:51 [INFO] train episode 326: reward = 223.09, steps = 730
11:05:07 [INFO] train episode 327: reward = 291.06, steps = 369
11:05:27 [INFO] train episode 328: reward = 222.47, steps = 434
11:05:52 [INFO] train episode 329: reward = 240.93, steps = 563
11:06:05 [INFO] train episode 330: reward = 296.07, steps = 308
11:06:20 [INFO] train episode 331: reward = 250.09, steps = 328
11:06:37 [INFO] train episode 332: reward = 241.10, steps = 395
11:06:56 [INFO] train episode 333: reward = 250.09, steps = 431
11:07:14 [INFO] train episode 334: reward = 247.34, steps = 405
11:07:15 [INFO] ==== test ====
11:07:15 [INFO] test episode 0: reward = 271.18, steps = 299
11:07:15 [INFO] test episode 1: reward = 261.38, steps = 254
11:07:16 [INFO] test episode 2: reward = 0.23, steps = 271
11:07:16 [INFO] test episode 3: reward = 248.13, steps = 267
11:07:17 [INFO] test episode 4: reward = 224.61, steps = 330
11:07:18 [INFO] test episode 5: reward = 216.57, steps = 342
11:07:18 [INFO] test episode 6: reward = 248.86, steps = 355
11:07:19 [INFO] test episode 7: reward = 229.78, steps = 296
11:07:20 [INFO] test episode 8: reward = 274.99, steps = 322
11:07:21 [INFO] test episode 9: reward = 181.28, steps = 491
11:07:21 [INFO] test episode 10: reward = 247.33, steps = 311
11:07:22 [INFO] test episode 11: reward = 235.35, steps = 499
11:07:23 [INFO] test episode 12: reward = 247.33, steps = 311
11:07:24 [INFO] test episode 13: reward = 278.01, steps = 264
11:07:24 [INFO] test episode 14: reward = 239.55, steps = 322
11:07:25 [INFO] test episode 15: reward = 254.86, steps = 298
11:07:25 [INFO] test episode 16: reward = 246.03, steps = 364
11:07:26 [INFO] test episode 17: reward = 250.75, steps = 338
11:07:27 [INFO] test episode 18: reward = 255.87, steps = 402
11:07:28 [INFO] test episode 19: reward = 248.91, steps = 298
11:07:28 [INFO] test episode 20: reward = 287.79, steps = 324
11:07:29 [INFO] test episode 21: reward = 275.28, steps = 355
11:07:29 [INFO] test episode 22: reward = 236.50, steps = 367
11:07:31 [INFO] test episode 23: reward = 192.14, steps = 454
11:07:33 [INFO] test episode 24: reward = 169.51, steps = 788
11:07:34 [INFO] test episode 25: reward = 261.63, steps = 444
11:07:35 [INFO] test episode 26: reward = 228.02, steps = 292
11:07:36 [INFO] test episode 27: reward = 239.05, steps = 375
11:07:36 [INFO] test episode 28: reward = 240.58, steps = 326
11:07:39 [INFO] test episode 29: reward = 161.61, steps = 1000
11:07:39 [INFO] test episode 30: reward = 252.69, steps = 290
11:07:40 [INFO] test episode 31: reward = 258.46, steps = 288
11:07:40 [INFO] test episode 32: reward = 273.27, steps = 326
11:07:41 [INFO] test episode 33: reward = 256.26, steps = 282
11:07:41 [INFO] test episode 34: reward = 267.48, steps = 331
11:07:42 [INFO] test episode 35: reward = 244.72, steps = 344
11:07:43 [INFO] test episode 36: reward = 247.76, steps = 336
11:07:44 [INFO] test episode 37: reward = 217.31, steps = 372
11:07:44 [INFO] test episode 38: reward = 247.06, steps = 342
11:07:45 [INFO] test episode 39: reward = 211.10, steps = 340
11:07:45 [INFO] test episode 40: reward = 264.69, steps = 312
11:07:46 [INFO] test episode 41: reward = 232.47, steps = 303
11:07:47 [INFO] test episode 42: reward = 228.02, steps = 382
11:07:48 [INFO] test episode 43: reward = 251.82, steps = 327
11:07:48 [INFO] test episode 44: reward = 252.08, steps = 304
11:07:49 [INFO] test episode 45: reward = 220.72, steps = 321
11:07:49 [INFO] test episode 46: reward = 238.09, steps = 409
11:07:50 [INFO] test episode 47: reward = 230.84, steps = 334
11:07:52 [INFO] test episode 48: reward = 215.48, steps = 612
11:07:52 [INFO] test episode 49: reward = 223.35, steps = 372
11:07:55 [INFO] test episode 50: reward = -17.11, steps = 1000
11:07:55 [INFO] test episode 51: reward = 228.41, steps = 294
11:07:56 [INFO] test episode 52: reward = 255.20, steps = 339
11:07:57 [INFO] test episode 53: reward = 220.52, steps = 498
11:07:58 [INFO] test episode 54: reward = 215.52, steps = 291
11:07:59 [INFO] test episode 55: reward = 243.97, steps = 491
11:08:00 [INFO] test episode 56: reward = 211.23, steps = 440
11:08:01 [INFO] test episode 57: reward = 213.63, steps = 306
11:08:01 [INFO] test episode 58: reward = 252.84, steps = 302
11:08:02 [INFO] test episode 59: reward = 284.78, steps = 265
11:08:02 [INFO] test episode 60: reward = 209.75, steps = 399
11:08:03 [INFO] test episode 61: reward = 254.63, steps = 268
11:08:07 [INFO] test episode 62: reward = -49.69, steps = 1000
11:08:07 [INFO] test episode 63: reward = 278.15, steps = 308
11:08:08 [INFO] test episode 64: reward = 243.76, steps = 468
11:08:09 [INFO] test episode 65: reward = 203.84, steps = 374
11:08:10 [INFO] test episode 66: reward = 267.18, steps = 285
11:08:10 [INFO] test episode 67: reward = 233.67, steps = 420
11:08:11 [INFO] test episode 68: reward = 251.62, steps = 337
11:08:12 [INFO] test episode 69: reward = 237.55, steps = 330
11:08:12 [INFO] test episode 70: reward = 282.45, steps = 364
11:08:13 [INFO] test episode 71: reward = 271.30, steps = 328
11:08:14 [INFO] test episode 72: reward = 266.89, steps = 313
11:08:14 [INFO] test episode 73: reward = 224.90, steps = 364
11:08:15 [INFO] test episode 74: reward = 231.54, steps = 401
11:08:16 [INFO] test episode 75: reward = 240.63, steps = 249
11:08:16 [INFO] test episode 76: reward = 238.24, steps = 325
11:08:17 [INFO] test episode 77: reward = 267.44, steps = 272
11:08:18 [INFO] test episode 78: reward = 262.40, steps = 370
11:08:18 [INFO] test episode 79: reward = 222.95, steps = 283
11:08:19 [INFO] test episode 80: reward = 231.19, steps = 319
11:08:20 [INFO] test episode 81: reward = 209.95, steps = 433
11:08:20 [INFO] test episode 82: reward = 224.54, steps = 323
11:08:21 [INFO] test episode 83: reward = 198.86, steps = 316
11:08:21 [INFO] test episode 84: reward = 233.75, steps = 260
11:08:22 [INFO] test episode 85: reward = 217.96, steps = 346
11:08:23 [INFO] test episode 86: reward = 259.67, steps = 307
11:08:23 [INFO] test episode 87: reward = 209.53, steps = 406
11:08:24 [INFO] test episode 88: reward = 255.52, steps = 316
11:08:24 [INFO] test episode 89: reward = 268.38, steps = 247
11:08:25 [INFO] test episode 90: reward = 249.62, steps = 336
11:08:26 [INFO] test episode 91: reward = 224.22, steps = 364
11:08:26 [INFO] test episode 92: reward = 284.50, steps = 305
11:08:27 [INFO] test episode 93: reward = 232.68, steps = 283
11:08:28 [INFO] test episode 94: reward = 227.78, steps = 326
11:08:28 [INFO] test episode 95: reward = 252.30, steps = 280
11:08:29 [INFO] test episode 96: reward = 214.03, steps = 347
11:08:31 [INFO] test episode 97: reward = 165.31, steps = 765
11:08:31 [INFO] test episode 98: reward = 254.61, steps = 356
11:08:32 [INFO] test episode 99: reward = 281.94, steps = 407
11:08:32 [INFO] average episode reward = 232.31 ± 51.68