Use Soft Actor-Critic with Auto $\alpha$ Tuning to Play LunarLanderContinuous-v2¶

TensorFlow version

In [1]:
%matplotlib inline

import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow import losses
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow_probability import distributions

logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
env = gym.make('LunarLanderContinuous-v2')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
08:13:48 [INFO] env: <LunarLanderContinuous<LunarLanderContinuous-v2>>
08:13:48 [INFO] action_space: Box(-1.0, 1.0, (2,), float32)
08:13:48 [INFO] observation_space: Box(-inf, inf, (8,), float32)
08:13:48 [INFO] reward_range: (-inf, inf)
08:13:48 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}
08:13:48 [INFO] _max_episode_steps: 1000
08:13:48 [INFO] _elapsed_steps: None
08:13:48 [INFO] id: LunarLanderContinuous-v2
08:13:48 [INFO] entry_point: gym.envs.box2d:LunarLanderContinuous
08:13:48 [INFO] reward_threshold: 200
08:13:48 [INFO] nondeterministic: False
08:13:48 [INFO] max_episode_steps: 1000
08:13:48 [INFO] _kwargs: {}
08:13:48 [INFO] _env_name: LunarLanderContinuous
In [3]:
class DQNReplayer:
    def __init__(self, capacity):
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['state', 'action', 'reward', 'next_state', 'terminated'])
        self.i = 0
        self.count = 0
        self.capacity = capacity

    def store(self, *args):
        self.memory.loc[self.i] = np.asarray(args, dtype=object)
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.memory.columns)
In [4]:
class SACAgent:
    def __init__(self, env):
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
        self.gamma = 0.99

        self.replayer = DQNReplayer(100000)

        # create alpha
        self.target_entropy = -action_dim
        self.ln_alpha_tensor = tf.Variable(0., dtype=tf.float32)
        self.alpha_optimizer = optimizers.Adam(3e-4)

        # create actor
        self.actor_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256], output_size=action_dim*2,
                output_activation=tf.tanh)

        # create V critic
        self.v_evaluate_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256])
        self.v_target_net = models.clone_model(self.v_evaluate_net)

        # create Q critic
        self.q0_net = self.build_net(input_size=state_dim+action_dim,
                hidden_sizes=[256, 256])
        self.q1_net = self.build_net(input_size=state_dim+action_dim,
                hidden_sizes=[256, 256])

    def build_net(self, input_size, hidden_sizes, output_size=1,
                activation=nn.relu, output_activation=None,
                loss=losses.mse, learning_rate=3e-4):
        model = keras.Sequential()
        for layer, hidden_size in enumerate(hidden_sizes):
            kwargs = {'input_shape' : (input_size,)} if layer == 0 else {}
            model.add(layers.Dense(units=hidden_size,
                    activation=activation, **kwargs))
        model.add(layers.Dense(units=output_size,
                activation=output_activation))
        optimizer = optimizers.Adam(learning_rate)
        model.compile(optimizer=optimizer, loss=loss)
        return model

    def get_action_ln_prob_tensors(self, state_tensor):
        mean_ln_std_tensor = self.actor_net(state_tensor)
        mean_tensor, ln_std_tensor = tf.split(mean_ln_std_tensor, 2, axis=-1)
        if self.mode == 'train':
            std_tensor = tf.math.exp(ln_std_tensor)
            normal_dist = distributions.Normal(mean_tensor, std_tensor)
            sample_tensor = normal_dist.sample()
            action_tensor = tf.tanh(sample_tensor)
            ln_prob_tensor = normal_dist.log_prob(sample_tensor) - \
                    tf.math.log1p(1e-6 - tf.pow(action_tensor, 2))
            ln_prob_tensor = tf.reduce_sum(ln_prob_tensor, axis=-1, keepdims=True)
        else:
            action_tensor = tf.tanh(mean_tensor)
            ln_prob_tensor = tf.ones_like(action_tensor)
        return action_tensor, ln_prob_tensor

    def reset(self, mode):
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        if self.mode == 'train' and self.replayer.count < 5000:
            action = np.random.uniform(self.action_low, self.action_high)
        else:
            state_tensor = tf.convert_to_tensor(observation[np.newaxis, :],
                    dtype=tf.float32)
            action_tensor, _ = self.get_action_ln_prob_tensors(state_tensor)
            action = action_tensor[0].numpy()
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                state, _, _, act, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, terminated)
            if self.replayer.count >= 120:
                self.learn()
        return action

    def close(self):
        pass

    def update_net(self, target_net, evaluate_net, learning_rate=0.005):
        average_weights = [(1. - learning_rate) * t + learning_rate * e for t, e
                in zip(target_net.get_weights(), evaluate_net.get_weights())]
        target_net.set_weights(average_weights)

    def learn(self):
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(128)
        state_tensor = tf.convert_to_tensor(states, dtype=tf.float32)

        # update alpha
        act_tensor, ln_prob_tensor = self.get_action_ln_prob_tensors(state_tensor)
        with tf.GradientTape() as tape:
            alpha_loss_tensor = -self.ln_alpha_tensor * (tf.reduce_mean(
                    ln_prob_tensor, axis=-1) + self.target_entropy)
        grads = tape.gradient(alpha_loss_tensor, [self.ln_alpha_tensor,])
        self.alpha_optimizer.apply_gradients(zip(grads, [self.ln_alpha_tensor,]))

        # update Q critic
        state_actions = np.concatenate((states, actions), axis=-1)
        next_vs = self.v_target_net.predict(next_states, verbose=0)
        q_targets = rewards[:, np.newaxis] + \
                self.gamma * (1. - terminateds[:, np.newaxis]) * next_vs
        self.q0_net.fit(state_actions, q_targets, verbose=False)
        self.q1_net.fit(state_actions, q_targets, verbose=False)

        # update V critic
        state_act_tensor = tf.concat((state_tensor, act_tensor), axis=-1)
        q0_pred_tensor = self.q0_net(state_act_tensor)
        q1_pred_tensor = self.q1_net(state_act_tensor)
        q_pred_tensor = tf.minimum(q0_pred_tensor, q1_pred_tensor)
        alpha_tensor = tf.exp(self.ln_alpha_tensor)
        v_target_tensor = q_pred_tensor - alpha_tensor * ln_prob_tensor
        v_targets = v_target_tensor.numpy()
        self.v_evaluate_net.fit(states, v_targets, verbose=False)
        self.update_net(self.v_target_net, self.v_evaluate_net)

        # update actor
        with tf.GradientTape() as tape:
            act_tensor, ln_prob_tensor = \
                    self.get_action_ln_prob_tensors(state_tensor)
            state_act_tensor = tf.concat((state_tensor, act_tensor), axis=-1)
            q0_pred_tensor = self.q0_net(state_act_tensor)
            alpha_tensor = tf.exp(self.ln_alpha_tensor)
            actor_loss_tensor = tf.reduce_mean(alpha_tensor * ln_prob_tensor -
                    q0_pred_tensor)
        grads = tape.gradient(actor_loss_tensor,
                self.actor_net.trainable_variables)
        self.actor_net.optimizer.apply_gradients(
                zip(grads, self.actor_net.trainable_variables))


agent = SACAgent(env)
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps


logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
            mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > 250:
        break
plt.plot(episode_rewards)


logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
08:13:51 [INFO] ==== train ====
08:14:00 [INFO] train episode 0: reward = -110.26, steps = 140
08:14:51 [INFO] train episode 1: reward = -249.29, steps = 154
08:15:35 [INFO] train episode 2: reward = -282.50, steps = 131
08:16:10 [INFO] train episode 3: reward = -299.41, steps = 104
08:16:39 [INFO] train episode 4: reward = -37.76, steps = 84
08:17:08 [INFO] train episode 5: reward = -42.23, steps = 76
08:17:47 [INFO] train episode 6: reward = -181.15, steps = 107
08:18:24 [INFO] train episode 7: reward = -161.18, steps = 98
08:18:59 [INFO] train episode 8: reward = -305.59, steps = 92
08:19:34 [INFO] train episode 9: reward = -143.95, steps = 90
08:20:26 [INFO] train episode 10: reward = -426.00, steps = 135
08:20:52 [INFO] train episode 11: reward = -98.31, steps = 66
08:21:26 [INFO] train episode 12: reward = -14.49, steps = 86
08:22:04 [INFO] train episode 13: reward = -78.21, steps = 100
08:22:45 [INFO] train episode 14: reward = -175.84, steps = 105
08:23:15 [INFO] train episode 15: reward = -60.30, steps = 78
08:23:53 [INFO] train episode 16: reward = -549.33, steps = 98
08:24:42 [INFO] train episode 17: reward = -214.00, steps = 128
08:25:38 [INFO] train episode 18: reward = -331.81, steps = 146
08:26:12 [INFO] train episode 19: reward = -338.44, steps = 89
08:27:00 [INFO] train episode 20: reward = -288.34, steps = 124
08:27:47 [INFO] train episode 21: reward = -126.20, steps = 122
08:28:30 [INFO] train episode 22: reward = -295.01, steps = 110
08:29:21 [INFO] train episode 23: reward = -541.01, steps = 134
08:29:48 [INFO] train episode 24: reward = -75.28, steps = 71
08:30:48 [INFO] train episode 25: reward = -116.77, steps = 154
08:31:45 [INFO] train episode 26: reward = -372.92, steps = 151
08:32:32 [INFO] train episode 27: reward = -253.98, steps = 121
08:33:14 [INFO] train episode 28: reward = -69.39, steps = 110
08:33:53 [INFO] train episode 29: reward = -288.25, steps = 101
08:34:32 [INFO] train episode 30: reward = -352.66, steps = 100
08:35:02 [INFO] train episode 31: reward = -87.67, steps = 77
08:35:35 [INFO] train episode 32: reward = -367.38, steps = 86
08:36:19 [INFO] train episode 33: reward = -72.54, steps = 114
08:36:55 [INFO] train episode 34: reward = -130.19, steps = 92
08:37:41 [INFO] train episode 35: reward = -206.35, steps = 121
08:38:19 [INFO] train episode 36: reward = -88.64, steps = 97
08:39:03 [INFO] train episode 37: reward = -198.04, steps = 115
08:39:30 [INFO] train episode 38: reward = -103.69, steps = 71
08:39:58 [INFO] train episode 39: reward = -135.01, steps = 70
08:40:44 [INFO] train episode 40: reward = -173.38, steps = 121
08:41:28 [INFO] train episode 41: reward = -417.42, steps = 113
08:42:22 [INFO] train episode 42: reward = -251.79, steps = 147
08:42:51 [INFO] train episode 43: reward = -422.75, steps = 82
08:43:31 [INFO] train episode 44: reward = -298.07, steps = 114
08:44:15 [INFO] train episode 45: reward = -509.73, steps = 129
08:45:12 [INFO] train episode 46: reward = -84.61, steps = 162
08:47:29 [INFO] train episode 47: reward = -37.50, steps = 389
08:48:28 [INFO] train episode 48: reward = 8.65, steps = 169
08:49:42 [INFO] train episode 49: reward = -34.87, steps = 211
08:51:11 [INFO] train episode 50: reward = -15.36, steps = 251
08:52:10 [INFO] train episode 51: reward = -40.78, steps = 169
08:53:46 [INFO] train episode 52: reward = -87.32, steps = 272
08:59:37 [INFO] train episode 53: reward = -104.92, steps = 1000
09:01:55 [INFO] train episode 54: reward = -68.44, steps = 394
09:07:47 [INFO] train episode 55: reward = -82.20, steps = 1000
09:13:37 [INFO] train episode 56: reward = -45.02, steps = 1000
09:14:56 [INFO] train episode 57: reward = -25.66, steps = 227
09:16:10 [INFO] train episode 58: reward = -29.24, steps = 211
09:19:27 [INFO] train episode 59: reward = -61.69, steps = 561
09:21:31 [INFO] train episode 60: reward = -104.31, steps = 348
09:22:34 [INFO] train episode 61: reward = -63.95, steps = 183
09:25:08 [INFO] train episode 62: reward = -238.04, steps = 438
09:27:25 [INFO] train episode 63: reward = 170.66, steps = 393
09:28:26 [INFO] train episode 64: reward = -144.34, steps = 175
09:31:35 [INFO] train episode 65: reward = -106.30, steps = 534
09:37:31 [INFO] train episode 66: reward = -30.74, steps = 1000
09:43:25 [INFO] train episode 67: reward = 11.18, steps = 1000
09:44:26 [INFO] train episode 68: reward = -96.97, steps = 172
09:50:18 [INFO] train episode 69: reward = -26.22, steps = 1000
09:56:11 [INFO] train episode 70: reward = -9.46, steps = 1000
10:02:05 [INFO] train episode 71: reward = 2.29, steps = 1000
10:08:05 [INFO] train episode 72: reward = -17.41, steps = 1000
10:09:03 [INFO] train episode 73: reward = -78.32, steps = 163
10:14:59 [INFO] train episode 74: reward = -56.47, steps = 1000
10:17:03 [INFO] train episode 75: reward = -105.24, steps = 350
10:23:01 [INFO] train episode 76: reward = -85.25, steps = 1000
10:28:58 [INFO] train episode 77: reward = -44.23, steps = 1000
10:34:56 [INFO] train episode 78: reward = -42.70, steps = 1000
10:40:53 [INFO] train episode 79: reward = -4.61, steps = 1000
10:46:54 [INFO] train episode 80: reward = -59.02, steps = 1000
10:52:52 [INFO] train episode 81: reward = -36.24, steps = 1000
10:58:52 [INFO] train episode 82: reward = -68.53, steps = 1000
11:04:51 [INFO] train episode 83: reward = -40.34, steps = 1000
11:10:05 [INFO] train episode 84: reward = -58.39, steps = 1000
11:14:10 [INFO] train episode 85: reward = -12.41, steps = 1000
11:18:13 [INFO] train episode 86: reward = -9.76, steps = 1000
11:22:15 [INFO] train episode 87: reward = -67.77, steps = 1000
11:26:19 [INFO] train episode 88: reward = -34.24, steps = 1000
11:30:22 [INFO] train episode 89: reward = -6.51, steps = 1000
11:34:27 [INFO] train episode 90: reward = -72.43, steps = 1000
11:38:32 [INFO] train episode 91: reward = -31.45, steps = 1000
11:42:36 [INFO] train episode 92: reward = -90.86, steps = 1000
11:46:40 [INFO] train episode 93: reward = -35.35, steps = 1000
11:50:43 [INFO] train episode 94: reward = -23.63, steps = 1000
11:54:45 [INFO] train episode 95: reward = -40.51, steps = 1000
11:58:49 [INFO] train episode 96: reward = -34.42, steps = 1000
12:02:55 [INFO] train episode 97: reward = -45.70, steps = 1000
12:06:59 [INFO] train episode 98: reward = -13.30, steps = 1000
12:11:04 [INFO] train episode 99: reward = -22.68, steps = 1000
12:12:34 [INFO] train episode 100: reward = -157.09, steps = 366
12:16:39 [INFO] train episode 101: reward = -50.90, steps = 1000
12:20:43 [INFO] train episode 102: reward = -33.76, steps = 1000
12:24:49 [INFO] train episode 103: reward = -54.74, steps = 1000
12:28:54 [INFO] train episode 104: reward = -18.99, steps = 1000
12:33:01 [INFO] train episode 105: reward = -50.84, steps = 1000
12:37:10 [INFO] train episode 106: reward = -59.08, steps = 1000
12:41:19 [INFO] train episode 107: reward = -42.44, steps = 1000
12:45:26 [INFO] train episode 108: reward = 3.06, steps = 1000
12:49:37 [INFO] train episode 109: reward = -70.19, steps = 1000
12:53:48 [INFO] train episode 110: reward = -18.02, steps = 1000
12:57:58 [INFO] train episode 111: reward = -40.37, steps = 1000
13:01:33 [INFO] train episode 112: reward = -142.23, steps = 849
13:05:12 [INFO] train episode 113: reward = -128.71, steps = 870
13:09:26 [INFO] train episode 114: reward = -17.06, steps = 1000
13:13:36 [INFO] train episode 115: reward = -97.91, steps = 1000
13:17:50 [INFO] train episode 116: reward = -99.62, steps = 1000
13:22:11 [INFO] train episode 117: reward = -25.60, steps = 1000
13:26:31 [INFO] train episode 118: reward = -42.49, steps = 1000
13:30:49 [INFO] train episode 119: reward = -37.43, steps = 1000
13:35:08 [INFO] train episode 120: reward = -74.25, steps = 1000
13:39:24 [INFO] train episode 121: reward = -86.12, steps = 1000
13:43:42 [INFO] train episode 122: reward = -84.45, steps = 1000
13:47:37 [INFO] train episode 123: reward = 126.28, steps = 908
13:51:56 [INFO] train episode 124: reward = -56.54, steps = 1000
13:56:15 [INFO] train episode 125: reward = 7.61, steps = 1000
14:00:34 [INFO] train episode 126: reward = -8.52, steps = 1000
14:03:10 [INFO] train episode 127: reward = -49.37, steps = 600
14:07:32 [INFO] train episode 128: reward = -71.91, steps = 1000
14:08:45 [INFO] train episode 129: reward = -63.49, steps = 280
14:13:09 [INFO] train episode 130: reward = -49.66, steps = 1000
14:17:29 [INFO] train episode 131: reward = -99.76, steps = 1000
14:21:50 [INFO] train episode 132: reward = -101.48, steps = 1000
14:24:40 [INFO] train episode 133: reward = -94.13, steps = 651
14:29:08 [INFO] train episode 134: reward = -44.32, steps = 1000
14:33:34 [INFO] train episode 135: reward = -68.16, steps = 1000
14:38:00 [INFO] train episode 136: reward = -14.00, steps = 1000
14:42:24 [INFO] train episode 137: reward = -61.74, steps = 1000
14:46:47 [INFO] train episode 138: reward = -98.26, steps = 1000
14:51:13 [INFO] train episode 139: reward = -45.95, steps = 1000
14:55:37 [INFO] train episode 140: reward = -11.04, steps = 1000
15:00:03 [INFO] train episode 141: reward = -6.89, steps = 1000
15:04:30 [INFO] train episode 142: reward = -34.49, steps = 1000
15:08:54 [INFO] train episode 143: reward = -29.75, steps = 1000
15:13:19 [INFO] train episode 144: reward = -7.63, steps = 1000
15:17:46 [INFO] train episode 145: reward = -35.95, steps = 1000
15:22:14 [INFO] train episode 146: reward = -58.01, steps = 1000
15:26:39 [INFO] train episode 147: reward = -60.17, steps = 1000
15:31:05 [INFO] train episode 148: reward = -12.72, steps = 1000
15:35:34 [INFO] train episode 149: reward = -51.32, steps = 1000
15:40:02 [INFO] train episode 150: reward = -5.12, steps = 1000
15:44:33 [INFO] train episode 151: reward = -61.23, steps = 1000
15:49:02 [INFO] train episode 152: reward = -37.32, steps = 1000
15:53:32 [INFO] train episode 153: reward = -35.78, steps = 1000
15:58:02 [INFO] train episode 154: reward = -28.23, steps = 1000
16:02:33 [INFO] train episode 155: reward = -60.52, steps = 1000
16:07:03 [INFO] train episode 156: reward = -28.69, steps = 1000
16:11:36 [INFO] train episode 157: reward = -56.97, steps = 1000
16:16:11 [INFO] train episode 158: reward = -36.27, steps = 1000
16:20:47 [INFO] train episode 159: reward = -7.67, steps = 1000
16:25:24 [INFO] train episode 160: reward = -29.63, steps = 1000
16:29:59 [INFO] train episode 161: reward = -18.34, steps = 1000
16:34:36 [INFO] train episode 162: reward = -38.78, steps = 1000
16:39:14 [INFO] train episode 163: reward = -48.51, steps = 1000
16:43:50 [INFO] train episode 164: reward = -42.26, steps = 1000
16:48:27 [INFO] train episode 165: reward = -35.21, steps = 1000
16:53:04 [INFO] train episode 166: reward = -13.84, steps = 1000
16:57:41 [INFO] train episode 167: reward = -51.10, steps = 1000
17:02:21 [INFO] train episode 168: reward = -24.52, steps = 1000
17:07:02 [INFO] train episode 169: reward = -56.40, steps = 1000
17:11:43 [INFO] train episode 170: reward = -52.76, steps = 1000
17:16:25 [INFO] train episode 171: reward = -56.77, steps = 1000
17:21:08 [INFO] train episode 172: reward = -28.57, steps = 1000
17:25:47 [INFO] train episode 173: reward = -18.01, steps = 1000
17:30:29 [INFO] train episode 174: reward = -86.60, steps = 1000
17:35:13 [INFO] train episode 175: reward = -42.24, steps = 1000
17:39:56 [INFO] train episode 176: reward = -64.39, steps = 1000
17:44:39 [INFO] train episode 177: reward = -58.60, steps = 1000
17:49:23 [INFO] train episode 178: reward = -47.75, steps = 1000
17:54:08 [INFO] train episode 179: reward = -13.38, steps = 1000
17:58:53 [INFO] train episode 180: reward = -20.14, steps = 1000
18:03:37 [INFO] train episode 181: reward = -37.82, steps = 1000
18:06:34 [INFO] train episode 182: reward = -144.37, steps = 621
18:11:23 [INFO] train episode 183: reward = -25.29, steps = 1000
18:16:08 [INFO] train episode 184: reward = -40.30, steps = 1000
18:21:00 [INFO] train episode 185: reward = -36.23, steps = 1000
18:24:03 [INFO] train episode 186: reward = -249.51, steps = 627
18:28:53 [INFO] train episode 187: reward = -42.21, steps = 1000
18:33:43 [INFO] train episode 188: reward = -29.54, steps = 1000
18:38:34 [INFO] train episode 189: reward = -66.94, steps = 1000
18:43:26 [INFO] train episode 190: reward = -36.37, steps = 1000
18:48:19 [INFO] train episode 191: reward = 7.51, steps = 1000
18:53:09 [INFO] train episode 192: reward = -41.29, steps = 1000
18:58:03 [INFO] train episode 193: reward = -53.87, steps = 1000
19:02:55 [INFO] train episode 194: reward = -25.27, steps = 1000
19:07:50 [INFO] train episode 195: reward = 5.41, steps = 1000
19:12:45 [INFO] train episode 196: reward = 10.69, steps = 1000
19:17:41 [INFO] train episode 197: reward = -42.86, steps = 1000
19:22:36 [INFO] train episode 198: reward = -19.69, steps = 1000
19:27:31 [INFO] train episode 199: reward = 43.78, steps = 1000
19:32:26 [INFO] train episode 200: reward = 17.24, steps = 1000
19:37:22 [INFO] train episode 201: reward = -31.67, steps = 1000
19:42:18 [INFO] train episode 202: reward = 1.99, steps = 1000
19:47:14 [INFO] train episode 203: reward = 8.05, steps = 1000
19:52:12 [INFO] train episode 204: reward = -30.82, steps = 1000
19:57:09 [INFO] train episode 205: reward = 4.38, steps = 1000
20:02:10 [INFO] train episode 206: reward = -15.40, steps = 1000
20:07:13 [INFO] train episode 207: reward = -61.17, steps = 1000
20:12:14 [INFO] train episode 208: reward = 23.43, steps = 1000
20:17:15 [INFO] train episode 209: reward = 23.01, steps = 1000
20:22:23 [INFO] train episode 210: reward = 6.76, steps = 1000
20:27:27 [INFO] train episode 211: reward = -25.18, steps = 1000
20:32:30 [INFO] train episode 212: reward = -15.10, steps = 1000
20:37:35 [INFO] train episode 213: reward = 5.44, steps = 1000
20:42:42 [INFO] train episode 214: reward = 28.07, steps = 1000
20:47:49 [INFO] train episode 215: reward = -11.50, steps = 1000
20:52:55 [INFO] train episode 216: reward = 5.54, steps = 1000
20:58:01 [INFO] train episode 217: reward = 28.85, steps = 1000
21:03:07 [INFO] train episode 218: reward = -24.30, steps = 1000
21:08:15 [INFO] train episode 219: reward = -32.33, steps = 1000
21:13:38 [INFO] train episode 220: reward = 13.72, steps = 1000
21:19:07 [INFO] train episode 221: reward = -23.18, steps = 1000
21:24:35 [INFO] train episode 222: reward = -73.13, steps = 1000
21:29:59 [INFO] train episode 223: reward = 8.28, steps = 1000
21:36:13 [INFO] train episode 224: reward = -34.46, steps = 1000
21:42:19 [INFO] train episode 225: reward = -40.52, steps = 1000
21:48:42 [INFO] train episode 226: reward = -27.43, steps = 1000
21:54:43 [INFO] train episode 227: reward = -4.24, steps = 1000
22:00:13 [INFO] train episode 228: reward = -46.48, steps = 1000
22:05:30 [INFO] train episode 229: reward = 18.70, steps = 1000
22:10:49 [INFO] train episode 230: reward = -37.42, steps = 1000
22:16:10 [INFO] train episode 231: reward = -20.73, steps = 1000
22:21:33 [INFO] train episode 232: reward = -23.47, steps = 1000
22:26:56 [INFO] train episode 233: reward = 5.32, steps = 1000
22:32:19 [INFO] train episode 234: reward = 20.45, steps = 1000
22:37:43 [INFO] train episode 235: reward = 13.88, steps = 1000
22:43:20 [INFO] train episode 236: reward = 29.19, steps = 1000
22:48:52 [INFO] train episode 237: reward = -33.77, steps = 1000
22:54:28 [INFO] train episode 238: reward = -13.70, steps = 1000
23:00:10 [INFO] train episode 239: reward = 18.16, steps = 1000
23:05:47 [INFO] train episode 240: reward = 46.52, steps = 1000
23:11:35 [INFO] train episode 241: reward = 88.73, steps = 1000
23:17:10 [INFO] train episode 242: reward = 36.74, steps = 1000
23:22:51 [INFO] train episode 243: reward = -35.77, steps = 1000
23:28:38 [INFO] train episode 244: reward = 45.93, steps = 1000
23:29:17 [INFO] train episode 245: reward = 44.37, steps = 116
23:29:48 [INFO] train episode 246: reward = -15.86, steps = 93
23:35:21 [INFO] train episode 247: reward = -15.06, steps = 1000
23:35:58 [INFO] train episode 248: reward = -67.91, steps = 109
23:36:29 [INFO] train episode 249: reward = -280.83, steps = 91
23:37:02 [INFO] train episode 250: reward = -218.63, steps = 98
23:42:34 [INFO] train episode 251: reward = -41.45, steps = 1000
23:48:07 [INFO] train episode 252: reward = -24.54, steps = 1000
23:54:01 [INFO] train episode 253: reward = -135.34, steps = 1000
23:59:45 [INFO] train episode 254: reward = -13.60, steps = 1000
00:03:28 [INFO] train episode 255: reward = -158.07, steps = 663
00:08:46 [INFO] train episode 256: reward = -222.26, steps = 939
00:14:31 [INFO] train episode 257: reward = -32.22, steps = 1000
00:17:00 [INFO] train episode 258: reward = -114.64, steps = 436
00:22:41 [INFO] train episode 259: reward = -108.70, steps = 1000
01:20:33 [INFO] train episode 260: reward = -103.78, steps = 1000
01:23:36 [INFO] train episode 261: reward = -101.21, steps = 500
01:26:43 [INFO] train episode 262: reward = 176.87, steps = 536
01:32:24 [INFO] train episode 263: reward = 70.56, steps = 1000
01:38:05 [INFO] train episode 264: reward = 53.58, steps = 1000
01:43:44 [INFO] train episode 265: reward = -50.69, steps = 1000
01:47:22 [INFO] train episode 266: reward = 203.64, steps = 638
01:53:08 [INFO] train episode 267: reward = 86.95, steps = 1000
01:58:53 [INFO] train episode 268: reward = 72.52, steps = 1000
02:04:33 [INFO] train episode 269: reward = 99.74, steps = 1000
02:09:27 [INFO] train episode 270: reward = 193.78, steps = 860
02:15:11 [INFO] train episode 271: reward = 195.16, steps = 997
02:18:53 [INFO] train episode 272: reward = 222.50, steps = 648
02:24:36 [INFO] train episode 273: reward = 96.52, steps = 1000
02:30:18 [INFO] train episode 274: reward = 79.26, steps = 1000
02:33:24 [INFO] train episode 275: reward = 179.62, steps = 541
02:37:32 [INFO] train episode 276: reward = 246.95, steps = 724
02:40:44 [INFO] train episode 277: reward = 243.33, steps = 559
02:43:09 [INFO] train episode 278: reward = 231.45, steps = 422
02:46:47 [INFO] train episode 279: reward = 193.85, steps = 629
02:50:15 [INFO] train episode 280: reward = 246.16, steps = 603
02:52:57 [INFO] train episode 281: reward = 261.41, steps = 470
02:58:49 [INFO] train episode 282: reward = 116.50, steps = 1000
03:04:39 [INFO] train episode 283: reward = 123.79, steps = 1000
03:07:48 [INFO] train episode 284: reward = 256.04, steps = 534
03:10:49 [INFO] train episode 285: reward = 228.98, steps = 511
03:16:45 [INFO] train episode 286: reward = -110.98, steps = 1000
03:20:19 [INFO] train episode 287: reward = 224.35, steps = 605
03:24:28 [INFO] train episode 288: reward = -176.85, steps = 704
03:28:04 [INFO] train episode 289: reward = 233.31, steps = 613
03:33:56 [INFO] train episode 290: reward = 130.94, steps = 1000
03:39:48 [INFO] train episode 291: reward = 231.11, steps = 996
03:43:30 [INFO] train episode 292: reward = 258.40, steps = 624
03:47:19 [INFO] train episode 293: reward = 207.03, steps = 640
03:50:15 [INFO] train episode 294: reward = 301.69, steps = 487
03:53:20 [INFO] train episode 295: reward = 243.74, steps = 522
03:55:55 [INFO] train episode 296: reward = 242.55, steps = 428
03:59:04 [INFO] train episode 297: reward = 233.71, steps = 522
04:02:41 [INFO] train episode 298: reward = 252.60, steps = 598
04:05:19 [INFO] train episode 299: reward = 266.74, steps = 438
04:07:39 [INFO] train episode 300: reward = 242.77, steps = 387
04:13:41 [INFO] train episode 301: reward = 103.89, steps = 1000
04:15:19 [INFO] train episode 302: reward = 225.41, steps = 268
04:18:22 [INFO] train episode 303: reward = 235.42, steps = 502
04:20:19 [INFO] train episode 304: reward = 262.21, steps = 320
04:23:44 [INFO] train episode 305: reward = 235.70, steps = 563
04:26:32 [INFO] train episode 306: reward = 249.91, steps = 463
04:30:55 [INFO] train episode 307: reward = 245.32, steps = 732
04:33:22 [INFO] train episode 308: reward = 210.31, steps = 410
04:36:40 [INFO] train episode 309: reward = 242.93, steps = 550
04:38:09 [INFO] train episode 310: reward = 298.26, steps = 249
04:39:35 [INFO] train episode 311: reward = 259.20, steps = 239
04:42:14 [INFO] train episode 312: reward = 242.33, steps = 441
04:44:57 [INFO] train episode 313: reward = 225.58, steps = 452
04:46:42 [INFO] train episode 314: reward = 258.81, steps = 291
04:51:45 [INFO] train episode 315: reward = 226.96, steps = 846
04:55:46 [INFO] train episode 316: reward = 272.94, steps = 671
04:59:41 [INFO] train episode 317: reward = 138.30, steps = 651
05:01:33 [INFO] train episode 318: reward = 250.34, steps = 310
05:03:13 [INFO] train episode 319: reward = 265.15, steps = 274
05:08:03 [INFO] train episode 320: reward = 223.02, steps = 799
05:13:24 [INFO] train episode 321: reward = 250.42, steps = 880
05:15:06 [INFO] train episode 322: reward = 206.46, steps = 277
05:17:36 [INFO] train episode 323: reward = 267.64, steps = 413
05:20:13 [INFO] train episode 324: reward = 237.02, steps = 427
05:21:44 [INFO] train episode 325: reward = 25.77, steps = 242
05:23:45 [INFO] train episode 326: reward = 287.57, steps = 331
05:28:12 [INFO] train episode 327: reward = 263.47, steps = 722
05:29:42 [INFO] train episode 328: reward = 255.88, steps = 243
05:31:26 [INFO] train episode 329: reward = 238.99, steps = 281
05:33:30 [INFO] train episode 330: reward = 252.40, steps = 332
05:39:42 [INFO] train episode 331: reward = 96.63, steps = 1000
05:41:12 [INFO] train episode 332: reward = 238.03, steps = 239
05:45:48 [INFO] train episode 333: reward = 250.10, steps = 737
05:48:58 [INFO] train episode 334: reward = 222.97, steps = 509
05:50:30 [INFO] train episode 335: reward = 35.42, steps = 246
05:54:17 [INFO] train episode 336: reward = 224.70, steps = 607
06:00:33 [INFO] train episode 337: reward = 126.39, steps = 1000
06:03:18 [INFO] train episode 338: reward = 194.06, steps = 438
06:05:04 [INFO] train episode 339: reward = 241.88, steps = 279
06:08:11 [INFO] train episode 340: reward = 211.99, steps = 494
06:10:02 [INFO] train episode 341: reward = 270.48, steps = 293
06:16:20 [INFO] train episode 342: reward = 133.99, steps = 1000
06:21:06 [INFO] train episode 343: reward = 224.46, steps = 767
06:22:28 [INFO] train episode 344: reward = 234.86, steps = 215
06:25:20 [INFO] train episode 345: reward = 153.00, steps = 455
06:28:05 [INFO] train episode 346: reward = 250.07, steps = 433
06:32:19 [INFO] train episode 347: reward = 192.83, steps = 647
06:34:33 [INFO] train episode 348: reward = 233.80, steps = 382
06:37:26 [INFO] train episode 349: reward = 226.76, steps = 517
06:39:48 [INFO] train episode 350: reward = 234.68, steps = 418
06:45:28 [INFO] train episode 351: reward = -345.04, steps = 1000
06:48:24 [INFO] train episode 352: reward = 268.46, steps = 516
06:52:14 [INFO] train episode 353: reward = 240.82, steps = 678
06:54:24 [INFO] train episode 354: reward = 242.19, steps = 381
07:00:01 [INFO] train episode 355: reward = 135.67, steps = 1000
07:01:23 [INFO] train episode 356: reward = 223.32, steps = 244
07:03:09 [INFO] train episode 357: reward = 289.18, steps = 321
07:08:41 [INFO] train episode 358: reward = 163.65, steps = 1000
07:14:16 [INFO] train episode 359: reward = 69.05, steps = 1000
07:18:43 [INFO] train episode 360: reward = 210.11, steps = 803
07:21:17 [INFO] train episode 361: reward = 224.41, steps = 460
07:23:05 [INFO] train episode 362: reward = 283.25, steps = 319
07:24:26 [INFO] train episode 363: reward = 270.16, steps = 238
07:30:02 [INFO] train episode 364: reward = 159.61, steps = 1000
07:32:27 [INFO] train episode 365: reward = 263.71, steps = 431
07:33:43 [INFO] train episode 366: reward = 309.44, steps = 226
07:36:01 [INFO] train episode 367: reward = 291.11, steps = 406
07:38:46 [INFO] train episode 368: reward = 256.37, steps = 483
07:41:53 [INFO] train episode 369: reward = 218.04, steps = 543
07:43:46 [INFO] train episode 370: reward = 282.23, steps = 337
07:43:47 [INFO] ==== test ====
07:43:47 [INFO] test episode 0: reward = 279.49, steps = 209
07:43:47 [INFO] test episode 1: reward = 10.49, steps = 228
07:43:48 [INFO] test episode 2: reward = 284.63, steps = 218
07:43:48 [INFO] test episode 3: reward = 241.75, steps = 253
07:43:49 [INFO] test episode 4: reward = 2.62, steps = 228
07:43:49 [INFO] test episode 5: reward = 286.19, steps = 208
07:43:49 [INFO] test episode 6: reward = 303.96, steps = 220
07:43:50 [INFO] test episode 7: reward = 256.15, steps = 244
07:43:50 [INFO] test episode 8: reward = 243.53, steps = 285
07:43:51 [INFO] test episode 9: reward = 252.03, steps = 273
07:43:51 [INFO] test episode 10: reward = 303.65, steps = 221
07:43:52 [INFO] test episode 11: reward = -209.82, steps = 417
07:43:52 [INFO] test episode 12: reward = 270.10, steps = 205
07:43:53 [INFO] test episode 13: reward = 220.75, steps = 261
07:43:53 [INFO] test episode 14: reward = 279.98, steps = 243
07:43:55 [INFO] test episode 15: reward = -145.12, steps = 990
07:43:55 [INFO] test episode 16: reward = -18.38, steps = 223
07:43:56 [INFO] test episode 17: reward = 259.68, steps = 226
07:43:57 [INFO] test episode 18: reward = 254.15, steps = 319
07:43:57 [INFO] test episode 19: reward = 239.07, steps = 246
07:43:57 [INFO] test episode 20: reward = 304.38, steps = 239
07:43:58 [INFO] test episode 21: reward = 239.00, steps = 229
07:43:58 [INFO] test episode 22: reward = 253.46, steps = 261
07:43:59 [INFO] test episode 23: reward = 252.11, steps = 218
07:43:59 [INFO] test episode 24: reward = 17.99, steps = 192
07:43:59 [INFO] test episode 25: reward = 2.82, steps = 203
07:44:00 [INFO] test episode 26: reward = 273.49, steps = 203
07:44:00 [INFO] test episode 27: reward = 298.55, steps = 246
07:44:00 [INFO] test episode 28: reward = 303.20, steps = 193
07:44:01 [INFO] test episode 29: reward = -71.84, steps = 525
07:44:02 [INFO] test episode 30: reward = 233.59, steps = 240
07:44:02 [INFO] test episode 31: reward = 222.26, steps = 299
07:44:03 [INFO] test episode 32: reward = 235.82, steps = 405
07:44:03 [INFO] test episode 33: reward = 262.32, steps = 217
07:44:04 [INFO] test episode 34: reward = 234.88, steps = 230
07:44:04 [INFO] test episode 35: reward = 277.83, steps = 228
07:44:05 [INFO] test episode 36: reward = 233.24, steps = 238
07:44:05 [INFO] test episode 37: reward = 250.93, steps = 219
07:44:05 [INFO] test episode 38: reward = -2.23, steps = 229
07:44:06 [INFO] test episode 39: reward = 258.36, steps = 396
07:44:07 [INFO] test episode 40: reward = 249.10, steps = 211
07:44:07 [INFO] test episode 41: reward = 252.49, steps = 286
07:44:07 [INFO] test episode 42: reward = 270.75, steps = 208
07:44:08 [INFO] test episode 43: reward = 277.22, steps = 265
07:44:08 [INFO] test episode 44: reward = -13.13, steps = 180
07:44:09 [INFO] test episode 45: reward = 237.96, steps = 271
07:44:09 [INFO] test episode 46: reward = 235.87, steps = 206
07:44:10 [INFO] test episode 47: reward = 240.49, steps = 296
07:44:10 [INFO] test episode 48: reward = 296.09, steps = 207
07:44:10 [INFO] test episode 49: reward = 259.75, steps = 216
07:44:11 [INFO] test episode 50: reward = 232.70, steps = 389
07:44:12 [INFO] test episode 51: reward = 241.77, steps = 284
07:44:12 [INFO] test episode 52: reward = 286.27, steps = 209
07:44:12 [INFO] test episode 53: reward = 279.07, steps = 253
07:44:13 [INFO] test episode 54: reward = 243.97, steps = 226
07:44:13 [INFO] test episode 55: reward = 228.24, steps = 233
07:44:14 [INFO] test episode 56: reward = 285.50, steps = 232
07:44:14 [INFO] test episode 57: reward = 254.46, steps = 316
07:44:15 [INFO] test episode 58: reward = 261.65, steps = 261
07:44:15 [INFO] test episode 59: reward = 266.15, steps = 229
07:44:15 [INFO] test episode 60: reward = 255.58, steps = 232
07:44:16 [INFO] test episode 61: reward = 262.40, steps = 230
07:44:16 [INFO] test episode 62: reward = 226.36, steps = 235
07:44:17 [INFO] test episode 63: reward = 278.25, steps = 205
07:44:17 [INFO] test episode 64: reward = 257.12, steps = 217
07:44:17 [INFO] test episode 65: reward = -20.50, steps = 227
07:44:18 [INFO] test episode 66: reward = 314.60, steps = 220
07:44:19 [INFO] test episode 67: reward = 191.99, steps = 404
07:44:19 [INFO] test episode 68: reward = 257.35, steps = 217
07:44:19 [INFO] test episode 69: reward = 268.04, steps = 210
07:44:20 [INFO] test episode 70: reward = 231.78, steps = 247
07:44:20 [INFO] test episode 71: reward = 236.76, steps = 213
07:44:21 [INFO] test episode 72: reward = 292.87, steps = 219
07:44:21 [INFO] test episode 73: reward = -27.42, steps = 264
07:44:22 [INFO] test episode 74: reward = 267.78, steps = 250
07:44:22 [INFO] test episode 75: reward = 251.07, steps = 230
07:44:22 [INFO] test episode 76: reward = 241.02, steps = 234
07:44:23 [INFO] test episode 77: reward = 274.73, steps = 241
07:44:23 [INFO] test episode 78: reward = 266.51, steps = 210
07:44:23 [INFO] test episode 79: reward = 264.22, steps = 232
07:44:24 [INFO] test episode 80: reward = 274.58, steps = 248
07:44:24 [INFO] test episode 81: reward = 16.07, steps = 201
07:44:25 [INFO] test episode 82: reward = 243.79, steps = 284
07:44:25 [INFO] test episode 83: reward = 293.47, steps = 250
07:44:25 [INFO] test episode 84: reward = -11.63, steps = 206
07:44:26 [INFO] test episode 85: reward = 301.89, steps = 207
07:44:26 [INFO] test episode 86: reward = 259.84, steps = 236
07:44:27 [INFO] test episode 87: reward = 261.76, steps = 234
07:44:27 [INFO] test episode 88: reward = -6.52, steps = 209
07:44:27 [INFO] test episode 89: reward = -11.69, steps = 244
07:44:28 [INFO] test episode 90: reward = 239.15, steps = 213
07:44:28 [INFO] test episode 91: reward = 309.71, steps = 195
07:44:28 [INFO] test episode 92: reward = 286.70, steps = 233
07:44:29 [INFO] test episode 93: reward = 244.88, steps = 226
07:44:29 [INFO] test episode 94: reward = 272.46, steps = 266
07:44:30 [INFO] test episode 95: reward = 252.75, steps = 304
07:44:30 [INFO] test episode 96: reward = 241.59, steps = 264
07:44:31 [INFO] test episode 97: reward = 227.30, steps = 297
07:44:32 [INFO] test episode 98: reward = 165.97, steps = 413
07:44:32 [INFO] test episode 99: reward = 293.29, steps = 237
07:44:32 [INFO] average episode reward = 213.27 ± 111.71