Use PPO to Play Acrobot-v1¶

TensorFlow version

In [1]:
%matplotlib inline

import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import pandas as pd
import scipy.signal as signal
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow import losses
from tensorflow.keras import layers

logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
env = gym.make('Acrobot-v1')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
03:33:25 [INFO] env: <AcrobotEnv<Acrobot-v1>>
03:33:25 [INFO] action_space: Discrete(3)
03:33:25 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32)
03:33:25 [INFO] reward_range: (-inf, inf)
03:33:25 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15}
03:33:25 [INFO] _max_episode_steps: 500
03:33:25 [INFO] _elapsed_steps: None
03:33:25 [INFO] id: Acrobot-v1
03:33:25 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv
03:33:25 [INFO] reward_threshold: -100.0
03:33:25 [INFO] nondeterministic: False
03:33:25 [INFO] max_episode_steps: 500
03:33:25 [INFO] _kwargs: {}
03:33:25 [INFO] _env_name: Acrobot
In [3]:
class PPOReplayer:
    def __init__(self):
        self.fields = ['state', 'action', 'prob', 'advantage', 'return']
        self.memory = pd.DataFrame(columns=self.fields)

    def store(self, df):
        if self.memory.empty:
            self.memory = df[self.fields]
        else:
            self.memory = pd.concat([self.memory, df[self.fields]], ignore_index=True)

    def sample(self, size):
        indices = np.random.choice(self.memory.shape[0], size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.fields)
In [4]:
class PPOAgent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.gamma = 0.99

        self.replayer = PPOReplayer()

        self.actor_net = self.build_net(hidden_sizes=[100,],
                output_size=self.action_n, output_activation=nn.softmax,
                learning_rate=0.001)
        self.critic_net = self.build_net(hidden_sizes=[100,],
                learning_rate=0.002)

    def build_net(self, input_size=None, hidden_sizes=None, output_size=1,
                activation=nn.relu, output_activation=None,
                loss=losses.mse, learning_rate=0.001):
        model = keras.Sequential()
        for hidden_size in hidden_sizes:
            model.add(layers.Dense(units=hidden_size,
                    activation=activation))
        model.add(layers.Dense(units=output_size,
                activation=output_activation))
        optimizer = optimizers.Adam(learning_rate)
        model.compile(optimizer=optimizer, loss=loss)
        return model

    def reset(self, mode=None):
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        probs = self.actor_net.predict(observation[np.newaxis], verbose=0)[0]
        action = np.random.choice(self.action_n, p=probs)
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
        return action

    def close(self):
        if self.mode == 'train':
            self.save_trajectory_to_replayer()
            if len(self.replayer.memory) >= 1000:
                for batch in range(5):  # learn multiple times
                    self.learn()
                self.replayer = PPOReplayer()
                        # reset replayer after the agent changes itself

    def save_trajectory_to_replayer(self):
        df = pd.DataFrame(
                np.array(self.trajectory, dtype=object).reshape(-1, 4),
                columns=['state', 'reward', 'terminated', 'action'], dtype=object)
        states = np.stack(df['state'])
        df['v'] = self.critic_net.predict(states, verbose=0)
        pis = self.actor_net.predict(states, verbose=0)
        df['prob'] = [pi[action] for pi, action in zip(pis, df['action'])]
        df['next_v'] = df['v'].shift(-1).fillna(0.)
        df['u'] = df['reward'] + self.gamma * df['next_v']
        df['delta'] = df['u'] - df['v']
        df['advantage'] = signal.lfilter([1.,], [1., -self.gamma],
                df['delta'][::-1])[::-1]
        df['return'] = signal.lfilter([1.,], [1., -self.gamma],
                df['reward'][::-1])[::-1]
        self.replayer.store(df)

    def learn(self):
        states, actions, old_pis, advantages, returns = \
                self.replayer.sample(size=64)
        state_tensor = tf.convert_to_tensor(states, dtype=tf.float32)
        action_tensor = tf.convert_to_tensor(actions, dtype=tf.int32)
        old_pi_tensor = tf.convert_to_tensor(old_pis, dtype=tf.float32)
        advantage_tensor = tf.convert_to_tensor(advantages, dtype=tf.float32)

        # update actor
        with tf.GradientTape() as tape:
            all_pi_tensor = self.actor_net(state_tensor)
            pi_tensor = tf.gather(all_pi_tensor, action_tensor, batch_dims=1)
            surrogate_advantage_tensor = (pi_tensor / old_pi_tensor) * \
                    advantage_tensor
            clip_times_advantage_tensor = 0.1 * surrogate_advantage_tensor
            max_surrogate_advantage_tensor = advantage_tensor + \
                    tf.where(advantage_tensor > 0.,
                    clip_times_advantage_tensor, -clip_times_advantage_tensor)
            clipped_surrogate_advantage_tensor = tf.minimum(
                    surrogate_advantage_tensor, max_surrogate_advantage_tensor)
            loss_tensor = -tf.reduce_mean(clipped_surrogate_advantage_tensor)
        actor_grads = tape.gradient(loss_tensor, self.actor_net.variables)
        self.actor_net.optimizer.apply_gradients(
                zip(actor_grads, self.actor_net.variables))

        # update critic
        self.critic_net.fit(states, returns, verbose=0)


agent = PPOAgent(env)
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps


logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
            mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > -120:
        break
plt.plot(episode_rewards)


logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
03:33:26 [INFO] ==== train ====
03:34:14 [INFO] train episode 0: reward = -500.00, steps = 500
03:34:14 [INFO] NumExpr defaulting to 8 threads.
03:35:03 [INFO] train episode 1: reward = -500.00, steps = 500
03:35:52 [INFO] train episode 2: reward = -500.00, steps = 500
03:36:40 [INFO] train episode 3: reward = -500.00, steps = 500
03:37:27 [INFO] train episode 4: reward = -485.00, steps = 486
03:38:14 [INFO] train episode 5: reward = -500.00, steps = 500
03:38:45 [INFO] train episode 6: reward = -339.00, steps = 340
03:39:27 [INFO] train episode 7: reward = -448.00, steps = 449
03:39:54 [INFO] train episode 8: reward = -292.00, steps = 293
03:40:29 [INFO] train episode 9: reward = -383.00, steps = 384
03:41:16 [INFO] train episode 10: reward = -500.00, steps = 500
03:41:42 [INFO] train episode 11: reward = -283.00, steps = 284
03:42:05 [INFO] train episode 12: reward = -254.00, steps = 255
03:42:30 [INFO] train episode 13: reward = -267.00, steps = 268
03:43:16 [INFO] train episode 14: reward = -500.00, steps = 500
03:44:01 [INFO] train episode 15: reward = -500.00, steps = 500
03:44:47 [INFO] train episode 16: reward = -500.00, steps = 500
03:45:31 [INFO] train episode 17: reward = -500.00, steps = 500
03:46:17 [INFO] train episode 18: reward = -500.00, steps = 500
03:47:03 [INFO] train episode 19: reward = -500.00, steps = 500
03:47:50 [INFO] train episode 20: reward = -500.00, steps = 500
03:48:36 [INFO] train episode 21: reward = -500.00, steps = 500
03:49:23 [INFO] train episode 22: reward = -500.00, steps = 500
03:50:09 [INFO] train episode 23: reward = -500.00, steps = 500
03:50:45 [INFO] train episode 24: reward = -390.00, steps = 391
03:51:31 [INFO] train episode 25: reward = -500.00, steps = 500
03:52:15 [INFO] train episode 26: reward = -487.00, steps = 488
03:53:02 [INFO] train episode 27: reward = -500.00, steps = 500
03:53:48 [INFO] train episode 28: reward = -500.00, steps = 500
03:54:33 [INFO] train episode 29: reward = -500.00, steps = 500
03:55:18 [INFO] train episode 30: reward = -500.00, steps = 500
03:56:04 [INFO] train episode 31: reward = -500.00, steps = 500
03:56:40 [INFO] train episode 32: reward = -398.00, steps = 399
03:57:16 [INFO] train episode 33: reward = -392.00, steps = 393
03:58:01 [INFO] train episode 34: reward = -500.00, steps = 500
03:58:46 [INFO] train episode 35: reward = -500.00, steps = 500
03:59:32 [INFO] train episode 36: reward = -492.00, steps = 493
04:00:17 [INFO] train episode 37: reward = -500.00, steps = 500
04:01:03 [INFO] train episode 38: reward = -500.00, steps = 500
04:01:49 [INFO] train episode 39: reward = -500.00, steps = 500
04:02:33 [INFO] train episode 40: reward = -500.00, steps = 500
04:03:01 [INFO] train episode 41: reward = -302.00, steps = 303
04:03:26 [INFO] train episode 42: reward = -266.00, steps = 267
04:03:59 [INFO] train episode 43: reward = -357.00, steps = 358
04:04:37 [INFO] train episode 44: reward = -416.00, steps = 417
04:05:17 [INFO] train episode 45: reward = -423.00, steps = 424
04:06:03 [INFO] train episode 46: reward = -500.00, steps = 500
04:06:48 [INFO] train episode 47: reward = -500.00, steps = 500
04:07:35 [INFO] train episode 48: reward = -500.00, steps = 500
04:08:20 [INFO] train episode 49: reward = -500.00, steps = 500
04:09:07 [INFO] train episode 50: reward = -500.00, steps = 500
04:09:53 [INFO] train episode 51: reward = -500.00, steps = 500
04:10:40 [INFO] train episode 52: reward = -500.00, steps = 500
04:11:26 [INFO] train episode 53: reward = -500.00, steps = 500
04:12:13 [INFO] train episode 54: reward = -500.00, steps = 500
04:13:00 [INFO] train episode 55: reward = -500.00, steps = 500
04:13:47 [INFO] train episode 56: reward = -500.00, steps = 500
04:14:33 [INFO] train episode 57: reward = -500.00, steps = 500
04:15:20 [INFO] train episode 58: reward = -500.00, steps = 500
04:16:05 [INFO] train episode 59: reward = -500.00, steps = 500
04:16:51 [INFO] train episode 60: reward = -500.00, steps = 500
04:17:37 [INFO] train episode 61: reward = -500.00, steps = 500
04:18:24 [INFO] train episode 62: reward = -500.00, steps = 500
04:19:10 [INFO] train episode 63: reward = -500.00, steps = 500
04:19:55 [INFO] train episode 64: reward = -500.00, steps = 500
04:20:41 [INFO] train episode 65: reward = -500.00, steps = 500
04:21:27 [INFO] train episode 66: reward = -500.00, steps = 500
04:22:13 [INFO] train episode 67: reward = -500.00, steps = 500
04:23:00 [INFO] train episode 68: reward = -500.00, steps = 500
04:23:45 [INFO] train episode 69: reward = -500.00, steps = 500
04:24:31 [INFO] train episode 70: reward = -500.00, steps = 500
04:25:15 [INFO] train episode 71: reward = -479.00, steps = 480
04:26:00 [INFO] train episode 72: reward = -500.00, steps = 500
04:26:46 [INFO] train episode 73: reward = -500.00, steps = 500
04:27:32 [INFO] train episode 74: reward = -500.00, steps = 500
04:28:18 [INFO] train episode 75: reward = -500.00, steps = 500
04:29:04 [INFO] train episode 76: reward = -500.00, steps = 500
04:29:51 [INFO] train episode 77: reward = -500.00, steps = 500
04:30:37 [INFO] train episode 78: reward = -500.00, steps = 500
04:31:22 [INFO] train episode 79: reward = -500.00, steps = 500
04:32:08 [INFO] train episode 80: reward = -500.00, steps = 500
04:32:55 [INFO] train episode 81: reward = -500.00, steps = 500
04:33:41 [INFO] train episode 82: reward = -500.00, steps = 500
04:34:27 [INFO] train episode 83: reward = -500.00, steps = 500
04:35:14 [INFO] train episode 84: reward = -500.00, steps = 500
04:36:00 [INFO] train episode 85: reward = -500.00, steps = 500
04:36:47 [INFO] train episode 86: reward = -500.00, steps = 500
04:37:33 [INFO] train episode 87: reward = -500.00, steps = 500
04:38:24 [INFO] train episode 88: reward = -500.00, steps = 500
04:39:09 [INFO] train episode 89: reward = -474.00, steps = 475
04:39:55 [INFO] train episode 90: reward = -500.00, steps = 500
04:40:39 [INFO] train episode 91: reward = -480.00, steps = 481
04:41:27 [INFO] train episode 92: reward = -500.00, steps = 500
04:42:13 [INFO] train episode 93: reward = -500.00, steps = 500
04:43:00 [INFO] train episode 94: reward = -500.00, steps = 500
04:43:46 [INFO] train episode 95: reward = -500.00, steps = 500
04:44:33 [INFO] train episode 96: reward = -500.00, steps = 500
04:45:19 [INFO] train episode 97: reward = -500.00, steps = 500
04:45:55 [INFO] train episode 98: reward = -382.00, steps = 383
04:46:41 [INFO] train episode 99: reward = -500.00, steps = 500
04:47:27 [INFO] train episode 100: reward = -500.00, steps = 500
04:48:14 [INFO] train episode 101: reward = -500.00, steps = 500
04:48:42 [INFO] train episode 102: reward = -298.00, steps = 299
04:49:25 [INFO] train episode 103: reward = -467.00, steps = 468
04:50:13 [INFO] train episode 104: reward = -500.00, steps = 500
04:51:03 [INFO] train episode 105: reward = -500.00, steps = 500
04:51:50 [INFO] train episode 106: reward = -500.00, steps = 500
04:52:36 [INFO] train episode 107: reward = -500.00, steps = 500
04:53:23 [INFO] train episode 108: reward = -500.00, steps = 500
04:54:09 [INFO] train episode 109: reward = -500.00, steps = 500
04:54:47 [INFO] train episode 110: reward = -393.00, steps = 394
04:55:33 [INFO] train episode 111: reward = -500.00, steps = 500
04:56:17 [INFO] train episode 112: reward = -500.00, steps = 500
04:57:03 [INFO] train episode 113: reward = -500.00, steps = 500
04:57:49 [INFO] train episode 114: reward = -500.00, steps = 500
04:58:36 [INFO] train episode 115: reward = -500.00, steps = 500
04:59:22 [INFO] train episode 116: reward = -500.00, steps = 500
05:00:07 [INFO] train episode 117: reward = -500.00, steps = 500
05:00:53 [INFO] train episode 118: reward = -500.00, steps = 500
05:01:38 [INFO] train episode 119: reward = -500.00, steps = 500
05:02:17 [INFO] train episode 120: reward = -417.00, steps = 418
05:03:02 [INFO] train episode 121: reward = -500.00, steps = 500
05:03:47 [INFO] train episode 122: reward = -500.00, steps = 500
05:04:31 [INFO] train episode 123: reward = -493.00, steps = 494
05:05:14 [INFO] train episode 124: reward = -480.00, steps = 481
05:05:38 [INFO] train episode 125: reward = -254.00, steps = 255
05:06:11 [INFO] train episode 126: reward = -369.00, steps = 370
05:06:44 [INFO] train episode 127: reward = -371.00, steps = 372
05:07:30 [INFO] train episode 128: reward = -496.00, steps = 497
05:07:58 [INFO] train episode 129: reward = -316.00, steps = 317
05:08:28 [INFO] train episode 130: reward = -324.00, steps = 325
05:09:03 [INFO] train episode 131: reward = -388.00, steps = 389
05:09:41 [INFO] train episode 132: reward = -418.00, steps = 419
05:10:14 [INFO] train episode 133: reward = -359.00, steps = 360
05:10:59 [INFO] train episode 134: reward = -500.00, steps = 500
05:11:47 [INFO] train episode 135: reward = -500.00, steps = 500
05:12:21 [INFO] train episode 136: reward = -365.00, steps = 366
05:12:43 [INFO] train episode 137: reward = -238.00, steps = 239
05:13:13 [INFO] train episode 138: reward = -332.00, steps = 333
05:13:58 [INFO] train episode 139: reward = -500.00, steps = 500
05:14:35 [INFO] train episode 140: reward = -401.00, steps = 402
05:15:02 [INFO] train episode 141: reward = -305.00, steps = 306
05:15:22 [INFO] train episode 142: reward = -225.00, steps = 226
05:15:41 [INFO] train episode 143: reward = -227.00, steps = 228
05:16:08 [INFO] train episode 144: reward = -300.00, steps = 301
05:16:35 [INFO] train episode 145: reward = -308.00, steps = 309
05:17:07 [INFO] train episode 146: reward = -365.00, steps = 366
05:17:31 [INFO] train episode 147: reward = -267.00, steps = 268
05:18:00 [INFO] train episode 148: reward = -319.00, steps = 320
05:18:27 [INFO] train episode 149: reward = -312.00, steps = 313
05:18:40 [INFO] train episode 150: reward = -150.00, steps = 151
05:19:22 [INFO] train episode 151: reward = -472.00, steps = 473
05:19:43 [INFO] train episode 152: reward = -227.00, steps = 228
05:20:09 [INFO] train episode 153: reward = -298.00, steps = 299
05:20:27 [INFO] train episode 154: reward = -211.00, steps = 212
05:20:49 [INFO] train episode 155: reward = -244.00, steps = 245
05:21:09 [INFO] train episode 156: reward = -238.00, steps = 239
05:21:32 [INFO] train episode 157: reward = -255.00, steps = 256
05:21:51 [INFO] train episode 158: reward = -220.00, steps = 221
05:22:21 [INFO] train episode 159: reward = -343.00, steps = 344
05:22:35 [INFO] train episode 160: reward = -161.00, steps = 162
05:22:54 [INFO] train episode 161: reward = -210.00, steps = 211
05:23:20 [INFO] train episode 162: reward = -296.00, steps = 297
05:23:38 [INFO] train episode 163: reward = -196.00, steps = 197
05:23:58 [INFO] train episode 164: reward = -235.00, steps = 236
05:24:24 [INFO] train episode 165: reward = -292.00, steps = 293
05:24:41 [INFO] train episode 166: reward = -191.00, steps = 192
05:25:18 [INFO] train episode 167: reward = -415.00, steps = 416
05:25:36 [INFO] train episode 168: reward = -209.00, steps = 210
05:26:05 [INFO] train episode 169: reward = -330.00, steps = 331
05:26:21 [INFO] train episode 170: reward = -173.00, steps = 174
05:26:37 [INFO] train episode 171: reward = -189.00, steps = 190
05:26:50 [INFO] train episode 172: reward = -142.00, steps = 143
05:27:10 [INFO] train episode 173: reward = -234.00, steps = 235
05:27:28 [INFO] train episode 174: reward = -197.00, steps = 198
05:27:43 [INFO] train episode 175: reward = -180.00, steps = 181
05:28:09 [INFO] train episode 176: reward = -284.00, steps = 285
05:28:36 [INFO] train episode 177: reward = -314.00, steps = 315
05:28:53 [INFO] train episode 178: reward = -190.00, steps = 191
05:29:11 [INFO] train episode 179: reward = -209.00, steps = 210
05:29:29 [INFO] train episode 180: reward = -202.00, steps = 203
05:29:41 [INFO] train episode 181: reward = -136.00, steps = 137
05:29:54 [INFO] train episode 182: reward = -142.00, steps = 143
05:30:16 [INFO] train episode 183: reward = -261.00, steps = 262
05:30:34 [INFO] train episode 184: reward = -195.00, steps = 196
05:30:49 [INFO] train episode 185: reward = -170.00, steps = 171
05:31:04 [INFO] train episode 186: reward = -161.00, steps = 162
05:31:21 [INFO] train episode 187: reward = -200.00, steps = 201
05:31:33 [INFO] train episode 188: reward = -130.00, steps = 131
05:31:45 [INFO] train episode 189: reward = -133.00, steps = 134
05:32:09 [INFO] train episode 190: reward = -275.00, steps = 276
05:32:25 [INFO] train episode 191: reward = -183.00, steps = 184
05:32:44 [INFO] train episode 192: reward = -204.00, steps = 205
05:33:02 [INFO] train episode 193: reward = -204.00, steps = 205
05:33:24 [INFO] train episode 194: reward = -255.00, steps = 256
05:33:38 [INFO] train episode 195: reward = -154.00, steps = 155
05:33:55 [INFO] train episode 196: reward = -189.00, steps = 190
05:34:09 [INFO] train episode 197: reward = -156.00, steps = 157
05:34:21 [INFO] train episode 198: reward = -135.00, steps = 136
05:34:32 [INFO] train episode 199: reward = -118.00, steps = 119
05:34:50 [INFO] train episode 200: reward = -196.00, steps = 197
05:35:06 [INFO] train episode 201: reward = -175.00, steps = 176
05:35:23 [INFO] train episode 202: reward = -202.00, steps = 203
05:35:36 [INFO] train episode 203: reward = -144.00, steps = 145
05:35:52 [INFO] train episode 204: reward = -172.00, steps = 173
05:36:05 [INFO] train episode 205: reward = -135.00, steps = 136
05:36:17 [INFO] train episode 206: reward = -145.00, steps = 146
05:36:30 [INFO] train episode 207: reward = -147.00, steps = 148
05:36:42 [INFO] train episode 208: reward = -122.00, steps = 123
05:36:58 [INFO] train episode 209: reward = -176.00, steps = 177
05:37:10 [INFO] train episode 210: reward = -136.00, steps = 137
05:37:23 [INFO] train episode 211: reward = -134.00, steps = 135
05:37:36 [INFO] train episode 212: reward = -140.00, steps = 141
05:37:50 [INFO] train episode 213: reward = -144.00, steps = 145
05:38:11 [INFO] train episode 214: reward = -238.00, steps = 239
05:38:31 [INFO] train episode 215: reward = -218.00, steps = 219
05:38:50 [INFO] train episode 216: reward = -211.00, steps = 212
05:39:05 [INFO] train episode 217: reward = -155.00, steps = 156
05:39:14 [INFO] train episode 218: reward = -112.00, steps = 113
05:39:27 [INFO] train episode 219: reward = -144.00, steps = 145
05:39:59 [INFO] train episode 220: reward = -364.00, steps = 365
05:40:13 [INFO] train episode 221: reward = -165.00, steps = 166
05:40:27 [INFO] train episode 222: reward = -153.00, steps = 154
05:40:38 [INFO] train episode 223: reward = -124.00, steps = 125
05:40:53 [INFO] train episode 224: reward = -169.00, steps = 170
05:41:03 [INFO] train episode 225: reward = -110.00, steps = 111
05:41:19 [INFO] train episode 226: reward = -179.00, steps = 180
05:41:34 [INFO] train episode 227: reward = -164.00, steps = 165
05:41:43 [INFO] train episode 228: reward = -103.00, steps = 104
05:41:53 [INFO] train episode 229: reward = -109.00, steps = 110
05:42:07 [INFO] train episode 230: reward = -160.00, steps = 161
05:42:25 [INFO] train episode 231: reward = -198.00, steps = 199
05:42:36 [INFO] train episode 232: reward = -114.00, steps = 115
05:42:53 [INFO] train episode 233: reward = -189.00, steps = 190
05:43:05 [INFO] train episode 234: reward = -131.00, steps = 132
05:43:18 [INFO] train episode 235: reward = -154.00, steps = 155
05:43:33 [INFO] train episode 236: reward = -161.00, steps = 162
05:43:46 [INFO] train episode 237: reward = -139.00, steps = 140
05:43:59 [INFO] train episode 238: reward = -138.00, steps = 139
05:44:13 [INFO] train episode 239: reward = -164.00, steps = 165
05:44:26 [INFO] train episode 240: reward = -141.00, steps = 142
05:44:42 [INFO] train episode 241: reward = -182.00, steps = 183
05:44:56 [INFO] train episode 242: reward = -161.00, steps = 162
05:45:07 [INFO] train episode 243: reward = -120.00, steps = 121
05:45:17 [INFO] train episode 244: reward = -113.00, steps = 114
05:45:29 [INFO] train episode 245: reward = -129.00, steps = 130
05:45:41 [INFO] train episode 246: reward = -142.00, steps = 143
05:45:53 [INFO] train episode 247: reward = -135.00, steps = 136
05:46:04 [INFO] train episode 248: reward = -120.00, steps = 121
05:46:16 [INFO] train episode 249: reward = -138.00, steps = 139
05:46:31 [INFO] train episode 250: reward = -171.00, steps = 172
05:46:46 [INFO] train episode 251: reward = -162.00, steps = 163
05:47:03 [INFO] train episode 252: reward = -188.00, steps = 189
05:47:14 [INFO] train episode 253: reward = -124.00, steps = 125
05:47:33 [INFO] train episode 254: reward = -210.00, steps = 211
05:48:17 [INFO] train episode 255: reward = -500.00, steps = 500
05:48:32 [INFO] train episode 256: reward = -166.00, steps = 167
05:48:42 [INFO] train episode 257: reward = -111.00, steps = 112
05:48:55 [INFO] train episode 258: reward = -145.00, steps = 146
05:49:07 [INFO] train episode 259: reward = -141.00, steps = 142
05:49:19 [INFO] train episode 260: reward = -133.00, steps = 134
05:49:36 [INFO] train episode 261: reward = -187.00, steps = 188
05:49:48 [INFO] train episode 262: reward = -140.00, steps = 141
05:49:59 [INFO] train episode 263: reward = -118.00, steps = 119
05:50:12 [INFO] train episode 264: reward = -141.00, steps = 142
05:50:23 [INFO] train episode 265: reward = -119.00, steps = 120
05:50:39 [INFO] train episode 266: reward = -184.00, steps = 185
05:50:50 [INFO] train episode 267: reward = -122.00, steps = 123
05:51:18 [INFO] train episode 268: reward = -324.00, steps = 325
05:51:31 [INFO] train episode 269: reward = -146.00, steps = 147
05:51:43 [INFO] train episode 270: reward = -133.00, steps = 134
05:51:55 [INFO] train episode 271: reward = -139.00, steps = 140
05:52:09 [INFO] train episode 272: reward = -155.00, steps = 156
05:52:21 [INFO] train episode 273: reward = -130.00, steps = 131
05:52:33 [INFO] train episode 274: reward = -136.00, steps = 137
05:52:47 [INFO] train episode 275: reward = -158.00, steps = 159
05:52:57 [INFO] train episode 276: reward = -107.00, steps = 108
05:53:08 [INFO] train episode 277: reward = -134.00, steps = 135
05:53:21 [INFO] train episode 278: reward = -139.00, steps = 140
05:53:37 [INFO] train episode 279: reward = -187.00, steps = 188
05:53:50 [INFO] train episode 280: reward = -143.00, steps = 144
05:54:02 [INFO] train episode 281: reward = -141.00, steps = 142
05:54:13 [INFO] train episode 282: reward = -120.00, steps = 121
05:54:25 [INFO] train episode 283: reward = -132.00, steps = 133
05:54:38 [INFO] train episode 284: reward = -135.00, steps = 136
05:54:52 [INFO] train episode 285: reward = -160.00, steps = 161
05:55:05 [INFO] train episode 286: reward = -150.00, steps = 151
05:55:18 [INFO] train episode 287: reward = -148.00, steps = 149
05:55:32 [INFO] train episode 288: reward = -151.00, steps = 152
05:55:48 [INFO] train episode 289: reward = -180.00, steps = 181
05:56:01 [INFO] train episode 290: reward = -139.00, steps = 140
05:56:15 [INFO] train episode 291: reward = -147.00, steps = 148
05:56:24 [INFO] train episode 292: reward = -111.00, steps = 112
05:56:34 [INFO] train episode 293: reward = -109.00, steps = 110
05:56:48 [INFO] train episode 294: reward = -151.00, steps = 152
05:57:01 [INFO] train episode 295: reward = -154.00, steps = 155
05:57:22 [INFO] train episode 296: reward = -228.00, steps = 229
05:57:37 [INFO] train episode 297: reward = -165.00, steps = 166
05:57:53 [INFO] train episode 298: reward = -179.00, steps = 180
05:58:08 [INFO] train episode 299: reward = -174.00, steps = 175
05:58:17 [INFO] train episode 300: reward = -96.00, steps = 97
05:58:36 [INFO] train episode 301: reward = -211.00, steps = 212
05:58:49 [INFO] train episode 302: reward = -154.00, steps = 155
05:59:02 [INFO] train episode 303: reward = -145.00, steps = 146
05:59:14 [INFO] train episode 304: reward = -129.00, steps = 130
05:59:26 [INFO] train episode 305: reward = -129.00, steps = 130
05:59:41 [INFO] train episode 306: reward = -168.00, steps = 169
05:59:52 [INFO] train episode 307: reward = -123.00, steps = 124
06:00:05 [INFO] train episode 308: reward = -135.00, steps = 136
06:00:43 [INFO] train episode 309: reward = -429.00, steps = 430
06:00:52 [INFO] train episode 310: reward = -98.00, steps = 99
06:01:05 [INFO] train episode 311: reward = -136.00, steps = 137
06:01:17 [INFO] train episode 312: reward = -131.00, steps = 132
06:01:32 [INFO] train episode 313: reward = -170.00, steps = 171
06:01:46 [INFO] train episode 314: reward = -151.00, steps = 152
06:02:00 [INFO] train episode 315: reward = -167.00, steps = 168
06:02:15 [INFO] train episode 316: reward = -165.00, steps = 166
06:02:29 [INFO] train episode 317: reward = -150.00, steps = 151
06:02:40 [INFO] train episode 318: reward = -124.00, steps = 125
06:02:56 [INFO] train episode 319: reward = -175.00, steps = 176
06:03:07 [INFO] train episode 320: reward = -128.00, steps = 129
06:03:21 [INFO] train episode 321: reward = -160.00, steps = 161
06:03:31 [INFO] train episode 322: reward = -111.00, steps = 112
06:03:41 [INFO] train episode 323: reward = -109.00, steps = 110
06:03:56 [INFO] train episode 324: reward = -165.00, steps = 166
06:04:10 [INFO] train episode 325: reward = -150.00, steps = 151
06:04:20 [INFO] train episode 326: reward = -105.00, steps = 106
06:04:32 [INFO] train episode 327: reward = -136.00, steps = 137
06:04:42 [INFO] train episode 328: reward = -118.00, steps = 119
06:04:57 [INFO] train episode 329: reward = -172.00, steps = 173
06:05:09 [INFO] train episode 330: reward = -127.00, steps = 128
06:05:27 [INFO] train episode 331: reward = -210.00, steps = 211
06:05:38 [INFO] train episode 332: reward = -116.00, steps = 117
06:05:48 [INFO] train episode 333: reward = -105.00, steps = 106
06:06:04 [INFO] train episode 334: reward = -175.00, steps = 176
06:06:15 [INFO] train episode 335: reward = -124.00, steps = 125
06:06:30 [INFO] train episode 336: reward = -161.00, steps = 162
06:06:44 [INFO] train episode 337: reward = -166.00, steps = 167
06:06:53 [INFO] train episode 338: reward = -90.00, steps = 91
06:07:02 [INFO] train episode 339: reward = -112.00, steps = 113
06:07:19 [INFO] train episode 340: reward = -180.00, steps = 181
06:07:33 [INFO] train episode 341: reward = -146.00, steps = 147
06:07:42 [INFO] train episode 342: reward = -109.00, steps = 110
06:07:52 [INFO] train episode 343: reward = -101.00, steps = 102
06:08:02 [INFO] train episode 344: reward = -118.00, steps = 119
06:08:12 [INFO] train episode 345: reward = -104.00, steps = 105
06:08:25 [INFO] train episode 346: reward = -145.00, steps = 146
06:08:34 [INFO] train episode 347: reward = -101.00, steps = 102
06:08:46 [INFO] train episode 348: reward = -126.00, steps = 127
06:08:57 [INFO] train episode 349: reward = -125.00, steps = 126
06:09:06 [INFO] train episode 350: reward = -86.00, steps = 87
06:09:06 [INFO] ==== test ====
06:09:17 [INFO] test episode 0: reward = -132.00, steps = 133
06:09:28 [INFO] test episode 1: reward = -118.00, steps = 119
06:09:39 [INFO] test episode 2: reward = -139.00, steps = 140
06:09:47 [INFO] test episode 3: reward = -87.00, steps = 88
06:09:57 [INFO] test episode 4: reward = -118.00, steps = 119
06:10:07 [INFO] test episode 5: reward = -117.00, steps = 118
06:10:17 [INFO] test episode 6: reward = -111.00, steps = 112
06:10:27 [INFO] test episode 7: reward = -116.00, steps = 117
06:10:36 [INFO] test episode 8: reward = -106.00, steps = 107
06:10:45 [INFO] test episode 9: reward = -106.00, steps = 107
06:10:55 [INFO] test episode 10: reward = -114.00, steps = 115
06:11:05 [INFO] test episode 11: reward = -122.00, steps = 123
06:11:17 [INFO] test episode 12: reward = -137.00, steps = 138
06:11:26 [INFO] test episode 13: reward = -105.00, steps = 106
06:11:37 [INFO] test episode 14: reward = -133.00, steps = 134
06:11:48 [INFO] test episode 15: reward = -125.00, steps = 126
06:12:02 [INFO] test episode 16: reward = -158.00, steps = 159
06:12:12 [INFO] test episode 17: reward = -115.00, steps = 116
06:12:21 [INFO] test episode 18: reward = -118.00, steps = 119
06:12:32 [INFO] test episode 19: reward = -127.00, steps = 128
06:12:46 [INFO] test episode 20: reward = -165.00, steps = 166
06:12:56 [INFO] test episode 21: reward = -118.00, steps = 119
06:13:09 [INFO] test episode 22: reward = -142.00, steps = 143
06:13:17 [INFO] test episode 23: reward = -103.00, steps = 104
06:13:29 [INFO] test episode 24: reward = -134.00, steps = 135
06:13:50 [INFO] test episode 25: reward = -248.00, steps = 249
06:13:59 [INFO] test episode 26: reward = -111.00, steps = 112
06:14:11 [INFO] test episode 27: reward = -133.00, steps = 134
06:14:25 [INFO] test episode 28: reward = -173.00, steps = 174
06:14:37 [INFO] test episode 29: reward = -134.00, steps = 135
06:14:48 [INFO] test episode 30: reward = -132.00, steps = 133
06:14:58 [INFO] test episode 31: reward = -117.00, steps = 118
06:15:08 [INFO] test episode 32: reward = -115.00, steps = 116
06:15:19 [INFO] test episode 33: reward = -131.00, steps = 132
06:15:28 [INFO] test episode 34: reward = -98.00, steps = 99
06:15:40 [INFO] test episode 35: reward = -141.00, steps = 142
06:15:50 [INFO] test episode 36: reward = -119.00, steps = 120
06:15:59 [INFO] test episode 37: reward = -107.00, steps = 108
06:16:12 [INFO] test episode 38: reward = -149.00, steps = 150
06:16:23 [INFO] test episode 39: reward = -120.00, steps = 121
06:16:45 [INFO] test episode 40: reward = -263.00, steps = 264
06:16:59 [INFO] test episode 41: reward = -171.00, steps = 172
06:17:10 [INFO] test episode 42: reward = -122.00, steps = 123
06:17:19 [INFO] test episode 43: reward = -111.00, steps = 112
06:17:30 [INFO] test episode 44: reward = -120.00, steps = 121
06:17:41 [INFO] test episode 45: reward = -126.00, steps = 127
06:17:58 [INFO] test episode 46: reward = -202.00, steps = 203
06:18:08 [INFO] test episode 47: reward = -124.00, steps = 125
06:18:16 [INFO] test episode 48: reward = -87.00, steps = 88
06:18:24 [INFO] test episode 49: reward = -91.00, steps = 92
06:18:36 [INFO] test episode 50: reward = -146.00, steps = 147
06:18:50 [INFO] test episode 51: reward = -169.00, steps = 170
06:19:01 [INFO] test episode 52: reward = -129.00, steps = 130
06:19:13 [INFO] test episode 53: reward = -139.00, steps = 140
06:19:23 [INFO] test episode 54: reward = -115.00, steps = 116
06:19:37 [INFO] test episode 55: reward = -158.00, steps = 159
06:19:46 [INFO] test episode 56: reward = -109.00, steps = 110
06:19:55 [INFO] test episode 57: reward = -107.00, steps = 108
06:20:07 [INFO] test episode 58: reward = -139.00, steps = 140
06:20:17 [INFO] test episode 59: reward = -118.00, steps = 119
06:20:30 [INFO] test episode 60: reward = -150.00, steps = 151
06:20:46 [INFO] test episode 61: reward = -185.00, steps = 186
06:21:00 [INFO] test episode 62: reward = -165.00, steps = 166
06:21:09 [INFO] test episode 63: reward = -101.00, steps = 102
06:21:20 [INFO] test episode 64: reward = -123.00, steps = 124
06:21:30 [INFO] test episode 65: reward = -126.00, steps = 127
06:21:39 [INFO] test episode 66: reward = -102.00, steps = 103
06:21:48 [INFO] test episode 67: reward = -109.00, steps = 110
06:21:58 [INFO] test episode 68: reward = -107.00, steps = 108
06:22:07 [INFO] test episode 69: reward = -109.00, steps = 110
06:22:15 [INFO] test episode 70: reward = -86.00, steps = 87
06:22:28 [INFO] test episode 71: reward = -154.00, steps = 155
06:22:37 [INFO] test episode 72: reward = -106.00, steps = 107
06:22:47 [INFO] test episode 73: reward = -114.00, steps = 115
06:22:59 [INFO] test episode 74: reward = -138.00, steps = 139
06:23:11 [INFO] test episode 75: reward = -145.00, steps = 146
06:23:23 [INFO] test episode 76: reward = -141.00, steps = 142
06:23:34 [INFO] test episode 77: reward = -122.00, steps = 123
06:23:43 [INFO] test episode 78: reward = -113.00, steps = 114
06:24:00 [INFO] test episode 79: reward = -196.00, steps = 197
06:24:09 [INFO] test episode 80: reward = -100.00, steps = 101
06:24:21 [INFO] test episode 81: reward = -145.00, steps = 146
06:24:36 [INFO] test episode 82: reward = -175.00, steps = 176
06:24:44 [INFO] test episode 83: reward = -99.00, steps = 100
06:24:58 [INFO] test episode 84: reward = -160.00, steps = 161
06:25:09 [INFO] test episode 85: reward = -124.00, steps = 125
06:25:20 [INFO] test episode 86: reward = -132.00, steps = 133
06:25:32 [INFO] test episode 87: reward = -147.00, steps = 148
06:25:44 [INFO] test episode 88: reward = -132.00, steps = 133
06:25:52 [INFO] test episode 89: reward = -93.00, steps = 94
06:26:03 [INFO] test episode 90: reward = -137.00, steps = 138
06:26:17 [INFO] test episode 91: reward = -160.00, steps = 161
06:26:26 [INFO] test episode 92: reward = -105.00, steps = 106
06:26:37 [INFO] test episode 93: reward = -137.00, steps = 138
06:26:48 [INFO] test episode 94: reward = -136.00, steps = 137
06:26:58 [INFO] test episode 95: reward = -114.00, steps = 115
06:27:11 [INFO] test episode 96: reward = -148.00, steps = 149
06:27:26 [INFO] test episode 97: reward = -178.00, steps = 179
06:27:35 [INFO] test episode 98: reward = -112.00, steps = 113
06:27:47 [INFO] test episode 99: reward = -135.00, steps = 136
06:27:47 [INFO] average episode reward = -131.01 ± 29.60
In [6]:
env.close()