Use Soft Q Learning to Play LunarLander-v2

TensorFlow version
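
Soft Q-learning replaces the hard max in the Q-learning target with a soft maximum controlled by a temperature alpha. Concretely, the agent below samples actions from the Boltzmann policy pi(a|s) ∝ exp(Q(s,a)/alpha), evaluates the soft state value V(s) = alpha * logsumexp_a(Q(s,a)/alpha), and regresses the evaluate network toward the target u = r + gamma * (1 - d) * V(s'), where d indicates termination. As alpha → 0 the soft maximum approaches the hard maximum and the method reduces to ordinary DQN.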

In [1]:
%matplotlib inline

import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import scipy.special
import pandas as pd
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import nn
from tensorflow import losses
from tensorflow import optimizers
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models

logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
env = gym.make('LunarLander-v2')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
12:34:09 [INFO] env: <LunarLander<LunarLander-v2>>
12:34:09 [INFO] action_space: Discrete(4)
12:34:09 [INFO] observation_space: Box(-inf, inf, (8,), float32)
12:34:09 [INFO] reward_range: (-inf, inf)
12:34:09 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}
12:34:09 [INFO] _max_episode_steps: 1000
12:34:09 [INFO] _elapsed_steps: None
12:34:09 [INFO] id: LunarLander-v2
12:34:09 [INFO] entry_point: gym.envs.box2d:LunarLander
12:34:09 [INFO] reward_threshold: 200
12:34:09 [INFO] nondeterministic: False
12:34:09 [INFO] max_episode_steps: 1000
12:34:09 [INFO] _kwargs: {}
12:34:09 [INFO] _env_name: LunarLander
In [3]:
class DQNReplayer:
    def __init__(self, capacity):
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['state', 'action', 'reward', 'next_state', 'terminated'])
        self.i = 0
        self.count = 0
        self.capacity = capacity

    def store(self, *args):
        self.memory.loc[self.i] = np.asarray(args, dtype=object)
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.memory.columns)
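
A quick, purely illustrative check of the replayer interface (the toy 8-dimensional transitions, capacity, and sample size below are arbitrary):

replayer = DQNReplayer(capacity=100)
for _ in range(5):
    # each record is (state, action, reward, next_state, terminated)
    replayer.store(np.random.rand(8), 1, -0.5, np.random.rand(8), False)
states, actions, rewards, next_states, terminateds = replayer.sample(3)
print(states.shape, actions.shape)  # (3, 8) (3,)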
In [4]:
class SQLAgent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.gamma = 0.99

        self.replayer = DQNReplayer(10000)

        self.alpha = 0.02

        self.evaluate_net = self.build_net(
                input_size=env.observation_space.shape[0],
                hidden_sizes=[64, 64], output_size=self.action_n)
        self.target_net = models.clone_model(self.evaluate_net)
        # clone_model copies only the architecture; the weights are synced
        # from evaluate_net in reset() before each training episode

    def build_net(self, input_size, hidden_sizes, output_size):
        model = keras.Sequential()
        for layer, hidden_size in enumerate(hidden_sizes):
            kwargs = dict(input_shape=(input_size,)) if not layer else {}
            model.add(layers.Dense(units=hidden_size,
                    activation=nn.relu, **kwargs))
        model.add(layers.Dense(units=output_size))
        optimizer = optimizers.Adam(0.001)
        model.compile(loss=losses.mse, optimizer=optimizer)
        return model

    def reset(self, mode=None):
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.target_net.set_weights(self.evaluate_net.get_weights())

    def step(self, observation, reward, terminated):
        qs = self.evaluate_net.predict(observation[np.newaxis], verbose=0)
        # Boltzmann policy: pi(a|s) is proportional to exp(Q(s,a) / alpha)
        q_div_alpha = qs[0] / self.alpha
        v_div_alpha = scipy.special.logsumexp(q_div_alpha)
        prob = np.exp(q_div_alpha - v_div_alpha)
        prob /= prob.sum()  # renormalize so the probabilities sum to exactly 1 for np.random.choice
        action = np.random.choice(self.action_n, p=prob)
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                state, _, _, act, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, terminated)
            if self.replayer.count >= 500:
                self.learn()
        return action

    def close(self):
        pass

    def learn(self):
        # replay
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(128)

        # update value net with the soft Bellman target
        next_qs = self.target_net.predict(next_states, verbose=0)
        # soft state value: V(s') = alpha * logsumexp(Q(s', .) / alpha)
        next_vs = self.alpha * scipy.special.logsumexp(next_qs / self.alpha,
                axis=-1)
        us = rewards + self.gamma * (1. - terminateds) * next_vs
        targets = self.evaluate_net.predict(states, verbose=0)
        targets[np.arange(us.shape[0]), actions] = us
        self.evaluate_net.fit(states, targets, verbose=0)


agent = SQLAgent(env)
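
In learn(), the bootstrap value is the soft maximum alpha * logsumexp(Q / alpha) rather than max Q. With the small temperature alpha = 0.02 used here the two are nearly equal, so the backup is close to a standard DQN update while the Boltzmann policy above still explores. A tiny numerical check with arbitrary Q values:

q = np.array([1.0, 1.5, 0.5, 1.4])
alpha = 0.02
soft_v = alpha * scipy.special.logsumexp(q / alpha)
print(soft_v, q.max())  # soft_v is only slightly larger than max(q)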
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps


logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
            mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > 250:
        break
plt.plot(episode_rewards)


logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
12:34:10 [INFO] ==== train ====
12:34:17 [INFO] train episode 0: reward = -445.54, steps = 136
12:34:20 [INFO] train episode 1: reward = -153.23, steps = 72
12:34:26 [INFO] train episode 2: reward = -117.16, steps = 99
12:34:31 [INFO] train episode 3: reward = -518.03, steps = 95
12:34:36 [INFO] train episode 4: reward = -671.37, steps = 88
12:34:52 [INFO] train episode 5: reward = -89.08, steps = 81
12:35:17 [INFO] train episode 6: reward = -213.88, steps = 124
12:35:54 [INFO] train episode 7: reward = -343.83, steps = 182
12:36:52 [INFO] train episode 8: reward = -117.37, steps = 301
12:37:20 [INFO] train episode 9: reward = -232.42, steps = 145
12:38:08 [INFO] train episode 10: reward = -56.57, steps = 251
12:38:59 [INFO] train episode 11: reward = -19.96, steps = 267
12:39:50 [INFO] train episode 12: reward = -25.62, steps = 274
12:40:24 [INFO] train episode 13: reward = -35.56, steps = 174
12:41:27 [INFO] train episode 14: reward = -100.93, steps = 331
12:42:15 [INFO] train episode 15: reward = -180.48, steps = 251
12:42:39 [INFO] train episode 16: reward = -219.53, steps = 125
12:43:08 [INFO] train episode 17: reward = -118.45, steps = 153
12:43:48 [INFO] train episode 18: reward = -80.96, steps = 213
12:47:00 [INFO] train episode 19: reward = -68.70, steps = 1000
12:48:06 [INFO] train episode 20: reward = -108.64, steps = 349
12:48:40 [INFO] train episode 21: reward = -122.94, steps = 175
12:49:17 [INFO] train episode 22: reward = 46.26, steps = 198
12:49:48 [INFO] train episode 23: reward = -108.06, steps = 161
12:52:59 [INFO] train episode 24: reward = -114.93, steps = 1000
12:55:27 [INFO] train episode 25: reward = -241.04, steps = 769
12:55:52 [INFO] train episode 26: reward = -85.91, steps = 129
12:57:32 [INFO] train episode 27: reward = -209.55, steps = 528
13:00:43 [INFO] train episode 28: reward = -52.04, steps = 1000
13:01:14 [INFO] train episode 29: reward = -89.28, steps = 166
13:02:49 [INFO] train episode 30: reward = -308.12, steps = 503
13:06:00 [INFO] train episode 31: reward = -192.10, steps = 1000
13:08:39 [INFO] train episode 32: reward = -225.21, steps = 843
13:09:16 [INFO] train episode 33: reward = -97.18, steps = 194
13:09:48 [INFO] train episode 34: reward = -85.01, steps = 167
13:11:37 [INFO] train episode 35: reward = -185.60, steps = 570
13:14:48 [INFO] train episode 36: reward = -131.19, steps = 1000
13:17:58 [INFO] train episode 37: reward = -144.72, steps = 1000
13:18:29 [INFO] train episode 38: reward = -103.72, steps = 159
13:19:35 [INFO] train episode 39: reward = -145.05, steps = 350
13:22:47 [INFO] train episode 40: reward = -26.52, steps = 1000
13:25:57 [INFO] train episode 41: reward = -148.75, steps = 1000
13:29:07 [INFO] train episode 42: reward = -79.30, steps = 1000
13:31:13 [INFO] train episode 43: reward = -365.93, steps = 660
13:34:23 [INFO] train episode 44: reward = -31.31, steps = 1000
13:37:34 [INFO] train episode 45: reward = -63.72, steps = 1000
13:40:44 [INFO] train episode 46: reward = -136.30, steps = 1000
13:43:55 [INFO] train episode 47: reward = -148.76, steps = 1000
13:47:06 [INFO] train episode 48: reward = -60.38, steps = 1000
13:50:16 [INFO] train episode 49: reward = -57.75, steps = 1000
13:53:26 [INFO] train episode 50: reward = -13.95, steps = 1000
13:54:00 [INFO] train episode 51: reward = -58.56, steps = 177
13:57:14 [INFO] train episode 52: reward = -33.15, steps = 1000
13:58:01 [INFO] train episode 53: reward = -130.91, steps = 249
14:01:17 [INFO] train episode 54: reward = -58.64, steps = 1000
14:04:29 [INFO] train episode 55: reward = -31.15, steps = 1000
14:05:10 [INFO] train episode 56: reward = -76.13, steps = 216
14:08:21 [INFO] train episode 57: reward = -58.40, steps = 1000
14:11:32 [INFO] train episode 58: reward = -24.58, steps = 1000
14:14:43 [INFO] train episode 59: reward = -35.16, steps = 1000
14:17:55 [INFO] train episode 60: reward = -4.12, steps = 1000
14:21:06 [INFO] train episode 61: reward = -18.72, steps = 1000
14:22:02 [INFO] train episode 62: reward = -245.97, steps = 295
14:22:26 [INFO] train episode 63: reward = -244.15, steps = 119
14:25:38 [INFO] train episode 64: reward = -14.62, steps = 1000
14:28:50 [INFO] train episode 65: reward = -10.11, steps = 1000
14:32:02 [INFO] train episode 66: reward = -27.16, steps = 1000
14:32:22 [INFO] train episode 67: reward = -215.93, steps = 104
14:32:54 [INFO] train episode 68: reward = -448.00, steps = 168
14:36:05 [INFO] train episode 69: reward = 17.63, steps = 1000
14:39:17 [INFO] train episode 70: reward = -3.72, steps = 1000
14:42:28 [INFO] train episode 71: reward = -0.00, steps = 1000
14:45:40 [INFO] train episode 72: reward = -43.46, steps = 1000
14:48:51 [INFO] train episode 73: reward = -40.71, steps = 1000
14:52:02 [INFO] train episode 74: reward = -4.29, steps = 1000
14:55:13 [INFO] train episode 75: reward = -19.24, steps = 1000
14:56:15 [INFO] train episode 76: reward = -385.84, steps = 324
14:59:26 [INFO] train episode 77: reward = 2.01, steps = 1000
15:02:37 [INFO] train episode 78: reward = -4.19, steps = 1000
15:05:48 [INFO] train episode 79: reward = 6.13, steps = 1000
15:08:57 [INFO] train episode 80: reward = -15.60, steps = 1000
15:12:08 [INFO] train episode 81: reward = 9.69, steps = 1000
15:12:25 [INFO] train episode 82: reward = -192.80, steps = 87
15:13:01 [INFO] train episode 83: reward = -92.14, steps = 188
15:16:11 [INFO] train episode 84: reward = -5.96, steps = 1000
15:19:22 [INFO] train episode 85: reward = -47.92, steps = 1000
15:19:47 [INFO] train episode 86: reward = -110.50, steps = 131
15:22:59 [INFO] train episode 87: reward = -28.78, steps = 1000
15:26:11 [INFO] train episode 88: reward = -45.21, steps = 1000
15:29:21 [INFO] train episode 89: reward = -49.71, steps = 1000
15:29:34 [INFO] train episode 90: reward = -566.19, steps = 67
15:32:45 [INFO] train episode 91: reward = -67.21, steps = 1000
15:33:12 [INFO] train episode 92: reward = -112.67, steps = 143
15:36:24 [INFO] train episode 93: reward = -54.77, steps = 1000
15:39:35 [INFO] train episode 94: reward = -74.25, steps = 1000
15:42:46 [INFO] train episode 95: reward = -50.85, steps = 1000
15:46:02 [INFO] train episode 96: reward = 25.18, steps = 1000
15:49:13 [INFO] train episode 97: reward = -0.68, steps = 1000
15:52:24 [INFO] train episode 98: reward = -56.84, steps = 1000
15:55:35 [INFO] train episode 99: reward = 10.48, steps = 1000
15:58:49 [INFO] train episode 100: reward = -21.99, steps = 1000
16:02:08 [INFO] train episode 101: reward = -42.10, steps = 1000
16:05:24 [INFO] train episode 102: reward = -5.42, steps = 1000
16:08:41 [INFO] train episode 103: reward = 8.07, steps = 1000
16:12:02 [INFO] train episode 104: reward = -21.89, steps = 1000
16:15:14 [INFO] train episode 105: reward = 8.46, steps = 1000
16:18:32 [INFO] train episode 106: reward = 6.09, steps = 1000
16:19:16 [INFO] train episode 107: reward = -132.98, steps = 223
16:22:37 [INFO] train episode 108: reward = -14.66, steps = 1000
16:25:55 [INFO] train episode 109: reward = -27.07, steps = 1000
16:29:09 [INFO] train episode 110: reward = -0.74, steps = 1000
16:32:29 [INFO] train episode 111: reward = -22.02, steps = 1000
16:35:55 [INFO] train episode 112: reward = -18.27, steps = 1000
16:39:14 [INFO] train episode 113: reward = -48.27, steps = 1000
16:42:34 [INFO] train episode 114: reward = -6.09, steps = 1000
16:45:58 [INFO] train episode 115: reward = 3.66, steps = 1000
16:49:13 [INFO] train episode 116: reward = -32.62, steps = 1000
16:52:26 [INFO] train episode 117: reward = 7.44, steps = 1000
16:55:38 [INFO] train episode 118: reward = -15.38, steps = 1000
16:58:49 [INFO] train episode 119: reward = -33.93, steps = 1000
17:02:00 [INFO] train episode 120: reward = -17.84, steps = 1000
17:05:10 [INFO] train episode 121: reward = -60.53, steps = 1000
17:08:27 [INFO] train episode 122: reward = -29.67, steps = 1000
17:11:45 [INFO] train episode 123: reward = -53.64, steps = 1000
17:15:01 [INFO] train episode 124: reward = 6.91, steps = 1000
17:18:24 [INFO] train episode 125: reward = -51.62, steps = 1000
17:21:41 [INFO] train episode 126: reward = -59.06, steps = 1000
17:25:03 [INFO] train episode 127: reward = -9.68, steps = 1000
17:25:20 [INFO] train episode 128: reward = -98.93, steps = 87
17:26:24 [INFO] train episode 129: reward = 176.44, steps = 335
17:29:48 [INFO] train episode 130: reward = -21.54, steps = 1000
17:32:59 [INFO] train episode 131: reward = -55.94, steps = 1000
17:36:14 [INFO] train episode 132: reward = -10.50, steps = 1000
17:39:37 [INFO] train episode 133: reward = -15.11, steps = 1000
17:43:01 [INFO] train episode 134: reward = -14.00, steps = 1000
17:46:12 [INFO] train episode 135: reward = -20.43, steps = 1000
17:49:24 [INFO] train episode 136: reward = -27.60, steps = 1000
17:52:50 [INFO] train episode 137: reward = 13.22, steps = 1000
17:53:10 [INFO] train episode 138: reward = -31.07, steps = 99
17:53:33 [INFO] train episode 139: reward = -68.69, steps = 112
17:53:55 [INFO] train episode 140: reward = -75.15, steps = 105
17:57:05 [INFO] train episode 141: reward = -58.11, steps = 1000
18:28:13 [INFO] train episode 142: reward = -8.72, steps = 1000
18:28:59 [INFO] train episode 143: reward = -92.55, steps = 245
18:32:24 [INFO] train episode 144: reward = -27.97, steps = 1000
18:35:34 [INFO] train episode 145: reward = 19.74, steps = 1000
18:38:46 [INFO] train episode 146: reward = -6.78, steps = 1000
18:41:56 [INFO] train episode 147: reward = -24.94, steps = 1000
18:45:07 [INFO] train episode 148: reward = -36.86, steps = 1000
18:48:19 [INFO] train episode 149: reward = -28.49, steps = 1000
18:51:31 [INFO] train episode 150: reward = 18.76, steps = 1000
18:54:42 [INFO] train episode 151: reward = -32.24, steps = 1000
18:58:01 [INFO] train episode 152: reward = -53.15, steps = 1000
19:01:28 [INFO] train episode 153: reward = -51.07, steps = 1000
19:04:48 [INFO] train episode 154: reward = -18.64, steps = 1000
19:08:06 [INFO] train episode 155: reward = -59.46, steps = 1000
19:08:44 [INFO] train episode 156: reward = -6.30, steps = 190
19:12:04 [INFO] train episode 157: reward = -69.82, steps = 1000
19:15:20 [INFO] train episode 158: reward = 15.60, steps = 1000
19:18:38 [INFO] train episode 159: reward = -39.00, steps = 1000
19:21:49 [INFO] train episode 160: reward = -7.11, steps = 1000
19:25:06 [INFO] train episode 161: reward = -14.60, steps = 1000
19:28:21 [INFO] train episode 162: reward = -15.43, steps = 1000
19:31:32 [INFO] train episode 163: reward = -22.80, steps = 1000
19:34:44 [INFO] train episode 164: reward = 4.65, steps = 1000
19:37:56 [INFO] train episode 165: reward = -18.00, steps = 1000
19:39:48 [INFO] train episode 166: reward = 216.01, steps = 589
19:42:58 [INFO] train episode 167: reward = -156.64, steps = 1000
19:43:59 [INFO] train episode 168: reward = -484.52, steps = 320
19:47:10 [INFO] train episode 169: reward = -17.42, steps = 1000
19:50:22 [INFO] train episode 170: reward = -34.25, steps = 1000
19:53:33 [INFO] train episode 171: reward = 108.22, steps = 1000
19:56:44 [INFO] train episode 172: reward = -13.96, steps = 1000
19:59:56 [INFO] train episode 173: reward = -12.42, steps = 1000
20:03:07 [INFO] train episode 174: reward = -38.94, steps = 1000
20:06:18 [INFO] train episode 175: reward = -44.30, steps = 1000
20:09:30 [INFO] train episode 176: reward = 10.14, steps = 1000
20:12:42 [INFO] train episode 177: reward = 159.61, steps = 1000
20:15:56 [INFO] train episode 178: reward = 10.36, steps = 1000
20:19:11 [INFO] train episode 179: reward = -30.74, steps = 1000
20:22:37 [INFO] train episode 180: reward = -52.14, steps = 1000
20:26:05 [INFO] train episode 181: reward = -27.19, steps = 1000
20:29:26 [INFO] train episode 182: reward = 4.21, steps = 1000
20:32:52 [INFO] train episode 183: reward = -34.05, steps = 1000
20:33:06 [INFO] train episode 184: reward = -108.16, steps = 70
20:36:24 [INFO] train episode 185: reward = -26.27, steps = 1000
20:39:44 [INFO] train episode 186: reward = -32.15, steps = 1000
20:40:00 [INFO] train episode 187: reward = 15.89, steps = 79
20:43:16 [INFO] train episode 188: reward = 45.82, steps = 1000
20:46:35 [INFO] train episode 189: reward = -29.37, steps = 1000
20:49:51 [INFO] train episode 190: reward = 167.43, steps = 1000
20:53:13 [INFO] train episode 191: reward = 130.48, steps = 1000
20:56:29 [INFO] train episode 192: reward = 143.59, steps = 1000
20:58:47 [INFO] train episode 193: reward = 219.19, steps = 712
20:59:28 [INFO] train episode 194: reward = 264.75, steps = 213
21:00:49 [INFO] train episode 195: reward = 274.17, steps = 419
21:01:45 [INFO] train episode 196: reward = 226.69, steps = 290
21:02:41 [INFO] train episode 197: reward = 257.07, steps = 280
21:03:24 [INFO] train episode 198: reward = 272.97, steps = 218
21:04:06 [INFO] train episode 199: reward = 249.87, steps = 210
21:04:52 [INFO] train episode 200: reward = -349.75, steps = 233
21:05:31 [INFO] train episode 201: reward = -167.46, steps = 191
21:06:11 [INFO] train episode 202: reward = 257.01, steps = 194
21:06:53 [INFO] train episode 203: reward = 272.13, steps = 199
21:09:03 [INFO] train episode 204: reward = 175.77, steps = 643
21:10:12 [INFO] train episode 205: reward = 187.14, steps = 359
21:11:27 [INFO] train episode 206: reward = 265.49, steps = 373
21:12:17 [INFO] train episode 207: reward = 275.21, steps = 250
21:12:35 [INFO] train episode 208: reward = 31.06, steps = 85
21:13:45 [INFO] train episode 209: reward = -276.59, steps = 355
21:14:42 [INFO] train episode 210: reward = 283.97, steps = 298
21:15:31 [INFO] train episode 211: reward = 239.15, steps = 258
21:15:56 [INFO] train episode 212: reward = -48.32, steps = 126
21:16:38 [INFO] train episode 213: reward = 272.72, steps = 216
21:17:11 [INFO] train episode 214: reward = 282.84, steps = 172
21:18:17 [INFO] train episode 215: reward = 273.05, steps = 341
21:18:58 [INFO] train episode 216: reward = 241.15, steps = 209
21:19:44 [INFO] train episode 217: reward = 263.21, steps = 226
21:20:32 [INFO] train episode 218: reward = 21.42, steps = 237
21:21:12 [INFO] train episode 219: reward = 18.91, steps = 199
21:22:08 [INFO] train episode 220: reward = -63.51, steps = 258
21:23:10 [INFO] train episode 221: reward = 253.62, steps = 296
21:24:08 [INFO] train episode 222: reward = 277.63, steps = 296
21:24:47 [INFO] train episode 223: reward = 258.39, steps = 196
21:25:11 [INFO] train episode 224: reward = 13.94, steps = 125
21:25:49 [INFO] train episode 225: reward = 243.13, steps = 192
21:26:33 [INFO] train episode 226: reward = 269.37, steps = 209
21:27:23 [INFO] train episode 227: reward = 251.06, steps = 255
21:28:12 [INFO] train episode 228: reward = 282.18, steps = 240
21:28:39 [INFO] train episode 229: reward = 55.31, steps = 131
21:29:25 [INFO] train episode 230: reward = 313.56, steps = 221
21:29:52 [INFO] train episode 231: reward = 17.49, steps = 129
21:30:25 [INFO] train episode 232: reward = 51.12, steps = 161
21:31:03 [INFO] train episode 233: reward = 259.60, steps = 189
21:32:23 [INFO] train episode 234: reward = -7.68, steps = 414
21:33:41 [INFO] train episode 235: reward = 265.75, steps = 379
21:34:35 [INFO] train episode 236: reward = 242.28, steps = 266
21:35:18 [INFO] train episode 237: reward = 269.10, steps = 226
21:36:56 [INFO] train episode 238: reward = 262.48, steps = 458
21:37:59 [INFO] train episode 239: reward = 231.01, steps = 303
21:38:52 [INFO] train episode 240: reward = 273.28, steps = 274
21:39:47 [INFO] train episode 241: reward = 271.58, steps = 287
21:40:22 [INFO] train episode 242: reward = 261.21, steps = 178
21:41:35 [INFO] train episode 243: reward = -32.50, steps = 346
21:42:17 [INFO] train episode 244: reward = 263.57, steps = 203
21:43:42 [INFO] train episode 245: reward = 304.49, steps = 423
21:44:16 [INFO] train episode 246: reward = 241.71, steps = 178
21:44:31 [INFO] train episode 247: reward = 11.91, steps = 73
21:45:14 [INFO] train episode 248: reward = 249.07, steps = 223
21:45:57 [INFO] train episode 249: reward = 274.94, steps = 214
21:47:11 [INFO] train episode 250: reward = 242.63, steps = 377
21:47:54 [INFO] train episode 251: reward = 233.60, steps = 227
21:48:12 [INFO] train episode 252: reward = -2.84, steps = 91
21:49:16 [INFO] train episode 253: reward = -239.17, steps = 335
21:50:12 [INFO] train episode 254: reward = -136.52, steps = 289
21:50:57 [INFO] train episode 255: reward = 250.63, steps = 225
21:51:42 [INFO] train episode 256: reward = 234.01, steps = 217
21:52:52 [INFO] train episode 257: reward = 224.57, steps = 336
21:53:38 [INFO] train episode 258: reward = 252.21, steps = 242
21:53:55 [INFO] train episode 259: reward = -282.69, steps = 87
21:54:44 [INFO] train episode 260: reward = 263.44, steps = 254
21:58:05 [INFO] train episode 261: reward = 102.63, steps = 1000
21:59:12 [INFO] train episode 262: reward = 279.98, steps = 314
22:00:19 [INFO] train episode 263: reward = 261.80, steps = 324
22:01:07 [INFO] train episode 264: reward = 271.63, steps = 242
22:02:02 [INFO] train episode 265: reward = 267.08, steps = 285
22:02:55 [INFO] train episode 266: reward = 291.73, steps = 274
22:03:46 [INFO] train episode 267: reward = 270.96, steps = 264
22:04:23 [INFO] train episode 268: reward = 266.43, steps = 192
22:04:55 [INFO] train episode 269: reward = 253.13, steps = 172
22:04:56 [INFO] ==== test ====
22:05:12 [INFO] test episode 0: reward = 195.51, steps = 329
22:05:17 [INFO] test episode 1: reward = -58.19, steps = 102
22:05:30 [INFO] test episode 2: reward = 241.64, steps = 263
22:05:40 [INFO] test episode 3: reward = 283.24, steps = 197
22:05:48 [INFO] test episode 4: reward = 281.88, steps = 165
22:05:53 [INFO] test episode 5: reward = -63.84, steps = 110
22:05:58 [INFO] test episode 6: reward = 23.95, steps = 102
22:06:12 [INFO] test episode 7: reward = 241.56, steps = 293
22:06:30 [INFO] test episode 8: reward = 275.48, steps = 351
22:07:11 [INFO] test episode 9: reward = 235.27, steps = 836
22:07:24 [INFO] test episode 10: reward = 252.13, steps = 249
22:07:35 [INFO] test episode 11: reward = 265.20, steps = 229
22:07:40 [INFO] test episode 12: reward = 31.41, steps = 107
22:08:30 [INFO] test episode 13: reward = 142.52, steps = 1000
22:08:43 [INFO] test episode 14: reward = 275.90, steps = 265
22:09:08 [INFO] test episode 15: reward = 262.03, steps = 497
22:09:21 [INFO] test episode 16: reward = 303.05, steps = 271
22:09:33 [INFO] test episode 17: reward = 250.39, steps = 231
22:09:48 [INFO] test episode 18: reward = 209.79, steps = 294
22:09:56 [INFO] test episode 19: reward = 285.27, steps = 178
22:10:45 [INFO] test episode 20: reward = 134.08, steps = 1000
22:10:53 [INFO] test episode 21: reward = 293.50, steps = 170
22:11:12 [INFO] test episode 22: reward = 284.40, steps = 383
22:11:23 [INFO] test episode 23: reward = 245.72, steps = 220
22:12:13 [INFO] test episode 24: reward = 140.64, steps = 1000
22:12:31 [INFO] test episode 25: reward = 214.02, steps = 381
22:12:41 [INFO] test episode 26: reward = 240.13, steps = 185
22:12:50 [INFO] test episode 27: reward = 277.05, steps = 186
22:13:00 [INFO] test episode 28: reward = 248.50, steps = 209
22:13:14 [INFO] test episode 29: reward = 239.76, steps = 290
22:13:40 [INFO] test episode 30: reward = 262.17, steps = 516
22:13:49 [INFO] test episode 31: reward = -68.64, steps = 179
22:13:59 [INFO] test episode 32: reward = 21.94, steps = 203
22:14:07 [INFO] test episode 33: reward = 277.74, steps = 163
22:14:20 [INFO] test episode 34: reward = 268.05, steps = 268
22:14:27 [INFO] test episode 35: reward = 288.36, steps = 158
22:14:41 [INFO] test episode 36: reward = -8.02, steps = 283
22:14:49 [INFO] test episode 37: reward = -24.24, steps = 165
22:14:55 [INFO] test episode 38: reward = 31.30, steps = 129
22:15:05 [INFO] test episode 39: reward = 282.84, steps = 191
22:15:10 [INFO] test episode 40: reward = -51.32, steps = 100
22:15:26 [INFO] test episode 41: reward = -5.79, steps = 332
22:15:34 [INFO] test episode 42: reward = 274.60, steps = 165
22:15:44 [INFO] test episode 43: reward = -68.40, steps = 203
22:15:54 [INFO] test episode 44: reward = 252.45, steps = 190
22:16:02 [INFO] test episode 45: reward = 244.62, steps = 182
22:16:11 [INFO] test episode 46: reward = 255.61, steps = 179
22:17:00 [INFO] test episode 47: reward = 142.33, steps = 1000
22:17:09 [INFO] test episode 48: reward = 286.27, steps = 173
22:17:14 [INFO] test episode 49: reward = -28.24, steps = 106
22:17:23 [INFO] test episode 50: reward = 245.22, steps = 181
22:17:36 [INFO] test episode 51: reward = 271.80, steps = 273
22:17:48 [INFO] test episode 52: reward = 267.12, steps = 243
22:17:58 [INFO] test episode 53: reward = 264.49, steps = 205
22:18:11 [INFO] test episode 54: reward = 266.46, steps = 267
22:18:54 [INFO] test episode 55: reward = 219.71, steps = 867
22:19:07 [INFO] test episode 56: reward = 239.48, steps = 263
22:19:20 [INFO] test episode 57: reward = 270.80, steps = 271
22:19:29 [INFO] test episode 58: reward = 234.86, steps = 180
22:19:41 [INFO] test episode 59: reward = 244.22, steps = 240
22:19:50 [INFO] test episode 60: reward = 250.19, steps = 191
22:20:42 [INFO] test episode 61: reward = 145.28, steps = 1000
22:20:53 [INFO] test episode 62: reward = 257.66, steps = 232
22:21:05 [INFO] test episode 63: reward = 275.18, steps = 251
22:21:54 [INFO] test episode 64: reward = 129.41, steps = 1000
22:22:05 [INFO] test episode 65: reward = 251.45, steps = 224
22:22:22 [INFO] test episode 66: reward = 274.88, steps = 344
22:22:34 [INFO] test episode 67: reward = 290.47, steps = 230
22:22:40 [INFO] test episode 68: reward = 36.77, steps = 120
22:22:53 [INFO] test episode 69: reward = 255.38, steps = 289
22:23:07 [INFO] test episode 70: reward = 268.93, steps = 288
22:23:17 [INFO] test episode 71: reward = 49.80, steps = 202
22:24:01 [INFO] test episode 72: reward = 216.81, steps = 872
22:24:14 [INFO] test episode 73: reward = 251.55, steps = 262
22:24:25 [INFO] test episode 74: reward = 249.90, steps = 216
22:24:34 [INFO] test episode 75: reward = 257.68, steps = 172
22:24:45 [INFO] test episode 76: reward = 261.66, steps = 226
22:24:56 [INFO] test episode 77: reward = 257.12, steps = 233
22:25:01 [INFO] test episode 78: reward = -33.48, steps = 99
22:25:06 [INFO] test episode 79: reward = -70.85, steps = 100
22:25:18 [INFO] test episode 80: reward = 268.50, steps = 248
22:25:34 [INFO] test episode 81: reward = 248.09, steps = 318
22:25:39 [INFO] test episode 82: reward = -44.75, steps = 100
22:26:29 [INFO] test episode 83: reward = 57.68, steps = 1000
22:27:19 [INFO] test episode 84: reward = 144.49, steps = 1000
22:27:29 [INFO] test episode 85: reward = 253.31, steps = 215
22:27:42 [INFO] test episode 86: reward = 272.77, steps = 260
22:27:50 [INFO] test episode 87: reward = 266.35, steps = 162
22:28:00 [INFO] test episode 88: reward = 27.98, steps = 201
22:28:24 [INFO] test episode 89: reward = 240.75, steps = 481
22:28:38 [INFO] test episode 90: reward = 263.58, steps = 279
22:28:46 [INFO] test episode 91: reward = 248.10, steps = 173
22:28:55 [INFO] test episode 92: reward = 283.05, steps = 180
22:29:14 [INFO] test episode 93: reward = 275.28, steps = 381
22:29:24 [INFO] test episode 94: reward = 258.63, steps = 210
22:29:37 [INFO] test episode 95: reward = 288.42, steps = 267
22:30:27 [INFO] test episode 96: reward = 165.82, steps = 1000
22:30:36 [INFO] test episode 97: reward = 279.71, steps = 185
22:30:48 [INFO] test episode 98: reward = 274.17, steps = 235
22:30:57 [INFO] test episode 99: reward = 252.05, steps = 180
22:30:57 [INFO] average episode reward = 196.10 ± 111.89
In [6]:
env.close()