Use Soft Actor-Critic to Play LunarLander-v2

PyTorch version
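
This notebook adapts Soft Actor-Critic (SAC) to the discrete action space of LunarLander-v2: a softmax actor outputs action probabilities, two Q critics are combined with an elementwise minimum inside the value target, and a V critic with a slowly updated target network supplies the bootstrap value. The entropy temperature alpha is kept fixed rather than tuned automatically.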

In [1]:
%matplotlib inline

import sys
import logging
import itertools
import copy

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
torch.manual_seed(0)
import torch.nn as nn
import torch.optim as optim
import torch.distributions as distributions

logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
env = gym.make('LunarLander-v2')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
06:44:32 [INFO] env: <LunarLander<LunarLander-v2>>
06:44:32 [INFO] action_space: Discrete(4)
06:44:32 [INFO] observation_space: Box(-inf, inf, (8,), float32)
06:44:32 [INFO] reward_range: (-inf, inf)
06:44:32 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}
06:44:32 [INFO] _max_episode_steps: 1000
06:44:32 [INFO] _elapsed_steps: None
06:44:32 [INFO] id: LunarLander-v2
06:44:32 [INFO] entry_point: gym.envs.box2d:LunarLander
06:44:32 [INFO] reward_threshold: 200
06:44:32 [INFO] nondeterministic: False
06:44:32 [INFO] max_episode_steps: 1000
06:44:32 [INFO] _kwargs: {}
06:44:32 [INFO] _env_name: LunarLander
In [3]:
class DQNReplayer:
    def __init__(self, capacity):
        self.memory = pd.DataFrame(index=range(capacity),
                columns=['state', 'action', 'reward', 'next_state', 'terminated'])
        self.i = 0
        self.count = 0
        self.capacity = capacity

    def store(self, *args):
        self.memory.loc[self.i] = np.asarray(args, dtype=object)
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.memory.columns)
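
A minimal usage sketch of the replay buffer above, with made-up transition values (it reuses the numpy import from the first cell):

replayer = DQNReplayer(capacity=100)
# store one (state, action, reward, next_state, terminated) transition
replayer.store(np.zeros(8), 0, -1.5, np.ones(8), False)
# draw a with-replacement minibatch; each field comes back stacked along axis 0
states, actions, rewards, next_states, terminateds = replayer.sample(1)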
In [4]:
class SACAgent:
    def __init__(self, env):
        state_dim = env.observation_space.shape[0]
        self.action_n = env.action_space.n
        self.gamma = 0.99
        self.replayer = DQNReplayer(10000)

        self.alpha = 0.02  # fixed entropy temperature (no automatic tuning)

        # create actor
        self.actor_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256],
                output_size=self.action_n, output_activator=nn.Softmax(-1))
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=3e-4)

        # create V critic
        self.v_evaluate_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256])
        self.v_target_net = copy.deepcopy(self.v_evaluate_net)
        self.v_optimizer = optim.Adam(self.v_evaluate_net.parameters(), lr=3e-4)
        self.v_loss = nn.MSELoss()

        # create Q critic
        self.q0_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256], output_size=self.action_n)
        self.q1_net = self.build_net(input_size=state_dim,
                hidden_sizes=[256, 256], output_size=self.action_n)
        self.q0_loss = nn.MSELoss()
        self.q1_loss = nn.MSELoss()
        self.q0_optimizer = optim.Adam(self.q0_net.parameters(), lr=3e-4)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=3e-4)

    def build_net(self, input_size, hidden_sizes, output_size=1,
            output_activator=None):
        layers = []
        sizes = [input_size] + hidden_sizes + [output_size]
        for in_size, out_size in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_size, out_size))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # no ReLU after the output layer
        if output_activator:
            layers.append(output_activator)
        return nn.Sequential(*layers)

    def reset(self, mode=None):
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        state_tensor = torch.as_tensor(observation, dtype=torch.float).unsqueeze(0)
        prob_tensor = self.actor_net(state_tensor)
        action_tensor = distributions.Categorical(prob_tensor).sample()
        action = action_tensor.numpy()[0]
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                state, _, _, action, next_state, reward, terminated, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, action, reward, next_state, terminated)
            if self.replayer.count >= 500:
                self.learn()
        return action

    def close(self):
        pass

    def update_net(self, target_net, evaluate_net, learning_rate=0.0025):
        # soft (Polyak) update: target <- learning_rate * evaluate + (1 - learning_rate) * target
        for target_param, evaluate_param in zip(
                target_net.parameters(), evaluate_net.parameters()):
            target_param.data.copy_(learning_rate * evaluate_param.data
                    + (1 - learning_rate) * target_param.data)

    def learn(self):
        states, actions, rewards, next_states, terminateds = \
                self.replayer.sample(128)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)

        # update Q critic
        next_v_tensor = self.v_target_net(next_state_tensor)
        q_target_tensor = reward_tensor.unsqueeze(1) + self.gamma * \
                (1. - terminated_tensor.unsqueeze(1)) * next_v_tensor

        all_q0_pred_tensor = self.q0_net(state_tensor)
        q0_pred_tensor = torch.gather(all_q0_pred_tensor, 1,
                action_tensor.unsqueeze(1))
        q0_loss_tensor = self.q0_loss(q0_pred_tensor, q_target_tensor.detach())
        self.q0_optimizer.zero_grad()
        q0_loss_tensor.backward()
        self.q0_optimizer.step()

        all_q1_pred_tensor = self.q1_net(state_tensor)
        q1_pred_tensor = torch.gather(all_q1_pred_tensor, 1,
                action_tensor.unsqueeze(1))
        q1_loss_tensor = self.q1_loss(q1_pred_tensor, q_target_tensor.detach())
        self.q1_optimizer.zero_grad()
        q1_loss_tensor.backward()
        self.q1_optimizer.step()

        # update V critic
        q0_tensor = self.q0_net(state_tensor)
        q1_tensor = self.q1_net(state_tensor)
        q01_tensor = torch.min(q0_tensor, q1_tensor)
        prob_tensor = self.actor_net(state_tensor)
        ln_prob_tensor = torch.log(prob_tensor.clamp(1e-6, 1.))
        entropic_q01_tensor = prob_tensor * (q01_tensor -
                self.alpha * ln_prob_tensor)
        # or equivalently:
        # entropic_q01_tensor = prob_tensor * q01_tensor - \
        #         self.alpha * torch.xlogy(prob_tensor, prob_tensor)
        v_target_tensor = torch.sum(entropic_q01_tensor, dim=-1, keepdim=True)
        v_pred_tensor = self.v_evaluate_net(state_tensor)
        v_loss_tensor = self.v_loss(v_pred_tensor, v_target_tensor.detach())
        self.v_optimizer.zero_grad()
        v_loss_tensor.backward()
        self.v_optimizer.step()

        self.update_net(self.v_target_net, self.v_evaluate_net)

        # update actor
        prob_q_tensor = prob_tensor * (self.alpha * ln_prob_tensor - q0_tensor)
        actor_loss_tensor = prob_q_tensor.sum(dim=-1).mean()
        self.actor_optimizer.zero_grad()
        actor_loss_tensor.backward()
        self.actor_optimizer.step()


agent = SACAgent(env)
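
For reference, the V-critic target computed in learn() is the entropy-regularized expected value V(s) = sum_a pi(a|s) (min(Q0, Q1)(s, a) - alpha ln pi(a|s)), and the actor loss is sum_a pi(a|s) (alpha ln pi(a|s) - Q0(s, a)), averaged over the batch. A minimal sketch of the value-target computation with made-up numbers (one state, two actions), reusing the torch import from the first cell:

alpha = 0.02
prob = torch.tensor([[0.7, 0.3]])  # pi(a|s), made-up action probabilities
q01 = torch.tensor([[1.0, 2.0]])   # elementwise min of the two Q estimates
v_target = (prob * (q01 - alpha * torch.log(prob))).sum(dim=-1, keepdim=True)
# roughly 1.31: the expected action value plus a small entropy bonus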
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps


logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
            mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > 200:
        break
plt.plot(episode_rewards)


logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
06:44:33 [INFO] ==== train ====
06:44:33 [INFO] train episode 0: reward = -119.74, steps = 100
06:44:33 [INFO] train episode 1: reward = -53.41, steps = 63
06:44:33 [INFO] train episode 2: reward = -136.45, steps = 87
06:44:33 [INFO] train episode 3: reward = -326.87, steps = 70
06:44:33 [INFO] train episode 4: reward = -117.95, steps = 113
06:44:34 [INFO] train episode 5: reward = -431.55, steps = 99
06:44:37 [INFO] train episode 6: reward = -344.16, steps = 79
06:44:40 [INFO] train episode 7: reward = -124.00, steps = 73
06:44:43 [INFO] train episode 8: reward = -504.88, steps = 85
06:44:46 [INFO] train episode 9: reward = -453.93, steps = 79
06:44:48 [INFO] train episode 10: reward = -347.43, steps = 71
06:44:51 [INFO] train episode 11: reward = -244.51, steps = 73
06:44:54 [INFO] train episode 12: reward = -291.91, steps = 73
06:44:56 [INFO] train episode 13: reward = -398.81, steps = 73
06:44:59 [INFO] train episode 14: reward = -281.87, steps = 68
06:45:01 [INFO] train episode 15: reward = -262.03, steps = 53
06:45:03 [INFO] train episode 16: reward = -213.43, steps = 61
06:45:05 [INFO] train episode 17: reward = -308.40, steps = 56
06:45:08 [INFO] train episode 18: reward = -242.01, steps = 76
06:45:13 [INFO] train episode 19: reward = -353.46, steps = 113
06:45:15 [INFO] train episode 20: reward = -209.88, steps = 75
06:45:19 [INFO] train episode 21: reward = -308.92, steps = 98
06:45:22 [INFO] train episode 22: reward = 9.74, steps = 87
06:45:37 [INFO] train episode 23: reward = -520.48, steps = 412
06:45:46 [INFO] train episode 24: reward = -341.28, steps = 229
06:45:51 [INFO] train episode 25: reward = -59.83, steps = 153
06:45:55 [INFO] train episode 26: reward = -122.98, steps = 110
06:46:20 [INFO] train episode 27: reward = -646.65, steps = 668
06:46:55 [INFO] train episode 28: reward = -480.28, steps = 920
06:47:08 [INFO] train episode 29: reward = -272.43, steps = 357
06:47:15 [INFO] train episode 30: reward = -305.03, steps = 173
06:47:28 [INFO] train episode 31: reward = -199.98, steps = 368
06:47:40 [INFO] train episode 32: reward = -124.04, steps = 320
06:47:52 [INFO] train episode 33: reward = -118.74, steps = 317
06:48:04 [INFO] train episode 34: reward = -215.16, steps = 338
06:48:17 [INFO] train episode 35: reward = -232.50, steps = 356
06:48:43 [INFO] train episode 36: reward = -297.74, steps = 693
06:49:02 [INFO] train episode 37: reward = -262.43, steps = 496
06:49:15 [INFO] train episode 38: reward = -269.07, steps = 345
06:49:22 [INFO] train episode 39: reward = -190.48, steps = 191
06:49:31 [INFO] train episode 40: reward = -206.82, steps = 237
06:49:42 [INFO] train episode 41: reward = -196.88, steps = 315
06:49:50 [INFO] train episode 42: reward = -147.69, steps = 200
06:49:58 [INFO] train episode 43: reward = -210.92, steps = 239
06:50:06 [INFO] train episode 44: reward = -169.53, steps = 197
06:50:37 [INFO] train episode 45: reward = -320.28, steps = 840
06:50:48 [INFO] train episode 46: reward = -248.61, steps = 299
06:50:58 [INFO] train episode 47: reward = -239.37, steps = 265
06:51:04 [INFO] train episode 48: reward = -176.54, steps = 167
06:51:36 [INFO] train episode 49: reward = -311.47, steps = 820
06:51:45 [INFO] train episode 50: reward = -147.75, steps = 261
06:52:03 [INFO] train episode 51: reward = -241.43, steps = 471
06:52:10 [INFO] train episode 52: reward = -152.05, steps = 211
06:52:27 [INFO] train episode 53: reward = -226.51, steps = 439
06:52:34 [INFO] train episode 54: reward = -150.29, steps = 188
06:52:44 [INFO] train episode 55: reward = -191.25, steps = 277
06:52:54 [INFO] train episode 56: reward = -156.33, steps = 275
06:53:08 [INFO] train episode 57: reward = -204.69, steps = 385
06:53:19 [INFO] train episode 58: reward = -180.31, steps = 297
06:53:40 [INFO] train episode 59: reward = -190.51, steps = 571
06:53:54 [INFO] train episode 60: reward = -179.89, steps = 364
06:54:16 [INFO] train episode 61: reward = -224.54, steps = 595
06:54:41 [INFO] train episode 62: reward = -255.25, steps = 645
06:55:18 [INFO] train episode 63: reward = -282.85, steps = 984
06:55:36 [INFO] train episode 64: reward = -171.43, steps = 464
06:55:51 [INFO] train episode 65: reward = -202.57, steps = 399
06:56:26 [INFO] train episode 66: reward = -262.52, steps = 910
06:56:36 [INFO] train episode 67: reward = -103.61, steps = 277
06:56:45 [INFO] train episode 68: reward = -117.21, steps = 246
06:57:23 [INFO] train episode 69: reward = -157.12, steps = 1000
06:57:53 [INFO] train episode 70: reward = -251.57, steps = 792
06:58:30 [INFO] train episode 71: reward = -176.52, steps = 1000
06:58:43 [INFO] train episode 72: reward = -154.51, steps = 316
06:59:21 [INFO] train episode 73: reward = -125.18, steps = 1000
06:59:59 [INFO] train episode 74: reward = -91.01, steps = 1000
07:00:39 [INFO] train episode 75: reward = -152.42, steps = 1000
07:01:17 [INFO] train episode 76: reward = -136.96, steps = 1000
07:01:57 [INFO] train episode 77: reward = -155.50, steps = 1000
07:02:35 [INFO] train episode 78: reward = -92.97, steps = 1000
07:03:13 [INFO] train episode 79: reward = -69.89, steps = 1000
07:03:49 [INFO] train episode 80: reward = -103.41, steps = 1000
07:04:26 [INFO] train episode 81: reward = -79.75, steps = 1000
07:05:03 [INFO] train episode 82: reward = -105.82, steps = 1000
07:05:42 [INFO] train episode 83: reward = -136.08, steps = 1000
07:06:23 [INFO] train episode 84: reward = -119.86, steps = 1000
07:07:04 [INFO] train episode 85: reward = -76.90, steps = 1000
07:07:45 [INFO] train episode 86: reward = -97.73, steps = 1000
07:08:25 [INFO] train episode 87: reward = -146.23, steps = 1000
07:09:04 [INFO] train episode 88: reward = -120.31, steps = 1000
07:09:45 [INFO] train episode 89: reward = -113.89, steps = 1000
07:10:26 [INFO] train episode 90: reward = -106.02, steps = 1000
07:11:07 [INFO] train episode 91: reward = -19.21, steps = 1000
07:11:49 [INFO] train episode 92: reward = -116.32, steps = 1000
07:12:29 [INFO] train episode 93: reward = -57.89, steps = 1000
07:13:10 [INFO] train episode 94: reward = -92.17, steps = 1000
07:13:52 [INFO] train episode 95: reward = -122.81, steps = 1000
07:14:33 [INFO] train episode 96: reward = -144.50, steps = 1000
07:15:14 [INFO] train episode 97: reward = -87.08, steps = 1000
07:15:56 [INFO] train episode 98: reward = -125.38, steps = 1000
07:16:37 [INFO] train episode 99: reward = -64.51, steps = 1000
07:17:18 [INFO] train episode 100: reward = -111.63, steps = 1000
07:17:59 [INFO] train episode 101: reward = -52.33, steps = 1000
07:18:43 [INFO] train episode 102: reward = -126.03, steps = 1000
07:19:24 [INFO] train episode 103: reward = -125.11, steps = 1000
07:19:27 [INFO] train episode 104: reward = -205.00, steps = 76
07:20:09 [INFO] train episode 105: reward = -146.56, steps = 1000
07:20:52 [INFO] train episode 106: reward = -139.96, steps = 1000
07:21:32 [INFO] train episode 107: reward = -124.79, steps = 1000
07:22:12 [INFO] train episode 108: reward = -114.53, steps = 1000
07:22:52 [INFO] train episode 109: reward = -95.93, steps = 1000
07:23:35 [INFO] train episode 110: reward = -123.12, steps = 1000
07:24:18 [INFO] train episode 111: reward = -80.41, steps = 1000
07:25:00 [INFO] train episode 112: reward = -76.70, steps = 1000
07:25:44 [INFO] train episode 113: reward = -105.85, steps = 1000
07:26:29 [INFO] train episode 114: reward = -107.08, steps = 1000
07:27:12 [INFO] train episode 115: reward = -129.47, steps = 1000
07:27:57 [INFO] train episode 116: reward = -48.79, steps = 1000
07:28:43 [INFO] train episode 117: reward = -50.67, steps = 1000
07:29:29 [INFO] train episode 118: reward = -99.13, steps = 1000
07:30:15 [INFO] train episode 119: reward = -95.30, steps = 1000
07:31:00 [INFO] train episode 120: reward = -131.54, steps = 1000
07:31:22 [INFO] train episode 121: reward = 234.60, steps = 489
07:32:08 [INFO] train episode 122: reward = -134.29, steps = 1000
07:32:53 [INFO] train episode 123: reward = -63.59, steps = 1000
07:33:26 [INFO] train episode 124: reward = -80.91, steps = 784
07:34:11 [INFO] train episode 125: reward = -115.63, steps = 1000
07:34:54 [INFO] train episode 126: reward = -82.44, steps = 1000
07:35:40 [INFO] train episode 127: reward = -128.58, steps = 1000
07:36:24 [INFO] train episode 128: reward = -67.69, steps = 1000
07:37:09 [INFO] train episode 129: reward = -136.20, steps = 1000
07:37:54 [INFO] train episode 130: reward = -103.51, steps = 1000
07:38:40 [INFO] train episode 131: reward = -146.59, steps = 1000
07:39:25 [INFO] train episode 132: reward = -69.08, steps = 1000
07:40:10 [INFO] train episode 133: reward = -66.15, steps = 1000
07:40:56 [INFO] train episode 134: reward = -125.93, steps = 1000
07:41:43 [INFO] train episode 135: reward = -100.82, steps = 1000
07:42:31 [INFO] train episode 136: reward = -136.78, steps = 1000
07:42:40 [INFO] train episode 137: reward = 1.99, steps = 202
07:42:44 [INFO] train episode 138: reward = -411.09, steps = 92
07:43:29 [INFO] train episode 139: reward = -84.54, steps = 1000
07:44:13 [INFO] train episode 140: reward = -95.32, steps = 1000
07:45:00 [INFO] train episode 141: reward = -64.36, steps = 1000
07:45:46 [INFO] train episode 142: reward = -46.74, steps = 1000
07:46:32 [INFO] train episode 143: reward = -111.39, steps = 1000
07:47:18 [INFO] train episode 144: reward = -133.64, steps = 1000
07:48:05 [INFO] train episode 145: reward = -173.13, steps = 1000
07:48:59 [INFO] train episode 146: reward = -139.49, steps = 1000
07:49:54 [INFO] train episode 147: reward = -110.06, steps = 1000
07:50:42 [INFO] train episode 148: reward = -120.51, steps = 1000
07:51:03 [INFO] train episode 149: reward = -194.49, steps = 449
07:51:50 [INFO] train episode 150: reward = -91.71, steps = 1000
07:52:38 [INFO] train episode 151: reward = -109.07, steps = 1000
07:53:25 [INFO] train episode 152: reward = -120.29, steps = 1000
07:54:12 [INFO] train episode 153: reward = -77.45, steps = 1000
07:55:00 [INFO] train episode 154: reward = -106.42, steps = 1000
07:55:49 [INFO] train episode 155: reward = -135.87, steps = 1000
07:56:37 [INFO] train episode 156: reward = -87.26, steps = 1000
07:57:26 [INFO] train episode 157: reward = -37.32, steps = 1000
07:58:12 [INFO] train episode 158: reward = -108.85, steps = 1000
07:59:00 [INFO] train episode 159: reward = -92.93, steps = 1000
07:59:48 [INFO] train episode 160: reward = -51.78, steps = 1000
08:00:36 [INFO] train episode 161: reward = -94.08, steps = 1000
08:01:29 [INFO] train episode 162: reward = -65.62, steps = 1000
08:02:24 [INFO] train episode 163: reward = -96.11, steps = 1000
08:03:19 [INFO] train episode 164: reward = -72.80, steps = 1000
08:04:07 [INFO] train episode 165: reward = -90.31, steps = 1000
08:04:58 [INFO] train episode 166: reward = -103.97, steps = 1000
08:05:46 [INFO] train episode 167: reward = -110.74, steps = 1000
08:06:34 [INFO] train episode 168: reward = -87.47, steps = 1000
08:07:22 [INFO] train episode 169: reward = -110.01, steps = 1000
08:08:10 [INFO] train episode 170: reward = -72.90, steps = 1000
08:08:57 [INFO] train episode 171: reward = -102.21, steps = 1000
08:09:43 [INFO] train episode 172: reward = -78.22, steps = 1000
08:10:30 [INFO] train episode 173: reward = -134.52, steps = 1000
08:11:17 [INFO] train episode 174: reward = -112.33, steps = 1000
08:12:06 [INFO] train episode 175: reward = -124.90, steps = 1000
08:12:54 [INFO] train episode 176: reward = -104.53, steps = 1000
08:13:41 [INFO] train episode 177: reward = -104.41, steps = 1000
08:13:46 [INFO] train episode 178: reward = -120.70, steps = 117
08:14:34 [INFO] train episode 179: reward = -116.06, steps = 1000
08:15:22 [INFO] train episode 180: reward = -109.05, steps = 1000
08:16:13 [INFO] train episode 181: reward = -138.38, steps = 1000
08:16:57 [INFO] train episode 182: reward = -180.98, steps = 872
08:17:43 [INFO] train episode 183: reward = -121.58, steps = 1000
08:18:31 [INFO] train episode 184: reward = -92.83, steps = 1000
08:19:18 [INFO] train episode 185: reward = -114.65, steps = 1000
09:45:10 [INFO] train episode 186: reward = -91.63, steps = 1000
09:46:00 [INFO] train episode 187: reward = -51.81, steps = 1000
09:46:51 [INFO] train episode 188: reward = -111.85, steps = 1000
09:47:41 [INFO] train episode 189: reward = -86.58, steps = 1000
09:48:33 [INFO] train episode 190: reward = -103.56, steps = 1000
09:49:23 [INFO] train episode 191: reward = -59.68, steps = 1000
09:50:17 [INFO] train episode 192: reward = -103.80, steps = 1000
09:51:12 [INFO] train episode 193: reward = -87.74, steps = 1000
09:52:01 [INFO] train episode 194: reward = -102.89, steps = 1000
09:52:50 [INFO] train episode 195: reward = -71.90, steps = 1000
09:53:40 [INFO] train episode 196: reward = -100.19, steps = 1000
09:54:33 [INFO] train episode 197: reward = -111.32, steps = 1000
09:55:27 [INFO] train episode 198: reward = -130.02, steps = 1000
09:56:19 [INFO] train episode 199: reward = -102.46, steps = 1000
09:57:10 [INFO] train episode 200: reward = -94.77, steps = 1000
09:58:00 [INFO] train episode 201: reward = -95.71, steps = 1000
09:58:50 [INFO] train episode 202: reward = -130.00, steps = 1000
09:59:42 [INFO] train episode 203: reward = -90.24, steps = 1000
10:00:35 [INFO] train episode 204: reward = -56.69, steps = 1000
10:01:24 [INFO] train episode 205: reward = -127.45, steps = 1000
10:02:13 [INFO] train episode 206: reward = -159.15, steps = 1000
10:03:05 [INFO] train episode 207: reward = -89.27, steps = 1000
10:03:57 [INFO] train episode 208: reward = -143.35, steps = 1000
10:04:02 [INFO] train episode 209: reward = -152.49, steps = 107
10:04:15 [INFO] train episode 210: reward = -218.52, steps = 281
10:04:23 [INFO] train episode 211: reward = -132.44, steps = 148
10:05:13 [INFO] train episode 212: reward = -112.33, steps = 1000
10:05:21 [INFO] train episode 213: reward = -190.35, steps = 173
10:05:34 [INFO] train episode 214: reward = -192.30, steps = 250
10:05:43 [INFO] train episode 215: reward = -372.53, steps = 185
10:05:51 [INFO] train episode 216: reward = -448.11, steps = 173
10:05:59 [INFO] train episode 217: reward = -514.03, steps = 155
10:06:04 [INFO] train episode 218: reward = -331.05, steps = 102
10:06:13 [INFO] train episode 219: reward = -355.23, steps = 180
10:06:21 [INFO] train episode 220: reward = -315.29, steps = 164
10:06:27 [INFO] train episode 221: reward = -334.99, steps = 110
10:06:32 [INFO] train episode 222: reward = -338.17, steps = 95
10:06:39 [INFO] train episode 223: reward = -239.06, steps = 134
10:07:33 [INFO] train episode 224: reward = -235.71, steps = 1000
10:07:47 [INFO] train episode 225: reward = -221.32, steps = 285
10:07:55 [INFO] train episode 226: reward = -21.88, steps = 152
10:08:01 [INFO] train episode 227: reward = -191.53, steps = 137
10:08:08 [INFO] train episode 228: reward = -287.15, steps = 146
10:08:12 [INFO] train episode 229: reward = -140.63, steps = 77
10:08:16 [INFO] train episode 230: reward = -123.00, steps = 69
10:08:22 [INFO] train episode 231: reward = -153.94, steps = 135
10:08:35 [INFO] train episode 232: reward = -123.91, steps = 257
10:08:38 [INFO] train episode 233: reward = -85.80, steps = 65
10:08:43 [INFO] train episode 234: reward = -52.43, steps = 95
10:08:51 [INFO] train episode 235: reward = -237.58, steps = 111
10:09:23 [INFO] train episode 236: reward = -77.69, steps = 620
10:09:26 [INFO] train episode 237: reward = -143.91, steps = 59
10:09:52 [INFO] train episode 238: reward = -178.96, steps = 569
10:09:54 [INFO] train episode 239: reward = -57.07, steps = 65
10:10:21 [INFO] train episode 240: reward = 226.20, steps = 582
10:10:26 [INFO] train episode 241: reward = -263.10, steps = 109
10:10:44 [INFO] train episode 242: reward = -118.00, steps = 407
10:10:49 [INFO] train episode 243: reward = -145.33, steps = 111
10:11:19 [INFO] train episode 244: reward = -276.73, steps = 660
10:11:22 [INFO] train episode 245: reward = -78.30, steps = 75
10:11:27 [INFO] train episode 246: reward = -175.75, steps = 113
10:11:46 [INFO] train episode 247: reward = -202.39, steps = 429
10:11:56 [INFO] train episode 248: reward = -87.05, steps = 212
10:11:59 [INFO] train episode 249: reward = -105.88, steps = 76
10:12:03 [INFO] train episode 250: reward = 24.85, steps = 99
10:12:06 [INFO] train episode 251: reward = -31.90, steps = 69
10:12:10 [INFO] train episode 252: reward = -98.10, steps = 88
10:12:13 [INFO] train episode 253: reward = -65.26, steps = 68
10:12:23 [INFO] train episode 254: reward = -47.99, steps = 214
10:12:26 [INFO] train episode 255: reward = -66.61, steps = 78
10:12:30 [INFO] train episode 256: reward = -55.35, steps = 96
10:12:43 [INFO] train episode 257: reward = -108.98, steps = 287
10:12:53 [INFO] train episode 258: reward = -243.54, steps = 228
10:12:58 [INFO] train episode 259: reward = -61.50, steps = 113
10:13:04 [INFO] train episode 260: reward = 6.30, steps = 146
10:13:07 [INFO] train episode 261: reward = -63.49, steps = 69
10:13:12 [INFO] train episode 262: reward = -27.91, steps = 110
10:13:16 [INFO] train episode 263: reward = -28.44, steps = 86
10:13:21 [INFO] train episode 264: reward = -73.21, steps = 125
10:13:33 [INFO] train episode 265: reward = -90.64, steps = 267
10:13:37 [INFO] train episode 266: reward = -71.93, steps = 106
10:13:41 [INFO] train episode 267: reward = -80.66, steps = 83
10:13:46 [INFO] train episode 268: reward = -254.02, steps = 119
10:13:51 [INFO] train episode 269: reward = -175.93, steps = 106
10:13:55 [INFO] train episode 270: reward = -31.64, steps = 101
10:14:00 [INFO] train episode 271: reward = 39.31, steps = 122
10:14:04 [INFO] train episode 272: reward = -96.64, steps = 87
10:14:08 [INFO] train episode 273: reward = -126.36, steps = 98
10:14:11 [INFO] train episode 274: reward = -93.53, steps = 58
10:14:16 [INFO] train episode 275: reward = -29.92, steps = 123
10:14:21 [INFO] train episode 276: reward = -195.91, steps = 105
10:14:25 [INFO] train episode 277: reward = -59.00, steps = 94
10:14:28 [INFO] train episode 278: reward = -69.29, steps = 60
10:14:33 [INFO] train episode 279: reward = -79.33, steps = 115
10:14:39 [INFO] train episode 280: reward = -46.21, steps = 131
10:14:42 [INFO] train episode 281: reward = -41.76, steps = 78
10:14:49 [INFO] train episode 282: reward = -81.62, steps = 167
10:14:56 [INFO] train episode 283: reward = -61.45, steps = 136
10:15:00 [INFO] train episode 284: reward = -83.56, steps = 114
10:15:08 [INFO] train episode 285: reward = -45.95, steps = 182
10:15:13 [INFO] train episode 286: reward = -231.50, steps = 100
10:15:19 [INFO] train episode 287: reward = 2.56, steps = 142
10:15:23 [INFO] train episode 288: reward = -164.72, steps = 91
10:15:28 [INFO] train episode 289: reward = -269.13, steps = 110
10:15:33 [INFO] train episode 290: reward = -187.57, steps = 92
10:15:37 [INFO] train episode 291: reward = -90.53, steps = 90
10:15:44 [INFO] train episode 292: reward = -29.03, steps = 156
10:15:50 [INFO] train episode 293: reward = -79.58, steps = 148
10:15:56 [INFO] train episode 294: reward = -113.31, steps = 134
10:16:01 [INFO] train episode 295: reward = -282.00, steps = 113
10:16:10 [INFO] train episode 296: reward = -286.56, steps = 206
10:16:14 [INFO] train episode 297: reward = -117.19, steps = 98
10:16:20 [INFO] train episode 298: reward = -188.22, steps = 129
10:16:24 [INFO] train episode 299: reward = -178.26, steps = 106
10:16:32 [INFO] train episode 300: reward = -116.22, steps = 167
10:16:36 [INFO] train episode 301: reward = -125.04, steps = 103
10:16:41 [INFO] train episode 302: reward = -100.87, steps = 107
10:16:46 [INFO] train episode 303: reward = -219.16, steps = 108
10:16:53 [INFO] train episode 304: reward = -172.78, steps = 164
10:16:57 [INFO] train episode 305: reward = -224.83, steps = 105
10:17:03 [INFO] train episode 306: reward = 37.84, steps = 120
10:17:12 [INFO] train episode 307: reward = -197.55, steps = 225
10:17:20 [INFO] train episode 308: reward = -244.53, steps = 167
10:17:24 [INFO] train episode 309: reward = 1.84, steps = 98
10:17:28 [INFO] train episode 310: reward = -38.25, steps = 92
10:18:02 [INFO] train episode 311: reward = 99.52, steps = 764
10:18:09 [INFO] train episode 312: reward = -317.85, steps = 157
10:18:15 [INFO] train episode 313: reward = -43.25, steps = 151
10:18:20 [INFO] train episode 314: reward = -31.94, steps = 105
10:18:27 [INFO] train episode 315: reward = -318.80, steps = 161
10:18:35 [INFO] train episode 316: reward = -293.41, steps = 178
10:18:40 [INFO] train episode 317: reward = 19.58, steps = 114
10:18:46 [INFO] train episode 318: reward = -267.28, steps = 152
10:18:53 [INFO] train episode 319: reward = -305.29, steps = 161
10:18:59 [INFO] train episode 320: reward = -165.88, steps = 137
10:19:08 [INFO] train episode 321: reward = -142.77, steps = 203
10:19:24 [INFO] train episode 322: reward = -253.69, steps = 359
10:19:35 [INFO] train episode 323: reward = -215.88, steps = 256
10:19:41 [INFO] train episode 324: reward = -53.77, steps = 139
10:20:26 [INFO] train episode 325: reward = 43.81, steps = 1000
10:20:31 [INFO] train episode 326: reward = 18.88, steps = 126
10:20:47 [INFO] train episode 327: reward = -233.37, steps = 362
10:21:03 [INFO] train episode 328: reward = -96.78, steps = 340
10:21:15 [INFO] train episode 329: reward = -173.47, steps = 268
10:21:34 [INFO] train episode 330: reward = -55.85, steps = 442
10:21:46 [INFO] train episode 331: reward = -61.28, steps = 272
10:22:05 [INFO] train episode 332: reward = -154.27, steps = 408
10:22:22 [INFO] train episode 333: reward = -54.09, steps = 370
10:22:31 [INFO] train episode 334: reward = -12.82, steps = 178
10:22:38 [INFO] train episode 335: reward = -37.68, steps = 162
10:23:24 [INFO] train episode 336: reward = 27.19, steps = 1000
10:23:36 [INFO] train episode 337: reward = -49.52, steps = 266
10:24:19 [INFO] train episode 338: reward = 91.25, steps = 937
10:25:05 [INFO] train episode 339: reward = -29.73, steps = 1000
10:25:50 [INFO] train episode 340: reward = -5.60, steps = 1000
10:26:37 [INFO] train episode 341: reward = -36.03, steps = 1000
10:27:29 [INFO] train episode 342: reward = -87.34, steps = 1000
10:28:18 [INFO] train episode 343: reward = -40.44, steps = 1000
10:28:26 [INFO] train episode 344: reward = -178.64, steps = 180
10:29:11 [INFO] train episode 345: reward = 1.81, steps = 1000
10:29:56 [INFO] train episode 346: reward = 8.41, steps = 1000
10:30:42 [INFO] train episode 347: reward = -37.85, steps = 1000
10:31:30 [INFO] train episode 348: reward = -59.51, steps = 1000
10:32:17 [INFO] train episode 349: reward = -48.58, steps = 1000
10:33:07 [INFO] train episode 350: reward = 19.01, steps = 1000
10:34:03 [INFO] train episode 351: reward = 20.37, steps = 1000
10:35:00 [INFO] train episode 352: reward = -36.50, steps = 1000
10:35:54 [INFO] train episode 353: reward = 6.67, steps = 1000
10:36:49 [INFO] train episode 354: reward = 15.42, steps = 1000
10:37:46 [INFO] train episode 355: reward = -3.12, steps = 1000
10:38:42 [INFO] train episode 356: reward = 12.33, steps = 1000
10:39:41 [INFO] train episode 357: reward = -276.81, steps = 999
10:40:36 [INFO] train episode 358: reward = -29.50, steps = 1000
10:41:34 [INFO] train episode 359: reward = -6.15, steps = 1000
10:42:29 [INFO] train episode 360: reward = 11.12, steps = 1000
10:43:26 [INFO] train episode 361: reward = -46.64, steps = 1000
10:44:21 [INFO] train episode 362: reward = 20.21, steps = 1000
10:45:17 [INFO] train episode 363: reward = -21.80, steps = 1000
10:46:07 [INFO] train episode 364: reward = 4.08, steps = 1000
10:46:58 [INFO] train episode 365: reward = 25.01, steps = 1000
10:47:51 [INFO] train episode 366: reward = -11.14, steps = 1000
10:48:43 [INFO] train episode 367: reward = -5.90, steps = 1000
10:49:34 [INFO] train episode 368: reward = 152.86, steps = 942
10:50:27 [INFO] train episode 369: reward = 98.29, steps = 982
10:51:17 [INFO] train episode 370: reward = 6.46, steps = 1000
10:52:10 [INFO] train episode 371: reward = -4.99, steps = 1000
10:53:02 [INFO] train episode 372: reward = 59.76, steps = 1000
10:53:55 [INFO] train episode 373: reward = 26.03, steps = 1000
10:54:49 [INFO] train episode 374: reward = -128.53, steps = 1000
10:55:42 [INFO] train episode 375: reward = 38.01, steps = 1000
10:56:35 [INFO] train episode 376: reward = 13.31, steps = 1000
10:57:25 [INFO] train episode 377: reward = 41.84, steps = 1000
10:58:05 [INFO] train episode 378: reward = 162.93, steps = 769
10:58:50 [INFO] train episode 379: reward = 115.39, steps = 884
10:59:29 [INFO] train episode 380: reward = 170.95, steps = 732
11:00:13 [INFO] train episode 381: reward = 127.19, steps = 843
11:00:31 [INFO] train episode 382: reward = 293.36, steps = 344
11:01:09 [INFO] train episode 383: reward = 176.53, steps = 734
11:01:47 [INFO] train episode 384: reward = 172.70, steps = 766
11:02:17 [INFO] train episode 385: reward = 202.70, steps = 606
11:02:45 [INFO] train episode 386: reward = 216.61, steps = 529
11:03:13 [INFO] train episode 387: reward = 198.19, steps = 549
11:04:02 [INFO] train episode 388: reward = 145.91, steps = 928
11:04:34 [INFO] train episode 389: reward = 181.54, steps = 575
11:05:01 [INFO] train episode 390: reward = 241.98, steps = 487
11:05:25 [INFO] train episode 391: reward = 248.55, steps = 414
11:05:25 [INFO] ==== test ====
11:05:26 [INFO] test episode 0: reward = 277.28, steps = 418
11:05:27 [INFO] test episode 1: reward = 233.27, steps = 401
11:05:28 [INFO] test episode 2: reward = 228.74, steps = 514
11:05:29 [INFO] test episode 3: reward = 225.76, steps = 503
11:05:30 [INFO] test episode 4: reward = 249.42, steps = 523
11:05:31 [INFO] test episode 5: reward = 221.33, steps = 464
11:05:32 [INFO] test episode 6: reward = 213.58, steps = 434
11:05:33 [INFO] test episode 7: reward = 259.34, steps = 451
11:05:34 [INFO] test episode 8: reward = 215.36, steps = 584
11:05:35 [INFO] test episode 9: reward = 234.53, steps = 401
11:05:36 [INFO] test episode 10: reward = 218.23, steps = 669
11:05:37 [INFO] test episode 11: reward = 263.28, steps = 464
11:05:38 [INFO] test episode 12: reward = 206.59, steps = 458
11:05:39 [INFO] test episode 13: reward = 263.12, steps = 371
11:05:40 [INFO] test episode 14: reward = 234.53, steps = 449
11:05:40 [INFO] test episode 15: reward = 254.30, steps = 375
11:05:41 [INFO] test episode 16: reward = 224.86, steps = 513
11:05:42 [INFO] test episode 17: reward = 214.34, steps = 489
11:05:43 [INFO] test episode 18: reward = 234.87, steps = 418
11:05:44 [INFO] test episode 19: reward = 208.02, steps = 429
11:05:45 [INFO] test episode 20: reward = 273.05, steps = 384
11:05:46 [INFO] test episode 21: reward = 209.81, steps = 502
11:05:47 [INFO] test episode 22: reward = 221.46, steps = 690
11:05:48 [INFO] test episode 23: reward = 211.70, steps = 624
11:05:49 [INFO] test episode 24: reward = 221.82, steps = 379
11:05:50 [INFO] test episode 25: reward = 211.89, steps = 598
11:05:52 [INFO] test episode 26: reward = 214.95, steps = 489
11:05:52 [INFO] test episode 27: reward = 255.48, steps = 374
11:05:54 [INFO] test episode 28: reward = 222.00, steps = 506
11:05:54 [INFO] test episode 29: reward = 234.99, steps = 420
11:05:55 [INFO] test episode 30: reward = 270.02, steps = 358
11:05:56 [INFO] test episode 31: reward = 274.00, steps = 369
11:05:56 [INFO] test episode 32: reward = 197.17, steps = 421
11:05:57 [INFO] test episode 33: reward = 265.64, steps = 493
11:05:58 [INFO] test episode 34: reward = 242.44, steps = 423
11:06:00 [INFO] test episode 35: reward = 218.41, steps = 621
11:06:01 [INFO] test episode 36: reward = 193.90, steps = 437
11:06:01 [INFO] test episode 37: reward = 242.15, steps = 433
11:06:02 [INFO] test episode 38: reward = 253.12, steps = 418
11:06:04 [INFO] test episode 39: reward = 219.34, steps = 561
11:06:05 [INFO] test episode 40: reward = 248.94, steps = 462
11:06:05 [INFO] test episode 41: reward = 254.59, steps = 400
11:06:06 [INFO] test episode 42: reward = 204.38, steps = 464
11:06:07 [INFO] test episode 43: reward = 227.20, steps = 411
11:06:08 [INFO] test episode 44: reward = 246.14, steps = 426
11:06:09 [INFO] test episode 45: reward = 240.90, steps = 406
11:06:10 [INFO] test episode 46: reward = 221.85, steps = 638
11:06:11 [INFO] test episode 47: reward = 223.06, steps = 426
11:06:12 [INFO] test episode 48: reward = 266.92, steps = 477
11:06:13 [INFO] test episode 49: reward = 241.36, steps = 537
11:06:14 [INFO] test episode 50: reward = 213.25, steps = 392
11:06:15 [INFO] test episode 51: reward = 238.72, steps = 432
11:06:15 [INFO] test episode 52: reward = 270.38, steps = 400
11:06:16 [INFO] test episode 53: reward = 231.75, steps = 385
11:06:17 [INFO] test episode 54: reward = 247.63, steps = 424
11:06:18 [INFO] test episode 55: reward = 231.26, steps = 412
11:06:19 [INFO] test episode 56: reward = 202.18, steps = 403
11:06:19 [INFO] test episode 57: reward = 212.22, steps = 447
11:06:20 [INFO] test episode 58: reward = 261.40, steps = 449
11:06:21 [INFO] test episode 59: reward = 224.69, steps = 450
11:06:22 [INFO] test episode 60: reward = 264.63, steps = 427
11:06:23 [INFO] test episode 61: reward = 231.54, steps = 480
11:06:24 [INFO] test episode 62: reward = 237.32, steps = 425
11:06:25 [INFO] test episode 63: reward = 229.52, steps = 435
11:06:25 [INFO] test episode 64: reward = 257.12, steps = 421
11:06:26 [INFO] test episode 65: reward = 228.47, steps = 444
11:06:27 [INFO] test episode 66: reward = 194.83, steps = 432
11:06:28 [INFO] test episode 67: reward = 210.60, steps = 504
11:06:30 [INFO] test episode 68: reward = 216.76, steps = 672
11:06:31 [INFO] test episode 69: reward = 258.72, steps = 449
11:06:32 [INFO] test episode 70: reward = 232.74, steps = 385
11:06:33 [INFO] test episode 71: reward = 258.31, steps = 448
11:06:34 [INFO] test episode 72: reward = 250.08, steps = 492
11:06:35 [INFO] test episode 73: reward = 207.09, steps = 453
11:06:36 [INFO] test episode 74: reward = 239.58, steps = 494
11:06:36 [INFO] test episode 75: reward = 283.37, steps = 398
11:06:37 [INFO] test episode 76: reward = 273.23, steps = 406
11:06:38 [INFO] test episode 77: reward = 199.14, steps = 437
11:06:39 [INFO] test episode 78: reward = 206.96, steps = 453
11:06:40 [INFO] test episode 79: reward = 232.77, steps = 527
11:06:41 [INFO] test episode 80: reward = 262.82, steps = 414
11:06:42 [INFO] test episode 81: reward = 238.25, steps = 417
11:06:42 [INFO] test episode 82: reward = 213.53, steps = 486
11:06:43 [INFO] test episode 83: reward = 276.50, steps = 429
11:06:44 [INFO] test episode 84: reward = 221.97, steps = 602
11:06:46 [INFO] test episode 85: reward = 210.29, steps = 722
11:06:46 [INFO] test episode 86: reward = 248.47, steps = 399
11:06:48 [INFO] test episode 87: reward = 218.88, steps = 714
11:06:49 [INFO] test episode 88: reward = 197.18, steps = 432
11:06:50 [INFO] test episode 89: reward = 229.81, steps = 405
11:06:51 [INFO] test episode 90: reward = 228.87, steps = 470
11:06:52 [INFO] test episode 91: reward = 243.35, steps = 437
11:06:53 [INFO] test episode 92: reward = 227.23, steps = 441
11:06:54 [INFO] test episode 93: reward = 252.32, steps = 409
11:06:55 [INFO] test episode 94: reward = 251.28, steps = 409
11:06:56 [INFO] test episode 95: reward = 210.83, steps = 484
11:06:57 [INFO] test episode 96: reward = 222.87, steps = 490
11:06:58 [INFO] test episode 97: reward = 209.81, steps = 603
11:06:59 [INFO] test episode 98: reward = 277.47, steps = 399
11:07:00 [INFO] test episode 99: reward = 217.30, steps = 448
11:07:00 [INFO] average episode reward = 234.15 ± 22.26
In [6]:
env.close()