PyTorch version
%matplotlib inline
import sys
import logging
import itertools
import copy
import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
torch.manual_seed(0)
import torch.nn as nn
import torch.optim as optim
import torch.distributions as distributions
# Send INFO-level (and above) log records to stdout, prefixed with a
# wall-clock timestamp and the level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s [%(levelname)s] %(message)s',
)

# Create the environment, then log every instance attribute of the
# environment object and of its spec for reference.
env = gym.make('LunarLander-v2')
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
06:44:32 [INFO] env: <LunarLander<LunarLander-v2>> 06:44:32 [INFO] action_space: Discrete(4) 06:44:32 [INFO] observation_space: Box(-inf, inf, (8,), float32) 06:44:32 [INFO] reward_range: (-inf, inf) 06:44:32 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50} 06:44:32 [INFO] _max_episode_steps: 1000 06:44:32 [INFO] _elapsed_steps: None 06:44:32 [INFO] id: LunarLander-v2 06:44:32 [INFO] entry_point: gym.envs.box2d:LunarLander 06:44:32 [INFO] reward_threshold: 200 06:44:32 [INFO] nondeterministic: False 06:44:32 [INFO] max_episode_steps: 1000 06:44:32 [INFO] _kwargs: {} 06:44:32 [INFO] _env_name: LunarLander
class DQNReplayer:
    """Fixed-capacity experience replay buffer backed by a pandas DataFrame.

    Transitions are written into a ring of ``capacity`` rows; once the ring
    is full the oldest row is overwritten.

    Attributes:
        memory: DataFrame with one row per slot and the columns
            ``state``, ``action``, ``reward``, ``next_state``, ``terminated``.
        i: index of the row the next ``store`` call writes to.
        count: number of rows that currently hold a transition.
        capacity: total number of rows.
    """

    def __init__(self, capacity):
        """Allocate an empty buffer with ``capacity`` rows."""
        fields = ['state', 'action', 'reward', 'next_state', 'terminated']
        self.memory = pd.DataFrame(index=range(capacity), columns=fields)
        self.i = 0
        self.count = 0
        self.capacity = capacity

    def store(self, *args):
        """Write one transition into the current slot and advance the ring.

        Args:
            *args: the field values, in column order
                (state, action, reward, next_state, terminated).
        """
        # dtype=object keeps array-valued fields (states) as single cells.
        row = np.asarray(args, dtype=object)
        self.memory.loc[self.i] = row
        self.i = (self.i + 1) % self.capacity
        if self.count < self.capacity:
            self.count += 1

    def sample(self, size):
        """Draw ``size`` stored transitions uniformly, with replacement.

        Returns:
            A generator yielding one stacked numpy array per column, in
            column order.
        """
        indices = np.random.choice(self.count, size=size)
        columns = self.memory.columns
        return (np.stack(self.memory.loc[indices, name]) for name in columns)
class SACAgent:
    """Soft actor-critic agent for an environment with discrete actions.

    The agent holds a softmax actor, a state-value (V) critic with a slowly
    tracking target copy, and two action-value (Q) critics whose element-wise
    minimum is used when building the V target. Transitions collected in
    ``'train'`` mode are stored in a replay buffer, and one learning step is
    run per environment step once the buffer holds enough transitions.
    """

    def __init__(self, env, *, gamma=0.99, alpha=0.02,
                 replayer_capacity=10000, batch_size=128, learn_start=500,
                 lr=3e-4):
        """Create the networks, optimizers and replay buffer.

        Args:
            env: environment exposing ``observation_space.shape`` and
                ``action_space.n``.
            gamma: discount factor used in the Q target.
            alpha: weight of the entropy term.
            replayer_capacity: number of transitions the replay buffer holds.
            batch_size: number of transitions sampled per learning step.
            learn_start: minimum number of stored transitions before
                learning steps are run.
            lr: learning rate shared by all Adam optimizers.
        """
        state_dim = env.observation_space.shape[0]
        self.action_n = env.action_space.n
        self.gamma = gamma
        self.alpha = alpha
        self.batch_size = batch_size
        self.learn_start = learn_start
        self.replayer = DQNReplayer(replayer_capacity)

        # create actor
        self.actor_net = self.build_net(
            input_size=state_dim, hidden_sizes=[256, 256],
            output_size=self.action_n, output_activator=nn.Softmax(-1))
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=lr)

        # create V critic
        self.v_evaluate_net = self.build_net(
            input_size=state_dim, hidden_sizes=[256, 256])
        self.v_target_net = copy.deepcopy(self.v_evaluate_net)
        self.v_optimizer = optim.Adam(self.v_evaluate_net.parameters(), lr=lr)
        self.v_loss = nn.MSELoss()

        # create Q critic
        self.q0_net = self.build_net(
            input_size=state_dim, hidden_sizes=[256, 256],
            output_size=self.action_n)
        self.q1_net = self.build_net(
            input_size=state_dim, hidden_sizes=[256, 256],
            output_size=self.action_n)
        self.q0_loss = nn.MSELoss()
        self.q1_loss = nn.MSELoss()
        self.q0_optimizer = optim.Adam(self.q0_net.parameters(), lr=lr)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=lr)

    def build_net(self, input_size, hidden_sizes, output_size=1,
                  output_activator=None):
        """Build a fully connected network with ReLU between linear layers.

        Args:
            input_size: number of input features.
            hidden_sizes: sequence with the width of each hidden layer.
            output_size: number of output features.
            output_activator: optional module appended after the last linear
                layer.

        Returns:
            An ``nn.Sequential`` of the layers.
        """
        widths = [input_size] + list(hidden_sizes) + [output_size]
        layers = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        layers.pop()  # no ReLU after the output layer
        # Compare against None explicitly: a module's truthiness may depend
        # on its length (e.g. an empty nn.Sequential is falsy).
        if output_activator is not None:
            layers.append(output_activator)
        return nn.Sequential(*layers)

    def reset(self, mode=None):
        """Start a new episode; ``mode='train'`` enables data collection."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        """Sample an action for ``observation`` from the actor.

        In ``'train'`` mode the call also records the step, stores the
        transition that the new observation completes, and runs a learning
        step once the replay buffer holds at least ``learn_start``
        transitions.

        Args:
            observation: current observation.
            reward: reward received for the previous action.
            terminated: whether the episode terminated at this observation.

        Returns:
            The action sampled for ``observation``.
        """
        state_tensor = torch.as_tensor(
            observation, dtype=torch.float).unsqueeze(0)
        # Acting needs no gradients.
        with torch.no_grad():
            prob_tensor = self.actor_net(state_tensor)
        action_tensor = distributions.Categorical(prob_tensor).sample()
        action = action_tensor.numpy()[0]
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # Unpack into separate names: reusing ``action`` here would
                # overwrite the freshly sampled action with the previous one
                # and make this method return a stale action.
                (prev_state, _, _, prev_action,
                 next_state, last_reward, last_terminated, _) = \
                    self.trajectory[-8:]
                self.replayer.store(prev_state, prev_action, last_reward,
                                    next_state, last_terminated)
            if self.replayer.count >= self.learn_start:
                self.learn()
        return action

    def close(self):
        """Finish an episode; nothing needs to be released."""
        pass

    def update_net(self, target_net, evaluate_net, learning_rate=0.0025):
        """Move ``target_net`` parameters a fraction toward ``evaluate_net``.

        Each target parameter becomes
        ``learning_rate * evaluate + (1 - learning_rate) * target``.
        """
        with torch.no_grad():
            for target_param, evaluate_param in zip(
                    target_net.parameters(), evaluate_net.parameters()):
                target_param.copy_(learning_rate * evaluate_param
                                   + (1 - learning_rate) * target_param)

    def _fit_q(self, net, loss_fn, optimizer, state_tensor, action_index,
               q_target_tensor):
        """Run one optimizer step of a Q net toward ``q_target_tensor``."""
        q_pred_tensor = torch.gather(net(state_tensor), 1, action_index)
        loss_tensor = loss_fn(q_pred_tensor, q_target_tensor)
        optimizer.zero_grad()
        loss_tensor.backward()
        optimizer.step()

    def learn(self):
        """Sample a batch and update the Q critics, V critic and actor."""
        states, actions, rewards, next_states, terminateds = \
            self.replayer.sample(self.batch_size)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)

        # update Q critic
        # The target is a constant for both Q nets, so build it without a
        # graph.
        with torch.no_grad():
            next_v_tensor = self.v_target_net(next_state_tensor)
            q_target_tensor = reward_tensor.unsqueeze(1) + self.gamma * \
                (1. - terminated_tensor.unsqueeze(1)) * next_v_tensor
        action_index = action_tensor.unsqueeze(1)
        self._fit_q(self.q0_net, self.q0_loss, self.q0_optimizer,
                    state_tensor, action_index, q_target_tensor)
        self._fit_q(self.q1_net, self.q1_loss, self.q1_optimizer,
                    state_tensor, action_index, q_target_tensor)

        # update V critic
        # Q values act as constants in both the V target and the actor loss,
        # so no gradient is tracked through the Q nets here.
        with torch.no_grad():
            q0_tensor = self.q0_net(state_tensor)
            q1_tensor = self.q1_net(state_tensor)
            q01_tensor = torch.min(q0_tensor, q1_tensor)
        prob_tensor = self.actor_net(state_tensor)
        # Clamp before the log so zero probabilities do not produce -inf.
        ln_prob_tensor = torch.log(prob_tensor.clamp(1e-6, 1.))
        entropic_q01_tensor = prob_tensor * (
            q01_tensor - self.alpha * ln_prob_tensor)
        v_target_tensor = torch.sum(entropic_q01_tensor, dim=-1, keepdim=True)
        v_pred_tensor = self.v_evaluate_net(state_tensor)
        v_loss_tensor = self.v_loss(v_pred_tensor, v_target_tensor.detach())
        self.v_optimizer.zero_grad()
        v_loss_tensor.backward()
        self.v_optimizer.step()

        self.update_net(self.v_target_net, self.v_evaluate_net)

        # update actor
        prob_q_tensor = prob_tensor * (self.alpha * ln_prob_tensor - q0_tensor)
        actor_loss_tensor = prob_q_tensor.sum(axis=-1).mean()
        self.actor_optimizer.zero_grad()
        actor_loss_tensor.backward()
        self.actor_optimizer.step()
# Instantiate the soft actor-critic agent for the environment created above;
# its network sizes are derived from env.observation_space and
# env.action_space.
agent = SACAgent(env)
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env``.

    The agent is stepped once per observation, including the final one, so
    it also sees the last reward and the termination flag.

    Args:
        env: environment whose ``reset(seed=...)`` returns
            ``(observation, info)`` and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        agent: object providing ``reset(mode=...)``, ``step(observation,
            reward, terminated)`` and ``close()``.
        seed: seed forwarded to ``env.reset``.
        mode: mode forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called after every agent step.

    Returns:
        Tuple ``(total reward of the episode, number of environment steps)``.
    """
    observation, _ = env.reset(seed=seed)
    agent.reset(mode=mode)

    reward = 0.
    terminated = truncated = False
    total_reward = 0.
    step_count = 0

    finished = False
    while not finished:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        finished = terminated or truncated
        if not finished:
            observation, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            step_count += 1

    agent.close()
    return total_reward, step_count
# Training: play seeded episodes in 'train' mode until the mean reward of the
# most recent (up to) 10 episodes exceeds 200.
logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    episode_reward, elapsed_steps = play_episode(
        env, agent, seed=episode, mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > 200:
        break
    episode += 1
plt.plot(episode_rewards)

# Evaluation: play 100 unseeded episodes without training and report the
# mean and standard deviation of the episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
             np.mean(episode_rewards), np.std(episode_rewards))
06:44:33 [INFO] ==== train ==== 06:44:33 [INFO] train episode 0: reward = -119.74, steps = 100 06:44:33 [INFO] train episode 1: reward = -53.41, steps = 63 06:44:33 [INFO] train episode 2: reward = -136.45, steps = 87 06:44:33 [INFO] train episode 3: reward = -326.87, steps = 70 06:44:33 [INFO] train episode 4: reward = -117.95, steps = 113 06:44:34 [INFO] train episode 5: reward = -431.55, steps = 99 06:44:37 [INFO] train episode 6: reward = -344.16, steps = 79 06:44:40 [INFO] train episode 7: reward = -124.00, steps = 73 06:44:43 [INFO] train episode 8: reward = -504.88, steps = 85 06:44:46 [INFO] train episode 9: reward = -453.93, steps = 79 06:44:48 [INFO] train episode 10: reward = -347.43, steps = 71 06:44:51 [INFO] train episode 11: reward = -244.51, steps = 73 06:44:54 [INFO] train episode 12: reward = -291.91, steps = 73 06:44:56 [INFO] train episode 13: reward = -398.81, steps = 73 06:44:59 [INFO] train episode 14: reward = -281.87, steps = 68 06:45:01 [INFO] train episode 15: reward = -262.03, steps = 53 06:45:03 [INFO] train episode 16: reward = -213.43, steps = 61 06:45:05 [INFO] train episode 17: reward = -308.40, steps = 56 06:45:08 [INFO] train episode 18: reward = -242.01, steps = 76 06:45:13 [INFO] train episode 19: reward = -353.46, steps = 113 06:45:15 [INFO] train episode 20: reward = -209.88, steps = 75 06:45:19 [INFO] train episode 21: reward = -308.92, steps = 98 06:45:22 [INFO] train episode 22: reward = 9.74, steps = 87 06:45:37 [INFO] train episode 23: reward = -520.48, steps = 412 06:45:46 [INFO] train episode 24: reward = -341.28, steps = 229 06:45:51 [INFO] train episode 25: reward = -59.83, steps = 153 06:45:55 [INFO] train episode 26: reward = -122.98, steps = 110 06:46:20 [INFO] train episode 27: reward = -646.65, steps = 668 06:46:55 [INFO] train episode 28: reward = -480.28, steps = 920 06:47:08 [INFO] train episode 29: reward = -272.43, steps = 357 06:47:15 [INFO] train episode 30: reward = -305.03, steps = 173 06:47:28 [INFO] 
train episode 31: reward = -199.98, steps = 368 06:47:40 [INFO] train episode 32: reward = -124.04, steps = 320 06:47:52 [INFO] train episode 33: reward = -118.74, steps = 317 06:48:04 [INFO] train episode 34: reward = -215.16, steps = 338 06:48:17 [INFO] train episode 35: reward = -232.50, steps = 356 06:48:43 [INFO] train episode 36: reward = -297.74, steps = 693 06:49:02 [INFO] train episode 37: reward = -262.43, steps = 496 06:49:15 [INFO] train episode 38: reward = -269.07, steps = 345 06:49:22 [INFO] train episode 39: reward = -190.48, steps = 191 06:49:31 [INFO] train episode 40: reward = -206.82, steps = 237 06:49:42 [INFO] train episode 41: reward = -196.88, steps = 315 06:49:50 [INFO] train episode 42: reward = -147.69, steps = 200 06:49:58 [INFO] train episode 43: reward = -210.92, steps = 239 06:50:06 [INFO] train episode 44: reward = -169.53, steps = 197 06:50:37 [INFO] train episode 45: reward = -320.28, steps = 840 06:50:48 [INFO] train episode 46: reward = -248.61, steps = 299 06:50:58 [INFO] train episode 47: reward = -239.37, steps = 265 06:51:04 [INFO] train episode 48: reward = -176.54, steps = 167 06:51:36 [INFO] train episode 49: reward = -311.47, steps = 820 06:51:45 [INFO] train episode 50: reward = -147.75, steps = 261 06:52:03 [INFO] train episode 51: reward = -241.43, steps = 471 06:52:10 [INFO] train episode 52: reward = -152.05, steps = 211 06:52:27 [INFO] train episode 53: reward = -226.51, steps = 439 06:52:34 [INFO] train episode 54: reward = -150.29, steps = 188 06:52:44 [INFO] train episode 55: reward = -191.25, steps = 277 06:52:54 [INFO] train episode 56: reward = -156.33, steps = 275 06:53:08 [INFO] train episode 57: reward = -204.69, steps = 385 06:53:19 [INFO] train episode 58: reward = -180.31, steps = 297 06:53:40 [INFO] train episode 59: reward = -190.51, steps = 571 06:53:54 [INFO] train episode 60: reward = -179.89, steps = 364 06:54:16 [INFO] train episode 61: reward = -224.54, steps = 595 06:54:41 [INFO] train episode 
62: reward = -255.25, steps = 645 06:55:18 [INFO] train episode 63: reward = -282.85, steps = 984 06:55:36 [INFO] train episode 64: reward = -171.43, steps = 464 06:55:51 [INFO] train episode 65: reward = -202.57, steps = 399 06:56:26 [INFO] train episode 66: reward = -262.52, steps = 910 06:56:36 [INFO] train episode 67: reward = -103.61, steps = 277 06:56:45 [INFO] train episode 68: reward = -117.21, steps = 246 06:57:23 [INFO] train episode 69: reward = -157.12, steps = 1000 06:57:53 [INFO] train episode 70: reward = -251.57, steps = 792 06:58:30 [INFO] train episode 71: reward = -176.52, steps = 1000 06:58:43 [INFO] train episode 72: reward = -154.51, steps = 316 06:59:21 [INFO] train episode 73: reward = -125.18, steps = 1000 06:59:59 [INFO] train episode 74: reward = -91.01, steps = 1000 07:00:39 [INFO] train episode 75: reward = -152.42, steps = 1000 07:01:17 [INFO] train episode 76: reward = -136.96, steps = 1000 07:01:57 [INFO] train episode 77: reward = -155.50, steps = 1000 07:02:35 [INFO] train episode 78: reward = -92.97, steps = 1000 07:03:13 [INFO] train episode 79: reward = -69.89, steps = 1000 07:03:49 [INFO] train episode 80: reward = -103.41, steps = 1000 07:04:26 [INFO] train episode 81: reward = -79.75, steps = 1000 07:05:03 [INFO] train episode 82: reward = -105.82, steps = 1000 07:05:42 [INFO] train episode 83: reward = -136.08, steps = 1000 07:06:23 [INFO] train episode 84: reward = -119.86, steps = 1000 07:07:04 [INFO] train episode 85: reward = -76.90, steps = 1000 07:07:45 [INFO] train episode 86: reward = -97.73, steps = 1000 07:08:25 [INFO] train episode 87: reward = -146.23, steps = 1000 07:09:04 [INFO] train episode 88: reward = -120.31, steps = 1000 07:09:45 [INFO] train episode 89: reward = -113.89, steps = 1000 07:10:26 [INFO] train episode 90: reward = -106.02, steps = 1000 07:11:07 [INFO] train episode 91: reward = -19.21, steps = 1000 07:11:49 [INFO] train episode 92: reward = -116.32, steps = 1000 07:12:29 [INFO] train episode 
93: reward = -57.89, steps = 1000 07:13:10 [INFO] train episode 94: reward = -92.17, steps = 1000 07:13:52 [INFO] train episode 95: reward = -122.81, steps = 1000 07:14:33 [INFO] train episode 96: reward = -144.50, steps = 1000 07:15:14 [INFO] train episode 97: reward = -87.08, steps = 1000 07:15:56 [INFO] train episode 98: reward = -125.38, steps = 1000 07:16:37 [INFO] train episode 99: reward = -64.51, steps = 1000 07:17:18 [INFO] train episode 100: reward = -111.63, steps = 1000 07:17:59 [INFO] train episode 101: reward = -52.33, steps = 1000 07:18:43 [INFO] train episode 102: reward = -126.03, steps = 1000 07:19:24 [INFO] train episode 103: reward = -125.11, steps = 1000 07:19:27 [INFO] train episode 104: reward = -205.00, steps = 76 07:20:09 [INFO] train episode 105: reward = -146.56, steps = 1000 07:20:52 [INFO] train episode 106: reward = -139.96, steps = 1000 07:21:32 [INFO] train episode 107: reward = -124.79, steps = 1000 07:22:12 [INFO] train episode 108: reward = -114.53, steps = 1000 07:22:52 [INFO] train episode 109: reward = -95.93, steps = 1000 07:23:35 [INFO] train episode 110: reward = -123.12, steps = 1000 07:24:18 [INFO] train episode 111: reward = -80.41, steps = 1000 07:25:00 [INFO] train episode 112: reward = -76.70, steps = 1000 07:25:44 [INFO] train episode 113: reward = -105.85, steps = 1000 07:26:29 [INFO] train episode 114: reward = -107.08, steps = 1000 07:27:12 [INFO] train episode 115: reward = -129.47, steps = 1000 07:27:57 [INFO] train episode 116: reward = -48.79, steps = 1000 07:28:43 [INFO] train episode 117: reward = -50.67, steps = 1000 07:29:29 [INFO] train episode 118: reward = -99.13, steps = 1000 07:30:15 [INFO] train episode 119: reward = -95.30, steps = 1000 07:31:00 [INFO] train episode 120: reward = -131.54, steps = 1000 07:31:22 [INFO] train episode 121: reward = 234.60, steps = 489 07:32:08 [INFO] train episode 122: reward = -134.29, steps = 1000 07:32:53 [INFO] train episode 123: reward = -63.59, steps = 1000 
07:33:26 [INFO] train episode 124: reward = -80.91, steps = 784 07:34:11 [INFO] train episode 125: reward = -115.63, steps = 1000 07:34:54 [INFO] train episode 126: reward = -82.44, steps = 1000 07:35:40 [INFO] train episode 127: reward = -128.58, steps = 1000 07:36:24 [INFO] train episode 128: reward = -67.69, steps = 1000 07:37:09 [INFO] train episode 129: reward = -136.20, steps = 1000 07:37:54 [INFO] train episode 130: reward = -103.51, steps = 1000 07:38:40 [INFO] train episode 131: reward = -146.59, steps = 1000 07:39:25 [INFO] train episode 132: reward = -69.08, steps = 1000 07:40:10 [INFO] train episode 133: reward = -66.15, steps = 1000 07:40:56 [INFO] train episode 134: reward = -125.93, steps = 1000 07:41:43 [INFO] train episode 135: reward = -100.82, steps = 1000 07:42:31 [INFO] train episode 136: reward = -136.78, steps = 1000 07:42:40 [INFO] train episode 137: reward = 1.99, steps = 202 07:42:44 [INFO] train episode 138: reward = -411.09, steps = 92 07:43:29 [INFO] train episode 139: reward = -84.54, steps = 1000 07:44:13 [INFO] train episode 140: reward = -95.32, steps = 1000 07:45:00 [INFO] train episode 141: reward = -64.36, steps = 1000 07:45:46 [INFO] train episode 142: reward = -46.74, steps = 1000 07:46:32 [INFO] train episode 143: reward = -111.39, steps = 1000 07:47:18 [INFO] train episode 144: reward = -133.64, steps = 1000 07:48:05 [INFO] train episode 145: reward = -173.13, steps = 1000 07:48:59 [INFO] train episode 146: reward = -139.49, steps = 1000 07:49:54 [INFO] train episode 147: reward = -110.06, steps = 1000 07:50:42 [INFO] train episode 148: reward = -120.51, steps = 1000 07:51:03 [INFO] train episode 149: reward = -194.49, steps = 449 07:51:50 [INFO] train episode 150: reward = -91.71, steps = 1000 07:52:38 [INFO] train episode 151: reward = -109.07, steps = 1000 07:53:25 [INFO] train episode 152: reward = -120.29, steps = 1000 07:54:12 [INFO] train episode 153: reward = -77.45, steps = 1000 07:55:00 [INFO] train episode 154: 
reward = -106.42, steps = 1000 07:55:49 [INFO] train episode 155: reward = -135.87, steps = 1000 07:56:37 [INFO] train episode 156: reward = -87.26, steps = 1000 07:57:26 [INFO] train episode 157: reward = -37.32, steps = 1000 07:58:12 [INFO] train episode 158: reward = -108.85, steps = 1000 07:59:00 [INFO] train episode 159: reward = -92.93, steps = 1000 07:59:48 [INFO] train episode 160: reward = -51.78, steps = 1000 08:00:36 [INFO] train episode 161: reward = -94.08, steps = 1000 08:01:29 [INFO] train episode 162: reward = -65.62, steps = 1000 08:02:24 [INFO] train episode 163: reward = -96.11, steps = 1000 08:03:19 [INFO] train episode 164: reward = -72.80, steps = 1000 08:04:07 [INFO] train episode 165: reward = -90.31, steps = 1000 08:04:58 [INFO] train episode 166: reward = -103.97, steps = 1000 08:05:46 [INFO] train episode 167: reward = -110.74, steps = 1000 08:06:34 [INFO] train episode 168: reward = -87.47, steps = 1000 08:07:22 [INFO] train episode 169: reward = -110.01, steps = 1000 08:08:10 [INFO] train episode 170: reward = -72.90, steps = 1000 08:08:57 [INFO] train episode 171: reward = -102.21, steps = 1000 08:09:43 [INFO] train episode 172: reward = -78.22, steps = 1000 08:10:30 [INFO] train episode 173: reward = -134.52, steps = 1000 08:11:17 [INFO] train episode 174: reward = -112.33, steps = 1000 08:12:06 [INFO] train episode 175: reward = -124.90, steps = 1000 08:12:54 [INFO] train episode 176: reward = -104.53, steps = 1000 08:13:41 [INFO] train episode 177: reward = -104.41, steps = 1000 08:13:46 [INFO] train episode 178: reward = -120.70, steps = 117 08:14:34 [INFO] train episode 179: reward = -116.06, steps = 1000 08:15:22 [INFO] train episode 180: reward = -109.05, steps = 1000 08:16:13 [INFO] train episode 181: reward = -138.38, steps = 1000 08:16:57 [INFO] train episode 182: reward = -180.98, steps = 872 08:17:43 [INFO] train episode 183: reward = -121.58, steps = 1000 08:18:31 [INFO] train episode 184: reward = -92.83, steps = 1000 
08:19:18 [INFO] train episode 185: reward = -114.65, steps = 1000 09:45:10 [INFO] train episode 186: reward = -91.63, steps = 1000 09:46:00 [INFO] train episode 187: reward = -51.81, steps = 1000 09:46:51 [INFO] train episode 188: reward = -111.85, steps = 1000 09:47:41 [INFO] train episode 189: reward = -86.58, steps = 1000 09:48:33 [INFO] train episode 190: reward = -103.56, steps = 1000 09:49:23 [INFO] train episode 191: reward = -59.68, steps = 1000 09:50:17 [INFO] train episode 192: reward = -103.80, steps = 1000 09:51:12 [INFO] train episode 193: reward = -87.74, steps = 1000 09:52:01 [INFO] train episode 194: reward = -102.89, steps = 1000 09:52:50 [INFO] train episode 195: reward = -71.90, steps = 1000 09:53:40 [INFO] train episode 196: reward = -100.19, steps = 1000 09:54:33 [INFO] train episode 197: reward = -111.32, steps = 1000 09:55:27 [INFO] train episode 198: reward = -130.02, steps = 1000 09:56:19 [INFO] train episode 199: reward = -102.46, steps = 1000 09:57:10 [INFO] train episode 200: reward = -94.77, steps = 1000 09:58:00 [INFO] train episode 201: reward = -95.71, steps = 1000 09:58:50 [INFO] train episode 202: reward = -130.00, steps = 1000 09:59:42 [INFO] train episode 203: reward = -90.24, steps = 1000 10:00:35 [INFO] train episode 204: reward = -56.69, steps = 1000 10:01:24 [INFO] train episode 205: reward = -127.45, steps = 1000 10:02:13 [INFO] train episode 206: reward = -159.15, steps = 1000 10:03:05 [INFO] train episode 207: reward = -89.27, steps = 1000 10:03:57 [INFO] train episode 208: reward = -143.35, steps = 1000 10:04:02 [INFO] train episode 209: reward = -152.49, steps = 107 10:04:15 [INFO] train episode 210: reward = -218.52, steps = 281 10:04:23 [INFO] train episode 211: reward = -132.44, steps = 148 10:05:13 [INFO] train episode 212: reward = -112.33, steps = 1000 10:05:21 [INFO] train episode 213: reward = -190.35, steps = 173 10:05:34 [INFO] train episode 214: reward = -192.30, steps = 250 10:05:43 [INFO] train episode 215: 
reward = -372.53, steps = 185 10:05:51 [INFO] train episode 216: reward = -448.11, steps = 173 10:05:59 [INFO] train episode 217: reward = -514.03, steps = 155 10:06:04 [INFO] train episode 218: reward = -331.05, steps = 102 10:06:13 [INFO] train episode 219: reward = -355.23, steps = 180 10:06:21 [INFO] train episode 220: reward = -315.29, steps = 164 10:06:27 [INFO] train episode 221: reward = -334.99, steps = 110 10:06:32 [INFO] train episode 222: reward = -338.17, steps = 95 10:06:39 [INFO] train episode 223: reward = -239.06, steps = 134 10:07:33 [INFO] train episode 224: reward = -235.71, steps = 1000 10:07:47 [INFO] train episode 225: reward = -221.32, steps = 285 10:07:55 [INFO] train episode 226: reward = -21.88, steps = 152 10:08:01 [INFO] train episode 227: reward = -191.53, steps = 137 10:08:08 [INFO] train episode 228: reward = -287.15, steps = 146 10:08:12 [INFO] train episode 229: reward = -140.63, steps = 77 10:08:16 [INFO] train episode 230: reward = -123.00, steps = 69 10:08:22 [INFO] train episode 231: reward = -153.94, steps = 135 10:08:35 [INFO] train episode 232: reward = -123.91, steps = 257 10:08:38 [INFO] train episode 233: reward = -85.80, steps = 65 10:08:43 [INFO] train episode 234: reward = -52.43, steps = 95 10:08:51 [INFO] train episode 235: reward = -237.58, steps = 111 10:09:23 [INFO] train episode 236: reward = -77.69, steps = 620 10:09:26 [INFO] train episode 237: reward = -143.91, steps = 59 10:09:52 [INFO] train episode 238: reward = -178.96, steps = 569 10:09:54 [INFO] train episode 239: reward = -57.07, steps = 65 10:10:21 [INFO] train episode 240: reward = 226.20, steps = 582 10:10:26 [INFO] train episode 241: reward = -263.10, steps = 109 10:10:44 [INFO] train episode 242: reward = -118.00, steps = 407 10:10:49 [INFO] train episode 243: reward = -145.33, steps = 111 10:11:19 [INFO] train episode 244: reward = -276.73, steps = 660 10:11:22 [INFO] train episode 245: reward = -78.30, steps = 75 10:11:27 [INFO] train episode 
246: reward = -175.75, steps = 113 10:11:46 [INFO] train episode 247: reward = -202.39, steps = 429 10:11:56 [INFO] train episode 248: reward = -87.05, steps = 212 10:11:59 [INFO] train episode 249: reward = -105.88, steps = 76 10:12:03 [INFO] train episode 250: reward = 24.85, steps = 99 10:12:06 [INFO] train episode 251: reward = -31.90, steps = 69 10:12:10 [INFO] train episode 252: reward = -98.10, steps = 88 10:12:13 [INFO] train episode 253: reward = -65.26, steps = 68 10:12:23 [INFO] train episode 254: reward = -47.99, steps = 214 10:12:26 [INFO] train episode 255: reward = -66.61, steps = 78 10:12:30 [INFO] train episode 256: reward = -55.35, steps = 96 10:12:43 [INFO] train episode 257: reward = -108.98, steps = 287 10:12:53 [INFO] train episode 258: reward = -243.54, steps = 228 10:12:58 [INFO] train episode 259: reward = -61.50, steps = 113 10:13:04 [INFO] train episode 260: reward = 6.30, steps = 146 10:13:07 [INFO] train episode 261: reward = -63.49, steps = 69 10:13:12 [INFO] train episode 262: reward = -27.91, steps = 110 10:13:16 [INFO] train episode 263: reward = -28.44, steps = 86 10:13:21 [INFO] train episode 264: reward = -73.21, steps = 125 10:13:33 [INFO] train episode 265: reward = -90.64, steps = 267 10:13:37 [INFO] train episode 266: reward = -71.93, steps = 106 10:13:41 [INFO] train episode 267: reward = -80.66, steps = 83 10:13:46 [INFO] train episode 268: reward = -254.02, steps = 119 10:13:51 [INFO] train episode 269: reward = -175.93, steps = 106 10:13:55 [INFO] train episode 270: reward = -31.64, steps = 101 10:14:00 [INFO] train episode 271: reward = 39.31, steps = 122 10:14:04 [INFO] train episode 272: reward = -96.64, steps = 87 10:14:08 [INFO] train episode 273: reward = -126.36, steps = 98 10:14:11 [INFO] train episode 274: reward = -93.53, steps = 58 10:14:16 [INFO] train episode 275: reward = -29.92, steps = 123 10:14:21 [INFO] train episode 276: reward = -195.91, steps = 105 10:14:25 [INFO] train episode 277: reward = -59.00, 
steps = 94 10:14:28 [INFO] train episode 278: reward = -69.29, steps = 60 10:14:33 [INFO] train episode 279: reward = -79.33, steps = 115 10:14:39 [INFO] train episode 280: reward = -46.21, steps = 131 10:14:42 [INFO] train episode 281: reward = -41.76, steps = 78 10:14:49 [INFO] train episode 282: reward = -81.62, steps = 167 10:14:56 [INFO] train episode 283: reward = -61.45, steps = 136 10:15:00 [INFO] train episode 284: reward = -83.56, steps = 114 10:15:08 [INFO] train episode 285: reward = -45.95, steps = 182 10:15:13 [INFO] train episode 286: reward = -231.50, steps = 100 10:15:19 [INFO] train episode 287: reward = 2.56, steps = 142 10:15:23 [INFO] train episode 288: reward = -164.72, steps = 91 10:15:28 [INFO] train episode 289: reward = -269.13, steps = 110 10:15:33 [INFO] train episode 290: reward = -187.57, steps = 92 10:15:37 [INFO] train episode 291: reward = -90.53, steps = 90 10:15:44 [INFO] train episode 292: reward = -29.03, steps = 156 10:15:50 [INFO] train episode 293: reward = -79.58, steps = 148 10:15:56 [INFO] train episode 294: reward = -113.31, steps = 134 10:16:01 [INFO] train episode 295: reward = -282.00, steps = 113 10:16:10 [INFO] train episode 296: reward = -286.56, steps = 206 10:16:14 [INFO] train episode 297: reward = -117.19, steps = 98 10:16:20 [INFO] train episode 298: reward = -188.22, steps = 129 10:16:24 [INFO] train episode 299: reward = -178.26, steps = 106 10:16:32 [INFO] train episode 300: reward = -116.22, steps = 167 10:16:36 [INFO] train episode 301: reward = -125.04, steps = 103 10:16:41 [INFO] train episode 302: reward = -100.87, steps = 107 10:16:46 [INFO] train episode 303: reward = -219.16, steps = 108 10:16:53 [INFO] train episode 304: reward = -172.78, steps = 164 10:16:57 [INFO] train episode 305: reward = -224.83, steps = 105 10:17:03 [INFO] train episode 306: reward = 37.84, steps = 120 10:17:12 [INFO] train episode 307: reward = -197.55, steps = 225 10:17:20 [INFO] train episode 308: reward = -244.53, steps = 
167 10:17:24 [INFO] train episode 309: reward = 1.84, steps = 98 10:17:28 [INFO] train episode 310: reward = -38.25, steps = 92 10:18:02 [INFO] train episode 311: reward = 99.52, steps = 764 10:18:09 [INFO] train episode 312: reward = -317.85, steps = 157 10:18:15 [INFO] train episode 313: reward = -43.25, steps = 151 10:18:20 [INFO] train episode 314: reward = -31.94, steps = 105 10:18:27 [INFO] train episode 315: reward = -318.80, steps = 161 10:18:35 [INFO] train episode 316: reward = -293.41, steps = 178 10:18:40 [INFO] train episode 317: reward = 19.58, steps = 114 10:18:46 [INFO] train episode 318: reward = -267.28, steps = 152 10:18:53 [INFO] train episode 319: reward = -305.29, steps = 161 10:18:59 [INFO] train episode 320: reward = -165.88, steps = 137 10:19:08 [INFO] train episode 321: reward = -142.77, steps = 203 10:19:24 [INFO] train episode 322: reward = -253.69, steps = 359 10:19:35 [INFO] train episode 323: reward = -215.88, steps = 256 10:19:41 [INFO] train episode 324: reward = -53.77, steps = 139 10:20:26 [INFO] train episode 325: reward = 43.81, steps = 1000 10:20:31 [INFO] train episode 326: reward = 18.88, steps = 126 10:20:47 [INFO] train episode 327: reward = -233.37, steps = 362 10:21:03 [INFO] train episode 328: reward = -96.78, steps = 340 10:21:15 [INFO] train episode 329: reward = -173.47, steps = 268 10:21:34 [INFO] train episode 330: reward = -55.85, steps = 442 10:21:46 [INFO] train episode 331: reward = -61.28, steps = 272 10:22:05 [INFO] train episode 332: reward = -154.27, steps = 408 10:22:22 [INFO] train episode 333: reward = -54.09, steps = 370 10:22:31 [INFO] train episode 334: reward = -12.82, steps = 178 10:22:38 [INFO] train episode 335: reward = -37.68, steps = 162 10:23:24 [INFO] train episode 336: reward = 27.19, steps = 1000 10:23:36 [INFO] train episode 337: reward = -49.52, steps = 266 10:24:19 [INFO] train episode 338: reward = 91.25, steps = 937 10:25:05 [INFO] train episode 339: reward = -29.73, steps = 1000 
10:25:50 [INFO] train episode 340: reward = -5.60, steps = 1000 10:26:37 [INFO] train episode 341: reward = -36.03, steps = 1000 10:27:29 [INFO] train episode 342: reward = -87.34, steps = 1000 10:28:18 [INFO] train episode 343: reward = -40.44, steps = 1000 10:28:26 [INFO] train episode 344: reward = -178.64, steps = 180 10:29:11 [INFO] train episode 345: reward = 1.81, steps = 1000 10:29:56 [INFO] train episode 346: reward = 8.41, steps = 1000 10:30:42 [INFO] train episode 347: reward = -37.85, steps = 1000 10:31:30 [INFO] train episode 348: reward = -59.51, steps = 1000 10:32:17 [INFO] train episode 349: reward = -48.58, steps = 1000 10:33:07 [INFO] train episode 350: reward = 19.01, steps = 1000 10:34:03 [INFO] train episode 351: reward = 20.37, steps = 1000 10:35:00 [INFO] train episode 352: reward = -36.50, steps = 1000 10:35:54 [INFO] train episode 353: reward = 6.67, steps = 1000 10:36:49 [INFO] train episode 354: reward = 15.42, steps = 1000 10:37:46 [INFO] train episode 355: reward = -3.12, steps = 1000 10:38:42 [INFO] train episode 356: reward = 12.33, steps = 1000 10:39:41 [INFO] train episode 357: reward = -276.81, steps = 999 10:40:36 [INFO] train episode 358: reward = -29.50, steps = 1000 10:41:34 [INFO] train episode 359: reward = -6.15, steps = 1000 10:42:29 [INFO] train episode 360: reward = 11.12, steps = 1000 10:43:26 [INFO] train episode 361: reward = -46.64, steps = 1000 10:44:21 [INFO] train episode 362: reward = 20.21, steps = 1000 10:45:17 [INFO] train episode 363: reward = -21.80, steps = 1000 10:46:07 [INFO] train episode 364: reward = 4.08, steps = 1000 10:46:58 [INFO] train episode 365: reward = 25.01, steps = 1000 10:47:51 [INFO] train episode 366: reward = -11.14, steps = 1000 10:48:43 [INFO] train episode 367: reward = -5.90, steps = 1000 10:49:34 [INFO] train episode 368: reward = 152.86, steps = 942 10:50:27 [INFO] train episode 369: reward = 98.29, steps = 982 10:51:17 [INFO] train episode 370: reward = 6.46, steps = 1000 10:52:10 
[INFO] train episode 371: reward = -4.99, steps = 1000 10:53:02 [INFO] train episode 372: reward = 59.76, steps = 1000 10:53:55 [INFO] train episode 373: reward = 26.03, steps = 1000 10:54:49 [INFO] train episode 374: reward = -128.53, steps = 1000 10:55:42 [INFO] train episode 375: reward = 38.01, steps = 1000 10:56:35 [INFO] train episode 376: reward = 13.31, steps = 1000 10:57:25 [INFO] train episode 377: reward = 41.84, steps = 1000 10:58:05 [INFO] train episode 378: reward = 162.93, steps = 769 10:58:50 [INFO] train episode 379: reward = 115.39, steps = 884 10:59:29 [INFO] train episode 380: reward = 170.95, steps = 732 11:00:13 [INFO] train episode 381: reward = 127.19, steps = 843 11:00:31 [INFO] train episode 382: reward = 293.36, steps = 344 11:01:09 [INFO] train episode 383: reward = 176.53, steps = 734 11:01:47 [INFO] train episode 384: reward = 172.70, steps = 766 11:02:17 [INFO] train episode 385: reward = 202.70, steps = 606 11:02:45 [INFO] train episode 386: reward = 216.61, steps = 529 11:03:13 [INFO] train episode 387: reward = 198.19, steps = 549 11:04:02 [INFO] train episode 388: reward = 145.91, steps = 928 11:04:34 [INFO] train episode 389: reward = 181.54, steps = 575 11:05:01 [INFO] train episode 390: reward = 241.98, steps = 487 11:05:25 [INFO] train episode 391: reward = 248.55, steps = 414 11:05:25 [INFO] ==== test ==== 11:05:26 [INFO] test episode 0: reward = 277.28, steps = 418 11:05:27 [INFO] test episode 1: reward = 233.27, steps = 401 11:05:28 [INFO] test episode 2: reward = 228.74, steps = 514 11:05:29 [INFO] test episode 3: reward = 225.76, steps = 503 11:05:30 [INFO] test episode 4: reward = 249.42, steps = 523 11:05:31 [INFO] test episode 5: reward = 221.33, steps = 464 11:05:32 [INFO] test episode 6: reward = 213.58, steps = 434 11:05:33 [INFO] test episode 7: reward = 259.34, steps = 451 11:05:34 [INFO] test episode 8: reward = 215.36, steps = 584 11:05:35 [INFO] test episode 9: reward = 234.53, steps = 401 11:05:36 [INFO] test 
episode 10: reward = 218.23, steps = 669 11:05:37 [INFO] test episode 11: reward = 263.28, steps = 464 11:05:38 [INFO] test episode 12: reward = 206.59, steps = 458 11:05:39 [INFO] test episode 13: reward = 263.12, steps = 371 11:05:40 [INFO] test episode 14: reward = 234.53, steps = 449 11:05:40 [INFO] test episode 15: reward = 254.30, steps = 375 11:05:41 [INFO] test episode 16: reward = 224.86, steps = 513 11:05:42 [INFO] test episode 17: reward = 214.34, steps = 489 11:05:43 [INFO] test episode 18: reward = 234.87, steps = 418 11:05:44 [INFO] test episode 19: reward = 208.02, steps = 429 11:05:45 [INFO] test episode 20: reward = 273.05, steps = 384 11:05:46 [INFO] test episode 21: reward = 209.81, steps = 502 11:05:47 [INFO] test episode 22: reward = 221.46, steps = 690 11:05:48 [INFO] test episode 23: reward = 211.70, steps = 624 11:05:49 [INFO] test episode 24: reward = 221.82, steps = 379 11:05:50 [INFO] test episode 25: reward = 211.89, steps = 598 11:05:52 [INFO] test episode 26: reward = 214.95, steps = 489 11:05:52 [INFO] test episode 27: reward = 255.48, steps = 374 11:05:54 [INFO] test episode 28: reward = 222.00, steps = 506 11:05:54 [INFO] test episode 29: reward = 234.99, steps = 420 11:05:55 [INFO] test episode 30: reward = 270.02, steps = 358 11:05:56 [INFO] test episode 31: reward = 274.00, steps = 369 11:05:56 [INFO] test episode 32: reward = 197.17, steps = 421 11:05:57 [INFO] test episode 33: reward = 265.64, steps = 493 11:05:58 [INFO] test episode 34: reward = 242.44, steps = 423 11:06:00 [INFO] test episode 35: reward = 218.41, steps = 621 11:06:01 [INFO] test episode 36: reward = 193.90, steps = 437 11:06:01 [INFO] test episode 37: reward = 242.15, steps = 433 11:06:02 [INFO] test episode 38: reward = 253.12, steps = 418 11:06:04 [INFO] test episode 39: reward = 219.34, steps = 561 11:06:05 [INFO] test episode 40: reward = 248.94, steps = 462 11:06:05 [INFO] test episode 41: reward = 254.59, steps = 400 11:06:06 [INFO] test episode 42: 
reward = 204.38, steps = 464 11:06:07 [INFO] test episode 43: reward = 227.20, steps = 411 11:06:08 [INFO] test episode 44: reward = 246.14, steps = 426 11:06:09 [INFO] test episode 45: reward = 240.90, steps = 406 11:06:10 [INFO] test episode 46: reward = 221.85, steps = 638 11:06:11 [INFO] test episode 47: reward = 223.06, steps = 426 11:06:12 [INFO] test episode 48: reward = 266.92, steps = 477 11:06:13 [INFO] test episode 49: reward = 241.36, steps = 537 11:06:14 [INFO] test episode 50: reward = 213.25, steps = 392 11:06:15 [INFO] test episode 51: reward = 238.72, steps = 432 11:06:15 [INFO] test episode 52: reward = 270.38, steps = 400 11:06:16 [INFO] test episode 53: reward = 231.75, steps = 385 11:06:17 [INFO] test episode 54: reward = 247.63, steps = 424 11:06:18 [INFO] test episode 55: reward = 231.26, steps = 412 11:06:19 [INFO] test episode 56: reward = 202.18, steps = 403 11:06:19 [INFO] test episode 57: reward = 212.22, steps = 447 11:06:20 [INFO] test episode 58: reward = 261.40, steps = 449 11:06:21 [INFO] test episode 59: reward = 224.69, steps = 450 11:06:22 [INFO] test episode 60: reward = 264.63, steps = 427 11:06:23 [INFO] test episode 61: reward = 231.54, steps = 480 11:06:24 [INFO] test episode 62: reward = 237.32, steps = 425 11:06:25 [INFO] test episode 63: reward = 229.52, steps = 435 11:06:25 [INFO] test episode 64: reward = 257.12, steps = 421 11:06:26 [INFO] test episode 65: reward = 228.47, steps = 444 11:06:27 [INFO] test episode 66: reward = 194.83, steps = 432 11:06:28 [INFO] test episode 67: reward = 210.60, steps = 504 11:06:30 [INFO] test episode 68: reward = 216.76, steps = 672 11:06:31 [INFO] test episode 69: reward = 258.72, steps = 449 11:06:32 [INFO] test episode 70: reward = 232.74, steps = 385 11:06:33 [INFO] test episode 71: reward = 258.31, steps = 448 11:06:34 [INFO] test episode 72: reward = 250.08, steps = 492 11:06:35 [INFO] test episode 73: reward = 207.09, steps = 453 11:06:36 [INFO] test episode 74: reward = 
239.58, steps = 494 11:06:36 [INFO] test episode 75: reward = 283.37, steps = 398 11:06:37 [INFO] test episode 76: reward = 273.23, steps = 406 11:06:38 [INFO] test episode 77: reward = 199.14, steps = 437 11:06:39 [INFO] test episode 78: reward = 206.96, steps = 453 11:06:40 [INFO] test episode 79: reward = 232.77, steps = 527 11:06:41 [INFO] test episode 80: reward = 262.82, steps = 414 11:06:42 [INFO] test episode 81: reward = 238.25, steps = 417 11:06:42 [INFO] test episode 82: reward = 213.53, steps = 486 11:06:43 [INFO] test episode 83: reward = 276.50, steps = 429 11:06:44 [INFO] test episode 84: reward = 221.97, steps = 602 11:06:46 [INFO] test episode 85: reward = 210.29, steps = 722 11:06:46 [INFO] test episode 86: reward = 248.47, steps = 399 11:06:48 [INFO] test episode 87: reward = 218.88, steps = 714 11:06:49 [INFO] test episode 88: reward = 197.18, steps = 432 11:06:50 [INFO] test episode 89: reward = 229.81, steps = 405 11:06:51 [INFO] test episode 90: reward = 228.87, steps = 470 11:06:52 [INFO] test episode 91: reward = 243.35, steps = 437 11:06:53 [INFO] test episode 92: reward = 227.23, steps = 441 11:06:54 [INFO] test episode 93: reward = 252.32, steps = 409 11:06:55 [INFO] test episode 94: reward = 251.28, steps = 409 11:06:56 [INFO] test episode 95: reward = 210.83, steps = 484 11:06:57 [INFO] test episode 96: reward = 222.87, steps = 490 11:06:58 [INFO] test episode 97: reward = 209.81, steps = 603 11:06:59 [INFO] test episode 98: reward = 277.47, steps = 399 11:07:00 [INFO] test episode 99: reward = 217.30, steps = 448 11:07:00 [INFO] average episode reward = 234.15 ± 22.26
# Close the Gym environment (`env`, created with gym.make near the top of the
# file) now that the training and test runs logged above have finished.
env.close()