Natural Policy Gradient (NPG) agent on Acrobot-v1 — PyTorch version
%matplotlib inline
import sys
import logging
import itertools
import numpy as np
np.random.seed(0)
import pandas as pd
import scipy.signal as signal
import gym
import matplotlib.pyplot as plt
import torch
torch.manual_seed(0)
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.distributions as distributions
# Send INFO-level log records to stdout with a short time-of-day prefix.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
)

# Create the environment, then log every instance attribute of the
# environment object and of its spec.
env = gym.make('Acrobot-v1')
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
11:46:32 [INFO] env: <AcrobotEnv<Acrobot-v1>> 11:46:32 [INFO] action_space: Discrete(3) 11:46:32 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32) 11:46:32 [INFO] reward_range: (-inf, inf) 11:46:32 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15} 11:46:32 [INFO] _max_episode_steps: 500 11:46:32 [INFO] _elapsed_steps: None 11:46:32 [INFO] id: Acrobot-v1 11:46:32 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv 11:46:32 [INFO] reward_threshold: -100.0 11:46:32 [INFO] nondeterministic: False 11:46:32 [INFO] max_episode_steps: 500 11:46:32 [INFO] _kwargs: {} 11:46:32 [INFO] _env_name: Acrobot
class PPOReplayer:
    """Experience store backed by a pandas DataFrame.

    Each row is one time step with the columns listed in ``self.fields``
    ('state', 'action', 'prob', 'advantage', 'return').  Rows are always
    re-labelled 0..n-1, which ``sample`` relies on for label-based lookup.
    """

    def __init__(self):
        self.fields = ['state', 'action', 'prob', 'advantage', 'return']
        self.memory = pd.DataFrame(columns=self.fields)

    def store(self, df):
        """Append the ``self.fields`` columns of ``df`` to the memory.

        Args:
            df: DataFrame containing at least the columns in ``self.fields``;
                extra columns are ignored.

        Raises:
            KeyError: if ``df`` lacks one of the required columns.
        """
        batch = df[self.fields]
        if len(batch) == 0:
            return  # nothing to add
        if self.memory.empty:
            # Concatenating with an empty frame is deprecated in recent pandas
            # (FutureWarning; the empty frame will start to influence result
            # dtypes), so adopt the first batch directly.  reset_index returns
            # a new frame labelled 0..n-1, matching ignore_index=True below.
            self.memory = batch.reset_index(drop=True)
        else:
            self.memory = pd.concat([self.memory, batch], ignore_index=True)

    def sample(self, size):
        """Draw ``size`` rows uniformly at random, with replacement.

        Args:
            size: number of rows to draw.

        Returns:
            A generator yielding one stacked numpy array per field, in the
            order of ``self.fields``.

        Raises:
            ValueError: from ``np.random.choice`` if the memory is empty and
                ``size`` is non-zero.
        """
        indices = np.random.choice(self.memory.shape[0], size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.fields)
def conjugate_gradient(f, b, iter_count=10, epsilon=1e-12, tol=1e-6):
    """Approximately solve ``f(x) = b`` by the conjugate gradient method.

    Args:
        f: callable mapping a 1-D tensor to a 1-D tensor of the same size
            (the linear operator applied to a vector).
        b: 1-D right-hand-side tensor; it is not modified.
        iter_count: maximum number of iterations.
        epsilon: small constant added to the step-size denominator.
        tol: iteration stops once the squared residual norm drops below it.

    Returns:
        Tuple ``(x, f(x))`` with the approximate solution and the operator
        evaluated at that solution.
    """
    solution = b.mul(0.)        # start from the zero vector
    residual = b.clone()        # b - f(0) == b
    direction = b.clone()
    res_sq = residual.dot(residual)
    for _ in range(iter_count):
        f_dir = f(direction)
        step = res_sq / (direction.dot(f_dir) + epsilon)
        solution.add_(step * direction)
        residual.sub_(step * f_dir)
        next_res_sq = residual.dot(residual)
        direction = residual + (next_res_sq / res_sq) * direction
        res_sq = next_res_sq
        if res_sq < tol:
            break
    return solution, f(solution)
class NPGAgent:
    """Natural policy gradient (NPG) agent for a discrete-action environment.

    The actor is a softmax policy network updated along the natural gradient
    (Fisher-vector products obtained from the Hessian of a KL term, solved
    with ``conjugate_gradient``); the critic is a state-value network fitted
    to discounted returns with Adam.

    Usage per episode: ``reset(mode)``, repeated ``step(...)``, ``close()``.
    """

    def __init__(self, env):
        """Build actor/critic networks sized from ``env``'s spaces.

        Args:
            env: object exposing ``observation_space.shape`` and
                ``action_space.n``.
        """
        self.gamma = 0.99  # discount factor
        self.mode = None   # set by reset(); defined here so step() is safe

        self.replayer = PPOReplayer()
        self.trajectory = []

        # Actor is created before the critic so that parameter initialisation
        # consumes the torch RNG in the same order as before.
        self.actor_net = self.build_net(
            input_size=env.observation_space.shape[0],
            hidden_sizes=[100,],
            output_size=env.action_space.n, output_activator=nn.Softmax(1))
        self.max_kl = 0.001  # KL budget used to scale the natural-gradient step

        self.critic_net = self.build_net(
            input_size=env.observation_space.shape[0],
            hidden_sizes=[100,])
        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), 0.002)
        self.critic_loss = nn.MSELoss()

    def build_net(self, input_size, hidden_sizes, output_size=1,
            output_activator=None):
        """Build a fully connected network with ReLU between linear layers.

        Args:
            input_size: number of input features.
            hidden_sizes: sequence of hidden layer widths.
            output_size: number of output features.
            output_activator: optional module appended after the last linear
                layer.

        Returns:
            An ``nn.Sequential`` model.
        """
        # Distinct loop names: the original loop rebound the parameters
        # ``input_size`` / ``output_size`` while iterating.
        sizes = [input_size, *hidden_sizes, output_size]
        layers = []
        for in_features, out_features in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # no ReLU after the output layer
        if output_activator is not None:
            layers.append(output_activator)
        return nn.Sequential(*layers)

    def reset(self, mode=None):
        """Start a new episode; in 'train' mode also clear the trajectory."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        """Sample an action from the current policy.

        Args:
            observation: current observation (array-like of floats).
            reward: reward passed in by the caller alongside ``observation``.
            terminated: termination flag passed in by the caller.

        Returns:
            The sampled action index.
        """
        state_tensor = torch.as_tensor(observation,
                                       dtype=torch.float).unsqueeze(0)
        with torch.no_grad():  # inference only; no autograd graph needed
            prob_tensor = self.actor_net(state_tensor)
        action_tensor = distributions.Categorical(prob_tensor).sample()
        action = action_tensor.numpy()[0]
        if self.mode == 'train':
            # Stored flat; reshaped to rows of 4 in save_trajectory_to_replayer.
            self.trajectory += [observation, reward, terminated, action]
        return action

    def close(self):
        """Finish the episode; in 'train' mode store it and maybe learn."""
        if self.mode == 'train':
            self.save_trajectory_to_replayer()
            if len(self.replayer.memory) >= 1000:
                for _ in range(5):  # learn multiple times
                    self.learn()
                # reset replayer after the agent changes itself
                self.replayer = PPOReplayer()

    def save_trajectory_to_replayer(self):
        """Compute advantages/returns for the trajectory and store them."""
        df = pd.DataFrame(
            np.array(self.trajectory, dtype=object).reshape(-1, 4),
            columns=['state', 'reward', 'terminated', 'action'])
        state_tensor = torch.as_tensor(np.stack(df['state']),
                                       dtype=torch.float)
        action_tensor = torch.as_tensor(df['action'], dtype=torch.long)
        v_tensor = self.critic_net(state_tensor)
        df['v'] = v_tensor.detach().numpy()
        prob_tensor = self.actor_net(state_tensor)
        # probability the current policy assigned to each action taken
        pi_tensor = prob_tensor.gather(
            -1, action_tensor.unsqueeze(1)).squeeze(1)
        df['prob'] = pi_tensor.detach().numpy()
        # NOTE(review): row t holds the reward handed to step() together with
        # observation t.  If that is the reward received on *arriving* at
        # state t, the target below pairs it with v(s_{t+1}) one step off --
        # confirm the intended alignment before changing.
        df['next_v'] = df['v'].shift(-1).fillna(0.)  # 0 after the last row
        df['u'] = df['reward'] + self.gamma * df['next_v']
        df['delta'] = df['u'] - df['v']
        # Reverse-time IIR filter y[t] = x[t] + gamma * y[t+1], i.e. a
        # discounted cumulative sum towards the end of the episode.
        df['advantage'] = signal.lfilter([1.,], [1., -self.gamma],
                                         df['delta'][::-1])[::-1]
        df['return'] = signal.lfilter([1.,], [1., -self.gamma],
                                      df['reward'][::-1])[::-1]
        self.replayer.store(df)

    def learn(self):
        """Run one natural-gradient actor update and one critic update.

        The actor update is skipped (with a warning) when the quadratic form
        ``x^T F x`` is not a finite positive number, e.g. because the policy
        gradient is exactly zero; the step scale would otherwise be NaN and
        overwrite every actor parameter with NaN.
        """
        states, actions, old_pis, advantages, returns = \
            self.replayer.sample(size=64)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        old_pi_tensor = torch.as_tensor(old_pis, dtype=torch.float)
        advantage_tensor = torch.as_tensor(advantages, dtype=torch.float)
        return_tensor = torch.as_tensor(returns,
                                        dtype=torch.float).unsqueeze(1)

        # update actor
        # ... calculate first order gradient: g
        all_pi_tensor = self.actor_net(state_tensor)
        pi_tensor = all_pi_tensor.gather(
            1, action_tensor.unsqueeze(1)).squeeze(1)
        surrogate_tensor = (pi_tensor / old_pi_tensor) * advantage_tensor
        loss_tensor = surrogate_tensor.mean()
        loss_grads = autograd.grad(loss_tensor, self.actor_net.parameters())
        # flatten for calculating conjugate gradient
        loss_grad = torch.cat([grad.view(-1) for grad in loss_grads]).detach()

        # ... calculate conjugate gradient: Fx = g
        def f(x):  # calculate Fx
            prob_tensor = self.actor_net(state_tensor)
            prob_old_tensor = prob_tensor.detach()
            kld_tensor = (prob_old_tensor * (torch.log((prob_old_tensor /
                    prob_tensor).clamp(1e-6, 1e6)))).sum(axis=1)
            kld_loss_tensor = kld_tensor.mean()
            grads = autograd.grad(kld_loss_tensor,
                                  self.actor_net.parameters(),
                                  create_graph=True)
            flatten_grad_tensor = torch.cat(
                [grad.view(-1) for grad in grads])
            grad_matmul_x = torch.dot(flatten_grad_tensor, x)
            # Differentiating (grad . x) again gives the Hessian-vector
            # product without materialising the Hessian.
            grad_grads = autograd.grad(grad_matmul_x,
                                       self.actor_net.parameters())
            flatten_grad_grad = torch.cat([grad.contiguous().view(-1)
                                           for grad in grad_grads]).detach()
            fx = flatten_grad_grad + x * 1e-2  # damping term
            return fx
        x, fx = conjugate_gradient(f, loss_grad)

        # ... calculate natural gradient: sqrt(...) g
        quad_form = torch.dot(fx, x)
        if torch.isfinite(quad_form) and quad_form > 0:
            natural_gradient = torch.sqrt(2 * self.max_kl / quad_form) * x

            # ... update actor net (ascent on the surrogate objective)
            begin = 0
            for param in self.actor_net.parameters():
                end = begin + param.numel()
                param.data.copy_(
                    natural_gradient[begin:end].view(param.size())
                    + param.data)
                begin = end
        else:
            logging.warning('skip actor update: x^T F x = %s',
                            quad_form.item())

        # update critic
        pred_tensor = self.critic_net(state_tensor)
        critic_loss_tensor = self.critic_loss(pred_tensor, return_tensor)
        self.critic_optimizer.zero_grad()
        critic_loss_tensor.backward()
        self.critic_optimizer.step()
# Instantiate the agent; its network sizes are taken from env's observation
# and action spaces.
agent = NPGAgent(env)
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` interacting with ``env``.

    The agent is asked for an action on every observation, including the
    final one (so it also sees the last reward and termination flag); the
    action chosen for the final observation is never sent to the environment.

    Args:
        env: environment whose ``reset(seed=...)`` returns ``(obs, info)``
            and whose ``step(action)`` returns a 5-tuple
            ``(obs, reward, terminated, truncated, info)``.
        agent: object providing ``reset(mode=...)``, ``step(obs, reward,
            terminated)`` and ``close()``.
        seed: seed forwarded to ``env.reset``.
        mode: mode forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called after each agent step.

    Returns:
        Tuple ``(episode_reward, elapsed_steps)``.
    """
    observation, _ = env.reset(seed=seed)
    reward = 0.
    terminated = truncated = False
    agent.reset(mode=mode)

    episode_reward = 0.
    elapsed_steps = 0
    done = False
    while not done:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        done = terminated or truncated
        if not done:
            observation, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            elapsed_steps += 1

    agent.close()
    return episode_reward, elapsed_steps
# ---- training: play seeded episodes until the mean reward of the most
# recent (up to) 10 episodes exceeds -120 ----
logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    episode_reward, elapsed_steps = play_episode(
        env, agent, seed=episode, mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > -120:
        break
    episode += 1
plt.plot(episode_rewards)

# ---- evaluation: 100 episodes with the agent's default (non-train) mode ----
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
             np.mean(episode_rewards), np.std(episode_rewards))
11:46:33 [INFO] ==== train ==== 11:46:33 [INFO] NumExpr defaulting to 8 threads. 11:46:33 [INFO] train episode 0: reward = -500.00, steps = 500 11:46:34 [INFO] train episode 1: reward = -500.00, steps = 500 11:46:34 [INFO] train episode 2: reward = -500.00, steps = 500 11:46:35 [INFO] train episode 3: reward = -500.00, steps = 500 11:46:35 [INFO] train episode 4: reward = -500.00, steps = 500 11:46:36 [INFO] train episode 5: reward = -500.00, steps = 500 11:46:36 [INFO] train episode 6: reward = -500.00, steps = 500 11:46:37 [INFO] train episode 7: reward = -500.00, steps = 500 11:46:37 [INFO] train episode 8: reward = -301.00, steps = 302 11:46:37 [INFO] train episode 9: reward = -500.00, steps = 500 11:46:38 [INFO] train episode 10: reward = -303.00, steps = 304 11:46:38 [INFO] train episode 11: reward = -500.00, steps = 500 11:46:39 [INFO] train episode 12: reward = -500.00, steps = 500 11:46:39 [INFO] train episode 13: reward = -366.00, steps = 367 11:46:39 [INFO] train episode 14: reward = -233.00, steps = 234 11:46:40 [INFO] train episode 15: reward = -288.00, steps = 289 11:46:40 [INFO] train episode 16: reward = -460.00, steps = 461 11:46:40 [INFO] train episode 17: reward = -303.00, steps = 304 11:46:41 [INFO] train episode 18: reward = -411.00, steps = 412 11:46:41 [INFO] train episode 19: reward = -500.00, steps = 500 11:46:41 [INFO] train episode 20: reward = -312.00, steps = 313 11:46:42 [INFO] train episode 21: reward = -500.00, steps = 500 11:46:42 [INFO] train episode 22: reward = -440.00, steps = 441 11:46:43 [INFO] train episode 23: reward = -500.00, steps = 500 11:46:43 [INFO] train episode 24: reward = -491.00, steps = 492 11:46:44 [INFO] train episode 25: reward = -245.00, steps = 246 11:46:44 [INFO] train episode 26: reward = -450.00, steps = 451 11:46:44 [INFO] train episode 27: reward = -229.00, steps = 230 11:46:45 [INFO] train episode 28: reward = -458.00, steps = 459 11:46:45 [INFO] train episode 29: reward = -500.00, steps = 500 11:46:45 
[INFO] train episode 30: reward = -223.00, steps = 224 11:46:45 [INFO] train episode 31: reward = -152.00, steps = 153 11:46:46 [INFO] train episode 32: reward = -167.00, steps = 168 11:46:46 [INFO] train episode 33: reward = -220.00, steps = 221 11:46:46 [INFO] train episode 34: reward = -188.00, steps = 189 11:46:46 [INFO] train episode 35: reward = -500.00, steps = 500 11:46:47 [INFO] train episode 36: reward = -450.00, steps = 451 11:46:47 [INFO] train episode 37: reward = -157.00, steps = 158 11:46:47 [INFO] train episode 38: reward = -272.00, steps = 273 11:46:48 [INFO] train episode 39: reward = -257.00, steps = 258 11:46:48 [INFO] train episode 40: reward = -259.00, steps = 260 11:46:48 [INFO] train episode 41: reward = -234.00, steps = 235 11:46:48 [INFO] train episode 42: reward = -270.00, steps = 271 11:46:49 [INFO] train episode 43: reward = -279.00, steps = 280 11:46:49 [INFO] train episode 44: reward = -232.00, steps = 233 11:46:49 [INFO] train episode 45: reward = -341.00, steps = 342 11:46:49 [INFO] train episode 46: reward = -263.00, steps = 264 11:46:50 [INFO] train episode 47: reward = -160.00, steps = 161 11:46:50 [INFO] train episode 48: reward = -276.00, steps = 277 11:46:50 [INFO] train episode 49: reward = -152.00, steps = 153 11:46:50 [INFO] train episode 50: reward = -238.00, steps = 239 11:46:50 [INFO] train episode 51: reward = -170.00, steps = 171 11:46:51 [INFO] train episode 52: reward = -191.00, steps = 192 11:46:51 [INFO] train episode 53: reward = -211.00, steps = 212 11:46:51 [INFO] train episode 54: reward = -299.00, steps = 300 11:46:51 [INFO] train episode 55: reward = -149.00, steps = 150 11:46:51 [INFO] train episode 56: reward = -149.00, steps = 150 11:46:52 [INFO] train episode 57: reward = -217.00, steps = 218 11:46:52 [INFO] train episode 58: reward = -266.00, steps = 267 11:46:52 [INFO] train episode 59: reward = -144.00, steps = 145 11:46:52 [INFO] train episode 60: reward = -222.00, steps = 223 11:46:53 [INFO] train 
episode 61: reward = -161.00, steps = 162 11:46:53 [INFO] train episode 62: reward = -188.00, steps = 189 11:46:53 [INFO] train episode 63: reward = -147.00, steps = 148 11:46:53 [INFO] train episode 64: reward = -168.00, steps = 169 11:46:53 [INFO] train episode 65: reward = -201.00, steps = 202 11:46:53 [INFO] train episode 66: reward = -161.00, steps = 162 11:46:54 [INFO] train episode 67: reward = -191.00, steps = 192 11:46:54 [INFO] train episode 68: reward = -166.00, steps = 167 11:46:54 [INFO] train episode 69: reward = -189.00, steps = 190 11:46:54 [INFO] train episode 70: reward = -117.00, steps = 118 11:46:54 [INFO] train episode 71: reward = -171.00, steps = 172 11:46:55 [INFO] train episode 72: reward = -160.00, steps = 161 11:46:55 [INFO] train episode 73: reward = -285.00, steps = 286 11:46:55 [INFO] train episode 74: reward = -216.00, steps = 217 11:46:55 [INFO] train episode 75: reward = -157.00, steps = 158 11:46:55 [INFO] train episode 76: reward = -230.00, steps = 231 11:46:56 [INFO] train episode 77: reward = -448.00, steps = 449 11:46:56 [INFO] train episode 78: reward = -138.00, steps = 139 11:46:56 [INFO] train episode 79: reward = -174.00, steps = 175 11:46:56 [INFO] train episode 80: reward = -223.00, steps = 224 11:46:56 [INFO] train episode 81: reward = -171.00, steps = 172 11:46:56 [INFO] train episode 82: reward = -96.00, steps = 97 11:46:57 [INFO] train episode 83: reward = -238.00, steps = 239 11:46:57 [INFO] train episode 84: reward = -158.00, steps = 159 11:46:57 [INFO] train episode 85: reward = -183.00, steps = 184 11:46:57 [INFO] train episode 86: reward = -154.00, steps = 155 11:46:57 [INFO] train episode 87: reward = -115.00, steps = 116 11:46:58 [INFO] train episode 88: reward = -174.00, steps = 175 11:46:58 [INFO] train episode 89: reward = -150.00, steps = 151 11:46:58 [INFO] train episode 90: reward = -191.00, steps = 192 11:46:58 [INFO] train episode 91: reward = -178.00, steps = 179 11:46:58 [INFO] train episode 92: 
reward = -142.00, steps = 143 11:46:58 [INFO] train episode 93: reward = -169.00, steps = 170 11:46:59 [INFO] train episode 94: reward = -194.00, steps = 195 11:46:59 [INFO] train episode 95: reward = -137.00, steps = 138 11:46:59 [INFO] train episode 96: reward = -156.00, steps = 157 11:46:59 [INFO] train episode 97: reward = -142.00, steps = 143 11:46:59 [INFO] train episode 98: reward = -223.00, steps = 224 11:47:00 [INFO] train episode 99: reward = -176.00, steps = 177 11:47:00 [INFO] train episode 100: reward = -127.00, steps = 128 11:47:00 [INFO] train episode 101: reward = -163.00, steps = 164 11:47:00 [INFO] train episode 102: reward = -280.00, steps = 281 11:47:00 [INFO] train episode 103: reward = -136.00, steps = 137 11:47:01 [INFO] train episode 104: reward = -161.00, steps = 162 11:47:01 [INFO] train episode 105: reward = -186.00, steps = 187 11:47:01 [INFO] train episode 106: reward = -218.00, steps = 219 11:47:01 [INFO] train episode 107: reward = -162.00, steps = 163 11:47:01 [INFO] train episode 108: reward = -142.00, steps = 143 11:47:01 [INFO] train episode 109: reward = -124.00, steps = 125 11:47:02 [INFO] train episode 110: reward = -128.00, steps = 129 11:47:02 [INFO] train episode 111: reward = -160.00, steps = 161 11:47:02 [INFO] train episode 112: reward = -213.00, steps = 214 11:47:02 [INFO] train episode 113: reward = -172.00, steps = 173 11:47:02 [INFO] train episode 114: reward = -140.00, steps = 141 11:47:02 [INFO] train episode 115: reward = -169.00, steps = 170 11:47:03 [INFO] train episode 116: reward = -225.00, steps = 226 11:47:03 [INFO] train episode 117: reward = -114.00, steps = 115 11:47:03 [INFO] train episode 118: reward = -161.00, steps = 162 11:47:03 [INFO] train episode 119: reward = -200.00, steps = 201 11:47:03 [INFO] train episode 120: reward = -120.00, steps = 121 11:47:04 [INFO] train episode 121: reward = -182.00, steps = 183 11:47:04 [INFO] train episode 122: reward = -162.00, steps = 163 11:47:04 [INFO] train 
episode 123: reward = -135.00, steps = 136 11:47:04 [INFO] train episode 124: reward = -189.00, steps = 190 11:47:04 [INFO] train episode 125: reward = -178.00, steps = 179 11:47:05 [INFO] train episode 126: reward = -145.00, steps = 146 11:47:05 [INFO] train episode 127: reward = -128.00, steps = 129 11:47:05 [INFO] train episode 128: reward = -191.00, steps = 192 11:47:05 [INFO] train episode 129: reward = -189.00, steps = 190 11:47:05 [INFO] train episode 130: reward = -186.00, steps = 187 11:47:06 [INFO] train episode 131: reward = -131.00, steps = 132 11:47:06 [INFO] train episode 132: reward = -193.00, steps = 194 11:47:06 [INFO] train episode 133: reward = -154.00, steps = 155 11:47:06 [INFO] train episode 134: reward = -146.00, steps = 147 11:47:07 [INFO] train episode 135: reward = -267.00, steps = 268 11:47:07 [INFO] train episode 136: reward = -116.00, steps = 117 11:47:07 [INFO] train episode 137: reward = -167.00, steps = 168 11:47:07 [INFO] train episode 138: reward = -149.00, steps = 150 11:47:07 [INFO] train episode 139: reward = -111.00, steps = 112 11:47:07 [INFO] train episode 140: reward = -139.00, steps = 140 11:47:07 [INFO] train episode 141: reward = -243.00, steps = 244 11:47:08 [INFO] train episode 142: reward = -149.00, steps = 150 11:47:08 [INFO] train episode 143: reward = -170.00, steps = 171 11:47:08 [INFO] train episode 144: reward = -171.00, steps = 172 11:47:08 [INFO] train episode 145: reward = -151.00, steps = 152 11:47:08 [INFO] train episode 146: reward = -121.00, steps = 122 11:47:09 [INFO] train episode 147: reward = -158.00, steps = 159 11:47:09 [INFO] train episode 148: reward = -119.00, steps = 120 11:47:09 [INFO] train episode 149: reward = -153.00, steps = 154 11:47:09 [INFO] train episode 150: reward = -146.00, steps = 147 11:47:09 [INFO] train episode 151: reward = -133.00, steps = 134 11:47:09 [INFO] train episode 152: reward = -152.00, steps = 153 11:47:10 [INFO] train episode 153: reward = -153.00, steps = 154 
11:47:10 [INFO] train episode 154: reward = -128.00, steps = 129 11:47:10 [INFO] train episode 155: reward = -117.00, steps = 118 11:47:10 [INFO] train episode 156: reward = -185.00, steps = 186 11:47:10 [INFO] train episode 157: reward = -170.00, steps = 171 11:47:11 [INFO] train episode 158: reward = -147.00, steps = 148 11:47:11 [INFO] train episode 159: reward = -218.00, steps = 219 11:47:11 [INFO] train episode 160: reward = -128.00, steps = 129 11:47:11 [INFO] train episode 161: reward = -180.00, steps = 181 11:47:11 [INFO] train episode 162: reward = -149.00, steps = 150 11:47:12 [INFO] train episode 163: reward = -281.00, steps = 282 11:47:12 [INFO] train episode 164: reward = -137.00, steps = 138 11:47:12 [INFO] train episode 165: reward = -146.00, steps = 147 11:47:12 [INFO] train episode 166: reward = -155.00, steps = 156 11:47:12 [INFO] train episode 167: reward = -155.00, steps = 156 11:47:13 [INFO] train episode 168: reward = -218.00, steps = 219 11:47:13 [INFO] train episode 169: reward = -131.00, steps = 132 11:47:13 [INFO] train episode 170: reward = -149.00, steps = 150 11:47:13 [INFO] train episode 171: reward = -172.00, steps = 173 11:47:13 [INFO] train episode 172: reward = -143.00, steps = 144 11:47:13 [INFO] train episode 173: reward = -95.00, steps = 96 11:47:13 [INFO] train episode 174: reward = -168.00, steps = 169 11:47:14 [INFO] train episode 175: reward = -115.00, steps = 116 11:47:14 [INFO] train episode 176: reward = -106.00, steps = 107 11:47:14 [INFO] train episode 177: reward = -288.00, steps = 289 11:47:14 [INFO] train episode 178: reward = -142.00, steps = 143 11:47:14 [INFO] train episode 179: reward = -205.00, steps = 206 11:47:15 [INFO] train episode 180: reward = -119.00, steps = 120 11:47:15 [INFO] train episode 181: reward = -197.00, steps = 198 11:47:15 [INFO] train episode 182: reward = -226.00, steps = 227 11:47:15 [INFO] train episode 183: reward = -169.00, steps = 170 11:47:16 [INFO] train episode 184: reward = 
-214.00, steps = 215 11:47:16 [INFO] train episode 185: reward = -83.00, steps = 84 11:47:16 [INFO] train episode 186: reward = -144.00, steps = 145 11:47:16 [INFO] train episode 187: reward = -155.00, steps = 156 11:47:16 [INFO] train episode 188: reward = -143.00, steps = 144 11:47:16 [INFO] train episode 189: reward = -152.00, steps = 153 11:47:17 [INFO] train episode 190: reward = -145.00, steps = 146 11:47:17 [INFO] train episode 191: reward = -179.00, steps = 180 11:47:17 [INFO] train episode 192: reward = -151.00, steps = 152 11:47:17 [INFO] train episode 193: reward = -133.00, steps = 134 11:47:17 [INFO] train episode 194: reward = -182.00, steps = 183 11:47:17 [INFO] train episode 195: reward = -146.00, steps = 147 11:47:18 [INFO] train episode 196: reward = -142.00, steps = 143 11:47:18 [INFO] train episode 197: reward = -156.00, steps = 157 11:47:18 [INFO] train episode 198: reward = -163.00, steps = 164 11:47:18 [INFO] train episode 199: reward = -169.00, steps = 170 11:47:18 [INFO] train episode 200: reward = -126.00, steps = 127 11:47:18 [INFO] train episode 201: reward = -126.00, steps = 127 11:47:18 [INFO] train episode 202: reward = -175.00, steps = 176 11:47:19 [INFO] train episode 203: reward = -146.00, steps = 147 11:47:19 [INFO] train episode 204: reward = -103.00, steps = 104 11:47:19 [INFO] train episode 205: reward = -180.00, steps = 181 11:47:19 [INFO] train episode 206: reward = -106.00, steps = 107 11:47:19 [INFO] train episode 207: reward = -232.00, steps = 233 11:47:19 [INFO] train episode 208: reward = -108.00, steps = 109 11:47:20 [INFO] train episode 209: reward = -124.00, steps = 125 11:47:20 [INFO] train episode 210: reward = -132.00, steps = 133 11:47:20 [INFO] train episode 211: reward = -189.00, steps = 190 11:47:20 [INFO] train episode 212: reward = -158.00, steps = 159 11:47:20 [INFO] train episode 213: reward = -184.00, steps = 185 11:47:20 [INFO] train episode 214: reward = -120.00, steps = 121 11:47:21 [INFO] train episode 
215: reward = -160.00, steps = 161 11:47:21 [INFO] train episode 216: reward = -137.00, steps = 138 11:47:21 [INFO] train episode 217: reward = -110.00, steps = 111 11:47:21 [INFO] train episode 218: reward = -108.00, steps = 109 11:47:21 [INFO] train episode 219: reward = -108.00, steps = 109 11:47:21 [INFO] train episode 220: reward = -189.00, steps = 190 11:47:21 [INFO] train episode 221: reward = -118.00, steps = 119 11:47:22 [INFO] train episode 222: reward = -184.00, steps = 185 11:47:22 [INFO] train episode 223: reward = -137.00, steps = 138 11:47:22 [INFO] train episode 224: reward = -130.00, steps = 131 11:47:22 [INFO] train episode 225: reward = -165.00, steps = 166 11:47:22 [INFO] train episode 226: reward = -108.00, steps = 109 11:47:22 [INFO] train episode 227: reward = -192.00, steps = 193 11:47:23 [INFO] train episode 228: reward = -251.00, steps = 252 11:47:23 [INFO] train episode 229: reward = -123.00, steps = 124 11:47:23 [INFO] train episode 230: reward = -243.00, steps = 244 11:47:23 [INFO] train episode 231: reward = -183.00, steps = 184 11:47:23 [INFO] train episode 232: reward = -158.00, steps = 159 11:47:24 [INFO] train episode 233: reward = -164.00, steps = 165 11:47:24 [INFO] train episode 234: reward = -163.00, steps = 164 11:47:24 [INFO] train episode 235: reward = -163.00, steps = 164 11:47:24 [INFO] train episode 236: reward = -299.00, steps = 300 11:47:25 [INFO] train episode 237: reward = -196.00, steps = 197 11:47:25 [INFO] train episode 238: reward = -116.00, steps = 117 11:47:25 [INFO] train episode 239: reward = -152.00, steps = 153 11:47:25 [INFO] train episode 240: reward = -144.00, steps = 145 11:47:25 [INFO] train episode 241: reward = -157.00, steps = 158 11:47:25 [INFO] train episode 242: reward = -165.00, steps = 166 11:47:25 [INFO] train episode 243: reward = -138.00, steps = 139 11:47:26 [INFO] train episode 244: reward = -136.00, steps = 137 11:47:26 [INFO] train episode 245: reward = -184.00, steps = 185 11:47:26 
[INFO] train episode 246: reward = -123.00, steps = 124 11:47:26 [INFO] train episode 247: reward = -144.00, steps = 145 11:47:26 [INFO] train episode 248: reward = -103.00, steps = 104 11:47:26 [INFO] train episode 249: reward = -114.00, steps = 115 11:47:26 [INFO] train episode 250: reward = -174.00, steps = 175 11:47:26 [INFO] train episode 251: reward = -131.00, steps = 132 11:47:27 [INFO] train episode 252: reward = -135.00, steps = 136 11:47:27 [INFO] train episode 253: reward = -177.00, steps = 178 11:47:27 [INFO] train episode 254: reward = -132.00, steps = 133 11:47:27 [INFO] train episode 255: reward = -136.00, steps = 137 11:47:27 [INFO] train episode 256: reward = -156.00, steps = 157 11:47:27 [INFO] train episode 257: reward = -156.00, steps = 157 11:47:28 [INFO] train episode 258: reward = -147.00, steps = 148 11:47:28 [INFO] train episode 259: reward = -117.00, steps = 118 11:47:28 [INFO] train episode 260: reward = -161.00, steps = 162 11:47:28 [INFO] train episode 261: reward = -133.00, steps = 134 11:47:28 [INFO] train episode 262: reward = -135.00, steps = 136 11:47:28 [INFO] train episode 263: reward = -158.00, steps = 159 11:47:29 [INFO] train episode 264: reward = -187.00, steps = 188 11:47:29 [INFO] train episode 265: reward = -183.00, steps = 184 11:47:29 [INFO] train episode 266: reward = -152.00, steps = 153 11:47:29 [INFO] train episode 267: reward = -241.00, steps = 242 11:47:29 [INFO] train episode 268: reward = -187.00, steps = 188 11:47:30 [INFO] train episode 269: reward = -156.00, steps = 157 11:47:30 [INFO] train episode 270: reward = -156.00, steps = 157 11:47:30 [INFO] train episode 271: reward = -164.00, steps = 165 11:47:30 [INFO] train episode 272: reward = -144.00, steps = 145 11:47:30 [INFO] train episode 273: reward = -216.00, steps = 217 11:47:30 [INFO] train episode 274: reward = -115.00, steps = 116 11:47:31 [INFO] train episode 275: reward = -128.00, steps = 129 11:47:31 [INFO] train episode 276: reward = -170.00, steps 
= 171 11:47:31 [INFO] train episode 277: reward = -174.00, steps = 175 11:47:31 [INFO] train episode 278: reward = -277.00, steps = 278 11:47:31 [INFO] train episode 279: reward = -169.00, steps = 170 11:47:32 [INFO] train episode 280: reward = -159.00, steps = 160 11:47:32 [INFO] train episode 281: reward = -170.00, steps = 171 11:47:32 [INFO] train episode 282: reward = -155.00, steps = 156 11:47:32 [INFO] train episode 283: reward = -195.00, steps = 196 11:47:32 [INFO] train episode 284: reward = -152.00, steps = 153 11:47:32 [INFO] train episode 285: reward = -154.00, steps = 155 11:47:33 [INFO] train episode 286: reward = -154.00, steps = 155 11:47:33 [INFO] train episode 287: reward = -217.00, steps = 218 11:47:33 [INFO] train episode 288: reward = -264.00, steps = 265 11:47:33 [INFO] train episode 289: reward = -185.00, steps = 186 11:47:34 [INFO] train episode 290: reward = -163.00, steps = 164 11:47:34 [INFO] train episode 291: reward = -148.00, steps = 149 11:47:34 [INFO] train episode 292: reward = -169.00, steps = 170 11:47:34 [INFO] train episode 293: reward = -170.00, steps = 171 11:47:34 [INFO] train episode 294: reward = -135.00, steps = 136 11:47:34 [INFO] train episode 295: reward = -146.00, steps = 147 11:47:34 [INFO] train episode 296: reward = -174.00, steps = 175 11:47:35 [INFO] train episode 297: reward = -128.00, steps = 129 11:47:35 [INFO] train episode 298: reward = -158.00, steps = 159 11:47:35 [INFO] train episode 299: reward = -140.00, steps = 141 11:47:35 [INFO] train episode 300: reward = -179.00, steps = 180 11:47:35 [INFO] train episode 301: reward = -151.00, steps = 152 11:47:35 [INFO] train episode 302: reward = -124.00, steps = 125 11:47:35 [INFO] train episode 303: reward = -151.00, steps = 152 11:47:36 [INFO] train episode 304: reward = -158.00, steps = 159 11:47:36 [INFO] train episode 305: reward = -97.00, steps = 98 11:47:36 [INFO] train episode 306: reward = -198.00, steps = 199 11:47:36 [INFO] train episode 307: reward = 
-148.00, steps = 149 11:47:36 [INFO] train episode 308: reward = -170.00, steps = 171 11:47:36 [INFO] train episode 309: reward = -147.00, steps = 148 11:47:37 [INFO] train episode 310: reward = -116.00, steps = 117 11:47:37 [INFO] train episode 311: reward = -140.00, steps = 141 11:47:37 [INFO] train episode 312: reward = -211.00, steps = 212 11:47:37 [INFO] train episode 313: reward = -186.00, steps = 187 11:47:37 [INFO] train episode 314: reward = -151.00, steps = 152 11:47:37 [INFO] train episode 315: reward = -126.00, steps = 127 11:47:38 [INFO] train episode 316: reward = -163.00, steps = 164 11:47:38 [INFO] train episode 317: reward = -115.00, steps = 116 11:47:38 [INFO] train episode 318: reward = -156.00, steps = 157 11:47:38 [INFO] train episode 319: reward = -177.00, steps = 178 11:47:38 [INFO] train episode 320: reward = -147.00, steps = 148 11:47:38 [INFO] train episode 321: reward = -156.00, steps = 157 11:47:38 [INFO] train episode 322: reward = -106.00, steps = 107 11:47:39 [INFO] train episode 323: reward = -148.00, steps = 149 11:47:39 [INFO] train episode 324: reward = -149.00, steps = 150 11:47:39 [INFO] train episode 325: reward = -119.00, steps = 120 11:47:39 [INFO] train episode 326: reward = -153.00, steps = 154 11:47:39 [INFO] train episode 327: reward = -130.00, steps = 131 11:47:39 [INFO] train episode 328: reward = -138.00, steps = 139 11:47:40 [INFO] train episode 329: reward = -162.00, steps = 163 11:47:40 [INFO] train episode 330: reward = -165.00, steps = 166 11:47:40 [INFO] train episode 331: reward = -148.00, steps = 149 11:47:40 [INFO] train episode 332: reward = -165.00, steps = 166 11:47:40 [INFO] train episode 333: reward = -159.00, steps = 160 11:47:41 [INFO] train episode 334: reward = -244.00, steps = 245 11:47:41 [INFO] train episode 335: reward = -241.00, steps = 242 11:47:41 [INFO] train episode 336: reward = -119.00, steps = 120 11:47:41 [INFO] train episode 337: reward = -181.00, steps = 182 11:47:41 [INFO] train 
episode 338: reward = -156.00, steps = 157 11:47:41 [INFO] train episode 339: reward = -129.00, steps = 130 11:47:42 [INFO] train episode 340: reward = -136.00, steps = 137 11:47:42 [INFO] train episode 341: reward = -127.00, steps = 128 11:47:42 [INFO] train episode 342: reward = -162.00, steps = 163 11:47:42 [INFO] train episode 343: reward = -152.00, steps = 153 11:47:42 [INFO] train episode 344: reward = -123.00, steps = 124 11:47:42 [INFO] train episode 345: reward = -125.00, steps = 126 11:47:42 [INFO] train episode 346: reward = -195.00, steps = 196 11:47:43 [INFO] train episode 347: reward = -190.00, steps = 191 11:47:43 [INFO] train episode 348: reward = -97.00, steps = 98 11:47:43 [INFO] train episode 349: reward = -149.00, steps = 150 11:47:43 [INFO] train episode 350: reward = -105.00, steps = 106 11:47:43 [INFO] train episode 351: reward = -137.00, steps = 138 11:47:43 [INFO] train episode 352: reward = -140.00, steps = 141 11:47:43 [INFO] train episode 353: reward = -138.00, steps = 139 11:47:44 [INFO] train episode 354: reward = -129.00, steps = 130 11:47:44 [INFO] train episode 355: reward = -169.00, steps = 170 11:47:44 [INFO] train episode 356: reward = -147.00, steps = 148 11:47:44 [INFO] train episode 357: reward = -200.00, steps = 201 11:47:44 [INFO] train episode 358: reward = -154.00, steps = 155 11:47:44 [INFO] train episode 359: reward = -251.00, steps = 252 11:47:45 [INFO] train episode 360: reward = -157.00, steps = 158 11:47:45 [INFO] train episode 361: reward = -167.00, steps = 168 11:47:45 [INFO] train episode 362: reward = -153.00, steps = 154 11:47:45 [INFO] train episode 363: reward = -161.00, steps = 162 11:47:45 [INFO] train episode 364: reward = -189.00, steps = 190 11:47:45 [INFO] train episode 365: reward = -199.00, steps = 200 11:47:46 [INFO] train episode 366: reward = -213.00, steps = 214 11:47:46 [INFO] train episode 367: reward = -132.00, steps = 133 11:47:46 [INFO] train episode 368: reward = -168.00, steps = 169 11:47:46 
[INFO] train episode 369: reward = -118.00, steps = 119 11:47:46 [INFO] train episode 370: reward = -101.00, steps = 102 11:47:46 [INFO] train episode 371: reward = -142.00, steps = 143 11:47:47 [INFO] train episode 372: reward = -134.00, steps = 135 11:47:47 [INFO] train episode 373: reward = -128.00, steps = 129 11:47:47 [INFO] train episode 374: reward = -144.00, steps = 145 11:47:47 [INFO] train episode 375: reward = -131.00, steps = 132 11:47:47 [INFO] train episode 376: reward = -159.00, steps = 160 11:47:47 [INFO] train episode 377: reward = -134.00, steps = 135 11:47:48 [INFO] train episode 378: reward = -188.00, steps = 189 11:47:48 [INFO] train episode 379: reward = -104.00, steps = 105 11:47:48 [INFO] train episode 380: reward = -138.00, steps = 139 11:47:48 [INFO] train episode 381: reward = -247.00, steps = 248 11:47:48 [INFO] train episode 382: reward = -165.00, steps = 166 11:47:48 [INFO] train episode 383: reward = -138.00, steps = 139 11:47:49 [INFO] train episode 384: reward = -146.00, steps = 147 11:47:49 [INFO] train episode 385: reward = -132.00, steps = 133 11:47:49 [INFO] train episode 386: reward = -141.00, steps = 142 11:47:49 [INFO] train episode 387: reward = -143.00, steps = 144 11:47:49 [INFO] train episode 388: reward = -160.00, steps = 161 11:47:49 [INFO] train episode 389: reward = -159.00, steps = 160 11:47:50 [INFO] train episode 390: reward = -113.00, steps = 114 11:47:50 [INFO] train episode 391: reward = -145.00, steps = 146 11:47:50 [INFO] train episode 392: reward = -282.00, steps = 283 11:47:50 [INFO] train episode 393: reward = -189.00, steps = 190 11:47:50 [INFO] train episode 394: reward = -138.00, steps = 139 11:47:51 [INFO] train episode 395: reward = -157.00, steps = 158 11:47:51 [INFO] train episode 396: reward = -118.00, steps = 119 11:47:51 [INFO] train episode 397: reward = -297.00, steps = 298 11:47:51 [INFO] train episode 398: reward = -181.00, steps = 182 11:47:51 [INFO] train episode 399: reward = -144.00, steps 
= 145 11:47:51 [INFO] train episode 400: reward = -131.00, steps = 132 11:47:52 [INFO] train episode 401: reward = -112.00, steps = 113 11:47:52 [INFO] train episode 402: reward = -132.00, steps = 133 11:47:52 [INFO] train episode 403: reward = -121.00, steps = 122 11:47:52 [INFO] train episode 404: reward = -181.00, steps = 182 11:47:52 [INFO] train episode 405: reward = -134.00, steps = 135 11:47:52 [INFO] train episode 406: reward = -205.00, steps = 206 11:47:53 [INFO] train episode 407: reward = -124.00, steps = 125 11:47:53 [INFO] train episode 408: reward = -126.00, steps = 127 11:47:53 [INFO] train episode 409: reward = -152.00, steps = 153 11:47:53 [INFO] train episode 410: reward = -116.00, steps = 117 11:47:53 [INFO] train episode 411: reward = -150.00, steps = 151 11:47:53 [INFO] train episode 412: reward = -148.00, steps = 149 11:47:53 [INFO] train episode 413: reward = -167.00, steps = 168 11:47:54 [INFO] train episode 414: reward = -169.00, steps = 170 11:47:54 [INFO] train episode 415: reward = -143.00, steps = 144 11:47:54 [INFO] train episode 416: reward = -131.00, steps = 132 11:47:54 [INFO] train episode 417: reward = -115.00, steps = 116 11:47:54 [INFO] train episode 418: reward = -138.00, steps = 139 11:47:54 [INFO] train episode 419: reward = -142.00, steps = 143 11:47:54 [INFO] train episode 420: reward = -140.00, steps = 141 11:47:54 [INFO] train episode 421: reward = -101.00, steps = 102 11:47:55 [INFO] train episode 422: reward = -129.00, steps = 130 11:47:55 [INFO] train episode 423: reward = -132.00, steps = 133 11:47:55 [INFO] train episode 424: reward = -500.00, steps = 500 11:47:55 [INFO] train episode 425: reward = -152.00, steps = 153 11:47:56 [INFO] train episode 426: reward = -229.00, steps = 230 11:47:56 [INFO] train episode 427: reward = -123.00, steps = 124 11:47:56 [INFO] train episode 428: reward = -170.00, steps = 171 11:47:56 [INFO] train episode 429: reward = -120.00, steps = 121 11:47:56 [INFO] train episode 430: reward = 
-129.00, steps = 130 11:47:56 [INFO] train episode 431: reward = -112.00, steps = 113 11:47:56 [INFO] train episode 432: reward = -105.00, steps = 106 11:47:57 [INFO] train episode 433: reward = -141.00, steps = 142 11:47:57 [INFO] train episode 434: reward = -132.00, steps = 133 11:47:57 [INFO] train episode 435: reward = -114.00, steps = 115 11:47:57 [INFO] train episode 436: reward = -157.00, steps = 158 11:47:57 [INFO] train episode 437: reward = -108.00, steps = 109 11:47:57 [INFO] train episode 438: reward = -114.00, steps = 115 11:47:58 [INFO] train episode 439: reward = -146.00, steps = 147 11:47:58 [INFO] train episode 440: reward = -131.00, steps = 132 11:47:58 [INFO] train episode 441: reward = -157.00, steps = 158 11:47:58 [INFO] train episode 442: reward = -132.00, steps = 133 11:47:58 [INFO] train episode 443: reward = -95.00, steps = 96 11:47:58 [INFO] train episode 444: reward = -177.00, steps = 178 11:47:59 [INFO] train episode 445: reward = -136.00, steps = 137 11:47:59 [INFO] train episode 446: reward = -127.00, steps = 128 11:47:59 [INFO] train episode 447: reward = -129.00, steps = 130 11:47:59 [INFO] train episode 448: reward = -111.00, steps = 112 11:47:59 [INFO] train episode 449: reward = -127.00, steps = 128 11:47:59 [INFO] train episode 450: reward = -103.00, steps = 104 11:48:00 [INFO] train episode 451: reward = -171.00, steps = 172 11:48:00 [INFO] train episode 452: reward = -107.00, steps = 108 11:48:00 [INFO] train episode 453: reward = -98.00, steps = 99 11:48:00 [INFO] train episode 454: reward = -149.00, steps = 150 11:48:00 [INFO] train episode 455: reward = -95.00, steps = 96 11:48:00 [INFO] train episode 456: reward = -107.00, steps = 108 11:48:00 [INFO] ==== test ==== 11:48:00 [INFO] test episode 0: reward = -117.00, steps = 118 11:48:00 [INFO] test episode 1: reward = -211.00, steps = 212 11:48:00 [INFO] test episode 2: reward = -91.00, steps = 92 11:48:01 [INFO] test episode 3: reward = -123.00, steps = 124 11:48:01 [INFO] 
test episode 4: reward = -118.00, steps = 119 11:48:01 [INFO] test episode 5: reward = -122.00, steps = 123 11:48:01 [INFO] test episode 6: reward = -129.00, steps = 130 11:48:01 [INFO] test episode 7: reward = -116.00, steps = 117 11:48:01 [INFO] test episode 8: reward = -132.00, steps = 133 11:48:01 [INFO] test episode 9: reward = -105.00, steps = 106 11:48:01 [INFO] test episode 10: reward = -128.00, steps = 129 11:48:02 [INFO] test episode 11: reward = -289.00, steps = 290 11:48:02 [INFO] test episode 12: reward = -132.00, steps = 133 11:48:02 [INFO] test episode 13: reward = -120.00, steps = 121 11:48:02 [INFO] test episode 14: reward = -121.00, steps = 122 11:48:02 [INFO] test episode 15: reward = -117.00, steps = 118 11:48:02 [INFO] test episode 16: reward = -164.00, steps = 165 11:48:02 [INFO] test episode 17: reward = -202.00, steps = 203 11:48:02 [INFO] test episode 18: reward = -100.00, steps = 101 11:48:02 [INFO] test episode 19: reward = -104.00, steps = 105 11:48:03 [INFO] test episode 20: reward = -142.00, steps = 143 11:48:03 [INFO] test episode 21: reward = -133.00, steps = 134 11:48:03 [INFO] test episode 22: reward = -156.00, steps = 157 11:48:03 [INFO] test episode 23: reward = -110.00, steps = 111 11:48:03 [INFO] test episode 24: reward = -176.00, steps = 177 11:48:03 [INFO] test episode 25: reward = -105.00, steps = 106 11:48:03 [INFO] test episode 26: reward = -162.00, steps = 163 11:48:03 [INFO] test episode 27: reward = -133.00, steps = 134 11:48:04 [INFO] test episode 28: reward = -127.00, steps = 128 11:48:04 [INFO] test episode 29: reward = -126.00, steps = 127 11:48:04 [INFO] test episode 30: reward = -157.00, steps = 158 11:48:04 [INFO] test episode 31: reward = -125.00, steps = 126 11:48:04 [INFO] test episode 32: reward = -92.00, steps = 93 11:48:04 [INFO] test episode 33: reward = -200.00, steps = 201 11:48:04 [INFO] test episode 34: reward = -127.00, steps = 128 11:48:04 [INFO] test episode 35: reward = -160.00, steps = 161 
11:48:04 [INFO] test episode 36: reward = -106.00, steps = 107 11:48:04 [INFO] test episode 37: reward = -124.00, steps = 125 11:48:05 [INFO] test episode 38: reward = -189.00, steps = 190 11:48:05 [INFO] test episode 39: reward = -159.00, steps = 160 11:48:05 [INFO] test episode 40: reward = -128.00, steps = 129 11:48:05 [INFO] test episode 41: reward = -193.00, steps = 194 11:48:05 [INFO] test episode 42: reward = -142.00, steps = 143 11:48:05 [INFO] test episode 43: reward = -103.00, steps = 104 11:48:05 [INFO] test episode 44: reward = -268.00, steps = 269 11:48:05 [INFO] test episode 45: reward = -164.00, steps = 165 11:48:06 [INFO] test episode 46: reward = -133.00, steps = 134 11:48:06 [INFO] test episode 47: reward = -104.00, steps = 105 11:48:06 [INFO] test episode 48: reward = -151.00, steps = 152 11:48:06 [INFO] test episode 49: reward = -120.00, steps = 121 11:48:06 [INFO] test episode 50: reward = -118.00, steps = 119 11:48:06 [INFO] test episode 51: reward = -122.00, steps = 123 11:48:06 [INFO] test episode 52: reward = -162.00, steps = 163 11:48:06 [INFO] test episode 53: reward = -150.00, steps = 151 11:48:06 [INFO] test episode 54: reward = -137.00, steps = 138 11:48:06 [INFO] test episode 55: reward = -134.00, steps = 135 11:48:07 [INFO] test episode 56: reward = -112.00, steps = 113 11:48:07 [INFO] test episode 57: reward = -113.00, steps = 114 11:48:07 [INFO] test episode 58: reward = -133.00, steps = 134 11:48:07 [INFO] test episode 59: reward = -169.00, steps = 170 11:48:07 [INFO] test episode 60: reward = -151.00, steps = 152 11:48:07 [INFO] test episode 61: reward = -142.00, steps = 143 11:48:07 [INFO] test episode 62: reward = -123.00, steps = 124 11:48:07 [INFO] test episode 63: reward = -140.00, steps = 141 11:48:07 [INFO] test episode 64: reward = -143.00, steps = 144 11:48:08 [INFO] test episode 65: reward = -116.00, steps = 117 11:48:08 [INFO] test episode 66: reward = -151.00, steps = 152 11:48:08 [INFO] test episode 67: reward = 
-113.00, steps = 114 11:48:08 [INFO] test episode 68: reward = -130.00, steps = 131 11:48:08 [INFO] test episode 69: reward = -222.00, steps = 223 11:48:08 [INFO] test episode 70: reward = -139.00, steps = 140 11:48:08 [INFO] test episode 71: reward = -105.00, steps = 106 11:48:08 [INFO] test episode 72: reward = -91.00, steps = 92 11:48:08 [INFO] test episode 73: reward = -98.00, steps = 99 11:48:08 [INFO] test episode 74: reward = -109.00, steps = 110 11:48:09 [INFO] test episode 75: reward = -93.00, steps = 94 11:48:09 [INFO] test episode 76: reward = -130.00, steps = 131 11:48:09 [INFO] test episode 77: reward = -101.00, steps = 102 11:48:09 [INFO] test episode 78: reward = -120.00, steps = 121 11:48:09 [INFO] test episode 79: reward = -115.00, steps = 116 11:48:09 [INFO] test episode 80: reward = -172.00, steps = 173 11:48:09 [INFO] test episode 81: reward = -127.00, steps = 128 11:48:09 [INFO] test episode 82: reward = -105.00, steps = 106 11:48:09 [INFO] test episode 83: reward = -131.00, steps = 132 11:48:09 [INFO] test episode 84: reward = -128.00, steps = 129 11:48:10 [INFO] test episode 85: reward = -165.00, steps = 166 11:48:10 [INFO] test episode 86: reward = -97.00, steps = 98 11:48:10 [INFO] test episode 87: reward = -176.00, steps = 177 11:48:10 [INFO] test episode 88: reward = -134.00, steps = 135 11:48:10 [INFO] test episode 89: reward = -110.00, steps = 111 11:48:10 [INFO] test episode 90: reward = -108.00, steps = 109 11:48:10 [INFO] test episode 91: reward = -191.00, steps = 192 11:48:10 [INFO] test episode 92: reward = -133.00, steps = 134 11:48:10 [INFO] test episode 93: reward = -127.00, steps = 128 11:48:10 [INFO] test episode 94: reward = -138.00, steps = 139 11:48:11 [INFO] test episode 95: reward = -118.00, steps = 119 11:48:11 [INFO] test episode 96: reward = -137.00, steps = 138 11:48:11 [INFO] test episode 97: reward = -189.00, steps = 190 11:48:11 [INFO] test episode 98: reward = -143.00, steps = 144 11:48:11 [INFO] test episode 99: 
reward = -186.00, steps = 187 11:48:11 [INFO] average episode reward = -137.53 ± 34.69
# Training and testing have finished (see the logged output above); close the
# environment so gym can run its cleanup.
env.close()