TensorFlow version
%matplotlib inline
import sys
import logging
import itertools
import numpy as np
np.random.seed(0)
import pandas as pd
import scipy.signal as signal
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow import losses
from tensorflow.keras import layers
# Send timestamped INFO-level log records to stdout so that they show up in
# the notebook cell output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
)

# Create the environment and dump its attributes, then those of its spec,
# for reference.
env = gym.make('Acrobot-v1')
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
14:00:56 [INFO] env: <AcrobotEnv<Acrobot-v1>> 14:00:56 [INFO] action_space: Discrete(3) 14:00:56 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32) 14:00:56 [INFO] reward_range: (-inf, inf) 14:00:56 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15} 14:00:56 [INFO] _max_episode_steps: 500 14:00:56 [INFO] _elapsed_steps: None 14:00:56 [INFO] id: Acrobot-v1 14:00:56 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv 14:00:56 [INFO] reward_threshold: -100.0 14:00:56 [INFO] nondeterministic: False 14:00:56 [INFO] max_episode_steps: 500 14:00:56 [INFO] _kwargs: {} 14:00:56 [INFO] _env_name: Acrobot
class PPOReplayer:
    """In-memory store of per-step training records.

    Records live in a pandas DataFrame with one row per step and the columns
    listed in ``self.fields``.
    """

    def __init__(self):
        self.fields = ['state', 'action', 'prob', 'advantage', 'return']
        self.memory = pd.DataFrame(columns=self.fields)

    def store(self, df):
        """Append the ``self.fields`` columns of ``df`` to the memory.

        Extra columns in ``df`` are ignored; a missing field raises
        ``KeyError``.  The stored rows are re-indexed 0..n-1.
        """
        new_rows = df[self.fields]
        if new_rows.empty:
            return  # nothing to add
        if self.memory.empty:
            # Adopt the first batch directly instead of concatenating it onto
            # the empty, object-typed placeholder: concatenating with an empty
            # frame is deprecated in recent pandas and lets the placeholder
            # influence the resulting column dtypes.
            self.memory = new_rows.reset_index(drop=True)
        else:
            self.memory = pd.concat([self.memory, new_rows],
                                    ignore_index=True)

    def sample(self, size):
        """Draw ``size`` stored rows uniformly at random, with replacement.

        Returns a generator that yields one stacked numpy array per field, in
        the order of ``self.fields``.  Raises ``ValueError`` (from
        ``np.random.choice``) if the memory is empty.
        """
        indices = np.random.choice(self.memory.shape[0], size=size)
        return (np.stack(self.memory.loc[indices, field])
                for field in self.fields)
def conjugate_gradient(f, b, iter_count=10, epsilon=1e-12, tol=1e-6):
    """Approximately solve ``f(x) = b`` for ``x`` by conjugate gradient.

    Args:
        f: callable that maps a tensor shaped like ``b`` to the product of
            the (implicit) linear operator with that tensor.
        b: right-hand-side tensor.
        iter_count: maximum number of iterations.
        epsilon: small constant added to the step-size denominator.
        tol: the iteration stops once the squared residual norm is below
            this value.

    Returns:
        A tuple ``(x, f(x))`` of the approximate solution and the operator
        applied to it.
    """
    solution = b * 0.
    residual = tf.identity(b)
    direction = tf.identity(b)
    residual_sq = tf.reduce_sum(residual * residual)
    for _ in range(iter_count):
        f_direction = f(direction)
        step = residual_sq / (tf.reduce_sum(direction * f_direction) + epsilon)
        solution = solution + step * direction
        residual = residual - step * f_direction
        new_residual_sq = tf.reduce_sum(residual * residual)
        # Next search direction: new residual plus a multiple of the old one.
        direction = residual + (new_residual_sq / residual_sq) * direction
        residual_sq = new_residual_sq
        if residual_sq < tol:
            break
    return solution, f(solution)
class NPGAgent:
    """Natural policy gradient agent with a softmax actor and a value critic.

    Usage per episode: ``reset(mode)``, then ``step(...)`` for every
    observation, then ``close()``.  In ``'train'`` mode the episode is
    recorded, post-processed into the replayer on ``close()``, and the
    networks are updated once at least 1000 steps have been collected.
    """

    def __init__(self, env):
        """Build the actor and critic networks for ``env``'s action space."""
        self.action_n = env.action_space.n
        self.gamma = 0.99  # discount factor

        self.replayer = PPOReplayer()
        self.trajectory = []
        # Defined up front so step()/close() do not raise AttributeError when
        # called before reset().
        self.mode = None

        self.max_kl = 0.0005  # scales the length of the natural-gradient step
        self.actor_net = self.build_net(hidden_sizes=[100,],
                output_size=self.action_n, output_activation=nn.softmax)
        self.critic_net = self.build_net(hidden_sizes=[100,],
                learning_rate=0.002)

    def build_net(self, input_size=None, hidden_sizes=None, output_size=1,
            activation=nn.relu, output_activation=None,
            loss=losses.mse, learning_rate=0.001):
        """Build and compile a fully connected Keras model.

        Args:
            input_size: currently unused; kept for interface compatibility.
            hidden_sizes: iterable of hidden-layer widths; ``None`` means no
                hidden layers.
            output_size: width of the output layer.
            activation: activation of the hidden layers.
            output_activation: activation of the output layer.
            loss: loss the model is compiled with.
            learning_rate: learning rate of the Adam optimizer.

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        model = keras.Sequential()
        # `or ()` keeps the documented default (None) from crashing the loop.
        for hidden_size in hidden_sizes or ():
            model.add(layers.Dense(units=hidden_size,
                    activation=activation))
        model.add(layers.Dense(units=output_size,
                activation=output_activation))
        optimizer = optimizers.Adam(learning_rate)
        model.compile(optimizer=optimizer, loss=loss)
        return model

    def reset(self, mode=None):
        """Start a new episode; in ``'train'`` mode clear the trajectory."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        """Sample an action from the actor's output for ``observation``.

        In ``'train'`` mode the tuple (observation, reward, terminated,
        action) is appended to the current trajectory.
        """
        probs = self.actor_net.predict(observation[np.newaxis], verbose=0)[0]
        action = np.random.choice(self.action_n, p=probs)
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
        return action

    def close(self):
        """Finish the episode; in ``'train'`` mode store it and maybe learn."""
        if self.mode == 'train':
            self.save_trajectory_to_replayer()
            if len(self.replayer.memory) >= 1000:
                for batch in range(5):  # learn multiple times
                    self.learn()
                # reset replayer after the agent changes itself
                self.replayer = PPOReplayer()

    def save_trajectory_to_replayer(self):
        """Turn the recorded trajectory into training rows and store them."""
        if not self.trajectory:
            return  # nothing was recorded; np.stack would fail on no states
        df = pd.DataFrame(
                np.array(self.trajectory, dtype=object).reshape(-1, 4),
                columns=['state', 'reward', 'terminated', 'action'],
                dtype=object)
        states = np.stack(df['state'])
        df['v'] = self.critic_net.predict(states, verbose=0)
        pis = self.actor_net.predict(states, verbose=0)
        # probability the current actor assigns to the action that was taken
        df['prob'] = [pi[action] for pi, action in zip(pis, df['action'])]
        # value of the following row; 0 after the last recorded row
        df['next_v'] = df['v'].shift(-1).fillna(0.)
        # NOTE(review): 'reward' is the value passed to step() together with
        # the row's state -- confirm this alignment is what the target needs.
        df['u'] = df['reward'] + self.gamma * df['next_v']
        df['delta'] = df['u'] - df['v']
        # Filtering the reversed sequence gives the discounted sum of each
        # entry and everything after it.
        df['advantage'] = signal.lfilter([1.,], [1., -self.gamma],
                df['delta'][::-1])[::-1]
        df['return'] = signal.lfilter([1.,], [1., -self.gamma],
                df['reward'][::-1])[::-1]
        self.replayer.store(df)

    def learn(self):
        """Do one natural-gradient actor update and one critic fit.

        A batch of 64 rows is sampled from the replayer.  The actor update is
        skipped (with a warning) when the step size cannot be computed, so
        that NaN/inf values are never written into the actor's weights.
        """
        states, actions, old_pis, advantages, returns = \
                self.replayer.sample(size=64)
        state_tensor = tf.convert_to_tensor(states, dtype=tf.float32)
        action_tensor = tf.convert_to_tensor(actions, dtype=tf.int32)
        old_pi_tensor = tf.convert_to_tensor(old_pis, dtype=tf.float32)
        advantage_tensor = tf.convert_to_tensor(advantages, dtype=tf.float32)

        # update actor
        # ... calculate first order gradient of the surrogate objective
        with tf.GradientTape() as tape:
            all_pi_tensor = self.actor_net(state_tensor)
            pi_tensor = tf.gather(all_pi_tensor, action_tensor, batch_dims=1)
            surrogate_tensor = (pi_tensor / old_pi_tensor) * advantage_tensor
        actor_grads = tape.gradient(surrogate_tensor,
                self.actor_net.variables)
        loss_grad = tf.concat(
                [tf.reshape(grad, (-1,)) for grad in actor_grads], axis=0)

        # ... calculate conjugate gradient: Fx = g
        def f(x):  # calculate Fx
            with tf.GradientTape() as tape2:  # tape for 2nd-order gradient
                with tf.GradientTape() as tape1:  # tape for 1st-order gradient
                    prob_tensor = self.actor_net(state_tensor)
                    prob_old_tensor = tf.stop_gradient(prob_tensor)
                    kld_tensor = tf.reduce_sum(prob_old_tensor * (
                            tf.math.log(prob_old_tensor) -
                            tf.math.log(prob_tensor)), axis=1)
                    kld_loss_tensor = tf.reduce_mean(kld_tensor)
                grads = tape1.gradient(kld_loss_tensor,
                        self.actor_net.variables)
                flatten_grad_tensor = tf.concat(
                        [tf.reshape(grad, (-1,)) for grad in grads], axis=-1)
                grad_matmul_x = tf.tensordot(flatten_grad_tensor, x,
                        axes=[[-1], [-1]])
            grad_grads = tape2.gradient(grad_matmul_x,
                    self.actor_net.variables)
            flatten_grad_grad = tf.stop_gradient(tf.concat(
                    [tf.reshape(grad_grad, (-1,)) for grad_grad in grad_grads],
                    axis=-1))
            fx = flatten_grad_grad + x * 1e-2  # damping term
            return fx

        x, fx = conjugate_gradient(f, loss_grad)

        # ... calculate natural gradient
        quadratic_tensor = tf.reduce_sum(fx * x)
        quadratic = float(quadratic_tensor.numpy())
        if np.isfinite(quadratic) and quadratic > 0.:
            natural_gradient_tensor = tf.sqrt(
                    2 * self.max_kl / quadratic_tensor) * x

            # ....... refactor the flatten gradient into un-flatten version
            flatten_natural_gradient = natural_gradient_tensor.numpy()
            weights = []
            begin = 0
            for weight in self.actor_net.get_weights():
                end = begin + weight.size
                weight += flatten_natural_gradient[begin:end].reshape(
                        weight.shape)
                weights.append(weight)
                begin = end
            self.actor_net.set_weights(weights)
        else:
            # sqrt(2 * max_kl / quadratic) would be NaN or inf here; applying
            # it would corrupt every actor weight.
            logging.warning('skip actor update: x^T F x = %s', quadratic)

        # update critic
        self.critic_net.fit(states, returns, verbose=0)
# Instantiate the agent for the environment created above.
agent = NPGAgent(env)
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env``.

    The agent is also shown the final observation (together with the last
    reward and the termination flag) before the loop exits; the action it
    returns for that observation is not executed.

    Args:
        env: environment whose ``reset(seed=...)`` returns
            ``(observation, info)`` and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        agent: object providing ``reset(mode=...)``,
            ``step(observation, reward, terminated)`` and ``close()``.
        seed: forwarded to ``env.reset``.
        mode: forwarded to ``agent.reset``.
        render: if true, ``env.render()`` is called after every agent step.

    Returns:
        ``(episode_reward, elapsed_steps)``: the summed reward and the number
        of environment steps taken.
    """
    observation, _ = env.reset(seed=seed)
    agent.reset(mode=mode)

    reward = 0.
    terminated = truncated = False
    total_reward = 0.
    step_count = 0
    finished = False
    while not finished:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        finished = terminated or truncated
        if not finished:
            observation, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            step_count += 1

    agent.close()
    return total_reward, step_count
# ---- training ----
# Play training episodes (seeded with the episode index) until the mean
# reward over the latest episodes (at most 10) exceeds -120.
logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    episode_reward, elapsed_steps = play_episode(
        env, agent, seed=episode, mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > -120:
        break
    episode += 1
plt.plot(episode_rewards)

# ---- testing ----
# Play 100 episodes without a training mode and report mean and standard
# deviation of the episode rewards.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
             np.mean(episode_rewards), np.std(episode_rewards))
14:00:57 [INFO] ==== train ==== 14:01:37 [INFO] NumExpr defaulting to 8 threads. 14:01:37 [INFO] train episode 0: reward = -500.00, steps = 500 14:02:18 [INFO] train episode 1: reward = -500.00, steps = 500 14:02:57 [INFO] train episode 2: reward = -500.00, steps = 500 14:03:37 [INFO] train episode 3: reward = -500.00, steps = 500 14:04:10 [INFO] train episode 4: reward = -422.00, steps = 423 14:04:40 [INFO] train episode 5: reward = -388.00, steps = 389 14:05:14 [INFO] train episode 6: reward = -411.00, steps = 412 14:05:47 [INFO] train episode 7: reward = -427.00, steps = 428 14:06:28 [INFO] train episode 8: reward = -500.00, steps = 500 14:07:12 [INFO] train episode 9: reward = -500.00, steps = 500 14:07:34 [INFO] train episode 10: reward = -279.00, steps = 280 14:08:13 [INFO] train episode 11: reward = -500.00, steps = 500 14:08:33 [INFO] train episode 12: reward = -237.00, steps = 238 14:08:55 [INFO] train episode 13: reward = -273.00, steps = 274 14:09:17 [INFO] train episode 14: reward = -282.00, steps = 283 14:09:49 [INFO] train episode 15: reward = -426.00, steps = 427 14:10:22 [INFO] train episode 16: reward = -395.00, steps = 396 14:10:51 [INFO] train episode 17: reward = -375.00, steps = 376 14:11:12 [INFO] train episode 18: reward = -252.00, steps = 253 14:11:52 [INFO] train episode 19: reward = -500.00, steps = 500 14:12:15 [INFO] train episode 20: reward = -289.00, steps = 290 14:12:48 [INFO] train episode 21: reward = -401.00, steps = 402 14:13:15 [INFO] train episode 22: reward = -310.00, steps = 311 14:13:51 [INFO] train episode 23: reward = -473.00, steps = 474 14:14:17 [INFO] train episode 24: reward = -341.00, steps = 342 14:14:57 [INFO] train episode 25: reward = -500.00, steps = 500 14:15:24 [INFO] train episode 26: reward = -340.00, steps = 341 14:16:03 [INFO] train episode 27: reward = -500.00, steps = 500 14:16:45 [INFO] train episode 28: reward = -487.00, steps = 488 14:17:17 [INFO] train episode 29: reward = -405.00, steps = 406 14:17:55 
[INFO] train episode 30: reward = -500.00, steps = 500 14:18:24 [INFO] train episode 31: reward = -347.00, steps = 348 14:18:59 [INFO] train episode 32: reward = -413.00, steps = 414 14:19:32 [INFO] train episode 33: reward = -426.00, steps = 427 14:20:01 [INFO] train episode 34: reward = -367.00, steps = 368 14:20:41 [INFO] train episode 35: reward = -500.00, steps = 500 14:21:05 [INFO] train episode 36: reward = -295.00, steps = 296 14:21:24 [INFO] train episode 37: reward = -228.00, steps = 229 14:22:04 [INFO] train episode 38: reward = -500.00, steps = 500 14:22:47 [INFO] train episode 39: reward = -500.00, steps = 500 14:23:30 [INFO] train episode 40: reward = -500.00, steps = 500 14:24:10 [INFO] train episode 41: reward = -500.00, steps = 500 14:24:31 [INFO] train episode 42: reward = -266.00, steps = 267 14:25:06 [INFO] train episode 43: reward = -449.00, steps = 450 14:25:46 [INFO] train episode 44: reward = -500.00, steps = 500 14:26:19 [INFO] train episode 45: reward = -400.00, steps = 401 14:26:47 [INFO] train episode 46: reward = -339.00, steps = 340 14:27:11 [INFO] train episode 47: reward = -275.00, steps = 276 14:27:41 [INFO] train episode 48: reward = -383.00, steps = 384 14:27:57 [INFO] train episode 49: reward = -204.00, steps = 205 14:28:20 [INFO] train episode 50: reward = -273.00, steps = 274 14:28:38 [INFO] train episode 51: reward = -216.00, steps = 217 14:29:08 [INFO] train episode 52: reward = -371.00, steps = 372 14:29:46 [INFO] train episode 53: reward = -500.00, steps = 500 14:30:16 [INFO] train episode 54: reward = -355.00, steps = 356 14:30:44 [INFO] train episode 55: reward = -353.00, steps = 354 14:31:09 [INFO] train episode 56: reward = -308.00, steps = 309 14:31:45 [INFO] train episode 57: reward = -456.00, steps = 457 14:32:24 [INFO] train episode 58: reward = -500.00, steps = 500 14:32:41 [INFO] train episode 59: reward = -226.00, steps = 227 14:33:17 [INFO] train episode 60: reward = -449.00, steps = 450 14:33:42 [INFO] train 
episode 61: reward = -323.00, steps = 324 14:34:07 [INFO] train episode 62: reward = -318.00, steps = 319 14:34:34 [INFO] train episode 63: reward = -346.00, steps = 347 14:35:12 [INFO] train episode 64: reward = -467.00, steps = 468 14:35:24 [INFO] train episode 65: reward = -154.00, steps = 155 14:35:46 [INFO] train episode 66: reward = -281.00, steps = 282 14:36:27 [INFO] train episode 67: reward = -500.00, steps = 500 14:37:10 [INFO] train episode 68: reward = -500.00, steps = 500 14:37:29 [INFO] train episode 69: reward = -239.00, steps = 240 14:38:10 [INFO] train episode 70: reward = -500.00, steps = 500 14:38:47 [INFO] train episode 71: reward = -465.00, steps = 466 14:39:09 [INFO] train episode 72: reward = -277.00, steps = 278 14:39:30 [INFO] train episode 73: reward = -276.00, steps = 277 14:39:47 [INFO] train episode 74: reward = -211.00, steps = 212 14:40:10 [INFO] train episode 75: reward = -260.00, steps = 261 14:40:32 [INFO] train episode 76: reward = -263.00, steps = 264 14:41:14 [INFO] train episode 77: reward = -489.00, steps = 490 14:41:39 [INFO] train episode 78: reward = -293.00, steps = 294 14:41:57 [INFO] train episode 79: reward = -230.00, steps = 231 14:42:25 [INFO] train episode 80: reward = -368.00, steps = 369 14:42:55 [INFO] train episode 81: reward = -384.00, steps = 385 14:43:14 [INFO] train episode 82: reward = -229.00, steps = 230 14:43:52 [INFO] train episode 83: reward = -500.00, steps = 500 14:44:08 [INFO] train episode 84: reward = -198.00, steps = 199 14:44:33 [INFO] train episode 85: reward = -309.00, steps = 310 14:45:11 [INFO] train episode 86: reward = -485.00, steps = 486 14:45:41 [INFO] train episode 87: reward = -389.00, steps = 390 14:46:02 [INFO] train episode 88: reward = -249.00, steps = 250 14:46:21 [INFO] train episode 89: reward = -240.00, steps = 241 14:46:43 [INFO] train episode 90: reward = -285.00, steps = 286 14:47:04 [INFO] train episode 91: reward = -270.00, steps = 271 14:47:38 [INFO] train episode 92: 
reward = -418.00, steps = 419 14:48:16 [INFO] train episode 93: reward = -500.00, steps = 500 14:48:42 [INFO] train episode 94: reward = -337.00, steps = 338 14:49:07 [INFO] train episode 95: reward = -300.00, steps = 301 14:49:31 [INFO] train episode 96: reward = -324.00, steps = 325 14:49:51 [INFO] train episode 97: reward = -263.00, steps = 264 14:50:15 [INFO] train episode 98: reward = -330.00, steps = 331 14:50:37 [INFO] train episode 99: reward = -277.00, steps = 278 14:50:49 [INFO] train episode 100: reward = -159.00, steps = 160 14:51:08 [INFO] train episode 101: reward = -247.00, steps = 248 14:51:23 [INFO] train episode 102: reward = -212.00, steps = 213 14:51:34 [INFO] train episode 103: reward = -147.00, steps = 148 14:51:50 [INFO] train episode 104: reward = -206.00, steps = 207 14:52:13 [INFO] train episode 105: reward = -298.00, steps = 299 14:52:31 [INFO] train episode 106: reward = -245.00, steps = 246 14:52:43 [INFO] train episode 107: reward = -149.00, steps = 150 14:53:10 [INFO] train episode 108: reward = -372.00, steps = 373 14:53:23 [INFO] train episode 109: reward = -175.00, steps = 176 14:53:42 [INFO] train episode 110: reward = -246.00, steps = 247 14:54:08 [INFO] train episode 111: reward = -354.00, steps = 355 14:54:28 [INFO] train episode 112: reward = -274.00, steps = 275 14:55:04 [INFO] train episode 113: reward = -461.00, steps = 462 14:55:16 [INFO] train episode 114: reward = -160.00, steps = 161 14:55:34 [INFO] train episode 115: reward = -251.00, steps = 252 14:55:55 [INFO] train episode 116: reward = -284.00, steps = 285 14:56:08 [INFO] train episode 117: reward = -173.00, steps = 174 14:56:29 [INFO] train episode 118: reward = -274.00, steps = 275 14:56:49 [INFO] train episode 119: reward = -270.00, steps = 271 14:57:04 [INFO] train episode 120: reward = -198.00, steps = 199 14:57:24 [INFO] train episode 121: reward = -261.00, steps = 262 14:57:39 [INFO] train episode 122: reward = -198.00, steps = 199 14:57:50 [INFO] train 
episode 123: reward = -139.00, steps = 140 14:58:07 [INFO] train episode 124: reward = -225.00, steps = 226 14:58:20 [INFO] train episode 125: reward = -180.00, steps = 181 14:58:31 [INFO] train episode 126: reward = -143.00, steps = 144 14:59:01 [INFO] train episode 127: reward = -407.00, steps = 408 14:59:17 [INFO] train episode 128: reward = -197.00, steps = 198 14:59:29 [INFO] train episode 129: reward = -169.00, steps = 170 14:59:50 [INFO] train episode 130: reward = -281.00, steps = 282 15:00:03 [INFO] train episode 131: reward = -180.00, steps = 181 15:00:23 [INFO] train episode 132: reward = -273.00, steps = 274 15:00:37 [INFO] train episode 133: reward = -163.00, steps = 164 15:00:52 [INFO] train episode 134: reward = -216.00, steps = 217 15:01:07 [INFO] train episode 135: reward = -201.00, steps = 202 15:01:19 [INFO] train episode 136: reward = -150.00, steps = 151 15:01:30 [INFO] train episode 137: reward = -158.00, steps = 159 15:01:45 [INFO] train episode 138: reward = -192.00, steps = 193 15:01:59 [INFO] train episode 139: reward = -175.00, steps = 176 15:02:15 [INFO] train episode 140: reward = -221.00, steps = 222 15:02:29 [INFO] train episode 141: reward = -182.00, steps = 183 15:02:40 [INFO] train episode 142: reward = -150.00, steps = 151 15:02:53 [INFO] train episode 143: reward = -176.00, steps = 177 15:03:11 [INFO] train episode 144: reward = -239.00, steps = 240 15:03:28 [INFO] train episode 145: reward = -222.00, steps = 223 15:03:46 [INFO] train episode 146: reward = -234.00, steps = 235 15:03:59 [INFO] train episode 147: reward = -175.00, steps = 176 15:04:14 [INFO] train episode 148: reward = -205.00, steps = 206 15:04:41 [INFO] train episode 149: reward = -367.00, steps = 368 15:05:08 [INFO] train episode 150: reward = -341.00, steps = 342 15:05:21 [INFO] train episode 151: reward = -181.00, steps = 182 15:05:35 [INFO] train episode 152: reward = -180.00, steps = 181 15:05:46 [INFO] train episode 153: reward = -153.00, steps = 154 
15:06:00 [INFO] train episode 154: reward = -190.00, steps = 191 15:06:13 [INFO] train episode 155: reward = -177.00, steps = 178 15:06:24 [INFO] train episode 156: reward = -132.00, steps = 133 15:06:40 [INFO] train episode 157: reward = -214.00, steps = 215 15:06:50 [INFO] train episode 158: reward = -130.00, steps = 131 15:07:06 [INFO] train episode 159: reward = -219.00, steps = 220 15:07:18 [INFO] train episode 160: reward = -151.00, steps = 152 15:07:28 [INFO] train episode 161: reward = -133.00, steps = 134 15:07:44 [INFO] train episode 162: reward = -202.00, steps = 203 15:07:58 [INFO] train episode 163: reward = -190.00, steps = 191 15:08:15 [INFO] train episode 164: reward = -218.00, steps = 219 15:08:27 [INFO] train episode 165: reward = -161.00, steps = 162 15:08:37 [INFO] train episode 166: reward = -136.00, steps = 137 15:08:50 [INFO] train episode 167: reward = -181.00, steps = 182 15:09:13 [INFO] train episode 168: reward = -288.00, steps = 289 15:09:25 [INFO] train episode 169: reward = -162.00, steps = 163 15:09:38 [INFO] train episode 170: reward = -178.00, steps = 179 15:09:49 [INFO] train episode 171: reward = -146.00, steps = 147 15:10:08 [INFO] train episode 172: reward = -243.00, steps = 244 15:10:17 [INFO] train episode 173: reward = -132.00, steps = 133 15:10:32 [INFO] train episode 174: reward = -184.00, steps = 185 15:10:45 [INFO] train episode 175: reward = -168.00, steps = 169 15:10:55 [INFO] train episode 176: reward = -134.00, steps = 135 15:11:06 [INFO] train episode 177: reward = -160.00, steps = 161 15:11:23 [INFO] train episode 178: reward = -216.00, steps = 217 15:11:35 [INFO] train episode 179: reward = -169.00, steps = 170 15:11:52 [INFO] train episode 180: reward = -216.00, steps = 217 15:12:01 [INFO] train episode 181: reward = -114.00, steps = 115 15:12:20 [INFO] train episode 182: reward = -253.00, steps = 254 15:12:31 [INFO] train episode 183: reward = -157.00, steps = 158 15:12:45 [INFO] train episode 184: reward = 
-189.00, steps = 190 15:12:57 [INFO] train episode 185: reward = -154.00, steps = 155 15:13:14 [INFO] train episode 186: reward = -210.00, steps = 211 15:13:27 [INFO] train episode 187: reward = -186.00, steps = 187 15:13:39 [INFO] train episode 188: reward = -150.00, steps = 151 15:13:53 [INFO] train episode 189: reward = -199.00, steps = 200 15:14:12 [INFO] train episode 190: reward = -250.00, steps = 251 15:14:26 [INFO] train episode 191: reward = -195.00, steps = 196 15:14:41 [INFO] train episode 192: reward = -173.00, steps = 174 15:14:54 [INFO] train episode 193: reward = -181.00, steps = 182 15:15:05 [INFO] train episode 194: reward = -147.00, steps = 148 15:15:19 [INFO] train episode 195: reward = -181.00, steps = 182 15:15:28 [INFO] train episode 196: reward = -124.00, steps = 125 15:16:07 [INFO] train episode 197: reward = -488.00, steps = 489 15:16:23 [INFO] train episode 198: reward = -220.00, steps = 221 15:16:32 [INFO] train episode 199: reward = -114.00, steps = 115 15:16:49 [INFO] train episode 200: reward = -245.00, steps = 246 15:17:05 [INFO] train episode 201: reward = -203.00, steps = 204 15:17:16 [INFO] train episode 202: reward = -153.00, steps = 154 15:17:31 [INFO] train episode 203: reward = -183.00, steps = 184 15:17:44 [INFO] train episode 204: reward = -173.00, steps = 174 15:18:03 [INFO] train episode 205: reward = -257.00, steps = 258 15:18:18 [INFO] train episode 206: reward = -203.00, steps = 204 15:18:32 [INFO] train episode 207: reward = -191.00, steps = 192 15:18:54 [INFO] train episode 208: reward = -282.00, steps = 283 15:19:12 [INFO] train episode 209: reward = -245.00, steps = 246 15:19:25 [INFO] train episode 210: reward = -172.00, steps = 173 15:19:43 [INFO] train episode 211: reward = -246.00, steps = 247 15:19:54 [INFO] train episode 212: reward = -142.00, steps = 143 15:20:06 [INFO] train episode 213: reward = -164.00, steps = 165 15:20:19 [INFO] train episode 214: reward = -158.00, steps = 159 15:20:30 [INFO] train 
episode 215: reward = -152.00, steps = 153 15:20:47 [INFO] train episode 216: reward = -222.00, steps = 223 15:21:00 [INFO] train episode 217: reward = -172.00, steps = 173 15:21:13 [INFO] train episode 218: reward = -177.00, steps = 178 15:21:24 [INFO] train episode 219: reward = -143.00, steps = 144 15:21:42 [INFO] train episode 220: reward = -226.00, steps = 227 15:21:59 [INFO] train episode 221: reward = -226.00, steps = 227 15:22:09 [INFO] train episode 222: reward = -128.00, steps = 129 15:22:20 [INFO] train episode 223: reward = -147.00, steps = 148 15:22:32 [INFO] train episode 224: reward = -164.00, steps = 165 15:22:44 [INFO] train episode 225: reward = -166.00, steps = 167 15:23:05 [INFO] train episode 226: reward = -261.00, steps = 262 15:23:17 [INFO] train episode 227: reward = -169.00, steps = 170 15:23:25 [INFO] train episode 228: reward = -103.00, steps = 104 15:23:37 [INFO] train episode 229: reward = -155.00, steps = 156 15:23:48 [INFO] train episode 230: reward = -137.00, steps = 138 15:24:07 [INFO] train episode 231: reward = -238.00, steps = 239 15:24:20 [INFO] train episode 232: reward = -159.00, steps = 160 15:24:39 [INFO] train episode 233: reward = -216.00, steps = 217 15:24:51 [INFO] train episode 234: reward = -144.00, steps = 145 15:25:07 [INFO] train episode 235: reward = -182.00, steps = 183 15:25:32 [INFO] train episode 236: reward = -294.00, steps = 295 15:25:46 [INFO] train episode 237: reward = -156.00, steps = 157 15:26:04 [INFO] train episode 238: reward = -243.00, steps = 244 15:26:12 [INFO] train episode 239: reward = -112.00, steps = 113 15:26:25 [INFO] train episode 240: reward = -219.00, steps = 220 15:26:34 [INFO] train episode 241: reward = -131.00, steps = 132 15:26:45 [INFO] train episode 242: reward = -173.00, steps = 174 15:26:54 [INFO] train episode 243: reward = -148.00, steps = 149 15:27:03 [INFO] train episode 244: reward = -130.00, steps = 131 15:27:21 [INFO] train episode 245: reward = -302.00, steps = 303 
15:27:29 [INFO] train episode 246: reward = -148.00, steps = 149 15:27:41 [INFO] train episode 247: reward = -212.00, steps = 213 15:27:49 [INFO] train episode 248: reward = -144.00, steps = 145 15:27:59 [INFO] train episode 249: reward = -168.00, steps = 169 15:28:09 [INFO] train episode 250: reward = -169.00, steps = 170 15:28:22 [INFO] train episode 251: reward = -216.00, steps = 217 15:28:32 [INFO] train episode 252: reward = -175.00, steps = 176 15:28:45 [INFO] train episode 253: reward = -231.00, steps = 232 15:28:54 [INFO] train episode 254: reward = -158.00, steps = 159 15:29:08 [INFO] train episode 255: reward = -234.00, steps = 235 15:29:16 [INFO] train episode 256: reward = -152.00, steps = 153 15:29:27 [INFO] train episode 257: reward = -182.00, steps = 183 15:29:35 [INFO] train episode 258: reward = -143.00, steps = 144 15:29:43 [INFO] train episode 259: reward = -135.00, steps = 136 15:29:54 [INFO] train episode 260: reward = -198.00, steps = 199 15:30:11 [INFO] train episode 261: reward = -302.00, steps = 303 15:30:25 [INFO] train episode 262: reward = -217.00, steps = 218 15:30:40 [INFO] train episode 263: reward = -278.00, steps = 279 15:30:46 [INFO] train episode 264: reward = -99.00, steps = 100 15:30:54 [INFO] train episode 265: reward = -142.00, steps = 143 15:31:02 [INFO] train episode 266: reward = -121.00, steps = 122 15:31:11 [INFO] train episode 267: reward = -165.00, steps = 166 15:31:25 [INFO] train episode 268: reward = -229.00, steps = 230 15:31:32 [INFO] train episode 269: reward = -128.00, steps = 129 15:31:38 [INFO] train episode 270: reward = -100.00, steps = 101 15:31:47 [INFO] train episode 271: reward = -156.00, steps = 157 15:31:56 [INFO] train episode 272: reward = -162.00, steps = 163 15:32:07 [INFO] train episode 273: reward = -194.00, steps = 195 15:32:18 [INFO] train episode 274: reward = -190.00, steps = 191 15:32:26 [INFO] train episode 275: reward = -120.00, steps = 121 15:32:46 [INFO] train episode 276: reward = 
-366.00, steps = 367 15:32:59 [INFO] train episode 277: reward = -219.00, steps = 220 15:33:17 [INFO] train episode 278: reward = -314.00, steps = 315 15:33:26 [INFO] train episode 279: reward = -159.00, steps = 160 15:33:34 [INFO] train episode 280: reward = -138.00, steps = 139 15:33:42 [INFO] train episode 281: reward = -138.00, steps = 139 15:33:50 [INFO] train episode 282: reward = -135.00, steps = 136 15:33:57 [INFO] train episode 283: reward = -131.00, steps = 132 15:34:09 [INFO] train episode 284: reward = -192.00, steps = 193 15:34:21 [INFO] train episode 285: reward = -224.00, steps = 225 15:34:29 [INFO] train episode 286: reward = -126.00, steps = 127 15:34:39 [INFO] train episode 287: reward = -168.00, steps = 169 15:34:52 [INFO] train episode 288: reward = -237.00, steps = 238 15:35:01 [INFO] train episode 289: reward = -154.00, steps = 155 15:35:09 [INFO] train episode 290: reward = -139.00, steps = 140 15:35:20 [INFO] train episode 291: reward = -192.00, steps = 193 15:35:29 [INFO] train episode 292: reward = -148.00, steps = 149 15:35:38 [INFO] train episode 293: reward = -150.00, steps = 151 15:35:48 [INFO] train episode 294: reward = -176.00, steps = 177 15:35:55 [INFO] train episode 295: reward = -140.00, steps = 141 15:36:02 [INFO] train episode 296: reward = -119.00, steps = 120 15:36:11 [INFO] train episode 297: reward = -146.00, steps = 147 15:36:18 [INFO] train episode 298: reward = -128.00, steps = 129 15:36:34 [INFO] train episode 299: reward = -266.00, steps = 267 15:36:44 [INFO] train episode 300: reward = -170.00, steps = 171 15:36:51 [INFO] train episode 301: reward = -116.00, steps = 117 15:36:58 [INFO] train episode 302: reward = -113.00, steps = 114 15:37:09 [INFO] train episode 303: reward = -207.00, steps = 208 15:37:17 [INFO] train episode 304: reward = -129.00, steps = 130 15:37:24 [INFO] train episode 305: reward = -133.00, steps = 134 15:37:34 [INFO] train episode 306: reward = -150.00, steps = 151 15:37:46 [INFO] train 
episode 307: reward = -224.00, steps = 225 15:37:53 [INFO] train episode 308: reward = -119.00, steps = 120 15:38:00 [INFO] train episode 309: reward = -129.00, steps = 130 15:38:11 [INFO] train episode 310: reward = -180.00, steps = 181 15:38:18 [INFO] train episode 311: reward = -142.00, steps = 143 15:38:26 [INFO] train episode 312: reward = -125.00, steps = 126 15:38:35 [INFO] train episode 313: reward = -143.00, steps = 144 15:38:42 [INFO] train episode 314: reward = -118.00, steps = 119 15:38:51 [INFO] train episode 315: reward = -163.00, steps = 164 15:38:59 [INFO] train episode 316: reward = -144.00, steps = 145 15:39:08 [INFO] train episode 317: reward = -143.00, steps = 144 15:39:16 [INFO] train episode 318: reward = -153.00, steps = 154 15:39:27 [INFO] train episode 319: reward = -198.00, steps = 199 15:39:37 [INFO] train episode 320: reward = -154.00, steps = 155 15:39:45 [INFO] train episode 321: reward = -142.00, steps = 143 15:39:52 [INFO] train episode 322: reward = -119.00, steps = 120 15:40:01 [INFO] train episode 323: reward = -166.00, steps = 167 15:40:11 [INFO] train episode 324: reward = -179.00, steps = 180 15:40:19 [INFO] train episode 325: reward = -133.00, steps = 134 15:40:27 [INFO] train episode 326: reward = -143.00, steps = 144 15:40:35 [INFO] train episode 327: reward = -121.00, steps = 122 15:40:44 [INFO] train episode 328: reward = -157.00, steps = 158 15:40:51 [INFO] train episode 329: reward = -119.00, steps = 120 15:41:05 [INFO] train episode 330: reward = -253.00, steps = 254 15:41:14 [INFO] train episode 331: reward = -145.00, steps = 146 15:41:20 [INFO] train episode 332: reward = -120.00, steps = 121 15:41:29 [INFO] train episode 333: reward = -149.00, steps = 150 15:41:36 [INFO] train episode 334: reward = -108.00, steps = 109 15:41:46 [INFO] train episode 335: reward = -175.00, steps = 176 15:41:54 [INFO] train episode 336: reward = -140.00, steps = 141 15:42:04 [INFO] train episode 337: reward = -177.00, steps = 178 
15:42:12 [INFO] train episode 338: reward = -141.00, steps = 142 15:42:20 [INFO] train episode 339: reward = -140.00, steps = 141 15:42:31 [INFO] train episode 340: reward = -193.00, steps = 194 15:42:40 [INFO] train episode 341: reward = -141.00, steps = 142 15:42:49 [INFO] train episode 342: reward = -163.00, steps = 164 15:42:57 [INFO] train episode 343: reward = -142.00, steps = 143 15:43:05 [INFO] train episode 344: reward = -135.00, steps = 136 15:43:13 [INFO] train episode 345: reward = -136.00, steps = 137 15:43:21 [INFO] train episode 346: reward = -148.00, steps = 149 15:43:29 [INFO] train episode 347: reward = -136.00, steps = 137 15:43:37 [INFO] train episode 348: reward = -127.00, steps = 128 15:43:45 [INFO] train episode 349: reward = -138.00, steps = 139 15:43:52 [INFO] train episode 350: reward = -127.00, steps = 128 15:44:00 [INFO] train episode 351: reward = -153.00, steps = 154 15:44:07 [INFO] train episode 352: reward = -110.00, steps = 111 15:44:15 [INFO] train episode 353: reward = -143.00, steps = 144 15:44:24 [INFO] train episode 354: reward = -162.00, steps = 163 15:44:34 [INFO] train episode 355: reward = -161.00, steps = 162 15:44:40 [INFO] train episode 356: reward = -106.00, steps = 107 15:44:51 [INFO] train episode 357: reward = -179.00, steps = 180 15:45:02 [INFO] train episode 358: reward = -182.00, steps = 183 15:45:10 [INFO] train episode 359: reward = -146.00, steps = 147 15:45:23 [INFO] train episode 360: reward = -235.00, steps = 236 15:45:33 [INFO] train episode 361: reward = -153.00, steps = 154 15:45:41 [INFO] train episode 362: reward = -138.00, steps = 139 15:45:48 [INFO] train episode 363: reward = -138.00, steps = 139 15:45:55 [INFO] train episode 364: reward = -126.00, steps = 127 15:46:03 [INFO] train episode 365: reward = -140.00, steps = 141 15:46:11 [INFO] train episode 366: reward = -134.00, steps = 135 15:46:17 [INFO] train episode 367: reward = -97.00, steps = 98 15:46:25 [INFO] train episode 368: reward = 
-144.00, steps = 145 15:46:34 [INFO] train episode 369: reward = -143.00, steps = 144 15:46:41 [INFO] train episode 370: reward = -112.00, steps = 113 15:46:49 [INFO] train episode 371: reward = -136.00, steps = 137 15:46:56 [INFO] train episode 372: reward = -124.00, steps = 125 15:47:03 [INFO] train episode 373: reward = -129.00, steps = 130 15:47:09 [INFO] train episode 374: reward = -102.00, steps = 103 15:47:15 [INFO] train episode 375: reward = -111.00, steps = 112 15:47:22 [INFO] train episode 376: reward = -104.00, steps = 105 15:47:30 [INFO] train episode 377: reward = -159.00, steps = 160 15:47:38 [INFO] train episode 378: reward = -113.00, steps = 114 15:47:46 [INFO] train episode 379: reward = -141.00, steps = 142 15:47:52 [INFO] train episode 380: reward = -110.00, steps = 111 15:47:59 [INFO] train episode 381: reward = -122.00, steps = 123 15:48:06 [INFO] train episode 382: reward = -114.00, steps = 115 15:48:15 [INFO] train episode 383: reward = -167.00, steps = 168 15:48:23 [INFO] train episode 384: reward = -130.00, steps = 131 15:48:29 [INFO] train episode 385: reward = -116.00, steps = 117 15:48:40 [INFO] train episode 386: reward = -179.00, steps = 180 15:48:47 [INFO] train episode 387: reward = -106.00, steps = 107 15:48:54 [INFO] train episode 388: reward = -125.00, steps = 126 15:49:00 [INFO] train episode 389: reward = -111.00, steps = 112 15:49:07 [INFO] train episode 390: reward = -117.00, steps = 118 15:49:12 [INFO] train episode 391: reward = -89.00, steps = 90 15:49:21 [INFO] train episode 392: reward = -160.00, steps = 161 15:49:26 [INFO] train episode 393: reward = -96.00, steps = 97 15:49:35 [INFO] train episode 394: reward = -152.00, steps = 153 15:49:42 [INFO] train episode 395: reward = -110.00, steps = 111 15:49:49 [INFO] train episode 396: reward = -124.00, steps = 125 15:49:49 [INFO] ==== test ==== 15:49:57 [INFO] test episode 0: reward = -146.00, steps = 147 15:50:04 [INFO] test episode 1: reward = -105.00, steps = 106 
15:50:10 [INFO] test episode 2: reward = -122.00, steps = 123 15:50:16 [INFO] test episode 3: reward = -102.00, steps = 103 15:50:24 [INFO] test episode 4: reward = -148.00, steps = 149 15:50:34 [INFO] test episode 5: reward = -180.00, steps = 181 15:50:40 [INFO] test episode 6: reward = -89.00, steps = 90 15:50:45 [INFO] test episode 7: reward = -101.00, steps = 102 15:50:51 [INFO] test episode 8: reward = -104.00, steps = 105 15:50:58 [INFO] test episode 9: reward = -119.00, steps = 120 15:51:08 [INFO] test episode 10: reward = -178.00, steps = 179 15:51:16 [INFO] test episode 11: reward = -146.00, steps = 147 15:51:22 [INFO] test episode 12: reward = -103.00, steps = 104 15:51:29 [INFO] test episode 13: reward = -127.00, steps = 128 15:51:37 [INFO] test episode 14: reward = -143.00, steps = 144 15:51:43 [INFO] test episode 15: reward = -112.00, steps = 113 15:51:52 [INFO] test episode 16: reward = -158.00, steps = 159 15:51:59 [INFO] test episode 17: reward = -123.00, steps = 124 15:52:08 [INFO] test episode 18: reward = -158.00, steps = 159 15:52:14 [INFO] test episode 19: reward = -115.00, steps = 116 15:52:21 [INFO] test episode 20: reward = -127.00, steps = 128 15:52:29 [INFO] test episode 21: reward = -143.00, steps = 144 15:52:37 [INFO] test episode 22: reward = -133.00, steps = 134 15:52:43 [INFO] test episode 23: reward = -113.00, steps = 114 15:52:51 [INFO] test episode 24: reward = -132.00, steps = 133 15:52:59 [INFO] test episode 25: reward = -137.00, steps = 138 15:53:06 [INFO] test episode 26: reward = -133.00, steps = 134 15:53:14 [INFO] test episode 27: reward = -134.00, steps = 135 15:53:20 [INFO] test episode 28: reward = -120.00, steps = 121 15:53:27 [INFO] test episode 29: reward = -116.00, steps = 117 15:53:33 [INFO] test episode 30: reward = -105.00, steps = 106 15:53:39 [INFO] test episode 31: reward = -115.00, steps = 116 15:53:46 [INFO] test episode 32: reward = -128.00, steps = 129 15:53:54 [INFO] test episode 33: reward = -128.00, steps 
= 129 15:54:00 [INFO] test episode 34: reward = -125.00, steps = 126 15:54:09 [INFO] test episode 35: reward = -140.00, steps = 141 15:54:17 [INFO] test episode 36: reward = -157.00, steps = 158 15:54:26 [INFO] test episode 37: reward = -156.00, steps = 157 15:54:32 [INFO] test episode 38: reward = -103.00, steps = 104 15:54:38 [INFO] test episode 39: reward = -96.00, steps = 97 15:54:44 [INFO] test episode 40: reward = -123.00, steps = 124 15:54:52 [INFO] test episode 41: reward = -123.00, steps = 124 15:55:00 [INFO] test episode 42: reward = -144.00, steps = 145 15:55:08 [INFO] test episode 43: reward = -139.00, steps = 140 15:55:14 [INFO] test episode 44: reward = -120.00, steps = 121 15:55:29 [INFO] test episode 45: reward = -258.00, steps = 259 15:55:35 [INFO] test episode 46: reward = -121.00, steps = 122 15:55:42 [INFO] test episode 47: reward = -115.00, steps = 116 15:55:48 [INFO] test episode 48: reward = -105.00, steps = 106 15:55:56 [INFO] test episode 49: reward = -140.00, steps = 141 15:56:04 [INFO] test episode 50: reward = -144.00, steps = 145 15:56:11 [INFO] test episode 51: reward = -130.00, steps = 131 15:56:18 [INFO] test episode 52: reward = -131.00, steps = 132 15:56:25 [INFO] test episode 53: reward = -123.00, steps = 124 15:56:33 [INFO] test episode 54: reward = -137.00, steps = 138 15:56:42 [INFO] test episode 55: reward = -151.00, steps = 152 15:56:48 [INFO] test episode 56: reward = -113.00, steps = 114 15:57:01 [INFO] test episode 57: reward = -234.00, steps = 235 15:57:09 [INFO] test episode 58: reward = -139.00, steps = 140 15:57:22 [INFO] test episode 59: reward = -229.00, steps = 230 15:57:29 [INFO] test episode 60: reward = -115.00, steps = 116 15:57:35 [INFO] test episode 61: reward = -113.00, steps = 114 15:57:41 [INFO] test episode 62: reward = -103.00, steps = 104 15:57:55 [INFO] test episode 63: reward = -249.00, steps = 250 15:58:00 [INFO] test episode 64: reward = -100.00, steps = 101 15:58:07 [INFO] test episode 65: reward = 
-122.00, steps = 123 15:58:15 [INFO] test episode 66: reward = -133.00, steps = 134 15:58:24 [INFO] test episode 67: reward = -168.00, steps = 169 15:58:35 [INFO] test episode 68: reward = -204.00, steps = 205 15:58:43 [INFO] test episode 69: reward = -141.00, steps = 142 15:58:50 [INFO] test episode 70: reward = -112.00, steps = 113 15:58:57 [INFO] test episode 71: reward = -115.00, steps = 116 15:59:02 [INFO] test episode 72: reward = -104.00, steps = 105 15:59:09 [INFO] test episode 73: reward = -110.00, steps = 111 15:59:14 [INFO] test episode 74: reward = -90.00, steps = 91 15:59:20 [INFO] test episode 75: reward = -113.00, steps = 114 15:59:25 [INFO] test episode 76: reward = -95.00, steps = 96 15:59:32 [INFO] test episode 77: reward = -114.00, steps = 115 15:59:39 [INFO] test episode 78: reward = -128.00, steps = 129 15:59:46 [INFO] test episode 79: reward = -120.00, steps = 121 15:59:52 [INFO] test episode 80: reward = -121.00, steps = 122 15:59:58 [INFO] test episode 81: reward = -106.00, steps = 107 16:00:08 [INFO] test episode 82: reward = -171.00, steps = 172 16:00:18 [INFO] test episode 83: reward = -172.00, steps = 173 16:00:25 [INFO] test episode 84: reward = -132.00, steps = 133 16:00:32 [INFO] test episode 85: reward = -119.00, steps = 120 16:00:40 [INFO] test episode 86: reward = -135.00, steps = 136 16:00:48 [INFO] test episode 87: reward = -155.00, steps = 156 16:00:56 [INFO] test episode 88: reward = -131.00, steps = 132 16:01:03 [INFO] test episode 89: reward = -135.00, steps = 136 16:01:11 [INFO] test episode 90: reward = -130.00, steps = 131 16:01:19 [INFO] test episode 91: reward = -151.00, steps = 152 16:01:26 [INFO] test episode 92: reward = -133.00, steps = 134 16:01:33 [INFO] test episode 93: reward = -115.00, steps = 116 16:01:39 [INFO] test episode 94: reward = -98.00, steps = 99 16:01:47 [INFO] test episode 95: reward = -150.00, steps = 151 16:01:58 [INFO] test episode 96: reward = -203.00, steps = 204 16:02:08 [INFO] test episode 
97: reward = -166.00, steps = 167 16:02:15 [INFO] test episode 98: reward = -120.00, steps = 121 16:02:22 [INFO] test episode 99: reward = -126.00, steps = 127 16:02:22 [INFO] average episode reward = -133.57 ± 31.50
# Training and testing have finished; close the Gym environment that was
# created earlier with gym.make('Acrobot-v1').
env.close()