TRPO (Trust Region Policy Optimization) on Acrobot-v1 — TensorFlow version
%matplotlib inline
import sys
import logging
import itertools
import numpy as np
np.random.seed(0)
import pandas as pd
import scipy.signal as signal
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow import losses
from tensorflow.keras import layers
# Route log records to stdout with timestamps so notebook output stays ordered.
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

# Build the Acrobot task and dump the env / spec attributes for reference.
env = gym.make('Acrobot-v1')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
11:56:31 [INFO] env: <AcrobotEnv<Acrobot-v1>> 11:56:31 [INFO] action_space: Discrete(3) 11:56:31 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32) 11:56:31 [INFO] reward_range: (-inf, inf) 11:56:31 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15} 11:56:31 [INFO] _max_episode_steps: 500 11:56:31 [INFO] _elapsed_steps: None 11:56:31 [INFO] id: Acrobot-v1 11:56:31 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv 11:56:31 [INFO] reward_threshold: -100.0 11:56:31 [INFO] nondeterministic: False 11:56:31 [INFO] max_episode_steps: 500 11:56:31 [INFO] _kwargs: {} 11:56:31 [INFO] _env_name: Acrobot
class PPOReplayer:
    """Replay buffer holding per-step policy-gradient training rows.

    Rows carry the state, the taken action, the behavior policy's probability
    for that action, the advantage estimate, and the empirical return.
    """

    def __init__(self):
        self.fields = ['state', 'action', 'prob', 'advantage', 'return']
        self.memory = pd.DataFrame(columns=self.fields)

    def store(self, df):
        """Append the relevant columns of *df* to the buffer."""
        rows = df[self.fields]
        self.memory = pd.concat([self.memory, rows], ignore_index=True)

    def sample(self, size):
        """Draw *size* rows (with replacement); yield one stacked array per field."""
        picked = np.random.choice(len(self.memory), size=size)
        return (np.stack(self.memory.loc[picked, name]) for name in self.fields)
def conjugate_gradient(f, b, iter_count=10, epsilon=1e-12, tol=1e-6):
    """Approximately solve f(x) = b with the conjugate-gradient method.

    *f* is a linear operator (here: a Fisher-vector product) and *b* a flat
    tensor.  Runs at most *iter_count* iterations, stopping early once the
    squared residual drops below *tol*; *epsilon* guards against division by
    zero in the step-size computation.  Returns the pair (x, f(x)).
    """
    x = 0. * b
    residual = tf.identity(b)
    direction = tf.identity(b)
    rho = tf.reduce_sum(residual * residual)  # squared residual norm
    for _ in range(iter_count):
        fd = f(direction)
        step = rho / (tf.reduce_sum(direction * fd) + epsilon)
        x = x + step * direction
        residual = residual - step * fd
        next_rho = tf.reduce_sum(residual * residual)
        # Polak-style update of the search direction.
        direction = residual + (next_rho / rho) * direction
        rho = next_rho
        if rho < tol:
            break
    return x, f(x)
class TRPOAgent:
    """Trust Region Policy Optimization agent for discrete-action tasks.

    The actor is improved along a natural-gradient direction (computed with
    conjugate gradient on Fisher-vector products) followed by a backtracking
    line search that demands sufficient improvement of the surrogate
    objective.  The critic is a value network fit on empirical returns.
    Trajectories accumulate in a PPOReplayer; learning is triggered once at
    least 1000 transitions are buffered, after which the buffer is discarded
    (TRPO is on-policy).
    """

    def __init__(self, env):
        self.action_n = env.action_space.n  # number of discrete actions
        self.gamma = 0.99  # discount factor
        self.replayer = PPOReplayer()
        self.trajectory = []  # flat per-step record: obs, reward, terminated, action
        self.max_kl = 0.01  # KL trust-region radius
        # Policy network: state -> action-probability vector.
        self.actor_net = self.build_net(hidden_sizes=[100,],
                output_size=self.action_n, output_activation=nn.softmax)
        # Value network: state -> scalar state value.
        self.critic_net = self.build_net(hidden_sizes=[100,],
                learning_rate=0.002)

    def build_net(self, input_size=None, hidden_sizes=None, output_size=1,
                activation=nn.relu, output_activation=None,
                loss=losses.mse, learning_rate=0.001):
        """Build and compile a fully-connected Keras model.

        input_size is unused (kept for interface compatibility); Keras infers
        the input shape on the first call.  Returns the compiled model.
        """
        model = keras.Sequential()
        for hidden_size in hidden_sizes:
            model.add(layers.Dense(units=hidden_size,
                    activation=activation))
        model.add(layers.Dense(units=output_size,
                activation=output_activation))
        optimizer = optimizers.Adam(learning_rate)
        model.compile(optimizer=optimizer, loss=loss)
        return model

    def reset(self, mode=None):
        """Begin an episode; in 'train' mode start recording a fresh trajectory."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        """Sample an action from the current policy; record the transition in train mode."""
        probs = self.actor_net.predict(observation[np.newaxis], verbose=0)[0]
        action = np.random.choice(self.action_n, p=probs)
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
        return action

    def close(self):
        """Finish an episode; in train mode store the trajectory and maybe learn."""
        if self.mode == 'train':
            self.save_trajectory_to_replayer()
            if len(self.replayer.memory) >= 1000:
                for _ in range(5):  # learn multiple times per batch of data
                    self.learn()
                # Reset the replayer after the agent changes itself:
                # data gathered under the old policy is now off-policy.
                self.replayer = PPOReplayer()

    def save_trajectory_to_replayer(self):
        """Turn the raw trajectory into training rows and push them to the replayer."""
        df = pd.DataFrame(
                np.array(self.trajectory, dtype=object).reshape(-1, 4),
                columns=['state', 'reward', 'terminated', 'action'], dtype=object)
        states = np.stack(df['state'])
        df['v'] = self.critic_net.predict(states, verbose=0)
        pis = self.actor_net.predict(states, verbose=0)
        # Probability the behavior policy assigned to the action actually taken.
        df['prob'] = [pi[action] for pi, action in zip(pis, df['action'])]
        df['next_v'] = df['v'].shift(-1).fillna(0.)  # value 0 past the last row
        # TD target u and TD error delta.
        # NOTE(review): row t's reward is the one observed on arriving at
        # row t's state -- confirm this alignment matches the intended target.
        df['u'] = df['reward'] + self.gamma * df['next_v']
        df['delta'] = df['u'] - df['v']
        # Backward discounted sums via lfilter: y_t = x_t + gamma * y_{t+1}.
        df['advantage'] = signal.lfilter([1.,], [1., -self.gamma],
                df['delta'][::-1])[::-1]
        df['return'] = signal.lfilter([1.,], [1., -self.gamma],
                df['reward'][::-1])[::-1]
        self.replayer.store(df)

    def learn(self):
        """One TRPO update: natural-gradient step with backtracking line search."""
        states, actions, old_pis, advantages, returns = \
                self.replayer.sample(size=64)
        state_tensor = tf.convert_to_tensor(states, dtype=tf.float32)
        action_tensor = tf.convert_to_tensor(actions, dtype=tf.int32)
        old_pi_tensor = tf.convert_to_tensor(old_pis, dtype=tf.float32)
        advantage_tensor = tf.convert_to_tensor(advantages, dtype=tf.float32)

        # update actor
        # ... gradient g of the surrogate objective w.r.t. policy weights
        with tf.GradientTape() as tape:
            all_pi_tensor = self.actor_net(state_tensor)
            pi_tensor = tf.gather(all_pi_tensor, action_tensor, batch_dims=1)
            surrogate_tensor = (pi_tensor / old_pi_tensor) * advantage_tensor
        actor_grads = tape.gradient(surrogate_tensor, self.actor_net.variables)
        loss_grad = tf.concat([tf.reshape(grad, (-1,)) for grad in actor_grads],
                axis=0)

        # ... solve Fx = g with conjugate gradient, where F is the Fisher
        #     information matrix (Hessian of the KL divergence)
        def f(x):  # Fisher-vector product Fx, without materializing F
            with tf.GradientTape() as tape2:  # tape for 2nd-order gradient
                with tf.GradientTape() as tape1:  # tape for 1st-order gradient
                    prob_tensor = self.actor_net(state_tensor)
                    prob_old_tensor = tf.stop_gradient(prob_tensor)
                    kld_tensor = tf.reduce_sum(prob_old_tensor * (tf.math.log(
                            prob_old_tensor) - tf.math.log(prob_tensor)), axis=1)
                    kld_loss_tensor = tf.reduce_mean(kld_tensor)
                grads = tape1.gradient(kld_loss_tensor, self.actor_net.variables)
                flatten_grad_tensor = tf.concat(
                        [tf.reshape(grad, (-1,)) for grad in grads], axis=-1)
                # d(KL)/dw . x: differentiating this w.r.t. w yields Fx.
                grad_matmul_x = tf.tensordot(flatten_grad_tensor, x,
                        axes=[[-1], [-1]])
            grad_grads = tape2.gradient(grad_matmul_x, self.actor_net.variables)
            flatten_grad_grad = tf.stop_gradient(tf.concat(
                    [tf.reshape(grad_grad, (-1,)) for grad_grad in grad_grads],
                    axis=-1))
            fx = flatten_grad_grad + x * 1e-2  # damping keeps F positive definite
            return fx

        x, fx = conjugate_gradient(f, loss_grad)
        # ... scale x so a full step sits on the KL trust-region boundary
        natural_gradient_tensor = tf.sqrt(2 * self.max_kl /
                tf.reduce_sum(fx * x)) * x
        # ... reshape the flat natural gradient back into per-layer arrays
        flatten_natural_gradient = natural_gradient_tensor.numpy()
        natural_grads = []
        begin = 0
        for weight in self.actor_net.get_weights():
            end = begin + weight.size
            natural_grad = flatten_natural_gradient[begin:end].reshape(
                    weight.shape)
            natural_grads.append(natural_grad)
            begin = end

        # ... backtracking line search over step fractions 0, 1, 1/2, 1/4, ...
        old_weights = self.actor_net.get_weights()
        expected_improve = tf.reduce_sum(loss_grad *
                natural_gradient_tensor).numpy()
        for learning_step in [0.,] + [.5 ** j for j in range(10)]:
            self.actor_net.set_weights([weight + learning_step * grad
                    for weight, grad in zip(old_weights, natural_grads)])
            all_pi_tensor = self.actor_net(state_tensor)
            # Per-sample probability of the taken action.  batch_dims=1
            # matches the gather above; the original used
            # tf.gather(..., action_tensor[:, np.newaxis], axis=1)[:, 0],
            # which pairs every row with the FIRST sample's action and so
            # evaluates a wrong line-search objective.
            new_pi_tensor = tf.gather(all_pi_tensor, action_tensor,
                    batch_dims=1)
            new_pi_tensor = tf.stop_gradient(new_pi_tensor)
            surrogate_tensor = (new_pi_tensor / pi_tensor) * advantage_tensor
            objective = tf.reduce_sum(surrogate_tensor).numpy()
            if np.isclose(learning_step, 0.):
                old_objective = objective  # baseline at the old weights
            else:
                if objective - old_objective > 0.1 * expected_improve * \
                        learning_step:
                    break  # sufficient improvement: keep these weights
        else:
            # Line search failed at every step size: revert the actor.
            self.actor_net.set_weights(old_weights)

        # update critic on the empirical returns
        self.critic_net.fit(states, returns, verbose=0)
# Instantiate the TRPO agent for the Acrobot environment created above.
agent = TRPOAgent(env)
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run a single episode of *agent* in *env*.

    The agent is asked for one final action after termination/truncation so
    it can observe the last reward before the loop exits.  Returns the pair
    (total reward, number of environment steps taken).
    """
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    total_reward, step_count = 0., 0
    while True:
        # The agent sees the reward/termination produced by its previous action.
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        step_count += 1
    agent.close()
    return total_reward, step_count
# Train until the policy is good enough, then evaluate it.
logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
            mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    # Stop once the 10-episode moving average of rewards exceeds -120.
    if np.mean(episode_rewards[-10:]) > -120:
        break
plt.plot(episode_rewards)  # learning curve

# Evaluation: mode defaults to None, so the agent records and learns nothing.
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
11:56:32 [INFO] ==== train ==== 11:57:06 [INFO] NumExpr defaulting to 8 threads. 11:57:06 [INFO] train episode 0: reward = -500.00, steps = 500 11:57:41 [INFO] train episode 1: reward = -500.00, steps = 500 11:58:14 [INFO] train episode 2: reward = -500.00, steps = 500 11:58:39 [INFO] train episode 3: reward = -386.00, steps = 387 11:59:14 [INFO] train episode 4: reward = -500.00, steps = 500 11:59:42 [INFO] train episode 5: reward = -425.00, steps = 426 12:00:09 [INFO] train episode 6: reward = -399.00, steps = 400 12:00:39 [INFO] train episode 7: reward = -427.00, steps = 428 12:00:59 [INFO] train episode 8: reward = -299.00, steps = 300 12:01:23 [INFO] train episode 9: reward = -369.00, steps = 370 12:01:58 [INFO] train episode 10: reward = -500.00, steps = 500 12:02:21 [INFO] train episode 11: reward = -341.00, steps = 342 12:02:48 [INFO] train episode 12: reward = -405.00, steps = 406 12:03:15 [INFO] train episode 13: reward = -394.00, steps = 395 12:03:36 [INFO] train episode 14: reward = -306.00, steps = 307 12:04:09 [INFO] train episode 15: reward = -500.00, steps = 500 12:04:30 [INFO] train episode 16: reward = -310.00, steps = 311 12:04:54 [INFO] train episode 17: reward = -356.00, steps = 357 12:05:11 [INFO] train episode 18: reward = -259.00, steps = 260 12:05:32 [INFO] train episode 19: reward = -315.00, steps = 316 12:06:02 [INFO] train episode 20: reward = -420.00, steps = 421 12:06:34 [INFO] train episode 21: reward = -500.00, steps = 500 12:06:58 [INFO] train episode 22: reward = -351.00, steps = 352 12:07:33 [INFO] train episode 23: reward = -500.00, steps = 500 12:07:53 [INFO] train episode 24: reward = -308.00, steps = 309 12:08:25 [INFO] train episode 25: reward = -486.00, steps = 487 12:08:38 [INFO] train episode 26: reward = -195.00, steps = 196 12:09:11 [INFO] train episode 27: reward = -479.00, steps = 480 12:09:44 [INFO] train episode 28: reward = -500.00, steps = 500 12:10:04 [INFO] train episode 29: reward = -300.00, steps = 301 12:10:22 
[INFO] train episode 30: reward = -265.00, steps = 266 12:10:43 [INFO] train episode 31: reward = -307.00, steps = 308 12:11:11 [INFO] train episode 32: reward = -427.00, steps = 428 12:11:31 [INFO] train episode 33: reward = -272.00, steps = 273 12:11:55 [INFO] train episode 34: reward = -375.00, steps = 376 12:12:18 [INFO] train episode 35: reward = -337.00, steps = 338 12:12:42 [INFO] train episode 36: reward = -339.00, steps = 340 12:13:09 [INFO] train episode 37: reward = -419.00, steps = 420 12:13:24 [INFO] train episode 38: reward = -215.00, steps = 216 12:13:37 [INFO] train episode 39: reward = -200.00, steps = 201 12:13:53 [INFO] train episode 40: reward = -215.00, steps = 216 12:14:10 [INFO] train episode 41: reward = -252.00, steps = 253 12:14:24 [INFO] train episode 42: reward = -223.00, steps = 224 12:14:33 [INFO] train episode 43: reward = -123.00, steps = 124 12:14:52 [INFO] train episode 44: reward = -298.00, steps = 299 12:15:07 [INFO] train episode 45: reward = -196.00, steps = 197 12:15:20 [INFO] train episode 46: reward = -198.00, steps = 199 12:15:41 [INFO] train episode 47: reward = -315.00, steps = 316 12:15:57 [INFO] train episode 48: reward = -244.00, steps = 245 12:16:15 [INFO] train episode 49: reward = -249.00, steps = 250 12:16:28 [INFO] train episode 50: reward = -193.00, steps = 194 12:16:41 [INFO] train episode 51: reward = -199.00, steps = 200 12:16:51 [INFO] train episode 52: reward = -141.00, steps = 142 12:17:09 [INFO] train episode 53: reward = -286.00, steps = 287 12:17:20 [INFO] train episode 54: reward = -163.00, steps = 164 12:17:38 [INFO] train episode 55: reward = -250.00, steps = 251 12:17:51 [INFO] train episode 56: reward = -187.00, steps = 188 12:18:03 [INFO] train episode 57: reward = -178.00, steps = 179 12:18:10 [INFO] train episode 58: reward = -104.00, steps = 105 12:18:20 [INFO] train episode 59: reward = -161.00, steps = 162 12:18:30 [INFO] train episode 60: reward = -145.00, steps = 146 12:18:43 [INFO] train 
episode 61: reward = -193.00, steps = 194 12:19:00 [INFO] train episode 62: reward = -236.00, steps = 237 12:19:11 [INFO] train episode 63: reward = -165.00, steps = 166 12:19:27 [INFO] train episode 64: reward = -244.00, steps = 245 12:19:41 [INFO] train episode 65: reward = -202.00, steps = 203 12:19:50 [INFO] train episode 66: reward = -133.00, steps = 134 12:19:58 [INFO] train episode 67: reward = -124.00, steps = 125 12:20:09 [INFO] train episode 68: reward = -140.00, steps = 141 12:20:19 [INFO] train episode 69: reward = -142.00, steps = 143 12:20:30 [INFO] train episode 70: reward = -164.00, steps = 165 12:20:45 [INFO] train episode 71: reward = -223.00, steps = 224 12:21:05 [INFO] train episode 72: reward = -295.00, steps = 296 12:21:16 [INFO] train episode 73: reward = -154.00, steps = 155 12:21:27 [INFO] train episode 74: reward = -142.00, steps = 143 12:21:39 [INFO] train episode 75: reward = -177.00, steps = 178 12:21:53 [INFO] train episode 76: reward = -184.00, steps = 185 12:22:03 [INFO] train episode 77: reward = -139.00, steps = 140 12:22:18 [INFO] train episode 78: reward = -236.00, steps = 237 12:22:34 [INFO] train episode 79: reward = -229.00, steps = 230 12:22:45 [INFO] train episode 80: reward = -146.00, steps = 147 12:23:01 [INFO] train episode 81: reward = -243.00, steps = 244 12:23:12 [INFO] train episode 82: reward = -163.00, steps = 164 12:23:25 [INFO] train episode 83: reward = -195.00, steps = 196 12:23:34 [INFO] train episode 84: reward = -119.00, steps = 120 12:23:49 [INFO] train episode 85: reward = -235.00, steps = 236 12:24:11 [INFO] train episode 86: reward = -303.00, steps = 304 12:24:22 [INFO] train episode 87: reward = -164.00, steps = 165 12:24:33 [INFO] train episode 88: reward = -173.00, steps = 174 12:24:43 [INFO] train episode 89: reward = -137.00, steps = 138 12:24:53 [INFO] train episode 90: reward = -148.00, steps = 149 12:25:05 [INFO] train episode 91: reward = -187.00, steps = 188 12:25:13 [INFO] train episode 92: 
reward = -106.00, steps = 107 12:25:26 [INFO] train episode 93: reward = -185.00, steps = 186 12:25:41 [INFO] train episode 94: reward = -227.00, steps = 228 12:25:51 [INFO] train episode 95: reward = -143.00, steps = 144 12:26:02 [INFO] train episode 96: reward = -162.00, steps = 163 12:26:13 [INFO] train episode 97: reward = -163.00, steps = 164 12:26:23 [INFO] train episode 98: reward = -156.00, steps = 157 12:26:34 [INFO] train episode 99: reward = -143.00, steps = 144 12:26:42 [INFO] train episode 100: reward = -120.00, steps = 121 12:26:51 [INFO] train episode 101: reward = -128.00, steps = 129 12:27:03 [INFO] train episode 102: reward = -177.00, steps = 178 12:27:11 [INFO] train episode 103: reward = -125.00, steps = 126 12:27:27 [INFO] train episode 104: reward = -242.00, steps = 243 12:27:37 [INFO] train episode 105: reward = -155.00, steps = 156 12:27:50 [INFO] train episode 106: reward = -177.00, steps = 178 12:27:56 [INFO] train episode 107: reward = -83.00, steps = 84 12:28:08 [INFO] train episode 108: reward = -181.00, steps = 182 12:28:18 [INFO] train episode 109: reward = -148.00, steps = 149 12:28:27 [INFO] train episode 110: reward = -133.00, steps = 134 12:28:35 [INFO] train episode 111: reward = -128.00, steps = 129 12:28:42 [INFO] train episode 112: reward = -97.00, steps = 98 12:28:51 [INFO] train episode 113: reward = -132.00, steps = 133 12:29:01 [INFO] train episode 114: reward = -130.00, steps = 131 12:29:13 [INFO] train episode 115: reward = -171.00, steps = 172 12:29:22 [INFO] train episode 116: reward = -140.00, steps = 141 12:29:32 [INFO] train episode 117: reward = -148.00, steps = 149 12:29:42 [INFO] train episode 118: reward = -153.00, steps = 154 12:29:53 [INFO] train episode 119: reward = -151.00, steps = 152 12:30:05 [INFO] train episode 120: reward = -186.00, steps = 187 12:30:18 [INFO] train episode 121: reward = -169.00, steps = 170 12:30:30 [INFO] train episode 122: reward = -184.00, steps = 185 12:30:41 [INFO] train episode 
123: reward = -157.00, steps = 158 12:30:50 [INFO] train episode 124: reward = -143.00, steps = 144 12:31:05 [INFO] train episode 125: reward = -213.00, steps = 214 12:31:16 [INFO] train episode 126: reward = -167.00, steps = 168 12:31:32 [INFO] train episode 127: reward = -212.00, steps = 213 12:31:39 [INFO] train episode 128: reward = -115.00, steps = 116 12:31:49 [INFO] train episode 129: reward = -145.00, steps = 146 12:32:00 [INFO] train episode 130: reward = -161.00, steps = 162 12:32:10 [INFO] train episode 131: reward = -153.00, steps = 154 12:32:33 [INFO] train episode 132: reward = -343.00, steps = 344 12:32:47 [INFO] train episode 133: reward = -185.00, steps = 186 12:33:00 [INFO] train episode 134: reward = -194.00, steps = 195 12:33:11 [INFO] train episode 135: reward = -160.00, steps = 161 12:33:25 [INFO] train episode 136: reward = -223.00, steps = 224 12:33:40 [INFO] train episode 137: reward = -211.00, steps = 212 12:33:56 [INFO] train episode 138: reward = -237.00, steps = 238 12:34:07 [INFO] train episode 139: reward = -163.00, steps = 164 12:34:22 [INFO] train episode 140: reward = -213.00, steps = 214 12:34:37 [INFO] train episode 141: reward = -226.00, steps = 227 12:34:46 [INFO] train episode 142: reward = -140.00, steps = 141 12:34:57 [INFO] train episode 143: reward = -169.00, steps = 170 12:35:09 [INFO] train episode 144: reward = -149.00, steps = 150 12:35:17 [INFO] train episode 145: reward = -128.00, steps = 129 12:35:26 [INFO] train episode 146: reward = -130.00, steps = 131 12:35:37 [INFO] train episode 147: reward = -159.00, steps = 160 12:35:45 [INFO] train episode 148: reward = -125.00, steps = 126 12:35:56 [INFO] train episode 149: reward = -156.00, steps = 157 12:36:03 [INFO] train episode 150: reward = -102.00, steps = 103 12:36:09 [INFO] train episode 151: reward = -88.00, steps = 89 12:36:21 [INFO] train episode 152: reward = -151.00, steps = 152 12:36:28 [INFO] train episode 153: reward = -108.00, steps = 109 12:36:38 [INFO] 
train episode 154: reward = -156.00, steps = 157 12:36:48 [INFO] train episode 155: reward = -135.00, steps = 136 12:36:57 [INFO] train episode 156: reward = -137.00, steps = 138 12:37:05 [INFO] train episode 157: reward = -120.00, steps = 121 12:37:12 [INFO] train episode 158: reward = -102.00, steps = 103 12:37:21 [INFO] train episode 159: reward = -135.00, steps = 136 12:37:29 [INFO] train episode 160: reward = -94.00, steps = 95 12:37:42 [INFO] train episode 161: reward = -195.00, steps = 196 12:37:50 [INFO] train episode 162: reward = -119.00, steps = 120 12:37:59 [INFO] train episode 163: reward = -130.00, steps = 131 12:38:07 [INFO] train episode 164: reward = -118.00, steps = 119 12:38:18 [INFO] train episode 165: reward = -162.00, steps = 163 12:38:26 [INFO] train episode 166: reward = -129.00, steps = 130 12:38:34 [INFO] train episode 167: reward = -122.00, steps = 123 12:38:44 [INFO] train episode 168: reward = -117.00, steps = 118 12:38:53 [INFO] train episode 169: reward = -137.00, steps = 138 12:39:03 [INFO] train episode 170: reward = -149.00, steps = 150 12:39:17 [INFO] train episode 171: reward = -215.00, steps = 216 12:39:24 [INFO] train episode 172: reward = -105.00, steps = 106 12:39:34 [INFO] train episode 173: reward = -139.00, steps = 140 12:39:41 [INFO] train episode 174: reward = -98.00, steps = 99 12:39:48 [INFO] train episode 175: reward = -113.00, steps = 114 12:39:58 [INFO] train episode 176: reward = -134.00, steps = 135 12:40:07 [INFO] train episode 177: reward = -127.00, steps = 128 12:40:15 [INFO] train episode 178: reward = -124.00, steps = 125 12:40:32 [INFO] train episode 179: reward = -255.00, steps = 256 12:40:42 [INFO] train episode 180: reward = -146.00, steps = 147 12:40:49 [INFO] train episode 181: reward = -108.00, steps = 109 12:41:00 [INFO] train episode 182: reward = -163.00, steps = 164 12:41:11 [INFO] train episode 183: reward = -148.00, steps = 149 12:41:20 [INFO] train episode 184: reward = -134.00, steps = 135 
12:41:27 [INFO] train episode 185: reward = -98.00, steps = 99 12:41:36 [INFO] train episode 186: reward = -120.00, steps = 121 12:41:43 [INFO] train episode 187: reward = -117.00, steps = 118 12:41:51 [INFO] train episode 188: reward = -119.00, steps = 120 12:41:58 [INFO] train episode 189: reward = -97.00, steps = 98 12:42:08 [INFO] train episode 190: reward = -147.00, steps = 148 12:42:18 [INFO] train episode 191: reward = -137.00, steps = 138 12:42:26 [INFO] train episode 192: reward = -105.00, steps = 106 12:42:32 [INFO] train episode 193: reward = -94.00, steps = 95 12:42:32 [INFO] ==== test ==== 12:42:40 [INFO] test episode 0: reward = -112.00, steps = 113 12:42:47 [INFO] test episode 1: reward = -110.00, steps = 111 12:42:55 [INFO] test episode 2: reward = -120.00, steps = 121 12:43:02 [INFO] test episode 3: reward = -105.00, steps = 106 12:43:09 [INFO] test episode 4: reward = -105.00, steps = 106 12:43:15 [INFO] test episode 5: reward = -95.00, steps = 96 12:43:22 [INFO] test episode 6: reward = -118.00, steps = 119 12:43:32 [INFO] test episode 7: reward = -140.00, steps = 141 12:43:39 [INFO] test episode 8: reward = -114.00, steps = 115 12:43:48 [INFO] test episode 9: reward = -127.00, steps = 128 12:43:55 [INFO] test episode 10: reward = -107.00, steps = 108 12:44:02 [INFO] test episode 11: reward = -104.00, steps = 105 12:44:11 [INFO] test episode 12: reward = -141.00, steps = 142 12:44:18 [INFO] test episode 13: reward = -98.00, steps = 99 12:44:25 [INFO] test episode 14: reward = -115.00, steps = 116 12:44:31 [INFO] test episode 15: reward = -91.00, steps = 92 12:44:42 [INFO] test episode 16: reward = -156.00, steps = 157 12:44:48 [INFO] test episode 17: reward = -97.00, steps = 98 12:44:57 [INFO] test episode 18: reward = -131.00, steps = 132 12:45:07 [INFO] test episode 19: reward = -155.00, steps = 156 12:45:13 [INFO] test episode 20: reward = -97.00, steps = 98 12:45:21 [INFO] test episode 21: reward = -112.00, steps = 113 12:45:28 [INFO] test 
episode 22: reward = -115.00, steps = 116 12:45:37 [INFO] test episode 23: reward = -135.00, steps = 136 12:45:47 [INFO] test episode 24: reward = -143.00, steps = 144 12:45:57 [INFO] test episode 25: reward = -158.00, steps = 159 12:46:05 [INFO] test episode 26: reward = -111.00, steps = 112 12:46:11 [INFO] test episode 27: reward = -98.00, steps = 99 12:46:19 [INFO] test episode 28: reward = -121.00, steps = 122 12:46:26 [INFO] test episode 29: reward = -106.00, steps = 107 12:46:34 [INFO] test episode 30: reward = -118.00, steps = 119 12:46:42 [INFO] test episode 31: reward = -118.00, steps = 119 12:46:52 [INFO] test episode 32: reward = -168.00, steps = 169 12:46:59 [INFO] test episode 33: reward = -100.00, steps = 101 12:47:07 [INFO] test episode 34: reward = -117.00, steps = 118 12:47:14 [INFO] test episode 35: reward = -104.00, steps = 105 12:47:21 [INFO] test episode 36: reward = -110.00, steps = 111 12:47:28 [INFO] test episode 37: reward = -109.00, steps = 110 12:47:43 [INFO] test episode 38: reward = -225.00, steps = 226 12:48:08 [INFO] test episode 39: reward = -372.00, steps = 373 12:48:17 [INFO] test episode 40: reward = -144.00, steps = 145 12:48:25 [INFO] test episode 41: reward = -120.00, steps = 121 12:48:31 [INFO] test episode 42: reward = -95.00, steps = 96 12:48:41 [INFO] test episode 43: reward = -152.00, steps = 153 12:48:51 [INFO] test episode 44: reward = -142.00, steps = 143 12:48:59 [INFO] test episode 45: reward = -122.00, steps = 123 12:49:07 [INFO] test episode 46: reward = -117.00, steps = 118 12:49:14 [INFO] test episode 47: reward = -122.00, steps = 123 12:49:21 [INFO] test episode 48: reward = -102.00, steps = 103 12:49:29 [INFO] test episode 49: reward = -115.00, steps = 116 12:49:38 [INFO] test episode 50: reward = -131.00, steps = 132 12:49:45 [INFO] test episode 51: reward = -99.00, steps = 100 12:49:52 [INFO] test episode 52: reward = -114.00, steps = 115 12:50:01 [INFO] test episode 53: reward = -128.00, steps = 129 12:50:10 
[INFO] test episode 54: reward = -149.00, steps = 150 12:50:18 [INFO] test episode 55: reward = -110.00, steps = 111 12:50:27 [INFO] test episode 56: reward = -135.00, steps = 136 12:50:33 [INFO] test episode 57: reward = -98.00, steps = 99 12:50:42 [INFO] test episode 58: reward = -137.00, steps = 138 12:50:49 [INFO] test episode 59: reward = -102.00, steps = 103 12:50:57 [INFO] test episode 60: reward = -116.00, steps = 117 12:51:03 [INFO] test episode 61: reward = -82.00, steps = 83 12:51:11 [INFO] test episode 62: reward = -124.00, steps = 125 12:51:18 [INFO] test episode 63: reward = -98.00, steps = 99 12:51:24 [INFO] test episode 64: reward = -92.00, steps = 93 12:51:32 [INFO] test episode 65: reward = -118.00, steps = 119 12:51:47 [INFO] test episode 66: reward = -228.00, steps = 229 12:51:54 [INFO] test episode 67: reward = -119.00, steps = 120 12:52:04 [INFO] test episode 68: reward = -146.00, steps = 147 12:52:14 [INFO] test episode 69: reward = -146.00, steps = 147 12:52:21 [INFO] test episode 70: reward = -117.00, steps = 118 12:52:30 [INFO] test episode 71: reward = -124.00, steps = 125 12:52:39 [INFO] test episode 72: reward = -143.00, steps = 144 12:52:50 [INFO] test episode 73: reward = -163.00, steps = 164 12:53:04 [INFO] test episode 74: reward = -218.00, steps = 219 12:53:14 [INFO] test episode 75: reward = -147.00, steps = 148 12:53:21 [INFO] test episode 76: reward = -110.00, steps = 111 12:53:29 [INFO] test episode 77: reward = -109.00, steps = 110 12:53:37 [INFO] test episode 78: reward = -123.00, steps = 124 12:53:47 [INFO] test episode 79: reward = -150.00, steps = 151 12:53:55 [INFO] test episode 80: reward = -122.00, steps = 123 12:54:04 [INFO] test episode 81: reward = -135.00, steps = 136 12:54:14 [INFO] test episode 82: reward = -156.00, steps = 157 12:54:25 [INFO] test episode 83: reward = -164.00, steps = 165 12:54:32 [INFO] test episode 84: reward = -96.00, steps = 97 12:54:42 [INFO] test episode 85: reward = -158.00, steps = 159 
12:54:49 [INFO] test episode 86: reward = -106.00, steps = 107 12:54:55 [INFO] test episode 87: reward = -97.00, steps = 98 12:55:02 [INFO] test episode 88: reward = -98.00, steps = 99 12:55:10 [INFO] test episode 89: reward = -118.00, steps = 119 12:55:17 [INFO] test episode 90: reward = -117.00, steps = 118 12:55:26 [INFO] test episode 91: reward = -132.00, steps = 133 12:55:34 [INFO] test episode 92: reward = -125.00, steps = 126 12:55:41 [INFO] test episode 93: reward = -109.00, steps = 110 12:55:49 [INFO] test episode 94: reward = -107.00, steps = 108 12:56:02 [INFO] test episode 95: reward = -209.00, steps = 210 12:56:11 [INFO] test episode 96: reward = -139.00, steps = 140 12:56:32 [INFO] test episode 97: reward = -310.00, steps = 311 12:56:44 [INFO] test episode 98: reward = -184.00, steps = 185 12:56:53 [INFO] test episode 99: reward = -144.00, steps = 145 12:56:53 [INFO] average episode reward = -130.11 ± 41.24
# Release the environment's resources (e.g. any render window).
env.close()