TensorFlow version
%matplotlib inline
import sys
import logging
import itertools
import numpy as np
np.random.seed(0)
import pandas as pd
import scipy.signal as signal
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow import losses
from tensorflow.keras import layers
# Send timestamped INFO-level log records to stdout.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s [%(levelname)s] %(message)s',
)

# Create the environment and log every attribute of the environment object
# and of its spec, one "name: value" record per attribute.
env = gym.make('Acrobot-v1')
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
03:33:25 [INFO] env: <AcrobotEnv<Acrobot-v1>> 03:33:25 [INFO] action_space: Discrete(3) 03:33:25 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32) 03:33:25 [INFO] reward_range: (-inf, inf) 03:33:25 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15} 03:33:25 [INFO] _max_episode_steps: 500 03:33:25 [INFO] _elapsed_steps: None 03:33:25 [INFO] id: Acrobot-v1 03:33:25 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv 03:33:25 [INFO] reward_threshold: -100.0 03:33:25 [INFO] nondeterministic: False 03:33:25 [INFO] max_episode_steps: 500 03:33:25 [INFO] _kwargs: {} 03:33:25 [INFO] _env_name: Acrobot
class PPOReplayer:
    """Experience store backed by a pandas DataFrame.

    Each row holds one sample with the columns listed in ``fields``:
    ``state``, ``action``, ``prob``, ``advantage`` and ``return``.
    """

    def __init__(self):
        self.fields = ['state', 'action', 'prob', 'advantage', 'return']
        self.memory = pd.DataFrame(columns=self.fields)

    def store(self, df):
        """Append the ``fields`` columns of ``df`` to the memory.

        Args:
            df: DataFrame containing at least the columns in ``fields``;
                any additional columns are ignored.

        Raises:
            KeyError: if ``df`` lacks one of the required columns.
        """
        batch = df[self.fields]
        if batch.empty:
            return
        if self.memory.empty:
            # Adopt the first batch directly: concatenating onto an empty
            # frame is deprecated in pandas and can change column dtypes.
            self.memory = batch.reset_index(drop=True)
        else:
            self.memory = pd.concat([self.memory, batch], ignore_index=True)

    def sample(self, size):
        """Draw ``size`` rows uniformly at random, with replacement.

        Args:
            size: number of rows to draw.

        Returns:
            A tuple with one stacked numpy array per entry of ``fields``,
            in the same order as ``fields``.

        Raises:
            ValueError: if the memory is empty.
        """
        total = len(self.memory)
        if total == 0:
            raise ValueError('cannot sample from an empty replay memory')
        indices = np.random.choice(total, size=size)
        # Positional selection: the drawn indices are row positions.
        picked = self.memory.iloc[indices]
        return tuple(np.stack(picked[field]) for field in self.fields)
class PPOAgent:
    """Proximal Policy Optimization (clipped surrogate) agent, discrete actions.

    The agent owns a softmax actor network, a scalar critic network and a
    PPOReplayer. In 'train' mode it records one trajectory per episode;
    close() turns it into training samples and, once enough samples have
    been collected, runs a few updates and discards the then stale samples.
    """

    def __init__(self, env):
        """Build the networks for ``env``.

        Args:
            env: environment exposing ``action_space.n`` (number of
                discrete actions).
        """
        self.action_n = env.action_space.n
        self.gamma = 0.99  # discount factor
        self.clip_ratio = 0.1  # PPO clip range (epsilon)
        self.mode = None  # set by reset(); lets close() run safely first
        self.trajectory = []

        self.replayer = PPOReplayer()

        self.actor_net = self.build_net(hidden_sizes=[100,],
                output_size=self.action_n, output_activation=nn.softmax,
                learning_rate=0.001)
        self.critic_net = self.build_net(hidden_sizes=[100,],
                learning_rate=0.002)

    def build_net(self, input_size=None, hidden_sizes=None, output_size=1,
                activation=nn.relu, output_activation=None,
                loss=losses.mse, learning_rate=0.001):
        """Create and compile a fully connected Keras model.

        Args:
            input_size: unused; kept for interface compatibility.
            hidden_sizes: iterable of hidden layer widths; None means no
                hidden layer.
            output_size: number of output units.
            activation: activation of the hidden layers.
            output_activation: activation of the output layer.
            loss: loss passed to ``compile``.
            learning_rate: Adam learning rate.

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        model = keras.Sequential()
        for hidden_size in hidden_sizes or ():
            model.add(layers.Dense(units=hidden_size,
                    activation=activation))
        model.add(layers.Dense(units=output_size,
                activation=output_activation))
        optimizer = optimizers.Adam(learning_rate)
        model.compile(optimizer=optimizer, loss=loss)
        return model

    def reset(self, mode=None):
        """Start a new episode; in 'train' mode begin a fresh trajectory."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, terminated):
        """Sample an action from the actor's distribution for ``observation``.

        In 'train' mode the (observation, reward, terminated, action)
        quadruple is appended to the flat trajectory list.

        Returns:
            The sampled action index.
        """
        probs = self.actor_net.predict(observation[np.newaxis], verbose=0)[0]
        action = np.random.choice(self.action_n, p=probs)
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
        return action

    def close(self):
        """Finish the episode; in 'train' mode store samples and maybe learn."""
        if self.mode == 'train':
            self.save_trajectory_to_replayer()
            if len(self.replayer.memory) >= 1000:
                for batch in range(5):  # learn multiple times
                    self.learn()
                # The stored probabilities belong to the policy that
                # collected them; drop them once the policy has changed.
                self.replayer = PPOReplayer()

    def save_trajectory_to_replayer(self):
        """Convert the recorded trajectory into samples and store them.

        The trajectory is a flat list of (observation, reward, terminated,
        action) quadruples. The reward recorded with observation t+1 is
        assumed to be the reward earned by the action taken at observation
        t, which is how play_episode in this file calls step(). Rewards are
        therefore shifted by one step to line up with (state, action)
        pairs. The last quadruple only closes the final transition: its
        action was sampled but never executed, so it is not stored.
        """
        observations = self.trajectory[0::4]
        step_count = len(observations) - 1
        if step_count < 1:
            return  # no complete transition was recorded

        states = np.stack(observations)
        rewards = np.array(self.trajectory[1::4], dtype=float)
        actions = np.array(self.trajectory[3::4], dtype=int)

        values = self.critic_net.predict(states, verbose=0)
        values = values.reshape(-1).astype(float)
        pis = self.actor_net.predict(states, verbose=0)

        step_rewards = rewards[1:]  # reward earned by the action at step t
        next_values = values[1:].copy()
        # NOTE: the value after the last transition is taken as 0 whether
        # the episode terminated or was truncated (no bootstrapping).
        next_values[-1] = 0.
        deltas = step_rewards + self.gamma * next_values - values[:-1]

        # Discounted reverse cumulative sums: y[t] = x[t] + gamma * y[t+1].
        advantages = signal.lfilter([1.,], [1., -self.gamma],
                deltas[::-1])[::-1]
        returns = signal.lfilter([1.,], [1., -self.gamma],
                step_rewards[::-1])[::-1]

        taken_actions = actions[:-1]
        df = pd.DataFrame({
            'state': list(states[:-1]),
            'action': taken_actions,
            'prob': pis[np.arange(step_count), taken_actions],
            'advantage': advantages,
            'return': returns,
        })
        self.replayer.store(df)

    def learn(self):
        """Run one actor update and one critic fit on a sampled mini-batch.

        The actor maximizes the PPO clipped surrogate
        mean(min(r * A, A + eps * |A|)), where r is the new/old probability
        ratio of the taken action and eps is ``clip_ratio``. The critic is
        fitted to the stored returns.
        """
        states, actions, old_pis, advantages, returns = \
                self.replayer.sample(size=64)
        state_tensor = tf.convert_to_tensor(states, dtype=tf.float32)
        action_tensor = tf.convert_to_tensor(actions, dtype=tf.int32)
        old_pi_tensor = tf.convert_to_tensor(old_pis, dtype=tf.float32)
        advantage_tensor = tf.convert_to_tensor(advantages, dtype=tf.float32)

        # update actor
        with tf.GradientTape() as tape:
            all_pi_tensor = self.actor_net(state_tensor)
            pi_tensor = tf.gather(all_pi_tensor, action_tensor, batch_dims=1)
            surrogate_advantage_tensor = (pi_tensor / old_pi_tensor) * \
                    advantage_tensor
            # The bound must not depend on the current policy; otherwise
            # the gradient never vanishes outside the clip range.
            max_surrogate_advantage_tensor = advantage_tensor + \
                    self.clip_ratio * tf.abs(advantage_tensor)
            clipped_surrogate_advantage_tensor = tf.minimum(
                    surrogate_advantage_tensor, max_surrogate_advantage_tensor)
            loss_tensor = -tf.reduce_mean(clipped_surrogate_advantage_tensor)
        actor_variables = self.actor_net.trainable_variables
        actor_grads = tape.gradient(loss_tensor, actor_variables)
        self.actor_net.optimizer.apply_gradients(
                zip(actor_grads, actor_variables))

        # update critic
        self.critic_net.fit(states, returns, verbose=0)
# Build the PPO agent for the environment created above.
agent = PPOAgent(env)
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env``.

    The agent is asked for an action on every observation, including the
    final one, so it also sees the last reward and the terminated flag.
    The action chosen on the final observation is not executed.

    Args:
        env: environment whose ``reset(seed=...)`` returns
            ``(observation, info)`` and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        agent: object providing ``reset(mode=...)``,
            ``step(observation, reward, terminated)`` and ``close()``.
        seed: seed forwarded to ``env.reset``.
        mode: mode forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called after each agent step.

    Returns:
        ``(episode_reward, elapsed_steps)``: the sum of rewards and the
        number of environment steps taken.
    """
    obs, _ = env.reset(seed=seed)
    last_reward = 0.
    done = False
    cut_off = False
    agent.reset(mode=mode)

    total_reward = 0.
    step_count = 0
    finished = False
    while not finished:
        chosen = agent.step(obs, last_reward, done)
        if render:
            env.render()
        finished = done or cut_off
        if not finished:
            obs, last_reward, done, cut_off, _ = env.step(chosen)
            total_reward += last_reward
            step_count += 1

    agent.close()
    return total_reward, step_count
# ---- training: run until the recent average reward is good enough ----
logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    # The episode index doubles as the environment seed.
    episode_reward, elapsed_steps = play_episode(
        env, agent, seed=episode, mode='train')
    episode_rewards.append(episode_reward)
    logging.info(
        'train episode %d: reward = %.2f, steps = %d',
        episode, episode_reward, elapsed_steps)
    # Stop once the mean over the latest (at most 10) episodes exceeds -120.
    if np.mean(episode_rewards[-10:]) > -120:
        break
plt.plot(episode_rewards)

# ---- evaluation: 100 episodes with the agent outside 'train' mode ----
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info(
        'test episode %d: reward = %.2f, steps = %d',
        episode, episode_reward, elapsed_steps)
logging.info(
    'average episode reward = %.2f ± %.2f',
    np.mean(episode_rewards), np.std(episode_rewards))
03:33:26 [INFO] ==== train ==== 03:34:14 [INFO] train episode 0: reward = -500.00, steps = 500 03:34:14 [INFO] NumExpr defaulting to 8 threads. 03:35:03 [INFO] train episode 1: reward = -500.00, steps = 500 03:35:52 [INFO] train episode 2: reward = -500.00, steps = 500 03:36:40 [INFO] train episode 3: reward = -500.00, steps = 500 03:37:27 [INFO] train episode 4: reward = -485.00, steps = 486 03:38:14 [INFO] train episode 5: reward = -500.00, steps = 500 03:38:45 [INFO] train episode 6: reward = -339.00, steps = 340 03:39:27 [INFO] train episode 7: reward = -448.00, steps = 449 03:39:54 [INFO] train episode 8: reward = -292.00, steps = 293 03:40:29 [INFO] train episode 9: reward = -383.00, steps = 384 03:41:16 [INFO] train episode 10: reward = -500.00, steps = 500 03:41:42 [INFO] train episode 11: reward = -283.00, steps = 284 03:42:05 [INFO] train episode 12: reward = -254.00, steps = 255 03:42:30 [INFO] train episode 13: reward = -267.00, steps = 268 03:43:16 [INFO] train episode 14: reward = -500.00, steps = 500 03:44:01 [INFO] train episode 15: reward = -500.00, steps = 500 03:44:47 [INFO] train episode 16: reward = -500.00, steps = 500 03:45:31 [INFO] train episode 17: reward = -500.00, steps = 500 03:46:17 [INFO] train episode 18: reward = -500.00, steps = 500 03:47:03 [INFO] train episode 19: reward = -500.00, steps = 500 03:47:50 [INFO] train episode 20: reward = -500.00, steps = 500 03:48:36 [INFO] train episode 21: reward = -500.00, steps = 500 03:49:23 [INFO] train episode 22: reward = -500.00, steps = 500 03:50:09 [INFO] train episode 23: reward = -500.00, steps = 500 03:50:45 [INFO] train episode 24: reward = -390.00, steps = 391 03:51:31 [INFO] train episode 25: reward = -500.00, steps = 500 03:52:15 [INFO] train episode 26: reward = -487.00, steps = 488 03:53:02 [INFO] train episode 27: reward = -500.00, steps = 500 03:53:48 [INFO] train episode 28: reward = -500.00, steps = 500 03:54:33 [INFO] train episode 29: reward = -500.00, steps = 500 03:55:18 
[INFO] train episode 30: reward = -500.00, steps = 500 03:56:04 [INFO] train episode 31: reward = -500.00, steps = 500 03:56:40 [INFO] train episode 32: reward = -398.00, steps = 399 03:57:16 [INFO] train episode 33: reward = -392.00, steps = 393 03:58:01 [INFO] train episode 34: reward = -500.00, steps = 500 03:58:46 [INFO] train episode 35: reward = -500.00, steps = 500 03:59:32 [INFO] train episode 36: reward = -492.00, steps = 493 04:00:17 [INFO] train episode 37: reward = -500.00, steps = 500 04:01:03 [INFO] train episode 38: reward = -500.00, steps = 500 04:01:49 [INFO] train episode 39: reward = -500.00, steps = 500 04:02:33 [INFO] train episode 40: reward = -500.00, steps = 500 04:03:01 [INFO] train episode 41: reward = -302.00, steps = 303 04:03:26 [INFO] train episode 42: reward = -266.00, steps = 267 04:03:59 [INFO] train episode 43: reward = -357.00, steps = 358 04:04:37 [INFO] train episode 44: reward = -416.00, steps = 417 04:05:17 [INFO] train episode 45: reward = -423.00, steps = 424 04:06:03 [INFO] train episode 46: reward = -500.00, steps = 500 04:06:48 [INFO] train episode 47: reward = -500.00, steps = 500 04:07:35 [INFO] train episode 48: reward = -500.00, steps = 500 04:08:20 [INFO] train episode 49: reward = -500.00, steps = 500 04:09:07 [INFO] train episode 50: reward = -500.00, steps = 500 04:09:53 [INFO] train episode 51: reward = -500.00, steps = 500 04:10:40 [INFO] train episode 52: reward = -500.00, steps = 500 04:11:26 [INFO] train episode 53: reward = -500.00, steps = 500 04:12:13 [INFO] train episode 54: reward = -500.00, steps = 500 04:13:00 [INFO] train episode 55: reward = -500.00, steps = 500 04:13:47 [INFO] train episode 56: reward = -500.00, steps = 500 04:14:33 [INFO] train episode 57: reward = -500.00, steps = 500 04:15:20 [INFO] train episode 58: reward = -500.00, steps = 500 04:16:05 [INFO] train episode 59: reward = -500.00, steps = 500 04:16:51 [INFO] train episode 60: reward = -500.00, steps = 500 04:17:37 [INFO] train 
episode 61: reward = -500.00, steps = 500 04:18:24 [INFO] train episode 62: reward = -500.00, steps = 500 04:19:10 [INFO] train episode 63: reward = -500.00, steps = 500 04:19:55 [INFO] train episode 64: reward = -500.00, steps = 500 04:20:41 [INFO] train episode 65: reward = -500.00, steps = 500 04:21:27 [INFO] train episode 66: reward = -500.00, steps = 500 04:22:13 [INFO] train episode 67: reward = -500.00, steps = 500 04:23:00 [INFO] train episode 68: reward = -500.00, steps = 500 04:23:45 [INFO] train episode 69: reward = -500.00, steps = 500 04:24:31 [INFO] train episode 70: reward = -500.00, steps = 500 04:25:15 [INFO] train episode 71: reward = -479.00, steps = 480 04:26:00 [INFO] train episode 72: reward = -500.00, steps = 500 04:26:46 [INFO] train episode 73: reward = -500.00, steps = 500 04:27:32 [INFO] train episode 74: reward = -500.00, steps = 500 04:28:18 [INFO] train episode 75: reward = -500.00, steps = 500 04:29:04 [INFO] train episode 76: reward = -500.00, steps = 500 04:29:51 [INFO] train episode 77: reward = -500.00, steps = 500 04:30:37 [INFO] train episode 78: reward = -500.00, steps = 500 04:31:22 [INFO] train episode 79: reward = -500.00, steps = 500 04:32:08 [INFO] train episode 80: reward = -500.00, steps = 500 04:32:55 [INFO] train episode 81: reward = -500.00, steps = 500 04:33:41 [INFO] train episode 82: reward = -500.00, steps = 500 04:34:27 [INFO] train episode 83: reward = -500.00, steps = 500 04:35:14 [INFO] train episode 84: reward = -500.00, steps = 500 04:36:00 [INFO] train episode 85: reward = -500.00, steps = 500 04:36:47 [INFO] train episode 86: reward = -500.00, steps = 500 04:37:33 [INFO] train episode 87: reward = -500.00, steps = 500 04:38:24 [INFO] train episode 88: reward = -500.00, steps = 500 04:39:09 [INFO] train episode 89: reward = -474.00, steps = 475 04:39:55 [INFO] train episode 90: reward = -500.00, steps = 500 04:40:39 [INFO] train episode 91: reward = -480.00, steps = 481 04:41:27 [INFO] train episode 92: 
reward = -500.00, steps = 500 04:42:13 [INFO] train episode 93: reward = -500.00, steps = 500 04:43:00 [INFO] train episode 94: reward = -500.00, steps = 500 04:43:46 [INFO] train episode 95: reward = -500.00, steps = 500 04:44:33 [INFO] train episode 96: reward = -500.00, steps = 500 04:45:19 [INFO] train episode 97: reward = -500.00, steps = 500 04:45:55 [INFO] train episode 98: reward = -382.00, steps = 383 04:46:41 [INFO] train episode 99: reward = -500.00, steps = 500 04:47:27 [INFO] train episode 100: reward = -500.00, steps = 500 04:48:14 [INFO] train episode 101: reward = -500.00, steps = 500 04:48:42 [INFO] train episode 102: reward = -298.00, steps = 299 04:49:25 [INFO] train episode 103: reward = -467.00, steps = 468 04:50:13 [INFO] train episode 104: reward = -500.00, steps = 500 04:51:03 [INFO] train episode 105: reward = -500.00, steps = 500 04:51:50 [INFO] train episode 106: reward = -500.00, steps = 500 04:52:36 [INFO] train episode 107: reward = -500.00, steps = 500 04:53:23 [INFO] train episode 108: reward = -500.00, steps = 500 04:54:09 [INFO] train episode 109: reward = -500.00, steps = 500 04:54:47 [INFO] train episode 110: reward = -393.00, steps = 394 04:55:33 [INFO] train episode 111: reward = -500.00, steps = 500 04:56:17 [INFO] train episode 112: reward = -500.00, steps = 500 04:57:03 [INFO] train episode 113: reward = -500.00, steps = 500 04:57:49 [INFO] train episode 114: reward = -500.00, steps = 500 04:58:36 [INFO] train episode 115: reward = -500.00, steps = 500 04:59:22 [INFO] train episode 116: reward = -500.00, steps = 500 05:00:07 [INFO] train episode 117: reward = -500.00, steps = 500 05:00:53 [INFO] train episode 118: reward = -500.00, steps = 500 05:01:38 [INFO] train episode 119: reward = -500.00, steps = 500 05:02:17 [INFO] train episode 120: reward = -417.00, steps = 418 05:03:02 [INFO] train episode 121: reward = -500.00, steps = 500 05:03:47 [INFO] train episode 122: reward = -500.00, steps = 500 05:04:31 [INFO] train 
episode 123: reward = -493.00, steps = 494 05:05:14 [INFO] train episode 124: reward = -480.00, steps = 481 05:05:38 [INFO] train episode 125: reward = -254.00, steps = 255 05:06:11 [INFO] train episode 126: reward = -369.00, steps = 370 05:06:44 [INFO] train episode 127: reward = -371.00, steps = 372 05:07:30 [INFO] train episode 128: reward = -496.00, steps = 497 05:07:58 [INFO] train episode 129: reward = -316.00, steps = 317 05:08:28 [INFO] train episode 130: reward = -324.00, steps = 325 05:09:03 [INFO] train episode 131: reward = -388.00, steps = 389 05:09:41 [INFO] train episode 132: reward = -418.00, steps = 419 05:10:14 [INFO] train episode 133: reward = -359.00, steps = 360 05:10:59 [INFO] train episode 134: reward = -500.00, steps = 500 05:11:47 [INFO] train episode 135: reward = -500.00, steps = 500 05:12:21 [INFO] train episode 136: reward = -365.00, steps = 366 05:12:43 [INFO] train episode 137: reward = -238.00, steps = 239 05:13:13 [INFO] train episode 138: reward = -332.00, steps = 333 05:13:58 [INFO] train episode 139: reward = -500.00, steps = 500 05:14:35 [INFO] train episode 140: reward = -401.00, steps = 402 05:15:02 [INFO] train episode 141: reward = -305.00, steps = 306 05:15:22 [INFO] train episode 142: reward = -225.00, steps = 226 05:15:41 [INFO] train episode 143: reward = -227.00, steps = 228 05:16:08 [INFO] train episode 144: reward = -300.00, steps = 301 05:16:35 [INFO] train episode 145: reward = -308.00, steps = 309 05:17:07 [INFO] train episode 146: reward = -365.00, steps = 366 05:17:31 [INFO] train episode 147: reward = -267.00, steps = 268 05:18:00 [INFO] train episode 148: reward = -319.00, steps = 320 05:18:27 [INFO] train episode 149: reward = -312.00, steps = 313 05:18:40 [INFO] train episode 150: reward = -150.00, steps = 151 05:19:22 [INFO] train episode 151: reward = -472.00, steps = 473 05:19:43 [INFO] train episode 152: reward = -227.00, steps = 228 05:20:09 [INFO] train episode 153: reward = -298.00, steps = 299 
05:20:27 [INFO] train episode 154: reward = -211.00, steps = 212 05:20:49 [INFO] train episode 155: reward = -244.00, steps = 245 05:21:09 [INFO] train episode 156: reward = -238.00, steps = 239 05:21:32 [INFO] train episode 157: reward = -255.00, steps = 256 05:21:51 [INFO] train episode 158: reward = -220.00, steps = 221 05:22:21 [INFO] train episode 159: reward = -343.00, steps = 344 05:22:35 [INFO] train episode 160: reward = -161.00, steps = 162 05:22:54 [INFO] train episode 161: reward = -210.00, steps = 211 05:23:20 [INFO] train episode 162: reward = -296.00, steps = 297 05:23:38 [INFO] train episode 163: reward = -196.00, steps = 197 05:23:58 [INFO] train episode 164: reward = -235.00, steps = 236 05:24:24 [INFO] train episode 165: reward = -292.00, steps = 293 05:24:41 [INFO] train episode 166: reward = -191.00, steps = 192 05:25:18 [INFO] train episode 167: reward = -415.00, steps = 416 05:25:36 [INFO] train episode 168: reward = -209.00, steps = 210 05:26:05 [INFO] train episode 169: reward = -330.00, steps = 331 05:26:21 [INFO] train episode 170: reward = -173.00, steps = 174 05:26:37 [INFO] train episode 171: reward = -189.00, steps = 190 05:26:50 [INFO] train episode 172: reward = -142.00, steps = 143 05:27:10 [INFO] train episode 173: reward = -234.00, steps = 235 05:27:28 [INFO] train episode 174: reward = -197.00, steps = 198 05:27:43 [INFO] train episode 175: reward = -180.00, steps = 181 05:28:09 [INFO] train episode 176: reward = -284.00, steps = 285 05:28:36 [INFO] train episode 177: reward = -314.00, steps = 315 05:28:53 [INFO] train episode 178: reward = -190.00, steps = 191 05:29:11 [INFO] train episode 179: reward = -209.00, steps = 210 05:29:29 [INFO] train episode 180: reward = -202.00, steps = 203 05:29:41 [INFO] train episode 181: reward = -136.00, steps = 137 05:29:54 [INFO] train episode 182: reward = -142.00, steps = 143 05:30:16 [INFO] train episode 183: reward = -261.00, steps = 262 05:30:34 [INFO] train episode 184: reward = 
-195.00, steps = 196 05:30:49 [INFO] train episode 185: reward = -170.00, steps = 171 05:31:04 [INFO] train episode 186: reward = -161.00, steps = 162 05:31:21 [INFO] train episode 187: reward = -200.00, steps = 201 05:31:33 [INFO] train episode 188: reward = -130.00, steps = 131 05:31:45 [INFO] train episode 189: reward = -133.00, steps = 134 05:32:09 [INFO] train episode 190: reward = -275.00, steps = 276 05:32:25 [INFO] train episode 191: reward = -183.00, steps = 184 05:32:44 [INFO] train episode 192: reward = -204.00, steps = 205 05:33:02 [INFO] train episode 193: reward = -204.00, steps = 205 05:33:24 [INFO] train episode 194: reward = -255.00, steps = 256 05:33:38 [INFO] train episode 195: reward = -154.00, steps = 155 05:33:55 [INFO] train episode 196: reward = -189.00, steps = 190 05:34:09 [INFO] train episode 197: reward = -156.00, steps = 157 05:34:21 [INFO] train episode 198: reward = -135.00, steps = 136 05:34:32 [INFO] train episode 199: reward = -118.00, steps = 119 05:34:50 [INFO] train episode 200: reward = -196.00, steps = 197 05:35:06 [INFO] train episode 201: reward = -175.00, steps = 176 05:35:23 [INFO] train episode 202: reward = -202.00, steps = 203 05:35:36 [INFO] train episode 203: reward = -144.00, steps = 145 05:35:52 [INFO] train episode 204: reward = -172.00, steps = 173 05:36:05 [INFO] train episode 205: reward = -135.00, steps = 136 05:36:17 [INFO] train episode 206: reward = -145.00, steps = 146 05:36:30 [INFO] train episode 207: reward = -147.00, steps = 148 05:36:42 [INFO] train episode 208: reward = -122.00, steps = 123 05:36:58 [INFO] train episode 209: reward = -176.00, steps = 177 05:37:10 [INFO] train episode 210: reward = -136.00, steps = 137 05:37:23 [INFO] train episode 211: reward = -134.00, steps = 135 05:37:36 [INFO] train episode 212: reward = -140.00, steps = 141 05:37:50 [INFO] train episode 213: reward = -144.00, steps = 145 05:38:11 [INFO] train episode 214: reward = -238.00, steps = 239 05:38:31 [INFO] train 
episode 215: reward = -218.00, steps = 219 05:38:50 [INFO] train episode 216: reward = -211.00, steps = 212 05:39:05 [INFO] train episode 217: reward = -155.00, steps = 156 05:39:14 [INFO] train episode 218: reward = -112.00, steps = 113 05:39:27 [INFO] train episode 219: reward = -144.00, steps = 145 05:39:59 [INFO] train episode 220: reward = -364.00, steps = 365 05:40:13 [INFO] train episode 221: reward = -165.00, steps = 166 05:40:27 [INFO] train episode 222: reward = -153.00, steps = 154 05:40:38 [INFO] train episode 223: reward = -124.00, steps = 125 05:40:53 [INFO] train episode 224: reward = -169.00, steps = 170 05:41:03 [INFO] train episode 225: reward = -110.00, steps = 111 05:41:19 [INFO] train episode 226: reward = -179.00, steps = 180 05:41:34 [INFO] train episode 227: reward = -164.00, steps = 165 05:41:43 [INFO] train episode 228: reward = -103.00, steps = 104 05:41:53 [INFO] train episode 229: reward = -109.00, steps = 110 05:42:07 [INFO] train episode 230: reward = -160.00, steps = 161 05:42:25 [INFO] train episode 231: reward = -198.00, steps = 199 05:42:36 [INFO] train episode 232: reward = -114.00, steps = 115 05:42:53 [INFO] train episode 233: reward = -189.00, steps = 190 05:43:05 [INFO] train episode 234: reward = -131.00, steps = 132 05:43:18 [INFO] train episode 235: reward = -154.00, steps = 155 05:43:33 [INFO] train episode 236: reward = -161.00, steps = 162 05:43:46 [INFO] train episode 237: reward = -139.00, steps = 140 05:43:59 [INFO] train episode 238: reward = -138.00, steps = 139 05:44:13 [INFO] train episode 239: reward = -164.00, steps = 165 05:44:26 [INFO] train episode 240: reward = -141.00, steps = 142 05:44:42 [INFO] train episode 241: reward = -182.00, steps = 183 05:44:56 [INFO] train episode 242: reward = -161.00, steps = 162 05:45:07 [INFO] train episode 243: reward = -120.00, steps = 121 05:45:17 [INFO] train episode 244: reward = -113.00, steps = 114 05:45:29 [INFO] train episode 245: reward = -129.00, steps = 130 
05:45:41 [INFO] train episode 246: reward = -142.00, steps = 143 05:45:53 [INFO] train episode 247: reward = -135.00, steps = 136 05:46:04 [INFO] train episode 248: reward = -120.00, steps = 121 05:46:16 [INFO] train episode 249: reward = -138.00, steps = 139 05:46:31 [INFO] train episode 250: reward = -171.00, steps = 172 05:46:46 [INFO] train episode 251: reward = -162.00, steps = 163 05:47:03 [INFO] train episode 252: reward = -188.00, steps = 189 05:47:14 [INFO] train episode 253: reward = -124.00, steps = 125 05:47:33 [INFO] train episode 254: reward = -210.00, steps = 211 05:48:17 [INFO] train episode 255: reward = -500.00, steps = 500 05:48:32 [INFO] train episode 256: reward = -166.00, steps = 167 05:48:42 [INFO] train episode 257: reward = -111.00, steps = 112 05:48:55 [INFO] train episode 258: reward = -145.00, steps = 146 05:49:07 [INFO] train episode 259: reward = -141.00, steps = 142 05:49:19 [INFO] train episode 260: reward = -133.00, steps = 134 05:49:36 [INFO] train episode 261: reward = -187.00, steps = 188 05:49:48 [INFO] train episode 262: reward = -140.00, steps = 141 05:49:59 [INFO] train episode 263: reward = -118.00, steps = 119 05:50:12 [INFO] train episode 264: reward = -141.00, steps = 142 05:50:23 [INFO] train episode 265: reward = -119.00, steps = 120 05:50:39 [INFO] train episode 266: reward = -184.00, steps = 185 05:50:50 [INFO] train episode 267: reward = -122.00, steps = 123 05:51:18 [INFO] train episode 268: reward = -324.00, steps = 325 05:51:31 [INFO] train episode 269: reward = -146.00, steps = 147 05:51:43 [INFO] train episode 270: reward = -133.00, steps = 134 05:51:55 [INFO] train episode 271: reward = -139.00, steps = 140 05:52:09 [INFO] train episode 272: reward = -155.00, steps = 156 05:52:21 [INFO] train episode 273: reward = -130.00, steps = 131 05:52:33 [INFO] train episode 274: reward = -136.00, steps = 137 05:52:47 [INFO] train episode 275: reward = -158.00, steps = 159 05:52:57 [INFO] train episode 276: reward = 
-107.00, steps = 108 05:53:08 [INFO] train episode 277: reward = -134.00, steps = 135 05:53:21 [INFO] train episode 278: reward = -139.00, steps = 140 05:53:37 [INFO] train episode 279: reward = -187.00, steps = 188 05:53:50 [INFO] train episode 280: reward = -143.00, steps = 144 05:54:02 [INFO] train episode 281: reward = -141.00, steps = 142 05:54:13 [INFO] train episode 282: reward = -120.00, steps = 121 05:54:25 [INFO] train episode 283: reward = -132.00, steps = 133 05:54:38 [INFO] train episode 284: reward = -135.00, steps = 136 05:54:52 [INFO] train episode 285: reward = -160.00, steps = 161 05:55:05 [INFO] train episode 286: reward = -150.00, steps = 151 05:55:18 [INFO] train episode 287: reward = -148.00, steps = 149 05:55:32 [INFO] train episode 288: reward = -151.00, steps = 152 05:55:48 [INFO] train episode 289: reward = -180.00, steps = 181 05:56:01 [INFO] train episode 290: reward = -139.00, steps = 140 05:56:15 [INFO] train episode 291: reward = -147.00, steps = 148 05:56:24 [INFO] train episode 292: reward = -111.00, steps = 112 05:56:34 [INFO] train episode 293: reward = -109.00, steps = 110 05:56:48 [INFO] train episode 294: reward = -151.00, steps = 152 05:57:01 [INFO] train episode 295: reward = -154.00, steps = 155 05:57:22 [INFO] train episode 296: reward = -228.00, steps = 229 05:57:37 [INFO] train episode 297: reward = -165.00, steps = 166 05:57:53 [INFO] train episode 298: reward = -179.00, steps = 180 05:58:08 [INFO] train episode 299: reward = -174.00, steps = 175 05:58:17 [INFO] train episode 300: reward = -96.00, steps = 97 05:58:36 [INFO] train episode 301: reward = -211.00, steps = 212 05:58:49 [INFO] train episode 302: reward = -154.00, steps = 155 05:59:02 [INFO] train episode 303: reward = -145.00, steps = 146 05:59:14 [INFO] train episode 304: reward = -129.00, steps = 130 05:59:26 [INFO] train episode 305: reward = -129.00, steps = 130 05:59:41 [INFO] train episode 306: reward = -168.00, steps = 169 05:59:52 [INFO] train episode 
307: reward = -123.00, steps = 124 06:00:05 [INFO] train episode 308: reward = -135.00, steps = 136 06:00:43 [INFO] train episode 309: reward = -429.00, steps = 430 06:00:52 [INFO] train episode 310: reward = -98.00, steps = 99 06:01:05 [INFO] train episode 311: reward = -136.00, steps = 137 06:01:17 [INFO] train episode 312: reward = -131.00, steps = 132 06:01:32 [INFO] train episode 313: reward = -170.00, steps = 171 06:01:46 [INFO] train episode 314: reward = -151.00, steps = 152 06:02:00 [INFO] train episode 315: reward = -167.00, steps = 168 06:02:15 [INFO] train episode 316: reward = -165.00, steps = 166 06:02:29 [INFO] train episode 317: reward = -150.00, steps = 151 06:02:40 [INFO] train episode 318: reward = -124.00, steps = 125 06:02:56 [INFO] train episode 319: reward = -175.00, steps = 176 06:03:07 [INFO] train episode 320: reward = -128.00, steps = 129 06:03:21 [INFO] train episode 321: reward = -160.00, steps = 161 06:03:31 [INFO] train episode 322: reward = -111.00, steps = 112 06:03:41 [INFO] train episode 323: reward = -109.00, steps = 110 06:03:56 [INFO] train episode 324: reward = -165.00, steps = 166 06:04:10 [INFO] train episode 325: reward = -150.00, steps = 151 06:04:20 [INFO] train episode 326: reward = -105.00, steps = 106 06:04:32 [INFO] train episode 327: reward = -136.00, steps = 137 06:04:42 [INFO] train episode 328: reward = -118.00, steps = 119 06:04:57 [INFO] train episode 329: reward = -172.00, steps = 173 06:05:09 [INFO] train episode 330: reward = -127.00, steps = 128 06:05:27 [INFO] train episode 331: reward = -210.00, steps = 211 06:05:38 [INFO] train episode 332: reward = -116.00, steps = 117 06:05:48 [INFO] train episode 333: reward = -105.00, steps = 106 06:06:04 [INFO] train episode 334: reward = -175.00, steps = 176 06:06:15 [INFO] train episode 335: reward = -124.00, steps = 125 06:06:30 [INFO] train episode 336: reward = -161.00, steps = 162 06:06:44 [INFO] train episode 337: reward = -166.00, steps = 167 06:06:53 [INFO] 
train episode 338: reward = -90.00, steps = 91 06:07:02 [INFO] train episode 339: reward = -112.00, steps = 113 06:07:19 [INFO] train episode 340: reward = -180.00, steps = 181 06:07:33 [INFO] train episode 341: reward = -146.00, steps = 147 06:07:42 [INFO] train episode 342: reward = -109.00, steps = 110 06:07:52 [INFO] train episode 343: reward = -101.00, steps = 102 06:08:02 [INFO] train episode 344: reward = -118.00, steps = 119 06:08:12 [INFO] train episode 345: reward = -104.00, steps = 105 06:08:25 [INFO] train episode 346: reward = -145.00, steps = 146 06:08:34 [INFO] train episode 347: reward = -101.00, steps = 102 06:08:46 [INFO] train episode 348: reward = -126.00, steps = 127 06:08:57 [INFO] train episode 349: reward = -125.00, steps = 126 06:09:06 [INFO] train episode 350: reward = -86.00, steps = 87 06:09:06 [INFO] ==== test ==== 06:09:17 [INFO] test episode 0: reward = -132.00, steps = 133 06:09:28 [INFO] test episode 1: reward = -118.00, steps = 119 06:09:39 [INFO] test episode 2: reward = -139.00, steps = 140 06:09:47 [INFO] test episode 3: reward = -87.00, steps = 88 06:09:57 [INFO] test episode 4: reward = -118.00, steps = 119 06:10:07 [INFO] test episode 5: reward = -117.00, steps = 118 06:10:17 [INFO] test episode 6: reward = -111.00, steps = 112 06:10:27 [INFO] test episode 7: reward = -116.00, steps = 117 06:10:36 [INFO] test episode 8: reward = -106.00, steps = 107 06:10:45 [INFO] test episode 9: reward = -106.00, steps = 107 06:10:55 [INFO] test episode 10: reward = -114.00, steps = 115 06:11:05 [INFO] test episode 11: reward = -122.00, steps = 123 06:11:17 [INFO] test episode 12: reward = -137.00, steps = 138 06:11:26 [INFO] test episode 13: reward = -105.00, steps = 106 06:11:37 [INFO] test episode 14: reward = -133.00, steps = 134 06:11:48 [INFO] test episode 15: reward = -125.00, steps = 126 06:12:02 [INFO] test episode 16: reward = -158.00, steps = 159 06:12:12 [INFO] test episode 17: reward = -115.00, steps = 116 06:12:21 [INFO] test 
episode 18: reward = -118.00, steps = 119 06:12:32 [INFO] test episode 19: reward = -127.00, steps = 128 06:12:46 [INFO] test episode 20: reward = -165.00, steps = 166 06:12:56 [INFO] test episode 21: reward = -118.00, steps = 119 06:13:09 [INFO] test episode 22: reward = -142.00, steps = 143 06:13:17 [INFO] test episode 23: reward = -103.00, steps = 104 06:13:29 [INFO] test episode 24: reward = -134.00, steps = 135 06:13:50 [INFO] test episode 25: reward = -248.00, steps = 249 06:13:59 [INFO] test episode 26: reward = -111.00, steps = 112 06:14:11 [INFO] test episode 27: reward = -133.00, steps = 134 06:14:25 [INFO] test episode 28: reward = -173.00, steps = 174 06:14:37 [INFO] test episode 29: reward = -134.00, steps = 135 06:14:48 [INFO] test episode 30: reward = -132.00, steps = 133 06:14:58 [INFO] test episode 31: reward = -117.00, steps = 118 06:15:08 [INFO] test episode 32: reward = -115.00, steps = 116 06:15:19 [INFO] test episode 33: reward = -131.00, steps = 132 06:15:28 [INFO] test episode 34: reward = -98.00, steps = 99 06:15:40 [INFO] test episode 35: reward = -141.00, steps = 142 06:15:50 [INFO] test episode 36: reward = -119.00, steps = 120 06:15:59 [INFO] test episode 37: reward = -107.00, steps = 108 06:16:12 [INFO] test episode 38: reward = -149.00, steps = 150 06:16:23 [INFO] test episode 39: reward = -120.00, steps = 121 06:16:45 [INFO] test episode 40: reward = -263.00, steps = 264 06:16:59 [INFO] test episode 41: reward = -171.00, steps = 172 06:17:10 [INFO] test episode 42: reward = -122.00, steps = 123 06:17:19 [INFO] test episode 43: reward = -111.00, steps = 112 06:17:30 [INFO] test episode 44: reward = -120.00, steps = 121 06:17:41 [INFO] test episode 45: reward = -126.00, steps = 127 06:17:58 [INFO] test episode 46: reward = -202.00, steps = 203 06:18:08 [INFO] test episode 47: reward = -124.00, steps = 125 06:18:16 [INFO] test episode 48: reward = -87.00, steps = 88 06:18:24 [INFO] test episode 49: reward = -91.00, steps = 92 06:18:36 
[INFO] test episode 50: reward = -146.00, steps = 147 06:18:50 [INFO] test episode 51: reward = -169.00, steps = 170 06:19:01 [INFO] test episode 52: reward = -129.00, steps = 130 06:19:13 [INFO] test episode 53: reward = -139.00, steps = 140 06:19:23 [INFO] test episode 54: reward = -115.00, steps = 116 06:19:37 [INFO] test episode 55: reward = -158.00, steps = 159 06:19:46 [INFO] test episode 56: reward = -109.00, steps = 110 06:19:55 [INFO] test episode 57: reward = -107.00, steps = 108 06:20:07 [INFO] test episode 58: reward = -139.00, steps = 140 06:20:17 [INFO] test episode 59: reward = -118.00, steps = 119 06:20:30 [INFO] test episode 60: reward = -150.00, steps = 151 06:20:46 [INFO] test episode 61: reward = -185.00, steps = 186 06:21:00 [INFO] test episode 62: reward = -165.00, steps = 166 06:21:09 [INFO] test episode 63: reward = -101.00, steps = 102 06:21:20 [INFO] test episode 64: reward = -123.00, steps = 124 06:21:30 [INFO] test episode 65: reward = -126.00, steps = 127 06:21:39 [INFO] test episode 66: reward = -102.00, steps = 103 06:21:48 [INFO] test episode 67: reward = -109.00, steps = 110 06:21:58 [INFO] test episode 68: reward = -107.00, steps = 108 06:22:07 [INFO] test episode 69: reward = -109.00, steps = 110 06:22:15 [INFO] test episode 70: reward = -86.00, steps = 87 06:22:28 [INFO] test episode 71: reward = -154.00, steps = 155 06:22:37 [INFO] test episode 72: reward = -106.00, steps = 107 06:22:47 [INFO] test episode 73: reward = -114.00, steps = 115 06:22:59 [INFO] test episode 74: reward = -138.00, steps = 139 06:23:11 [INFO] test episode 75: reward = -145.00, steps = 146 06:23:23 [INFO] test episode 76: reward = -141.00, steps = 142 06:23:34 [INFO] test episode 77: reward = -122.00, steps = 123 06:23:43 [INFO] test episode 78: reward = -113.00, steps = 114 06:24:00 [INFO] test episode 79: reward = -196.00, steps = 197 06:24:09 [INFO] test episode 80: reward = -100.00, steps = 101 06:24:21 [INFO] test episode 81: reward = -145.00, steps 
= 146 06:24:36 [INFO] test episode 82: reward = -175.00, steps = 176 06:24:44 [INFO] test episode 83: reward = -99.00, steps = 100 06:24:58 [INFO] test episode 84: reward = -160.00, steps = 161 06:25:09 [INFO] test episode 85: reward = -124.00, steps = 125 06:25:20 [INFO] test episode 86: reward = -132.00, steps = 133 06:25:32 [INFO] test episode 87: reward = -147.00, steps = 148 06:25:44 [INFO] test episode 88: reward = -132.00, steps = 133 06:25:52 [INFO] test episode 89: reward = -93.00, steps = 94 06:26:03 [INFO] test episode 90: reward = -137.00, steps = 138 06:26:17 [INFO] test episode 91: reward = -160.00, steps = 161 06:26:26 [INFO] test episode 92: reward = -105.00, steps = 106 06:26:37 [INFO] test episode 93: reward = -137.00, steps = 138 06:26:48 [INFO] test episode 94: reward = -136.00, steps = 137 06:26:58 [INFO] test episode 95: reward = -114.00, steps = 115 06:27:11 [INFO] test episode 96: reward = -148.00, steps = 149 06:27:26 [INFO] test episode 97: reward = -178.00, steps = 179 06:27:35 [INFO] test episode 98: reward = -112.00, steps = 113 06:27:47 [INFO] test episode 99: reward = -135.00, steps = 136 06:27:47 [INFO] average episode reward = -131.01 ± 29.60
# Close the environment now that training and testing are done.
env.close()