Advantage actor-critic on Acrobot-v1 (TensorFlow version)
%matplotlib inline
import sys
import logging
import itertools
import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow import losses
from tensorflow.keras import layers
from tensorflow.keras import models
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
env = gym.make('Acrobot-v1')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
15:40:00 [INFO] env: <AcrobotEnv<Acrobot-v1>>
15:40:00 [INFO] action_space: Discrete(3)
15:40:00 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32)
15:40:00 [INFO] reward_range: (-inf, inf)
15:40:00 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15}
15:40:00 [INFO] _max_episode_steps: 500
15:40:00 [INFO] _elapsed_steps: None
15:40:00 [INFO] id: Acrobot-v1
15:40:00 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv
15:40:00 [INFO] reward_threshold: -100.0
15:40:00 [INFO] nondeterministic: False
15:40:00 [INFO] max_episode_steps: 500
15:40:00 [INFO] _kwargs: {}
15:40:00 [INFO] _env_name: Acrobot
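The attribute dump above lists everything attached to the environment and its spec. If only the interface matters, a shorter check is enough; the two lines below are a hypothetical convenience, not part of the original listing:

# Hypothetical shorthand (not in the original listing): Acrobot-v1 observations
# are 6-dimensional Box vectors and the action is one of 3 discrete torques.
logging.info('observation space: %s', env.observation_space)
logging.info('action space: %s', env.action_space)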
class AdvantageActorCriticAgent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.gamma = 0.99

        # actor: maps an observation to action probabilities
        self.actor_net = self.build_net(hidden_sizes=[100,],
                output_size=self.action_n, output_activation=nn.softmax,
                loss=losses.categorical_crossentropy,
                learning_rate=0.0001)
        # critic: maps an observation to a scalar state-value estimate
        self.critic_net = self.build_net(hidden_sizes=[100,],
                learning_rate=0.0002)

    def build_net(self, hidden_sizes, output_size=1,
                activation=nn.relu, output_activation=None,
                loss=losses.mse, learning_rate=0.001):
        model = keras.Sequential()
        for hidden_size in hidden_sizes:
            model.add(layers.Dense(units=hidden_size,
                    activation=activation))
        model.add(layers.Dense(units=output_size,
                activation=output_activation))
        optimizer = optimizers.Adam(learning_rate)
        model.compile(optimizer=optimizer, loss=loss)
        return model

    def reset(self, mode=None):
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.discount = 1.

    def step(self, observation, reward, terminated):
        probs = self.actor_net.predict(observation[np.newaxis], verbose=0)[0]
        action = np.random.choice(self.action_n, p=probs)
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # at least one complete transition has been stored
                self.learn()
            self.discount *= self.gamma
        return action

    def close(self):
        pass

    def learn(self):
        state, _, _, action, next_state, reward, terminated, _ = \
                self.trajectory[-8:]
        states = state[np.newaxis]
        v = self.critic_net.predict(states, verbose=0)
        next_v = self.critic_net.predict(next_state[np.newaxis], verbose=0)
        target = reward + (1. - terminated) * self.gamma * next_v
        td_error = target - v  # one-step advantage estimate

        # update actor
        state_tensor = tf.convert_to_tensor(states, dtype=tf.float32)
        with tf.GradientTape() as tape:
            pi_tensor = self.actor_net(state_tensor)[0, action]
            logpi_tensor = tf.math.log(tf.clip_by_value(pi_tensor, 1e-6, 1.))
            loss_tensor = -self.discount * td_error * logpi_tensor
        grad_tensors = tape.gradient(loss_tensor, self.actor_net.variables)
        self.actor_net.optimizer.apply_gradients(zip(
                grad_tensors, self.actor_net.variables))

        # update critic
        self.critic_net.fit(states, target, verbose=0)
agent = AdvantageActorCriticAgent(env)
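Each learn() call consumes the most recent transition in the trajectory buffer: the critic is regressed toward the one-step TD target reward + γ·(1 − terminated)·v(next state), and the TD error (target minus the current value estimate) acts as the advantage that scales the actor's policy-gradient step, discounted by γ^t. A tiny illustration with made-up numbers (purely hypothetical, not produced by the experiment):

# Hypothetical numbers, only to illustrate the quantities computed in learn().
reward, gamma, terminated = -1., 0.99, False
next_v, v = -100., -102.                               # pretend critic outputs
target = reward + (1. - terminated) * gamma * next_v   # -100.0, the critic's regression target
td_error = target - v                                  # 2.0, advantage estimate for the taken action
# actor loss for the chosen action a in state s: -discount * td_error * log(pi(a|s))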
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        # the reward and termination flag of the previous step are fed back
        # to the agent, so the final transition still triggers a learning update
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps
logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
            mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    # stop once the average reward of the last 10 episodes exceeds -120
    if np.mean(episode_rewards[-10:]) > -120:
        break
plt.plot(episode_rewards)
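plt.plot draws the learning curve of training episode rewards. Axis labels are optional cosmetics; the lines below are an assumption, not part of the original listing:

# Optional axis labels for the learning curve (not in the original listing).
plt.xlabel('episode')
plt.ylabel('episode reward')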
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
15:40:01 [INFO] ==== train ====
15:43:01 [INFO] train episode 0: reward = -419.00, steps = 420
15:46:29 [INFO] train episode 1: reward = -500.00, steps = 500
15:49:50 [INFO] train episode 2: reward = -500.00, steps = 500
15:53:12 [INFO] train episode 3: reward = -500.00, steps = 500
15:56:33 [INFO] train episode 4: reward = -500.00, steps = 500
15:59:55 [INFO] train episode 5: reward = -500.00, steps = 500
16:03:19 [INFO] train episode 6: reward = -500.00, steps = 500
16:06:40 [INFO] train episode 7: reward = -500.00, steps = 500
16:09:58 [INFO] train episode 8: reward = -500.00, steps = 500
16:13:06 [INFO] train episode 9: reward = -465.00, steps = 466
16:14:25 [INFO] train episode 10: reward = -196.00, steps = 197
16:15:41 [INFO] train episode 11: reward = -187.00, steps = 188
16:16:57 [INFO] train episode 12: reward = -180.00, steps = 181
16:18:31 [INFO] train episode 13: reward = -232.00, steps = 233
16:19:40 [INFO] train episode 14: reward = -171.00, steps = 172
16:20:35 [INFO] train episode 15: reward = -135.00, steps = 136
16:21:33 [INFO] train episode 16: reward = -143.00, steps = 144
16:22:42 [INFO] train episode 17: reward = -169.00, steps = 170
16:23:32 [INFO] train episode 18: reward = -126.00, steps = 127
16:25:01 [INFO] train episode 19: reward = -220.00, steps = 221
16:26:04 [INFO] train episode 20: reward = -148.00, steps = 149
16:27:35 [INFO] train episode 21: reward = -195.00, steps = 196
16:28:35 [INFO] train episode 22: reward = -134.00, steps = 135
16:29:40 [INFO] train episode 23: reward = -141.00, steps = 142
16:31:33 [INFO] train episode 24: reward = -265.00, steps = 266
16:32:40 [INFO] train episode 25: reward = -153.00, steps = 154
16:33:43 [INFO] train episode 26: reward = -147.00, steps = 148
16:34:39 [INFO] train episode 27: reward = -138.00, steps = 139
16:35:39 [INFO] train episode 28: reward = -136.00, steps = 137
16:36:39 [INFO] train episode 29: reward = -139.00, steps = 140
16:37:38 [INFO] train episode 30: reward = -134.00, steps = 135
16:39:33 [INFO] train episode 31: reward = -277.00, steps = 278
16:40:25 [INFO] train episode 32: reward = -121.00, steps = 122
16:41:22 [INFO] train episode 33: reward = -139.00, steps = 140
16:42:11 [INFO] train episode 34: reward = -119.00, steps = 120
16:43:08 [INFO] train episode 35: reward = -138.00, steps = 139
16:44:06 [INFO] train episode 36: reward = -146.00, steps = 147
16:44:51 [INFO] train episode 37: reward = -109.00, steps = 110
16:45:45 [INFO] train episode 38: reward = -133.00, steps = 134
16:46:32 [INFO] train episode 39: reward = -114.00, steps = 115
16:47:16 [INFO] train episode 40: reward = -109.00, steps = 110
16:48:03 [INFO] train episode 41: reward = -108.00, steps = 109
16:48:46 [INFO] train episode 42: reward = -103.00, steps = 104
16:49:35 [INFO] train episode 43: reward = -121.00, steps = 122
16:50:15 [INFO] train episode 44: reward = -97.00, steps = 98
16:50:15 [INFO] ==== test ====
16:50:25 [INFO] test episode 0: reward = -110.00, steps = 111
16:50:37 [INFO] test episode 1: reward = -123.00, steps = 124
16:50:47 [INFO] test episode 2: reward = -104.00, steps = 105
16:50:57 [INFO] test episode 3: reward = -95.00, steps = 96
16:51:11 [INFO] test episode 4: reward = -136.00, steps = 137
16:51:25 [INFO] test episode 5: reward = -143.00, steps = 144
16:51:36 [INFO] test episode 6: reward = -114.00, steps = 115
16:51:49 [INFO] test episode 7: reward = -135.00, steps = 136
16:52:00 [INFO] test episode 8: reward = -122.00, steps = 123
16:52:10 [INFO] test episode 9: reward = -104.00, steps = 105
16:52:21 [INFO] test episode 10: reward = -112.00, steps = 113
16:52:33 [INFO] test episode 11: reward = -132.00, steps = 133
16:52:43 [INFO] test episode 12: reward = -97.00, steps = 98
16:52:52 [INFO] test episode 13: reward = -98.00, steps = 99
16:53:03 [INFO] test episode 14: reward = -121.00, steps = 122
16:53:17 [INFO] test episode 15: reward = -163.00, steps = 164
16:53:31 [INFO] test episode 16: reward = -152.00, steps = 153
16:53:44 [INFO] test episode 17: reward = -132.00, steps = 133
16:53:55 [INFO] test episode 18: reward = -115.00, steps = 116
16:54:06 [INFO] test episode 19: reward = -120.00, steps = 121
16:54:17 [INFO] test episode 20: reward = -125.00, steps = 126
16:54:31 [INFO] test episode 21: reward = -154.00, steps = 155
16:54:51 [INFO] test episode 22: reward = -184.00, steps = 185
16:55:01 [INFO] test episode 23: reward = -99.00, steps = 100
16:55:10 [INFO] test episode 24: reward = -92.00, steps = 93
16:55:26 [INFO] test episode 25: reward = -174.00, steps = 175
16:55:35 [INFO] test episode 26: reward = -101.00, steps = 102
16:55:45 [INFO] test episode 27: reward = -99.00, steps = 100
16:55:55 [INFO] test episode 28: reward = -114.00, steps = 115
16:56:04 [INFO] test episode 29: reward = -86.00, steps = 87
16:56:12 [INFO] test episode 30: reward = -90.00, steps = 91
16:56:23 [INFO] test episode 31: reward = -112.00, steps = 113
16:56:32 [INFO] test episode 32: reward = -98.00, steps = 99
16:56:42 [INFO] test episode 33: reward = -104.00, steps = 105
16:56:54 [INFO] test episode 34: reward = -131.00, steps = 132
16:57:11 [INFO] test episode 35: reward = -168.00, steps = 169
16:57:25 [INFO] test episode 36: reward = -136.00, steps = 137
16:57:34 [INFO] test episode 37: reward = -97.00, steps = 98
16:57:50 [INFO] test episode 38: reward = -178.00, steps = 179
16:57:57 [INFO] test episode 39: reward = -80.00, steps = 81
16:58:15 [INFO] test episode 40: reward = -194.00, steps = 195
16:58:26 [INFO] test episode 41: reward = -119.00, steps = 120
16:58:36 [INFO] test episode 42: reward = -119.00, steps = 120
16:58:44 [INFO] test episode 43: reward = -91.00, steps = 92
16:58:56 [INFO] test episode 44: reward = -125.00, steps = 126
16:59:09 [INFO] test episode 45: reward = -145.00, steps = 146
16:59:18 [INFO] test episode 46: reward = -98.00, steps = 99
16:59:27 [INFO] test episode 47: reward = -105.00, steps = 106
16:59:37 [INFO] test episode 48: reward = -103.00, steps = 104
16:59:55 [INFO] test episode 49: reward = -204.00, steps = 205
17:00:04 [INFO] test episode 50: reward = -98.00, steps = 99
17:00:13 [INFO] test episode 51: reward = -107.00, steps = 108
17:00:24 [INFO] test episode 52: reward = -117.00, steps = 118
17:00:36 [INFO] test episode 53: reward = -136.00, steps = 137
17:00:45 [INFO] test episode 54: reward = -104.00, steps = 105
17:00:53 [INFO] test episode 55: reward = -93.00, steps = 94
17:01:06 [INFO] test episode 56: reward = -135.00, steps = 136
17:01:16 [INFO] test episode 57: reward = -113.00, steps = 114
17:01:26 [INFO] test episode 58: reward = -120.00, steps = 121
17:01:35 [INFO] test episode 59: reward = -106.00, steps = 107
17:01:46 [INFO] test episode 60: reward = -117.00, steps = 118
17:01:54 [INFO] test episode 61: reward = -93.00, steps = 94
17:02:07 [INFO] test episode 62: reward = -149.00, steps = 150
17:02:17 [INFO] test episode 63: reward = -117.00, steps = 118
17:02:30 [INFO] test episode 64: reward = -144.00, steps = 145
17:02:41 [INFO] test episode 65: reward = -129.00, steps = 130
17:02:53 [INFO] test episode 66: reward = -134.00, steps = 135
17:03:09 [INFO] test episode 67: reward = -175.00, steps = 176
17:03:18 [INFO] test episode 68: reward = -108.00, steps = 109
17:03:29 [INFO] test episode 69: reward = -118.00, steps = 119
17:03:37 [INFO] test episode 70: reward = -98.00, steps = 99
17:03:46 [INFO] test episode 71: reward = -101.00, steps = 102
17:04:07 [INFO] test episode 72: reward = -228.00, steps = 229
17:04:16 [INFO] test episode 73: reward = -107.00, steps = 108
17:04:32 [INFO] test episode 74: reward = -173.00, steps = 174
17:04:42 [INFO] test episode 75: reward = -106.00, steps = 107
17:04:56 [INFO] test episode 76: reward = -143.00, steps = 144
17:05:09 [INFO] test episode 77: reward = -131.00, steps = 132
17:05:21 [INFO] test episode 78: reward = -111.00, steps = 112
17:05:33 [INFO] test episode 79: reward = -127.00, steps = 128
17:05:42 [INFO] test episode 80: reward = -103.00, steps = 104
17:05:53 [INFO] test episode 81: reward = -112.00, steps = 113
17:06:07 [INFO] test episode 82: reward = -150.00, steps = 151
17:06:16 [INFO] test episode 83: reward = -101.00, steps = 102
17:06:28 [INFO] test episode 84: reward = -132.00, steps = 133
17:06:38 [INFO] test episode 85: reward = -99.00, steps = 100
17:06:50 [INFO] test episode 86: reward = -111.00, steps = 112
17:06:59 [INFO] test episode 87: reward = -92.00, steps = 93
17:07:19 [INFO] test episode 88: reward = -210.00, steps = 211
17:07:32 [INFO] test episode 89: reward = -131.00, steps = 132
17:07:41 [INFO] test episode 90: reward = -99.00, steps = 100
17:07:51 [INFO] test episode 91: reward = -115.00, steps = 116
17:08:02 [INFO] test episode 92: reward = -102.00, steps = 103
17:08:13 [INFO] test episode 93: reward = -101.00, steps = 102
17:08:25 [INFO] test episode 94: reward = -108.00, steps = 109
17:08:38 [INFO] test episode 95: reward = -137.00, steps = 138
17:08:50 [INFO] test episode 96: reward = -134.00, steps = 135
17:09:02 [INFO] test episode 97: reward = -128.00, steps = 129
17:09:18 [INFO] test episode 98: reward = -188.00, steps = 189
17:09:25 [INFO] test episode 99: reward = -83.00, steps = 84
17:09:25 [INFO] average episode reward = -123.27 ± 29.22
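With training and evaluation done, the trained policy can also be watched interactively. In recent Gym versions the rendering mode is chosen when the environment is created; the following is a minimal sketch under that assumption (Gym ≥ 0.26), not part of the original run:

# Minimal rendering sketch (assumes Gym >= 0.26, where render_mode is fixed
# at construction time); not part of the original experiment.
render_env = gym.make('Acrobot-v1', render_mode='human')
play_episode(render_env, agent)
render_env.close()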
env.close()