PyTorch version
%matplotlib inline
import sys
import logging
import itertools
import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributions as distributions
torch.manual_seed(0)
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
env = gym.make('Acrobot-v1')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
22:23:09 [INFO] env: <AcrobotEnv<Acrobot-v1>>
22:23:09 [INFO] action_space: Discrete(3)
22:23:09 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32)
22:23:09 [INFO] reward_range: (-inf, inf)
22:23:09 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15}
22:23:09 [INFO] _max_episode_steps: 500
22:23:09 [INFO] _elapsed_steps: None
22:23:09 [INFO] id: Acrobot-v1
22:23:09 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv
22:23:09 [INFO] reward_threshold: -100.0
22:23:09 [INFO] nondeterministic: False
22:23:09 [INFO] max_episode_steps: 500
22:23:09 [INFO] _kwargs: {}
22:23:09 [INFO] _env_name: Acrobot
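The logged spaces fix the network sizes used below: the 6-dimensional Box observation sets the input width, and Discrete(3) sets the actor's output width. As a quick illustration (this check is not in the original listing):

# Illustrative sanity check (not in the original): these sizes feed build_net().
assert env.observation_space.shape[0] == 6  # cos/sin of the two joint angles plus two angular velocities
assert env.action_space.n == 3  # torque on the middle joint: -1, 0, or +1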
class AdvantageActorCriticAgent:
    def __init__(self, env):
        self.gamma = 0.99  # discount factor
        # actor network: observation -> action probabilities
        self.actor_net = self.build_net(
                input_size=env.observation_space.shape[0],
                hidden_sizes=[100,],
                output_size=env.action_space.n, output_activator=nn.Softmax(1))
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 0.0001)
        # critic network: observation -> scalar state-value estimate
        self.critic_net = self.build_net(
                input_size=env.observation_space.shape[0],
                hidden_sizes=[100,])
        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), 0.0002)
        self.critic_loss = nn.MSELoss()
    def build_net(self, input_size, hidden_sizes, output_size=1,
            output_activator=None):
        # stack Linear + ReLU blocks for the given layer sizes
        layers = []
        for in_size, out_size in zip(
                [input_size,] + hidden_sizes, hidden_sizes + [output_size,]):
            layers.append(nn.Linear(in_size, out_size))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # no ReLU after the output layer
        if output_activator:
            layers.append(output_activator)
        net = nn.Sequential(*layers)
        return net
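    # For the sizes logged above (6 observations, 3 actions), build_net
    # produces networks equivalent to this hand-written sketch
    # (illustrative only, not part of the original listing):
    #   actor:  nn.Sequential(nn.Linear(6, 100), nn.ReLU(),
    #                         nn.Linear(100, 3), nn.Softmax(1))
    #   critic: nn.Sequential(nn.Linear(6, 100), nn.ReLU(), nn.Linear(100, 1))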
    def reset(self, mode=None):
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []  # per step: observation, reward, terminated, action
            self.discount = 1.  # accumulates gamma^t for the policy-gradient weight
    def step(self, observation, reward, terminated):
        # sample an action from the current policy
        state_tensor = torch.as_tensor(observation,
                dtype=torch.float).reshape(1, -1)
        prob_tensor = self.actor_net(state_tensor)
        action_tensor = distributions.Categorical(prob_tensor).sample()
        action = action_tensor.numpy()[0]
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:  # two steps recorded: one full transition available
                self.learn()
            self.discount *= self.gamma
        return action
    def close(self):
        pass
    def learn(self):
        # most recent transition: (s, a) followed by (s', r, d, a')
        state, _, _, action, next_state, reward, terminated, next_action \
                = self.trajectory[-8:]
        state_tensor = torch.as_tensor(state, dtype=torch.float).unsqueeze(0)
        next_state_tensor = torch.as_tensor(next_state,
                dtype=torch.float).unsqueeze(0)
        # calculate the TD error, used as the advantage estimate
        next_v_tensor = self.critic_net(next_state_tensor)
        target_tensor = reward + (1. - terminated) * self.gamma * next_v_tensor
        v_tensor = self.critic_net(state_tensor)
        td_error_tensor = target_tensor - v_tensor
        # update actor: discounted policy gradient weighted by the TD error
        pi_tensor = self.actor_net(state_tensor)[0, action]
        logpi_tensor = torch.log(pi_tensor.clamp(1e-6, 1.))
        actor_loss_tensor = -(self.discount * td_error_tensor *
                logpi_tensor).squeeze()
        self.actor_optimizer.zero_grad()
        actor_loss_tensor.backward(retain_graph=True)  # graph is reused by the critic update below
        self.actor_optimizer.step()
        # update critic: regress v(s) toward the TD target
        pred_tensor = self.critic_net(state_tensor)
        critic_loss_tensor = self.critic_loss(pred_tensor, target_tensor)
        self.critic_optimizer.zero_grad()
        critic_loss_tensor.backward()
        self.critic_optimizer.step()
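The actor update above implements one-step advantage actor-critic: the TD error r + γ·(1 − d)·v(s′) − v(s) serves as the advantage estimate, the policy gradient scales log π(a|s) by it together with the accumulated discount γ^t, and the critic regresses v(s) toward the same TD target. A detached numeric sketch of that arithmetic, on made-up values (not part of the original listing):

# Illustrative only (made-up numbers): the TD-error arithmetic used in learn().
reward_, v_s, v_next, gamma_ = -1., -80., -75., 0.99
td_target = reward_ + gamma_ * v_next  # -75.25 for a non-terminal step
td_error = td_target - v_s             # 4.75: positive advantage, so the
                                       # taken action's log-probability is pushed up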
agent = AdvantageActorCriticAgent(env)
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        # the agent sees the latest transition before the loop exits,
        # so the final reward still reaches learn() in train mode
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps
logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
            mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > -120:
        break
plt.plot(episode_rewards)
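Axis labels make the learning curve easier to read; the two lines below are a cosmetic addition, not in the original:

plt.xlabel('episode')  # cosmetic, not in the original listing
plt.ylabel('episode reward')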
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
22:23:09 [INFO] ==== train ====
22:23:11 [INFO] train episode 0: reward = -500.00, steps = 500
22:23:13 [INFO] train episode 1: reward = -500.00, steps = 500
22:23:15 [INFO] train episode 2: reward = -500.00, steps = 500
22:23:18 [INFO] train episode 3: reward = -500.00, steps = 500
22:23:20 [INFO] train episode 4: reward = -500.00, steps = 500
22:23:22 [INFO] train episode 5: reward = -500.00, steps = 500
22:23:24 [INFO] train episode 6: reward = -500.00, steps = 500
22:23:26 [INFO] train episode 7: reward = -500.00, steps = 500
22:23:28 [INFO] train episode 8: reward = -500.00, steps = 500
22:23:30 [INFO] train episode 9: reward = -500.00, steps = 500
22:23:33 [INFO] train episode 10: reward = -500.00, steps = 500
22:23:35 [INFO] train episode 11: reward = -500.00, steps = 500
22:23:37 [INFO] train episode 12: reward = -500.00, steps = 500
22:23:39 [INFO] train episode 13: reward = -500.00, steps = 500
22:23:41 [INFO] train episode 14: reward = -500.00, steps = 500
22:23:44 [INFO] train episode 15: reward = -500.00, steps = 500
22:23:46 [INFO] train episode 16: reward = -500.00, steps = 500
22:23:48 [INFO] train episode 17: reward = -500.00, steps = 500
22:23:50 [INFO] train episode 18: reward = -500.00, steps = 500
22:23:52 [INFO] train episode 19: reward = -500.00, steps = 500
22:23:55 [INFO] train episode 20: reward = -500.00, steps = 500
22:23:57 [INFO] train episode 21: reward = -500.00, steps = 500
22:23:59 [INFO] train episode 22: reward = -500.00, steps = 500
22:24:01 [INFO] train episode 23: reward = -500.00, steps = 500
22:24:03 [INFO] train episode 24: reward = -500.00, steps = 500
22:24:05 [INFO] train episode 25: reward = -500.00, steps = 500
22:24:08 [INFO] train episode 26: reward = -500.00, steps = 500
22:24:10 [INFO] train episode 27: reward = -500.00, steps = 500
22:24:12 [INFO] train episode 28: reward = -500.00, steps = 500
22:24:14 [INFO] train episode 29: reward = -500.00, steps = 500
22:24:16 [INFO] train episode 30: reward = -500.00, steps = 500
22:24:19 [INFO] train episode 31: reward = -500.00, steps = 500
22:24:20 [INFO] train episode 32: reward = -457.00, steps = 458
22:24:23 [INFO] train episode 33: reward = -500.00, steps = 500
22:24:25 [INFO] train episode 34: reward = -500.00, steps = 500
22:24:26 [INFO] train episode 35: reward = -357.00, steps = 358
22:24:28 [INFO] train episode 36: reward = -500.00, steps = 500
22:24:30 [INFO] train episode 37: reward = -291.00, steps = 292
22:24:31 [INFO] train episode 38: reward = -387.00, steps = 388
22:24:33 [INFO] train episode 39: reward = -299.00, steps = 300
22:24:33 [INFO] train episode 40: reward = -215.00, steps = 216
22:24:35 [INFO] train episode 41: reward = -269.00, steps = 270
22:24:36 [INFO] train episode 42: reward = -238.00, steps = 239
22:24:36 [INFO] train episode 43: reward = -182.00, steps = 183
22:24:37 [INFO] train episode 44: reward = -152.00, steps = 153
22:24:38 [INFO] train episode 45: reward = -148.00, steps = 149
22:24:38 [INFO] train episode 46: reward = -148.00, steps = 149
22:24:39 [INFO] train episode 47: reward = -148.00, steps = 149
22:24:40 [INFO] train episode 48: reward = -171.00, steps = 172
22:24:40 [INFO] train episode 49: reward = -169.00, steps = 170
22:24:41 [INFO] train episode 50: reward = -150.00, steps = 151
22:24:42 [INFO] train episode 51: reward = -134.00, steps = 135
22:24:42 [INFO] train episode 52: reward = -103.00, steps = 104
22:24:43 [INFO] train episode 53: reward = -144.00, steps = 145
22:24:44 [INFO] train episode 54: reward = -240.00, steps = 241
22:24:45 [INFO] train episode 55: reward = -215.00, steps = 216
22:24:46 [INFO] train episode 56: reward = -152.00, steps = 153
22:24:46 [INFO] train episode 57: reward = -223.00, steps = 224
22:24:47 [INFO] train episode 58: reward = -211.00, steps = 212
22:24:48 [INFO] train episode 59: reward = -94.00, steps = 95
22:24:48 [INFO] train episode 60: reward = -147.00, steps = 148
22:24:49 [INFO] train episode 61: reward = -142.00, steps = 143
22:24:50 [INFO] train episode 62: reward = -169.00, steps = 170
22:24:50 [INFO] train episode 63: reward = -120.00, steps = 121
22:24:51 [INFO] train episode 64: reward = -249.00, steps = 250
22:24:52 [INFO] train episode 65: reward = -142.00, steps = 143
22:24:52 [INFO] train episode 66: reward = -105.00, steps = 106
22:24:53 [INFO] train episode 67: reward = -138.00, steps = 139
22:24:54 [INFO] train episode 68: reward = -134.00, steps = 135
22:24:54 [INFO] train episode 69: reward = -105.00, steps = 106
22:24:55 [INFO] train episode 70: reward = -132.00, steps = 133
22:24:55 [INFO] train episode 71: reward = -131.00, steps = 132
22:24:56 [INFO] train episode 72: reward = -130.00, steps = 131
22:24:57 [INFO] train episode 73: reward = -156.00, steps = 157
22:24:57 [INFO] train episode 74: reward = -142.00, steps = 143
22:24:58 [INFO] train episode 75: reward = -113.00, steps = 114
22:24:58 [INFO] train episode 76: reward = -136.00, steps = 137
22:24:59 [INFO] train episode 77: reward = -140.00, steps = 141
22:25:00 [INFO] train episode 78: reward = -177.00, steps = 178
22:25:01 [INFO] train episode 79: reward = -187.00, steps = 188
22:25:01 [INFO] train episode 80: reward = -124.00, steps = 125
22:25:02 [INFO] train episode 81: reward = -123.00, steps = 124
22:25:02 [INFO] train episode 82: reward = -139.00, steps = 140
22:25:03 [INFO] train episode 83: reward = -112.00, steps = 113
22:25:03 [INFO] train episode 84: reward = -113.00, steps = 114
22:25:04 [INFO] train episode 85: reward = -112.00, steps = 113
22:25:04 [INFO] train episode 86: reward = -87.00, steps = 88
22:25:04 [INFO] train episode 87: reward = -93.00, steps = 94
22:25:05 [INFO] train episode 88: reward = -122.00, steps = 123
22:25:06 [INFO] train episode 89: reward = -115.00, steps = 116
22:25:06 [INFO] ==== test ====
22:25:06 [INFO] test episode 0: reward = -115.00, steps = 116
22:25:06 [INFO] test episode 1: reward = -193.00, steps = 194
22:25:06 [INFO] test episode 2: reward = -128.00, steps = 129
22:25:06 [INFO] test episode 3: reward = -119.00, steps = 120
22:25:06 [INFO] test episode 4: reward = -96.00, steps = 97
22:25:06 [INFO] test episode 5: reward = -120.00, steps = 121
22:25:06 [INFO] test episode 6: reward = -117.00, steps = 118
22:25:06 [INFO] test episode 7: reward = -144.00, steps = 145
22:25:06 [INFO] test episode 8: reward = -125.00, steps = 126
22:25:07 [INFO] test episode 9: reward = -124.00, steps = 125
22:25:07 [INFO] test episode 10: reward = -124.00, steps = 125
22:25:07 [INFO] test episode 11: reward = -129.00, steps = 130
22:25:07 [INFO] test episode 12: reward = -105.00, steps = 106
22:25:07 [INFO] test episode 13: reward = -112.00, steps = 113
22:25:07 [INFO] test episode 14: reward = -144.00, steps = 145
22:25:07 [INFO] test episode 15: reward = -124.00, steps = 125
22:25:07 [INFO] test episode 16: reward = -124.00, steps = 125
22:25:07 [INFO] test episode 17: reward = -94.00, steps = 95
22:25:07 [INFO] test episode 18: reward = -110.00, steps = 111
22:25:07 [INFO] test episode 19: reward = -103.00, steps = 104
22:25:08 [INFO] test episode 20: reward = -119.00, steps = 120
22:25:08 [INFO] test episode 21: reward = -86.00, steps = 87
22:25:08 [INFO] test episode 22: reward = -124.00, steps = 125
22:25:08 [INFO] test episode 23: reward = -129.00, steps = 130
22:25:08 [INFO] test episode 24: reward = -112.00, steps = 113
22:25:08 [INFO] test episode 25: reward = -118.00, steps = 119
22:25:08 [INFO] test episode 26: reward = -143.00, steps = 144
22:25:08 [INFO] test episode 27: reward = -109.00, steps = 110
22:25:08 [INFO] test episode 28: reward = -97.00, steps = 98
22:25:08 [INFO] test episode 29: reward = -189.00, steps = 190
22:25:09 [INFO] test episode 30: reward = -115.00, steps = 116
22:25:09 [INFO] test episode 31: reward = -170.00, steps = 171
22:25:09 [INFO] test episode 32: reward = -111.00, steps = 112
22:25:09 [INFO] test episode 33: reward = -110.00, steps = 111
22:25:09 [INFO] test episode 34: reward = -103.00, steps = 104
22:25:09 [INFO] test episode 35: reward = -121.00, steps = 122
22:25:09 [INFO] test episode 36: reward = -133.00, steps = 134
22:25:09 [INFO] test episode 37: reward = -132.00, steps = 133
22:25:09 [INFO] test episode 38: reward = -136.00, steps = 137
22:25:09 [INFO] test episode 39: reward = -122.00, steps = 123
22:25:10 [INFO] test episode 40: reward = -125.00, steps = 126
22:25:10 [INFO] test episode 41: reward = -130.00, steps = 131
22:25:10 [INFO] test episode 42: reward = -118.00, steps = 119
22:25:10 [INFO] test episode 43: reward = -103.00, steps = 104
22:25:10 [INFO] test episode 44: reward = -99.00, steps = 100
22:25:10 [INFO] test episode 45: reward = -120.00, steps = 121
22:25:10 [INFO] test episode 46: reward = -119.00, steps = 120
22:25:10 [INFO] test episode 47: reward = -140.00, steps = 141
22:25:10 [INFO] test episode 48: reward = -98.00, steps = 99
22:25:10 [INFO] test episode 49: reward = -97.00, steps = 98
22:25:10 [INFO] test episode 50: reward = -128.00, steps = 129
22:25:10 [INFO] test episode 51: reward = -136.00, steps = 137
22:25:11 [INFO] test episode 52: reward = -94.00, steps = 95
22:25:11 [INFO] test episode 53: reward = -186.00, steps = 187
22:25:11 [INFO] test episode 54: reward = -108.00, steps = 109
22:25:11 [INFO] test episode 55: reward = -129.00, steps = 130
22:25:11 [INFO] test episode 56: reward = -92.00, steps = 93
22:25:11 [INFO] test episode 57: reward = -112.00, steps = 113
22:25:11 [INFO] test episode 58: reward = -104.00, steps = 105
22:25:11 [INFO] test episode 59: reward = -107.00, steps = 108
22:25:11 [INFO] test episode 60: reward = -142.00, steps = 143
22:25:11 [INFO] test episode 61: reward = -162.00, steps = 163
22:25:11 [INFO] test episode 62: reward = -105.00, steps = 106
22:25:12 [INFO] test episode 63: reward = -111.00, steps = 112
22:25:12 [INFO] test episode 64: reward = -118.00, steps = 119
22:25:12 [INFO] test episode 65: reward = -129.00, steps = 130
22:25:12 [INFO] test episode 66: reward = -91.00, steps = 92
22:25:12 [INFO] test episode 67: reward = -122.00, steps = 123
22:25:12 [INFO] test episode 68: reward = -142.00, steps = 143
22:25:12 [INFO] test episode 69: reward = -138.00, steps = 139
22:25:12 [INFO] test episode 70: reward = -105.00, steps = 106
22:25:12 [INFO] test episode 71: reward = -165.00, steps = 166
22:25:12 [INFO] test episode 72: reward = -144.00, steps = 145
22:25:13 [INFO] test episode 73: reward = -176.00, steps = 177
22:25:13 [INFO] test episode 74: reward = -143.00, steps = 144
22:25:13 [INFO] test episode 75: reward = -144.00, steps = 145
22:25:13 [INFO] test episode 76: reward = -135.00, steps = 136
22:25:13 [INFO] test episode 77: reward = -132.00, steps = 133
22:25:13 [INFO] test episode 78: reward = -103.00, steps = 104
22:25:13 [INFO] test episode 79: reward = -125.00, steps = 126
22:25:13 [INFO] test episode 80: reward = -84.00, steps = 85
22:25:13 [INFO] test episode 81: reward = -123.00, steps = 124
22:25:13 [INFO] test episode 82: reward = -100.00, steps = 101
22:25:13 [INFO] test episode 83: reward = -127.00, steps = 128
22:25:14 [INFO] test episode 84: reward = -87.00, steps = 88
22:25:14 [INFO] test episode 85: reward = -193.00, steps = 194
22:25:14 [INFO] test episode 86: reward = -103.00, steps = 104
22:25:14 [INFO] test episode 87: reward = -120.00, steps = 121
22:25:14 [INFO] test episode 88: reward = -94.00, steps = 95
22:25:14 [INFO] test episode 89: reward = -117.00, steps = 118
22:25:14 [INFO] test episode 90: reward = -123.00, steps = 124
22:25:14 [INFO] test episode 91: reward = -94.00, steps = 95
22:25:14 [INFO] test episode 92: reward = -117.00, steps = 118
22:25:14 [INFO] test episode 93: reward = -126.00, steps = 127
22:25:14 [INFO] test episode 94: reward = -167.00, steps = 168
22:25:15 [INFO] test episode 95: reward = -121.00, steps = 122
22:25:15 [INFO] test episode 96: reward = -90.00, steps = 91
22:25:15 [INFO] test episode 97: reward = -93.00, steps = 94
22:25:15 [INFO] test episode 98: reward = -141.00, steps = 142
22:25:15 [INFO] test episode 99: reward = -103.00, steps = 104
22:25:15 [INFO] average episode reward = -122.29 ± 23.37
env.close()
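To reuse the trained policy later without retraining, the actor's weights can be saved with PyTorch's standard serialization (an optional addition, not in the original listing; the filename is arbitrary):

# Optional, not in the original: persist and restore the trained actor.
torch.save(agent.actor_net.state_dict(), 'acrobot_a2c_actor.pt')
# agent.actor_net.load_state_dict(torch.load('acrobot_a2c_actor.pt'))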