Off-policy actor–critic (OffPAC) on Acrobot-v1 — PyTorch version
%matplotlib inline
import sys
import logging
import itertools
import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributions as distributions
torch.manual_seed(0)
# Configure INFO-level logging to stdout with HH:MM:SS timestamps.
logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        datefmt='%H:%M:%S',
        format='%(asctime)s [%(levelname)s] %(message)s')

# Create the Acrobot environment and log its attributes (and its spec's)
# for reference.
env = gym.make('Acrobot-v1')
for name, value in vars(env).items():
    logging.info('%s: %s', name, value)
for name, value in vars(env.spec).items():
    logging.info('%s: %s', name, value)
22:37:18 [INFO] env: <AcrobotEnv<Acrobot-v1>> 22:37:18 [INFO] action_space: Discrete(3) 22:37:18 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32) 22:37:18 [INFO] reward_range: (-inf, inf) 22:37:18 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15} 22:37:18 [INFO] _max_episode_steps: 500 22:37:18 [INFO] _elapsed_steps: None 22:37:18 [INFO] id: Acrobot-v1 22:37:18 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv 22:37:18 [INFO] reward_threshold: -100.0 22:37:18 [INFO] nondeterministic: False 22:37:18 [INFO] max_episode_steps: 500 22:37:18 [INFO] _kwargs: {} 22:37:18 [INFO] _env_name: Acrobot
class OffPACAgent:
    """Off-Policy Actor-Critic (OffPAC) agent for discrete-action environments.

    During training, actions come from a uniform-random behavior policy;
    the actor (target policy) and the critic are updated off-policy with
    per-step importance-sampling corrections.
    """

    def __init__(self, env):
        self.action_n = env.action_space.n
        self.gamma = 0.99  # discount factor
        # Actor: maps an observation to action probabilities.
        self.actor_net = self.build_net(
                input_size=env.observation_space.shape[0],
                hidden_sizes=[100,],
                output_size=env.action_space.n, output_activator=nn.Softmax(1))
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 0.0002)
        # Critic: maps an observation to q(s, a) for every action.
        self.critic_net = self.build_net(
                input_size=env.observation_space.shape[0],
                hidden_sizes=[100,], output_size=self.action_n)
        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), 0.0004)
        self.critic_loss = nn.MSELoss()

    def build_net(self, input_size, hidden_sizes, output_size,
                  output_activator=None):
        """Build a fully-connected network with ReLU hidden activations.

        Args:
            input_size: number of input features.
            hidden_sizes: list of hidden-layer widths.
            output_size: number of outputs.
            output_activator: optional module appended after the last
                linear layer (e.g. a Softmax for the actor).

        Returns:
            An ``nn.Sequential`` network.
        """
        layers = []
        # Use distinct loop names: the original shadowed the
        # input_size/output_size parameters inside the loop.
        for in_features, out_features in zip(
                [input_size,] + hidden_sizes, hidden_sizes + [output_size,]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # no ReLU after the output layer
        if output_activator:
            layers.append(output_activator)
        net = nn.Sequential(*layers)
        return net

    def reset(self, mode=None):
        """Start a new episode; ``mode='train'`` enables learning."""
        self.mode = mode
        if self.mode == 'train':
            # Flat record: [obs, reward, terminated, action, obs, ...]
            self.trajectory = []
            self.discount = 1.  # gamma**t weight applied to the actor loss

    def step(self, observation, reward, terminated):
        """Choose an action; in train mode also record the step and learn."""
        if self.mode == 'train':
            # Behavior policy: uniform random over all actions.
            action = np.random.choice(self.action_n)
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # Two complete steps recorded -> one SARSA transition.
                self.learn()
            self.discount *= self.gamma
        else:
            # Evaluation: sample from the learned target policy.
            state_tensor = torch.as_tensor(observation,
                    dtype=torch.float).unsqueeze(0)
            prob_tensor = self.actor_net(state_tensor)
            action_tensor = distributions.Categorical(prob_tensor).sample()
            action = action_tensor.numpy()[0]
        return action

    def close(self):
        pass

    def learn(self):
        """Update actor and critic from the latest recorded transition."""
        state, _, _, action, next_state, reward, terminated, next_action = \
                self.trajectory[-8:]
        state_tensor = torch.as_tensor(state, dtype=torch.float).unsqueeze(0)
        # BUG FIX: build the next-state tensor from next_state. The original
        # reused `state`, so the critic bootstrapped from q(s, a') instead
        # of q(s', a').
        next_state_tensor = torch.as_tensor(next_state,
                dtype=torch.float).unsqueeze(0)

        # Actor update: off-policy policy gradient. The critic value is a
        # constant here, so detach it (parameter updates are unchanged:
        # the stray critic grads were zeroed before the critic step anyway).
        q_tensor = self.critic_net(state_tensor)[0, action].detach()
        pi_tensor = self.actor_net(state_tensor)[0, action]
        behavior_prob = 1. / self.action_n  # uniform behavior policy
        actor_loss_tensor = -self.discount * q_tensor / behavior_prob * pi_tensor
        self.actor_optimizer.zero_grad()
        actor_loss_tensor.backward()
        self.actor_optimizer.step()

        # Critic update: importance-weighted TD(0) on q(s, a).
        # NOTE(review): target_tensor is deliberately left non-detached, as
        # in the original (gradients also flow through the bootstrap target).
        next_q_tensor = self.critic_net(next_state_tensor)[:, next_action]
        target_tensor = reward + (1. - terminated) * self.gamma * next_q_tensor
        pred_tensor = self.critic_net(state_tensor)[:, action]
        critic_loss_tensor = self.critic_loss(pred_tensor, target_tensor)
        pi_tensor = self.actor_net(state_tensor)[0, action]
        ratio_tensor = pi_tensor / behavior_prob  # importance sampling ratio
        # The ratio is a weight, not a gradient path into the actor.
        critic_loss_tensor = critic_loss_tensor * ratio_tensor.detach()
        self.critic_optimizer.zero_grad()
        critic_loss_tensor.backward()
        self.critic_optimizer.step()
# Instantiate the off-policy actor-critic agent for this environment.
agent = OffPACAgent(env)
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode and return (episode_reward, elapsed_steps).

    The agent sees the latest (observation, reward, terminated) each step;
    the loop exits only after the agent has observed the final transition,
    so a training agent can perform its last update.
    """
    observation, _ = env.reset(seed=seed)
    agent.reset(mode=mode)
    reward, terminated, truncated = 0., False, False
    total_reward = 0.
    step_count = 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        step_count += 1
    agent.close()
    return total_reward, step_count
logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
    # Collect experience (and learn) under the random behavior policy.
    play_episode(env, agent, seed=episode, mode='train')
    # Evaluate the current target policy.
    reward_sum, step_count = play_episode(env, agent)
    episode_rewards.append(reward_sum)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, reward_sum, step_count)
    # Stop once the last 10 evaluation episodes average better than -120.
    recent_mean = np.mean(episode_rewards[-10:])
    if recent_mean > -120:
        break
plt.plot(episode_rewards)
logging.info('==== test ====')
episode_rewards = []
# Evaluate the trained target policy over 100 fresh episodes.
for episode in range(100):
    reward_sum, step_count = play_episode(env, agent)
    episode_rewards.append(reward_sum)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, reward_sum, step_count)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(episode_rewards), np.std(episode_rewards))
22:37:18 [INFO] ==== train ==== 22:37:20 [INFO] train episode 0: reward = -500.00, steps = 500 22:37:22 [INFO] train episode 1: reward = -431.00, steps = 432 22:37:24 [INFO] train episode 2: reward = -500.00, steps = 500 22:37:26 [INFO] train episode 3: reward = -500.00, steps = 500 22:37:28 [INFO] train episode 4: reward = -466.00, steps = 467 22:37:30 [INFO] train episode 5: reward = -500.00, steps = 500 22:37:31 [INFO] train episode 6: reward = -348.00, steps = 349 22:37:33 [INFO] train episode 7: reward = -325.00, steps = 326 22:37:35 [INFO] train episode 8: reward = -500.00, steps = 500 22:37:36 [INFO] train episode 9: reward = -500.00, steps = 500 22:37:38 [INFO] train episode 10: reward = -457.00, steps = 458 22:37:40 [INFO] train episode 11: reward = -356.00, steps = 357 22:37:42 [INFO] train episode 12: reward = -500.00, steps = 500 22:37:43 [INFO] train episode 13: reward = -500.00, steps = 500 22:37:45 [INFO] train episode 14: reward = -480.00, steps = 481 22:37:47 [INFO] train episode 15: reward = -500.00, steps = 500 22:37:49 [INFO] train episode 16: reward = -500.00, steps = 500 22:37:50 [INFO] train episode 17: reward = -500.00, steps = 500 22:37:52 [INFO] train episode 18: reward = -500.00, steps = 500 22:37:53 [INFO] train episode 19: reward = -500.00, steps = 500 22:37:55 [INFO] train episode 20: reward = -500.00, steps = 500 22:37:57 [INFO] train episode 21: reward = -500.00, steps = 500 22:37:58 [INFO] train episode 22: reward = -500.00, steps = 500 22:38:00 [INFO] train episode 23: reward = -500.00, steps = 500 22:38:02 [INFO] train episode 24: reward = -500.00, steps = 500 22:38:04 [INFO] train episode 25: reward = -500.00, steps = 500 22:38:06 [INFO] train episode 26: reward = -500.00, steps = 500 22:38:08 [INFO] train episode 27: reward = -490.00, steps = 491 22:38:10 [INFO] train episode 28: reward = -500.00, steps = 500 22:38:12 [INFO] train episode 29: reward = -500.00, steps = 500 22:38:14 [INFO] train episode 30: reward = -452.00, steps 
= 453 22:38:16 [INFO] train episode 31: reward = -500.00, steps = 500 22:38:18 [INFO] train episode 32: reward = -355.00, steps = 356 22:38:20 [INFO] train episode 33: reward = -500.00, steps = 500 22:38:22 [INFO] train episode 34: reward = -500.00, steps = 500 22:38:24 [INFO] train episode 35: reward = -331.00, steps = 332 22:38:26 [INFO] train episode 36: reward = -500.00, steps = 500 22:38:28 [INFO] train episode 37: reward = -473.00, steps = 474 22:38:29 [INFO] train episode 38: reward = -253.00, steps = 254 22:38:31 [INFO] train episode 39: reward = -353.00, steps = 354 22:38:33 [INFO] train episode 40: reward = -500.00, steps = 500 22:38:35 [INFO] train episode 41: reward = -394.00, steps = 395 22:38:37 [INFO] train episode 42: reward = -196.00, steps = 197 22:38:39 [INFO] train episode 43: reward = -374.00, steps = 375 22:38:40 [INFO] train episode 44: reward = -357.00, steps = 358 22:38:42 [INFO] train episode 45: reward = -231.00, steps = 232 22:38:44 [INFO] train episode 46: reward = -451.00, steps = 452 22:38:46 [INFO] train episode 47: reward = -500.00, steps = 500 22:38:48 [INFO] train episode 48: reward = -291.00, steps = 292 22:38:50 [INFO] train episode 49: reward = -500.00, steps = 500 22:38:52 [INFO] train episode 50: reward = -500.00, steps = 500 22:38:54 [INFO] train episode 51: reward = -500.00, steps = 500 22:38:56 [INFO] train episode 52: reward = -500.00, steps = 500 22:38:58 [INFO] train episode 53: reward = -500.00, steps = 500 22:39:00 [INFO] train episode 54: reward = -500.00, steps = 500 22:39:03 [INFO] train episode 55: reward = -500.00, steps = 500 22:39:05 [INFO] train episode 56: reward = -500.00, steps = 500 22:39:07 [INFO] train episode 57: reward = -433.00, steps = 434 22:39:09 [INFO] train episode 58: reward = -500.00, steps = 500 22:39:11 [INFO] train episode 59: reward = -364.00, steps = 365 22:39:13 [INFO] train episode 60: reward = -482.00, steps = 483 22:39:14 [INFO] train episode 61: reward = -500.00, steps = 500 22:39:16 
[INFO] train episode 62: reward = -500.00, steps = 500 22:39:18 [INFO] train episode 63: reward = -500.00, steps = 500 22:39:20 [INFO] train episode 64: reward = -489.00, steps = 490 22:39:22 [INFO] train episode 65: reward = -500.00, steps = 500 22:39:24 [INFO] train episode 66: reward = -417.00, steps = 418 22:39:26 [INFO] train episode 67: reward = -500.00, steps = 500 22:39:28 [INFO] train episode 68: reward = -308.00, steps = 309 22:39:30 [INFO] train episode 69: reward = -492.00, steps = 493 22:39:32 [INFO] train episode 70: reward = -234.00, steps = 235 22:39:34 [INFO] train episode 71: reward = -249.00, steps = 250 22:39:36 [INFO] train episode 72: reward = -500.00, steps = 500 22:39:38 [INFO] train episode 73: reward = -264.00, steps = 265 22:39:40 [INFO] train episode 74: reward = -273.00, steps = 274 22:39:42 [INFO] train episode 75: reward = -154.00, steps = 155 22:39:44 [INFO] train episode 76: reward = -136.00, steps = 137 22:39:46 [INFO] train episode 77: reward = -246.00, steps = 247 22:39:48 [INFO] train episode 78: reward = -201.00, steps = 202 22:39:50 [INFO] train episode 79: reward = -348.00, steps = 349 22:39:52 [INFO] train episode 80: reward = -159.00, steps = 160 22:39:54 [INFO] train episode 81: reward = -361.00, steps = 362 22:39:55 [INFO] train episode 82: reward = -131.00, steps = 132 22:39:58 [INFO] train episode 83: reward = -177.00, steps = 178 22:39:59 [INFO] train episode 84: reward = -203.00, steps = 204 22:40:01 [INFO] train episode 85: reward = -146.00, steps = 147 22:40:03 [INFO] train episode 86: reward = -176.00, steps = 177 22:40:04 [INFO] train episode 87: reward = -232.00, steps = 233 22:40:06 [INFO] train episode 88: reward = -237.00, steps = 238 22:40:08 [INFO] train episode 89: reward = -297.00, steps = 298 22:40:10 [INFO] train episode 90: reward = -269.00, steps = 270 22:40:12 [INFO] train episode 91: reward = -198.00, steps = 199 22:40:13 [INFO] train episode 92: reward = -192.00, steps = 193 22:40:15 [INFO] train 
episode 93: reward = -384.00, steps = 385 22:40:17 [INFO] train episode 94: reward = -307.00, steps = 308 22:40:19 [INFO] train episode 95: reward = -223.00, steps = 224 22:40:20 [INFO] train episode 96: reward = -225.00, steps = 226 22:40:22 [INFO] train episode 97: reward = -282.00, steps = 283 22:40:24 [INFO] train episode 98: reward = -151.00, steps = 152 22:40:26 [INFO] train episode 99: reward = -233.00, steps = 234 22:40:28 [INFO] train episode 100: reward = -240.00, steps = 241 22:40:29 [INFO] train episode 101: reward = -197.00, steps = 198 22:40:31 [INFO] train episode 102: reward = -191.00, steps = 192 22:40:33 [INFO] train episode 103: reward = -172.00, steps = 173 22:40:34 [INFO] train episode 104: reward = -176.00, steps = 177 22:40:36 [INFO] train episode 105: reward = -137.00, steps = 138 22:40:38 [INFO] train episode 106: reward = -191.00, steps = 192 22:40:39 [INFO] train episode 107: reward = -163.00, steps = 164 22:40:41 [INFO] train episode 108: reward = -153.00, steps = 154 22:40:43 [INFO] train episode 109: reward = -124.00, steps = 125 22:40:44 [INFO] train episode 110: reward = -127.00, steps = 128 22:40:46 [INFO] train episode 111: reward = -130.00, steps = 131 22:40:48 [INFO] train episode 112: reward = -91.00, steps = 92 22:40:49 [INFO] train episode 113: reward = -136.00, steps = 137 22:40:51 [INFO] train episode 114: reward = -133.00, steps = 134 22:40:53 [INFO] train episode 115: reward = -121.00, steps = 122 22:40:54 [INFO] train episode 116: reward = -107.00, steps = 108 22:40:56 [INFO] train episode 117: reward = -139.00, steps = 140 22:40:57 [INFO] train episode 118: reward = -118.00, steps = 119 22:40:59 [INFO] train episode 119: reward = -87.00, steps = 88 22:40:59 [INFO] ==== test ==== 22:40:59 [INFO] test episode 0: reward = -131.00, steps = 132 22:40:59 [INFO] test episode 1: reward = -118.00, steps = 119 22:40:59 [INFO] test episode 2: reward = -118.00, steps = 119 22:40:59 [INFO] test episode 3: reward = -150.00, steps = 
151 22:41:00 [INFO] test episode 4: reward = -148.00, steps = 149 22:41:00 [INFO] test episode 5: reward = -128.00, steps = 129 22:41:00 [INFO] test episode 6: reward = -119.00, steps = 120 22:41:00 [INFO] test episode 7: reward = -135.00, steps = 136 22:41:00 [INFO] test episode 8: reward = -103.00, steps = 104 22:41:00 [INFO] test episode 9: reward = -103.00, steps = 104 22:41:00 [INFO] test episode 10: reward = -114.00, steps = 115 22:41:00 [INFO] test episode 11: reward = -122.00, steps = 123 22:41:00 [INFO] test episode 12: reward = -105.00, steps = 106 22:41:00 [INFO] test episode 13: reward = -145.00, steps = 146 22:41:00 [INFO] test episode 14: reward = -135.00, steps = 136 22:41:00 [INFO] test episode 15: reward = -112.00, steps = 113 22:41:00 [INFO] test episode 16: reward = -99.00, steps = 100 22:41:01 [INFO] test episode 17: reward = -144.00, steps = 145 22:41:01 [INFO] test episode 18: reward = -111.00, steps = 112 22:41:01 [INFO] test episode 19: reward = -136.00, steps = 137 22:41:01 [INFO] test episode 20: reward = -121.00, steps = 122 22:41:01 [INFO] test episode 21: reward = -130.00, steps = 131 22:41:01 [INFO] test episode 22: reward = -109.00, steps = 110 22:41:01 [INFO] test episode 23: reward = -154.00, steps = 155 22:41:01 [INFO] test episode 24: reward = -90.00, steps = 91 22:41:01 [INFO] test episode 25: reward = -102.00, steps = 103 22:41:01 [INFO] test episode 26: reward = -123.00, steps = 124 22:41:01 [INFO] test episode 27: reward = -93.00, steps = 94 22:41:02 [INFO] test episode 28: reward = -151.00, steps = 152 22:41:02 [INFO] test episode 29: reward = -106.00, steps = 107 22:41:02 [INFO] test episode 30: reward = -126.00, steps = 127 22:41:02 [INFO] test episode 31: reward = -113.00, steps = 114 22:41:02 [INFO] test episode 32: reward = -128.00, steps = 129 22:41:02 [INFO] test episode 33: reward = -104.00, steps = 105 22:41:02 [INFO] test episode 34: reward = -111.00, steps = 112 22:41:02 [INFO] test episode 35: reward = -92.00, 
steps = 93 22:41:02 [INFO] test episode 36: reward = -93.00, steps = 94 22:41:02 [INFO] test episode 37: reward = -112.00, steps = 113 22:41:02 [INFO] test episode 38: reward = -135.00, steps = 136 22:41:02 [INFO] test episode 39: reward = -118.00, steps = 119 22:41:02 [INFO] test episode 40: reward = -108.00, steps = 109 22:41:03 [INFO] test episode 41: reward = -147.00, steps = 148 22:41:03 [INFO] test episode 42: reward = -117.00, steps = 118 22:41:03 [INFO] test episode 43: reward = -184.00, steps = 185 22:41:03 [INFO] test episode 44: reward = -113.00, steps = 114 22:41:03 [INFO] test episode 45: reward = -98.00, steps = 99 22:41:03 [INFO] test episode 46: reward = -132.00, steps = 133 22:41:03 [INFO] test episode 47: reward = -94.00, steps = 95 22:41:03 [INFO] test episode 48: reward = -100.00, steps = 101 22:41:03 [INFO] test episode 49: reward = -133.00, steps = 134 22:41:03 [INFO] test episode 50: reward = -125.00, steps = 126 22:41:03 [INFO] test episode 51: reward = -163.00, steps = 164 22:41:03 [INFO] test episode 52: reward = -179.00, steps = 180 22:41:04 [INFO] test episode 53: reward = -152.00, steps = 153 22:41:04 [INFO] test episode 54: reward = -263.00, steps = 264 22:41:04 [INFO] test episode 55: reward = -103.00, steps = 104 22:41:04 [INFO] test episode 56: reward = -110.00, steps = 111 22:41:04 [INFO] test episode 57: reward = -159.00, steps = 160 22:41:04 [INFO] test episode 58: reward = -157.00, steps = 158 22:41:04 [INFO] test episode 59: reward = -138.00, steps = 139 22:41:04 [INFO] test episode 60: reward = -124.00, steps = 125 22:41:04 [INFO] test episode 61: reward = -117.00, steps = 118 22:41:04 [INFO] test episode 62: reward = -184.00, steps = 185 22:41:05 [INFO] test episode 63: reward = -120.00, steps = 121 22:41:05 [INFO] test episode 64: reward = -111.00, steps = 112 22:41:05 [INFO] test episode 65: reward = -116.00, steps = 117 22:41:05 [INFO] test episode 66: reward = -122.00, steps = 123 22:41:05 [INFO] test episode 67: reward = 
-135.00, steps = 136 22:41:05 [INFO] test episode 68: reward = -138.00, steps = 139 22:41:05 [INFO] test episode 69: reward = -117.00, steps = 118 22:41:05 [INFO] test episode 70: reward = -123.00, steps = 124 22:41:05 [INFO] test episode 71: reward = -96.00, steps = 97 22:41:05 [INFO] test episode 72: reward = -113.00, steps = 114 22:41:05 [INFO] test episode 73: reward = -90.00, steps = 91 22:41:05 [INFO] test episode 74: reward = -116.00, steps = 117 22:41:05 [INFO] test episode 75: reward = -132.00, steps = 133 22:41:06 [INFO] test episode 76: reward = -117.00, steps = 118 22:41:06 [INFO] test episode 77: reward = -104.00, steps = 105 22:41:06 [INFO] test episode 78: reward = -142.00, steps = 143 22:41:06 [INFO] test episode 79: reward = -124.00, steps = 125 22:41:06 [INFO] test episode 80: reward = -92.00, steps = 93 22:41:06 [INFO] test episode 81: reward = -132.00, steps = 133 22:41:06 [INFO] test episode 82: reward = -121.00, steps = 122 22:41:06 [INFO] test episode 83: reward = -93.00, steps = 94 22:41:06 [INFO] test episode 84: reward = -103.00, steps = 104 22:41:06 [INFO] test episode 85: reward = -198.00, steps = 199 22:41:06 [INFO] test episode 86: reward = -190.00, steps = 191 22:41:07 [INFO] test episode 87: reward = -142.00, steps = 143 22:41:07 [INFO] test episode 88: reward = -134.00, steps = 135 22:41:07 [INFO] test episode 89: reward = -126.00, steps = 127 22:41:07 [INFO] test episode 90: reward = -144.00, steps = 145 22:41:07 [INFO] test episode 91: reward = -113.00, steps = 114 22:41:07 [INFO] test episode 92: reward = -103.00, steps = 104 22:41:07 [INFO] test episode 93: reward = -117.00, steps = 118 22:41:07 [INFO] test episode 94: reward = -142.00, steps = 143 22:41:07 [INFO] test episode 95: reward = -132.00, steps = 133 22:41:07 [INFO] test episode 96: reward = -115.00, steps = 116 22:41:07 [INFO] test episode 97: reward = -102.00, steps = 103 22:41:07 [INFO] test episode 98: reward = -86.00, steps = 87 22:41:07 [INFO] test episode 99: 
reward = -137.00, steps = 138 22:41:07 [INFO] average episode reward = -125.25 ± 26.61
# Release the environment's resources (render windows, etc.).
env.close()