PyTorch version of the action-value actor-critic agent (QActorCriticAgent) on Acrobot-v1
%matplotlib inline
import sys
import logging
import itertools
import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributions as distributions
torch.manual_seed(0)
logging.basicConfig(level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
stream=sys.stdout, datefmt='%H:%M:%S')
env = gym.make('Acrobot-v1')
for key in vars(env):
logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
logging.info('%s: %s', key, vars(env.spec)[key])
22:55:30 [INFO] env: <AcrobotEnv<Acrobot-v1>>
22:55:30 [INFO] action_space: Discrete(3)
22:55:30 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32)
22:55:30 [INFO] reward_range: (-inf, inf)
22:55:30 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15}
22:55:30 [INFO] _max_episode_steps: 500
22:55:30 [INFO] _elapsed_steps: None
22:55:30 [INFO] id: Acrobot-v1
22:55:30 [INFO] entry_point: gym.envs.classic_control:AcrobotEnv
22:55:30 [INFO] reward_threshold: -100.0
22:55:30 [INFO] nondeterministic: False
22:55:30 [INFO] max_episode_steps: 500
22:55:30 [INFO] _kwargs: {}
22:55:30 [INFO] _env_name: Acrobot
class QActorCriticAgent:
def __init__(self, env):
self.gamma = 0.99
self.actor_net = self.build_net(
input_size=env.observation_space.shape[0],
hidden_sizes=[100,],
output_size=env.action_space.n, output_activator=nn.Softmax(1))
self.actor_optimizer = optim.Adam(self.actor_net.parameters(), 0.001)
self.critic_net = self.build_net(
input_size=env.observation_space.shape[0],
hidden_sizes=[100,],
output_size=env.action_space.n)
self.critic_optimizer = optim.Adam(self.critic_net.parameters(), 0.002)
self.critic_loss = nn.MSELoss()
def build_net(self, input_size, hidden_sizes, output_size=1,
output_activator=None):
layers = []
for input_size, output_size in zip(
[input_size,] + hidden_sizes, hidden_sizes + [output_size,]):
layers.append(nn.Linear(input_size, output_size))
layers.append(nn.ReLU())
layers = layers[:-1]
if output_activator:
layers.append(output_activator)
net = nn.Sequential(*layers)
return net
def reset(self, mode=None):
self.mode = mode
if self.mode == 'train':
self.trajectory = []
self.discount = 1.
def step(self, observation, reward, terminated):
state_tensor = torch.as_tensor(observation,
dtype=torch.float).reshape(1, -1)
prob_tensor = self.actor_net(state_tensor)
action_tensor = distributions.Categorical(prob_tensor).sample()
action = action_tensor.numpy()[0]
if self.mode == 'train':
self.trajectory += [observation, reward, terminated, action]
if len(self.trajectory) >= 8:
self.learn()
self.discount *= self.gamma
return action
def close(self):
pass
def learn(self):
state, _, _, action, next_state, reward, terminated, next_action \
= self.trajectory[-8:]
state_tensor = torch.as_tensor(state, dtype=torch.float).unsqueeze(0)
next_state_tensor = torch.as_tensor(next_state,
dtype=torch.float).unsqueeze(0)
# update actor
q_tensor = self.critic_net(state_tensor)[0, action]
pi_tensor = self.actor_net(state_tensor)[0, action]
logpi_tensor = torch.log(pi_tensor.clamp(1e-6, 1.))
actor_loss_tensor = -self.discount * q_tensor * logpi_tensor
self.actor_optimizer.zero_grad()
actor_loss_tensor.backward()
self.actor_optimizer.step()
# update critic
next_q_tensor = self.critic_net(next_state_tensor)[:, next_action]
target_tensor = reward + (1. - terminated) * self.gamma * next_q_tensor
pred_tensor = self.critic_net(state_tensor)[:, action]
critic_loss_tensor = self.critic_loss(pred_tensor, target_tensor)
self.critic_optimizer.zero_grad()
critic_loss_tensor.backward()
self.critic_optimizer.step()
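For reference, each call to learn() performs one action-value actor-critic update on the most recent transition (S_t, A_t, R_{t+1}, S_{t+1}, A_{t+1}). Restating the code above in equation form, with D_{t+1} the termination flag and gamma^t the accumulated discount stored in self.discount:

$$
\begin{aligned}
U_t &= R_{t+1} + \gamma\,(1 - D_{t+1})\,q(S_{t+1}, A_{t+1}; \mathbf{w}),\\
\ell_{\text{critic}}(\mathbf{w}) &= \bigl[q(S_t, A_t; \mathbf{w}) - U_t\bigr]^2,\\
\ell_{\text{actor}}(\boldsymbol{\theta}) &= -\gamma^t\, q(S_t, A_t; \mathbf{w})\,\ln \pi(A_t \mid S_t; \boldsymbol{\theta}).
\end{aligned}
$$

The critic is fitted toward the bootstrapped target U_t with mean squared error, and minimizing the actor loss ascends the policy gradient with q(S_t, A_t; w) as the action-value weight (the code clamps the probability to [1e-6, 1] before taking the logarithm for numerical stability).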
agent = QActorCriticAgent(env)
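The agent just constructed holds two small networks. For Acrobot-v1, whose observations are 6-dimensional and whose action space has 3 discrete actions, the modules returned by build_net are equivalent to the following sketch (illustration only; the names actor_equivalent and critic_equivalent appear nowhere else and rely on the torch.nn import above):

actor_equivalent = nn.Sequential(     # maps a state to action probabilities
        nn.Linear(6, 100), nn.ReLU(),
        nn.Linear(100, 3), nn.Softmax(dim=1))
critic_equivalent = nn.Sequential(    # maps a state to action values q(s, .)
        nn.Linear(6, 100), nn.ReLU(),
        nn.Linear(100, 3))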
def play_episode(env, agent, seed=None, mode=None, render=False):
observation, _ = env.reset(seed=seed)
reward, terminated, truncated = 0., False, False
agent.reset(mode=mode)
episode_reward, elapsed_steps = 0., 0
while True:
action = agent.step(observation, reward, terminated)
if render:
env.render()
if terminated or truncated:
break
observation, reward, terminated, truncated, _ = env.step(action)
episode_reward += reward
elapsed_steps += 1
agent.close()
return episode_reward, elapsed_steps
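play_episode can also be used on its own as a quick sanity check before training. The following sketch uses only the objects defined above; note that running it advances the random-number generators, so the training log shown later would differ slightly from a run that includes it:

episode_reward, elapsed_steps = play_episode(env, agent, seed=0)
logging.info('untrained agent: reward = %.2f, steps = %d',
        episode_reward, elapsed_steps)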
logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
mode='train')
episode_rewards.append(episode_reward)
logging.info('train episode %d: reward = %.2f, steps = %d',
episode, episode_reward, elapsed_steps)
if np.mean(episode_rewards[-10:]) > -120:
break
plt.plot(episode_rewards)
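Training stops once the mean reward over the last 10 episodes exceeds -120; in the run logged below this happens after 52 episodes. The plt.plot call above draws the learning curve; adding axis labels (a small cosmetic addition, not part of the original run) makes the figure self-explanatory:

plt.xlabel('episode')
plt.ylabel('episode reward')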
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
episode_reward, elapsed_steps = play_episode(env, agent)
episode_rewards.append(episode_reward)
logging.info('test episode %d: reward = %.2f, steps = %d',
episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
np.mean(episode_rewards), np.std(episode_rewards))
22:55:30 [INFO] ==== train ====
22:55:34 [INFO] train episode 0: reward = -500.00, steps = 500
22:55:39 [INFO] train episode 1: reward = -500.00, steps = 500
22:55:41 [INFO] train episode 2: reward = -357.00, steps = 358
22:55:45 [INFO] train episode 3: reward = -500.00, steps = 500
22:55:50 [INFO] train episode 4: reward = -500.00, steps = 500
22:55:54 [INFO] train episode 5: reward = -500.00, steps = 500
22:55:58 [INFO] train episode 6: reward = -500.00, steps = 500
22:56:02 [INFO] train episode 7: reward = -500.00, steps = 500
22:56:06 [INFO] train episode 8: reward = -500.00, steps = 500
22:56:11 [INFO] train episode 9: reward = -500.00, steps = 500
22:56:15 [INFO] train episode 10: reward = -500.00, steps = 500
22:56:20 [INFO] train episode 11: reward = -500.00, steps = 500
22:56:24 [INFO] train episode 12: reward = -500.00, steps = 500
22:56:28 [INFO] train episode 13: reward = -500.00, steps = 500
22:56:32 [INFO] train episode 14: reward = -500.00, steps = 500
22:56:36 [INFO] train episode 15: reward = -500.00, steps = 500
22:56:41 [INFO] train episode 16: reward = -500.00, steps = 500
22:56:46 [INFO] train episode 17: reward = -500.00, steps = 500
22:56:51 [INFO] train episode 18: reward = -500.00, steps = 500
22:56:56 [INFO] train episode 19: reward = -500.00, steps = 500
22:57:01 [INFO] train episode 20: reward = -500.00, steps = 500
22:57:06 [INFO] train episode 21: reward = -500.00, steps = 500
22:57:10 [INFO] train episode 22: reward = -500.00, steps = 500
22:57:15 [INFO] train episode 23: reward = -500.00, steps = 500
22:57:20 [INFO] train episode 24: reward = -500.00, steps = 500
22:57:25 [INFO] train episode 25: reward = -500.00, steps = 500
22:57:29 [INFO] train episode 26: reward = -500.00, steps = 500
22:57:34 [INFO] train episode 27: reward = -500.00, steps = 500
22:57:38 [INFO] train episode 28: reward = -500.00, steps = 500
22:57:42 [INFO] train episode 29: reward = -500.00, steps = 500
22:57:46 [INFO] train episode 30: reward = -500.00, steps = 500
22:57:50 [INFO] train episode 31: reward = -500.00, steps = 500
22:57:54 [INFO] train episode 32: reward = -500.00, steps = 500
22:57:58 [INFO] train episode 33: reward = -500.00, steps = 500
22:58:02 [INFO] train episode 34: reward = -500.00, steps = 500
22:58:06 [INFO] train episode 35: reward = -500.00, steps = 500
22:58:10 [INFO] train episode 36: reward = -500.00, steps = 500
22:58:14 [INFO] train episode 37: reward = -500.00, steps = 500
22:58:17 [INFO] train episode 38: reward = -500.00, steps = 500
22:58:20 [INFO] train episode 39: reward = -500.00, steps = 500
22:58:23 [INFO] train episode 40: reward = -500.00, steps = 500
22:58:26 [INFO] train episode 41: reward = -500.00, steps = 500
22:58:27 [INFO] train episode 42: reward = -158.00, steps = 159
22:58:27 [INFO] train episode 43: reward = -110.00, steps = 111
22:58:28 [INFO] train episode 44: reward = -106.00, steps = 107
22:58:29 [INFO] train episode 45: reward = -99.00, steps = 100
22:58:29 [INFO] train episode 46: reward = -76.00, steps = 77
22:58:30 [INFO] train episode 47: reward = -98.00, steps = 99
22:58:30 [INFO] train episode 48: reward = -105.00, steps = 106
22:58:31 [INFO] train episode 49: reward = -78.00, steps = 79
22:58:31 [INFO] train episode 50: reward = -84.00, steps = 85
22:58:32 [INFO] train episode 51: reward = -80.00, steps = 81
22:58:32 [INFO] ==== test ====
22:58:32 [INFO] test episode 0: reward = -89.00, steps = 90
22:58:32 [INFO] test episode 1: reward = -103.00, steps = 104
22:58:32 [INFO] test episode 2: reward = -109.00, steps = 110
22:58:32 [INFO] test episode 3: reward = -83.00, steps = 84
22:58:32 [INFO] test episode 4: reward = -103.00, steps = 104
22:58:33 [INFO] test episode 5: reward = -84.00, steps = 85
22:58:33 [INFO] test episode 6: reward = -85.00, steps = 86
22:58:33 [INFO] test episode 7: reward = -102.00, steps = 103
22:58:33 [INFO] test episode 8: reward = -74.00, steps = 75
22:58:33 [INFO] test episode 9: reward = -92.00, steps = 93
22:58:33 [INFO] test episode 10: reward = -101.00, steps = 102
22:58:33 [INFO] test episode 11: reward = -107.00, steps = 108
22:58:33 [INFO] test episode 12: reward = -110.00, steps = 111
22:58:33 [INFO] test episode 13: reward = -99.00, steps = 100
22:58:34 [INFO] test episode 14: reward = -98.00, steps = 99
22:58:34 [INFO] test episode 15: reward = -123.00, steps = 124
22:58:34 [INFO] test episode 16: reward = -103.00, steps = 104
22:58:34 [INFO] test episode 17: reward = -93.00, steps = 94
22:58:34 [INFO] test episode 18: reward = -82.00, steps = 83
22:58:34 [INFO] test episode 19: reward = -85.00, steps = 86
22:58:34 [INFO] test episode 20: reward = -87.00, steps = 88
22:58:34 [INFO] test episode 21: reward = -123.00, steps = 124
22:58:34 [INFO] test episode 22: reward = -93.00, steps = 94
22:58:35 [INFO] test episode 23: reward = -102.00, steps = 103
22:58:35 [INFO] test episode 24: reward = -251.00, steps = 252
22:58:35 [INFO] test episode 25: reward = -136.00, steps = 137
22:58:35 [INFO] test episode 26: reward = -109.00, steps = 110
22:58:35 [INFO] test episode 27: reward = -76.00, steps = 77
22:58:35 [INFO] test episode 28: reward = -95.00, steps = 96
22:58:35 [INFO] test episode 29: reward = -70.00, steps = 71
22:58:35 [INFO] test episode 30: reward = -88.00, steps = 89
22:58:36 [INFO] test episode 31: reward = -106.00, steps = 107
22:58:36 [INFO] test episode 32: reward = -86.00, steps = 87
22:58:36 [INFO] test episode 33: reward = -72.00, steps = 73
22:58:36 [INFO] test episode 34: reward = -97.00, steps = 98
22:58:36 [INFO] test episode 35: reward = -136.00, steps = 137
22:58:36 [INFO] test episode 36: reward = -107.00, steps = 108
22:58:36 [INFO] test episode 37: reward = -109.00, steps = 110
22:58:36 [INFO] test episode 38: reward = -109.00, steps = 110
22:58:36 [INFO] test episode 39: reward = -99.00, steps = 100
22:58:37 [INFO] test episode 40: reward = -86.00, steps = 87
22:58:37 [INFO] test episode 41: reward = -139.00, steps = 140
22:58:37 [INFO] test episode 42: reward = -113.00, steps = 114
22:58:37 [INFO] test episode 43: reward = -500.00, steps = 500
22:58:37 [INFO] test episode 44: reward = -94.00, steps = 95
22:58:37 [INFO] test episode 45: reward = -95.00, steps = 96
22:58:38 [INFO] test episode 46: reward = -82.00, steps = 83
22:58:38 [INFO] test episode 47: reward = -84.00, steps = 85
22:58:38 [INFO] test episode 48: reward = -92.00, steps = 93
22:58:38 [INFO] test episode 49: reward = -500.00, steps = 500
22:58:39 [INFO] test episode 50: reward = -111.00, steps = 112
22:58:39 [INFO] test episode 51: reward = -83.00, steps = 84
22:58:39 [INFO] test episode 52: reward = -97.00, steps = 98
22:58:39 [INFO] test episode 53: reward = -116.00, steps = 117
22:58:39 [INFO] test episode 54: reward = -91.00, steps = 92
22:58:39 [INFO] test episode 55: reward = -90.00, steps = 91
22:58:39 [INFO] test episode 56: reward = -86.00, steps = 87
22:58:39 [INFO] test episode 57: reward = -95.00, steps = 96
22:58:39 [INFO] test episode 58: reward = -111.00, steps = 112
22:58:40 [INFO] test episode 59: reward = -67.00, steps = 68
22:58:40 [INFO] test episode 60: reward = -82.00, steps = 83
22:58:40 [INFO] test episode 61: reward = -80.00, steps = 81
22:58:40 [INFO] test episode 62: reward = -81.00, steps = 82
22:58:40 [INFO] test episode 63: reward = -64.00, steps = 65
22:58:40 [INFO] test episode 64: reward = -109.00, steps = 110
22:58:40 [INFO] test episode 65: reward = -106.00, steps = 107
22:58:40 [INFO] test episode 66: reward = -96.00, steps = 97
22:58:40 [INFO] test episode 67: reward = -118.00, steps = 119
22:58:41 [INFO] test episode 68: reward = -156.00, steps = 157
22:58:41 [INFO] test episode 69: reward = -91.00, steps = 92
22:58:41 [INFO] test episode 70: reward = -107.00, steps = 108
22:58:41 [INFO] test episode 71: reward = -74.00, steps = 75
22:58:41 [INFO] test episode 72: reward = -93.00, steps = 94
22:58:41 [INFO] test episode 73: reward = -81.00, steps = 82
22:58:42 [INFO] test episode 74: reward = -500.00, steps = 500
22:58:42 [INFO] test episode 75: reward = -98.00, steps = 99
22:58:42 [INFO] test episode 76: reward = -200.00, steps = 201
22:58:42 [INFO] test episode 77: reward = -106.00, steps = 107
22:58:42 [INFO] test episode 78: reward = -102.00, steps = 103
22:58:42 [INFO] test episode 79: reward = -183.00, steps = 184
22:58:42 [INFO] test episode 80: reward = -102.00, steps = 103
22:58:43 [INFO] test episode 81: reward = -75.00, steps = 76
22:58:43 [INFO] test episode 82: reward = -100.00, steps = 101
22:58:43 [INFO] test episode 83: reward = -95.00, steps = 96
22:58:43 [INFO] test episode 84: reward = -76.00, steps = 77
22:58:43 [INFO] test episode 85: reward = -131.00, steps = 132
22:58:43 [INFO] test episode 86: reward = -116.00, steps = 117
22:58:43 [INFO] test episode 87: reward = -91.00, steps = 92
22:58:43 [INFO] test episode 88: reward = -99.00, steps = 100
22:58:44 [INFO] test episode 89: reward = -85.00, steps = 86
22:58:44 [INFO] test episode 90: reward = -90.00, steps = 91
22:58:44 [INFO] test episode 91: reward = -115.00, steps = 116
22:58:44 [INFO] test episode 92: reward = -90.00, steps = 91
22:58:44 [INFO] test episode 93: reward = -77.00, steps = 78
22:58:44 [INFO] test episode 94: reward = -99.00, steps = 100
22:58:44 [INFO] test episode 95: reward = -121.00, steps = 122
22:58:44 [INFO] test episode 96: reward = -79.00, steps = 80
22:58:44 [INFO] test episode 97: reward = -98.00, steps = 99
22:58:45 [INFO] test episode 98: reward = -89.00, steps = 90
22:58:45 [INFO] test episode 99: reward = -107.00, steps = 108
22:58:45 [INFO] average episode reward = -112.70 ± 72.79
env.close()