DDPG on Pendulum-v1 (PyTorch version)
%matplotlib inline
import sys
import logging
import itertools
import copy
import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
torch.manual_seed(0)
import torch.nn as nn
import torch.optim as optim
logging.basicConfig(level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
stream=sys.stdout, datefmt='%H:%M:%S')
env = gym.make('Pendulum-v1')
for key in vars(env.spec):
logging.info('%s: %s', key, vars(env.spec)[key])
for key in vars(env.unwrapped):
logging.info('%s: %s', key, vars(env.unwrapped)[key])
00:00:00 [INFO] id: Pendulum-v1
00:00:00 [INFO] entry_point: gym.envs.classic_control:PendulumEnv
00:00:00 [INFO] reward_threshold: None
00:00:00 [INFO] nondeterministic: False
00:00:00 [INFO] max_episode_steps: 200
00:00:00 [INFO] order_enforce: True
00:00:00 [INFO] _kwargs: {}
00:00:00 [INFO] _env_name: Pendulum
00:00:00 [INFO] max_speed: 8
00:00:00 [INFO] max_torque: 2.0
00:00:00 [INFO] dt: 0.05
00:00:00 [INFO] g: 10.0
00:00:00 [INFO] m: 1.0
00:00:00 [INFO] l: 1.0
00:00:00 [INFO] viewer: None
00:00:00 [INFO] action_space: Box([-2.], [2.], (1,), float32)
00:00:00 [INFO] observation_space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
00:00:00 [INFO] np_random: RandomState(MT19937)
00:00:00 [INFO] spec: EnvSpec(Pendulum-v1)
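As the logged spaces confirm, the observation is 3-dimensional (cos θ, sin θ, angular velocity) and the single torque action is bounded to [-2, 2]. A quick sanity check against the same env object (illustrative addition, not part of the original run):
assert env.observation_space.shape == (3,)
assert env.action_space.shape == (1,)
assert env.action_space.low[0] == -2. and env.action_space.high[0] == 2.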
class DQNReplayer:
def __init__(self, capacity):
self.memory = pd.DataFrame(index=range(capacity),
columns=['observation', 'action', 'reward',
'next_observation', 'terminated'])
self.i = 0
self.count = 0
self.capacity = capacity
    def store(self, *args):
        # ring-buffer write: once full, overwrite the oldest entry
        self.memory.loc[self.i] = np.asarray(args, dtype=object)
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        # uniform sampling with replacement over the filled entries
        indices = np.random.choice(self.count, size=size)
        return (np.stack(self.memory.loc[indices, field]) for field in
                self.memory.columns)
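A hypothetical round trip through the buffer (left commented out so the seeded run below is unaffected):
# replayer = DQNReplayer(capacity=4)
# replayer.store(np.zeros(3), np.zeros(1), -1., np.zeros(3), False)
# observations, actions, rewards, next_observations, terminateds = \
#         replayer.sample(size=1)  # one array per column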
class OrnsteinUhlenbeckProcess:
def __init__(self, x0):
self.x = x0
def __call__(self, mu=0., sigma=1., theta=.15, dt=.01):
n = np.random.normal(size=self.x.shape)
self.x += (theta * (mu - self.x) * dt + sigma * np.sqrt(dt) * n)
return self.x
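Unlike independent Gaussian noise, successive calls drift toward mu while staying temporally correlated, which gives smoother exploration for continuous torques. A sketch of its use (commented out, since extra draws would change the seeded run):
# noise = OrnsteinUhlenbeckProcess(np.zeros(1))
# for _ in range(3):
#     print(noise(sigma=0.1))  # consecutive values stay close together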
class DDPGAgent:
def __init__(self, env):
state_dim = env.observation_space.shape[0]
self.action_dim = env.action_space.shape[0]
self.action_low = env.action_space.low[0]
self.action_high = env.action_space.high[0]
self.gamma = 0.99
self.replayer = DQNReplayer(20000)
self.actor_evaluate_net = self.build_net(
input_size=state_dim, hidden_sizes=[32, 64],
output_size=self.action_dim)
self.actor_optimizer = optim.Adam(self.actor_evaluate_net.parameters(),
lr=0.0001)
self.actor_target_net = copy.deepcopy(self.actor_evaluate_net)
self.critic_evaluate_net = self.build_net(
input_size=state_dim+self.action_dim, hidden_sizes=[64, 128])
self.critic_optimizer = optim.Adam(self.critic_evaluate_net.parameters(),
lr=0.001)
self.critic_loss = nn.MSELoss()
self.critic_target_net = copy.deepcopy(self.critic_evaluate_net)
    def build_net(self, input_size, hidden_sizes, output_size=1,
            output_activator=None):
        layers = []
        for in_size, out_size in zip(
                [input_size,] + hidden_sizes, hidden_sizes + [output_size,]):
            layers.append(nn.Linear(in_size, out_size))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # drop the ReLU after the output layer
        if output_activator:
            layers.append(output_activator)
        net = nn.Sequential(*layers)
        return net
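    # For the nets above this yields, e.g., the actor
    #     Sequential(Linear(3, 32), ReLU(), Linear(32, 64), ReLU(), Linear(64, 1))
    # while the critic maps the concatenated state-action pair
    # (3 + 1 inputs) through [64, 128] hidden units to a single Q value.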
def reset(self, mode=None):
self.mode = mode
if self.mode == 'train':
self.trajectory = []
self.noise = OrnsteinUhlenbeckProcess(np.zeros((self.action_dim,)))
def step(self, observation, reward, terminated):
if self.mode == 'train' and self.replayer.count < 3000:
            # warm up the replay buffer with uniform random actions;
            # size=self.action_dim keeps the action an array, matching the
            # network output stored later
            action = np.random.uniform(self.action_low, self.action_high,
                    size=self.action_dim)
else:
state_tensor = torch.as_tensor(observation,
dtype=torch.float).reshape(1, -1)
action_tensor = self.actor_evaluate_net(state_tensor)
action = action_tensor.detach().numpy()[0]
if self.mode == 'train':
# noisy action
noise = self.noise(sigma=0.1)
action = (action + noise).clip(self.action_low, self.action_high)
self.trajectory += [observation, reward, terminated, action]
if len(self.trajectory) >= 8:
state, _, _, act, next_state, reward, terminated, _ = \
self.trajectory[-8:]
self.replayer.store(state, act, reward, next_state, terminated)
if self.replayer.count >= 3000:
self.learn()
return action
def close(self):
pass
    def update_net(self, target_net, evaluate_net, learning_rate=0.005):
        # soft (Polyak) update: target <- lr * evaluate + (1 - lr) * target
        for target_param, evaluate_param in zip(
                target_net.parameters(), evaluate_net.parameters()):
            target_param.data.copy_(learning_rate * evaluate_param.data
                    + (1 - learning_rate) * target_param.data)
def learn(self):
# replay
states, actions, rewards, next_states, terminateds = \
self.replayer.sample(64)
state_tensor = torch.as_tensor(states, dtype=torch.float)
        # actions are continuous here, so they must be float (torch.long is
        # only appropriate for discrete-action agents such as DQN)
        action_tensor = torch.as_tensor(actions, dtype=torch.float)
reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)
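        # shapes after conversion: states (64, 3), actions (64, 1),
        # rewards and terminateds (64,)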
# update critic
        next_action_tensor = self.actor_target_net(next_state_tensor)
        # smooth the target action with Gaussian noise, then clamp the sum
        # to the action bounds
        noise_tensor = (0.2 * torch.randn_like(action_tensor, dtype=torch.float))
        noisy_next_action_tensor = (next_action_tensor + noise_tensor).clamp(
                self.action_low, self.action_high)
next_state_action_tensor = torch.cat([next_state_tensor,
noisy_next_action_tensor], 1)
next_q_tensor = self.critic_target_net(next_state_action_tensor).squeeze(1)
critic_target_tensor = reward_tensor + (1. - terminated_tensor) * \
self.gamma * next_q_tensor
critic_target_tensor = critic_target_tensor.detach()
state_action_tensor = torch.cat([state_tensor, action_tensor], 1)
critic_pred_tensor = self.critic_evaluate_net(state_action_tensor
).squeeze(1)
critic_loss_tensor = self.critic_loss(critic_pred_tensor,
critic_target_tensor)
self.critic_optimizer.zero_grad()
critic_loss_tensor.backward()
self.critic_optimizer.step()
# update actor
pred_action_tensor = self.actor_evaluate_net(state_tensor)
pred_action_tensor = pred_action_tensor.clamp(self.action_low,
self.action_high)
pred_state_action_tensor = torch.cat([state_tensor, pred_action_tensor], 1)
critic_pred_tensor = self.critic_evaluate_net(pred_state_action_tensor)
actor_loss_tensor = -critic_pred_tensor.mean()
self.actor_optimizer.zero_grad()
actor_loss_tensor.backward()
self.actor_optimizer.step()
self.update_net(self.critic_target_net, self.critic_evaluate_net)
self.update_net(self.actor_target_net, self.actor_evaluate_net)
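# In learn(): the critic regresses toward the target
#     y = r + gamma * (1 - d) * Q'(s', mu'(s') + noise),
# the actor minimizes -Q(s, mu(s)), and both target networks then track
# their evaluate networks via soft updates.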
agent = DDPGAgent(env)
def play_episode(env, agent, seed=None, mode=None, render=False):
observation, _ = env.reset(seed=seed)
reward, terminated, truncated = 0., False, False
agent.reset(mode=mode)
episode_reward, elapsed_steps = 0., 0
while True:
action = agent.step(observation, reward, terminated)
if render:
env.render()
if terminated or truncated:
break
observation, reward, terminated, truncated, _ = env.step(action)
episode_reward += reward
elapsed_steps += 1
agent.close()
return episode_reward, elapsed_steps
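# Note the calling convention: agent.step receives the reward and terminated
# flag produced by the previous action and returns the next action, so the
# terminal transition reaches the agent before the loop breaks.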
logging.info('==== train ====')
episode_rewards = []
for episode in itertools.count():
episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
mode='train')
episode_rewards.append(episode_reward)
logging.info('train episode %d: reward = %.2f, steps = %d',
episode, episode_reward, elapsed_steps)
if np.mean(episode_rewards[-10:]) > -120:
break
plt.plot(episode_rewards)
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
episode_reward, elapsed_steps = play_episode(env, agent)
episode_rewards.append(episode_reward)
logging.info('test episode %d: reward = %.2f, steps = %d',
episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
np.mean(episode_rewards), np.std(episode_rewards))
00:00:02 [INFO] ==== train ====
00:00:02 [INFO] train episode 0: reward = -1744.13, steps = 200
00:00:02 [INFO] train episode 1: reward = -1025.25, steps = 200
00:00:02 [INFO] train episode 2: reward = -1590.20, steps = 200
00:00:02 [INFO] train episode 3: reward = -1137.77, steps = 200
00:00:02 [INFO] train episode 4: reward = -1675.82, steps = 200
00:00:02 [INFO] train episode 5: reward = -1632.97, steps = 200
00:00:02 [INFO] train episode 6: reward = -753.85, steps = 200
00:00:02 [INFO] train episode 7: reward = -1833.66, steps = 200
00:00:03 [INFO] train episode 8: reward = -936.49, steps = 200
00:00:03 [INFO] train episode 9: reward = -1622.68, steps = 200
00:00:03 [INFO] train episode 10: reward = -1307.43, steps = 200
00:00:03 [INFO] train episode 11: reward = -908.99, steps = 200
00:00:03 [INFO] train episode 12: reward = -1504.19, steps = 200
00:00:03 [INFO] train episode 13: reward = -1003.41, steps = 200
00:00:03 [INFO] train episode 14: reward = -921.67, steps = 200
...
01:35:37 [INFO] train episode 209: reward = -17.85, steps = 200
01:36:11 [INFO] train episode 210: reward = -145.22, steps = 200
01:36:46 [INFO] train episode 211: reward = -134.22, steps = 200
01:37:21 [INFO] train episode 212: reward = -133.58, steps = 200
01:37:54 [INFO] train episode 213: reward = -15.32, steps = 200
01:37:54 [INFO] ==== test ====
01:37:54 [INFO] test episode 0: reward = -376.42, steps = 200
01:37:54 [INFO] test episode 1: reward = -16.15, steps = 200
01:37:54 [INFO] test episode 2: reward = -17.46, steps = 200
01:37:54 [INFO] test episode 3: reward = -132.63, steps = 200
01:37:54 [INFO] test episode 4: reward = -252.30, steps = 200
01:37:55 [INFO] test episode 5: reward = -285.80, steps = 200
01:37:55 [INFO] test episode 6: reward = -133.77, steps = 200
01:37:55 [INFO] test episode 7: reward = -144.74, steps = 200
01:37:55 [INFO] test episode 8: reward = -16.54, steps = 200
01:37:55 [INFO] test episode 9: reward = -255.91, steps = 200
...
01:38:05 [INFO] test episode 95: reward = -144.99, steps = 200
01:38:05 [INFO] test episode 96: reward = -142.33, steps = 200
01:38:05 [INFO] test episode 97: reward = -133.69, steps = 200
01:38:05 [INFO] test episode 98: reward = -253.56, steps = 200
01:38:05 [INFO] test episode 99: reward = -136.17, steps = 200
01:38:05 [INFO] average episode reward = -165.17 ± 90.69
env.close()