PyTorch version
%matplotlib inline
import sys
import logging
import itertools
import copy
import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(0)
# Send INFO-level (and above) log records to stdout with an HH:MM:SS timestamp.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s [%(levelname)s] %(message)s',
)

env = gym.make('MountainCar-v0')

# Log every attribute of the environment, then every attribute of its spec.
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
22:49:52 [INFO] env: <MountainCarEnv<MountainCar-v0>> 22:49:52 [INFO] action_space: Discrete(3) 22:49:52 [INFO] observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32) 22:49:52 [INFO] reward_range: (-inf, inf) 22:49:52 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30} 22:49:52 [INFO] _max_episode_steps: 200 22:49:52 [INFO] _elapsed_steps: None 22:49:52 [INFO] id: MountainCar-v0 22:49:52 [INFO] entry_point: gym.envs.classic_control:MountainCarEnv 22:49:52 [INFO] reward_threshold: -110.0 22:49:52 [INFO] nondeterministic: False 22:49:52 [INFO] max_episode_steps: 200 22:49:52 [INFO] _kwargs: {} 22:49:52 [INFO] _env_name: MountainCar
class DQNReplayer:
    """Fixed-capacity experience store backed by a pandas DataFrame.

    Rows are written in ring-buffer order: once ``capacity`` rows have been
    stored, new rows overwrite the oldest ones.
    """

    def __init__(self, capacity):
        """Allocate an empty table with ``capacity`` rows."""
        fields = ['state', 'action', 'reward', 'next_state', 'terminated']
        self.capacity = capacity
        self.count = 0  # number of valid rows, never exceeds capacity
        self.i = 0  # row index that the next store() call writes to
        self.memory = pd.DataFrame(index=range(capacity), columns=fields)

    def store(self, *args):
        """Write one record (one value per column) at the current position."""
        record = np.asarray(args, dtype=object)
        self.memory.loc[self.i] = record
        # Advance the write position, wrapping around at the end of the table.
        following = self.i + 1
        self.i = following % self.capacity
        self.count = min(self.capacity, self.count + 1)

    def sample(self, size):
        """Draw ``size`` stored rows uniformly at random, with replacement.

        Returns a generator that yields one stacked array per column, in
        column order.
        """
        indices = np.random.choice(self.count, size=size)
        return (
            np.stack(self.memory.loc[indices, column])
            for column in self.memory.columns
        )
class DoubleDQNAgent:
    """Double DQN agent for an environment with a discrete action space.

    The agent keeps two networks of identical architecture:

    * ``evaluate_net`` is trained on every ``learn`` call and is used both to
      act and to *select* the next action when building the TD target;
    * ``target_net`` is a frozen copy (refreshed at every train-mode
      ``reset``) that *evaluates* the selected next action.
    """

    def __init__(self, env):
        """Build the networks, optimizer and replay buffer from ``env``.

        ``env`` must expose ``action_space.n`` and ``observation_space.shape``.
        """
        self.action_n = env.action_space.n
        self.gamma = 0.99  # discount factor used in the TD target

        self.replayer = DQNReplayer(10000)

        self.evaluate_net = self.build_net(
            input_size=env.observation_space.shape[0],
            hidden_sizes=[64, 64], output_size=self.action_n)
        self.optimizer = optim.Adam(self.evaluate_net.parameters(), lr=0.001)
        self.loss = nn.MSELoss()

    def build_net(self, input_size, hidden_sizes, output_size):
        """Return a fully connected network with ReLU between the layers.

        Args:
            input_size: number of input features.
            hidden_sizes: iterable with the width of each hidden layer.
            output_size: number of outputs (no activation is applied to them).
        """
        sizes = [input_size, *hidden_sizes, output_size]
        layers = []
        for in_features, out_features in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # drop the activation after the output layer
        return nn.Sequential(*layers)

    def reset(self, mode=None):
        """Start a new episode.

        In ``'train'`` mode the per-episode trajectory is cleared and the
        target network is re-synchronised with the evaluation network.
        """
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.target_net = copy.deepcopy(self.evaluate_net)

    def step(self, observation, reward, terminated):
        """Choose an action for ``observation``.

        In train mode this also records the transition that led to
        ``observation`` and, once the replay buffer is at least 95% full,
        performs one learning update.

        Returns:
            The chosen action as a Python int.
        """
        if self.mode == 'train' and np.random.rand() < 0.001:
            # epsilon-greedy policy in train mode
            action = np.random.randint(self.action_n)
        else:
            state_tensor = torch.as_tensor(observation,
                                           dtype=torch.float).reshape(1, -1)
            # Acting needs no gradients, so skip building the autograd graph.
            with torch.no_grad():
                q_tensor = self.evaluate_net(state_tensor)
            action = torch.argmax(q_tensor).item()
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # The last 8 entries are the previous step's
                # (state, _, _, action) followed by this step's
                # (next_state, reward, terminated, _).
                state, _, _, act, next_state, reward, terminated, _ = \
                    self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, terminated)
            if self.replayer.count >= self.replayer.capacity * 0.95:
                # skip first few episodes for speed
                self.learn()
        return action

    def close(self):
        """Finish an episode; nothing needs to be released."""
        pass

    def learn(self):
        """Run one gradient step on a batch of 1024 replayed transitions."""
        # replay
        states, actions, rewards, next_states, terminateds = \
            self.replayer.sample(1024)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)

        # The TD target is a constant for this update. Computing it without
        # autograd prevents backward() from writing gradients into target_net,
        # whose parameters no optimizer ever zeroes.
        with torch.no_grad():
            # Double DQN: evaluate_net picks the next action ...
            next_eval_q_tensor = self.evaluate_net(next_state_tensor)
            next_action_tensor = next_eval_q_tensor.argmax(dim=-1)
            # ... and target_net provides the value of that action.
            next_q_tensor = self.target_net(next_state_tensor)
            next_max_q_tensor = torch.gather(
                next_q_tensor, 1, next_action_tensor.unsqueeze(1)).squeeze(1)
            target_tensor = reward_tensor + self.gamma * \
                (1. - terminated_tensor) * next_max_q_tensor

        # update value net
        pred_tensor = self.evaluate_net(state_tensor)
        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)
        # nn.MSELoss expects (input, target): prediction first.
        loss_tensor = self.loss(q_tensor, target_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()
# Create the agent; its constructor reads the action count and the observation size from env.
agent = DoubleDQNAgent(env)
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env``.

    The agent is also shown the final observation (together with the last
    reward and termination flag) before the episode ends, and ``agent.close()``
    is called afterwards.

    Args:
        env: environment whose ``reset(seed=...)`` returns
            ``(observation, info)`` and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        agent: object providing ``reset(mode=...)``, ``step(...)`` and
            ``close()``.
        seed: forwarded to ``env.reset``.
        mode: forwarded to ``agent.reset``.
        render: when true, ``env.render()`` is called after every agent step.

    Returns:
        ``(episode_reward, elapsed_steps)``: the sum of rewards and the number
        of environment steps taken.
    """
    observation, _ = env.reset(seed=seed)
    agent.reset(mode=mode)

    reward = 0.
    terminated = truncated = False
    total_reward = 0.
    step_count = 0

    done = False
    while not done:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        done = bool(terminated or truncated)
        if not done:
            observation, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            step_count += 1

    agent.close()
    return total_reward, step_count
logging.info('==== train ====')
episode_rewards = []
# Train with no fixed episode budget; each episode is seeded with its index.
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(
        env, agent, mode='train', seed=episode)
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
    # Stop once the mean reward over the latest (at most 10) episodes
    # exceeds -110.
    if np.mean(episode_rewards[-10:]) > -110:
        break
plt.plot(episode_rewards)
logging.info('==== test ====')
episode_rewards = []
# Evaluate over 100 episodes with play_episode's default seed and mode.
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
# Report the mean and standard deviation of the test rewards.
logging.info('average episode reward = %.2f ± %.2f',
             np.mean(episode_rewards), np.std(episode_rewards))
22:49:52 [INFO] ==== train ==== 22:49:52 [INFO] train episode 0: reward = -200.00, steps = 200 22:49:52 [INFO] train episode 1: reward = -200.00, steps = 200 22:49:52 [INFO] train episode 2: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 3: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 4: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 5: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 6: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 7: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 8: reward = -200.00, steps = 200 22:49:54 [INFO] train episode 9: reward = -200.00, steps = 200 22:49:54 [INFO] train episode 10: reward = -200.00, steps = 200 22:49:54 [INFO] train episode 11: reward = -200.00, steps = 200 22:49:54 [INFO] train episode 12: reward = -200.00, steps = 200 22:49:54 [INFO] train episode 13: reward = -200.00, steps = 200 22:49:55 [INFO] train episode 14: reward = -200.00, steps = 200 22:49:55 [INFO] train episode 15: reward = -200.00, steps = 200 22:49:55 [INFO] train episode 16: reward = -200.00, steps = 200 22:49:55 [INFO] train episode 17: reward = -200.00, steps = 200 22:49:55 [INFO] train episode 18: reward = -200.00, steps = 200 22:49:55 [INFO] train episode 19: reward = -200.00, steps = 200 22:49:55 [INFO] train episode 20: reward = -200.00, steps = 200 22:49:56 [INFO] train episode 21: reward = -200.00, steps = 200 22:49:56 [INFO] train episode 22: reward = -200.00, steps = 200 22:49:56 [INFO] train episode 23: reward = -200.00, steps = 200 22:49:56 [INFO] train episode 24: reward = -200.00, steps = 200 22:49:56 [INFO] train episode 25: reward = -200.00, steps = 200 22:49:56 [INFO] train episode 26: reward = -200.00, steps = 200 22:49:57 [INFO] train episode 27: reward = -200.00, steps = 200 22:49:57 [INFO] train episode 28: reward = -200.00, steps = 200 22:49:57 [INFO] train episode 29: reward = -200.00, steps = 200 22:49:57 [INFO] train episode 30: reward = -200.00, steps 
= 200 22:49:57 [INFO] train episode 31: reward = -200.00, steps = 200 22:49:57 [INFO] train episode 32: reward = -200.00, steps = 200 22:49:57 [INFO] train episode 33: reward = -200.00, steps = 200 22:49:58 [INFO] train episode 34: reward = -200.00, steps = 200 22:49:58 [INFO] train episode 35: reward = -200.00, steps = 200 22:49:58 [INFO] train episode 36: reward = -200.00, steps = 200 22:49:58 [INFO] train episode 37: reward = -200.00, steps = 200 22:49:58 [INFO] train episode 38: reward = -200.00, steps = 200 22:49:58 [INFO] train episode 39: reward = -200.00, steps = 200 22:49:59 [INFO] train episode 40: reward = -200.00, steps = 200 22:49:59 [INFO] train episode 41: reward = -200.00, steps = 200 22:49:59 [INFO] train episode 42: reward = -200.00, steps = 200 22:49:59 [INFO] train episode 43: reward = -200.00, steps = 200 22:49:59 [INFO] train episode 44: reward = -200.00, steps = 200 22:49:59 [INFO] train episode 45: reward = -200.00, steps = 200 22:50:00 [INFO] train episode 46: reward = -200.00, steps = 200 22:50:18 [INFO] train episode 47: reward = -200.00, steps = 200 22:51:15 [INFO] train episode 48: reward = -200.00, steps = 200 22:52:11 [INFO] train episode 49: reward = -200.00, steps = 200 22:53:09 [INFO] train episode 50: reward = -200.00, steps = 200 22:54:09 [INFO] train episode 51: reward = -200.00, steps = 200 22:55:10 [INFO] train episode 52: reward = -200.00, steps = 200 22:56:07 [INFO] train episode 53: reward = -200.00, steps = 200 22:57:03 [INFO] train episode 54: reward = -200.00, steps = 200 22:58:00 [INFO] train episode 55: reward = -200.00, steps = 200 22:58:58 [INFO] train episode 56: reward = -200.00, steps = 200 22:59:56 [INFO] train episode 57: reward = -200.00, steps = 200 23:01:12 [INFO] train episode 58: reward = -200.00, steps = 200 23:02:20 [INFO] train episode 59: reward = -200.00, steps = 200 23:03:29 [INFO] train episode 60: reward = -200.00, steps = 200 23:04:32 [INFO] train episode 61: reward = -200.00, steps = 200 23:05:36 
[INFO] train episode 62: reward = -200.00, steps = 200 23:06:40 [INFO] train episode 63: reward = -200.00, steps = 200 23:07:42 [INFO] train episode 64: reward = -200.00, steps = 200 23:08:44 [INFO] train episode 65: reward = -200.00, steps = 200 23:09:48 [INFO] train episode 66: reward = -200.00, steps = 200 23:10:56 [INFO] train episode 67: reward = -200.00, steps = 200 23:12:21 [INFO] train episode 68: reward = -200.00, steps = 200 23:14:17 [INFO] train episode 69: reward = -200.00, steps = 200 23:16:11 [INFO] train episode 70: reward = -200.00, steps = 200 23:18:03 [INFO] train episode 71: reward = -200.00, steps = 200 23:19:55 [INFO] train episode 72: reward = -200.00, steps = 200 23:21:50 [INFO] train episode 73: reward = -200.00, steps = 200 23:23:47 [INFO] train episode 74: reward = -200.00, steps = 200 23:25:40 [INFO] train episode 75: reward = -200.00, steps = 200 23:27:37 [INFO] train episode 76: reward = -200.00, steps = 200 23:29:32 [INFO] train episode 77: reward = -200.00, steps = 200 23:31:29 [INFO] train episode 78: reward = -200.00, steps = 200 23:33:20 [INFO] train episode 79: reward = -200.00, steps = 200 23:35:13 [INFO] train episode 80: reward = -200.00, steps = 200 23:37:07 [INFO] train episode 81: reward = -200.00, steps = 200 23:39:07 [INFO] train episode 82: reward = -200.00, steps = 200 23:41:03 [INFO] train episode 83: reward = -200.00, steps = 200 23:43:09 [INFO] train episode 84: reward = -200.00, steps = 200 23:45:12 [INFO] train episode 85: reward = -200.00, steps = 200 23:47:18 [INFO] train episode 86: reward = -200.00, steps = 200 23:49:33 [INFO] train episode 87: reward = -200.00, steps = 200 23:51:38 [INFO] train episode 88: reward = -200.00, steps = 200 23:53:45 [INFO] train episode 89: reward = -200.00, steps = 200 23:55:39 [INFO] train episode 90: reward = -200.00, steps = 200 23:57:33 [INFO] train episode 91: reward = -200.00, steps = 200 23:59:26 [INFO] train episode 92: reward = -200.00, steps = 200 00:01:19 [INFO] train 
episode 93: reward = -200.00, steps = 200 00:03:13 [INFO] train episode 94: reward = -200.00, steps = 200 00:05:05 [INFO] train episode 95: reward = -200.00, steps = 200 00:06:54 [INFO] train episode 96: reward = -200.00, steps = 200 00:08:47 [INFO] train episode 97: reward = -200.00, steps = 200 00:10:41 [INFO] train episode 98: reward = -200.00, steps = 200 00:12:34 [INFO] train episode 99: reward = -200.00, steps = 200 00:14:26 [INFO] train episode 100: reward = -200.00, steps = 200 00:16:18 [INFO] train episode 101: reward = -200.00, steps = 200 00:18:09 [INFO] train episode 102: reward = -200.00, steps = 200 00:20:00 [INFO] train episode 103: reward = -200.00, steps = 200 00:21:53 [INFO] train episode 104: reward = -200.00, steps = 200 00:23:46 [INFO] train episode 105: reward = -200.00, steps = 200 00:25:39 [INFO] train episode 106: reward = -200.00, steps = 200 00:27:35 [INFO] train episode 107: reward = -200.00, steps = 200 00:29:27 [INFO] train episode 108: reward = -200.00, steps = 200 00:31:15 [INFO] train episode 109: reward = -195.00, steps = 195 00:33:12 [INFO] train episode 110: reward = -200.00, steps = 200 00:35:02 [INFO] train episode 111: reward = -200.00, steps = 200 00:36:53 [INFO] train episode 112: reward = -200.00, steps = 200 00:38:47 [INFO] train episode 113: reward = -200.00, steps = 200 00:40:33 [INFO] train episode 114: reward = -200.00, steps = 200 00:42:25 [INFO] train episode 115: reward = -200.00, steps = 200 00:44:15 [INFO] train episode 116: reward = -200.00, steps = 200 00:46:06 [INFO] train episode 117: reward = -200.00, steps = 200 00:47:56 [INFO] train episode 118: reward = -200.00, steps = 200 00:49:46 [INFO] train episode 119: reward = -200.00, steps = 200 00:51:37 [INFO] train episode 120: reward = -200.00, steps = 200 00:53:28 [INFO] train episode 121: reward = -200.00, steps = 200 00:55:18 [INFO] train episode 122: reward = -200.00, steps = 200 00:57:08 [INFO] train episode 123: reward = -200.00, steps = 200 00:58:37 
[INFO] train episode 124: reward = -200.00, steps = 200 01:00:27 [INFO] train episode 125: reward = -200.00, steps = 200 01:02:17 [INFO] train episode 126: reward = -200.00, steps = 200 01:04:06 [INFO] train episode 127: reward = -200.00, steps = 200 01:05:55 [INFO] train episode 128: reward = -200.00, steps = 200 01:07:47 [INFO] train episode 129: reward = -200.00, steps = 200 01:09:19 [INFO] train episode 130: reward = -164.00, steps = 164 01:11:11 [INFO] train episode 131: reward = -200.00, steps = 200 01:13:01 [INFO] train episode 132: reward = -200.00, steps = 200 01:14:52 [INFO] train episode 133: reward = -200.00, steps = 200 01:16:42 [INFO] train episode 134: reward = -200.00, steps = 200 01:18:03 [INFO] train episode 135: reward = -146.00, steps = 146 01:19:51 [INFO] train episode 136: reward = -200.00, steps = 200 01:21:43 [INFO] train episode 137: reward = -200.00, steps = 200 01:23:33 [INFO] train episode 138: reward = -200.00, steps = 200 01:25:24 [INFO] train episode 139: reward = -200.00, steps = 200 01:27:19 [INFO] train episode 140: reward = -200.00, steps = 200 01:29:11 [INFO] train episode 141: reward = -200.00, steps = 200 01:31:03 [INFO] train episode 142: reward = -200.00, steps = 200 01:32:54 [INFO] train episode 143: reward = -200.00, steps = 200 01:34:11 [INFO] train episode 144: reward = -139.00, steps = 139 01:36:01 [INFO] train episode 145: reward = -200.00, steps = 200 01:37:53 [INFO] train episode 146: reward = -200.00, steps = 200 01:39:21 [INFO] train episode 147: reward = -159.00, steps = 159 01:41:12 [INFO] train episode 148: reward = -200.00, steps = 200 01:43:03 [INFO] train episode 149: reward = -200.00, steps = 200 01:44:54 [INFO] train episode 150: reward = -200.00, steps = 200 01:46:44 [INFO] train episode 151: reward = -200.00, steps = 200 01:48:34 [INFO] train episode 152: reward = -200.00, steps = 200 01:50:25 [INFO] train episode 153: reward = -200.00, steps = 200 01:52:16 [INFO] train episode 154: reward = -200.00, steps 
= 200 01:54:08 [INFO] train episode 155: reward = -200.00, steps = 200 01:55:41 [INFO] train episode 156: reward = -166.00, steps = 166 01:57:33 [INFO] train episode 157: reward = -200.00, steps = 200 01:59:22 [INFO] train episode 158: reward = -200.00, steps = 200 02:00:43 [INFO] train episode 159: reward = -145.00, steps = 145 02:02:07 [INFO] train episode 160: reward = -154.00, steps = 154 02:03:40 [INFO] train episode 161: reward = -168.00, steps = 168 02:05:29 [INFO] train episode 162: reward = -200.00, steps = 200 02:07:02 [INFO] train episode 163: reward = -167.00, steps = 167 02:08:20 [INFO] train episode 164: reward = -138.00, steps = 138 02:09:37 [INFO] train episode 165: reward = -137.00, steps = 137 02:10:54 [INFO] train episode 166: reward = -136.00, steps = 136 02:12:13 [INFO] train episode 167: reward = -142.00, steps = 142 02:13:34 [INFO] train episode 168: reward = -143.00, steps = 143 02:14:53 [INFO] train episode 169: reward = -143.00, steps = 143 02:16:13 [INFO] train episode 170: reward = -143.00, steps = 143 02:17:33 [INFO] train episode 171: reward = -145.00, steps = 145 02:19:07 [INFO] train episode 172: reward = -169.00, steps = 169 02:20:33 [INFO] train episode 173: reward = -155.00, steps = 155 02:21:59 [INFO] train episode 174: reward = -153.00, steps = 153 02:23:23 [INFO] train episode 175: reward = -150.00, steps = 150 02:24:51 [INFO] train episode 176: reward = -157.00, steps = 157 02:26:33 [INFO] train episode 177: reward = -183.00, steps = 183 02:28:04 [INFO] train episode 178: reward = -159.00, steps = 159 02:29:31 [INFO] train episode 179: reward = -158.00, steps = 158 02:31:04 [INFO] train episode 180: reward = -168.00, steps = 168 02:32:47 [INFO] train episode 181: reward = -185.00, steps = 185 02:34:37 [INFO] train episode 182: reward = -200.00, steps = 200 02:36:27 [INFO] train episode 183: reward = -200.00, steps = 200 02:38:19 [INFO] train episode 184: reward = -200.00, steps = 200 02:39:39 [INFO] train episode 185: reward = 
-170.00, steps = 170 02:41:15 [INFO] train episode 186: reward = -173.00, steps = 173 02:42:54 [INFO] train episode 187: reward = -200.00, steps = 200 02:44:34 [INFO] train episode 188: reward = -182.00, steps = 182 02:45:30 [INFO] train episode 189: reward = -102.00, steps = 102 02:46:20 [INFO] train episode 190: reward = -90.00, steps = 90 02:48:09 [INFO] train episode 191: reward = -200.00, steps = 200 02:49:05 [INFO] train episode 192: reward = -102.00, steps = 102 02:50:55 [INFO] train episode 193: reward = -200.00, steps = 200 02:52:47 [INFO] train episode 194: reward = -200.00, steps = 200 02:54:25 [INFO] train episode 195: reward = -174.00, steps = 174 02:55:55 [INFO] train episode 196: reward = -162.00, steps = 162 02:57:28 [INFO] train episode 197: reward = -200.00, steps = 200 02:58:05 [INFO] train episode 198: reward = -84.00, steps = 84 02:58:42 [INFO] train episode 199: reward = -86.00, steps = 86 02:59:57 [INFO] train episode 200: reward = -171.00, steps = 171 03:00:37 [INFO] train episode 201: reward = -89.00, steps = 89 03:01:16 [INFO] train episode 202: reward = -88.00, steps = 88 03:02:05 [INFO] train episode 203: reward = -111.00, steps = 111 03:03:32 [INFO] train episode 204: reward = -200.00, steps = 200 03:04:16 [INFO] train episode 205: reward = -100.00, steps = 100 03:04:53 [INFO] train episode 206: reward = -83.00, steps = 83 03:06:04 [INFO] train episode 207: reward = -163.00, steps = 163 03:07:04 [INFO] train episode 208: reward = -134.00, steps = 134 03:08:13 [INFO] train episode 209: reward = -157.00, steps = 157 03:09:22 [INFO] train episode 210: reward = -156.00, steps = 156 03:09:59 [INFO] train episode 211: reward = -84.00, steps = 84 03:11:26 [INFO] train episode 212: reward = -200.00, steps = 200 03:12:04 [INFO] train episode 213: reward = -87.00, steps = 87 03:12:42 [INFO] train episode 214: reward = -86.00, steps = 86 03:13:53 [INFO] train episode 215: reward = -162.00, steps = 162 03:15:05 [INFO] train episode 216: reward = 
-166.00, steps = 166 03:15:45 [INFO] train episode 217: reward = -90.00, steps = 90 03:16:22 [INFO] train episode 218: reward = -84.00, steps = 84 03:17:48 [INFO] train episode 219: reward = -200.00, steps = 200 03:18:54 [INFO] train episode 220: reward = -151.00, steps = 151 03:20:17 [INFO] train episode 221: reward = -200.00, steps = 200 03:21:20 [INFO] train episode 222: reward = -151.00, steps = 151 03:22:26 [INFO] train episode 223: reward = -156.00, steps = 156 03:23:41 [INFO] train episode 224: reward = -182.00, steps = 182 03:24:46 [INFO] train episode 225: reward = -155.00, steps = 155 03:25:32 [INFO] train episode 226: reward = -112.00, steps = 112 03:26:33 [INFO] train episode 227: reward = -146.00, steps = 146 03:27:23 [INFO] train episode 228: reward = -112.00, steps = 112 03:28:13 [INFO] train episode 229: reward = -119.00, steps = 119 03:29:01 [INFO] train episode 230: reward = -116.00, steps = 116 03:29:47 [INFO] train episode 231: reward = -110.00, steps = 110 03:30:30 [INFO] train episode 232: reward = -104.00, steps = 104 03:31:16 [INFO] train episode 233: reward = -110.00, steps = 110 03:32:05 [INFO] train episode 234: reward = -119.00, steps = 119 03:32:55 [INFO] train episode 235: reward = -120.00, steps = 120 03:34:16 [INFO] train episode 236: reward = -200.00, steps = 200 03:35:04 [INFO] train episode 237: reward = -117.00, steps = 117 03:35:51 [INFO] train episode 238: reward = -116.00, steps = 116 03:36:42 [INFO] train episode 239: reward = -122.00, steps = 122 03:38:05 [INFO] train episode 240: reward = -200.00, steps = 200 03:38:54 [INFO] train episode 241: reward = -117.00, steps = 117 03:39:42 [INFO] train episode 242: reward = -117.00, steps = 117 03:41:05 [INFO] train episode 243: reward = -200.00, steps = 200 03:41:53 [INFO] train episode 244: reward = -116.00, steps = 116 03:43:16 [INFO] train episode 245: reward = -200.00, steps = 200 03:44:38 [INFO] train episode 246: reward = -200.00, steps = 200 03:45:25 [INFO] train episode 
247: reward = -116.00, steps = 116 03:46:46 [INFO] train episode 248: reward = -200.00, steps = 200 03:48:08 [INFO] train episode 249: reward = -200.00, steps = 200 03:48:56 [INFO] train episode 250: reward = -115.00, steps = 115 03:49:50 [INFO] train episode 251: reward = -132.00, steps = 132 03:50:39 [INFO] train episode 252: reward = -120.00, steps = 120 03:51:35 [INFO] train episode 253: reward = -133.00, steps = 133 03:52:25 [INFO] train episode 254: reward = -118.00, steps = 118 03:53:15 [INFO] train episode 255: reward = -122.00, steps = 122 03:54:07 [INFO] train episode 256: reward = -123.00, steps = 123 03:54:56 [INFO] train episode 257: reward = -120.00, steps = 120 03:55:43 [INFO] train episode 258: reward = -113.00, steps = 113 03:56:30 [INFO] train episode 259: reward = -111.00, steps = 111 03:57:17 [INFO] train episode 260: reward = -112.00, steps = 112 03:58:02 [INFO] train episode 261: reward = -109.00, steps = 109 03:58:48 [INFO] train episode 262: reward = -109.00, steps = 109 03:59:33 [INFO] train episode 263: reward = -109.00, steps = 109 04:00:19 [INFO] train episode 264: reward = -109.00, steps = 109 04:01:03 [INFO] train episode 265: reward = -108.00, steps = 108 04:01:48 [INFO] train episode 266: reward = -109.00, steps = 109 04:02:30 [INFO] train episode 267: reward = -101.00, steps = 101 04:02:30 [INFO] ==== test ==== 04:02:30 [INFO] test episode 0: reward = -85.00, steps = 85 04:02:30 [INFO] test episode 1: reward = -107.00, steps = 107 04:02:30 [INFO] test episode 2: reward = -85.00, steps = 85 04:02:30 [INFO] test episode 3: reward = -108.00, steps = 108 04:02:30 [INFO] test episode 4: reward = -107.00, steps = 107 04:02:30 [INFO] test episode 5: reward = -107.00, steps = 107 04:02:30 [INFO] test episode 6: reward = -86.00, steps = 86 04:02:30 [INFO] test episode 7: reward = -108.00, steps = 108 04:02:30 [INFO] test episode 8: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 9: reward = -107.00, steps = 107 04:02:31 [INFO] 
test episode 10: reward = -87.00, steps = 87 04:02:31 [INFO] test episode 11: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 12: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 13: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 14: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 15: reward = -107.00, steps = 107 04:02:31 [INFO] test episode 16: reward = -85.00, steps = 85 04:02:31 [INFO] test episode 17: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 18: reward = -107.00, steps = 107 04:02:31 [INFO] test episode 19: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 20: reward = -107.00, steps = 107 04:02:31 [INFO] test episode 21: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 22: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 23: reward = -87.00, steps = 87 04:02:31 [INFO] test episode 24: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 25: reward = -108.00, steps = 108 04:02:31 [INFO] test episode 26: reward = -107.00, steps = 107 04:02:31 [INFO] test episode 27: reward = -108.00, steps = 108 04:02:32 [INFO] test episode 28: reward = -108.00, steps = 108 04:02:32 [INFO] test episode 29: reward = -106.00, steps = 106 04:02:32 [INFO] test episode 30: reward = -84.00, steps = 84 04:02:32 [INFO] test episode 31: reward = -106.00, steps = 106 04:02:32 [INFO] test episode 32: reward = -84.00, steps = 84 04:02:32 [INFO] test episode 33: reward = -108.00, steps = 108 04:02:32 [INFO] test episode 34: reward = -108.00, steps = 108 04:02:32 [INFO] test episode 35: reward = -108.00, steps = 108 04:02:32 [INFO] test episode 36: reward = -108.00, steps = 108 04:02:32 [INFO] test episode 37: reward = -85.00, steps = 85 04:02:32 [INFO] test episode 38: reward = -108.00, steps = 108 04:02:32 [INFO] test episode 39: reward = -108.00, steps = 108 04:02:32 [INFO] test episode 40: reward = -108.00, steps = 108 04:02:32 [INFO] test episode 41: reward = -84.00, steps = 84 04:02:32 
[INFO] test episode 42: reward = -108.00, steps = 108 04:02:32 [INFO] test episode 43: reward = -86.00, steps = 86 04:02:32 [INFO] test episode 44: reward = -107.00, steps = 107 04:02:32 [INFO] test episode 45: reward = -85.00, steps = 85 04:02:32 [INFO] test episode 46: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 47: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 48: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 49: reward = -107.00, steps = 107 04:02:33 [INFO] test episode 50: reward = -107.00, steps = 107 04:02:33 [INFO] test episode 51: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 52: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 53: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 54: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 55: reward = -86.00, steps = 86 04:02:33 [INFO] test episode 56: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 57: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 58: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 59: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 60: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 61: reward = -107.00, steps = 107 04:02:33 [INFO] test episode 62: reward = -107.00, steps = 107 04:02:33 [INFO] test episode 63: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 64: reward = -108.00, steps = 108 04:02:33 [INFO] test episode 65: reward = -88.00, steps = 88 04:02:34 [INFO] test episode 66: reward = -108.00, steps = 108 04:02:34 [INFO] test episode 67: reward = -107.00, steps = 107 04:02:34 [INFO] test episode 68: reward = -108.00, steps = 108 04:02:34 [INFO] test episode 69: reward = -84.00, steps = 84 04:02:34 [INFO] test episode 70: reward = -107.00, steps = 107 04:02:34 [INFO] test episode 71: reward = -87.00, steps = 87 04:02:34 [INFO] test episode 72: reward = -85.00, steps = 85 04:02:34 [INFO] test episode 73: reward = -87.00, steps = 87 04:02:34 
[INFO] test episode 74: reward = -107.00, steps = 107 04:02:34 [INFO] test episode 75: reward = -108.00, steps = 108 04:02:34 [INFO] test episode 76: reward = -108.00, steps = 108 04:02:34 [INFO] test episode 77: reward = -108.00, steps = 108 04:02:34 [INFO] test episode 78: reward = -108.00, steps = 108 04:02:34 [INFO] test episode 79: reward = -86.00, steps = 86 04:02:34 [INFO] test episode 80: reward = -107.00, steps = 107 04:02:34 [INFO] test episode 81: reward = -108.00, steps = 108 04:02:34 [INFO] test episode 82: reward = -108.00, steps = 108 04:02:34 [INFO] test episode 83: reward = -108.00, steps = 108 04:02:34 [INFO] test episode 84: reward = -108.00, steps = 108 04:02:34 [INFO] test episode 85: reward = -108.00, steps = 108 04:02:35 [INFO] test episode 86: reward = -84.00, steps = 84 04:02:35 [INFO] test episode 87: reward = -107.00, steps = 107 04:02:35 [INFO] test episode 88: reward = -107.00, steps = 107 04:02:35 [INFO] test episode 89: reward = -108.00, steps = 108 04:02:35 [INFO] test episode 90: reward = -88.00, steps = 88 04:02:35 [INFO] test episode 91: reward = -108.00, steps = 108 04:02:35 [INFO] test episode 92: reward = -84.00, steps = 84 04:02:35 [INFO] test episode 93: reward = -108.00, steps = 108 04:02:35 [INFO] test episode 94: reward = -107.00, steps = 107 04:02:35 [INFO] test episode 95: reward = -108.00, steps = 108 04:02:35 [INFO] test episode 96: reward = -107.00, steps = 107 04:02:35 [INFO] test episode 97: reward = -108.00, steps = 108 04:02:35 [INFO] test episode 98: reward = -108.00, steps = 108 04:02:35 [INFO] test episode 99: reward = -86.00, steps = 86 04:02:35 [INFO] average episode reward = -102.59 ± 9.34
# Close the environment once training and testing are done.
env.close()