PyTorch version
%matplotlib inline
import sys
import logging
import itertools
import copy
import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
torch.manual_seed(0)
# Send timestamped INFO-level log records to stdout so they show up as cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%H:%M:%S',
    stream=sys.stdout,
)

# Create the environment, then dump the instance attributes of the environment
# object and of its spec for reference.
env = gym.make('MountainCar-v0')
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)
22:49:45 [INFO] env: <MountainCarEnv<MountainCar-v0>> 22:49:45 [INFO] action_space: Discrete(3) 22:49:45 [INFO] observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32) 22:49:45 [INFO] reward_range: (-inf, inf) 22:49:45 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30} 22:49:45 [INFO] _max_episode_steps: 200 22:49:45 [INFO] _elapsed_steps: None 22:49:45 [INFO] id: MountainCar-v0 22:49:45 [INFO] entry_point: gym.envs.classic_control:MountainCarEnv 22:49:45 [INFO] reward_threshold: -110.0 22:49:45 [INFO] nondeterministic: False 22:49:45 [INFO] max_episode_steps: 200 22:49:45 [INFO] _kwargs: {} 22:49:45 [INFO] _env_name: MountainCar
class DQNReplayer:
    """Fixed-capacity ring buffer of transitions for experience replay.

    Each stored transition is the tuple
    ``(state, action, reward, next_state, terminated)``.  Once ``capacity``
    transitions have been stored, new ones overwrite the oldest.

    Transitions are kept in a plain Python list rather than a DataFrame:
    storing and sampling sit on the training hot path, and a list avoids
    both per-row indexing overhead and the risk of NumPy merging
    equal-length fields into a single 2-D array.
    """

    # Order of the fields inside every stored transition.
    fields = ('state', 'action', 'reward', 'next_state', 'terminated')

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.memory = [None] * capacity  # slot k holds a transition tuple
        self.i = 0  # slot the next transition is written to
        self.count = 0  # number of filled slots, never exceeds capacity
        self.capacity = capacity

    def store(self, *args):
        """Store one transition, overwriting the oldest when full.

        Args:
            *args: ``state, action, reward, next_state, terminated``.

        Raises:
            ValueError: if the number of values differs from ``fields``.
        """
        if len(args) != len(self.fields):
            raise ValueError('expected %d values (%s), got %d' % (
                len(self.fields), ', '.join(self.fields), len(args)))
        self.memory[self.i] = args
        self.i = (self.i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, size):
        """Draw ``size`` stored transitions uniformly, with replacement.

        Returns:
            A generator yielding one stacked ``numpy`` array per field, in
            the order given by ``fields``; the first axis of each array
            has length ``size``.

        Raises:
            ValueError: if nothing has been stored yet (raised by
                ``np.random.choice``) or, once the result is consumed,
                if ``size`` is 0 (raised by ``np.stack``).
        """
        indices = np.random.choice(self.count, size=size)
        batch = [self.memory[index] for index in indices]
        return (np.stack([transition[k] for transition in batch])
                for k in range(len(self.fields)))
class DQNAgent:
    """Deep Q-network agent with experience replay and a target network.

    ``evaluate_net`` is the network being trained and used to pick actions.
    ``target_net`` is a copy of it, refreshed at the start of every training
    episode, that supplies the bootstrapped TD targets.
    """

    def __init__(self, env):
        """Build the replay buffer, Q-network, optimizer and loss.

        Args:
            env: environment exposing ``action_space.n`` and
                ``observation_space.shape``.
        """
        self.action_n = env.action_space.n
        self.gamma = 0.99  # discount factor used in the TD target

        self.replayer = DQNReplayer(10000)

        self.evaluate_net = self.build_net(
            input_size=env.observation_space.shape[0],
            hidden_sizes=[64, 64], output_size=self.action_n)
        self.optimizer = optim.Adam(self.evaluate_net.parameters(), lr=0.001)
        self.loss = nn.MSELoss()

    def build_net(self, input_size, hidden_sizes, output_size):
        """Return a fully connected network with ReLU between layers.

        Args:
            input_size: number of input features.
            hidden_sizes: widths of the hidden layers, in order.
            output_size: number of outputs (no activation is applied to them).
        """
        sizes = [input_size, *hidden_sizes, output_size]
        layers = []
        for in_features, out_features in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        # Drop the trailing ReLU so the Q-value outputs stay unbounded.
        return nn.Sequential(*layers[:-1])

    def reset(self, mode=None):
        """Prepare for a new episode.

        Args:
            mode: ``'train'`` enables exploration, transition recording and
                learning; any other value makes the agent act greedily only.
        """
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            # Freeze a snapshot of the current network for this episode's
            # TD targets.
            self.target_net = copy.deepcopy(self.evaluate_net)

    def step(self, observation, reward, terminated):
        """Choose an action and, in train mode, record and learn.

        Args:
            observation: current observation.
            reward: reward received for the previous action.
            terminated: whether ``observation`` is a terminal state.

        Returns:
            The chosen action as a Python ``int``.
        """
        if self.mode == 'train' and np.random.rand() < 0.001:
            # epsilon-greedy policy in train mode
            action = np.random.randint(self.action_n)
        else:
            # Pure inference: no autograd graph is needed to pick an action.
            with torch.no_grad():
                state_tensor = torch.as_tensor(
                    observation, dtype=torch.float).squeeze(0)
                q_tensor = self.evaluate_net(state_tensor)
                action = torch.argmax(q_tensor).item()

        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:
                # The last two (observation, reward, terminated, action)
                # records form one complete transition.
                (state, _, _, act,
                 next_state, step_reward, step_terminated, _) = \
                    self.trajectory[-8:]
                self.replayer.store(
                    state, act, step_reward, next_state, step_terminated)
            if self.replayer.count >= self.replayer.capacity * 0.95:
                # skip first few episodes for speed
                self.learn()
        return action

    def close(self):
        """End-of-episode hook; nothing to release."""
        pass

    def learn(self):
        """Run one optimizer step on a batch sampled from the replay buffer."""
        # replay
        states, actions, rewards, next_states, terminateds = \
            self.replayer.sample(1024)
        state_tensor = torch.as_tensor(states, dtype=torch.float)
        action_tensor = torch.as_tensor(actions, dtype=torch.long)
        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)
        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)
        terminated_tensor = torch.as_tensor(terminateds, dtype=torch.float)

        # TD target r + gamma * max_a' Q_target(s', a'), with no bootstrap
        # from terminal states.  It is a constant for the update, so it is
        # computed without tracking gradients through the target network.
        with torch.no_grad():
            next_q_tensor = self.target_net(next_state_tensor)
            next_max_q_tensor = next_q_tensor.max(dim=-1).values
            target_tensor = reward_tensor + self.gamma * \
                (1. - terminated_tensor) * next_max_q_tensor

        # update value net
        pred_tensor = self.evaluate_net(state_tensor)
        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)
        loss_tensor = self.loss(q_tensor, target_tensor)
        self.optimizer.zero_grad()
        loss_tensor.backward()
        self.optimizer.step()
# Build the DQN agent for the MountainCar environment created above.
agent = DQNAgent(env)
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Run one episode of ``agent`` acting in ``env``.

    Args:
        env: environment whose ``reset(seed=...)`` returns
            ``(observation, info)`` and whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        agent: object providing ``reset(mode=...)``,
            ``step(observation, reward, terminated)`` and ``close()``.
        seed: seed forwarded to ``env.reset``.
        mode: mode forwarded to ``agent.reset`` (e.g. ``'train'``).
        render: if true, call ``env.render()`` after every agent step.

    Returns:
        ``(episode_reward, elapsed_steps)``: the sum of rewards and the
        number of environment steps taken.
    """
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    try:
        while True:
            # The agent is also shown the final observation (with its reward
            # and terminated flag) before the loop exits.
            action = agent.step(observation, reward, terminated)
            if render:
                env.render()
            if terminated or truncated:
                break
            observation, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            elapsed_steps += 1
    finally:
        # Always give the agent its end-of-episode hook, even if the
        # environment or the agent raised mid-episode.
        agent.close()
    return episode_reward, elapsed_steps
# ---- training: keep playing episodes until the mean reward of the most
# recent (up to) 10 episodes exceeds -110 ----
logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    # The episode index doubles as the environment seed.
    episode_reward, elapsed_steps = play_episode(
        env, agent, seed=episode, mode='train')
    episode_rewards.append(episode_reward)
    logging.info('train episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
    if np.mean(episode_rewards[-10:]) > -110:
        break
    episode += 1
plt.plot(episode_rewards)

# ---- evaluation: 100 episodes with the default mode and no seed ----
logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.info('test episode %d: reward = %.2f, steps = %d',
                 episode, episode_reward, elapsed_steps)
logging.info('average episode reward = %.2f ± %.2f',
             np.mean(episode_rewards), np.std(episode_rewards))
22:49:46 [INFO] ==== train ==== 22:49:46 [INFO] train episode 0: reward = -200.00, steps = 200 22:49:46 [INFO] train episode 1: reward = -200.00, steps = 200 22:49:46 [INFO] train episode 2: reward = -200.00, steps = 200 22:49:46 [INFO] train episode 3: reward = -200.00, steps = 200 22:49:47 [INFO] train episode 4: reward = -200.00, steps = 200 22:49:47 [INFO] train episode 5: reward = -200.00, steps = 200 22:49:47 [INFO] train episode 6: reward = -200.00, steps = 200 22:49:47 [INFO] train episode 7: reward = -200.00, steps = 200 22:49:47 [INFO] train episode 8: reward = -200.00, steps = 200 22:49:48 [INFO] train episode 9: reward = -200.00, steps = 200 22:49:48 [INFO] train episode 10: reward = -200.00, steps = 200 22:49:48 [INFO] train episode 11: reward = -200.00, steps = 200 22:49:48 [INFO] train episode 12: reward = -200.00, steps = 200 22:49:48 [INFO] train episode 13: reward = -200.00, steps = 200 22:49:49 [INFO] train episode 14: reward = -200.00, steps = 200 22:49:49 [INFO] train episode 15: reward = -200.00, steps = 200 22:49:49 [INFO] train episode 16: reward = -200.00, steps = 200 22:49:49 [INFO] train episode 17: reward = -200.00, steps = 200 22:49:49 [INFO] train episode 18: reward = -200.00, steps = 200 22:49:50 [INFO] train episode 19: reward = -200.00, steps = 200 22:49:50 [INFO] train episode 20: reward = -200.00, steps = 200 22:49:50 [INFO] train episode 21: reward = -200.00, steps = 200 22:49:50 [INFO] train episode 22: reward = -200.00, steps = 200 22:49:50 [INFO] train episode 23: reward = -200.00, steps = 200 22:49:51 [INFO] train episode 24: reward = -200.00, steps = 200 22:49:51 [INFO] train episode 25: reward = -200.00, steps = 200 22:49:51 [INFO] train episode 26: reward = -200.00, steps = 200 22:49:51 [INFO] train episode 27: reward = -200.00, steps = 200 22:49:52 [INFO] train episode 28: reward = -200.00, steps = 200 22:49:52 [INFO] train episode 29: reward = -200.00, steps = 200 22:49:52 [INFO] train episode 30: reward = -200.00, steps 
= 200 22:49:52 [INFO] train episode 31: reward = -200.00, steps = 200 22:49:52 [INFO] train episode 32: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 33: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 34: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 35: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 36: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 37: reward = -200.00, steps = 200 22:49:53 [INFO] train episode 38: reward = -200.00, steps = 200 22:49:54 [INFO] train episode 39: reward = -200.00, steps = 200 22:49:54 [INFO] train episode 40: reward = -200.00, steps = 200 22:49:54 [INFO] train episode 41: reward = -200.00, steps = 200 22:49:54 [INFO] train episode 42: reward = -200.00, steps = 200 22:49:54 [INFO] train episode 43: reward = -200.00, steps = 200 22:49:55 [INFO] train episode 44: reward = -200.00, steps = 200 22:49:55 [INFO] train episode 45: reward = -200.00, steps = 200 22:49:55 [INFO] train episode 46: reward = -200.00, steps = 200 22:50:12 [INFO] train episode 47: reward = -200.00, steps = 200 22:51:00 [INFO] train episode 48: reward = -200.00, steps = 200 22:51:47 [INFO] train episode 49: reward = -200.00, steps = 200 22:52:34 [INFO] train episode 50: reward = -200.00, steps = 200 22:53:22 [INFO] train episode 51: reward = -200.00, steps = 200 22:54:13 [INFO] train episode 52: reward = -200.00, steps = 200 22:55:03 [INFO] train episode 53: reward = -200.00, steps = 200 22:55:51 [INFO] train episode 54: reward = -200.00, steps = 200 22:56:38 [INFO] train episode 55: reward = -200.00, steps = 200 22:57:24 [INFO] train episode 56: reward = -200.00, steps = 200 22:58:12 [INFO] train episode 57: reward = -200.00, steps = 200 22:59:00 [INFO] train episode 58: reward = -200.00, steps = 200 22:59:47 [INFO] train episode 59: reward = -200.00, steps = 200 23:00:48 [INFO] train episode 60: reward = -200.00, steps = 200 23:01:50 [INFO] train episode 61: reward = -200.00, steps = 200 23:02:40 
[INFO] train episode 62: reward = -200.00, steps = 200 23:03:39 [INFO] train episode 63: reward = -200.00, steps = 200 23:04:30 [INFO] train episode 64: reward = -200.00, steps = 200 23:05:24 [INFO] train episode 65: reward = -200.00, steps = 200 23:06:18 [INFO] train episode 66: reward = -200.00, steps = 200 23:07:08 [INFO] train episode 67: reward = -200.00, steps = 200 23:07:59 [INFO] train episode 68: reward = -200.00, steps = 200 23:08:51 [INFO] train episode 69: reward = -200.00, steps = 200 23:09:44 [INFO] train episode 70: reward = -200.00, steps = 200 23:10:40 [INFO] train episode 71: reward = -200.00, steps = 200 23:11:43 [INFO] train episode 72: reward = -200.00, steps = 200 23:13:10 [INFO] train episode 73: reward = -200.00, steps = 200 23:14:46 [INFO] train episode 74: reward = -200.00, steps = 200 23:16:21 [INFO] train episode 75: reward = -200.00, steps = 200 23:17:53 [INFO] train episode 76: reward = -200.00, steps = 200 23:19:25 [INFO] train episode 77: reward = -200.00, steps = 200 23:20:59 [INFO] train episode 78: reward = -200.00, steps = 200 23:22:33 [INFO] train episode 79: reward = -200.00, steps = 200 23:24:10 [INFO] train episode 80: reward = -200.00, steps = 200 23:25:43 [INFO] train episode 81: reward = -200.00, steps = 200 23:27:19 [INFO] train episode 82: reward = -200.00, steps = 200 23:28:53 [INFO] train episode 83: reward = -200.00, steps = 200 23:30:28 [INFO] train episode 84: reward = -200.00, steps = 200 23:32:04 [INFO] train episode 85: reward = -200.00, steps = 200 23:33:36 [INFO] train episode 86: reward = -200.00, steps = 200 23:35:09 [INFO] train episode 87: reward = -200.00, steps = 200 23:36:44 [INFO] train episode 88: reward = -200.00, steps = 200 23:38:22 [INFO] train episode 89: reward = -200.00, steps = 200 23:39:57 [INFO] train episode 90: reward = -200.00, steps = 200 23:41:33 [INFO] train episode 91: reward = -200.00, steps = 200 23:43:20 [INFO] train episode 92: reward = -200.00, steps = 200 23:45:02 [INFO] train 
episode 93: reward = -200.00, steps = 200 23:46:41 [INFO] train episode 94: reward = -200.00, steps = 200 23:48:39 [INFO] train episode 95: reward = -200.00, steps = 200 23:50:22 [INFO] train episode 96: reward = -200.00, steps = 200 23:52:03 [INFO] train episode 97: reward = -200.00, steps = 200 23:53:47 [INFO] train episode 98: reward = -200.00, steps = 200 23:55:22 [INFO] train episode 99: reward = -200.00, steps = 200 23:56:56 [INFO] train episode 100: reward = -200.00, steps = 200 23:58:30 [INFO] train episode 101: reward = -200.00, steps = 200 00:00:03 [INFO] train episode 102: reward = -200.00, steps = 200 00:01:37 [INFO] train episode 103: reward = -200.00, steps = 200 00:03:10 [INFO] train episode 104: reward = -200.00, steps = 200 00:04:43 [INFO] train episode 105: reward = -200.00, steps = 200 00:06:16 [INFO] train episode 106: reward = -200.00, steps = 200 00:07:45 [INFO] train episode 107: reward = -200.00, steps = 200 00:09:19 [INFO] train episode 108: reward = -200.00, steps = 200 00:10:52 [INFO] train episode 109: reward = -200.00, steps = 200 00:12:26 [INFO] train episode 110: reward = -200.00, steps = 200 00:13:58 [INFO] train episode 111: reward = -200.00, steps = 200 00:15:31 [INFO] train episode 112: reward = -200.00, steps = 200 00:17:02 [INFO] train episode 113: reward = -200.00, steps = 200 00:18:34 [INFO] train episode 114: reward = -200.00, steps = 200 00:20:06 [INFO] train episode 115: reward = -200.00, steps = 200 00:21:39 [INFO] train episode 116: reward = -200.00, steps = 200 00:23:12 [INFO] train episode 117: reward = -200.00, steps = 200 00:24:44 [INFO] train episode 118: reward = -200.00, steps = 200 00:26:18 [INFO] train episode 119: reward = -200.00, steps = 200 00:27:53 [INFO] train episode 120: reward = -200.00, steps = 200 00:29:25 [INFO] train episode 121: reward = -200.00, steps = 200 00:30:56 [INFO] train episode 122: reward = -200.00, steps = 200 00:32:34 [INFO] train episode 123: reward = -200.00, steps = 200 00:34:06 
[INFO] train episode 124: reward = -200.00, steps = 200 00:35:36 [INFO] train episode 125: reward = -200.00, steps = 200 00:37:11 [INFO] train episode 126: reward = -200.00, steps = 200 00:38:43 [INFO] train episode 127: reward = -200.00, steps = 200 00:40:10 [INFO] train episode 128: reward = -200.00, steps = 200 00:41:42 [INFO] train episode 129: reward = -200.00, steps = 200 00:43:15 [INFO] train episode 130: reward = -200.00, steps = 200 00:44:46 [INFO] train episode 131: reward = -200.00, steps = 200 00:46:17 [INFO] train episode 132: reward = -200.00, steps = 200 00:47:48 [INFO] train episode 133: reward = -200.00, steps = 200 00:49:19 [INFO] train episode 134: reward = -200.00, steps = 200 00:50:50 [INFO] train episode 135: reward = -200.00, steps = 200 00:52:22 [INFO] train episode 136: reward = -200.00, steps = 200 00:53:53 [INFO] train episode 137: reward = -200.00, steps = 200 00:55:24 [INFO] train episode 138: reward = -200.00, steps = 200 00:56:55 [INFO] train episode 139: reward = -200.00, steps = 200 00:58:13 [INFO] train episode 140: reward = -200.00, steps = 200 00:59:36 [INFO] train episode 141: reward = -200.00, steps = 200 01:01:07 [INFO] train episode 142: reward = -200.00, steps = 200 01:02:38 [INFO] train episode 143: reward = -200.00, steps = 200 01:04:08 [INFO] train episode 144: reward = -200.00, steps = 200 01:05:38 [INFO] train episode 145: reward = -200.00, steps = 200 01:07:09 [INFO] train episode 146: reward = -200.00, steps = 200 01:08:42 [INFO] train episode 147: reward = -200.00, steps = 200 01:10:14 [INFO] train episode 148: reward = -200.00, steps = 200 01:11:45 [INFO] train episode 149: reward = -200.00, steps = 200 01:13:16 [INFO] train episode 150: reward = -200.00, steps = 200 01:14:48 [INFO] train episode 151: reward = -200.00, steps = 200 01:16:19 [INFO] train episode 152: reward = -200.00, steps = 200 01:17:50 [INFO] train episode 153: reward = -200.00, steps = 200 01:19:20 [INFO] train episode 154: reward = -200.00, steps 
= 200 01:20:51 [INFO] train episode 155: reward = -200.00, steps = 200 01:22:23 [INFO] train episode 156: reward = -200.00, steps = 200 01:23:55 [INFO] train episode 157: reward = -200.00, steps = 200 01:25:26 [INFO] train episode 158: reward = -200.00, steps = 200 01:27:01 [INFO] train episode 159: reward = -200.00, steps = 200 01:28:33 [INFO] train episode 160: reward = -200.00, steps = 200 01:30:05 [INFO] train episode 161: reward = -200.00, steps = 200 01:31:38 [INFO] train episode 162: reward = -200.00, steps = 200 01:33:09 [INFO] train episode 163: reward = -200.00, steps = 200 01:34:41 [INFO] train episode 164: reward = -200.00, steps = 200 01:36:11 [INFO] train episode 165: reward = -200.00, steps = 200 01:37:43 [INFO] train episode 166: reward = -200.00, steps = 200 01:39:14 [INFO] train episode 167: reward = -200.00, steps = 200 01:40:45 [INFO] train episode 168: reward = -200.00, steps = 200 01:42:17 [INFO] train episode 169: reward = -200.00, steps = 200 01:43:49 [INFO] train episode 170: reward = -200.00, steps = 200 01:45:20 [INFO] train episode 171: reward = -200.00, steps = 200 01:46:52 [INFO] train episode 172: reward = -200.00, steps = 200 01:48:23 [INFO] train episode 173: reward = -200.00, steps = 200 01:49:53 [INFO] train episode 174: reward = -200.00, steps = 200 01:51:24 [INFO] train episode 175: reward = -200.00, steps = 200 01:52:56 [INFO] train episode 176: reward = -200.00, steps = 200 01:54:28 [INFO] train episode 177: reward = -200.00, steps = 200 01:55:59 [INFO] train episode 178: reward = -200.00, steps = 200 01:57:32 [INFO] train episode 179: reward = -200.00, steps = 200 01:59:02 [INFO] train episode 180: reward = -200.00, steps = 200 02:00:32 [INFO] train episode 181: reward = -200.00, steps = 200 02:02:03 [INFO] train episode 182: reward = -200.00, steps = 200 02:03:33 [INFO] train episode 183: reward = -200.00, steps = 200 02:05:03 [INFO] train episode 184: reward = -200.00, steps = 200 02:06:33 [INFO] train episode 185: reward = 
-200.00, steps = 200 02:08:06 [INFO] train episode 186: reward = -200.00, steps = 200 02:09:38 [INFO] train episode 187: reward = -200.00, steps = 200 02:11:10 [INFO] train episode 188: reward = -200.00, steps = 200 02:12:42 [INFO] train episode 189: reward = -200.00, steps = 200 02:14:13 [INFO] train episode 190: reward = -200.00, steps = 200 02:15:44 [INFO] train episode 191: reward = -200.00, steps = 200 02:17:14 [INFO] train episode 192: reward = -200.00, steps = 200 02:18:45 [INFO] train episode 193: reward = -200.00, steps = 200 02:20:16 [INFO] train episode 194: reward = -200.00, steps = 200 02:21:48 [INFO] train episode 195: reward = -200.00, steps = 200 02:23:01 [INFO] train episode 196: reward = -157.00, steps = 157 02:24:33 [INFO] train episode 197: reward = -200.00, steps = 200 02:26:05 [INFO] train episode 198: reward = -200.00, steps = 200 02:27:39 [INFO] train episode 199: reward = -200.00, steps = 200 02:29:10 [INFO] train episode 200: reward = -200.00, steps = 200 02:30:40 [INFO] train episode 201: reward = -200.00, steps = 200 02:32:11 [INFO] train episode 202: reward = -200.00, steps = 200 02:33:42 [INFO] train episode 203: reward = -200.00, steps = 200 02:35:12 [INFO] train episode 204: reward = -200.00, steps = 200 02:36:43 [INFO] train episode 205: reward = -200.00, steps = 200 02:38:14 [INFO] train episode 206: reward = -200.00, steps = 200 02:39:31 [INFO] train episode 207: reward = -200.00, steps = 200 02:41:03 [INFO] train episode 208: reward = -200.00, steps = 200 02:42:22 [INFO] train episode 209: reward = -200.00, steps = 200 02:43:53 [INFO] train episode 210: reward = -200.00, steps = 200 02:45:24 [INFO] train episode 211: reward = -200.00, steps = 200 02:46:54 [INFO] train episode 212: reward = -200.00, steps = 200 02:48:24 [INFO] train episode 213: reward = -200.00, steps = 200 02:49:54 [INFO] train episode 214: reward = -200.00, steps = 200 02:51:25 [INFO] train episode 215: reward = -200.00, steps = 200 02:52:57 [INFO] train 
episode 216: reward = -200.00, steps = 200 02:54:30 [INFO] train episode 217: reward = -200.00, steps = 200 02:56:01 [INFO] train episode 218: reward = -200.00, steps = 200 02:57:17 [INFO] train episode 219: reward = -200.00, steps = 200 02:58:28 [INFO] train episode 220: reward = -200.00, steps = 200 02:59:40 [INFO] train episode 221: reward = -200.00, steps = 200 03:00:52 [INFO] train episode 222: reward = -200.00, steps = 200 03:02:05 [INFO] train episode 223: reward = -200.00, steps = 200 03:03:16 [INFO] train episode 224: reward = -200.00, steps = 200 03:04:29 [INFO] train episode 225: reward = -200.00, steps = 200 03:05:40 [INFO] train episode 226: reward = -200.00, steps = 200 03:06:54 [INFO] train episode 227: reward = -200.00, steps = 200 03:08:07 [INFO] train episode 228: reward = -200.00, steps = 200 03:09:19 [INFO] train episode 229: reward = -200.00, steps = 200 03:10:32 [INFO] train episode 230: reward = -200.00, steps = 200 03:11:30 [INFO] train episode 231: reward = -160.00, steps = 160 03:12:42 [INFO] train episode 232: reward = -200.00, steps = 200 03:13:53 [INFO] train episode 233: reward = -200.00, steps = 200 03:15:01 [INFO] train episode 234: reward = -187.00, steps = 187 03:16:09 [INFO] train episode 235: reward = -187.00, steps = 187 03:17:18 [INFO] train episode 236: reward = -190.00, steps = 190 03:18:29 [INFO] train episode 237: reward = -200.00, steps = 200 03:19:34 [INFO] train episode 238: reward = -185.00, steps = 185 03:20:40 [INFO] train episode 239: reward = -188.00, steps = 188 03:21:48 [INFO] train episode 240: reward = -200.00, steps = 200 03:22:53 [INFO] train episode 241: reward = -188.00, steps = 188 03:23:58 [INFO] train episode 242: reward = -192.00, steps = 192 03:25:07 [INFO] train episode 243: reward = -200.00, steps = 200 03:26:15 [INFO] train episode 244: reward = -200.00, steps = 200 03:27:27 [INFO] train episode 245: reward = -200.00, steps = 200 03:28:20 [INFO] train episode 246: reward = -155.00, steps = 155 
03:29:28 [INFO] train episode 247: reward = -199.00, steps = 199 03:30:37 [INFO] train episode 248: reward = -200.00, steps = 200 03:31:45 [INFO] train episode 249: reward = -200.00, steps = 200 03:32:52 [INFO] train episode 250: reward = -195.00, steps = 195 03:33:59 [INFO] train episode 251: reward = -200.00, steps = 200 03:35:06 [INFO] train episode 252: reward = -200.00, steps = 200 03:36:13 [INFO] train episode 253: reward = -200.00, steps = 200 03:36:56 [INFO] train episode 254: reward = -123.00, steps = 123 03:38:02 [INFO] train episode 255: reward = -191.00, steps = 191 03:39:10 [INFO] train episode 256: reward = -200.00, steps = 200 03:39:58 [INFO] train episode 257: reward = -140.00, steps = 140 03:40:41 [INFO] train episode 258: reward = -124.00, steps = 124 03:41:22 [INFO] train episode 259: reward = -121.00, steps = 121 03:42:14 [INFO] train episode 260: reward = -150.00, steps = 150 03:43:03 [INFO] train episode 261: reward = -144.00, steps = 144 03:43:38 [INFO] train episode 262: reward = -101.00, steps = 101 03:44:28 [INFO] train episode 263: reward = -145.00, steps = 145 03:45:21 [INFO] train episode 264: reward = -158.00, steps = 158 03:45:55 [INFO] train episode 265: reward = -98.00, steps = 98 03:46:29 [INFO] train episode 266: reward = -98.00, steps = 98 03:47:00 [INFO] train episode 267: reward = -92.00, steps = 92 03:47:32 [INFO] train episode 268: reward = -93.00, steps = 93 03:48:25 [INFO] train episode 269: reward = -154.00, steps = 154 03:48:55 [INFO] train episode 270: reward = -88.00, steps = 88 03:49:45 [INFO] train episode 271: reward = -149.00, steps = 149 03:50:37 [INFO] train episode 272: reward = -150.00, steps = 150 03:51:29 [INFO] train episode 273: reward = -152.00, steps = 152 03:51:59 [INFO] train episode 274: reward = -87.00, steps = 87 03:52:31 [INFO] train episode 275: reward = -92.00, steps = 92 03:53:24 [INFO] train episode 276: reward = -154.00, steps = 154 03:53:55 [INFO] train episode 277: reward = -89.00, steps = 89 
03:54:25 [INFO] train episode 278: reward = -87.00, steps = 87 03:55:17 [INFO] train episode 279: reward = -153.00, steps = 153 03:56:10 [INFO] train episode 280: reward = -153.00, steps = 153 03:57:01 [INFO] train episode 281: reward = -149.00, steps = 149 03:57:57 [INFO] train episode 282: reward = -161.00, steps = 161 03:58:47 [INFO] train episode 283: reward = -147.00, steps = 147 03:59:16 [INFO] train episode 284: reward = -84.00, steps = 84 04:00:06 [INFO] train episode 285: reward = -146.00, steps = 146 04:01:14 [INFO] train episode 286: reward = -200.00, steps = 200 04:02:03 [INFO] train episode 287: reward = -145.00, steps = 145 04:02:52 [INFO] train episode 288: reward = -143.00, steps = 143 04:03:47 [INFO] train episode 289: reward = -165.00, steps = 165 04:04:19 [INFO] train episode 290: reward = -98.00, steps = 98 04:04:48 [INFO] train episode 291: reward = -85.00, steps = 85 04:05:41 [INFO] train episode 292: reward = -160.00, steps = 160 04:06:39 [INFO] train episode 293: reward = -174.00, steps = 174 04:07:28 [INFO] train episode 294: reward = -144.00, steps = 144 04:08:28 [INFO] train episode 295: reward = -176.00, steps = 176 04:09:34 [INFO] train episode 296: reward = -187.00, steps = 187 04:10:30 [INFO] train episode 297: reward = -142.00, steps = 142 04:11:18 [INFO] train episode 298: reward = -141.00, steps = 141 04:12:07 [INFO] train episode 299: reward = -146.00, steps = 146 04:12:56 [INFO] train episode 300: reward = -147.00, steps = 147 04:13:43 [INFO] train episode 301: reward = -138.00, steps = 138 04:14:24 [INFO] train episode 302: reward = -118.00, steps = 118 04:15:21 [INFO] train episode 303: reward = -169.00, steps = 169 04:16:01 [INFO] train episode 304: reward = -120.00, steps = 120 04:16:46 [INFO] train episode 305: reward = -133.00, steps = 133 04:17:23 [INFO] train episode 306: reward = -112.00, steps = 112 04:18:05 [INFO] train episode 307: reward = -124.00, steps = 124 04:18:48 [INFO] train episode 308: reward = -129.00, 
steps = 129 04:19:40 [INFO] train episode 309: reward = -200.00, steps = 200 04:20:13 [INFO] train episode 310: reward = -122.00, steps = 122 04:21:05 [INFO] train episode 311: reward = -200.00, steps = 200 04:21:56 [INFO] train episode 312: reward = -200.00, steps = 200 04:22:47 [INFO] train episode 313: reward = -200.00, steps = 200 04:23:40 [INFO] train episode 314: reward = -200.00, steps = 200 04:24:11 [INFO] train episode 315: reward = -120.00, steps = 120 04:24:42 [INFO] train episode 316: reward = -119.00, steps = 119 04:25:14 [INFO] train episode 317: reward = -124.00, steps = 124 04:25:47 [INFO] train episode 318: reward = -128.00, steps = 128 04:26:16 [INFO] train episode 319: reward = -114.00, steps = 114 04:26:48 [INFO] train episode 320: reward = -114.00, steps = 114 04:27:18 [INFO] train episode 321: reward = -113.00, steps = 113 04:27:48 [INFO] train episode 322: reward = -116.00, steps = 116 04:28:39 [INFO] train episode 323: reward = -200.00, steps = 200 04:29:07 [INFO] train episode 324: reward = -112.00, steps = 112 04:29:39 [INFO] train episode 325: reward = -123.00, steps = 123 04:30:10 [INFO] train episode 326: reward = -121.00, steps = 121 04:30:41 [INFO] train episode 327: reward = -121.00, steps = 121 04:31:12 [INFO] train episode 328: reward = -119.00, steps = 119 04:31:43 [INFO] train episode 329: reward = -127.00, steps = 127 04:32:08 [INFO] train episode 330: reward = -117.00, steps = 117 04:32:35 [INFO] train episode 331: reward = -119.00, steps = 119 04:33:00 [INFO] train episode 332: reward = -116.00, steps = 116 04:33:34 [INFO] train episode 333: reward = -156.00, steps = 156 04:33:59 [INFO] train episode 334: reward = -110.00, steps = 110 04:34:24 [INFO] train episode 335: reward = -114.00, steps = 114 04:34:48 [INFO] train episode 336: reward = -112.00, steps = 112 04:35:08 [INFO] train episode 337: reward = -87.00, steps = 87 04:35:32 [INFO] train episode 338: reward = -113.00, steps = 113 04:35:51 [INFO] train episode 339: 
reward = -93.00, steps = 93 04:36:10 [INFO] train episode 340: reward = -84.00, steps = 84 04:36:28 [INFO] train episode 341: reward = -88.00, steps = 88 04:36:28 [INFO] ==== test ==== 04:36:28 [INFO] test episode 0: reward = -115.00, steps = 115 04:36:28 [INFO] test episode 1: reward = -158.00, steps = 158 04:36:28 [INFO] test episode 2: reward = -160.00, steps = 160 04:36:28 [INFO] test episode 3: reward = -111.00, steps = 111 04:36:28 [INFO] test episode 4: reward = -110.00, steps = 110 04:36:28 [INFO] test episode 5: reward = -86.00, steps = 86 04:36:28 [INFO] test episode 6: reward = -96.00, steps = 96 04:36:28 [INFO] test episode 7: reward = -116.00, steps = 116 04:36:29 [INFO] test episode 8: reward = -110.00, steps = 110 04:36:29 [INFO] test episode 9: reward = -111.00, steps = 111 04:36:29 [INFO] test episode 10: reward = -115.00, steps = 115 04:36:29 [INFO] test episode 11: reward = -145.00, steps = 145 04:36:29 [INFO] test episode 12: reward = -84.00, steps = 84 04:36:29 [INFO] test episode 13: reward = -116.00, steps = 116 04:36:29 [INFO] test episode 14: reward = -117.00, steps = 117 04:36:29 [INFO] test episode 15: reward = -115.00, steps = 115 04:36:29 [INFO] test episode 16: reward = -90.00, steps = 90 04:36:29 [INFO] test episode 17: reward = -176.00, steps = 176 04:36:29 [INFO] test episode 18: reward = -84.00, steps = 84 04:36:29 [INFO] test episode 19: reward = -157.00, steps = 157 04:36:29 [INFO] test episode 20: reward = -112.00, steps = 112 04:36:30 [INFO] test episode 21: reward = -195.00, steps = 195 04:36:30 [INFO] test episode 22: reward = -95.00, steps = 95 04:36:30 [INFO] test episode 23: reward = -143.00, steps = 143 04:36:30 [INFO] test episode 24: reward = -112.00, steps = 112 04:36:30 [INFO] test episode 25: reward = -86.00, steps = 86 04:36:30 [INFO] test episode 26: reward = -86.00, steps = 86 04:36:30 [INFO] test episode 27: reward = -160.00, steps = 160 04:36:30 [INFO] test episode 28: reward = -158.00, steps = 158 04:36:30 
[INFO] test episode 29: reward = -115.00, steps = 115 04:36:30 [INFO] test episode 30: reward = -114.00, steps = 114 04:36:30 [INFO] test episode 31: reward = -115.00, steps = 115 04:36:30 [INFO] test episode 32: reward = -145.00, steps = 145 04:36:31 [INFO] test episode 33: reward = -110.00, steps = 110 04:36:31 [INFO] test episode 34: reward = -90.00, steps = 90 04:36:31 [INFO] test episode 35: reward = -198.00, steps = 198 04:36:31 [INFO] test episode 36: reward = -115.00, steps = 115 04:36:31 [INFO] test episode 37: reward = -113.00, steps = 113 04:36:31 [INFO] test episode 38: reward = -112.00, steps = 112 04:36:31 [INFO] test episode 39: reward = -111.00, steps = 111 04:36:31 [INFO] test episode 40: reward = -84.00, steps = 84 04:36:31 [INFO] test episode 41: reward = -84.00, steps = 84 04:36:31 [INFO] test episode 42: reward = -200.00, steps = 200 04:36:31 [INFO] test episode 43: reward = -83.00, steps = 83 04:36:31 [INFO] test episode 44: reward = -110.00, steps = 110 04:36:31 [INFO] test episode 45: reward = -85.00, steps = 85 04:36:32 [INFO] test episode 46: reward = -116.00, steps = 116 04:36:32 [INFO] test episode 47: reward = -145.00, steps = 145 04:36:32 [INFO] test episode 48: reward = -90.00, steps = 90 04:36:32 [INFO] test episode 49: reward = -115.00, steps = 115 04:36:32 [INFO] test episode 50: reward = -91.00, steps = 91 04:36:32 [INFO] test episode 51: reward = -111.00, steps = 111 04:36:32 [INFO] test episode 52: reward = -85.00, steps = 85 04:36:32 [INFO] test episode 53: reward = -166.00, steps = 166 04:36:32 [INFO] test episode 54: reward = -88.00, steps = 88 04:36:32 [INFO] test episode 55: reward = -112.00, steps = 112 04:36:32 [INFO] test episode 56: reward = -150.00, steps = 150 04:36:32 [INFO] test episode 57: reward = -115.00, steps = 115 04:36:32 [INFO] test episode 58: reward = -85.00, steps = 85 04:36:32 [INFO] test episode 59: reward = -114.00, steps = 114 04:36:33 [INFO] test episode 60: reward = -188.00, steps = 188 04:36:33 
[INFO] test episode 61: reward = -85.00, steps = 85 04:36:33 [INFO] test episode 62: reward = -159.00, steps = 159 04:36:33 [INFO] test episode 63: reward = -110.00, steps = 110 04:36:33 [INFO] test episode 64: reward = -114.00, steps = 114 04:36:33 [INFO] test episode 65: reward = -110.00, steps = 110 04:36:33 [INFO] test episode 66: reward = -112.00, steps = 112 04:36:33 [INFO] test episode 67: reward = -88.00, steps = 88 04:36:33 [INFO] test episode 68: reward = -157.00, steps = 157 04:36:33 [INFO] test episode 69: reward = -117.00, steps = 117 04:36:33 [INFO] test episode 70: reward = -159.00, steps = 159 04:36:33 [INFO] test episode 71: reward = -110.00, steps = 110 04:36:34 [INFO] test episode 72: reward = -199.00, steps = 199 04:36:34 [INFO] test episode 73: reward = -151.00, steps = 151 04:36:34 [INFO] test episode 74: reward = -200.00, steps = 200 04:36:34 [INFO] test episode 75: reward = -95.00, steps = 95 04:36:34 [INFO] test episode 76: reward = -114.00, steps = 114 04:36:34 [INFO] test episode 77: reward = -84.00, steps = 84 04:36:34 [INFO] test episode 78: reward = -180.00, steps = 180 04:36:34 [INFO] test episode 79: reward = -111.00, steps = 111 04:36:34 [INFO] test episode 80: reward = -200.00, steps = 200 04:36:34 [INFO] test episode 81: reward = -86.00, steps = 86 04:36:34 [INFO] test episode 82: reward = -115.00, steps = 115 04:36:34 [INFO] test episode 83: reward = -110.00, steps = 110 04:36:35 [INFO] test episode 84: reward = -115.00, steps = 115 04:36:35 [INFO] test episode 85: reward = -89.00, steps = 89 04:36:35 [INFO] test episode 86: reward = -83.00, steps = 83 04:36:35 [INFO] test episode 87: reward = -158.00, steps = 158 04:36:35 [INFO] test episode 88: reward = -115.00, steps = 115 04:36:35 [INFO] test episode 89: reward = -110.00, steps = 110 04:36:35 [INFO] test episode 90: reward = -116.00, steps = 116 04:36:35 [INFO] test episode 91: reward = -84.00, steps = 84 04:36:35 [INFO] test episode 92: reward = -113.00, steps = 113 04:36:35 
[INFO] test episode 93: reward = -114.00, steps = 114 04:36:35 [INFO] test episode 94: reward = -85.00, steps = 85 04:36:35 [INFO] test episode 95: reward = -115.00, steps = 115 04:36:35 [INFO] test episode 96: reward = -86.00, steps = 86 04:36:35 [INFO] test episode 97: reward = -110.00, steps = 110 04:36:35 [INFO] test episode 98: reward = -113.00, steps = 113 04:36:36 [INFO] test episode 99: reward = -114.00, steps = 114 04:36:36 [INFO] average episode reward = -119.60 ± 31.85
# Training and evaluation are finished; close the environment.
env.close()