Use SARSA($\lambda$) to Play MountainCar-v0¶

In [1]:
%matplotlib inline

import sys
import logging
import itertools

import numpy as np
np.random.seed(0)  # fix the global NumPy seed so training runs are reproducible
import gym
import matplotlib.pyplot as plt

# Log INFO-level messages to stdout with a HH:MM:SS timestamp prefix.
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
In [2]:
# Create the environment and dump its attributes (and its spec's) for reference.
env = gym.make('MountainCar-v0')
for name, value in vars(env).items():
    logging.info('%s: %s', name, value)
for name, value in vars(env.spec).items():
    logging.info('%s: %s', name, value)
15:04:18 [INFO] env: <MountainCarEnv<MountainCar-v0>>
15:04:18 [INFO] action_space: Discrete(3)
15:04:18 [INFO] observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)
15:04:18 [INFO] reward_range: (-inf, inf)
15:04:18 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30}
15:04:18 [INFO] _max_episode_steps: 200
15:04:18 [INFO] _elapsed_steps: None
15:04:18 [INFO] id: MountainCar-v0
15:04:18 [INFO] entry_point: gym.envs.classic_control:MountainCarEnv
15:04:18 [INFO] reward_threshold: -110.0
15:04:18 [INFO] nondeterministic: False
15:04:18 [INFO] max_episode_steps: 200
15:04:18 [INFO] _kwargs: {}
15:04:18 [INFO] _env_name: MountainCar
In [3]:
class TileCoder:
    def __init__(self, layer_count, feature_count):
        self.layer_count = layer_count
        self.feature_count = feature_count
        self.codebook = {}

    def get_feature(self, codeword):
        if codeword in self.codebook:
            return self.codebook[codeword]
        count = len(self.codebook)
        if count >= self.feature_count:  # resolve conflicts
            return hash(codeword) % self.feature_count
        self.codebook[codeword] = count
        return count

    def __call__(self, floats=(), ints=()):
        dim = len(floats)
        scaled_floats = tuple(f * (self.layer_count ** 2) for f in floats)
        features = []
        for layer in range(self.layer_count):
            codeword = (layer,) + tuple(
                    int((f + (1 + dim * i) * layer) / self.layer_count)
                    for i, f in enumerate(scaled_floats)) + ints
            feature = self.get_feature(codeword)
            features.append(feature)
        return features
In [4]:
class SARSALambdaAgent:
    """SARSA(λ) agent with a linear action-value function over tile-coded features.

    q(s, a) is the sum of the weights of the active tile features for (s, a).
    Learning uses replacing eligibility traces.
    """

    def __init__(self, env, lambd=0.9, epsilon=0.001, gamma=1.,
            learning_rate=0.03):
        """
        Args:
            env: environment exposing `action_space.n` and a Box
                `observation_space` (its low/high bounds normalize observations).
            lambd: trace-decay parameter λ (default equals the previously
                hard-coded 0.9 in `learn`).
            epsilon: exploration probability during training (default equals
                the previously hard-coded 0.001 in `step`).
            gamma: discount factor.
            learning_rate: step size of the weight update.
        """
        self.action_n = env.action_space.n
        self.obs_low = env.observation_space.low
        self.obs_scale = env.observation_space.high - \
                env.observation_space.low
        self.encoder = TileCoder(8, 1896)  # 8 tilings, 1896 weights total
        self.w = np.zeros(self.encoder.feature_count)
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.lambd = lambd      # eligibility-trace decay λ
        self.epsilon = epsilon  # exploration rate while training

    def encode(self, observation, action):
        """Map (observation, action) to the list of active tile-feature indices."""
        states = tuple((observation - self.obs_low) / self.obs_scale)
        actions = (action,)
        return self.encoder(states, actions)

    def get_q(self, observation, action):
        """Action value: sum of the weights over the active features."""
        features = self.encode(observation, action)
        return self.w[features].sum()

    def reset(self, mode=None):
        """Start an episode; in 'train' mode, clear the trajectory and traces."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []
            self.z = np.zeros(self.encoder.feature_count)  # eligibility trace

    def step(self, observation, reward, terminated):
        """Pick an action (ε-greedy in training, greedy otherwise) and learn online."""
        if self.mode == 'train' and np.random.rand() < self.epsilon:
            action = np.random.randint(self.action_n)
        else:
            qs = [self.get_q(observation, action) for action in
                    range(self.action_n)]
            action = np.argmax(qs)
        if self.mode == 'train':
            self.trajectory += [observation, reward, terminated, action]
            if len(self.trajectory) >= 8:  # a full (s, a, r, s', a') is available
                self.learn()
        return action

    def close(self):
        pass

    def learn(self):
        """One SARSA(λ) update using the two most recent transitions."""
        observation, _, _, action, next_observation, reward, terminated, \
                next_action = self.trajectory[-8:]
        # Bootstrap target; (1 - terminated) zeroes the bootstrap at episode end.
        target = reward + (1. - terminated) * self.gamma * \
                self.get_q(next_observation, next_action)
        td_error = target - self.get_q(observation, action)

        # Replacing traces: decay everything, then set active features to 1.
        self.z *= (self.gamma * self.lambd)
        features = self.encode(observation, action)
        self.z[features] = 1.

        self.w += (self.learning_rate * td_error * self.z)


# Build the SARSA(λ) agent for the environment created above.
agent = SARSALambdaAgent(env)
In [5]:
def play_episode(env, agent, seed=None, mode=None, render=False):
    """Play one episode and return (episode_reward, elapsed_steps).

    Args:
        env: environment with the 5-tuple `step` API and `reset(seed=...)`.
        agent: object with `reset(mode=)`, `step(obs, reward, terminated)`,
            and `close()`.
        seed: optional seed passed to `env.reset`.
        mode: forwarded to `agent.reset` ('train' enables learning).
        render: if True, render each frame.
    """
    observation, _ = env.reset(seed=seed)
    agent.reset(mode=mode)
    reward = 0.
    terminated = False
    truncated = False
    total_reward = 0.
    step_count = 0
    finished = False
    while not finished:
        # The agent always sees the latest transition, including the final one.
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            finished = True
        else:
            observation, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            step_count += 1
    agent.close()
    return total_reward, step_count


# ---- training: run episodes until the 10-episode moving average beats -110 ----
logging.info('==== train ====')
train_rewards = []
episode = 0
while True:
    reward_sum, step_count = play_episode(env, agent, seed=episode,
            mode='train')
    train_rewards.append(reward_sum)
    logging.info('train episode %d: reward = %.2f, steps = %d',
            episode, reward_sum, step_count)
    if np.mean(train_rewards[-10:]) > -110:
        break
    episode += 1
plt.plot(train_rewards)


# ---- evaluation: 100 greedy episodes with learning disabled ----
logging.info('==== test ====')
test_rewards = []
for episode in range(100):
    reward_sum, step_count = play_episode(env, agent)
    test_rewards.append(reward_sum)
    logging.info('test episode %d: reward = %.2f, steps = %d',
            episode, reward_sum, step_count)
logging.info('average episode reward = %.2f ± %.2f',
        np.mean(test_rewards), np.std(test_rewards))
15:04:18 [INFO] ==== train ====
15:04:19 [INFO] train episode 0: reward = -200.00, steps = 200
15:04:19 [INFO] train episode 1: reward = -200.00, steps = 200
15:04:19 [INFO] train episode 2: reward = -200.00, steps = 200
15:04:19 [INFO] train episode 3: reward = -200.00, steps = 200
15:04:19 [INFO] train episode 4: reward = -200.00, steps = 200
15:04:19 [INFO] train episode 5: reward = -200.00, steps = 200
15:04:20 [INFO] train episode 6: reward = -200.00, steps = 200
15:04:20 [INFO] train episode 7: reward = -200.00, steps = 200
15:04:20 [INFO] train episode 8: reward = -200.00, steps = 200
15:04:20 [INFO] train episode 9: reward = -200.00, steps = 200
15:04:20 [INFO] train episode 10: reward = -200.00, steps = 200
15:04:20 [INFO] train episode 11: reward = -200.00, steps = 200
15:04:21 [INFO] train episode 12: reward = -200.00, steps = 200
15:04:21 [INFO] train episode 13: reward = -200.00, steps = 200
15:04:21 [INFO] train episode 14: reward = -200.00, steps = 200
15:04:21 [INFO] train episode 15: reward = -200.00, steps = 200
15:04:21 [INFO] train episode 16: reward = -200.00, steps = 200
15:04:21 [INFO] train episode 17: reward = -200.00, steps = 200
15:04:22 [INFO] train episode 18: reward = -200.00, steps = 200
15:04:22 [INFO] train episode 19: reward = -200.00, steps = 200
15:04:22 [INFO] train episode 20: reward = -200.00, steps = 200
15:04:22 [INFO] train episode 21: reward = -200.00, steps = 200
15:04:22 [INFO] train episode 22: reward = -200.00, steps = 200
15:04:22 [INFO] train episode 23: reward = -200.00, steps = 200
15:04:23 [INFO] train episode 24: reward = -200.00, steps = 200
15:04:23 [INFO] train episode 25: reward = -200.00, steps = 200
15:04:23 [INFO] train episode 26: reward = -200.00, steps = 200
15:04:23 [INFO] train episode 27: reward = -200.00, steps = 200
15:04:23 [INFO] train episode 28: reward = -200.00, steps = 200
15:04:23 [INFO] train episode 29: reward = -200.00, steps = 200
15:04:23 [INFO] train episode 30: reward = -200.00, steps = 200
15:04:24 [INFO] train episode 31: reward = -200.00, steps = 200
15:04:24 [INFO] train episode 32: reward = -200.00, steps = 200
15:04:24 [INFO] train episode 33: reward = -176.00, steps = 176
15:04:24 [INFO] train episode 34: reward = -200.00, steps = 200
15:04:24 [INFO] train episode 35: reward = -200.00, steps = 200
15:04:24 [INFO] train episode 36: reward = -177.00, steps = 177
15:04:24 [INFO] train episode 37: reward = -200.00, steps = 200
15:04:25 [INFO] train episode 38: reward = -200.00, steps = 200
15:04:25 [INFO] train episode 39: reward = -200.00, steps = 200
15:04:25 [INFO] train episode 40: reward = -200.00, steps = 200
15:04:25 [INFO] train episode 41: reward = -200.00, steps = 200
15:04:25 [INFO] train episode 42: reward = -200.00, steps = 200
15:04:25 [INFO] train episode 43: reward = -200.00, steps = 200
15:04:26 [INFO] train episode 44: reward = -183.00, steps = 183
15:04:26 [INFO] train episode 45: reward = -200.00, steps = 200
15:04:26 [INFO] train episode 46: reward = -187.00, steps = 187
15:04:26 [INFO] train episode 47: reward = -200.00, steps = 200
15:04:26 [INFO] train episode 48: reward = -177.00, steps = 177
15:04:26 [INFO] train episode 49: reward = -87.00, steps = 87
15:04:26 [INFO] train episode 50: reward = -128.00, steps = 128
15:04:26 [INFO] train episode 51: reward = -110.00, steps = 110
15:04:27 [INFO] train episode 52: reward = -86.00, steps = 86
15:04:27 [INFO] train episode 53: reward = -172.00, steps = 172
15:04:27 [INFO] train episode 54: reward = -151.00, steps = 151
15:04:27 [INFO] train episode 55: reward = -108.00, steps = 108
15:04:27 [INFO] train episode 56: reward = -187.00, steps = 187
15:04:27 [INFO] train episode 57: reward = -92.00, steps = 92
15:04:27 [INFO] train episode 58: reward = -85.00, steps = 85
15:04:27 [INFO] train episode 59: reward = -144.00, steps = 144
15:04:27 [INFO] train episode 60: reward = -87.00, steps = 87
15:04:28 [INFO] train episode 61: reward = -112.00, steps = 112
15:04:28 [INFO] train episode 62: reward = -113.00, steps = 113
15:04:28 [INFO] train episode 63: reward = -165.00, steps = 165
15:04:28 [INFO] train episode 64: reward = -158.00, steps = 158
15:04:28 [INFO] train episode 65: reward = -114.00, steps = 114
15:04:28 [INFO] train episode 66: reward = -100.00, steps = 100
15:04:28 [INFO] train episode 67: reward = -157.00, steps = 157
15:04:28 [INFO] train episode 68: reward = -169.00, steps = 169
15:04:28 [INFO] train episode 69: reward = -160.00, steps = 160
15:04:29 [INFO] train episode 70: reward = -194.00, steps = 194
15:04:29 [INFO] train episode 71: reward = -158.00, steps = 158
15:04:29 [INFO] train episode 72: reward = -118.00, steps = 118
15:04:29 [INFO] train episode 73: reward = -158.00, steps = 158
15:04:29 [INFO] train episode 74: reward = -159.00, steps = 159
15:04:29 [INFO] train episode 75: reward = -157.00, steps = 157
15:04:30 [INFO] train episode 76: reward = -148.00, steps = 148
15:04:30 [INFO] train episode 77: reward = -174.00, steps = 174
15:04:30 [INFO] train episode 78: reward = -158.00, steps = 158
15:04:30 [INFO] train episode 79: reward = -127.00, steps = 127
15:04:30 [INFO] train episode 80: reward = -118.00, steps = 118
15:04:30 [INFO] train episode 81: reward = -143.00, steps = 143
15:04:30 [INFO] train episode 82: reward = -106.00, steps = 106
15:04:30 [INFO] train episode 83: reward = -198.00, steps = 198
15:04:31 [INFO] train episode 84: reward = -139.00, steps = 139
15:04:31 [INFO] train episode 85: reward = -179.00, steps = 179
15:04:31 [INFO] train episode 86: reward = -113.00, steps = 113
15:04:31 [INFO] train episode 87: reward = -143.00, steps = 143
15:04:31 [INFO] train episode 88: reward = -113.00, steps = 113
15:04:31 [INFO] train episode 89: reward = -109.00, steps = 109
15:04:31 [INFO] train episode 90: reward = -108.00, steps = 108
15:04:31 [INFO] train episode 91: reward = -109.00, steps = 109
15:04:31 [INFO] train episode 92: reward = -108.00, steps = 108
15:04:31 [INFO] train episode 93: reward = -107.00, steps = 107
15:04:32 [INFO] train episode 94: reward = -154.00, steps = 154
15:04:32 [INFO] train episode 95: reward = -153.00, steps = 153
15:04:32 [INFO] train episode 96: reward = -106.00, steps = 106
15:04:32 [INFO] train episode 97: reward = -141.00, steps = 141
15:04:32 [INFO] train episode 98: reward = -105.00, steps = 105
15:04:32 [INFO] train episode 99: reward = -105.00, steps = 105
15:04:32 [INFO] train episode 100: reward = -105.00, steps = 105
15:04:32 [INFO] train episode 101: reward = -105.00, steps = 105
15:04:32 [INFO] train episode 102: reward = -143.00, steps = 143
15:04:33 [INFO] train episode 103: reward = -145.00, steps = 145
15:04:33 [INFO] train episode 104: reward = -106.00, steps = 106
15:04:33 [INFO] train episode 105: reward = -106.00, steps = 106
15:04:33 [INFO] train episode 106: reward = -145.00, steps = 145
15:04:33 [INFO] train episode 107: reward = -105.00, steps = 105
15:04:33 [INFO] train episode 108: reward = -105.00, steps = 105
15:04:33 [INFO] train episode 109: reward = -168.00, steps = 168
15:04:33 [INFO] train episode 110: reward = -106.00, steps = 106
15:04:33 [INFO] train episode 111: reward = -105.00, steps = 105
15:04:33 [INFO] train episode 112: reward = -105.00, steps = 105
15:04:34 [INFO] train episode 113: reward = -146.00, steps = 146
15:04:34 [INFO] train episode 114: reward = -105.00, steps = 105
15:04:34 [INFO] train episode 115: reward = -145.00, steps = 145
15:04:34 [INFO] train episode 116: reward = -105.00, steps = 105
15:04:34 [INFO] train episode 117: reward = -106.00, steps = 106
15:04:34 [INFO] train episode 118: reward = -166.00, steps = 166
15:04:34 [INFO] train episode 119: reward = -154.00, steps = 154
15:04:34 [INFO] train episode 120: reward = -105.00, steps = 105
15:04:34 [INFO] train episode 121: reward = -105.00, steps = 105
15:04:34 [INFO] train episode 122: reward = -107.00, steps = 107
15:04:34 [INFO] train episode 123: reward = -106.00, steps = 106
15:04:35 [INFO] train episode 124: reward = -103.00, steps = 103
15:04:35 [INFO] train episode 125: reward = -85.00, steps = 85
15:04:35 [INFO] train episode 126: reward = -105.00, steps = 105
15:04:35 [INFO] train episode 127: reward = -104.00, steps = 104
15:04:35 [INFO] train episode 128: reward = -105.00, steps = 105
15:04:35 [INFO] ==== test ====
15:04:35 [INFO] test episode 0: reward = -106.00, steps = 106
15:04:35 [INFO] test episode 1: reward = -104.00, steps = 104
15:04:35 [INFO] test episode 2: reward = -106.00, steps = 106
15:04:35 [INFO] test episode 3: reward = -105.00, steps = 105
15:04:35 [INFO] test episode 4: reward = -106.00, steps = 106
15:04:35 [INFO] test episode 5: reward = -106.00, steps = 106
15:04:35 [INFO] test episode 6: reward = -111.00, steps = 111
15:04:35 [INFO] test episode 7: reward = -106.00, steps = 106
15:04:35 [INFO] test episode 8: reward = -105.00, steps = 105
15:04:35 [INFO] test episode 9: reward = -106.00, steps = 106
15:04:35 [INFO] test episode 10: reward = -105.00, steps = 105
15:04:35 [INFO] test episode 11: reward = -200.00, steps = 200
15:04:35 [INFO] test episode 12: reward = -196.00, steps = 196
15:04:36 [INFO] test episode 13: reward = -104.00, steps = 104
15:04:36 [INFO] test episode 14: reward = -106.00, steps = 106
15:04:36 [INFO] test episode 15: reward = -107.00, steps = 107
15:04:36 [INFO] test episode 16: reward = -106.00, steps = 106
15:04:36 [INFO] test episode 17: reward = -104.00, steps = 104
15:04:36 [INFO] test episode 18: reward = -90.00, steps = 90
15:04:36 [INFO] test episode 19: reward = -90.00, steps = 90
15:04:36 [INFO] test episode 20: reward = -88.00, steps = 88
15:04:36 [INFO] test episode 21: reward = -104.00, steps = 104
15:04:36 [INFO] test episode 22: reward = -104.00, steps = 104
15:04:36 [INFO] test episode 23: reward = -106.00, steps = 106
15:04:36 [INFO] test episode 24: reward = -106.00, steps = 106
15:04:36 [INFO] test episode 25: reward = -104.00, steps = 104
15:04:36 [INFO] test episode 26: reward = -104.00, steps = 104
15:04:36 [INFO] test episode 27: reward = -102.00, steps = 102
15:04:36 [INFO] test episode 28: reward = -104.00, steps = 104
15:04:36 [INFO] test episode 29: reward = -104.00, steps = 104
15:04:36 [INFO] test episode 30: reward = -105.00, steps = 105
15:04:36 [INFO] test episode 31: reward = -106.00, steps = 106
15:04:36 [INFO] test episode 32: reward = -85.00, steps = 85
15:04:36 [INFO] test episode 33: reward = -104.00, steps = 104
15:04:36 [INFO] test episode 34: reward = -89.00, steps = 89
15:04:37 [INFO] test episode 35: reward = -106.00, steps = 106
15:04:37 [INFO] test episode 36: reward = -123.00, steps = 123
15:04:37 [INFO] test episode 37: reward = -105.00, steps = 105
15:04:37 [INFO] test episode 38: reward = -90.00, steps = 90
15:04:37 [INFO] test episode 39: reward = -124.00, steps = 124
15:04:37 [INFO] test episode 40: reward = -105.00, steps = 105
15:04:37 [INFO] test episode 41: reward = -104.00, steps = 104
15:04:37 [INFO] test episode 42: reward = -106.00, steps = 106
15:04:37 [INFO] test episode 43: reward = -85.00, steps = 85
15:04:37 [INFO] test episode 44: reward = -107.00, steps = 107
15:04:37 [INFO] test episode 45: reward = -94.00, steps = 94
15:04:37 [INFO] test episode 46: reward = -105.00, steps = 105
15:04:37 [INFO] test episode 47: reward = -104.00, steps = 104
15:04:37 [INFO] test episode 48: reward = -87.00, steps = 87
15:04:37 [INFO] test episode 49: reward = -104.00, steps = 104
15:04:37 [INFO] test episode 50: reward = -104.00, steps = 104
15:04:37 [INFO] test episode 51: reward = -104.00, steps = 104
15:04:37 [INFO] test episode 52: reward = -94.00, steps = 94
15:04:37 [INFO] test episode 53: reward = -104.00, steps = 104
15:04:37 [INFO] test episode 54: reward = -106.00, steps = 106
15:04:37 [INFO] test episode 55: reward = -105.00, steps = 105
15:04:37 [INFO] test episode 56: reward = -104.00, steps = 104
15:04:37 [INFO] test episode 57: reward = -104.00, steps = 104
15:04:37 [INFO] test episode 58: reward = -106.00, steps = 106
15:04:38 [INFO] test episode 59: reward = -105.00, steps = 105
15:04:38 [INFO] test episode 60: reward = -120.00, steps = 120
15:04:38 [INFO] test episode 61: reward = -86.00, steps = 86
15:04:38 [INFO] test episode 62: reward = -105.00, steps = 105
15:04:38 [INFO] test episode 63: reward = -198.00, steps = 198
15:04:38 [INFO] test episode 64: reward = -106.00, steps = 106
15:04:38 [INFO] test episode 65: reward = -106.00, steps = 106
15:04:38 [INFO] test episode 66: reward = -105.00, steps = 105
15:04:38 [INFO] test episode 67: reward = -104.00, steps = 104
15:04:38 [INFO] test episode 68: reward = -106.00, steps = 106
15:04:38 [INFO] test episode 69: reward = -85.00, steps = 85
15:04:38 [INFO] test episode 70: reward = -87.00, steps = 87
15:04:38 [INFO] test episode 71: reward = -106.00, steps = 106
15:04:38 [INFO] test episode 72: reward = -91.00, steps = 91
15:04:38 [INFO] test episode 73: reward = -90.00, steps = 90
15:04:38 [INFO] test episode 74: reward = -104.00, steps = 104
15:04:38 [INFO] test episode 75: reward = -106.00, steps = 106
15:04:38 [INFO] test episode 76: reward = -104.00, steps = 104
15:04:38 [INFO] test episode 77: reward = -85.00, steps = 85
15:04:38 [INFO] test episode 78: reward = -105.00, steps = 105
15:04:38 [INFO] test episode 79: reward = -106.00, steps = 106
15:04:38 [INFO] test episode 80: reward = -105.00, steps = 105
15:04:38 [INFO] test episode 81: reward = -104.00, steps = 104
15:04:38 [INFO] test episode 82: reward = -85.00, steps = 85
15:04:39 [INFO] test episode 83: reward = -105.00, steps = 105
15:04:39 [INFO] test episode 84: reward = -89.00, steps = 89
15:04:39 [INFO] test episode 85: reward = -87.00, steps = 87
15:04:39 [INFO] test episode 86: reward = -104.00, steps = 104
15:04:39 [INFO] test episode 87: reward = -114.00, steps = 114
15:04:39 [INFO] test episode 88: reward = -92.00, steps = 92
15:04:39 [INFO] test episode 89: reward = -86.00, steps = 86
15:04:39 [INFO] test episode 90: reward = -104.00, steps = 104
15:04:39 [INFO] test episode 91: reward = -104.00, steps = 104
15:04:39 [INFO] test episode 92: reward = -104.00, steps = 104
15:04:39 [INFO] test episode 93: reward = -104.00, steps = 104
15:04:39 [INFO] test episode 94: reward = -101.00, steps = 101
15:04:39 [INFO] test episode 95: reward = -88.00, steps = 88
15:04:39 [INFO] test episode 96: reward = -112.00, steps = 112
15:04:39 [INFO] test episode 97: reward = -105.00, steps = 105
15:04:39 [INFO] test episode 98: reward = -105.00, steps = 105
15:04:39 [INFO] test episode 99: reward = -105.00, steps = 105
15:04:39 [INFO] average episode reward = -104.77 ± 18.28
In [6]:
env.close()  # release environment resources (e.g. any open render window)