import sys
import logging
import numpy as np
np.random.seed(0)
import scipy.stats as stats
import gym
import gym.spaces as spaces
import gym.utils.seeding as seeding
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
class GaussianMABEnv(gym.Env):
    """ Multi-Armed Bandit (MAB) with Gaussian rewards """
    def __init__(self, n=10, means=None):
        super(GaussianMABEnv, self).__init__()
        self.observation_space = spaces.Box(low=0, high=0, shape=(0,), dtype=float)
        self.action_space = spaces.Discrete(n)
        # use the given arm means if provided; otherwise draw them from N(0, 1)
        self.means = means if means is not None else self.np_random.normal(size=n)

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        return np.empty(0, dtype=float), {}

    def step(self, action):
        mean = self.means[action]
        reward = self.np_random.normal(mean, 1)  # unit-variance Gaussian reward
        observation = np.empty(0, dtype=float)
        return observation, reward, True, False, {}  # every episode ends after one step
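Each episode of this environment lasts exactly one step: `step()` always returns `terminated=True`, so an episode is a single pull of one arm. A minimal sketch of using the class directly, assuming the gym 0.26-style API used throughout this section (the variable name `check_env`, the arm count `n=3`, and the fixed means are illustrative only, not part of the original code):

check_env = GaussianMABEnv(n=3, means=np.array([0., 1., 2.]))  # hypothetical instance for a quick check
_, _ = check_env.reset(seed=0)
_, reward, terminated, truncated, _ = check_env.step(2)  # pull arm 2: reward ~ N(2, 1)
logging.info('reward = %.2f, terminated = %s', reward, terminated)  # terminated is always True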
from gym.envs.registration import register

register(
    id='GaussianMABEnv-v0',
    entry_point=GaussianMABEnv,
)
env = gym.make('GaussianMABEnv-v0')
for key in vars(env):
    if key == "observation_space":
        continue
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
00:00:00 [INFO] action_space: Discrete(10)
00:00:00 [INFO] np_random: RandomState(MT19937)
00:00:00 [INFO] means: [-1.41414702 0.89361907 0.30147067 -0.69240736 1.61374064 -1.02064936 0.04337526 -0.70744904 2.20136056 -0.62931658]
00:00:00 [INFO] spec: EnvSpec(GaussianMABEnv-v0)
00:00:00 [INFO] id: GaussianMABEnv-v0
00:00:00 [INFO] entry_point: <class '__main__.GaussianMABEnv'>
00:00:00 [INFO] reward_threshold: None
00:00:00 [INFO] nondeterministic: False
00:00:00 [INFO] max_episode_steps: None
00:00:00 [INFO] _kwargs: {}
00:00:00 [INFO] _env_name: GaussianMABEnv
$\epsilon$-greedy Agent
class EpsilonGreedyAgent:
    def __init__(self, env):
        self.epsilon = 0.1
        self.action_n = env.action_space.n
        self.counts = np.zeros(self.action_n, dtype=float)
        self.qs = np.zeros(self.action_n, dtype=float)

    def reset(self, mode=None):
        self.mode = mode

    def step(self, observation, reward, terminated):
        if np.random.rand() < self.epsilon:
            action = np.random.randint(self.action_n)
        else:
            action = self.qs.argmax()
        if self.mode == 'train':
            if terminated:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        if self.mode == 'train':
            self.counts[self.action] += 1
            self.qs[self.action] += (self.reward - self.qs[self.action]) / \
                    self.counts[self.action]
agent = EpsilonGreedyAgent(env)
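The update in `close()` is the standard incremental sample-average estimate; written out, it is just the formula the code above implements:

$$N(a) \leftarrow N(a) + 1, \qquad q(a) \leftarrow q(a) + \frac{r - q(a)}{N(a)},$$

where $r$ is the reward of the episode in which arm $a$ was pulled, so $q(a)$ equals the average of all rewards observed so far for that arm.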
UCB1 Agent
class UCB1Agent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.counts = np.zeros(self.action_n, dtype=float)
        self.qs = np.zeros(self.action_n, dtype=float)

    def reset(self, mode=None):
        self.mode = mode

    def step(self, observation, reward, terminated):
        total_count = max(self.counts.sum(), 1)  # lower bounded by 1
        sqrts = np.sqrt(2 * np.log(total_count) / self.counts.clip(min=0.01))
        ucbs = self.qs + sqrts
        action = ucbs.argmax()
        if self.mode == 'train':
            if terminated:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        if self.mode == 'train':
            self.counts[self.action] += 1
            self.qs[self.action] += (self.reward - self.qs[self.action]) / \
                    self.counts[self.action]
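UCB1 picks the arm with the largest upper confidence bound; the `step()` method above computes

$$a_t = \arg\max_a \left( q(a) + \sqrt{\frac{2 \ln t}{N(a)}} \right),$$

where $t$ is the total number of pulls so far (lower-bounded by 1) and $N(a)$ is clipped to 0.01 so that arms that have never been pulled get a very large bonus instead of a division by zero.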
Bayesian UCB Agent
(using a Gaussian posterior for each arm)
class BayesianUCBAgent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.means = np.zeros(self.action_n, dtype=float)
        self.stds = np.ones(self.action_n, dtype=float)

    def reset(self, mode=None):
        self.mode = mode

    def step(self, observation, reward, terminated):
        ucbs = self.means + 3 * self.stds
        action = ucbs.argmax()
        if self.mode == 'train':
            if terminated:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        if self.mode == 'train':
            # conjugate Gaussian update (unit observation variance):
            # posterior precision = prior precision + 1
            old_var_recip = self.stds[self.action] ** -2
            old_natural_param_0 = self.means[self.action] * old_var_recip
            self.means[self.action] = (old_natural_param_0 + self.reward) / \
                    (old_var_recip + 1.)
            self.stds[self.action] = 1. / np.sqrt(old_var_recip + 1.)
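With unit-variance Gaussian rewards and a Gaussian prior $\mathcal{N}(\mu_a, \sigma_a^2)$ on each arm's mean, the posterior after observing a reward $r$ is again Gaussian; `close()` applies this conjugate update in natural-parameter form:

$$\tau_a \leftarrow \tau_a + 1, \qquad \mu_a \leftarrow \frac{\tau_a^{\text{old}} \mu_a^{\text{old}} + r}{\tau_a^{\text{old}} + 1}, \qquad \sigma_a = \tau_a^{-1/2},$$

where $\tau_a = \sigma_a^{-2}$ is the posterior precision. `step()` then scores each arm by $\mu_a + 3\sigma_a$, i.e. an upper quantile of the posterior three standard deviations above its mean.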
Thompson Sampling Agent
(using a Gaussian posterior for each arm)
class ThompsonSamplingAgent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.means = np.zeros(self.action_n, dtype=float)
        self.stds = np.ones(self.action_n, dtype=float)

    def reset(self, mode=None):
        self.mode = mode

    def step(self, observation, reward, terminated):
        samples = [np.random.normal(mean, std) for mean, std in
                zip(self.means, self.stds)]
        action = np.argmax(samples)
        if self.mode == 'train':
            if terminated:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        if self.mode == 'train':
            # same conjugate Gaussian update as the Bayesian UCB agent
            old_var_recip = self.stds[self.action] ** -2
            old_natural_param_0 = self.means[self.action] * old_var_recip
            self.means[self.action] = (old_natural_param_0 + self.reward) / \
                    (old_var_recip + 1.)
            self.stds[self.action] = 1. / np.sqrt(old_var_recip + 1.)
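Thompson sampling draws one sample from each arm's posterior and plays the arm whose sample is largest. The per-arm list comprehension in `step()` could equivalently be written as a single vectorized draw (an alternative sketch, not how the code above is written), since np.random.normal broadcasts over arrays of means and standard deviations:

samples = np.random.normal(self.means, self.stds)  # one posterior sample per arm
action = samples.argmax()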
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps
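The loop below measures performance by the realized regret over the 1000 training episodes,

$$\text{regret} = \sum_{t=1}^{1000} \left( \max_a \mu_a - r_t \right),$$

where $\mu_a$ are the true arm means (`env.means`) and $r_t$ is the reward actually received in episode $t$. Because the rewards $r_t$ are noisy, an individual trial can even report a negative regret.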
trial_regrets = []
for trial in range(100):
    # create a new agent for each trial - change agent here
    agent = EpsilonGreedyAgent(env)
    # agent = UCB1Agent(env)
    # agent = BayesianUCBAgent(env)
    # agent = ThompsonSamplingAgent(env)

    # train
    episode_rewards = []
    for episode in range(1000):
        episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
                mode='train')
        episode_rewards.append(episode_reward)
    regrets = env.means.max() - np.array(episode_rewards)
    trial_regret = regrets.sum()
    trial_regrets.append(trial_regret)

    # test
    episode_rewards = []
    for episode in range(100):
        episode_reward, elapsed_steps = play_episode(env, agent)
        episode_rewards.append(episode_reward)
    logging.info('trial %d: average episode reward = %.2f ± %.2f, regret = %.2f',
            trial, np.mean(episode_rewards), np.std(episode_rewards),
            trial_regret)
logging.info('average regret = %.2f ± %.2f',
        np.mean(trial_regrets), np.std(trial_regrets))
00:00:00 [INFO] trial 0: average episode reward = 2.34 ± 1.12, regret = 126.96
00:00:00 [INFO] trial 1: average episode reward = 2.15 ± 0.83, regret = 47.71
00:00:00 [INFO] trial 2: average episode reward = 2.21 ± 0.96, regret = 78.57
00:00:00 [INFO] trial 3: average episode reward = 2.14 ± 0.97, regret = 37.62
00:00:01 [INFO] trial 4: average episode reward = 2.30 ± 1.02, regret = 62.78
00:00:01 [INFO] trial 5: average episode reward = 2.42 ± 1.04, regret = 87.73
00:00:01 [INFO] trial 6: average episode reward = 2.14 ± 1.04, regret = 9.11
00:00:01 [INFO] trial 7: average episode reward = 2.32 ± 1.08, regret = 56.97
00:00:01 [INFO] trial 8: average episode reward = 2.27 ± 0.87, regret = 81.09
00:00:01 [INFO] trial 9: average episode reward = 2.24 ± 0.95, regret = 86.85
00:00:02 [INFO] trial 10: average episode reward = 2.02 ± 1.10, regret = 88.45
00:00:02 [INFO] trial 11: average episode reward = 2.15 ± 1.07, regret = 66.57
00:00:02 [INFO] trial 12: average episode reward = 2.20 ± 0.94, regret = 17.62
00:00:02 [INFO] trial 13: average episode reward = 2.20 ± 1.08, regret = 141.68
00:00:02 [INFO] trial 14: average episode reward = 2.13 ± 0.97, regret = 5.85
00:00:02 [INFO] trial 15: average episode reward = 2.16 ± 1.08, regret = 108.93
00:00:03 [INFO] trial 16: average episode reward = 2.21 ± 1.07, regret = 113.79
00:00:03 [INFO] trial 17: average episode reward = 2.13 ± 0.95, regret = 76.50
00:00:03 [INFO] trial 18: average episode reward = 2.24 ± 1.10, regret = 52.41
00:00:03 [INFO] trial 19: average episode reward = 2.28 ± 0.88, regret = 72.12
00:00:03 [INFO] trial 20: average episode reward = 2.25 ± 0.87, regret = 121.23
00:00:03 [INFO] trial 21: average episode reward = 2.24 ± 0.88, regret = 117.41
00:00:04 [INFO] trial 22: average episode reward = 2.25 ± 1.02, regret = 90.36
00:00:04 [INFO] trial 23: average episode reward = 2.22 ± 0.98, regret = 96.04
00:00:04 [INFO] trial 24: average episode reward = 2.10 ± 0.94, regret = 87.79
00:00:04 [INFO] trial 25: average episode reward = 2.06 ± 0.99, regret = 61.21
00:00:04 [INFO] trial 26: average episode reward = 2.20 ± 1.02, regret = 25.45
00:00:04 [INFO] trial 27: average episode reward = 2.26 ± 0.90, regret = 60.06
00:00:05 [INFO] trial 28: average episode reward = 2.26 ± 1.14, regret = 88.40
00:00:05 [INFO] trial 29: average episode reward = 2.20 ± 1.02, regret = 15.33
00:00:05 [INFO] trial 30: average episode reward = 2.22 ± 1.00, regret = 32.28
00:00:05 [INFO] trial 31: average episode reward = 2.17 ± 0.84, regret = 26.52
00:00:05 [INFO] trial 32: average episode reward = 2.24 ± 1.00, regret = 108.50
00:00:05 [INFO] trial 33: average episode reward = 2.08 ± 1.03, regret = 81.73
00:00:06 [INFO] trial 34: average episode reward = 2.34 ± 1.05, regret = 66.74
00:00:06 [INFO] trial 35: average episode reward = 2.31 ± 0.99, regret = 81.22
00:00:06 [INFO] trial 36: average episode reward = 2.31 ± 0.91, regret = 44.51
00:00:06 [INFO] trial 37: average episode reward = 1.98 ± 0.94, regret = 68.92
00:00:06 [INFO] trial 38: average episode reward = 2.05 ± 1.06, regret = 41.11
00:00:07 [INFO] trial 39: average episode reward = 2.20 ± 0.95, regret = 63.57
00:00:07 [INFO] trial 40: average episode reward = 2.31 ± 1.03, regret = 130.17
00:00:07 [INFO] trial 41: average episode reward = 2.32 ± 1.08, regret = 81.51
00:00:07 [INFO] trial 42: average episode reward = 2.15 ± 0.94, regret = 66.69
00:00:07 [INFO] trial 43: average episode reward = 2.14 ± 1.00, regret = 108.56
00:00:07 [INFO] trial 44: average episode reward = 2.14 ± 1.01, regret = 103.20
00:00:08 [INFO] trial 45: average episode reward = 2.32 ± 0.88, regret = 86.08
00:00:08 [INFO] trial 46: average episode reward = 2.31 ± 1.16, regret = 123.49
00:00:08 [INFO] trial 47: average episode reward = 2.35 ± 1.02, regret = 43.66
00:00:08 [INFO] trial 48: average episode reward = 2.38 ± 1.00, regret = 49.08
00:00:08 [INFO] trial 49: average episode reward = 2.28 ± 1.05, regret = 79.82
00:00:08 [INFO] trial 50: average episode reward = 2.11 ± 1.11, regret = 67.46
00:00:09 [INFO] trial 51: average episode reward = 2.15 ± 0.97, regret = 54.54
00:00:09 [INFO] trial 52: average episode reward = 2.29 ± 0.90, regret = -20.30
00:00:09 [INFO] trial 53: average episode reward = 2.21 ± 0.96, regret = 57.71
00:00:09 [INFO] trial 54: average episode reward = 2.32 ± 1.07, regret = 15.32
00:00:09 [INFO] trial 55: average episode reward = 2.28 ± 1.04, regret = 30.03
00:00:09 [INFO] trial 56: average episode reward = 2.32 ± 1.07, regret = 74.01
00:00:10 [INFO] trial 57: average episode reward = 2.17 ± 1.05, regret = 78.64
00:00:10 [INFO] trial 58: average episode reward = 2.04 ± 0.95, regret = 62.36
00:00:10 [INFO] trial 59: average episode reward = 2.27 ± 0.94, regret = 79.83
00:00:10 [INFO] trial 60: average episode reward = 2.15 ± 0.99, regret = 3.52
00:00:10 [INFO] trial 61: average episode reward = 2.17 ± 0.98, regret = 95.72
00:00:11 [INFO] trial 62: average episode reward = 2.22 ± 0.89, regret = 41.26
00:00:11 [INFO] trial 63: average episode reward = 2.32 ± 0.94, regret = 34.11
00:00:11 [INFO] trial 64: average episode reward = 2.40 ± 1.01, regret = 62.67
00:00:11 [INFO] trial 65: average episode reward = 2.19 ± 0.93, regret = 102.29
00:00:11 [INFO] trial 66: average episode reward = 2.10 ± 0.96, regret = 25.34
00:00:11 [INFO] trial 67: average episode reward = 2.21 ± 0.99, regret = 75.28
00:00:12 [INFO] trial 68: average episode reward = 2.15 ± 1.11, regret = 37.63
00:00:12 [INFO] trial 69: average episode reward = 2.30 ± 1.11, regret = 87.22
00:00:12 [INFO] trial 70: average episode reward = 2.09 ± 0.99, regret = 88.74
00:00:12 [INFO] trial 71: average episode reward = 2.30 ± 0.98, regret = 65.33
00:00:12 [INFO] trial 72: average episode reward = 2.15 ± 0.98, regret = 115.17
00:00:12 [INFO] trial 73: average episode reward = 2.01 ± 1.01, regret = 62.51
00:00:13 [INFO] trial 74: average episode reward = 2.07 ± 0.94, regret = 96.08
00:00:13 [INFO] trial 75: average episode reward = 2.28 ± 1.05, regret = 70.02
00:00:13 [INFO] trial 76: average episode reward = 2.29 ± 0.89, regret = 83.63
00:00:13 [INFO] trial 77: average episode reward = 2.17 ± 1.01, regret = 56.54
00:00:13 [INFO] trial 78: average episode reward = 2.09 ± 0.91, regret = 69.10
00:00:13 [INFO] trial 79: average episode reward = 2.19 ± 0.88, regret = 73.64
00:00:14 [INFO] trial 80: average episode reward = 2.25 ± 0.95, regret = 52.69
00:00:14 [INFO] trial 81: average episode reward = 2.13 ± 0.93, regret = 122.55
00:00:14 [INFO] trial 82: average episode reward = 2.23 ± 1.04, regret = 96.24
00:00:14 [INFO] trial 83: average episode reward = 2.19 ± 0.94, regret = 120.24
00:00:14 [INFO] trial 84: average episode reward = 2.38 ± 1.09, regret = 73.32
00:00:14 [INFO] trial 85: average episode reward = 2.26 ± 1.02, regret = 72.13
00:00:15 [INFO] trial 86: average episode reward = 2.00 ± 1.04, regret = 101.56
00:00:15 [INFO] trial 87: average episode reward = 2.24 ± 1.09, regret = 58.79
00:00:15 [INFO] trial 88: average episode reward = 2.02 ± 0.98, regret = 57.03
00:00:15 [INFO] trial 89: average episode reward = 2.30 ± 1.01, regret = 17.68
00:00:15 [INFO] trial 90: average episode reward = 2.30 ± 0.93, regret = 34.90
00:00:16 [INFO] trial 91: average episode reward = 1.69 ± 0.85, regret = 57.10
00:00:16 [INFO] trial 92: average episode reward = 2.21 ± 0.99, regret = 93.34
00:00:16 [INFO] trial 93: average episode reward = 2.20 ± 1.05, regret = 113.55
00:00:16 [INFO] trial 94: average episode reward = 2.21 ± 0.91, regret = 93.72
00:00:16 [INFO] trial 95: average episode reward = 2.22 ± 1.06, regret = 97.84
00:00:16 [INFO] trial 96: average episode reward = 2.39 ± 0.96, regret = 104.35
00:00:17 [INFO] trial 97: average episode reward = 2.02 ± 0.95, regret = 95.15
00:00:17 [INFO] trial 98: average episode reward = 2.31 ± 0.98, regret = 53.42
00:00:17 [INFO] trial 99: average episode reward = 2.27 ± 1.02, regret = 24.76
00:00:17 [INFO] average regret = 70.56 ± 32.08
env.close()