import sys
import logging
import numpy as np
np.random.seed(0)
import scipy.stats as stats
import gym
import gym.spaces as spaces
import gym.utils.seeding as seeding
logging.basicConfig(level=logging.INFO,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')
class BernoulliMABEnv(gym.Env):
    """ Multi-Armed Bandit (MAB) with Bernoulli rewards """

    def __init__(self, n=10, means=None):
        super(BernoulliMABEnv, self).__init__()
        self.observation_space = spaces.Box(low=0, high=0, shape=(0,), dtype=float)
        self.action_space = spaces.Discrete(n)
        self.means = means or self.np_random.random(n)

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        return np.empty(0, dtype=float), {}

    def step(self, action):
        mean = self.means[action]
        reward = self.np_random.binomial(1, mean)
        observation = np.empty(0, dtype=float)
        return observation, reward, True, False, {}
from gym.envs.registration import register
register(
    id='BernoulliMABEnv-v0',
    entry_point=BernoulliMABEnv,
)

env = gym.make('BernoulliMABEnv-v0')
for key in vars(env):
    logging.info('%s: %s', key, vars(env)[key])
for key in vars(env.spec):
    logging.info('%s: %s', key, vars(env.spec)[key])
00:00:04 [INFO] action_space: Discrete(10)
00:00:04 [INFO] np_random: RandomState(MT19937)
00:00:04 [INFO] means: [0.05436006 0.96539094 0.63269095 0.29001734 0.10248426 0.67307635 0.39257674 0.66984607 0.05983897 0.52698724]
00:00:04 [INFO] spec: EnvSpec(BernoulliMABEnv-v0)
00:00:04 [INFO] id: BernoulliMABEnv-v0
00:00:04 [INFO] entry_point: <class '__main__.BernoulliMABEnv'>
00:00:04 [INFO] reward_threshold: None
00:00:04 [INFO] nondeterministic: False
00:00:04 [INFO] max_episode_steps: None
00:00:04 [INFO] _kwargs: {}
00:00:04 [INFO] _env_name: BernoulliMABEnv
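For reference, here is a minimal sketch of a single interaction with the registered environment (assuming the new-style Gym API with five return values from step(), as used throughout this code). Every episode ends after one arm pull, and the observation carries no information:

observation, _ = env.reset(seed=0)  # empty observation and empty info dict
action = env.action_space.sample()  # pick an arbitrary arm
observation, reward, terminated, truncated, _ = env.step(action)
# reward is 0 or 1, drawn from Bernoulli(means[action]); terminated is always True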
$\epsilon$-greedy Agent
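The $\epsilon$-greedy agent pulls a uniformly random arm with probability $\epsilon$ (here $\epsilon = 0.1$) and otherwise pulls the arm with the largest action-value estimate. After each training episode, the estimate of the pulled arm $a$ is updated with the sample-average rule

$$Q(a) \leftarrow Q(a) + \frac{1}{N(a)}\bigl(R - Q(a)\bigr),$$

where $N(a)$ is the number of times arm $a$ has been pulled so far. The arrays counts and qs below store $N(a)$ and $Q(a)$, and close() applies the update.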
class EpsilonGreedyAgent:
    def __init__(self, env):
        self.epsilon = 0.1
        self.action_n = env.action_space.n
        self.counts = np.zeros(self.action_n, dtype=float)
        self.qs = np.zeros(self.action_n, dtype=float)

    def reset(self, mode=None):
        self.mode = mode

    def step(self, observation, reward, terminated):
        if np.random.rand() < self.epsilon:
            action = np.random.randint(self.action_n)
        else:
            action = self.qs.argmax()
        if self.mode == 'train':
            if terminated:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        if self.mode == 'train':
            self.counts[self.action] += 1
            self.qs[self.action] += (self.reward - self.qs[self.action]) / \
                    self.counts[self.action]
UCB1 Agent
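The UCB1 agent adds an exploration bonus to each estimate that shrinks as an arm is pulled more often. With $t$ total pulls it selects

$$a = \arg\max_{a'} \left[ Q(a') + \sqrt{\frac{2 \ln t}{N(a')}} \right].$$

In the implementation below, the total count is lower-bounded by 1 and the per-arm counts are clipped to a minimum of 0.01, so that the logarithm and the division are well defined before an arm has ever been pulled.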
class UCB1Agent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.counts = np.zeros(self.action_n, dtype=float)
        self.qs = np.zeros(self.action_n, dtype=float)

    def reset(self, mode=None):
        self.mode = mode

    def step(self, observation, reward, terminated):
        total_count = max(self.counts.sum(), 1)  # lower bounded by 1
        sqrts = np.sqrt(2 * np.log(total_count) / self.counts.clip(min=0.01))
        ucbs = self.qs + sqrts
        action = ucbs.argmax()
        if self.mode == 'train':
            if terminated:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        if self.mode == 'train':
            self.counts[self.action] += 1
            self.qs[self.action] += (self.reward - self.qs[self.action]) / \
                    self.counts[self.action]
Bayesian UCB Agent
(Use Beta distribution)
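The Bayesian UCB agent keeps a Beta$(\alpha_a, \beta_a)$ posterior over each arm's mean reward, starting from the uniform prior $\alpha_a = \beta_a = 1$. It acts on an optimistic index, approximated here by the posterior mean plus three posterior standard deviations:

$$a = \arg\max_{a'} \bigl[\, \mathbb{E}[\theta_{a'}] + 3\,\mathrm{std}(\theta_{a'}) \,\bigr], \qquad \theta_{a'} \sim \mathrm{Beta}(\alpha_{a'}, \beta_{a'}).$$

After a Bernoulli reward $R$, the posterior update is $\alpha_a \leftarrow \alpha_a + R$ and $\beta_a \leftarrow \beta_a + (1 - R)$.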
class BayesianUCBAgent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.alphas = np.ones(self.action_n, dtype=float)
        self.betas = np.ones(self.action_n, dtype=float)

    def reset(self, mode=None):
        self.mode = mode

    def step(self, observation, reward, terminated):
        means = stats.beta.mean(self.alphas, self.betas)
        stds = stats.beta.std(self.alphas, self.betas)
        ucbs = means + 3 * stds
        action = ucbs.argmax()
        if self.mode == 'train':
            if terminated:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        if self.mode == 'train':
            self.alphas[self.action] += self.reward
            self.betas[self.action] += (1. - self.reward)
Thompson Sampling Agent
(Use Beta distribution)
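Thompson sampling maintains the same Beta posteriors but explores by sampling rather than by an explicit bonus: it draws one sample $\theta_a \sim \mathrm{Beta}(\alpha_a, \beta_a)$ per arm and pulls the arm with the largest sample. The posterior update after a Bernoulli reward is identical to that of the Bayesian UCB agent: $\alpha_a \leftarrow \alpha_a + R$, $\beta_a \leftarrow \beta_a + (1 - R)$.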
class ThompsonSamplingAgent:
    def __init__(self, env):
        self.action_n = env.action_space.n
        self.alphas = np.ones(self.action_n, dtype=float)
        self.betas = np.ones(self.action_n, dtype=float)

    def reset(self, mode=None):
        self.mode = mode

    def step(self, observation, reward, terminated):
        samples = [np.random.beta(max(alpha, 1e-6), max(beta, 1e-6))
                for alpha, beta in zip(self.alphas, self.betas)]
        action = np.argmax(samples)
        if self.mode == 'train':
            if terminated:
                self.reward = reward  # save reward
            else:
                self.action = action  # save action
        return action

    def close(self):
        if self.mode == 'train':
            self.alphas[self.action] += self.reward
            self.betas[self.action] += (1. - self.reward)
def play_episode(env, agent, seed=None, mode=None, render=False):
    observation, _ = env.reset(seed=seed)
    reward, terminated, truncated = 0., False, False
    agent.reset(mode=mode)
    episode_reward, elapsed_steps = 0., 0
    while True:
        action = agent.step(observation, reward, terminated)
        if render:
            env.render()
        if terminated or truncated:
            break
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        elapsed_steps += 1
    agent.close()
    return episode_reward, elapsed_steps
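Two details of the interaction protocol are worth noting: the agent is queried one extra time after the episode terminates so that it can record the final reward, and agent.close() then performs the learning update. The experiment below trains an agent for 1000 single-step episodes per trial and measures the per-trial regret against the best arm,

$$\text{regret} = \sum_{\text{episodes}} \bigl( \max_a \mu_a - R \bigr),$$

which is exactly what env.means.max() - np.array(episode_rewards) computes.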
trial_regrets = []
for trial in range(100):
    # create a new agent for each trial - keep exactly one of the following
    # assignments; the results below were produced with ThompsonSamplingAgent
    # agent = EpsilonGreedyAgent(env)
    # agent = UCB1Agent(env)
    # agent = BayesianUCBAgent(env)
    agent = ThompsonSamplingAgent(env)

    # train
    episode_rewards = []
    for episode in range(1000):
        episode_reward, elapsed_steps = play_episode(env, agent, seed=episode,
                mode='train')
        episode_rewards.append(episode_reward)
    regrets = env.means.max() - np.array(episode_rewards)
    trial_regret = regrets.sum()
    trial_regrets.append(trial_regret)

    # test
    episode_rewards = []
    for episode in range(100):
        episode_reward, elapsed_steps = play_episode(env, agent)
        episode_rewards.append(episode_reward)
    logging.info('trial %d: average episode reward = %.2f ± %.2f, regret = %.2f',
            trial, np.mean(episode_rewards), np.std(episode_rewards),
            trial_regret)
logging.info('average regret = %.2f ± %.2f',
        np.mean(trial_regrets), np.std(trial_regrets))
00:00:04 [INFO] trial 0: average episode reward = 0.96 ± 0.20, regret = 27.39
00:00:04 [INFO] trial 1: average episode reward = 0.97 ± 0.17, regret = 12.39
00:00:04 [INFO] trial 2: average episode reward = 0.99 ± 0.10, regret = 17.39
00:00:04 [INFO] trial 3: average episode reward = 0.92 ± 0.27, regret = 1.39
00:00:04 [INFO] trial 4: average episode reward = 0.96 ± 0.20, regret = 33.39
00:00:04 [INFO] trial 5: average episode reward = 0.96 ± 0.20, regret = 9.39
00:00:05 [INFO] trial 6: average episode reward = 0.99 ± 0.10, regret = 21.39
00:00:05 [INFO] trial 7: average episode reward = 0.97 ± 0.17, regret = 16.39
00:00:05 [INFO] trial 8: average episode reward = 0.97 ± 0.17, regret = 24.39
00:00:05 [INFO] trial 9: average episode reward = 0.98 ± 0.14, regret = 19.39
00:00:05 [INFO] trial 10: average episode reward = 0.98 ± 0.14, regret = 9.39
00:00:05 [INFO] trial 11: average episode reward = 0.97 ± 0.17, regret = 18.39
00:00:05 [INFO] trial 12: average episode reward = 0.94 ± 0.24, regret = 23.39
00:00:05 [INFO] trial 13: average episode reward = 0.94 ± 0.24, regret = 22.39
00:00:05 [INFO] trial 14: average episode reward = 0.94 ± 0.24, regret = 16.39
00:00:05 [INFO] trial 15: average episode reward = 0.94 ± 0.24, regret = 14.39
00:00:05 [INFO] trial 16: average episode reward = 0.94 ± 0.24, regret = 20.39
00:00:05 [INFO] trial 17: average episode reward = 0.97 ± 0.17, regret = 16.39
00:00:06 [INFO] trial 18: average episode reward = 0.95 ± 0.22, regret = 5.39
00:00:06 [INFO] trial 19: average episode reward = 0.96 ± 0.20, regret = 16.39
00:00:06 [INFO] trial 20: average episode reward = 0.94 ± 0.24, regret = 22.39
00:00:06 [INFO] trial 21: average episode reward = 0.99 ± 0.10, regret = 28.39
00:00:06 [INFO] trial 22: average episode reward = 0.97 ± 0.17, regret = 20.39
00:00:06 [INFO] trial 23: average episode reward = 0.97 ± 0.17, regret = 27.39
00:00:06 [INFO] trial 24: average episode reward = 0.95 ± 0.22, regret = 31.39
00:00:06 [INFO] trial 25: average episode reward = 0.92 ± 0.27, regret = 19.39
00:00:06 [INFO] trial 26: average episode reward = 0.99 ± 0.10, regret = 17.39
00:00:06 [INFO] trial 27: average episode reward = 0.97 ± 0.17, regret = 27.39
00:00:06 [INFO] trial 28: average episode reward = 0.96 ± 0.20, regret = 21.39
00:00:06 [INFO] trial 29: average episode reward = 0.97 ± 0.17, regret = 29.39
00:00:07 [INFO] trial 30: average episode reward = 0.98 ± 0.14, regret = 11.39
00:00:07 [INFO] trial 31: average episode reward = 0.96 ± 0.20, regret = 12.39
00:00:07 [INFO] trial 32: average episode reward = 0.96 ± 0.20, regret = 11.39
00:00:07 [INFO] trial 33: average episode reward = 0.96 ± 0.20, regret = 13.39
00:00:07 [INFO] trial 34: average episode reward = 0.96 ± 0.20, regret = 23.39
00:00:07 [INFO] trial 35: average episode reward = 0.93 ± 0.26, regret = 20.39
00:00:07 [INFO] trial 36: average episode reward = 0.96 ± 0.20, regret = 16.39
00:00:07 [INFO] trial 37: average episode reward = 0.98 ± 0.14, regret = 17.39
00:00:07 [INFO] trial 38: average episode reward = 0.96 ± 0.20, regret = 20.39
00:00:07 [INFO] trial 39: average episode reward = 0.96 ± 0.20, regret = 21.39
00:00:07 [INFO] trial 40: average episode reward = 0.97 ± 0.17, regret = 22.39
00:00:08 [INFO] trial 41: average episode reward = 0.99 ± 0.10, regret = 9.39
00:00:08 [INFO] trial 42: average episode reward = 0.91 ± 0.29, regret = 7.39
00:00:08 [INFO] trial 43: average episode reward = 0.93 ± 0.26, regret = 3.39
00:00:08 [INFO] trial 44: average episode reward = 0.96 ± 0.20, regret = 35.39
00:00:08 [INFO] trial 45: average episode reward = 0.97 ± 0.17, regret = 23.39
00:00:08 [INFO] trial 46: average episode reward = 0.93 ± 0.26, regret = 24.39
00:00:08 [INFO] trial 47: average episode reward = 0.98 ± 0.14, regret = 29.39
00:00:08 [INFO] trial 48: average episode reward = 0.95 ± 0.22, regret = 14.39
00:00:08 [INFO] trial 49: average episode reward = 0.98 ± 0.14, regret = 21.39
00:00:08 [INFO] trial 50: average episode reward = 0.96 ± 0.20, regret = 13.39
00:00:08 [INFO] trial 51: average episode reward = 0.96 ± 0.20, regret = 20.39
00:00:09 [INFO] trial 52: average episode reward = 0.97 ± 0.17, regret = 1.39
00:00:09 [INFO] trial 53: average episode reward = 0.95 ± 0.22, regret = 13.39
00:00:09 [INFO] trial 54: average episode reward = 0.99 ± 0.10, regret = 12.39
00:00:09 [INFO] trial 55: average episode reward = 0.94 ± 0.24, regret = 9.39
00:00:09 [INFO] trial 56: average episode reward = 0.94 ± 0.24, regret = 19.39
00:00:09 [INFO] trial 57: average episode reward = 0.93 ± 0.26, regret = 18.39
00:00:09 [INFO] trial 58: average episode reward = 0.95 ± 0.22, regret = 15.39
00:00:09 [INFO] trial 59: average episode reward = 0.97 ± 0.17, regret = 21.39
00:00:09 [INFO] trial 60: average episode reward = 0.95 ± 0.22, regret = 7.39
00:00:09 [INFO] trial 61: average episode reward = 0.97 ± 0.17, regret = 21.39
00:00:09 [INFO] trial 62: average episode reward = 0.97 ± 0.17, regret = 21.39
00:00:10 [INFO] trial 63: average episode reward = 0.97 ± 0.17, regret = 11.39
00:00:10 [INFO] trial 64: average episode reward = 0.93 ± 0.26, regret = 19.39
00:00:10 [INFO] trial 65: average episode reward = 0.97 ± 0.17, regret = 32.39
00:00:10 [INFO] trial 66: average episode reward = 0.97 ± 0.17, regret = 29.39
00:00:10 [INFO] trial 67: average episode reward = 0.96 ± 0.20, regret = 15.39
00:00:10 [INFO] trial 68: average episode reward = 0.93 ± 0.26, regret = 15.39
00:00:10 [INFO] trial 69: average episode reward = 0.96 ± 0.20, regret = 12.39
00:00:10 [INFO] trial 70: average episode reward = 0.93 ± 0.26, regret = 12.39
00:00:10 [INFO] trial 71: average episode reward = 0.98 ± 0.14, regret = 2.39
00:00:10 [INFO] trial 72: average episode reward = 0.98 ± 0.14, regret = 20.39
00:00:10 [INFO] trial 73: average episode reward = 0.95 ± 0.22, regret = 15.39
00:00:11 [INFO] trial 74: average episode reward = 0.97 ± 0.17, regret = 17.39
00:00:11 [INFO] trial 75: average episode reward = 0.93 ± 0.26, regret = 16.39
00:00:11 [INFO] trial 76: average episode reward = 0.94 ± 0.24, regret = 25.39
00:00:11 [INFO] trial 77: average episode reward = 0.96 ± 0.20, regret = 4.39
00:00:11 [INFO] trial 78: average episode reward = 0.98 ± 0.14, regret = 9.39
00:00:11 [INFO] trial 79: average episode reward = 0.95 ± 0.22, regret = 28.39
00:00:11 [INFO] trial 80: average episode reward = 0.95 ± 0.22, regret = 26.39
00:00:11 [INFO] trial 81: average episode reward = 0.98 ± 0.14, regret = 22.39
00:00:11 [INFO] trial 82: average episode reward = 0.98 ± 0.14, regret = 9.39
00:00:11 [INFO] trial 83: average episode reward = 0.97 ± 0.17, regret = 20.39
00:00:11 [INFO] trial 84: average episode reward = 0.98 ± 0.14, regret = 12.39
00:00:11 [INFO] trial 85: average episode reward = 0.97 ± 0.17, regret = 21.39
00:00:12 [INFO] trial 86: average episode reward = 0.99 ± 0.10, regret = 23.39
00:00:12 [INFO] trial 87: average episode reward = 0.98 ± 0.14, regret = 26.39
00:00:12 [INFO] trial 88: average episode reward = 0.95 ± 0.22, regret = 17.39
00:00:12 [INFO] trial 89: average episode reward = 0.99 ± 0.10, regret = 8.39
00:00:12 [INFO] trial 90: average episode reward = 0.95 ± 0.22, regret = 21.39
00:00:12 [INFO] trial 91: average episode reward = 0.94 ± 0.24, regret = 14.39
00:00:12 [INFO] trial 92: average episode reward = 1.00 ± 0.00, regret = 8.39
00:00:12 [INFO] trial 93: average episode reward = 0.94 ± 0.24, regret = 15.39
00:00:12 [INFO] trial 94: average episode reward = 0.96 ± 0.20, regret = 16.39
00:00:12 [INFO] trial 95: average episode reward = 0.90 ± 0.30, regret = 17.39
00:00:12 [INFO] trial 96: average episode reward = 0.96 ± 0.20, regret = 11.39
00:00:12 [INFO] trial 97: average episode reward = 0.97 ± 0.17, regret = 32.39
00:00:13 [INFO] trial 98: average episode reward = 0.97 ± 0.17, regret = 24.39
00:00:13 [INFO] trial 99: average episode reward = 0.97 ± 0.17, regret = 17.39
00:00:13 [INFO] average regret = 17.99 ± 7.34
env.close()