-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBandits_Bernoulli.py
84 lines (65 loc) · 2.16 KB
/
Bandits_Bernoulli.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
Bernoulli Multi-Armed bandits
Three implementations:
- Bernoulli Greedy
- Bayesian (Thompson sampling)
Codes only the pulling of arms
"""
import numpy as np
import matplotlib.pyplot as plt
from pdb import set_trace
import Bandit as Bandit
class BanditAlgo():
    """
    The algos try to learn which Bandit arm is the best to maximize reward.
    It does this by modelling the distribution of the Bandit arms with a Beta,
    assuming the true probability of success of an arm is Bernoulli distributed.
    Adapted from: https://github.com/andrecianflone/thompson/blob/master/thompson.ipynb
    """
    def __init__(self, bandit):
        """
        Args:
            bandit: the bandit object the algo is trying to model; must expose
                `arm_count` and `get_reward_regret(arm)`.
        """
        self.bandit = bandit
        self.arm_count = bandit.arm_count
        # Number of times each arm has been pulled (diagnostics / bookkeeping).
        self.pull_count = np.zeros(self.arm_count)
        # Beta(1, 1) prior (uniform) over each arm's success probability.
        self.alpha = np.ones(self.arm_count)
        self.beta = np.ones(self.arm_count)

    def get_reward_regret(self, arm):
        """Pull `arm` on the bandit, update the posterior, return (reward, regret)."""
        reward, regret = self.bandit.get_reward_regret(arm)
        self._update_params(arm, reward)
        return reward, regret

    def _update_params(self, arm, reward):
        """Conjugate Beta-Bernoulli update: successes add to alpha, failures to beta."""
        self.pull_count[arm] += 1
        # reward is assumed to be 0 or 1 (Bernoulli) — TODO confirm against Bandit.
        self.alpha[arm] += reward
        self.beta[arm] += 1 - reward
class BernGreedy(BanditAlgo):
    """Greedy strategy: always play the arm with the highest posterior mean."""
    def __init__(self, bandit):
        super().__init__(bandit)

    @staticmethod
    def name():
        return 'beta-greedy'

    def get_action(self):
        """Pick the arm whose Beta posterior has the largest expected value."""
        # The mean of Beta(a, b) is a / (a + b), computed elementwise per arm.
        posterior_means = self.alpha / (self.alpha + self.beta)
        return posterior_means.argmax()
class BernThompson(BanditAlgo):
    """Thompson sampling: act greedily on one posterior sample per arm."""
    def __init__(self, bandit):
        super().__init__(bandit)

    @staticmethod
    def name():
        return 'thompson'

    def get_action(self):
        """Draw a success-rate sample from each arm's Beta posterior, take the best."""
        sampled_rates = np.random.beta(self.alpha, self.beta)
        return sampled_rates.argmax()
#-----------------------------------------------
def plot_data(y):
    """ y is a 1D vector """
    # x is just the index positions 0..len(y)-1.
    x = np.arange(y.size)
    # Scatter plot (circle markers, no connecting line); return value discarded.
    _ = plt.plot(x, y, 'o')