Bandit_UCB_Bernoulli.py
"""
UCB Bandit for Bernoulli rewards (0,1)
"""
import numpy as np

import Bandit


class UCB:
    """
    UCB bandit
    """
    def __init__(self, bandit, ucb_c=2):
        self.ucb_c = ucb_c  # exploration coefficient c in the confidence term
        self.bandit = bandit
        self.arm_count = bandit.arm_count
        self.Q = np.zeros(self.arm_count)           # estimated action values
        self.N = np.zeros(self.arm_count) + 0.0001  # action counts (small offset avoids divide-by-zero)
        self.timestep = 1
    @staticmethod
    def name():
        return 'ucb'
    def get_action(self):
        # UCB1 rule: choose the arm maximizing Q[a] + c * sqrt(ln(t) / N[a])
        confidence = self.ucb_c * np.sqrt(np.log(self.timestep) / self.N)  # per-arm exploration bonus
        action = np.argmax(self.Q + confidence)
        self.timestep += 1
        return action
    def get_reward_regret(self, arm):
        reward, regret = self.bandit.get_reward_regret(arm)
        self._update_params(arm, reward)
        return reward, regret
    def _update_params(self, arm, reward):
        self.N[arm] += 1  # increment action count
        self.Q[arm] += (1 / self.N[arm]) * (reward - self.Q[arm])  # incremental mean update
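
# A minimal end-to-end sketch of running the agent (an addition for
# illustration; the function name and defaults are assumptions, not part of
# the original file). It relies only on the interface already used above:
# Bandit.Bandit(arm_count=...) and get_reward_regret(arm) returning a
# (reward, regret) tuple.
def run_ucb_demo(arm_count=3, steps=1000):
    bandit = Bandit.Bandit(arm_count=arm_count)
    agent = UCB(bandit)
    total_regret = 0.0
    for _ in range(steps):
        arm = agent.get_action()                  # arm with the highest Q + bonus
        _, regret = agent.get_reward_regret(arm)  # pull the arm; updates Q and N
        total_regret += regret
    print("total regret after", steps, "steps:", total_regret)

# Example usage (commented out so the test block below runs unchanged):
# run_ucb_demo()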
# MAIN: only for quick manual testing
if __name__ == '__main__':
    bandit = Bandit.Bandit(arm_count=3)
    reward, regret = bandit.get_reward_regret(1)
    print("reward =", reward)
    print("regret =", regret)