Bandit.py
import numpy as np
import matplotlib.pyplot as plt

# Global flag: when False, the arm probabilities are resampled every 100 steps.
stationary = True

class Bandit():

    def __init__(self, arm_count):
        """
        Multi-armed bandit with 0/1 rewards.

        At initialization, arm_count arms are created. Pulling arm i returns
        reward 1 with probability theta_i, i.e. rewards are Bernoulli(theta_i),
        where each theta_i is drawn from Uniform(0, 1) at initialization.
        """
        self.arm_count = arm_count
        self.generate_thetas()
        self.timestep = 0
        self.stationary = stationary  # reads the module-level flag above
    def generate_thetas(self):
        # Sample a fresh success probability for every arm.
        self.thetas = np.random.uniform(0, 1, self.arm_count)
    def get_reward_regret(self, arm):
        """
        Returns a random reward and the expected regret for the pulled arm.

        Args:
            arm (int): 0-indexed arm to pull.
        """
        self.timestep += 1
        if (not self.stationary) and (self.timestep % 100 == 0):
            # Non-stationary setting: resample all arm probabilities every 100 steps.
            self.generate_thetas()
        # Simulate Bernoulli sampling for every arm, then keep the chosen arm's draw.
        sim = np.random.uniform(0, 1, self.arm_count)
        rewards = (sim < self.thetas).astype(int)
        reward = rewards[arm]
        # Expected (pseudo-)regret: gap between the best arm's mean and this arm's mean.
        regret = self.thetas.max() - self.thetas[arm]
        return reward, regret
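
# --- Usage sketch (not part of the original file) ---
# A minimal demo, assuming only the Bandit class above: pull a uniformly
# random arm each step and plot cumulative regret with the already-imported
# matplotlib. The seed, horizon, and arm count here are illustrative choices.
if __name__ == "__main__":
    np.random.seed(0)  # assumed seed, for a reproducible demo only
    bandit = Bandit(arm_count=10)
    cumulative_regret = []
    total = 0.0
    for _ in range(1000):
        arm = np.random.randint(bandit.arm_count)  # uniformly random policy
        _, regret = bandit.get_reward_regret(arm)
        total += regret
        cumulative_regret.append(total)
    plt.plot(cumulative_regret)
    plt.xlabel("timestep")
    plt.ylabel("cumulative regret")
    plt.title("Random policy on a 10-armed Bernoulli bandit")
    plt.show()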