env.py
import os

import numpy as np


class MovieLens100KEnv:
    def __init__(self, data_dir="./ml-100k", item_pool_size=None, top_k=5, max_users=100):
"""
Args:
data_dir: Local dir where MovieLens 100K has been extracted.
item_pool_size: The size of candidate list - environment will randomly select
these many items for the movies that the user has rated in the dataset. If None,
environment will use all the movies rated by the user.
top_k: The size of the slate or the no. of items that the agent will recommend. The environment
needs this to calculate the optimal expected reward.
max_users: The environment will sample from `max_users` only. If set to None,
all users i.e. 943 will be used for sampling. This parameter can be used to
simplify the learning problem.
"""
        self._preprocess_data(data_dir)
        self.total_users, self.total_items = self.attractiveness_means.shape
        self.max_users = max_users
        self.item_pool_size = item_pool_size
        self.top_k = top_k
        self._reset()

    def _preprocess_data(self, data_dir):
        metadata_file = os.path.join(data_dir, 'u.item')
        ratings_data = os.path.join(data_dir, 'u.data')
        num_users = 943
        num_items = 1682
        self.attractiveness_means = np.zeros((num_users, num_items))
        # Each movie is described by 19 binary genre flags (the last 19
        # fields of u.item).
        self.item_features = np.zeros((num_items, 19))
        self.movie_names = {}
        with open(metadata_file, encoding='latin-1') as f:
            for line in f:
                line = line.strip().split("|")
                item_id = int(line[0]) - 1
                self.movie_names[item_id] = line[1]
                self.item_features[item_id][:] = list(map(int, line[5:]))
        with open(ratings_data) as f:
            for line in f:
                line = line.strip().split()
                user_id = int(line[0]) - 1
                item_id = int(line[1]) - 1
                rating = float(line[2])
                # Ratings >= 3 are mapped to attraction probabilities in
                # {0.6, 0.8, 1.0}; lower ratings get a small non-zero probability.
                if rating >= 3:
                    rating = rating / 5
                else:
                    rating = 0.01
                self.attractiveness_means[user_id][item_id] = rating

    def _reset(self):
        self.done = False
        self.current_user_id = None
        # No user features are used, so the user embedding stays None.
        self.current_user_embedding = None
        self.current_item_pool = None
        self.current_items_embedding = None
        self.step_count = 0
        self.total_regret = 0
        self.total_random_regret = 0

    def reset(self):
        self._reset()
        self._regulate_item_pool()
        return self.current_user_embedding, self.current_items_embedding

    def _regulate_item_pool(self):
        # Cycle through users sequentially, wrapping back to the first user.
        if self.step_count > self.total_users - 1:
            self.step_count = 0
        if self.max_users:
            if self.step_count > self.max_users - 1:
                self.step_count = 0
        # TODO: Randomize user selection
        self.current_user_id = self.step_count
        self.current_user_embedding = None
        # Candidate pool: all items this user has rated in the dataset.
        self.current_item_pool = np.flatnonzero(self.attractiveness_means[self.current_user_id])
        if self.item_pool_size and (len(self.current_item_pool) > self.item_pool_size):
            random_indices = np.random.choice(len(self.current_item_pool), size=self.item_pool_size, replace=False)
            self.current_item_pool = self.current_item_pool[random_indices]
        self.current_items_embedding = self.item_features[self.current_item_pool]

    def step(self, actions):
        assert len(actions) == self.top_k, "Size of recommended items list does not match top_k"
        rewards, regret, random_regret = self.get_feedback(actions)
        self.total_regret += regret
        self.total_random_regret += random_regret
        info = {"total_regret": self.total_regret, "total_random_regret": self.total_random_regret}
        self.step_count += 1
        self._regulate_item_pool()
        # The environment is non-episodic, so `done` is always False.
        return (self.current_user_embedding, self.current_items_embedding), rewards, self.done, info

    def get_feedback(self, actions, click_model="cascade"):
        """Return the clicks and expected regrets for the recommended list `actions`.

        Args:
            actions: A list of top-k action indices picked by the agent from the
                candidate list.
            click_model: One of 'cascade', 'pbm'. Only 'cascade' is implemented.
        Returns:
            clicks: A binary reward for each item the user examined.
            regret: Expected regret of the recommended actions.
            regret_random: Expected regret of a uniformly random agent.
        """
        # TODO: Implement PBM (position-based model)
        recommended_item_ids = self.current_item_pool[actions]
        attraction_probs = self.attractiveness_means[self.current_user_id][recommended_item_ids]
        # Baseline: a random agent draws top-k distinct items from the full
        # candidate pool.
        random_indices = np.random.choice(len(self.current_item_pool), size=self.top_k, replace=False)
        random_item_ids = self.current_item_pool[random_indices]
        random_attraction_probs = self.attractiveness_means[self.current_user_id][random_item_ids]
        # Simulate user behavior with a cascading click model: the user scans
        # the list top-down, clicks on an item with probability equal to its
        # attractiveness mean, and stops examining the list after the first click.
        clicks = np.random.binomial(1, attraction_probs)
        if clicks.sum() > 0:
            first_click = np.flatnonzero(clicks)[0]
            clicks = clicks[:first_click + 1]
        # Under the cascade model, P(at least one click) = 1 - prod(1 - p_i).
        expected_reward = 1 - np.prod(1 - attraction_probs)
        expected_reward_random = 1 - np.prod(1 - random_attraction_probs)
        current_pool_probs = self.attractiveness_means[self.current_user_id][self.current_item_pool]
        optimal_attraction_probs = np.sort(current_pool_probs)[::-1][:self.top_k]
        expected_optimal_reward = 1 - np.prod(1 - optimal_attraction_probs)
        regret = expected_optimal_reward - expected_reward
        regret_random = expected_optimal_reward - expected_reward_random
        return clicks, regret, regret_random
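
# Worked example of the cascade reward formula above (added for illustration,
# with made-up probabilities): for a 3-item slate with attraction probabilities
# [0.8, 0.6, 0.01], the chance of no click is 0.2 * 0.4 * 0.99 = 0.0792, so
# the expected reward is 1 - 0.0792 = 0.9208.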
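
if __name__ == "__main__":
    # Minimal usage sketch (an illustration, not part of the original module).
    # It assumes the MovieLens 100K archive has been extracted to ./ml-100k
    # and runs a uniformly random agent for a few steps; the parameter values
    # here are arbitrary.
    env = MovieLens100KEnv(data_dir="./ml-100k", item_pool_size=20, top_k=5, max_users=10)
    user_embedding, items_embedding = env.reset()
    for _ in range(20):
        # Pick top-k distinct indices into the current candidate pool at random.
        actions = np.random.choice(len(items_embedding), size=env.top_k, replace=False)
        (user_embedding, items_embedding), clicks, done, info = env.step(actions)
    print("total_regret:", info["total_regret"])
    print("total_random_regret:", info["total_random_regret"])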