-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathCF_song.py
214 lines (154 loc) · 6.05 KB
/
CF_song.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 15 22:29:48 2018
@author: bking
"""
import pandas as pd
import numpy as np
from helper import cosine_sim
from scipy.sparse import dok_matrix
from helper import my_evaluation
import argparse
import sys
import pickle
from collections import defaultdict
#import time
def RMS(rating, rating_truth):
    """Return the root-mean-square error between predicted and true ratings.

    The ground truth is a 0/1 vector with the same length as ``rating``:
    1 at every position listed in ``rating_truth`` and 0 everywhere else.

    Parameters
    ----------
    rating : array-like of float
        Predicted ratings, one per position.  Flattened to 1-D before use,
        so a plain list, a 1-D array or a column vector are all accepted.
    rating_truth : sequence of int
        Positions whose true rating is 1.  May be empty.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((truth - rating) ** 2))``.

    Raises
    ------
    IndexError
        If an entry of ``rating_truth`` is out of range for ``rating``.
    """
    predicted = np.asarray(rating, dtype=np.float64).ravel()
    truth = np.zeros(predicted.shape[0], dtype=np.float64)
    # An explicit integer dtype keeps an empty ``rating_truth`` a valid index.
    truth[np.asarray(rating_truth, dtype=np.intp)] = 1.0
    return np.sqrt(np.mean(np.square(truth - predicted)))
def transform(row):
    """Map every element of ``row[1]`` to the value stored in ``row[0]``.

    Parameters
    ----------
    row : sequence
        ``row[0]`` is a single identifier (named ``pid`` here) and ``row[1]``
        is an iterable of hashable identifiers (named ``tid_list`` here).

    Returns
    -------
    dict
        One key per distinct element of ``row[1]``, each mapped to ``row[0]``.
        Empty when ``row[1]`` is empty.
    """
    pid = row[0]
    tid_list = row[1]
    return {tid: pid for tid in tid_list}
def main(argv):
    """Run song-to-song collaborative filtering on the playlist data.

    Steps:
      1. Load the song->playlists (``df_sp_*``) and playlist->songs
         (``df_ps_*``) frames from HDF files.
      2. Build a binary playlist x song matrix from the training frame and
         pickle it.
      3. For every song in the test frame, compute its similarity to every
         other training song, form the similarity-weighted average of the
         matrix columns as a rating per row, and append the song to the test
         playlists selected by the threshold.
      4. Save the completed test frame and print the evaluation metrics.

    Parameters
    ----------
    argv : list of str
        Full command line with the program name first (e.g. ``sys.argv``).
        ``--mode 1`` switches to the data under ``data/df_data/df_small/``.

    Notes
    -----
    Relies on the module-level ``parser`` created under the ``__main__``
    guard, so it only works when the file is executed as a script.
    """
    args = parser.parse_args(argv[1:])
    mode = args.mode
    path = "data/df_data/"
    THRESHOLD = 0.2
    if mode == '1':
        path = "data/df_data/df_small/"

    # Playlist-Song frames
    print("Loading data")
    song_dir = path + "/df_playlistSong/"
    df_sp_train = pd.read_hdf(song_dir + "df_sp_train.hdf")
    df_sp_test = pd.read_hdf(song_dir + "df_sp_test.hdf")
    df_sp_test_truth = pd.read_hdf(song_dir + "df_sp_test_truth.hdf")
    df_ps_train = pd.read_hdf(song_dir + "df_ps_train.hdf")
    df_ps_test = pd.read_hdf(song_dir + "df_ps_test.hdf")
    df_ps_test_truth = pd.read_hdf(song_dir + "df_ps_test_truth.hdf")

    # In mode '1' the playlist frames are re-indexed by their 'pid' column.
    if mode == '1':
        df_ps_train = df_ps_train.set_index(keys="pid")
        df_ps_test = df_ps_test.set_index(keys="pid")
        df_ps_test_truth = df_ps_test_truth.set_index(keys="pid")

    tid_list = list(df_sp_train.index)       # songs in the train set
    tid_list_test = list(df_sp_test.index)   # songs in the test set
    pid_list_test = list(df_ps_test.index)   # playlists in the test set
    pid_list = list(df_ps_train.index)       # playlists in the train set

    # label -> first position, i.e. exactly what list.index() returns, but
    # without a linear scan for every lookup.
    tid_index = {}
    for position, label in enumerate(tid_list):
        tid_index.setdefault(label, position)
    pid_index = {}
    for position, label in enumerate(pid_list):
        pid_index.setdefault(label, position)
    test_pid_set = set(pid_list_test)

    # TODO: load the precomputed matrix from 'data/giantMatrix.pickle'
    # instead of rebuilding it here once that dump is available.
    print("Create rating matrix")
    ps_matrix = dok_matrix((len(pid_list), len(tid_list)), dtype=np.float32)
    for pid in pid_list:
        playlist_tids = df_ps_train.loc[pid, 'tid']
        ps_matrix[pid_index[pid], [tid_index[t] for t in playlist_tids]] = 1

    with open('data/giantMatrix_small.pickle', 'wb') as f:
        pickle.dump(ps_matrix, f)

    # The train frame is not modified below, so fetch each song's playlist
    # vector once instead of once per test song.
    train_vectors = {t: df_sp_train.loc[t, 'pid'] for t in tid_list}

    rms = 0
    print("Inference")
    for tid in tid_list_test:
        # Playlist vector of the current song.
        vector1 = df_sp_train.loc[tid, 'pid']

        # Every other training song, its playlist vector and its column.
        other_tid = [t for t in tid_list if t != tid]
        vector2_list = [train_vectors[t] for t in other_tid]
        index_other_tid = [tid_index[t] for t in other_tid]

        # Similarity between the current song and each other song.
        sim_vector = np.array([cosine_sim(vector1, v) for v in vector2_list])
        norm = np.sum(sim_vector)

        # Similarity-weighted average of the other songs' columns.
        ps = ps_matrix[:, index_other_tid]
        rating = ps.dot(sim_vector)
        if norm != 0:
            rating = rating / norm
        else:
            # No similar song at all: use a rating of 0 rather than NaN/inf
            # from a division by zero.
            rating = np.zeros_like(rating)

        rating_truth = df_sp_test_truth.loc[tid, 'pid']

        # TODO (flagged by the original author as "need to be changed"):
        # thresholding is a placeholder for a proper top-K selection.
        # NOTE(review): ``i`` is a row position in ``rating`` yet it is
        # compared against the pid labels of the test frame -- confirm that
        # positions and pid labels coincide for this data.
        pid_add = [
            i for i, score in enumerate(rating)
            if score >= THRESHOLD and i in test_pid_set
        ]
        if pid_add:
            # NOTE(review): kept exactly as in the original; verify that
            # ``+= [tid]`` appends to each selected playlist as intended.
            df_ps_test.loc[pid_add, 'tid'] += [tid]

        temp = RMS(rating, rating_truth)
        rms += temp
        print("tid: {} \t RMS: {}".format(tid, temp))

    # NOTE(review): this is the sum of the per-song RMS values, not a mean.
    print("Root Mean Square: {}".format(rms))

    print("Save test data")
    df_ps_test.to_hdf(path+'df_ps_test_complete_CF_song.hdf', key='abc')

    print("Evaluation")
    result = my_evaluation(df_ps_test, df_ps_test_truth)
    print(result.aggregate_metric())
# Script entry point: build the command-line parser that main() reads from
# module scope, print a short description of the algorithm, then run it.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # '--mode 1' makes main() use the small dataset; the default '0' uses the
    # full data directory.
    parser.add_argument('--mode', default='0', type=str, help='Mode Test On/Off')
    print(
        '''
---------------------- Algorithm ---------------------------
For each playlist in test set
Find similarity between current songs and other songs
Form a rating matrix between playlist and songs
Fill out value of rating
------------------------------------------------------------
'''
    )
    main(sys.argv)
# print(result)